Example #1
    def __init__(self, g_orig, haplotype, num_errors):
        '''Initialize a phasing statistics object for the original genotype g_orig and the haplotype set problem.haplotype.'''
        h = haplotype.data
        r_orig = recode.recode_single_genotype(g_orig)
        r = recode.recode_single_genotype(h)

        # Sizes
        self.time = 0
        self.num_snps = haplotype.num_snps
        self.num_samples = haplotype.num_samples
        self.num_genotypes = haplotype.num_data / 2
        self.num_haplotypes = haplotype.num_data

        # Arrays
        self.fill = np.array([
            haplotype.fill_fraction(sample=x)
            for x in xrange(haplotype.num_samples)
        ])

        # Fields
        # A Field factory method
        field = lambda index: StatsField(self, h, index)
        self.called_orig = field(recode.where_called(r_orig))
        self.imputed = field(recode.where_full_imputed(r, r_orig))
        self.imputed_partial = field(recode.where_partial_imputed(r, r_orig))
        self.errors = field(recode.where_error(r, r_orig))
        self.errors_partial = field(recode.where_partial_error(r, r_orig))
        self.called = field(recode.where_called(r))
        self.partial_called = field(recode.where_partial_called(r))
        self.still_missing = field(recode.where_still_missing(r, r_orig))

        # Scalars
        self.num_filled_haplotypes = haplotype.num_filled
        self.num_errors = num_errors  # Redundant
Example #2
def prob_ibd_single_locus(problem, id1, id2, snps, params):
    '''Is the segment s (defined by the SNP array snps, which may be a frame within the actual segment)
    an IBD segment between samples id1 and id2 or not? Outputs an IBD probability estimate.
    
    This estimate is based on a single-locus IBD posterior probability estimate, assuming loci
    are statistically independent.'''
    
    # Gather prior information: allele frequencies and condensed identity coefficients
    p = problem.info.allele_frequency(1)
    sample_id = problem.pedigree.sample_id
    _, Delta = params.id_coefs(sample_id[id1], sample_id[id2])
    print 'Delta', Delta
    
    # Load and hash genotype pairs at each SNP
    g = problem.g
    r1, r2 = recode.recode_single_genotype(g[snps, id1, :]), recode.recode_single_genotype(g[snps, id2, :])
    gg_hash = __HASH_TO_T[3 * r1 + r2 - 8]
    
    # Compute posterior (loop over T-states, bulk-set corresponding entries in output array)
    prob_ibd = np.zeros_like(snps, dtype=np.float) 
    for t in __HASH_TO_T:
        index = np.where(gg_hash == t)[0]
        print 'State T=%d, #occurrences %d' % (t, len(index))
        prob_ibd[index] = ibd_posterior_gg(Delta, p[index], t)
    return prob_ibd
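A note on the hash 3 * r1 + r2 - 8 used above: assuming recode_single_genotype maps the fully-called genotypes (1,1), (1,2), (2,2) to the codes 2, 3, 4 (as the GENOTYPE_CODE comments in examples #14 and #17 suggest), the expression packs each ordered genotype pair into a unique index in 0..8, matching the nine T-states. A toy sketch under that assumption:

import numpy as np

# Hypothetical recoding: (1,1) -> 2, (1,2) -> 3, (2,2) -> 4 (an assumption, not the recode module's actual table).
r1 = np.array([2, 2, 3, 4])  # recoded genotypes of sample 1 at four SNPs
r2 = np.array([2, 4, 3, 4])  # recoded genotypes of sample 2 at the same SNPs
# With both codes in {2, 3, 4}, 3 * r1 + r2 ranges over 8..16, so subtracting 8
# yields a unique index in 0..8 for each of the 9 ordered genotype pairs.
pair_index = 3 * r1 + r2 - 8
print(pair_index)  # [0 2 4 8]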
Example #3
    def __init__(self, g_orig, haplotype, num_errors):
        '''Initialize a phasing statistics object for the original genotype g_orig and the haplotype set problem.haplotype.'''
        h = haplotype.data
        r_orig = recode.recode_single_genotype(g_orig)
        r = recode.recode_single_genotype(h)
        
        # Sizes
        self.time = 0
        self.num_snps = haplotype.num_snps
        self.num_samples = haplotype.num_samples
        self.num_genotypes = haplotype.num_data / 2
        self.num_haplotypes = haplotype.num_data
        
        # Arrays
        self.fill = np.array([haplotype.fill_fraction(sample=x) for x in xrange(haplotype.num_samples)])
        
        # Fields
        # A Field factory method
        field = lambda index: StatsField(self, h, index)
        self.called_orig = field(recode.where_called(r_orig))
        self.imputed = field(recode.where_full_imputed(r, r_orig))
        self.imputed_partial = field(recode.where_partial_imputed(r, r_orig))
        self.errors = field(recode.where_error(r, r_orig))
        self.errors_partial = field(recode.where_partial_error(r, r_orig))
        self.called = field(recode.where_called(r))
        self.partial_called = field(recode.where_partial_called(r))
        self.still_missing = field(recode.where_still_missing(r, r_orig))

        # Scalars
        self.num_filled_haplotypes = haplotype.num_filled
        self.num_errors = num_errors  # Redundant
Example #4
File: ibd_hmm.py (project: orenlivne/ober)
 def __init__(self, lam, Delta, x, p, g1, g2, e, Pi=None, debug=False, snps=None):
     self.Delta = Delta 
     self.x = x
     self.lam = lam
     self.D = np.dot(np.ones(9)[:, np.newaxis], Delta[:, np.newaxis].transpose())
     self.I_minus_D = np.eye(9) - self.D
     self.lam_x = lam * np.diff(x)
     self.p = p
     self.e = e
     self.E = emission_error(e)
     self.r1 = recode.recode_single_genotype(g1)
     self.r2 = recode.recode_single_genotype(g2)
     self.Pi = Pi if Pi is not None else Delta
     self.debug = debug
     self.snps = snps if snps is not None else np.arange(len(x))
     
     # Define HMM
     self.m = hmmt.HMM(9, A=lambda k: self.__transition_probability(k),
                       B=lambda k: self.__emission_probability(k),
                       Pi=self.Pi, V=ProbIbdHmmCalculator.__T_STATE)
     self.Obs = ProbIbdHmmCalculator.__HASH_TO_T_STATE[3 * self.r1 + self.r2 - 8]
     # Output fields
     self.Gamma = None
     self.p_ibd_gamma = None
     self.Q_star = None
     self.p_ibd = None
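A note on the D matrix built in the constructor above: a minimal numpy check (with a made-up Delta vector) showing that the dot product of a column of ones with Delta's row form simply replicates Delta into every row.

import numpy as np

Delta = np.arange(1.0, 10.0) / 45.0        # toy 9-vector of condensed identity coefficients (sums to 1)
D = np.dot(np.ones(9)[:, np.newaxis], Delta[:, np.newaxis].transpose())
# Every row of D equals Delta, so D is simply np.tile(Delta, (9, 1)) (or np.outer(np.ones(9), Delta)).
assert np.allclose(D, np.tile(Delta, (9, 1)))
I_minus_D = np.eye(9) - D                  # complementary part used in the transition kernel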
Example #5
def recode_single(g1, g2, data_filter):
    '''Recode a genotype pair to a single genotype code as in the recode module. Restrict to entries
    where both are called.'''
    r1, r2 = recode.recode_single_genotype(g1), recode.recode_single_genotype(
        g2)
    called = np.where(data_filter(r1, r2))
    return r1[called], r2[called], called
Example #6
def impute_from_fully_called(g, h):
    '''Impute missing genotypes in g from fully-called haplotypes h. The imputation is typically done on the
    genotypes after phasing, which may contain zeroed-out entries found to be Mendelian errors, and thus
    will benefit from imputation here. Partially-filled genotypes are also overridden by haps, if the
    latter are fully called. Returns the number of imputed genotypes'''
    imputed = recode.where_full_imputed(recode.recode_single_genotype(h), recode.recode_single_genotype(g))
    g[imputed[SNP], imputed[SAMPLE], :] = h[imputed[SNP], imputed[SAMPLE], :]
    return len(imputed[0])
Example #7
def impute_from_fully_called(g, h):
    '''Impute missing genotypes in g from fully-called haplotypes h. The imputation is typically done on the
    genotypes after phasing, which may contain zeroed-out entries found to be Mendelian errors, and thus
    will benefit from imputation here. Partially-filled genotypes are also overridden by haps, if the
    latter are fully called. Returns the number of imputed genotypes'''
    imputed = recode.where_full_imputed(recode.recode_single_genotype(h),
                                        recode.recode_single_genotype(g))
    g[imputed[SNP], imputed[SAMPLE], :] = h[imputed[SNP], imputed[SAMPLE], :]
    return len(imputed[0])
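The assignment g[imputed[SNP], imputed[SAMPLE], :] = h[...] above relies on numpy fancy indexing to copy whole allele pairs at selected (SNP, sample) positions. A self-contained sketch of that indexing move, with hypothetical SNP and SAMPLE axis constants standing in for the project's own:

import numpy as np

SNP, SAMPLE = 0, 1  # assumed axis-index constants; the project defines its own
g = np.zeros((3, 2, 2), dtype=np.byte)          # genotypes: (snps, samples, 2 alleles), all missing
h = np.ones((3, 2, 2), dtype=np.byte)           # fully-called haplotypes
imputed = (np.array([0, 2]), np.array([1, 0]))  # (snp indices, sample indices) to copy
g[imputed[SNP], imputed[SAMPLE], :] = h[imputed[SNP], imputed[SAMPLE], :]
# g now holds the haplotype-derived allele pairs at (snp 0, sample 1) and (snp 2, sample 0).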
Example #8
def genotype_ibs_segments(genotype,
                          id1,
                          id2,
                          snps,
                          error_filter='median',
                          error_filter_length=5,
                          margin=0.0,
                          min_ibs_len_snp=400,
                          debug=False):
    '''Return Identical-by-State (IBS >= 1) segments between the genotypes of samples id1 and id2
    in the SNP range [snps[0], snps[1]) (if snps is a tuple) or the subset of SNPs, if snps is an array.
    
    See ibs_segments() for a description of optional parameters.'''
    num_snps = genotype.num_snps
    g = genotype.data
    g1 = recode.recode_single_genotype(g[snps, id1, :])
    g2 = recode.recode_single_genotype(g[snps, id2, :])
    d = (recode.ibs_state(g1, g2) == 0).astype(np.byte)

    # Consider informative or the specified SNPs only
    filtered_diff = filter_diff(d, error_filter, error_filter_length)
    error_snps = snps[np.nonzero(d - filtered_diff)[0]]

    # Detect edges as non-zero gradient points; output sufficiently long segments
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        # Convert recombination locations to segments of no recombination; filter short segments
        bp = genotype.snp['base_pair']
        #print segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)
        segments = [
            Segment(((x[0], x[1])), [id1, id2],
                    (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                    error_snps=segment.in_segment(error_snps, x),
                    collapse_to_set=False) for x in
            segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)
        ]

    # Cut segment margins
    if margin >= constants.SMALL_FLOAT:
        segments = [
            s for s in (s.middle_part(
                genotype.nearest_snp, bp, margin, collapse_to_set=False)
                        for s in segments) if s
        ]

    # Restrict errors to those inside segments
    segment_set = SegmentSet(segments,
                             np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)),
                                                              np.array([id1, id2])), dtype=int) \
                             if segments else gt.empty_errors_array())
    if debug:
        print 'ibs_segments()', segment_set
        print 'errors', segment_set.errors
    return segment_set
Example #9
    def __init__(self, problem, fraction=None, test_index=None):
        '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data. If test_index
        is specified, these specific test indices are used; otherwise a random fraction is generated.
        
        If test_index = 'hap', data is read from problem.h (haplotype array). The entire array
        is considered as a test array, but nothing is zeroed out. Useful for phasing result stats.'''
        # Create a working copy of the problem. Only the data is copied.
        if not (fraction is not None) ^ (test_index is not None):
            raise ValueError('Must specify fraction or test_index')
        self.problem = Problem(problem.pedigree, problem.genotype.copy())
        self.h = self.problem.h

        # Create test set; save original genotypes in g_orig
        if test_index is None:
            self.fraction = fraction
            self.g_orig, i = clear_random_portion(self.problem.genotype.data,
                                                  fraction)
        elif test_index == 'hap':
            # Don't clear anything; call everything a test index.
            h = problem.h
            i = tuple(
                util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
            self.g_orig = problem.g
            self.h = h
            self.fraction = 1.0
        else:
            self.g_orig, i = clear_index(self.problem.g, test_index)
            self.fraction = (1.0 * i[0].size) / (self.h.shape[0] *
                                                 self.h.shape[1])
        self.num_tests = i[0].size
        self.test_index = i
        self.r_orig = recode.recode_single_genotype(self.g_orig)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        self.__recode_single_genotype = None
Example #10
 def __init__(self, problem, fraction=None, test_index=None):
     '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data. If test_index
     is specified, these specific test indices are used; otherwise a random fraction is generated.
     
     If test_index = 'hap', data is read from problem.h (haplotype array). The entire array
     is considered as a test array, but nothing is zeroed out. Useful for phasing result stats.'''
     # Create a working copy of the problem. Only the data is copied.
     if not (fraction is not None) ^ (test_index is not None):
         raise ValueError('Must specify fraction or test_index')
     self.problem = Problem(problem.pedigree, problem.genotype.copy())
     self.h = self.problem.h
     
     # Create test set; save original genotypes in g_orig
     if test_index is None:
         self.fraction = fraction
         self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
     elif test_index == 'hap':
         # Don't clear anything; call everything a test index.
         h = problem.h
         i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
         self.g_orig = problem.g
         self.h = h
         self.fraction = 1.0
     else:
         self.g_orig, i = clear_index(self.problem.g, test_index)
         self.fraction = (1.0 * i[0].size) / (self.h.shape[0] * self.h.shape[1])
     self.num_tests = i[0].size
     self.test_index = i
     self.r_orig = recode.recode_single_genotype(self.g_orig)
     self.fill = self.problem.fill_fraction()[:, SAMPLE]
     self.__recode_single_genotype = None
Example #11
File: ibd.py (project: orenlivne/ober)
def genotype_ibs_segments(genotype, id1, id2, snps,
                          error_filter='median', error_filter_length=5, margin=0.0,
                          min_ibs_len_snp=400, debug=False):
    '''Return Identical-by-State (IBS >= 1) segments between the genotypes of samples id1 and id2
    in the SNP range [snps[0], snps[1]) (if snps is a tuple) or the subset of SNPs, if snps is an array.
    
    See ibs_segments() for a description of optional parameters.'''
    num_snps = genotype.num_snps
    g = genotype.data
    g1 = recode.recode_single_genotype(g[snps, id1, :])
    g2 = recode.recode_single_genotype(g[snps, id2, :])
    d = (recode.ibs_state(g1, g2) == 0).astype(np.byte)

    # Consider informative or the specified SNPs only
    filtered_diff = filter_diff(d, error_filter, error_filter_length)
    error_snps = snps[np.nonzero(d - filtered_diff)[0]]
    
    # Detect edges as non-zero gradient points; output sufficiently long segments
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        # Convert recombination locations to segments of no recombination; filter short segments
        bp = genotype.snp['base_pair']
        #print segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)
        segments = [Segment(((x[0], x[1])), [id1, id2], (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                            error_snps=segment.in_segment(error_snps, x), collapse_to_set=False)
                    for x in segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)]
    
    # Cut segment margins
    if margin >= constants.SMALL_FLOAT:
        segments = [s for s in (s.middle_part(genotype.nearest_snp, bp, margin, collapse_to_set=False)
                                for s in segments) if s]

    # Restrict errors to those inside segments
    segment_set = SegmentSet(segments,
                             np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)),
                                                              np.array([id1, id2])), dtype=int) \
                             if segments else gt.empty_errors_array())
    if debug:
        print 'ibs_segments()', segment_set
        print 'errors', segment_set.errors
    return segment_set
Example #12
    def __init__(self, t, g):
        '''t = ImputationSet imputed data set.
        g = Genotype data for all samples; sample indices must correspond to t's sample indices.
        '''
        self.t = t
        self.T = t.sample_index  # training set
        self.I = t.sample_index_to_impute  # imputed sample set
        self.imputed_data = t.imputed_data[:, self.I, :]
        self.genetic_coord = self.t.snp['dist_cm']

        self.g = g
        self.rg = recode.recode_single_genotype(g)
        self.ri = recode.recode_single_genotype(t.imputed_data)
        self.maf = np.amin(im.gt.allele_frequencies_by_snp(g), axis=0)
        # self.concordance_training = self.concordance(SNP, samples=self.T)
        # self.concordance_imputed = self.concordance(SNP, samples=self.I)
        # self.concordance_all = self.concordance(SNP)
       
        # Lazily-initialized cached properties
        self._allele_count = None
        self._frequency = None
Example #13
    def __init__(self,
                 lam,
                 Delta,
                 x,
                 p,
                 g1,
                 g2,
                 e,
                 Pi=None,
                 debug=False,
                 snps=None):
        self.Delta = Delta
        self.x = x
        self.lam = lam
        self.D = np.dot(
            np.ones(9)[:, np.newaxis], Delta[:, np.newaxis].transpose())
        self.I_minus_D = np.eye(9) - self.D
        self.lam_x = lam * np.diff(x)
        self.p = p
        self.e = e
        self.E = emission_error(e)
        self.r1 = recode.recode_single_genotype(g1)
        self.r2 = recode.recode_single_genotype(g2)
        self.Pi = Pi if Pi is not None else Delta
        self.debug = debug
        self.snps = snps if snps is not None else np.arange(len(x))

        # Define HMM
        self.m = hmmt.HMM(9,
                          A=lambda k: self.__transition_probability(k),
                          B=lambda k: self.__emission_probability(k),
                          Pi=self.Pi,
                          V=ProbIbdHmmCalculator.__T_STATE)
        self.Obs = ProbIbdHmmCalculator.__HASH_TO_T_STATE[3 * self.r1 +
                                                          self.r2 - 8]
        # Output fields
        self.Gamma = None
        self.p_ibd_gamma = None
        self.Q_star = None
        self.p_ibd = None
Example #14
def __handle_estimate_genotype_frequencies(self, request):
    """Estimate genotype frequencies from the genotype data and save them in ProblemInfo."""
    # Load problem fields
    problem = request.problem
    snp_metadata = problem.info.snp
    snp_count = snp_metadata["count"]

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(problem.genotype.data)

    # Count genotype appearances for each SNP, and save in SNP annotation array.
    # The frequency table column order matches the GENOTYPE_CODE array. This includes filled
    # and missing genotypes: (1,1),(1,2),(2,2),(0,0).
    for col, genotype_code in enumerate(recode.GENOTYPE_CODE.itervalues()):
        snp_count[:, col] = statutil.hist(np.where(r == genotype_code)[0], problem.num_snps)

    # Calculate frequencies
    snp_metadata["frequency"] = statutil.scale_row_sums(snp_count.astype("float"))

    return False
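The counting loop above builds a per-SNP histogram of genotype codes. A rough numpy-only sketch of the same idea (hypothetical codes; np.bincount stands in for statutil.hist, which is not reproduced here):

import numpy as np

num_snps, num_samples = 4, 5
# r: recoded genotypes, one small integer code per (snp, sample); codes 2, 3, 4 are assumed here.
r = np.random.randint(2, 5, size=(num_snps, num_samples))
codes = (2, 3, 4)
snp_count = np.zeros((num_snps, len(codes)), dtype=int)
for col, code in enumerate(codes):
    # np.where(r == code)[0] gives the SNP index of every occurrence; bincount tallies them per SNP.
    snp_count[:, col] = np.bincount(np.where(r == code)[0], minlength=num_snps)
frequency = snp_count / snp_count.sum(axis=1, keepdims=True).astype(float)  # row-normalized frequencies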
Example #15
 def test_phase_and_get_stats(self):
     '''Test gathering phasing statistics in a Stats object.'''
     nuclear_family_phaser().run(self.problem)
     g_orig = self.problem.genotype.data.copy()
     stats = v.Stats(g_orig, self.problem.haplotype, self.problem.num_errors)
     itu.assert_problem_stats(self.problem, 167336, 137693, 102)
     
     # Check stats fields
     partially_filled_genotype = np.where(recode.recode_single_genotype(self.problem.genotype.data) < 0)
     assert_equal(len(partially_filled_genotype[0]), 279, 'Unexpected # of partially-filled genotypes')
     assert_equal(stats.imputed.total, 185, 'Unexpected # of fully-imputed genotypes')
     assert_almost_equal(stats.imputed.fraction, 0.00221, decimal=5, err_msg='Unexpected fraction of fully-imputed genotypes')
     assert_equal(stats.imputed_partial.total, 45, 'Unexpected # of partially-imputed genotypes')
     # Check that printouts don't crash us
     stats.pprint(open(os.devnull, 'wb'))
      # Check that stats can be pickled and unpickled
     out_name = util.temp_filename(suffix='.npz')
     out = open(out_name, 'wb')
     np.savez(out, stats=np.array([stats]))
     out.close()
     loaded = np.load(out_name)
     assert_equal(loaded['stats'][0].imputed_partial.total, stats.imputed_partial.total, 'Pickling & unpickling a Stats object failed')
Example #16
def __handle_estimate_genotype_frequencies(self, request):
    '''Estimate genotype frequencies from the genotype data and save them in ProblemInfo.'''
    # Load problem fields
    problem = request.problem
    snp_metadata = problem.info.snp
    snp_count = snp_metadata['count']

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(problem.genotype.data)

    # Count genotype appearances for each SNP, and save in SNP annotation array.
    # The frequency table column order matches the GENOTYPE_CODE array. This includes filled
    # and missing genotypes: (1,1),(1,2),(2,2),(0,0).
    for col, genotype_code in enumerate(recode.GENOTYPE_CODE.itervalues()):
        snp_count[:, col] = statutil.hist(
            np.where(r == genotype_code)[0], problem.num_snps)

    # Calculate frequencies
    snp_metadata['frequency'] = statutil.scale_row_sums(
        snp_count.astype('float'))

    return False
Example #17
def __handle_fill_missing_genotypes(self, request):
    '''Fill missing genotype entries by randomly sampling from the multinomial distribution with
    estimated genotype frequencies at the corresponding SNP.'''
    # Load problem fields 
    if request.params.debug:
        print 'Filling missing genotypes from estimated genotype distribution'
    problem = request.problem
    g = problem.genotype.data
    snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES]

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(g)
    # Find SNP, sample indices of missing data
    missing = recode.where_missing(r)
    
    # Generate random multinomial values; map them to genotype codes
    filled_code = multinomial_elementwise(scale_row_sums(snp_frequency[missing[SNP]])) + 2 
    
    # Fill-in all genotypes of a certain value in a vectorized manner 
    for (genotype, code) in recode.GENOTYPE_CODE.iteritems():
        index = np.where(filled_code == code)[0]
        g[missing[SNP][index], missing[SAMPLE][index], :] = genotype
    return False
Example #18
def __handle_fill_missing_genotypes(self, request):
    '''Fill missing genotype entries by randomly sampling from the multinomial distribution with
    estimated genotype frequencies at the corresponding SNP.'''
    # Load problem fields
    if request.params.debug:
        print 'Filling missing genotypes from estimated genotype distribution'
    problem = request.problem
    g = problem.genotype.data
    snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES]

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(g)
    # Find SNP, sample indices of missing data
    missing = recode.where_missing(r)

    # Generate random multinomial values; map them to genotype codes
    filled_code = multinomial_elementwise(
        scale_row_sums(snp_frequency[missing[SNP]])) + 2

    # Fill-in all genotypes of a certain value in a vectorized manner
    for (genotype, code) in recode.GENOTYPE_CODE.iteritems():
        index = np.where(filled_code == code)[0]
        g[missing[SNP][index], missing[SAMPLE][index], :] = genotype
    return False
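multinomial_elementwise above draws, for each missing entry, one genotype from that entry's own per-SNP frequency row. A numpy-only sketch of one way such an element-wise multinomial draw could work (a hypothetical stand-in, not the project's implementation), using an inverse-CDF draw per row:

import numpy as np

def multinomial_elementwise_sketch(p):
    '''Draw one category index per row of the probability matrix p (rows sum to 1).'''
    cdf = np.cumsum(p, axis=1)
    u = np.random.random_sample((p.shape[0], 1))
    return (u > cdf).sum(axis=1)  # number of CDF bins below u = the sampled category

# Each missing entry gets the genotype frequency row of its SNP.
snp_frequency = np.array([[0.25, 0.50, 0.25],
                          [0.10, 0.30, 0.60]])
filled_code = multinomial_elementwise_sketch(snp_frequency) + 2  # map 0,1,2 -> codes 2,3,4 (assumed GENOTYPE_CODE order)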
Example #19
def recode_single(g1, g2, data_filter):
    '''Recode a genotype pair to a single genotype code as in the recode module. Restrict to entries
    where both are called.'''
    r1, r2 = recode.recode_single_genotype(g1), recode.recode_single_genotype(g2)
    called = np.where(data_filter(r1, r2))
    return r1[called], r2[called], called
Example #20
 def __remove_partial_calls(self):
     '''Set partially-imputed genotypes to missing. A prerequisite for loading into PLINK, for instance
     (but not PLINK-SEQ).'''
     self.h[recode.where_partial_called(recode.recode_single_genotype(self.h)), :] = MISSING
Example #21
    def impute(self, samples=None):
        '''Infer imputed genotypes at all samples of h from the samples of g.'''
        # Aliases
        r = self.ratio_threshold
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index:
                print 'Initial g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
            else:
                print 'Initial h[%d] = %s' % (j, repr(self.h[j]))
        
        # Create a queue of haplotypes that can be used to impute others. Initially, it
        # is the training set homozygotes; every time we phase a training set het, its other
        # allele is appended to the queue.
        q = Queue()

        # Initial condition: phase all hom training samples         
        hom = self.__phase_hom()
        for hap in itertools.product(hom, ALLELES):
            if self.debug:
                print 'Adding hom haplotype to queue', hap
            q.put(hap)
        num_hom_haps = q.qsize()
        if self.debug:
            print 'Items on queue : %d' % (q.qsize(),)
            print 'filled haps    : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
            HH = recode.recode_single_genotype(self.h)
            print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
            print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)
        count = 0
        while not q.empty():
            # Get the IBD group of an imputed haplotype (all haps that are IBD with it at self.snp)
            hap = q.get()
            count += 1
            if count > self.max_iter:
                raise ValueError('Possible imputation bug - exceeded max number of iterations')
            if self.debug:
                print '*' * 55
                print 'Iteration %d, imputing from hap %s' % (count, hap)
                print '*' * 55
                if self.debug_sample >= 0:
                    j = self.debug_sample
                    print 'h[%d] = %s' % (j, repr(self.h[j]))
                    if self.h[j, 0] == 1:
                        pass
            group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE])
            if group.size:
                # if self.debug: print 'group', group
                s, a = group[:, SAMPLE], group[:, ALLELE]
                H = self.h[s, a]
#                    print 'H', H
                # Find haplotypes that have been imputed with each of the alleles
                R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)]
                if self.debug:
                    print 'IBD group %d (%d haps):\n%s' % (self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr(np.concatenate((group, H[np.newaxis].transpose()), axis=1)))
                    print 'R1 = %s' % (repr(list(map(tuple, R1))),)
                    print 'R2 = %s' % (repr(list(map(tuple, R2))),)
#                    print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1)))
#                    print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1)))
                # Majority vote: if there are enough haps with one allele, override the rest.
                # Otherwise, an unresolved conflict ==> zero everyone out.
                l1, l2 = len(R1), len(R2)
                consensus = 1 if l1 >= r * l2 else (2 if l2 >= r * l1 else MISSING)
                self.h[s, a] = consensus
                if consensus == 0:
                    # If no consensus is reached, keep the already-imputed values in place, otherwise
                    # we can run into an infinite loop by imputing and erasing h-entries.  
                    self.h[R1[:, 0], R1[:, 1]] = 1
                    self.h[R2[:, 0], R2[:, 1]] = 2
                H = self.h[s]
                if self.debug:
                    print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus)
                    print 'Items on queue : %d' % (q.qsize(),)
                    print 'filled haps    : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
                    HH = recode.recode_single_genotype(self.h)
                    print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
                    print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)
                
                # Phase training hets (this includes BOTH partially-called = potential hets and
                # fully-called hets) with one imputed allele
                i = np.array([self.training_index.has_key(x) for x in s])
                si = s[i]
                G = self.g[map(self.training_index.get, si), :]
                unphased_hets = np.where(((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) 
                                         & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING)
                                            & (G[:, PATERNAL] != G[:, MATERNAL])))
                if unphased_hets[0].size:
                    if self.debug:
                        if count >= num_hom_haps:
                            print 'After hom, items on queue %d' % (q.qsize(),)
                            pass
                        print 'unphased_hets', unphased_hets
                        print 'si', si
                        print 'index i[unphased_hets]', np.where(i)[0][unphased_hets]
                        print 'H_unphased', H[i][unphased_hets]
                        print 'G_unphased', G[unphased_hets]
                    H_unphased = H[i][unphased_hets]
                    H_phased = H_unphased.copy()
                    complete_haplotype_partial(H_phased, G[unphased_hets])
#                    if self.debug:
#                        print 'Phasing hets'
#                        print 'i', np.where(i) 
#                        print 'H_unphased', H_unphased
#                        print 'G of unphased_hets', G[unphased_hets]
                    newly_phased_alleles = np.where(H_phased != H_unphased)[1]
                    self.h[si[unphased_hets]] = H_phased[:]
#                    if self.debug:
#                        print 'After phasing H_unphased', self.h[s[unphased_hets]]
#                        print 'unphased_hets', s[unphased_hets]
#                        print 'newly_phased_alleles', newly_phased_alleles
                    # Append the new data we can now make use of to the queue
                    if self.debug:
                        print 'After phasing them'
                        print 'H_phased  ', H_phased
                    for hap in zip(si[unphased_hets], newly_phased_alleles):
                        if self.debug:
                            print 'Adding phased het haplotypes to queue', hap
                        q.put(hap)
        self.__override_training_imputed_by_genotypes()
        if self.remove_partial_calls: self.__remove_partial_calls()
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index:
                print 'Final g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
            else:
                print 'h[%d] = %s' % (j, repr(self.h[j]))
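The consensus step in the loop above turns majority_threshold (e.g. 0.66) into the odds ratio r = majority_threshold / (1 - majority_threshold) and accepts an allele only if it outnumbers the other by at least that factor. A small sketch of just that rule, with MISSING taken to be 0 as the consensus == 0 check above implies:

MISSING = 0  # assumed missing-allele code

def clique_consensus(l1, l2, majority_threshold=0.66):
    '''Majority vote over an IBD clique: l1/l2 = number of haplotypes carrying allele 1/2.'''
    r = majority_threshold / (1 - majority_threshold)
    if l1 >= r * l2:
        return 1
    elif l2 >= r * l1:
        return 2
    return MISSING  # unresolved conflict

# clique_consensus(10, 3) -> 1; clique_consensus(3, 10) -> 2; clique_consensus(5, 5) -> 0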
Example #22
File: qc.py (project: orenlivne/ober)
def qc(ibd, t, snp=None, majority_threshold=0.66, debug=1, samples=None, genotype=None,
       debug_sample= -1, ibd_sample_index=None):
    '''Main call that separately imputes each SNP in a list of SNPs.
    ibd = IBD dictionary - a SegmentIndex instance
    t = ImputationSet instance. Contains training genotypes and imputed genotypes.
    majority_threshold = Threshold for resolving IBD conflicts via the majority vote 
    samples = samples to impute (if None, all samples are imputed)
    genotype = true genotype at all samples (non-None for validation studies).'''
    # If problem sample indices require translation, augment the t.imputed_data, t.imputed_hap_type arrays
    # to the #genotyped samples in the IBD segment index; then impute all of them; in the end, restrict the
    # result to the problem's sample index subset.
    imputed_data, imputed_hap_type = t.imputed_data, t.imputed_hap_type
    sample_translation = ibd_sample_index is not None
    if sample_translation:
        imputed_data = np.zeros((t.num_snps, ibd.num_samples, 2), dtype=np.byte)
        imputed_data[:, ibd_sample_index, :] = t.imputed_data
        imputed_hap_type = np.zeros((t.num_snps, ibd.num_samples), dtype=np.byte)
        imputed_hap_type[:, ibd_sample_index] = t.imputed_hap_type
        
    snp = snp if snp is not None else np.arange(len(t.snp))
    if debug >= 1:
        print 'Imputing SNPs'
        print 'majority_threshold', majority_threshold
        print 'snp', snp
    for snp_index in snp:
        chrom = t.snp['chrom'][snp_index]
        snp_bp = t.snp['base_pair'][snp_index]
        if debug >= 1:
            print '====== SNP %4d (%-22s): chr%-2d:%-9d x=%.2f ======' % \
        (snp_index, t.snp['name'][snp_index], chrom, snp_bp, t.snp['dist_cm'][snp_index])
        t_start = time.time()
        _IbdQc(imputed_data[snp_index], imputed_hap_type[snp_index],
               ibd, t.training_data[snp_index], t.sample_index,
               chrom, snp_bp, debug=(debug >= 2), majority_threshold=majority_threshold,
               debug_sample=debug_sample).impute(samples)           
        t_impute = time.time() - t_start
        if debug >= 1:
            result = imputed_data[snp_index]
            r = recode.recode_single_genotype(result)
            if sample_translation: r = r[ibd_sample_index]
            num_fully_called = len(np.where(r > 0)[0])
            num_alleles_called = len(result.nonzero()[0])
            result_training = result[t.sample_index]
            num_phased = len(result_training.nonzero()[0])
            # Per Rebecca's request: print the list of sample IDs that were not fully called
#             snp_name = t.snp['name'][snp_index]
#             for sample in im.examples.sample_index_to_id()[np.where(r <= 0)[0]]:
#                 print '%s,%s' % (snp_name, sample)
            if genotype is not None:
                rg = recode.recode_single_genotype(genotype[snp_index])
                if debug >= 2:
                    eq = ((r <= 0) | (rg <= 0) | (r == rg)).astype(int)  
                    np.set_printoptions(threshold=np.nan)
                    group_index = ibd._group_index[0 if ibd.test_index else ibd.nearest_left_snp(chrom, snp_bp) - ibd._start]
                    if sample_translation: group_index, result = group_index[ibd_sample_index], result[ibd_sample_index]
                    all_haps = np.concatenate((np.arange(t.num_samples)[np.newaxis].transpose(),
                                          genotype[snp_index], result.tolist(),
                                          group_index  # ,eq[np.newaxis].transpose(
                                          ), axis=1) 
                    print 'All (true vs. imputed):'
                    print all_haps
                    print 'Discordant (true vs. imputed):'
                    discordant = all_haps[np.where(eq == 0)[0]]
                    print discordant
                    bad_paternal = np.where(discordant[:, 1] != discordant[:, 3])[0]
                    bad_maternal = np.where(discordant[:, 2] != discordant[:, 4])[0]
                    print 'bad_paternal', bad_paternal
                    print 'bad_maternal', bad_maternal
    
                    # dis = compressed form of haplotype report obtained from the discordant array 
                    dis = np.zeros((discordant.shape[0], 3), dtype=np.uint)
                    if bad_paternal.size:
                        dis[bad_paternal, :] = discordant[bad_paternal][:, np.array([0, 1, 5])]
                    if bad_maternal.size:
                        dis[bad_maternal, :] = discordant[bad_maternal][:, np.array([0, 2, 6])]
                    dis = np.concatenate((dis, np.zeros((dis.shape[0], 1), dtype=np.uint)), axis=1)
                    if bad_paternal.size:
                        dis[bad_paternal, 3] = PATERNAL
                    dis[bad_maternal, 3] = MATERNAL
                    dis = dis[:, np.array([0, 3, 1, 2])]
                    bad_groups = np.array(Counter(dis[:, 3]).items(), dtype=np.uint)
                    if bad_groups.size:
                        bad_groups = bad_groups[np.lexsort((bad_groups[:, 0], -bad_groups[:, 1]))]
                    print 'bad_group id, #discordances\n', bad_groups

            G, H = t.training_data[snp_index], result[t.sample_index]
            changed = np.where((H[:, 0] != MISSING) & (H[:, 1] != MISSING) & (G[:, 0] != MISSING) & (G[:, 1] != MISSING) & 
                               (H[:, 0] + H[:, 1] != G[:, 0] + G[:, 1]))[0]

            print 'Time               %6.2f s' % (t_impute,)
            print 'Call rate allele   %6.2f%% (%d/%d)' % \
            ((100.0 * num_alleles_called) / result.size, num_alleles_called, result.size)
            print 'Call rate genotype %6.2f%% (%d/%d)' % \
            ((100.0 * num_fully_called) / result.shape[0], num_fully_called, result.shape[0])
            print 'Phased training    %6.2f%% (%d/%d)' % ((100.*num_phased) / result_training.size, num_phased, result_training.size)                
            if genotype is not None:
                c, con, dis = recode.concordance_recoded(r, rg)
                print 'Concordance        %6.2f%% (%d/%d)' % (100.*c, con, con + dis)
            if changed.size:
                print 'Changed training   %6.2f%% (%d/%d) %s' % ((100.*len(changed)) / len(t.sample_index), len(changed), len(t.sample_index), repr(t.sample_index[changed])[6:-1])
    
    # Restrict imputed results to problem's sample index subset
    if sample_translation:
        t.imputed_data = imputed_data[:, ibd_sample_index, :]
        t.imputed_hap_type = imputed_hap_type[:, ibd_sample_index]
Example #23
File: qc.py (project: orenlivne/ober)
class _IbdQc(object):
    '''Calculates a QC measure for a single variant using IBD cliques.'''
    
    #---------------------------------------------
    # Constants
    #---------------------------------------------
    __EMPTY_ARRAY = np.array([])
    
    #---------------------------------------------
    # Constructors
    #---------------------------------------------
    def __init__(self, h, hap_type, ibd, g, training_sample_index, chrom, snp_bp, debug=False, majority_threshold=0.66,
                 debug_sample= -1, max_iter=1000):
        '''Initialize an imputer that changes the result h in-place using the IBD index ibd
        and training genotype data g at SNP position snp_bp. majority_threshold = threshold for the majority vote: when
        (# haps with the majority allele) >= majority_threshold * (# haps in the clique), the vote is accepted.'''

        # Input fields
        self.h, self.hap_type, self.g, self.ibd, self.training_sample_index, self.max_iter, self.debug, \
        self.debug_sample, self.chrom = h, hap_type, g, ibd, training_sample_index, max_iter, debug, \
        debug_sample, chrom

        # Maps sample ID to training set index
        self.training_index = dict(zip(self.training_sample_index, xrange(self.g.shape[0])))
        self.ratio_threshold = majority_threshold / (1 - majority_threshold)

        # Find the appropriate IBD index SNP for the target base-pair position
        self.snp = ibd.nearest_left_snp(chrom, snp_bp)
        if self.debug:
            ibd.find(self.chrom, self.snp, self.training_sample_index[0], PATERNAL)
            print 'IBD index file %s/chr%d/region-%d.npz, nearest SNP %d' % (ibd._index_dir, ibd._chrom, ibd._start, self.snp)
        
    #---------------------------------------------
    # Methods
    #---------------------------------------------                 
    def impute(self, samples=None):
        '''Infer imputed genotypes at all samples of h from the samples of g.'''
        # Aliases
        r = self.ratio_threshold
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index: print 'Initial g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
            else: print 'Initial h[%d] = %s' % (j, repr(self.h[j]))
        
        # Create a queue of haplotypes that can be used to impute others. Initially, it
        # is the training set homozygotes; every time we phase a training set het, its other
        # allele is appended to the queue.
        #
        # TODO: possibly replace by a priority queue where alleles are ordered by their clique sizes?
        # (we have extra confidence in those alleles; not sure it matters though) 
        q = Queue()

        # Initial condition: phase all hom training samples         
        hom = self.__phase_hom()
        for hap in itertools.product(hom, ALLELES):
            if self.debug: print 'Adding hom haplotype to queue', hap
            q.put(hap)
        num_hom_haps = q.qsize()
        if self.debug:
            print 'Items on queue : %d' % (q.qsize(),)
            print 'filled haps    : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
            HH = recode.recode_single_genotype(self.h)
            print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
            print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)
        count = 0
        while not q.empty():
            # Find group = the IBD clique of an imputed haplotype (all haps that are IBD with it at self.snp)
            hap = q.get()
            count += 1
            if count > self.max_iter: raise ValueError('Possible imputation bug - exceeded maximum number of iterations')
            if self.debug:
                print '*' * 55
                print 'Iteration %d, imputing from hap %s' % (count, hap)
                print '*' * 55
                if self.debug_sample >= 0:
                    j = self.debug_sample
                    print 'h[%d] = %s' % (j, repr(self.h[j]))
                    if self.h[j, 0] == 1:
                        pass
            group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE])
            if group.size:
                # if self.debug: print 'group', group
                s, a = group[:, SAMPLE], group[:, ALLELE]
                H = self.h[s, a]
#                    print 'H', H
                # Find haplotypes that have been imputed as allele 1 and those imputed as allele 2
                R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)]
                if self.debug:
                    print 'IBD group %d (%d haps):\n%s' % (self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr(np.concatenate((group, H[np.newaxis].transpose()), axis=1)))
                    print 'R1 = %s' % (repr(list(map(tuple, R1))),)
                    print 'R2 = %s' % (repr(list(map(tuple, R2))),)
#                    print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1)))
#                    print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1)))
                # Majority vote: if there are enough haps with one allele, override the rest.
                # Otherwise, an unresolved conflict ==> zero everyone out.
                l1, l2 = len(R1), len(R2)
                consensus = 1 if l1 >= r * l2 else (2 if l2 >= r * l1 else MISSING)
                self.h[s, a] = consensus
                if consensus == 0:
                    # If no consensus is reached, keep the already-imputed values in place, otherwise
                    # we can run into an infinite loop by imputing and erasing h-entries.  
                    self.h[R1[:, 0], R1[:, 1]] = 1
                    self.h[R2[:, 0], R2[:, 1]] = 2
                H = self.h[s]
                if self.debug:
                    print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus)
                    print 'Items on queue : %d' % (q.qsize(),)
                    print 'filled haps    : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
                    HH = recode.recode_single_genotype(self.h)
                    print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
                    print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)
                
                # Phase training hets (this includes BOTH partially-called = potential hets and
                # fully-called hets) with one imputed allele
                i = np.array([self.training_index.has_key(x) for x in s])
                si = s[i]
                G = self.g[map(self.training_index.get, si), :]
                unphased_hets = np.where(((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) 
                                         & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING)
                                            & (G[:, PATERNAL] != G[:, MATERNAL])))
                if unphased_hets[0].size:
                    if self.debug:
                        if count >= num_hom_haps:
                            print 'After hom, items on queue %d' % (q.qsize(),)
                            pass
                        print 'unphased_hets', unphased_hets
                        print 'si', si
                        print 'index i[unphased_hets]', np.where(i)[0][unphased_hets]
                        print 'H_unphased', H[i][unphased_hets]
                        print 'G_unphased', G[unphased_hets]
                    H_unphased = H[i][unphased_hets]
                    H_phased = H_unphased.copy()
                    complete_haplotype_partial(H_phased, G[unphased_hets])
#                    if self.debug:
#                        print 'Phasing hets'
#                        print 'i', np.where(i) 
#                        print 'H_unphased', H_unphased
#                        print 'G of unphased_hets', G[unphased_hets]
                    newly_phased_alleles = np.where(H_phased != H_unphased)[1]
                    self.h[si[unphased_hets]] = H_phased[:]
#                    if self.debug:
#                        print 'After phasing H_unphased', self.h[s[unphased_hets]]
#                        print 'unphased_hets', s[unphased_hets]
#                        print 'newly_phased_alleles', newly_phased_alleles
                    # Append the new data we can now make use of to the queue
                    if self.debug:
                        print 'After phasing them'
                        print 'H_phased  ', H_phased
                    for hap in zip(si[unphased_hets], newly_phased_alleles):
                        if self.debug:
                            print 'Adding phased het haplotypes to queue', hap
                        q.put(hap)
        self.__override_training_imputed_by_genotypes()
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index: print 'Final g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
            else: print 'h[%d] = %s' % (j, repr(self.h[j]))
Example #24
 def recoded_genotype(self):
      '''Return the genotype test set, recoded as a single number per genotype (allele pair).'''
     if self.__recode_single_genotype is None:
         self.__recode_single_genotype = recode.recode_single_genotype(self.test_called) 
     return self.__recode_single_genotype
Example #25
 def test_orig(self):
     '''Return the original set of deleted test genotypes.'''
     i = self.test_index
     return recode.recode_single_genotype(self.h[i[SNP], i[SAMPLE], :])
Example #26
 def recoded_genotype(self):
      '''Return the genotype test set, recoded as a single number per genotype (allele pair).'''
     if self.__recode_single_genotype is None:
         self.__recode_single_genotype = recode.recode_single_genotype(
             self.test_called)
     return self.__recode_single_genotype
Example #27
 def test_orig(self):
     '''Return the original set of deleted test genotypes.'''
     i = self.test_index
     return recode.recode_single_genotype(self.h[i[SNP], i[SAMPLE], :])
Example #28
File: diff.py (project: orenlivne/ober)
def ibs_diff(g, id1, id2):
    '''Return the IBS difference between the genotypes of samples id1 and id2 (0 if IBS >= 1, 1 if IBS = 0).'''
    g1, g2 = recode.recode_single_genotype(g[:, id1, :]), recode.recode_single_genotype(g[:, id2, :])
    return (recode.ibs_state(g1, g2) == 0).astype(np.byte)
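recode.ibs_state compares two recoded genotypes; for reference, the IBS state can also be computed directly from raw allele pairs. A hypothetical sketch (not the recode module's implementation) for fully-called genotypes:

import numpy as np

def ibs_state_sketch(g1, g2):
    '''IBS state (0, 1 or 2) per SNP for two fully-called genotype arrays of shape (num_snps, 2).'''
    a, b = np.sort(g1, axis=1), np.sort(g2, axis=1)
    identical = np.all(a == b, axis=1)                      # same allele pair -> IBS 2
    shared = ((a[:, 0] == b[:, 0]) | (a[:, 0] == b[:, 1]) |
              (a[:, 1] == b[:, 0]) | (a[:, 1] == b[:, 1]))  # at least one shared allele -> IBS >= 1
    return identical.astype(int) + shared.astype(int)

g1 = np.array([[1, 1], [1, 2], [1, 1]])
g2 = np.array([[1, 1], [2, 2], [2, 2]])
# ibs_state_sketch(g1, g2) -> [2, 1, 0]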
Example #29
File: diff.py (project: orenlivne/ober)
def ibs_state(g, id1, id2):
    '''Return the IBS state (0, 1, or 2) between the genotypes of samples id1 and id2.'''
    g1, g2 = recode.recode_single_genotype(g[:, id1, :]), recode.recode_single_genotype(g[:, id2, :])
    return recode.ibs_state(g1, g2)
Example #30
File: diff.py (project: orenlivne/ober)
def ibs_state(g, id1, id2):
    '''Return the IBS state (0, 1, or 2) between the genotypes of samples id1 and id2.'''
    g1, g2 = recode.recode_single_genotype(
        g[:, id1, :]), recode.recode_single_genotype(g[:, id2, :])
    return recode.ibs_state(g1, g2)
Example #31
 def __remove_partial_calls(self):
     '''Set partially-imputed genotypes to missing. A prerequisite for loading into PLINK, for instance
     (but not PLINK-SEQ).'''
     self.h[recode.where_partial_called(
         recode.recode_single_genotype(self.h)), :] = MISSING
Example #32
def impute(ibd,
           t,
           snp=None,
           majority_threshold=0.66,
           debug=1,
           samples=None,
           genotype=None,
           debug_sample=-1,
           ibd_sample_index=None,
           remove_partial_calls=False):
    '''Main call that separately imputes each SNP in a list of SNPs.
    ibd = IBD dictionary - a SegmentIndex instance
    t = ImputationSet instance. Contains training genotypes and imputed genotypes.
    majority_threshold = Threshold for resolving IBD conflicts via the majority vote 
    samples = samples to impute (if None, all samples are imputed)
    genotype = true genotype at all samples (non-None for validation studies)

    ibd_sample_index = optional; an array whose ith element is the pedigree sample index corresponding to the ith sample ID
    in the ID list genotyped_ids. genotyped_ids is a subset of the pedigree sample index set; for sample
    indices that don't appear in genotyped_ids, the corresponding output array element is set to -1.
    '''
    # If problem sample indices require translation, augment the t.imputed_data, t.imputed_hap_type arrays
    # to the #genotyped samples in the IBD segment index; then impute all of them; in the end, restrict the
    # result to the problem's sample index subset.
    imputed_data, imputed_hap_type = t.imputed_data, t.imputed_hap_type
    sample_translation = ibd_sample_index is not None
    if sample_translation:
        imputed_data = np.zeros((t.num_snps, ibd.num_samples, 2),
                                dtype=np.byte)
        imputed_data[:, ibd_sample_index, :] = t.imputed_data
        imputed_hap_type = np.zeros((t.num_snps, ibd.num_samples),
                                    dtype=np.byte)
        imputed_hap_type[:, ibd_sample_index] = t.imputed_hap_type

    snp = snp if snp is not None else np.arange(len(t.snp))
    if debug >= 1:
        print 'Imputing SNPs'
        print 'majority_threshold', majority_threshold
        print 'snp', snp
    for snp_index in snp:
        chrom = t.snp['chrom'][snp_index]
        snp_bp = t.snp['base_pair'][snp_index]
        if debug >= 1:
            print '====== SNP %4d (%-22s): chr%-2d:%-9d x=%.2f ======' % \
        (snp_index, t.snp['name'][snp_index], chrom, snp_bp, t.snp['dist_cm'][snp_index])
        t_start = time.time()
        _IbdIndexImputer(
            imputed_data[snp_index],
            imputed_hap_type[snp_index],
            ibd,
            t.training_data[snp_index],
            t.sample_index,
            chrom,
            snp_bp,
            debug=(debug >= 2),
            majority_threshold=majority_threshold,
            debug_sample=debug_sample,
            remove_partial_calls=remove_partial_calls).impute(samples)
        t_impute = time.time() - t_start
        if debug >= 1:
            result = imputed_data[snp_index]
            r = recode.recode_single_genotype(result)
            if sample_translation: r = r[ibd_sample_index]
            num_fully_called = len(np.where(r > 0)[0])
            num_alleles_called = len(result.nonzero()[0])
            result_training = result[t.sample_index]
            num_phased = len(result_training.nonzero()[0])
            # Per Rebecca's request: print the list of sample IDs that were not fully called
            #             snp_name = t.snp['name'][snp_index]
            #             for sample in im.examples.sample_index_to_id()[np.where(r <= 0)[0]]:
            #                 print '%s,%s' % (snp_name, sample)
            if genotype is not None:
                rg = recode.recode_single_genotype(genotype[snp_index])
                if debug >= 2:
                    eq = ((r <= 0) | (rg <= 0) | (r == rg)).astype(int)
                    np.set_printoptions(threshold=np.nan)
                    group_index = ibd._group_index[
                        0 if ibd.
                        test_index else ibd.nearest_left_snp(chrom, snp_bp) -
                        ibd._start]
                    if sample_translation:
                        group_index, result = group_index[
                            ibd_sample_index], result[ibd_sample_index]
                    all_haps = np.concatenate(
                        (
                            np.arange(t.num_samples)[np.newaxis].transpose(),
                            genotype[snp_index],
                            result.tolist(),
                            group_index  # ,eq[np.newaxis].transpose(
                        ),
                        axis=1)
                    print 'All (true vs. imputed):'
                    print all_haps
                    print 'Discordant (true vs. imputed):'
                    discordant = all_haps[np.where(eq == 0)[0]]
                    print discordant
                    bad_paternal = np.where(
                        discordant[:, 1] != discordant[:, 3])[0]
                    bad_maternal = np.where(
                        discordant[:, 2] != discordant[:, 4])[0]
                    print 'bad_paternal', bad_paternal
                    print 'bad_maternal', bad_maternal

                    # dis = compressed form of haplotype report obtained from the discordant array
                    dis = np.zeros((discordant.shape[0], 3), dtype=np.uint)
                    if bad_paternal.size:
                        dis[bad_paternal, :] = discordant[
                            bad_paternal][:, np.array([0, 1, 5])]
                    if bad_maternal.size:
                        dis[bad_maternal, :] = discordant[
                            bad_maternal][:, np.array([0, 2, 6])]
                    dis = np.concatenate(
                        (dis, np.zeros((dis.shape[0], 1), dtype=np.uint)),
                        axis=1)
                    if bad_paternal.size:
                        dis[bad_paternal, 3] = PATERNAL
                    dis[bad_maternal, 3] = MATERNAL
                    dis = dis[:, np.array([0, 3, 1, 2])]
                    bad_groups = np.array(Counter(dis[:, 3]).items(),
                                          dtype=np.uint)
                    if bad_groups.size:
                        bad_groups = bad_groups[np.lexsort(
                            (bad_groups[:, 0], -bad_groups[:, 1]))]
                    print 'bad_group id, #discordances\n', bad_groups

            G, H = t.training_data[snp_index], result[t.sample_index]
            changed = np.where((H[:, 0] != MISSING) & (H[:, 1] != MISSING)
                               & (G[:, 0] != MISSING) & (G[:, 1] != MISSING)
                               & (H[:, 0] + H[:, 1] != G[:, 0] + G[:, 1]))[0]

            print 'Time               %6.2f s' % (t_impute, )
            print 'Call rate allele   %6.2f%% (%d/%d)' % \
            ((100.0 * num_alleles_called) / result.size, num_alleles_called, result.size)
            print 'Call rate genotype %6.2f%% (%d/%d)' % \
            ((100.0 * num_fully_called) / result.shape[0], num_fully_called, result.shape[0])
            print 'Phased training    %6.2f%% (%d/%d)' % (
                (100. * num_phased) / result_training.size, num_phased,
                result_training.size)
            if genotype is not None:
                c, con, dis = recode.concordance_recoded(r, rg)
                print 'Concordance        %6.2f%% (%d/%d)' % (100. * c, con,
                                                              con + dis)
            if changed.size:
                print 'Changed training   %6.2f%% (%d/%d) %s' % (
                    (100. * len(changed)) / len(t.sample_index), len(changed),
                    len(t.sample_index), repr(t.sample_index[changed])[6:-1])

    # Restrict imputed results to problem's sample index subset
    if sample_translation:
        t.imputed_data = imputed_data[:, ibd_sample_index, :]
        t.imputed_hap_type = imputed_hap_type[:, ibd_sample_index]
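
A minimal, self-contained sketch of the concordance and call-rate figures printed in the report above. It assumes, as the report code does, that a recoded genotype is positive when both alleles are called and non-positive otherwise; toy_recode below is an illustrative stand-in for recode.recode_single_genotype, not the project's actual encoding, and the three return values of concordance mirror how c, con, dis are unpacked from recode.concordance_recoded.

import numpy as np

def toy_recode(g):
    '''Collapse a (num_snps, num_samples, 2) allele array into one code per genotype:
    positive if both alleles are called (> 0), 0 otherwise. Order-free, so (1, 2) and
    (2, 1) receive the same code.'''
    a = np.sort(g, axis=-1)
    called = (a[..., 0] > 0) & (a[..., 1] > 0)
    return np.where(called, 3 * a[..., 0] + a[..., 1], 0)

def concordance(r_true, r_imputed):
    '''Concordance over entries that are fully called in both recoded arrays; returns
    (fraction concordant, #concordant, #discordant).'''
    both_called = (r_true > 0) & (r_imputed > 0)
    con = (both_called & (r_true == r_imputed)).sum()
    total = both_called.sum()
    return (1.0 * con) / max(total, 1), con, total - con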
Example #33
0
    def impute(self, samples=None):
        '''Infer imputed genotypes at all samples of h from the samples of g.'''
        # Aliases
        r = self.ratio_threshold
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index:
                print 'Initial g[%d] = %s, h[%d] = %s' % (
                    self.training_index[j], repr(
                        self.g[self.training_index[j]]), j, repr(self.h[j]))
            else:
                print 'Initial h[%d] = %s' % (j, repr(self.h[j]))

        # Create a queue of haplotypes that can be used to impute others. Initially, it
        # is the training set homozygotes; every time we phase a training set het, its other
        # allele is appended to the queue.
        q = Queue()

        # Initial condition: phase all hom training samples
        hom = self.__phase_hom()
        for hap in itertools.product(hom, ALLELES):
            if self.debug:
                print 'Adding hom haplotype to queue', hap
            q.put(hap)
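        # Everything on the queue so far comes from phasing the training homozygotes; remember
        # the count so the debug output below can tell when imputation moves past these seeds.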
        num_hom_haps = q.qsize()
        if self.debug:
            print 'Items on queue : %d' % (q.qsize(), )
            print 'filled haps    : %.2f%%' % (
                (100. * len(self.h.nonzero()[0])) / self.h.size, )
            HH = recode.recode_single_genotype(self.h)
            print 'filled samples : %.2f%%' % (
                (100. * len(np.where(HH > 0)[0])) / HH.size, )
            print 'phased training: %.2f%%' % (
                (100. * len(self.h[self.training_sample_index].nonzero()[0])) /
                self.h[self.training_sample_index].size, )
        count = 0
        while not q.empty():
            # Get the IBD group of an imputed haplotype (all haps that are IBD with it at self.snp)
            hap = q.get()
            count += 1
            if count > self.max_iter:
                raise ValueError(
                    'Possible imputation bug - exceeded max number of iterations'
                )
            if self.debug:
                print '*' * 55
                print 'Iteration %d, imputing from hap %s' % (count, hap)
                print '*' * 55
                if self.debug_sample >= 0:
                    j = self.debug_sample
                    print 'h[%d] = %s' % (j, repr(self.h[j]))
                    if self.h[j, 0] == 1:
                        pass  # no-op hook left in place for setting a debugger breakpoint
            group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE],
                                  hap[ALLELE])
            if group.size:
                # if self.debug: print 'group', group
                s, a = group[:, SAMPLE], group[:, ALLELE]
                H = self.h[s, a]
                #                    print 'H', H
                # Find haplotypes that have been imputed with each of the alleles
                R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)]
                if self.debug:
                    print 'IBD group %d (%d haps):\n%s' % (
                        self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE],
                                             hap[ALLELE]), len(group),
                        repr(
                            np.concatenate(
                                (group, H[np.newaxis].transpose()), axis=1)))
                    print 'R1 = %s' % (repr(list(map(tuple, R1))), )
                    print 'R2 = %s' % (repr(list(map(tuple, R2))), )
#                    print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1)))
#                    print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1)))
                # Majority vote: if there are enough haps with one allele, override the rest.
                # Otherwise, an unresolved conflict ==> zero everyone out.
                l1, l2 = len(R1), len(R2)
                consensus = 1 if l1 >= r * l2 else (
                    2 if l2 >= r * l1 else MISSING)
                self.h[s, a] = consensus
                if consensus == MISSING:
                    # If no consensus is reached, keep the already-imputed values in place, otherwise
                    # we can run into an infinite loop by imputing and erasing h-entries.
                    self.h[R1[:, 0], R1[:, 1]] = 1
                    self.h[R2[:, 0], R2[:, 1]] = 2
                H = self.h[s]
                if self.debug:
                    print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus)
                    print 'Items on queue : %d' % (q.qsize(), )
                    print 'filled haps    : %.2f%%' % (
                        (100. * len(self.h.nonzero()[0])) / self.h.size, )
                    HH = recode.recode_single_genotype(self.h)
                    print 'filled samples : %.2f%%' % (
                        (100. * len(np.where(HH > 0)[0])) / HH.size, )
                    print 'phased training: %.2f%%' % (
                        (100. * len(
                            self.h[self.training_sample_index].nonzero()[0])) /
                        self.h[self.training_sample_index].size, )

                # Phase training hets (this includes BOTH partially-called = potential hets
                # and fully-called hets) with one imputed allele
                i = np.array([self.training_index.has_key(x) for x in s])
                si = s[i]
                G = self.g[map(self.training_index.get, si), :]
                unphased_hets = np.where(
                    ((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING))
                    & ((G[:, PATERNAL] != MISSING)
                       | ((G[:, MATERNAL] != MISSING)
                          & (G[:, PATERNAL] != G[:, MATERNAL]))))
                if unphased_hets[0].size:
                    if self.debug:
                        if count >= num_hom_haps:
                            print 'After hom, items on queue %d' % (
                                q.qsize(), )
                            pass
                        print 'unphased_hets', unphased_hets
                        print 'si', si
                        print 'index i[unphased_hets]', np.where(
                            i)[0][unphased_hets]
                        print 'H_unphased', H[i][unphased_hets]
                        print 'G_unphased', G[unphased_hets]
                    H_unphased = H[i][unphased_hets]
                    H_phased = H_unphased.copy()
                    complete_haplotype_partial(H_phased, G[unphased_hets])
                    #                    if self.debug:
                    #                        print 'Phasing hets'
                    #                        print 'i', np.where(i)
                    #                        print 'H_unphased', H_unphased
                    #                        print 'G of unphased_hets', G[unphased_hets]
                    newly_phased_alleles = np.where(H_phased != H_unphased)[1]
                    self.h[si[unphased_hets]] = H_phased[:]
                    #                    if self.debug:
                    #                        print 'After phasing H_unphased', self.h[s[unphased_hets]]
                    #                        print 'unphased_hets', s[unphased_hets]
                    #                        print 'newly_phased_alleles', newly_phased_alleles
                    # Append the new data we can now make use of to the queue
                    if self.debug:
                        print 'After phasing them'
                        print 'H_phased  ', H_phased
                    for hap in zip(si[unphased_hets], newly_phased_alleles):
                        if self.debug:
                            print 'Adding phased het haplotypes to queue', hap
                        q.put(hap)
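        # Post-processing: override imputed entries of training samples with their original
        # genotypes, then optionally drop partial calls.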
        self.__override_training_imputed_by_genotypes()
        if self.remove_partial_calls: self.__remove_partial_calls()
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index:
                print 'Final g[%d] = %s, h[%d] = %s' % (
                    self.training_index[j], repr(
                        self.g[self.training_index[j]]), j, repr(self.h[j]))
            else:
                print 'h[%d] = %s' % (j, repr(self.h[j]))
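
A minimal sketch of the ratio-threshold majority vote that impute() applies to each IBD group above, assuming MISSING == 0 and alleles coded 1 and 2 as in the surrounding code; vote() is an illustrative restatement of the consensus rule, not part of the project's API.

MISSING = 0

def vote(alleles, ratio_threshold):
    '''Consensus allele of an IBD group: an allele wins if it occurs at least
    ratio_threshold times as often as the competing allele; otherwise the conflict
    is unresolved and MISSING is returned.'''
    l1 = sum(1 for a in alleles if a == 1)
    l2 = sum(1 for a in alleles if a == 2)
    if l1 >= ratio_threshold * l2:
        return 1
    if l2 >= ratio_threshold * l1:
        return 2
    return MISSING

# With ratio_threshold = 3: six 1-alleles against one 2-allele give consensus 1,
# while a 2-to-1 split is left unresolved (MISSING).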
Example #34
0
File: diff.py Project: orenlivne/ober
def ibs_diff(g, id1, id2):
    '''Return the IBS difference between the genotypes of samples id1 and id2 (0 if IBS >= 1, 1 if IBS = 0).'''
    g1, g2 = recode.recode_single_genotype(
        g[:, id1, :]), recode.recode_single_genotype(g[:, id2, :])
    return (recode.ibs_state(g1, g2) == 0).astype(np.byte)
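
For reference, a minimal sketch of the identity-by-state count that recode.ibs_state is understood to compute here: the number of alleles (0, 1 or 2) that two genotypes share regardless of order. This toy version works on raw allele pairs rather than recoded values, does not handle missing data, and is an assumption about the semantics rather than the project's implementation.

import numpy as np

def toy_ibs_state(g1, g2):
    '''g1, g2: (num_snps, 2) arrays of allele codes for two samples. Returns the
    per-SNP IBS count: 2 = identical genotypes, 1 = exactly one shared allele,
    0 = no shared allele.'''
    a, b = np.sort(g1, axis=1), np.sort(g2, axis=1)  # make (1, 2) and (2, 1) compare equal
    ibs = np.zeros(len(a), dtype=np.byte)
    ibs[np.all(a == b, axis=1)] = 2
    share_one = (ibs == 0) & ((a[:, 0] == b[:, 0]) | (a[:, 0] == b[:, 1]) |
                              (a[:, 1] == b[:, 0]) | (a[:, 1] == b[:, 1]))
    ibs[share_one] = 1
    return ibs

# ibs_diff above then reduces this per-SNP count to a 0/1 indicator: 1 exactly where IBS == 0.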