def __init__(self, g_orig, haplotype, num_errors):
    '''Collect phasing statistics by comparing the recoded original genotypes g_orig with the
    recoded haplotype array haplotype.data. num_errors is stored unchanged.'''
    hap_data = haplotype.data
    recoded_orig = recode.recode_single_genotype(g_orig)
    recoded_hap = recode.recode_single_genotype(hap_data)

    # Sizes. Set before the fields below, whose constructor receives this object.
    self.time = 0
    self.num_snps = haplotype.num_snps
    self.num_samples = haplotype.num_samples
    self.num_genotypes = haplotype.num_data / 2
    self.num_haplotypes = haplotype.num_data

    # Arrays: one fill fraction per sample
    self.fill = np.array([haplotype.fill_fraction(sample=sample)
                          for sample in xrange(haplotype.num_samples)])

    # Fields
    def make_field(index):
        '''Field factory: wrap an index set in a StatsField tied to this object and the haplotype data.'''
        return StatsField(self, hap_data, index)

    self.called_orig = make_field(recode.where_called(recoded_orig))
    self.imputed = make_field(recode.where_full_imputed(recoded_hap, recoded_orig))
    self.imputed_partial = make_field(recode.where_partial_imputed(recoded_hap, recoded_orig))
    self.errors = make_field(recode.where_error(recoded_hap, recoded_orig))
    self.errors_partial = make_field(recode.where_partial_error(recoded_hap, recoded_orig))
    self.called = make_field(recode.where_called(recoded_hap))
    self.partial_called = make_field(recode.where_partial_called(recoded_hap))
    self.still_missing = make_field(recode.where_still_missing(recoded_hap, recoded_orig))

    # Scalars
    self.num_filled_haplotypes = haplotype.num_filled
    self.num_errors = num_errors  # Redundant
def prob_ibd_single_locus(problem, id1, id2, snps, params): '''Is the segment s (defined by the SNP array snps, which may be a frame within the actual segment) an IBD segment between samples id1 and id2 or not? Outputs an IBD probability estimate. This estimate is based on a single-locus IBD posterior probability estimation. Assuming loci are statistically independent.''' # Gather prior information: allele frequencies and condensed identity coefficients p = problem.info.allele_frequency(1) sample_id = problem.pedigree.sample_id _, Delta = params.id_coefs(sample_id[id1], sample_id[id2]) print 'Delta', Delta # Load and hash genotype pairs at each SNP g = problem.g r1, r2 = recode.recode_single_genotype(g[snps, id1, :]), recode.recode_single_genotype(g[snps, id2, :]) gg_hash = __HASH_TO_T[3 * r1 + r2 - 8] # Compute posterior (loop over T-states, bulk-set corresponding entries in output array) prob_ibd = np.zeros_like(snps, dtype=np.float) for t in __HASH_TO_T: index = np.where(gg_hash == t)[0] print 'State T=%d, #occurrences %d' % (t, len(index)) prob_ibd[index] = ibd_posterior_gg(Delta, p[index], t) return prob_ibd
def __init__(self, g_orig, haplotype, num_errors):
    '''Set up a phasing statistics object from the original genotypes g_orig and the haplotype
    set haplotype; both are recoded to single-number genotypes and compared field by field.'''
    h = haplotype.data
    r_orig = recode.recode_single_genotype(g_orig)
    r = recode.recode_single_genotype(h)

    # Sizes
    self.time = 0
    self.num_snps = haplotype.num_snps
    self.num_samples = haplotype.num_samples
    self.num_genotypes = haplotype.num_data / 2
    self.num_haplotypes = haplotype.num_data

    # Arrays
    fill_per_sample = []
    for sample in xrange(haplotype.num_samples):
        fill_per_sample.append(haplotype.fill_fraction(sample=sample))
    self.fill = np.array(fill_per_sample)

    # Fields: (attribute name, index-finding function, its arguments). Each index set is computed
    # and wrapped in a StatsField in this order.
    field_specs = (
        ('called_orig', recode.where_called, (r_orig,)),
        ('imputed', recode.where_full_imputed, (r, r_orig)),
        ('imputed_partial', recode.where_partial_imputed, (r, r_orig)),
        ('errors', recode.where_error, (r, r_orig)),
        ('errors_partial', recode.where_partial_error, (r, r_orig)),
        ('called', recode.where_called, (r,)),
        ('partial_called', recode.where_partial_called, (r,)),
        ('still_missing', recode.where_still_missing, (r, r_orig)),
    )
    for name, where, args in field_specs:
        setattr(self, name, StatsField(self, h, where(*args)))

    # Scalars
    self.num_filled_haplotypes = haplotype.num_filled
    self.num_errors = num_errors  # Redundant
def __init__(self, lam, Delta, x, p, g1, g2, e, Pi=None, debug=False, snps=None):
    '''Set up the HMM-based IBD calculation for the genotype pair g1, g2.

    Stores the inputs and precomputes D (the product of a column of 9 ones with Delta laid out as
    a row - assumes Delta is a 1-D array, TODO confirm), I - D, lam * diff(x), emission_error(e)
    and the recoded genotypes; then builds the 9-state HMM and the observation sequence.
    Pi defaults to Delta and snps defaults to 0..len(x)-1. Output fields start as None.'''
    # Inputs
    self.Delta = Delta
    self.x = x
    self.lam = lam

    # Transition building blocks
    ones_column = np.ones(9)[:, np.newaxis]
    delta_row = Delta[:, np.newaxis].transpose()
    self.D = np.dot(ones_column, delta_row)
    self.I_minus_D = np.eye(9) - self.D
    self.lam_x = lam * np.diff(x)

    # Emission inputs
    self.p = p
    self.e = e
    self.E = emission_error(e)
    self.r1 = recode.recode_single_genotype(g1)
    self.r2 = recode.recode_single_genotype(g2)

    # Defaults
    if Pi is None:
        Pi = Delta
    self.Pi = Pi
    self.debug = debug
    if snps is None:
        snps = np.arange(len(x))
    self.snps = snps

    # Define HMM
    self.m = hmmt.HMM(9,
                      A=lambda k: self.__transition_probability(k),
                      B=lambda k: self.__emission_probability(k),
                      Pi=self.Pi,
                      V=ProbIbdHmmCalculator.__T_STATE)
    observation_hash = 3 * self.r1 + self.r2 - 8
    self.Obs = ProbIbdHmmCalculator.__HASH_TO_T_STATE[observation_hash]

    # Output fields
    self.Gamma = self.p_ibd_gamma = self.Q_star = self.p_ibd = None
def recode_single(g1, g2, data_filter):
    '''Recode each genotype of a pair to a single genotype code as in the recode module, and keep
    only the entries selected by data_filter(r1, r2). Returns the two restricted recoded arrays
    followed by the np.where() index of the kept entries.'''
    first = recode.recode_single_genotype(g1)
    second = recode.recode_single_genotype(g2)
    selected = np.where(data_filter(first, second))
    return first[selected], second[selected], selected
def impute_from_fully_called(g, h):
    '''Impute missing genotypes in g from fully-called haplotypes h, in place.

    The imputation is typically done on the genotypes after phasing, which may contain zeroed-out
    entries found to be Mendelian errors, and thus will benefit from imputation here.
    Partially-filled genotypes are also overridden by haps, if the latter are fully called.
    Returns the number of imputed genotypes.'''
    recoded_h = recode.recode_single_genotype(h)
    recoded_g = recode.recode_single_genotype(g)
    imputed = recode.where_full_imputed(recoded_h, recoded_g)
    snp_index, sample_index = imputed[SNP], imputed[SAMPLE]
    # Copy both alleles of every imputable (SNP, sample) entry
    g[snp_index, sample_index, :] = h[snp_index, sample_index, :]
    return len(imputed[0])
def genotype_ibs_segments(genotype, id1, id2, snps, error_filter='median', error_filter_length=5, margin=0.0, min_ibs_len_snp=400, debug=False):
    '''Return Identical-by-State (IBS >= 1) segments between two genotypes of samples id1 and id2
    in the SNP range [snp[0],snp[1]) (if snp is a tuple) or the subset of SNPs, if snps is an array.
    See ibs_segments() for a description of optional parameters.

    NOTE(review): the body indexes snps with an integer array (snps[np.nonzero(...)[0]]), which a
    plain tuple does not support - confirm whether the tuple form is really accepted here.

    Returns a SegmentSet built from the segments found and an integer errors array (or
    gt.empty_errors_array() when no segment is found).'''
    num_snps = genotype.num_snps
    g = genotype.data
    g1 = recode.recode_single_genotype(g[snps, id1, :])
    g2 = recode.recode_single_genotype(g[snps, id2, :])
    # d = 1 at SNPs where recode.ibs_state() is 0, and 0 elsewhere
    d = (recode.ibs_state(g1, g2) == 0).astype(np.byte)

    # Consider informative or the specified SNPs only
    filtered_diff = filter_diff(d, error_filter, error_filter_length)
    # SNPs whose flag was changed by the filter are recorded as errors
    error_snps = snps[np.nonzero(d - filtered_diff)[0]]

    # Detect edges as non-zero gradient points; output sufficiently long segments
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        # Convert recombination locations to segments of no recombination; filter short segments
        bp = genotype.snp['base_pair']
        #print segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)
        # One Segment per run returned by segments_with_value(filtered_diff, 0, min_ibs_len_snp),
        # carrying the error SNPs that fall inside it
        segments = [Segment(((x[0], x[1])), [id1, id2], (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                            error_snps=segment.in_segment(error_snps, x), collapse_to_set=False)
                    for x in segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)]
        # Cut segment margins; segments for which middle_part() returns a false value are dropped
        if margin >= constants.SMALL_FLOAT:
            segments = [s for s in (s.middle_part(genotype.nearest_snp, bp, margin, collapse_to_set=False)
                                    for s in segments) if s]

    # Restrict errors to those inside segments: the error SNPs of all remaining segments are
    # concatenated and combined with both sample IDs via util.flattened_meshgrid()
    segment_set = SegmentSet(segments,
                             np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)),
                                                              np.array([id1, id2])), dtype=int) \
                             if segments else gt.empty_errors_array())
    if debug:
        print 'ibs_segments()', segment_set
        print 'errors', segment_set.errors
    return segment_set
def __init__(self, problem, fraction=None, test_index=None):
    '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data.

    Exactly one of fraction and test_index must be given, otherwise ValueError is raised.
    If test_index is specified, these specific test indices are used; otherwise a random fraction
    is generated. If test_index = 'hap', data is read from problem.h (haplotype array): the entire
    array is considered as a test array, but nothing is zeroed out. Useful for phasing result stats.'''
    # Both given or both missing is an error
    if (fraction is None) == (test_index is None):
        raise ValueError('Must specify fraction or test_index')

    # Create a working copy of the problem. Only the data is copied.
    self.problem = Problem(problem.pedigree, problem.genotype.copy())
    self.h = self.problem.h

    # Create test set; save original genotypes in g_orig
    if test_index is None:
        self.fraction = fraction
        self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
    elif test_index == 'hap':
        # Don't clear anything; call everything a test index.
        h = problem.h
        all_rows, all_cols = range(h.shape[0]), range(h.shape[1])
        i = tuple(util.flattened_meshgrid(all_rows, all_cols))
        self.g_orig = problem.g
        self.h = h
        self.fraction = 1.0
    else:
        self.g_orig, i = clear_index(self.problem.g, test_index)
        total_entries = self.h.shape[0] * self.h.shape[1]
        self.fraction = (1.0 * i[0].size) / total_entries

    self.num_tests = i[0].size
    self.test_index = i
    self.r_orig = recode.recode_single_genotype(self.g_orig)
    self.fill = self.problem.fill_fraction()[:, SAMPLE]
    self.__recode_single_genotype = None
def __init__(self, problem, fraction=None, test_index=None):
    '''Prepare an experiment on a copy of problem in which part of the genotype data is cleared.

    fraction = portion of the data to clear at random (used when test_index is None).
    test_index = specific test indices to clear, or the string 'hap' to treat the whole haplotype
    array problem.h as the test set without zeroing anything (useful for phasing result stats).
    Raises ValueError unless exactly one of fraction and test_index is specified.'''
    has_fraction = fraction is not None
    has_test_index = test_index is not None
    if has_fraction == has_test_index:
        raise ValueError('Must specify fraction or test_index')

    # Working copy of the problem. Only the data is copied.
    working_problem = Problem(problem.pedigree, problem.genotype.copy())
    self.problem = working_problem
    self.h = working_problem.h

    # Build the test set, keeping the original genotypes in g_orig
    if not has_test_index:
        self.fraction = fraction
        self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
    elif test_index == 'hap':
        # Nothing is cleared; every entry of the haplotype array is a test index.
        h = problem.h
        i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
        self.g_orig = problem.g
        self.h = h
        self.fraction = 1.0
    else:
        self.g_orig, i = clear_index(self.problem.g, test_index)
        num_cleared = i[0].size
        self.fraction = (1.0 * num_cleared) / (self.h.shape[0] * self.h.shape[1])

    self.num_tests = i[0].size
    self.test_index = i
    self.r_orig = recode.recode_single_genotype(self.g_orig)
    self.fill = self.problem.fill_fraction()[:, SAMPLE]
    self.__recode_single_genotype = None
def genotype_ibs_segments(genotype, id1, id2, snps, error_filter='median', error_filter_length=5, margin=0.0, min_ibs_len_snp=400, debug=False):
    '''Return Identical-by-State (IBS >= 1) segments between two genotypes of samples id1 and id2
    in the SNP range [snp[0],snp[1]) (if snp is a tuple) or the subset of SNPs, if snps is an array.
    See ibs_segments() for a description of optional parameters.

    The result is a SegmentSet holding the segments and an integer array of errors; when no
    segment is found the errors array is gt.empty_errors_array().

    NOTE(review): snps is fancy-indexed below (snps[np.nonzero(...)[0]]); a tuple would fail
    there, so the tuple form mentioned above should be confirmed.'''
    num_snps = genotype.num_snps
    g = genotype.data
    g1 = recode.recode_single_genotype(g[snps, id1, :])
    g2 = recode.recode_single_genotype(g[snps, id2, :])
    # Flag (as 1) the SNPs at which recode.ibs_state() reports 0
    d = (recode.ibs_state(g1, g2) == 0).astype(np.byte)

    # Consider informative or the specified SNPs only
    filtered_diff = filter_diff(d, error_filter, error_filter_length)
    # Entries that the filter altered are treated as error SNPs
    error_snps = snps[np.nonzero(d - filtered_diff)[0]]

    # Detect edges as non-zero gradient points; output sufficiently long segments
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        # Convert recombination locations to segments of no recombination; filter short segments
        bp = genotype.snp['base_pair']
        #print segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)
        # x is one run reported by segments_with_value(); x[0], x[1] are used as its SNP bounds
        segments = [Segment(((x[0], x[1])), [id1, id2], (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                            error_snps=segment.in_segment(error_snps, x), collapse_to_set=False)
                    for x in segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)]
        # Cut segment margins (only for a non-negligible margin); falsy results are discarded
        if margin >= constants.SMALL_FLOAT:
            segments = [s for s in (s.middle_part(genotype.nearest_snp, bp, margin, collapse_to_set=False)
                                    for s in segments) if s]

    # Restrict errors to those inside segments. reduce(list.__add__, ...) concatenates the
    # per-segment error lists; it is only evaluated when segments is non-empty.
    segment_set = SegmentSet(segments,
                             np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)),
                                                              np.array([id1, id2])), dtype=int) \
                             if segments else gt.empty_errors_array())
    if debug:
        print 'ibs_segments()', segment_set
        print 'errors', segment_set.errors
    return segment_set
def __init__(self, t, g):
    '''t = ImputationSet imputed data set.
    g = Genotype data for all samples; sample indices must correspond to t's sample indices.

    Stores both inputs, the training (T) and to-be-imputed (I) sample index sets of t, the imputed
    data restricted to I, the recoded versions of g and of t.imputed_data, and the per-SNP minimum
    of the allele frequencies of g (maf).'''
    training_samples = t.sample_index
    imputed_samples = t.sample_index_to_impute

    self.t = t
    self.T = training_samples  # training set
    self.I = imputed_samples  # imputed sample set
    self.imputed_data = t.imputed_data[:, imputed_samples, :]
    self.genetic_coord = t.snp['dist_cm']

    self.g = g
    self.rg = recode.recode_single_genotype(g)
    self.ri = recode.recode_single_genotype(t.imputed_data)
    allele_frequencies = im.gt.allele_frequencies_by_snp(g)
    self.maf = np.amin(allele_frequencies, axis=0)
    # self.concordance_training = self.concordance(SNP, samples=self.T)
    # self.concordance_imputed = self.concordance(SNP, samples=self.I)
    # self.concordance_all = self.concordance(SNP)

    # Lazily-initialized cached properties
    self._allele_count = None
    self._frequency = None
def __init__(self, lam, Delta, x, p, g1, g2, e, Pi=None, debug=False, snps=None):
    '''Prepare an IBD posterior calculation between genotypes g1 and g2 using a 9-state HMM.

    lam, Delta, x, p, e are stored as given. Derived quantities: D (a column of 9 ones times Delta
    as a row; assumes a 1-D Delta - TODO confirm), I_minus_D, lam_x = lam * diff(x),
    E = emission_error(e), and the recoded genotypes r1, r2. The initial distribution Pi falls
    back to Delta, and snps falls back to np.arange(len(x)). Result fields are initialized to None.'''
    num_states = 9

    self.Delta = Delta
    self.x = x
    self.lam = lam
    column_of_ones = np.ones(num_states)[:, np.newaxis]
    self.D = np.dot(column_of_ones, Delta[:, np.newaxis].transpose())
    self.I_minus_D = np.eye(num_states) - self.D
    self.lam_x = lam * np.diff(x)
    self.p = p
    self.e = e
    self.E = emission_error(e)
    self.r1 = recode.recode_single_genotype(g1)
    self.r2 = recode.recode_single_genotype(g2)

    # Fall back to defaults for the optional arguments
    if Pi is not None:
        self.Pi = Pi
    else:
        self.Pi = Delta
    self.debug = debug
    if snps is not None:
        self.snps = snps
    else:
        self.snps = np.arange(len(x))

    # Define HMM
    transition = lambda k: self.__transition_probability(k)
    emission = lambda k: self.__emission_probability(k)
    self.m = hmmt.HMM(num_states, A=transition, B=emission, Pi=self.Pi, V=ProbIbdHmmCalculator.__T_STATE)
    self.Obs = ProbIbdHmmCalculator.__HASH_TO_T_STATE[3 * self.r1 + self.r2 - 8]

    # Output fields
    self.Gamma = None
    self.p_ibd_gamma = None
    self.Q_star = None
    self.p_ibd = None
def __handle_estimate_genotype_frequencies(self, request):
    """Estimate genotype frequencies from the genotype data and save them in ProblemInfo.

    Fills the "count" SNP annotation column by column and stores the row-scaled counts under
    "frequency". Always returns False."""
    problem = request.problem
    snp_info = problem.info.snp
    counts = snp_info["count"]

    # Recode genotypes to a single number
    recoded = recode.recode_single_genotype(problem.genotype.data)

    # Count genotype appearances for each SNP. Column k receives the histogram (over SNPs) of the
    # k-th code yielded by GENOTYPE_CODE.itervalues(); per the original note this covers filled
    # and missing genotypes: (1,1),(1,2),(2,2),(0,0).
    col = 0
    for genotype_code in recode.GENOTYPE_CODE.itervalues():
        snps_with_code = np.where(recoded == genotype_code)[0]
        counts[:, col] = statutil.hist(snps_with_code, problem.num_snps)
        col += 1

    # Calculate frequencies
    snp_info["frequency"] = statutil.scale_row_sums(counts.astype("float"))
    return False
def test_phase_and_get_stats(self):
    '''Test gathering phasing statistics in a Stats object: phase the problem, build a Stats
    object, check a few of its fields, and verify it survives printing and an npz save/load
    round trip.'''
    nuclear_family_phaser().run(self.problem)
    # NOTE(review): g_orig is copied after phasing has run; the expected values below were
    # presumably recorded with this order - confirm before moving the copy.
    g_orig = self.problem.genotype.data.copy()
    stats = v.Stats(g_orig, self.problem.haplotype, self.problem.num_errors)
    itu.assert_problem_stats(self.problem, 167336, 137693, 102)

    # Check stats fields
    partially_filled_genotype = np.where(recode.recode_single_genotype(self.problem.genotype.data) < 0)
    assert_equal(len(partially_filled_genotype[0]), 279, 'Unexpected # of partially-filled genotypes')
    assert_equal(stats.imputed.total, 185, 'Unexpected # of fully-imputed genotypes')
    assert_almost_equal(stats.imputed.fraction, 0.00221, decimal=5,
                        err_msg='Unexpected fraction of fully-imputed genotypes')
    assert_equal(stats.imputed_partial.total, 45, 'Unexpected # of partially-imputed genotypes')

    # Check that printouts don't crash us. The null device is closed even if pprint() raises.
    with open(os.devnull, 'wb') as devnull:
        stats.pprint(devnull)

    # Check that stats is pickable and unpicklable
    out_name = util.temp_filename(suffix='.npz')
    # The output file is closed (and flushed) before it is read back, even on failure
    with open(out_name, 'wb') as out:
        np.savez(out, stats=np.array([stats]))
    loaded = np.load(out_name)
    try:
        assert_equal(loaded['stats'][0].imputed_partial.total, stats.imputed_partial.total,
                     'Pickling & unpickling a Stats object failed')
    finally:
        # np.load() on an npz archive keeps the file open until closed
        loaded.close()
def __handle_estimate_genotype_frequencies(self, request):
    '''Estimate genotype frequencies from the genotype data and save them in ProblemInfo.

    The per-SNP genotype counts are written into the 'count' SNP annotation, and their row-scaled
    version is stored as 'frequency'. Returns False.'''
    # Load problem fields
    problem = request.problem
    num_snps = problem.num_snps
    snp_metadata = problem.info.snp
    snp_count = snp_metadata['count']

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(problem.genotype.data)

    # Count genotype appearances for each SNP, and save in SNP annotation array.
    # Columns follow the iteration order of the GENOTYPE_CODE values. This includes filled
    # and missing genotypes: (1,1),(1,2),(2,2),(0,0).
    for col, genotype_code in enumerate(recode.GENOTYPE_CODE.itervalues()):
        matching_snps = np.where(r == genotype_code)[0]
        snp_count[:, col] = statutil.hist(matching_snps, num_snps)

    # Calculate frequencies
    counts_as_float = snp_count.astype('float')
    snp_metadata['frequency'] = statutil.scale_row_sums(counts_as_float)
    return False
def __handle_fill_missing_genotypes(self, request): '''Fill missing genotype entries by randomly sampling from the multinomial distribution with estimated genotype frequencies at the corresponding SNP.''' # Load problem fields if request.params.debug: print 'Filling missing genotypes from estimated genotype distribution' problem = request.problem g = problem.genotype.data snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES] # Recode genotypes to a single number r = recode.recode_single_genotype(g) # Find SNP, sample indices of missing data missing = recode.where_missing(r) # Generate random multinomial values; map them to genotype codes filled_code = multinomial_elementwise(scale_row_sums(snp_frequency[missing[SNP]])) + 2 # Fill-in all genotypes of a certain value in a vectorized manner for (genotype, code) in recode.GENOTYPE_CODE.iteritems(): index = np.where(filled_code == code)[0] g[missing[SNP][index], missing[SAMPLE][index], :] = genotype return False
def __handle_fill_missing_genotypes(self, request): '''Fill missing genotype entries by randomly sampling from the multinomial distribution with estimated genotype frequencies at the corresponding SNP.''' # Load problem fields if request.params.debug: print 'Filling missing genotypes from estimated genotype distribution' problem = request.problem g = problem.genotype.data snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES] # Recode genotypes to a single number r = recode.recode_single_genotype(g) # Find SNP, sample indices of missing data missing = recode.where_missing(r) # Generate random multinomial values; map them to genotype codes filled_code = multinomial_elementwise( scale_row_sums(snp_frequency[missing[SNP]])) + 2 # Fill-in all genotypes of a certain value in a vectorized manner for (genotype, code) in recode.GENOTYPE_CODE.iteritems(): index = np.where(filled_code == code)[0] g[missing[SNP][index], missing[SAMPLE][index], :] = genotype return False
def recode_single(g1, g2, data_filter):
    '''Recode a genotype pair to single genotype codes as in the recode module, restricted to the
    entries for which data_filter(r1, r2) holds. Returns (r1 restricted, r2 restricted, index).'''
    recoded_pair = (recode.recode_single_genotype(g1), recode.recode_single_genotype(g2))
    kept = np.where(data_filter(recoded_pair[0], recoded_pair[1]))
    return recoded_pair[0][kept], recoded_pair[1][kept], kept
def __remove_partial_calls(self):
    '''Set partially-imputed genotypes to missing. A prerequisite for loading into PLINK, for
    instance (but not PLINK-SEQ).'''
    recoded = recode.recode_single_genotype(self.h)
    partially_called = recode.where_partial_called(recoded)
    # Blank out both alleles of every partially-called entry
    self.h[partially_called, :] = MISSING
def impute(self, samples=None):
    '''Infer imputed genotypes at all samples of h from the samples of g.

    Works in place on self.h. A queue of (sample, allele) haplotypes drives the process: each
    dequeued haplotype's IBD group is looked up, a majority vote sets the group's alleles, and
    training samples that end up with exactly one imputed allele are completed from their
    genotype, which puts their newly phased alleles on the queue.

    samples = not used by this method body.
    Raises ValueError if more than self.max_iter haplotypes are dequeued.'''
    # Aliases
    r = self.ratio_threshold
    if self.debug_sample >= 0:
        j = self.debug_sample
        if j in self.training_index:
            print 'Initial g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
        else:
            print 'Initial h[%d] = %s' % (j, repr(self.h[j]))

    # Create a queue of haplotypes that can be used to impute others. Initially, it
    # is the training set homozygotes; every time we phase a training set het, its other
    # allele is appended to the queue.
    q = Queue()
    # Initial condition: phase all hom training samples
    hom = self.__phase_hom()
    for hap in itertools.product(hom, ALLELES):
        if self.debug:
            print 'Adding hom haplotype to queue', hap
        q.put(hap)
    # Queue size right after seeding; only used for the debug printout further below
    num_hom_haps = q.qsize()
    if self.debug:
        print 'Items on queue : %d' % (q.qsize(),)
        print 'filled haps : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
        HH = recode.recode_single_genotype(self.h)
        print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
        print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)

    count = 0
    while not q.empty():
        # Get the IBD group of an imputed haplotype (all haps that are IBD with it at self.snp)
        hap = q.get()
        count += 1
        # Guard against imputing and erasing entries forever
        if count > self.max_iter:
            raise ValueError('Possible imputation bug - exceeded max number of iterations')
        if self.debug:
            print '*' * 55
            print 'Iteration %d, imputing from hap %s' % (count, hap)
            print '*' * 55
        if self.debug_sample >= 0:
            j = self.debug_sample
            print 'h[%d] = %s' % (j, repr(self.h[j]))
            # No-op branch (looks like a leftover breakpoint anchor)
            if self.h[j, 0] == 1:
                pass
        group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE])
        if group.size:
            # if self.debug: print 'group', group
            # s, a = sample and allele columns of the group; H = current alleles of its haplotypes
            s, a = group[:, SAMPLE], group[:, ALLELE]
            H = self.h[s, a]
            # print 'H', H
            # Find haplotypes that have been imputed with each of the alleles
            R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)]
            if self.debug:
                print 'IBD group %d (%d haps):\n%s' % (self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr(np.concatenate((group, H[np.newaxis].transpose()), axis=1)))
                print 'R1 = %s' % (repr(list(map(tuple, R1))),)
                print 'R2 = %s' % (repr(list(map(tuple, R2))),)
                # print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1)))
                # print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1)))

            # Majority vote: if there are enough haps with one allele, override the rest.
            # Otherwise, an unresolved conflict ==> zero everyone out.
            l1, l2 = len(R1), len(R2)
            consensus = 1 if l1 >= r * l2 else (2 if l2 >= r * l1 else MISSING)
            self.h[s, a] = consensus
            # NOTE(review): compares against the literal 0 although the no-consensus value assigned
            # above is MISSING - presumably MISSING == 0; confirm.
            if consensus == 0:
                # If no consensus is reached, keep the already-imputed values in place, otherwise
                # we can run into an infinite loop by imputing and erasing h-entries.
                self.h[R1[:, 0], R1[:, 1]] = 1
                self.h[R2[:, 0], R2[:, 1]] = 2
            # From here on H holds the full rows (all alleles) of the group's samples
            H = self.h[s]
            if self.debug:
                print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus)
                print 'Items on queue : %d' % (q.qsize(),)
                print 'filled haps : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
                HH = recode.recode_single_genotype(self.h)
                print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
                print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)

            # Phase training hets (this includes BOTH partially-called = potential hets and
            # fully-called hets) with one imputed allele
            # i = mask of group samples that belong to the training set; si = those samples
            i = np.array([self.training_index.has_key(x) for x in s])
            si = s[i]
            G = self.g[map(self.training_index.get, si), :]
            # Exactly one allele imputed in H, combined with a genotype condition.
            # NOTE(review): '&' binds tighter than '|', so the genotype condition reads
            # paternal-called OR (maternal-called AND alleles differ). If the intent was
            # (either called) AND (alleles differ), parentheses are missing - confirm.
            unphased_hets = np.where(((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING) & (G[:, PATERNAL] != G[:, MATERNAL])))
            if unphased_hets[0].size:
                if self.debug:
                    if count >= num_hom_haps:
                        print 'After hom, items on queue %d' % (q.qsize(),)
                        pass
                    print 'unphased_hets', unphased_hets
                    print 'si', si
                    print 'index i[unphased_hets]', np.where(i)[0][unphased_hets]
                    print 'H_unphased', H[i][unphased_hets]
                    print 'G_unphased', G[unphased_hets]
                H_unphased = H[i][unphased_hets]
                # Complete a copy so that the newly filled alleles can be detected by comparison
                H_phased = H_unphased.copy()
                complete_haplotype_partial(H_phased, G[unphased_hets])
                # if self.debug:
                #     print 'Phasing hets'
                #     print 'i', np.where(i)
                #     print 'H_unphased', H_unphased
                #     print 'G of unphased_hets', G[unphased_hets]
                # Column (allele) index of every entry changed by the completion
                newly_phased_alleles = np.where(H_phased != H_unphased)[1]
                self.h[si[unphased_hets]] = H_phased[:]
                # if self.debug:
                #     print 'After phasing H_unphased', self.h[s[unphased_hets]]
                #     print 'unphased_hets', s[unphased_hets]
                #     print 'newly_phased_alleles', newly_phased_alleles
                # Append the new data we can now make use of to the queue
                if self.debug:
                    print 'After phasing them'
                    print 'H_phased ', H_phased
                for hap in zip(si[unphased_hets], newly_phased_alleles):
                    if self.debug:
                        print 'Adding phased het haplotypes to queue', hap
                    q.put(hap)

    # Post-processing once the queue is exhausted
    self.__override_training_imputed_by_genotypes()
    if self.remove_partial_calls:
        self.__remove_partial_calls()
    if self.debug_sample >= 0:
        j = self.debug_sample
        if j in self.training_index:
            print 'Final g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
        else:
            print 'h[%d] = %s' % (j, repr(self.h[j]))
def qc(ibd, t, snp=None, majority_threshold=0.66, debug=1, samples=None, genotype=None, debug_sample= -1, ibd_sample_index=None):
    '''Main call that separately imputes each SNP in a list of SNPs.

    ibd = IBD dictionary - a SegmentIndex instance
    t = ImputationSet instance. Contains training genotypes and imputed genotypes.
    snp = SNP indices to process (if None, all SNPs of t are processed)
    majority_threshold = Threshold for resolving IBD conflicts via the majority vote
    debug = verbosity: >= 1 prints per-SNP statistics, >= 2 also enables _IbdQc debugging and the
            discordance report (the latter only when genotype is given)
    samples = samples to impute (if None, all samples are imputed)
    genotype = true genotype at all samples (non-None for validation studies).
    debug_sample = passed on to _IbdQc
    ibd_sample_index = if not None, position of each problem sample in the IBD index's sample
                       numbering; results are computed in that numbering and restricted back at the end.

    Nothing is returned. The imputed arrays are updated in place; with sample translation,
    t.imputed_data and t.imputed_hap_type are reassigned at the end.'''
    # If problem sample indices require translation, augment the t.imputed_data, t.imputed_hap_type arrays
    # to the #genotyped samples in the IBD segment index; then impute all of them; in the end, restrict the
    # result to the problem's sample index subset.
    imputed_data, imputed_hap_type = t.imputed_data, t.imputed_hap_type
    sample_translation = ibd_sample_index is not None
    if sample_translation:
        imputed_data = np.zeros((t.num_snps, ibd.num_samples, 2), dtype=np.byte)
        imputed_data[:, ibd_sample_index, :] = t.imputed_data
        imputed_hap_type = np.zeros((t.num_snps, ibd.num_samples), dtype=np.byte)
        imputed_hap_type[:, ibd_sample_index] = t.imputed_hap_type

    snp = snp if snp is not None else np.arange(len(t.snp))
    if debug >= 1:
        print 'Imputing SNPs'
        print 'majority_threshold', majority_threshold
        print 'snp', snp
    for snp_index in snp:
        chrom = t.snp['chrom'][snp_index]
        snp_bp = t.snp['base_pair'][snp_index]
        if debug >= 1:
            print '====== SNP %4d (%-22s): chr%-2d:%-9d x=%.2f ======' % \
                (snp_index, t.snp['name'][snp_index], chrom, snp_bp, t.snp['dist_cm'][snp_index])
        # Run the per-SNP imputation; the slices handed over are modified in place
        t_start = time.time()
        _IbdQc(imputed_data[snp_index], imputed_hap_type[snp_index], ibd, t.training_data[snp_index], t.sample_index, chrom, snp_bp, debug=(debug >= 2), majority_threshold=majority_threshold, debug_sample=debug_sample).impute(samples)
        t_impute = time.time() - t_start

        if debug >= 1:
            # Statistics of the freshly imputed SNP
            result = imputed_data[snp_index]
            r = recode.recode_single_genotype(result)
            if sample_translation:
                r = r[ibd_sample_index]
            num_fully_called = len(np.where(r > 0)[0])
            num_alleles_called = len(result.nonzero()[0])
            result_training = result[t.sample_index]
            num_phased = len(result_training.nonzero()[0])
            # Per Rebecca's request: print the list of samples IDs that were not fully called
            # snp_name = t.snp['name'][snp_index]
            # for sample in im.examples.sample_index_to_id()[np.where(r <= 0)[0]]:
            #     print '%s,%s' % (snp_name, sample)

            if genotype is not None:
                rg = recode.recode_single_genotype(genotype[snp_index])
                if debug >= 2:
                    # eq = 1 where either side is not fully called (code <= 0) or both codes agree
                    eq = ((r <= 0) | (rg <= 0) | (r == rg)).astype(int)
                    # NOTE(review): recent NumPy versions may reject a NaN threshold here - confirm
                    # against the NumPy version in use.
                    np.set_printoptions(threshold=np.nan)
                    group_index = ibd._group_index[0 if ibd.test_index else ibd.nearest_left_snp(chrom, snp_bp) - ibd._start]
                    if sample_translation:
                        group_index, result = group_index[ibd_sample_index], result[ibd_sample_index]
                    # Columns: sample number, true genotype, imputed result, group index
                    # (presumably 1 + 2 + 2 + 2 columns, judging by the column picks below - confirm)
                    all_haps = np.concatenate((np.arange(t.num_samples)[np.newaxis].transpose(),
                                               genotype[snp_index], result.tolist(), group_index
                                               # ,eq[np.newaxis].transpose(
                                               ), axis=1)
                    print 'All (true vs. imputed):'
                    print all_haps
                    print 'Discordant (true vs. imputed):'
                    discordant = all_haps[np.where(eq == 0)[0]]
                    print discordant
                    # Rows whose column 1 differs from column 3, resp. column 2 from column 4
                    bad_paternal = np.where(discordant[:, 1] != discordant[:, 3])[0]
                    bad_maternal = np.where(discordant[:, 2] != discordant[:, 4])[0]
                    print 'bad_paternal', bad_paternal
                    print 'bad_maternal', bad_maternal
                    # dis = compressed form of haplotype report obtained from the discordant array
                    dis = np.zeros((discordant.shape[0], 3), dtype=np.uint)
                    if bad_paternal.size:
                        dis[bad_paternal, :] = discordant[bad_paternal][:, np.array([0, 1, 5])]
                    if bad_maternal.size:
                        dis[bad_maternal, :] = discordant[bad_maternal][:, np.array([0, 2, 6])]
                    # Append a fourth column for the parental allele, then move it to position 1
                    dis = np.concatenate((dis, np.zeros((dis.shape[0], 1), dtype=np.uint)), axis=1)
                    if bad_paternal.size:
                        dis[bad_paternal, 3] = PATERNAL
                    dis[bad_maternal, 3] = MATERNAL
                    dis = dis[:, np.array([0, 3, 1, 2])]
                    # Count discordances per value of the last column, sorted by descending count
                    bad_groups = np.array(Counter(dis[:, 3]).items(), dtype=np.uint)
                    if bad_groups.size:
                        bad_groups = bad_groups[np.lexsort((bad_groups[:, 0], -bad_groups[:, 1]))]
                    print 'bad_group id, #discordances\n', bad_groups

            # Training samples that are fully called in both G and H but whose allele sums differ
            G, H = t.training_data[snp_index], result[t.sample_index]
            changed = np.where((H[:, 0] != MISSING) & (H[:, 1] != MISSING) & (G[:, 0] != MISSING) & (G[:, 1] != MISSING) & (H[:, 0] + H[:, 1] != G[:, 0] + G[:, 1]))[0]
            print 'Time %6.2f s' % (t_impute,)
            print 'Call rate allele %6.2f%% (%d/%d)' % \
                ((100.0 * num_alleles_called) / result.size, num_alleles_called, result.size)
            print 'Call rate genotype %6.2f%% (%d/%d)' % \
                ((100.0 * num_fully_called) / result.shape[0], num_fully_called, result.shape[0])
            print 'Phased training %6.2f%% (%d/%d)' % ((100.*num_phased) / result_training.size, num_phased, result_training.size)
            if genotype is not None:
                c, con, dis = recode.concordance_recoded(r, rg)
                print 'Concordance %6.2f%% (%d/%d)' % (100.*c, con, con + dis)
            if changed.size:
                print 'Changed training %6.2f%% (%d/%d) %s' % ((100.*len(changed)) / len(t.sample_index), len(changed), len(t.sample_index), repr(t.sample_index[changed])[6:-1])

    # Restrict imputed results to problem's sample index subset
    if sample_translation:
        t.imputed_data = imputed_data[:, ibd_sample_index, :]
        t.imputed_hap_type = imputed_hap_type[:, ibd_sample_index]
class _IbdQc(object): '''Calculates a QC measure for a single variant using IBD cliques.''' #--------------------------------------------- # Constants #--------------------------------------------- __EMPTY_ARRAY = np.array([]) #--------------------------------------------- # Constructors #--------------------------------------------- def __init__(self, h, hap_type, ibd, g, training_sample_index, chrom, snp_bp, debug=False, majority_threshold=0.66, debug_sample= -1, max_iter=1000): '''Initialize an imputer that changes the result h in-place using the IBD index ibd and training genotype data g at snp position snp_bp. majority vote = threshold for majority vote. When |# haps with majority allele| >= majority_threshold*(All haps) in a clique, the vote is accepted.''' # Input fields self.h, self.hap_type, self.g, self.ibd, self.training_sample_index, self.max_iter, self.debug, \ self.debug_sample, self.chrom = h, hap_type, g, ibd, training_sample_index, max_iter, debug, \ debug_sample, chrom # Maps sample ID to training set index self.training_index = dict(zip(self.training_sample_index, xrange(self.g.shape[0]))) self.ratio_threshold = majority_threshold / (1 - majority_threshold) # Find the appropriate IBD index SNP for the target base-pair position self.snp = ibd.nearest_left_snp(chrom, snp_bp) if self.debug: ibd.find(self.chrom, self.snp, self.training_sample_index[0], PATERNAL) print 'IBD index file %s/chr%d/region-%d.npz, nearest SNP %d' % (ibd._index_dir, ibd._chrom, ibd._start, self.snp) #--------------------------------------------- # Methods #--------------------------------------------- def impute(self, samples=None): '''Infer imputed genotypes at all samples of h from the samples of g.''' # Aliases r = self.ratio_threshold if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Initial g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'Initial h[%d] = %s' % (j, 
repr(self.h[j])) # Create a queue of haplotypes that can be used to impute others. Initially, it # is the training set homozygotes; every time we phase a training set het, its other # allele is appended to the queue. # # TODO: possibly replace by a priority queue where alleles are ordered by their clique sizes? # (we have extra confidence in those alleles; not sure it matters though) q = Queue() # Initial condition: phase all hom training samples hom = self.__phase_hom() for hap in itertools.product(hom, ALLELES): if self.debug: print 'Adding hom haplotype to queue', hap q.put(hap) num_hom_haps = q.qsize() if self.debug: print 'Items on queue : %d' % (q.qsize(),) print 'filled haps : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,) print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,) count = 0 while not q.empty(): # Find group = the IBD clique of an imputed haplotype (all haps that are IBD with it at self.snp) hap = q.get() count += 1 if count > self.max_iter: raise ValueError('Possible imputation bug - exceeded maximum number of iterations') if self.debug: print '*' * 55 print 'Iteration %d, imputing from hap %s' % (count, hap) print '*' * 55 if self.debug_sample >= 0: j = self.debug_sample print 'h[%d] = %s' % (j, repr(self.h[j])) if self.h[j, 0] == 1: pass group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]) if group.size: # if self.debug: print 'group', group s, a = group[:, SAMPLE], group[:, ALLELE] H = self.h[s, a] # print 'H', H # Find haplotypes that have been imputed as allele 1 and those imputed as allele 2 R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)] if self.debug: print 'IBD group %d (%d haps):\n%s' % (self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr(np.concatenate((group, 
H[np.newaxis].transpose()), axis=1))) print 'R1 = %s' % (repr(list(map(tuple, R1))),) print 'R2 = %s' % (repr(list(map(tuple, R2))),) # print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1))) # print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1))) # Majority vote: if there are enough haps with one allele, override the rest. # Otherwise, an unresolved conflict ==> zero everyone out. l1, l2 = len(R1), len(R2) consensus = 1 if l1 >= r * l2 else (2 if l2 >= r * l1 else MISSING) self.h[s, a] = consensus if consensus == 0: # If no consensus is reached, keep the already-imputed values in place, otherwise # we can run into an infinite loop by imputing and erasing h-entries. self.h[R1[:, 0], R1[:, 1]] = 1 self.h[R2[:, 0], R2[:, 1]] = 2 H = self.h[s] if self.debug: print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus) print 'Items on queue : %d' % (q.qsize(),) print 'filled haps : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,) print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,) # Phase training hets (this includes BOTH partially-called = potential hets an # fully-called hets) with one imputed allele i = np.array([self.training_index.has_key(x) for x in s]) si = s[i] G = self.g[map(self.training_index.get, si), :] unphased_hets = np.where(((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING) & (G[:, PATERNAL] != G[:, MATERNAL]))) if unphased_hets[0].size: if self.debug: if count >= num_hom_haps: print 'After hom, items on queue %d' % (q.qsize(),) pass print 'unphased_hets', unphased_hets print 'si', si print 'index i[unphased_hets]', np.where(i)[0][unphased_hets] print 'H_unphased', 
H[i][unphased_hets] print 'G_unphased', G[unphased_hets] H_unphased = H[i][unphased_hets] H_phased = H_unphased.copy() complete_haplotype_partial(H_phased, G[unphased_hets]) # if self.debug: # print 'Phasing hets' # print 'i', np.where(i) # print 'H_unphased', H_unphased # print 'G of unphased_hets', G[unphased_hets] newly_phased_alleles = np.where(H_phased != H_unphased)[1] self.h[si[unphased_hets]] = H_phased[:] # if self.debug: # print 'After phasing H_unphased', self.h[s[unphased_hets]] # print 'unphased_hets', s[unphased_hets] # print 'newly_phased_alleles', newly_phased_alleles # Append the new data we can now make use of to the queue if self.debug: print 'After phasing them' print 'H_phased ', H_phased for hap in zip(si[unphased_hets], newly_phased_alleles): if self.debug: print 'Adding phased het haplotypes to queue', hap q.put(hap) self.__override_training_imputed_by_genotypes() if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Final g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'h[%d] = %s' % (j, repr(self.h[j]))
def recoded_genotype(self):
    '''Return the genotype test set (self.test_called), recoded as a single number per allele
    pair. The recoding is computed on first access and cached on the object afterwards.'''
    cached = self.__recode_single_genotype
    if cached is None:
        # First access: compute once and remember the result
        cached = recode.recode_single_genotype(self.test_called)
        self.__recode_single_genotype = cached
    return cached
def test_orig(self):
    '''Return the original set of deleted test genotypes: the entries of self.h addressed by
    self.test_index, recoded as a single number per allele pair.'''
    index = self.test_index
    snp_part, sample_part = index[SNP], index[SAMPLE]
    # Both alleles of every (SNP, sample) test entry
    original = self.h[snp_part, sample_part, :]
    return recode.recode_single_genotype(original)
def recoded_genotype(self):
    '''Return the recoded (single number per allele pair) form of the genotype test set
    self.test_called. Computed lazily; later calls return the cached array.'''
    if self.__recode_single_genotype is not None:
        return self.__recode_single_genotype
    # Not computed yet: recode once and keep the result for subsequent calls
    recoded = recode.recode_single_genotype(self.test_called)
    self.__recode_single_genotype = recoded
    return self.__recode_single_genotype
def ibs_diff(g, id1, id2):
    '''Return the IBS difference indicator between the genotypes of samples id1 and id2 in g:
    1 where the IBS state is 0, and 0 where it is >= 1. The result is a np.byte array.'''
    first = recode.recode_single_genotype(g[:, id1, :])
    second = recode.recode_single_genotype(g[:, id2, :])
    no_sharing = recode.ibs_state(first, second) == 0
    return no_sharing.astype(np.byte)
def ibs_state(g, id1, id2):
    '''Return the IBS state (IBS=0, 1 or 2) between the genotypes of samples id1 and id2 in g.'''
    # Recode each sample's allele pairs into a single number, in argument order
    recoded = [recode.recode_single_genotype(g[:, sample, :]) for sample in (id1, id2)]
    return recode.ibs_state(*recoded)
def ibs_state(g, id1, id2):
    '''Return the IBS state (IBS=0, 1 or 2) between the genotypes of the two samples id1 and id2
    of the genotype array g.'''
    first = recode.recode_single_genotype(g[:, id1, :])
    second = recode.recode_single_genotype(g[:, id2, :])
    state = recode.ibs_state(first, second)
    return state
def __remove_partial_calls(self):
    '''Set partially-imputed genotypes in self.h to missing (both alleles). A prerequisite for
    loading into PLINK, for instance (but not PLINK-SEQ).'''
    recoded = recode.recode_single_genotype(self.h)
    partially_called = recode.where_partial_called(recoded)
    # Blank out both alleles of every partially-called entry
    self.h[partially_called, :] = MISSING
def impute(ibd, t, snp=None, majority_threshold=0.66, debug=1, samples=None, genotype=None, debug_sample=-1, ibd_sample_index=None, remove_partial_calls=False): '''Main call that separately imputes each SNP in a list of SNPs. ibd = IBD dictionary - a SegmentIndex instance t = ImputationSet instance. Contains training genotypes and imputed genotypes. majority_threshold = Threshold for resolving IBD conflicts via the majority vote samples = samples to impute (if None, all samples are imputed) genotype = true genotype at all samples (non-None for validation studies) ibd_sample_index = optional an array whose ith element is the pedigree sample index corresponding to the ith sample ID in the ID list genotyped_ids. genotyped_ids is a subset of the pedigree sample index set; for sample indices that don''t appear in genotyped_ids, the corresponding output array element is set to -1. ''' # If problem sample indices require translation, augment the t.imputed_data, t.imputed_hap_type arrays # to the #genotyped samples in the IBD segment index; then impute all of them; in the end, restrict the # result to the problem's sample index subset. 
imputed_data, imputed_hap_type = t.imputed_data, t.imputed_hap_type sample_translation = ibd_sample_index is not None if sample_translation: imputed_data = np.zeros((t.num_snps, ibd.num_samples, 2), dtype=np.byte) imputed_data[:, ibd_sample_index, :] = t.imputed_data imputed_hap_type = np.zeros((t.num_snps, ibd.num_samples), dtype=np.byte) imputed_hap_type[:, ibd_sample_index] = t.imputed_hap_type snp = snp if snp is not None else np.arange(len(t.snp)) if debug >= 1: print 'Imputing SNPs' print 'majority_threshold', majority_threshold print 'snp', snp for snp_index in snp: chrom = t.snp['chrom'][snp_index] snp_bp = t.snp['base_pair'][snp_index] if debug >= 1: print '====== SNP %4d (%-22s): chr%-2d:%-9d x=%.2f ======' % \ (snp_index, t.snp['name'][snp_index], chrom, snp_bp, t.snp['dist_cm'][snp_index]) t_start = time.time() _IbdIndexImputer( imputed_data[snp_index], imputed_hap_type[snp_index], ibd, t.training_data[snp_index], t.sample_index, chrom, snp_bp, debug=(debug >= 2), majority_threshold=majority_threshold, debug_sample=debug_sample, remove_partial_calls=remove_partial_calls).impute(samples) t_impute = time.time() - t_start if debug >= 1: result = imputed_data[snp_index] r = recode.recode_single_genotype(result) if sample_translation: r = r[ibd_sample_index] num_fully_called = len(np.where(r > 0)[0]) num_alleles_called = len(result.nonzero()[0]) result_training = result[t.sample_index] num_phased = len(result_training.nonzero()[0]) # Per Rebecca's request: print the list of samples IDs that were not fully called # snp_name = t.snp['name'][snp_index] # for sample in im.examples.sample_index_to_id()[np.where(r <= 0)[0]]: # print '%s,%s' % (snp_name, sample) if genotype is not None: rg = recode.recode_single_genotype(genotype[snp_index]) if debug >= 2: eq = ((r <= 0) | (rg <= 0) | (r == rg)).astype(int) np.set_printoptions(threshold=np.nan) group_index = ibd._group_index[ 0 if ibd. 
test_index else ibd.nearest_left_snp(chrom, snp_bp) - ibd._start] if sample_translation: group_index, result = group_index[ ibd_sample_index], result[ibd_sample_index] all_haps = np.concatenate( ( np.arange(t.num_samples)[np.newaxis].transpose(), genotype[snp_index], result.tolist(), group_index # ,eq[np.newaxis].transpose( ), axis=1) print 'All (true vs. imputed):' print all_haps print 'Discordant (true vs. imputed):' discordant = all_haps[np.where(eq == 0)[0]] print discordant bad_paternal = np.where( discordant[:, 1] != discordant[:, 3])[0] bad_maternal = np.where( discordant[:, 2] != discordant[:, 4])[0] print 'bad_paternal', bad_paternal print 'bad_maternal', bad_maternal # dis = compressed form of haplotype report obtained from the discordant array dis = np.zeros((discordant.shape[0], 3), dtype=np.uint) if bad_paternal.size: dis[bad_paternal, :] = discordant[ bad_paternal][:, np.array([0, 1, 5])] if bad_maternal.size: dis[bad_maternal, :] = discordant[ bad_maternal][:, np.array([0, 2, 6])] dis = np.concatenate( (dis, np.zeros((dis.shape[0], 1), dtype=np.uint)), axis=1) if bad_paternal.size: dis[bad_paternal, 3] = PATERNAL dis[bad_maternal, 3] = MATERNAL dis = dis[:, np.array([0, 3, 1, 2])] bad_groups = np.array(Counter(dis[:, 3]).items(), dtype=np.uint) if bad_groups.size: bad_groups = bad_groups[np.lexsort( (bad_groups[:, 0], -bad_groups[:, 1]))] print 'bad_group id, #discordances\n', bad_groups G, H = t.training_data[snp_index], result[t.sample_index] changed = np.where((H[:, 0] != MISSING) & (H[:, 1] != MISSING) & (G[:, 0] != MISSING) & (G[:, 1] != MISSING) & (H[:, 0] + H[:, 1] != G[:, 0] + G[:, 1]))[0] print 'Time %6.2f s' % (t_impute, ) print 'Call rate allele %6.2f%% (%d/%d)' % \ ((100.0 * num_alleles_called) / result.size, num_alleles_called, result.size) print 'Call rate genotype %6.2f%% (%d/%d)' % \ ((100.0 * num_fully_called) / result.shape[0], num_fully_called, result.shape[0]) print 'Phased training %6.2f%% (%d/%d)' % ( (100. 
* num_phased) / result_training.size, num_phased, result_training.size) if genotype is not None: c, con, dis = recode.concordance_recoded(r, rg) print 'Concordance %6.2f%% (%d/%d)' % (100. * c, con, con + dis) if changed.size: print 'Changed training %6.2f%% (%d/%d) %s' % ( (100. * len(changed)) / len(t.sample_index), len(changed), len(t.sample_index), repr(t.sample_index[changed])[6:-1]) # Restrict imputed results to problem's sample index subset if sample_translation: t.imputed_data = imputed_data[:, ibd_sample_index, :] t.imputed_hap_type = imputed_hap_type[:, ibd_sample_index]
def impute(self, samples=None): '''Infer imputed genotypes at all samples of h from the samples of g.''' # Aliases r = self.ratio_threshold if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Initial g[%d] = %s, h[%d] = %s' % ( self.training_index[j], repr( self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'Initial h[%d] = %s' % (j, repr(self.h[j])) # Create a queue of haplotypes that can be used to impute others. Initially, it # is the training set homozygotes; every time we phase a training set het, its other # allele is appended to the queue. q = Queue() # Initial condition: phase all hom training samples hom = self.__phase_hom() for hap in itertools.product(hom, ALLELES): if self.debug: print 'Adding hom haplotype to queue', hap q.put(hap) num_hom_haps = q.qsize() if self.debug: print 'Items on queue : %d' % (q.qsize(), ) print 'filled haps : %.2f%%' % ( (100. * len(self.h.nonzero()[0])) / self.h.size, ) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ( (100. * len(np.where(HH > 0)[0])) / HH.size, ) print 'phased training: %.2f%%' % ( (100. 
* len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size, ) count = 0 while not q.empty(): # Get the IBD group of an imputed haplotype (all haps that are IBD with it at self.snp) hap = q.get() count += 1 if count > self.max_iter: raise ValueError( 'Possible imputation bug - exceeded max number of iterations' ) if self.debug: print '*' * 55 print 'Iteration %d, imputing from hap %s' % (count, hap) print '*' * 55 if self.debug_sample >= 0: j = self.debug_sample print 'h[%d] = %s' % (j, repr(self.h[j])) if self.h[j, 0] == 1: pass group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]) if group.size: # if self.debug: print 'group', group s, a = group[:, SAMPLE], group[:, ALLELE] H = self.h[s, a] # print 'H', H # Find haplotypes that have been imputed with each of the alleles R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)] if self.debug: print 'IBD group %d (%d haps):\n%s' % ( self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr( np.concatenate( (group, H[np.newaxis].transpose()), axis=1))) print 'R1 = %s' % (repr(list(map(tuple, R1))), ) print 'R2 = %s' % (repr(list(map(tuple, R2))), ) # print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1))) # print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1))) # Majority vote: if there are enough haps with one allele, override the rest. # Otherwise, an unresolved conflict ==> zero everyone out. l1, l2 = len(R1), len(R2) consensus = 1 if l1 >= r * l2 else ( 2 if l2 >= r * l1 else MISSING) self.h[s, a] = consensus if consensus == 0: # If no consensus is reached, keep the already-imputed values in place, otherwise # we can run into an infinite loop by imputing and erasing h-entries. 
self.h[R1[:, 0], R1[:, 1]] = 1 self.h[R2[:, 0], R2[:, 1]] = 2 H = self.h[s] if self.debug: print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus) print 'Items on queue : %d' % (q.qsize(), ) print 'filled haps : %.2f%%' % ( (100. * len(self.h.nonzero()[0])) / self.h.size, ) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ( (100. * len(np.where(HH > 0)[0])) / HH.size, ) print 'phased training: %.2f%%' % ( (100. * len( self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size, ) # Phase training hets (this includes BOTH partially-called = potential hets an # fully-called hets) with one imputed allele i = np.array([self.training_index.has_key(x) for x in s]) si = s[i] G = self.g[map(self.training_index.get, si), :] unphased_hets = np.where( ((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING) & (G[:, PATERNAL] != G[:, MATERNAL]))) if unphased_hets[0].size: if self.debug: if count >= num_hom_haps: print 'After hom, items on queue %d' % ( q.qsize(), ) pass print 'unphased_hets', unphased_hets print 'si', si print 'index i[unphased_hets]', np.where( i)[0][unphased_hets] print 'H_unphased', H[i][unphased_hets] print 'G_unphased', G[unphased_hets] H_unphased = H[i][unphased_hets] H_phased = H_unphased.copy() complete_haplotype_partial(H_phased, G[unphased_hets]) # if self.debug: # print 'Phasing hets' # print 'i', np.where(i) # print 'H_unphased', H_unphased # print 'G of unphased_hets', G[unphased_hets] newly_phased_alleles = np.where(H_phased != H_unphased)[1] self.h[si[unphased_hets]] = H_phased[:] # if self.debug: # print 'After phasing H_unphased', self.h[s[unphased_hets]] # print 'unphased_hets', s[unphased_hets] # print 'newly_phased_alleles', newly_phased_alleles # Append the new data we can now make use of to the queue if self.debug: print 'After phasing them' print 'H_phased ', H_phased for hap in zip(si[unphased_hets], 
newly_phased_alleles): if self.debug: print 'Adding phased het haplotypes to queue', hap q.put(hap) self.__override_training_imputed_by_genotypes() if self.remove_partial_calls: self.__remove_partial_calls() if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Final g[%d] = %s, h[%d] = %s' % ( self.training_index[j], repr( self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'h[%d] = %s' % (j, repr(self.h[j]))
def ibs_diff(g, id1, id2):
    '''Return the IBS difference indicator between the genotypes of the two samples id1 and id2
    of the genotype array g, as a np.byte array: 0 if IBS >= 1, 1 if IBS = 0.'''
    # Recode each sample's allele pairs into a single number, in argument order
    recoded = [recode.recode_single_genotype(g[:, sample, :]) for sample in (id1, id2)]
    state = recode.ibs_state(*recoded)
    return (state == 0).astype(np.byte)