def _test_complete_genotype_partial(self, h, h_expected):
    """Check completion of one haplotype against every possible genotype.

    The haplotype h (with one known entry) is replicated once per genotype;
    the genotypes are all 9 ordered pairs of the allele codes 0, 1, 2.
    complete_haplotype_partial is expected to fill the replicated rows in
    place, and the result is compared with h_expected.
    """
    allele_codes = xrange(3)
    # One row per (first, second) genotype combination, in product order
    g = np.array([pair for pair in it.product(allele_codes, allele_codes)])
    # Fresh copy of h for each genotype row
    h_rows = np.tile(h, (9, 1)).copy()
    gt.complete_haplotype_partial(h_rows, g)
    assert_equal(h_rows, h_expected, "Unexpected haplotype completion")
def _test_complete_genotype_partial(self, h, h_expected):
    '''Exhaustively test completing a single haplotype h (one known entry)
    against all genotypes g.

    g enumerates the 3 x 3 ordered pairs of allele codes 0..2. h is stacked
    once per genotype row, passed through complete_haplotype_partial (which is
    expected to modify the stacked array in place), and the outcome must equal
    h_expected.'''
    genotypes = np.array(list(it.product(xrange(3), repeat=2)))
    # One private copy of h per genotype row
    completed = np.tile(h, (len(genotypes), 1)).copy()
    gt.complete_haplotype_partial(completed, genotypes)
    assert_equal(completed, h_expected, 'Unexpected haplotype completion')
def impute(self, samples=None): '''Infer imputed genotypes at all samples of h from the samples of g.''' # Aliases r = self.ratio_threshold if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Initial g[%d] = %s, h[%d] = %s' % ( self.training_index[j], repr( self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'Initial h[%d] = %s' % (j, repr(self.h[j])) # Create a queue of haplotypes that can be used to impute others. Initially, it # is the training set homozygotes; every time we phase a training set het, its other # allele is appended to the queue. q = Queue() # Initial condition: phase all hom training samples hom = self.__phase_hom() for hap in itertools.product(hom, ALLELES): if self.debug: print 'Adding hom haplotype to queue', hap q.put(hap) num_hom_haps = q.qsize() if self.debug: print 'Items on queue : %d' % (q.qsize(), ) print 'filled haps : %.2f%%' % ( (100. * len(self.h.nonzero()[0])) / self.h.size, ) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ( (100. * len(np.where(HH > 0)[0])) / HH.size, ) print 'phased training: %.2f%%' % ( (100. 
* len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size, ) count = 0 while not q.empty(): # Get the IBD group of an imputed haplotype (all haps that are IBD with it at self.snp) hap = q.get() count += 1 if count > self.max_iter: raise ValueError( 'Possible imputation bug - exceeded max number of iterations' ) if self.debug: print '*' * 55 print 'Iteration %d, imputing from hap %s' % (count, hap) print '*' * 55 if self.debug_sample >= 0: j = self.debug_sample print 'h[%d] = %s' % (j, repr(self.h[j])) if self.h[j, 0] == 1: pass group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]) if group.size: # if self.debug: print 'group', group s, a = group[:, SAMPLE], group[:, ALLELE] H = self.h[s, a] # print 'H', H # Find haplotypes that have been imputed with each of the alleles R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)] if self.debug: print 'IBD group %d (%d haps):\n%s' % ( self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr( np.concatenate( (group, H[np.newaxis].transpose()), axis=1))) print 'R1 = %s' % (repr(list(map(tuple, R1))), ) print 'R2 = %s' % (repr(list(map(tuple, R2))), ) # print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1))) # print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1))) # Majority vote: if there are enough haps with one allele, override the rest. # Otherwise, an unresolved conflict ==> zero everyone out. l1, l2 = len(R1), len(R2) consensus = 1 if l1 >= r * l2 else ( 2 if l2 >= r * l1 else MISSING) self.h[s, a] = consensus if consensus == 0: # If no consensus is reached, keep the already-imputed values in place, otherwise # we can run into an infinite loop by imputing and erasing h-entries. 
self.h[R1[:, 0], R1[:, 1]] = 1 self.h[R2[:, 0], R2[:, 1]] = 2 H = self.h[s] if self.debug: print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus) print 'Items on queue : %d' % (q.qsize(), ) print 'filled haps : %.2f%%' % ( (100. * len(self.h.nonzero()[0])) / self.h.size, ) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ( (100. * len(np.where(HH > 0)[0])) / HH.size, ) print 'phased training: %.2f%%' % ( (100. * len( self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size, ) # Phase training hets (this includes BOTH partially-called = potential hets an # fully-called hets) with one imputed allele i = np.array([self.training_index.has_key(x) for x in s]) si = s[i] G = self.g[map(self.training_index.get, si), :] unphased_hets = np.where( ((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING) & (G[:, PATERNAL] != G[:, MATERNAL]))) if unphased_hets[0].size: if self.debug: if count >= num_hom_haps: print 'After hom, items on queue %d' % ( q.qsize(), ) pass print 'unphased_hets', unphased_hets print 'si', si print 'index i[unphased_hets]', np.where( i)[0][unphased_hets] print 'H_unphased', H[i][unphased_hets] print 'G_unphased', G[unphased_hets] H_unphased = H[i][unphased_hets] H_phased = H_unphased.copy() complete_haplotype_partial(H_phased, G[unphased_hets]) # if self.debug: # print 'Phasing hets' # print 'i', np.where(i) # print 'H_unphased', H_unphased # print 'G of unphased_hets', G[unphased_hets] newly_phased_alleles = np.where(H_phased != H_unphased)[1] self.h[si[unphased_hets]] = H_phased[:] # if self.debug: # print 'After phasing H_unphased', self.h[s[unphased_hets]] # print 'unphased_hets', s[unphased_hets] # print 'newly_phased_alleles', newly_phased_alleles # Append the new data we can now make use of to the queue if self.debug: print 'After phasing them' print 'H_phased ', H_phased for hap in zip(si[unphased_hets], 
newly_phased_alleles): if self.debug: print 'Adding phased het haplotypes to queue', hap q.put(hap) self.__override_training_imputed_by_genotypes() if self.remove_partial_calls: self.__remove_partial_calls() if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Final g[%d] = %s, h[%d] = %s' % ( self.training_index[j], repr( self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'h[%d] = %s' % (j, repr(self.h[j]))
class _IbdQc(object): '''Calculates a QC measure for a single variant using IBD cliques.''' #--------------------------------------------- # Constants #--------------------------------------------- __EMPTY_ARRAY = np.array([]) #--------------------------------------------- # Constructors #--------------------------------------------- def __init__(self, h, hap_type, ibd, g, training_sample_index, chrom, snp_bp, debug=False, majority_threshold=0.66, debug_sample= -1, max_iter=1000): '''Initialize an imputer that changes the result h in-place using the IBD index ibd and training genotype data g at snp position snp_bp. majority vote = threshold for majority vote. When |# haps with majority allele| >= majority_threshold*(All haps) in a clique, the vote is accepted.''' # Input fields self.h, self.hap_type, self.g, self.ibd, self.training_sample_index, self.max_iter, self.debug, \ self.debug_sample, self.chrom = h, hap_type, g, ibd, training_sample_index, max_iter, debug, \ debug_sample, chrom # Maps sample ID to training set index self.training_index = dict(zip(self.training_sample_index, xrange(self.g.shape[0]))) self.ratio_threshold = majority_threshold / (1 - majority_threshold) # Find the appropriate IBD index SNP for the target base-pair position self.snp = ibd.nearest_left_snp(chrom, snp_bp) if self.debug: ibd.find(self.chrom, self.snp, self.training_sample_index[0], PATERNAL) print 'IBD index file %s/chr%d/region-%d.npz, nearest SNP %d' % (ibd._index_dir, ibd._chrom, ibd._start, self.snp) #--------------------------------------------- # Methods #--------------------------------------------- def impute(self, samples=None): '''Infer imputed genotypes at all samples of h from the samples of g.''' # Aliases r = self.ratio_threshold if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Initial g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'Initial h[%d] = %s' % (j, 
repr(self.h[j])) # Create a queue of haplotypes that can be used to impute others. Initially, it # is the training set homozygotes; every time we phase a training set het, its other # allele is appended to the queue. # # TODO: possibly replace by a priority queue where alleles are ordered by their clique sizes? # (we have extra confidence in those alleles; not sure it matters though) q = Queue() # Initial condition: phase all hom training samples hom = self.__phase_hom() for hap in itertools.product(hom, ALLELES): if self.debug: print 'Adding hom haplotype to queue', hap q.put(hap) num_hom_haps = q.qsize() if self.debug: print 'Items on queue : %d' % (q.qsize(),) print 'filled haps : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,) print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,) count = 0 while not q.empty(): # Find group = the IBD clique of an imputed haplotype (all haps that are IBD with it at self.snp) hap = q.get() count += 1 if count > self.max_iter: raise ValueError('Possible imputation bug - exceeded maximum number of iterations') if self.debug: print '*' * 55 print 'Iteration %d, imputing from hap %s' % (count, hap) print '*' * 55 if self.debug_sample >= 0: j = self.debug_sample print 'h[%d] = %s' % (j, repr(self.h[j])) if self.h[j, 0] == 1: pass group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]) if group.size: # if self.debug: print 'group', group s, a = group[:, SAMPLE], group[:, ALLELE] H = self.h[s, a] # print 'H', H # Find haplotypes that have been imputed as allele 1 and those imputed as allele 2 R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)] if self.debug: print 'IBD group %d (%d haps):\n%s' % (self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr(np.concatenate((group, 
H[np.newaxis].transpose()), axis=1))) print 'R1 = %s' % (repr(list(map(tuple, R1))),) print 'R2 = %s' % (repr(list(map(tuple, R2))),) # print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1))) # print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1))) # Majority vote: if there are enough haps with one allele, override the rest. # Otherwise, an unresolved conflict ==> zero everyone out. l1, l2 = len(R1), len(R2) consensus = 1 if l1 >= r * l2 else (2 if l2 >= r * l1 else MISSING) self.h[s, a] = consensus if consensus == 0: # If no consensus is reached, keep the already-imputed values in place, otherwise # we can run into an infinite loop by imputing and erasing h-entries. self.h[R1[:, 0], R1[:, 1]] = 1 self.h[R2[:, 0], R2[:, 1]] = 2 H = self.h[s] if self.debug: print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus) print 'Items on queue : %d' % (q.qsize(),) print 'filled haps : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,) print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,) # Phase training hets (this includes BOTH partially-called = potential hets an # fully-called hets) with one imputed allele i = np.array([self.training_index.has_key(x) for x in s]) si = s[i] G = self.g[map(self.training_index.get, si), :] unphased_hets = np.where(((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING) & (G[:, PATERNAL] != G[:, MATERNAL]))) if unphased_hets[0].size: if self.debug: if count >= num_hom_haps: print 'After hom, items on queue %d' % (q.qsize(),) pass print 'unphased_hets', unphased_hets print 'si', si print 'index i[unphased_hets]', np.where(i)[0][unphased_hets] print 'H_unphased', 
H[i][unphased_hets] print 'G_unphased', G[unphased_hets] H_unphased = H[i][unphased_hets] H_phased = H_unphased.copy() complete_haplotype_partial(H_phased, G[unphased_hets]) # if self.debug: # print 'Phasing hets' # print 'i', np.where(i) # print 'H_unphased', H_unphased # print 'G of unphased_hets', G[unphased_hets] newly_phased_alleles = np.where(H_phased != H_unphased)[1] self.h[si[unphased_hets]] = H_phased[:] # if self.debug: # print 'After phasing H_unphased', self.h[s[unphased_hets]] # print 'unphased_hets', s[unphased_hets] # print 'newly_phased_alleles', newly_phased_alleles # Append the new data we can now make use of to the queue if self.debug: print 'After phasing them' print 'H_phased ', H_phased for hap in zip(si[unphased_hets], newly_phased_alleles): if self.debug: print 'Adding phased het haplotypes to queue', hap q.put(hap) self.__override_training_imputed_by_genotypes() if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Final g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'h[%d] = %s' % (j, repr(self.h[j]))
def impute(self, samples=None): '''Infer imputed genotypes at all samples of h from the samples of g.''' # Aliases r = self.ratio_threshold if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Initial g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'Initial h[%d] = %s' % (j, repr(self.h[j])) # Create a queue of haplotypes that can be used to impute others. Initially, it # is the training set homozygotes; every time we phase a training set het, its other # allele is appended to the queue. q = Queue() # Initial condition: phase all hom training samples hom = self.__phase_hom() for hap in itertools.product(hom, ALLELES): if self.debug: print 'Adding hom haplotype to queue', hap q.put(hap) num_hom_haps = q.qsize() if self.debug: print 'Items on queue : %d' % (q.qsize(),) print 'filled haps : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,) print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,) count = 0 while not q.empty(): # Get the IBD group of an imputed haplotype (all haps that are IBD with it at self.snp) hap = q.get() count += 1 if count > self.max_iter: raise ValueError('Possible imputation bug - exceeded max number of iterations') if self.debug: print '*' * 55 print 'Iteration %d, imputing from hap %s' % (count, hap) print '*' * 55 if self.debug_sample >= 0: j = self.debug_sample print 'h[%d] = %s' % (j, repr(self.h[j])) if self.h[j, 0] == 1: pass group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]) if group.size: # if self.debug: print 'group', group s, a = group[:, SAMPLE], group[:, ALLELE] H = self.h[s, a] # print 'H', H # Find haplotypes that have been imputed with each of the alleles R1, R2 = group[np.where(H == 1)], 
group[np.where(H == 2)] if self.debug: print 'IBD group %d (%d haps):\n%s' % (self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr(np.concatenate((group, H[np.newaxis].transpose()), axis=1))) print 'R1 = %s' % (repr(list(map(tuple, R1))),) print 'R2 = %s' % (repr(list(map(tuple, R2))),) # print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1))) # print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1))) # Majority vote: if there are enough haps with one allele, override the rest. # Otherwise, an unresolved conflict ==> zero everyone out. l1, l2 = len(R1), len(R2) consensus = 1 if l1 >= r * l2 else (2 if l2 >= r * l1 else MISSING) self.h[s, a] = consensus if consensus == 0: # If no consensus is reached, keep the already-imputed values in place, otherwise # we can run into an infinite loop by imputing and erasing h-entries. self.h[R1[:, 0], R1[:, 1]] = 1 self.h[R2[:, 0], R2[:, 1]] = 2 H = self.h[s] if self.debug: print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus) print 'Items on queue : %d' % (q.qsize(),) print 'filled haps : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,) HH = recode.recode_single_genotype(self.h) print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,) print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,) # Phase training hets (this includes BOTH partially-called = potential hets an # fully-called hets) with one imputed allele i = np.array([self.training_index.has_key(x) for x in s]) si = s[i] G = self.g[map(self.training_index.get, si), :] unphased_hets = np.where(((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING) & (G[:, PATERNAL] != G[:, MATERNAL]))) if unphased_hets[0].size: if self.debug: if count >= num_hom_haps: print 
'After hom, items on queue %d' % (q.qsize(),) pass print 'unphased_hets', unphased_hets print 'si', si print 'index i[unphased_hets]', np.where(i)[0][unphased_hets] print 'H_unphased', H[i][unphased_hets] print 'G_unphased', G[unphased_hets] H_unphased = H[i][unphased_hets] H_phased = H_unphased.copy() complete_haplotype_partial(H_phased, G[unphased_hets]) # if self.debug: # print 'Phasing hets' # print 'i', np.where(i) # print 'H_unphased', H_unphased # print 'G of unphased_hets', G[unphased_hets] newly_phased_alleles = np.where(H_phased != H_unphased)[1] self.h[si[unphased_hets]] = H_phased[:] # if self.debug: # print 'After phasing H_unphased', self.h[s[unphased_hets]] # print 'unphased_hets', s[unphased_hets] # print 'newly_phased_alleles', newly_phased_alleles # Append the new data we can now make use of to the queue if self.debug: print 'After phasing them' print 'H_phased ', H_phased for hap in zip(si[unphased_hets], newly_phased_alleles): if self.debug: print 'Adding phased het haplotypes to queue', hap q.put(hap) self.__override_training_imputed_by_genotypes() if self.remove_partial_calls: self.__remove_partial_calls() if self.debug_sample >= 0: j = self.debug_sample if j in self.training_index: print 'Final g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j])) else: print 'h[%d] = %s' % (j, repr(self.h[j]))