def concatenate_segments(segments):
    '''Merge several segment collections into a single SegmentSet.

    segments - iterable (e.g. a generator expression) of objects that can be iterated
               to obtain their segments and that expose an ``errors`` attribute.

    Returns a SegmentSet built from the list of all segments, in input order, and from the
    errors accumulated with gt.concatenate_errors(), starting from gt.empty_errors_array().'''
    merged_segments = []
    merged_errors = gt.empty_errors_array()
    for segment_set in segments:
        # Append the members of this collection, then fold in its errors
        merged_segments.extend(segment_set)
        merged_errors = gt.concatenate_errors(merged_errors, segment_set.errors)
    return SegmentSet(merged_segments, merged_errors)
def phase_at_snp_using_training_set(snp_index, ibd, h, t, training_sample_set): '''Phase using IBD segments containing the training_sample_set.''' errors = empty_errors_array() bp = t.snp['base_pair'][snp_index] segments = im.imputation.ibd_lookup.segments_ibd_at_bp(ibd, (bp, bp), samples=training_sample_set) for segment in segments: s = np.array(list(segment.samples)) hap_ids = np.array([x[0] for x in segment.samples]) i = np.in1d(hap_ids, training_sample).nonzero()[0] (i1, i2) = s[i].transpose() alleles = h[snp_index, i1, i2] known = alleles.nonzero()[0] if known.size: # There exist phased training samples hap_ids = hap_ids[i[known]] alleles = alleles[known] if np.diff(alleles).nonzero()[0]: # Contradicting alleles, flag errors print 'Error: contradicting alleles at snp index', snp_index, 'hap_ids', hap_ids, 'alleles', alleles e = np.zeros((2, len(hap_ids)), dtype=np.uint) e[0, :] = snp_index e[1, :] = hap_ids errors = np.concatenate((errors, e), axis=1) elif alleles.size: # Copy known alleles to all IBD haplotypes' h-entries (i1, i2) = s.transpose() print 'phasing at i1', i1, 'i2', i2, 'allele value', alleles[0] h[snp_index, i1, i2] = alleles[0] return errors
def phase_at_snp_using_training_set(snp_index, ibd, h, t, training_sample_set): '''Phase using IBD segments containing the training_sample_set.''' errors = empty_errors_array() bp = t.snp['base_pair'][snp_index] segments = im.imputation.ibd_lookup.segments_ibd_at_bp( ibd, (bp, bp), samples=training_sample_set) for segment in segments: s = np.array(list(segment.samples)) hap_ids = np.array([x[0] for x in segment.samples]) i = np.in1d(hap_ids, training_sample).nonzero()[0] (i1, i2) = s[i].transpose() alleles = h[snp_index, i1, i2] known = alleles.nonzero()[0] if known.size: # There exist phased training samples hap_ids = hap_ids[i[known]] alleles = alleles[known] if np.diff(alleles).nonzero()[0]: # Contradicting alleles, flag errors print 'Error: contradicting alleles at snp index', snp_index, 'hap_ids', hap_ids, 'alleles', alleles e = np.zeros((2, len(hap_ids)), dtype=np.uint) e[0, :] = snp_index e[1, :] = hap_ids errors = np.concatenate((errors, e), axis=1) elif alleles.size: # Copy known alleles to all IBD haplotypes' h-entries (i1, i2) = s.transpose() print 'phasing at i1', i1, 'i2', i2, 'allele value', alleles[0] h[snp_index, i1, i2] = alleles[0] return errors
def concatenate_segments(segments):
    '''Merge several segment collections into a single SegmentSet.

    segments - iterable (e.g. a generator expression) of objects that can be iterated
               to obtain their segments and that expose an ``errors`` attribute.

    Returns a SegmentSet built from the list of all segments, in input order, and from the
    errors accumulated with gt.concatenate_errors(), starting from gt.empty_errors_array().'''
    merged_segments = []
    merged_errors = gt.empty_errors_array()
    for segment_set in segments:
        # Append the members of this collection, then fold in its errors
        merged_segments.extend(segment_set)
        merged_errors = gt.concatenate_errors(merged_errors, segment_set.errors)
    return SegmentSet(merged_segments, merged_errors)
def genotype_ibs_segments(genotype, id1, id2, snps, error_filter='median', error_filter_length=5, margin=0.0, min_ibs_len_snp=400, debug=False): '''Return Identical-by-State (IBS >= 1) segments between two genoypes of samples id1 and id2 in the SNP range [snp[0],snp[1]) (if snp is a tuple) or the subset of SNPs, if snps is an array. See ibs_segments() for a description of optional parameters.''' num_snps = genotype.num_snps g = genotype.data g1 = recode.recode_single_genotype(g[snps, id1, :]) g2 = recode.recode_single_genotype(g[snps, id2, :]) d = (recode.ibs_state(g1, g2) == 0).astype(np.byte) # Consider informative or the specified SNPs only filtered_diff = filter_diff(d, error_filter, error_filter_length) error_snps = snps[np.nonzero(d - filtered_diff)[0]] # Detect edges as non-zero gradient points; output sufficiently long segments if np.size(filtered_diff) == 0: # No data to consider ==> no IBD intervals can be identified segments = [] else: # Convert recombination locations to segments of no recombination; filter short segments bp = genotype.snp['base_pair'] #print segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp) segments = [ Segment(((x[0], x[1])), [id1, id2], (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)), error_snps=segment.in_segment(error_snps, x), collapse_to_set=False) for x in segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp) ] # Cut segment margins if margin >= constants.SMALL_FLOAT: segments = [ s for s in (s.middle_part( genotype.nearest_snp, bp, margin, collapse_to_set=False) for s in segments) if s ] # Restrict errors to those inside segments segment_set = SegmentSet(segments, np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)), np.array([id1, id2])), dtype=int) \ if segments else gt.empty_errors_array()) if debug: print 'ibs_segments()', segment_set print 'errors', segment_set.errors return segment_set
def test_ibd_segments_ibdld(self):
    '''Calculate IBD segments in a nuclear family using IBDLD.

    Builds an IBDLD segment computer for samples 2 and 8 on chromosome 22 from the
    FAMILY7 '.ibd' file, groups its segments, and checks the grouped result and that
    no errors are reported.'''
    ibd_ld = im.ibdld.ibd_ld
    segment_cache = ibd_ld.IbdSegmentGlobalCacheIbdld(itu.FAMILY7 + '.ibd')
    segment_computer = ibd_ld.IbdSegmentComputerIbdld(
        segment_cache, self.haplotype,
        chrom=22,
        sample_id=self.problem.pedigree.sample_id,
        samples=[2, 8],
        threshold=0.9,
        params=PhaseParam())
    segment_set = segment.break_and_group_segments(segment_computer.segments)

    expected_segments = [((38, 2849), [], ((8, 1), (2, 1)))]
    assert_segments_almost_equal(segment_set, expected_segments,
                                 full_data=False, decimal=3,
                                 err_msg='Wrong grouped IBDLD IBD segments')
    assert_equal(segment_set.errors, empty_errors_array(),
                 'IBDLD does not support errors but they are output?!')
def _find_errors(problem, snps, haps, consensus, common, hh, min_consensus_samples):
    '''Locate new errors among the haplotypes hh relative to the consensus 'common'.

    Errors are only searched for when consensus == 'majority' and hh has at least
    min_consensus_samples rows; otherwise gt.empty_errors_array() is returned.

    When _find_new_errors() reports a non-empty result, it is converted to original
    coordinates and returned as the tuple (snps[...], haps[..., 0]), i.e. (SNP, sample).
    An empty result from _find_new_errors() is returned unchanged.'''
    # The current error status is always fetched first, even if it ends up unused
    current_status = _get_current_error_status(problem, haps, snps)

    majority_applicable = consensus == 'majority' and hh.shape[0] >= min_consensus_samples
    if not majority_applicable:
        return gt.empty_errors_array()

    new_errors = _find_new_errors(common, hh, current_status)
    if new_errors[0].size:
        # Convert to original coordinates: (SNP, sample)
        return (snps[new_errors[1]], haps[new_errors[0], 0])
    return new_errors
def _find_errors(problem, snps, haps, consensus, common, hh, min_consensus_samples):
    '''Locate new errors among the haplotypes hh relative to the consensus 'common'.

    Errors are only searched for when consensus == 'majority' and hh has at least
    min_consensus_samples rows; otherwise gt.empty_errors_array() is returned.

    When _find_new_errors() reports a non-empty result, it is converted to original
    coordinates and returned as the tuple (snps[...], haps[..., 0]), i.e. (SNP, sample).
    An empty result from _find_new_errors() is returned unchanged.'''
    # The current error status is always fetched first, even if it ends up unused
    current_status = _get_current_error_status(problem, haps, snps)

    majority_applicable = consensus == 'majority' and hh.shape[0] >= min_consensus_samples
    if not majority_applicable:
        return gt.empty_errors_array()

    new_errors = _find_new_errors(common, hh, current_status)
    if new_errors[0].size:
        # Convert to original coordinates: (SNP, sample)
        return (snps[new_errors[1]], haps[new_errors[0], 0])
    return new_errors
def genotype_ibs_segments(genotype, id1, id2, snps, error_filter='median', error_filter_length=5, margin=0.0, min_ibs_len_snp=400, debug=False): '''Return Identical-by-State (IBS >= 1) segments between two genoypes of samples id1 and id2 in the SNP range [snp[0],snp[1]) (if snp is a tuple) or the subset of SNPs, if snps is an array. See ibs_segments() for a description of optional parameters.''' num_snps = genotype.num_snps g = genotype.data g1 = recode.recode_single_genotype(g[snps, id1, :]) g2 = recode.recode_single_genotype(g[snps, id2, :]) d = (recode.ibs_state(g1, g2) == 0).astype(np.byte) # Consider informative or the specified SNPs only filtered_diff = filter_diff(d, error_filter, error_filter_length) error_snps = snps[np.nonzero(d - filtered_diff)[0]] # Detect edges as non-zero gradient points; output sufficiently long segments if np.size(filtered_diff) == 0: # No data to consider ==> no IBD intervals can be identified segments = [] else: # Convert recombination locations to segments of no recombination; filter short segments bp = genotype.snp['base_pair'] #print segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp) segments = [Segment(((x[0], x[1])), [id1, id2], (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)), error_snps=segment.in_segment(error_snps, x), collapse_to_set=False) for x in segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)] # Cut segment margins if margin >= constants.SMALL_FLOAT: segments = [s for s in (s.middle_part(genotype.nearest_snp, bp, margin, collapse_to_set=False) for s in segments) if s] # Restrict errors to those inside segments segment_set = SegmentSet(segments, np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)), np.array([id1, id2])), dtype=int) \ if segments else gt.empty_errors_array()) if debug: print 'ibs_segments()', segment_set print 'errors', segment_set.errors return segment_set
def ibs_segments(haplotype, id1, id2, hap1_type, hap2_type, snps=None, include_alt_phase=False, error_filter='median', error_filter_length=5, length_bound=None, min_segment_length=INDETERMINATE, margin=0.0, debug=False): '''Return 1) Identical-by-State (IBS) segments separated by recombination events between two sample haplotypes (id1, hap1_type) and (id2, hap2_type). The 2-D output array's ith row format is (segment_start, segment_stop), (id1, hap1), (id2, hap2), (segment_start_bp, segment_stop_bp, segment_length_in_bp, num_errors_in_segment) The SNP range is [segment_start, segment_stop) where start=inclusive and stop is exclusive. 2) List of het_snp indices at which there are likely genotype errors. Options: snps - list of SNPs to base the comparison on. For parent-child comparisons, these should be heterozygous SNPs in the parent's genotype, distinguishing its haplotypes and used to locate segments. For unphased-phased individuals, these should be the list of homozygous SNPs at the unphased individual (those that have data). If not specified, all SNPs are used. length_bound - minimum segment length bound type: None: no lower bound enforced 'base_pair': output segments of at least min_segment_length [base pair] 'snp': output segments of at least min_segment_length consecutive SNPs out of the snps list. This is useful only if snps includes all SNPs (or is None) *NOTE*: min_segment_length''s units are interpreted differently depending on length_bound. 
margin = fraction of segment to discard near the endpoints (margin/2 is removed from each side).''' if debug: print 'Computing IBD segments between haplotypes (%d,%d), (%d,%d); filter %s length %d' % \ (id1, hap1_type, id2, hap2_type, error_filter, error_filter_length) d = diff.all_diffs(haplotype.data, id1, id2, hap1_type=hap1_type, hap2_type=hap2_type)[0] # Segment length, as defined by the input parameters segment_length = lambda f: np.inf if not length_bound else ( f.length if length_bound == 'base_pair' else f.num_snps) # @UnusedVariable # Consider informative or the specified SNPs only snps = snps if snps is not None else haplotype.snp_range snps = np.intersect1d(snps, np.where(d != INDETERMINATE)[0]) d_snps = d[snps] filtered_diff = filter_diff(d_snps, error_filter, error_filter_length) error_snps = snps[np.nonzero(d_snps - filtered_diff)[0]] # Detect edges as non-zero gradient points; output sufficiently long segments bp = haplotype.snp['base_pair'] num_snps = haplotype.num_snps if np.size(filtered_diff) == 0: # No data to consider ==> no IBD intervals can be identified segments = [] else: deriv = ndimage.convolve(filtered_diff, [1, -1]) edge = np.where(deriv != 0)[0] initial_phase = hap1_type if filtered_diff[0] == 0 else 1 - hap1_type if debug: print 'initial_phase', initial_phase # , 'edge', edge # Convert recombination locations to segments of no recombination; filter short segments segments = [ f for f in ( Segment(((x[0], x[1])), set(((id1, x[2]), (id2, hap2_type))), ( bp[x[0]], segment.stop_bp(bp, x[1], num_snps)), error_snps=segment.in_segment(error_snps, x)) for x in segment.edges_to_segments( snps, edge, initial_phase, haplotype.num_snps, hap1_type)) if segment_length(f) >= min_segment_length ] # Cut segment margins if margin >= constants.SMALL_FLOAT: segments = [ s for s in (s.middle_part(haplotype.nearest_snp, bp, margin) for s in segments) if s ] # Restrict errors to those inside segments segment_set = SegmentSet(segments, 
np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)), np.array([id1, id2])), dtype=int) \ if segments else gt.empty_errors_array()) if debug: print 'ibs_segments()', segment_set print 'errors', segment_set.errors return segment_set
# Allocate imputed data structures imputed = im.factory.GenotypeFactory.new_instance('haplotype', np.zeros((t.num_snps, pedigree.num_genotyped, 2), dtype=np.byte), t.snp, t.sample_id) h = imputed.data # Phase all homozygous trainees; place in corresponding locations in h hom = np.where(im.gt.is_homozygous(tg)[:, :]) h[hom[0], training_sample[hom[1]], :] = tg[hom] # Phase training set. For each SNP: # - Find all segments intersecting the SNP # - Construct global IBD segments-sets. This is currently a wasteful implementation. TODO: replace ibd by # the global IBD dictionary to speed this part up # - Copy known alleles to # - If known alleles differ, mark errors (at all alleles for now; TODO: use instead majority vote to find a single error among multiple alleles) errors = empty_errors_array() for snp_index in xrange(t.num_snps): chrom = t.snp['chrom'][snp_index] print '====== SNP %d: chr%d:%d, %s ======' % (snp_index, chrom, t.snp['base_pair'][snp_index], t.snp['name'][snp_index]) ibd_chrom = ibd[chrom - 1] # Phase using homozygous training samples errors = merge_errors(errors, phase_at_snp_using_training_set(snp_index, ibd_chrom, h, t, training_sample_set)) # Phase using het training samples that now have one allele determined hh = h[snp_index, training_sample, :] het_to_phase = training_sample[np.where(((hh[:, 0] == MISSING) ^ (hh[:, 1] == MISSING)) & (tg[snp_index, :, 0] != MISSING) & (tg[snp_index, :, 1] != MISSING))[0]] errors = merge_errors(errors, phase_at_snp_using_training_set(snp_index, ibd_chrom, h, t, het_to_phase))
'haplotype', np.zeros((t.num_snps, pedigree.num_genotyped, 2), dtype=np.byte), t.snp, t.sample_id) h = imputed.data # Phase all homozygous trainees; place in corresponding locations in h hom = np.where(im.gt.is_homozygous(tg)[:, :]) h[hom[0], training_sample[hom[1]], :] = tg[hom] # Phase training set. For each SNP: # - Find all segments intersecting the SNP # - Construct global IBD segments-sets. This is currently a wasteful implementation. TODO: replace ibd by # the global IBD dictionary to speed this part up # - Copy known alleles to # - If known alleles differ, mark errors (at all alleles for now; TODO: use instead majority vote to find a single error among multiple alleles) errors = empty_errors_array() for snp_index in xrange(t.num_snps): chrom = t.snp['chrom'][snp_index] print '====== SNP %d: chr%d:%d, %s ======' % ( snp_index, chrom, t.snp['base_pair'][snp_index], t.snp['name'][snp_index]) ibd_chrom = ibd[chrom - 1] # Phase using homozygous training samples errors = merge_errors( errors, phase_at_snp_using_training_set(snp_index, ibd_chrom, h, t, training_sample_set)) # Phase using het training samples that now have one allele determined hh = h[snp_index, training_sample, :] het_to_phase = training_sample[ np.where(((hh[:, 0] == MISSING) ^ (hh[:, 1] == MISSING))
def ibs_segments(haplotype, id1, id2, hap1_type, hap2_type, snps=None, include_alt_phase=False, error_filter='median', error_filter_length=5, length_bound=None, min_segment_length=INDETERMINATE, margin=0.0, debug=False): '''Return 1) Identical-by-State (IBS) segments separated by recombination events between two sample haplotypes (id1, hap1_type) and (id2, hap2_type). The 2-D output array's ith row format is (segment_start, segment_stop), (id1, hap1), (id2, hap2), (segment_start_bp, segment_stop_bp, segment_length_in_bp, num_errors_in_segment) The SNP range is [segment_start, segment_stop) where start=inclusive and stop is exclusive. 2) List of het_snp indices at which there are likely genotype errors. Options: snps - list of SNPs to base the comparison on. For parent-child comparisons, these should be heterozygous SNPs in the parent's genotype, distinguishing its haplotypes and used to locate segments. For unphased-phased individuals, these should be the list of homozygous SNPs at the unphased individual (those that have data). If not specified, all SNPs are used. length_bound - minimum segment length bound type: None: no lower bound enforced 'base_pair': output segments of at least min_segment_length [base pair] 'snp': output segments of at least min_segment_length consecutive SNPs out of the snps list. This is useful only if snps includes all SNPs (or is None) *NOTE*: min_segment_length''s units are interpreted differently depending on length_bound. 
margin = fraction of segment to discard near the endpoints (margin/2 is removed from each side).''' if debug: print 'Computing IBD segments between haplotypes (%d,%d), (%d,%d); filter %s length %d' % \ (id1, hap1_type, id2, hap2_type, error_filter, error_filter_length) d = diff.all_diffs(haplotype.data, id1, id2, hap1_type=hap1_type, hap2_type=hap2_type)[0] # Segment length, as defined by the input parameters segment_length = lambda f: np.inf if not length_bound else (f.length if length_bound == 'base_pair' else f.num_snps) # @UnusedVariable # Consider informative or the specified SNPs only snps = snps if snps is not None else haplotype.snp_range snps = np.intersect1d(snps, np.where(d != INDETERMINATE)[0]) d_snps = d[snps] filtered_diff = filter_diff(d_snps, error_filter, error_filter_length) error_snps = snps[np.nonzero(d_snps - filtered_diff)[0]] # Detect edges as non-zero gradient points; output sufficiently long segments bp = haplotype.snp['base_pair'] num_snps = haplotype.num_snps if np.size(filtered_diff) == 0: # No data to consider ==> no IBD intervals can be identified segments = [] else: deriv = ndimage.convolve(filtered_diff, [1, -1]) edge = np.where(deriv != 0)[0] initial_phase = hap1_type if filtered_diff[0] == 0 else 1 - hap1_type if debug: print 'initial_phase', initial_phase # , 'edge', edge # Convert recombination locations to segments of no recombination; filter short segments segments = [f for f in (Segment(((x[0], x[1])), set(((id1, x[2]), (id2, hap2_type))), (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)), error_snps=segment.in_segment(error_snps, x)) for x in segment.edges_to_segments(snps, edge, initial_phase, haplotype.num_snps, hap1_type)) if segment_length(f) >= min_segment_length] # Cut segment margins if margin >= constants.SMALL_FLOAT: segments = [s for s in (s.middle_part(haplotype.nearest_snp, bp, margin) for s in segments) if s] # Restrict errors to those inside segments segment_set = SegmentSet(segments, 
np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)), np.array([id1, id2])), dtype=int) \ if segments else gt.empty_errors_array()) if debug: print 'ibs_segments()', segment_set print 'errors', segment_set.errors return segment_set
def __init__(self, segments=None, errors=None):
    '''Initialize a segment set.

    segments - forwarded unchanged to SegmentComposite.__init__().
    errors   - errors array to attach to this set. If omitted (None), a fresh array from
               empty_errors_array() is used.'''
    SegmentComposite.__init__(self, segments)
    # A default written as errors=empty_errors_array() in the signature is evaluated only once,
    # when the function is defined, so every instance created without an explicit 'errors'
    # would share the same array object. Create the default per instance instead.
    self.errors = empty_errors_array() if errors is None else errors