def test_ibd_segments_sib_pair(self):
    '''Check distant IBD segments between the haplotypes of a pair of phased sibs.

    Segments between samples 3 and 2 of self.problem are computed twice with
    im.ih.between_samples_segments() -- first with default arguments (serial
    run), then with num_processes=3 (parallel run). Each result is compared
    against the same expected segment list, which is to be compared with the
    IBD segments based on nuclear family info.'''
    # Segments should contain the segments obtained by the genotype-genotype
    # HMM (kept here for reference, to validate phasing):
    # [((1412, 3218), (32992389, 51156934, 18.165, 1), ((3, 0), (2, 0))),
    #  ((241 , 451), (19643555, 23817486, 4.174, 1), ((2, 0), (3, 1))),
    #  ((0   , 454), (16484792, 23834889, 7.350, 1), ((3, 1), (2, 1))),
    #  ((2650, 3218), (45892433, 51156934, 5.265, 1), ((3, 1), (2, 1)))],
    expected_segments = [
        ((1412, 3217), (32992389, 51156933, 18.165, 0), ((3, 0), (2, 0))),
        ((238, 453), (19581946, 23826675, 4.245, 0), ((2, 0), (3, 1))),
        ((0, 629), (16484792, 25608548, 9.124, 0), ((3, 1), (2, 1))),
        ((2661, 3217), (45972017, 51156933, 5.185, 0), ((3, 1), (2, 1)))
    ]

    # First pass: serial run (no extra arguments).
    # Second pass: parallel run with three processes.
    for extra_args in ({}, {'num_processes': 3}):
        # A fresh parameter object is built for each run.
        params = im.PhaseParam(kinship_file=im.itu.KINSHIP_FILE, debug=False)
        computed = im.ih.between_samples_segments(
            self.problem, [3], [2], params, **extra_args)
        im.itu.assert_segments_almost_equal(
            computed, expected_segments,
            full_data=True, decimal=3, err_msg='Wrong IBD segments')
def write_id_coefs_legacy(p, out_file, params=im.PhaseParam()):
    '''Write the ID coefficients of all genotyped sample pairs of the legacy
    Problem object p to out_file.

    For every ordered pair (x, y) of IDs returned by
    p.pedigree.genotyped_sample_id() (including x == y), params.id_coefs(x, y)
    is looked up and one text row is written: x, y, the first element of the
    result, followed by the entries of its second element.

    The output format has 12 columns (2 integers + 10 floats), so the second
    element is expected to hold 9 values -- presumably the condensed identity
    coefficients; confirm against id_coefs().'''
    sample_ids = p.pedigree.genotyped_sample_id()
    rows = []
    for x, y in itertools.product(sample_ids, sample_ids):
        coefs = params.id_coefs(x, y)
        rows.append((x, y, coefs[0]) + tuple(coefs[1].tolist()))
    np.savetxt(out_file, np.array(rows),
               fmt='%d %d %e %e %e %e %e %e %e %e %e %e')
def problem_ibd_segments(problem, i, ai, j, bj, **kwargs):
    '''Return the IBD segments between haplotypes of samples i and j in problem.

    ai and bj select the allele (haplotype) of sample i and sample j,
    respectively; passing None for either one means "all alleles in
    im.constants.ALLELES". Segments of every selected (i, a), (j, b) haplotype
    pair are computed with im.ih.hap_segments_from_problem() and accumulated
    into a single im.segment.SegmentSet. Keyword arguments are forwarded to
    im.PhaseParam().'''
    alleles_i = im.constants.ALLELES if ai is None else [ai]
    alleles_j = im.constants.ALLELES if bj is None else [bj]

    result = im.segment.SegmentSet()
    for allele_i in alleles_i:
        for allele_j in alleles_j:
            # A new PhaseParam is built per haplotype pair, as the callee
            # receives its own parameter object each time.
            pair_params = im.PhaseParam(**kwargs)
            result += im.ih.hap_segments_from_problem(
                problem, (i, allele_i), (j, allele_j), pair_params)
    return result
def debug_distant_phasing(sample):
    '''Debug distant-phasing of the sample with index ``sample``.

    Loads a hard-coded chr22 cytosnp stage-4 npz problem, runs a phaser chain
    consisting of a single distant phaser restricted to ``sample``, and prints
    the haplotype fill fraction of that sample before and after the run.'''
    p = im.io.read_npz(
        '/home/oren/ober/out/kids/cytosnp/chr22/cytosnp.imputed.stage4.npz')
    phaser = im.phase_core.new_phaser_chain(
        [im.phase_distant.distant_phaser(single_sample=sample)])
    h = p.haplotype
    # Report the sample being phased. The fill fraction used to be printed
    # for sample 0 regardless of the argument, which only matched the phased
    # sample when sample == 0.
    print(h.fill_fraction(sample=sample))
    phaser.run(p, im.PhaseParam())
    print(h.fill_fraction(sample=sample))
def plot_lambda_std(problem, id_coef=im.PhaseParam().id_coef_file):
    '''Scatter-plot the standard deviation of lambda against the mean lambda
    over the children of each family in the problem object ``problem``.

    Lambda of a sample x is taken as the first element of id_coefs(x, x) from
    the ID coefficient DAO opened on ``id_coef``. One point is plotted per
    family returned by problem.families().'''
    dao = __idcoef_dao(id_coef)

    # Sample ID -> lambda, for every sample in the DAO index.
    lam_of = {}
    for x in dao.index:
        lam_of[x] = dao.id_coefs(x, x)[0]

    # Per family: lambda values of its children (None if an ID is not in the DAO).
    child_lam = []
    for family in problem.families():
        child_index = np.array(list(family.children))
        child_ids = problem.pedigree.sample_id[child_index]
        child_lam.append([lam_of.get(sample_id) for sample_id in child_ids])

    stats = np.array([(np.mean(lam), np.std(lam)) for lam in child_lam])

    P.clf()
    # NOTE(review): pyplot.hold() was removed in Matplotlib 3.0 -- confirm the
    # Matplotlib version this is run with.
    P.hold(True)
    P.scatter(stats[:, 0], stats[:, 1])
    P.xlabel('Sibs Mean $\lambda$')
    P.ylabel('Sibs Stddev $\lambda$')
def lambda_vs_f(id_coef=im.PhaseParam().id_coef_file):
    '''Return lambda as a discrete function of the inbreeding coefficient f
    for all samples in the ID coefficient DAO opened on ``id_coef``.

    The result is a 2-column array (f, lambda) with one row per sample,
    restricted to rows with f >= 1e-5.'''

    def inbreeding(d):
        # d[0] is lambda; d[1:] are the entries of id_coefs(x, x)[1].
        # Presumably the bracketed sum is the self-kinship coefficient written
        # in condensed identity coefficients (Delta1, Delta3, Delta5, Delta7,
        # Delta8), so that f = 2 * kinship - 1 -- confirm against id_coefs().
        return 2 * (d[1] + 0.5 * (d[3] + d[5] + d[7]) + 0.25 * d[8]) - 1

    # Read lambda and the Deltas of every sample paired with itself
    dao = __idcoef_dao(id_coef)
    rows = []
    for x in dao.index:
        coefs = dao.id_coefs(x, x)
        rows.append(np.concatenate(([coefs[0]], coefs[1])))
    a = np.array(rows)

    # Calculate f from the Deltas; keep lambda alongside it
    b = np.array([(inbreeding(row), row[0]) for row in a])

    # Remove outliers with f~0
    keep = np.where(b[:, 0] >= 1e-5)[0]
    return b[keep, :]
def test_global_ibd_segments(self):
    '''Regression test for an error that appeared after stage 2: a child had
    the wrong hap color at SNP 2960, in the middle of a long IBD segment.

    The error does not appear in this isolated test; it depends on the
    ordering of processing samples and segments. The test checks the problem
    state before and after running the family phaser on the stage-1 data.'''
    # Window of SNPs around SNP 2960 whose haplotypes are inspected
    snp_window = slice(2955, 2965)

    # Before stage 2: no IBD segments should exist
    problem = im.io.read_npz(im.itu.FAMILY4_STAGE1)
    im.itu.assert_problem_stats(problem, 38616, 35586, 22)
    # Same object is re-checked after phasing, so it must be fetched up front
    ibd = problem.info.ibd
    assert_equal(ibd.length, 0,
                 'Before stage 2: no IBD segments should exist')
    assert_equal(problem.h[snp_window, 1, 0],
                 [2, 2, 2, 2, 2, 2, 2, 2, 2, 0],
                 'Wrong mother haplotype')
    assert_equal(problem.h[snp_window, 4, 1],
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
                 'Wrong child haplotype')

    im.phase_family.family_phaser().run(problem, im.PhaseParam())

    # After: check IBD segment; (1, 0), (4, 1) should be IBD around SNP 2960
    expected_ibd = [
        ((0, 176), (16484792, 18545634, 2.061, 0), ((2, 0), (0, 0))),
        ((187, 2687), (18895227, 46242359, 27.347, 1), ((0, 1), (2, 0))),
        ((2687, 3216), (46336181, 51103692, 4.768, 0), ((2, 0), (0, 0))),
        ((0, 3218), (16484792, 51156933, 34.672, 0), ((0, 1), (3, 0))),
        ((0, 2470), (16484792, 44456692, 27.972, 0), ((0, 0), (4, 0))),
        ((2473, 3216), (44515806, 51103692, 6.588, 1), ((0, 1), (4, 0))),
        ((0, 2985), (16484792, 48709188, 32.224, 0), ((0, 1), (5, 0))),
        ((2993, 3216), (48742097, 51103692, 2.362, 0), ((0, 0), (5, 0))),
        ((0, 3218), (16484792, 51156933, 34.672, 0), ((1, 1), (2, 1))),
        ((4, 36), (17087656, 17434521, 0.347, 0), ((3, 1), (1, 1))),
        ((42, 2035), (17587680, 37662436, 20.075, 1), ((1, 0), (3, 1))),
        ((2047, 3217), (37902926, 51140316, 13.237, 0), ((3, 1), (1, 1))),
        ((11, 804), (17285049, 27091750, 9.807, 0), ((4, 1), (1, 1))),
        ((823, 3217), (27200942, 51140316, 23.939, 0), ((1, 0), (4, 1))),
        ((11, 14), (17285049, 17307742, 0.023, 0), ((5, 1), (1, 0))),
        ((31, 3217), (17415572, 51140316, 33.725, 1), ((5, 1), (1, 1)))
    ]
    assert_segments_almost_equal(ibd, expected_ibd,
                                 decimal=3, err_msg='Wrong IBD segments')
    im.itu.assert_problem_stats(problem, 38616, 38576, 30)
    assert_equal(problem.h[snp_window, 1, 0],
                 [2, 2, 2, 2, 2, 2, 2, 2, 2, 1],
                 'Wrong mother haplotype')
    assert_equal(problem.h[snp_window, 4, 1],
                 [2, 2, 2, 2, 2, 2, 2, 2, 2, 1],
                 'Wrong child haplotype')
def test_entire_pipeline(self):
    '''Run the entire phasing pipeline on self.problem and check the
    resulting problem statistics and genotype fill counts.

    A single set of distant phasing parameters is used (a small number of
    surrogate parents), for speed.'''
    problem = self.problem
    # Fetched before the run; the counts are read from it afterwards
    g = problem.genotype

    # Inject a mock DAO so that we don't need the real ID coef file, which is
    # huge here
    pedigree = problem.pedigree
    pedigree._idcoef_dao = mock_dao.IdCoefDao(pedigree.num_genotyped)

    options = util.Struct(impute=im.phase.IMPUTE_OPTION.IMPUTE_AND_FILL,
                          debug=False,
                          print_times=False,
                          stage=0)
    phaser = im.phase.build_phasing_pipeline(options)
    params = im.PhaseParam(distant_phasing_params=[(0.9, 2, 0.95)])
    im.phase.run_phasing_chain(phaser, problem, params)

    im.itu.assert_problem_stats(problem, 22640, 20225, 144)
    assert_equal(g.num_filled, 22640,
                 'Incorrect number of imputed genotypes')
    assert_equal(
        g.num_missing, 0,
        'Incorrect number of missing genotypes; there should not be any after imputation')
Test IBD clique kinship comparison for parent-of-origin determination. Created on July 2, 2013 @author: Oren Livne <*****@*****.**> ============================================================ ''' import impute as im, time, numpy as np, os ''' -------------------------------------------------- Main program -------------------------------------------------- ''' chrom = 22 # a = im.poo.Aligner(chrom) # print a.debug_sample_snp(1101, 1500) # print a.debug_sample(6) # a.debug_sample_snp(6, 2000) # a.debug_sample_snp(0, 500) # Time POO vs. # processes used for num_processes in [1]:#[2, 4]: start_time = time.time() m = im.poo.determine_poo(chrom, params=im.PhaseParam(debug=True, poo_snp_step_size=100, num_processes=num_processes)) print '#processes=%d, time %.2f' % (num_processes, time.time() - start_time) np.savetxt(os.environ['OBER'] + '/doc/poo/m-chr%d.txt' % (chrom,), m) im.poo.plot_flip_measure_vs_sample(chrom, m)
items = [int(x) for x in line] return tuple(items[0:2]), im.segment.DisjointSegmentSet( zip(items[2::2], items[3::2])) #################################################################################### if __name__ == '__main__': ''' -------------------------------------------------- Main program -------------------------------------------------- ''' options = parse_command_line_args() file_reader = lambda f: csv.reader( open(f, 'rb'), delimiter=' ', skipinitialspace=True) param = im.PhaseParam() try: reader1 = file_reader(options.file1) reader2 = file_reader(options.file2) for i, line in enumerate(reader1): key, A = parse_line(line) key2, B = parse_line(reader2.next()) if key != key2: raise ValueError('Sample pairs are not the same in both files: (%d,%d), (%d,%d) at line %d' % \ key + key2 + (i + 1,)) else: # Output statistics. Only log pairs for which samples were found len_A, len_B = A.length, B.length if len_A != 0 or len_B != 0: f = param.kinship(key[0], key[1]) sys.stdout.write('%d %d %f %d %d %d %d\n' %
Created on August 2, 2013 @author: Oren Livne <*****@*****.**> ============================================================ ''' import impute as im, os, numpy as np, matplotlib.pyplot as P, sys from impute.kids.hutt_kids import get_sib_ids, read_chip_problem, chip_data_set from impute.phasing.phase_trivial import trivial_phaser #################################################################################### if __name__ == '__main__': path = os.environ['OBER_OUT'] + '/kids' chrom = 22 debug = 1 num_processes = 1 # 4 params = im.PhaseParam() sibs = get_sib_ids(path) sib = 842 # Has high imputation call rate but high re-phasing error rate chip = 'cytosnp' p = im.io.read_npz('%s/%s/chr%d/%s.npz' % (path, chip, chrom, chip_data_set(chip))) # Run stage 1 so that we can step through the code and find why SNP 1700 # is phased to 2,1 instead of 1,2 phaser = trivial_phaser() problem = phaser.run( p, im.PhaseParam(selected_samples=np.array([842]), debug=True,
#!/usr/bin/env python
'''
============================================================
Distant-phase the 'hutt.phased.npz' data set, report the
haplotype fill fraction of an unphased sample before and
after the run, and draw the pedigree around that sample
(originally titled: plot the nbhrs1298 pedigree).

Created on August 16, 2012
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im

p = im.hutt('hutt.phased.npz')
s = 1159  # Unphased sample index

# Alternative: restrict the distant phaser to the sample of interest
#phaser = im.phase_core.new_phaser_chain([im.phase_distant.distant_phaser(single_sample=s)])
phaser = im.phase_core.new_phaser_chain([
    im.phase_distant.distant_phaser(phased_fill=0.92,
                                    target_fill=0.95,
                                    max_path_length=7)
])

# Fill fraction of sample s before and after distant phasing
h = p.haplotype
print(h.fill_fraction(sample=s))
phaser.run(p, im.PhaseParam(debug=True))
print(h.fill_fraction(sample=s))

# Draw the pedigree around the same sample that names the output file. The
# sample index used to be repeated here as a literal 1159, which would go out
# of sync with the file name as soon as s is changed.
out = '/home/oren/ped-%d.png' % (s,)
im.pt.draw_member_neighbor_genotyped_pedigree(p, s, 5, out, identifier='index')
dest='debug', default=0, help='Debug Level (0=quiet; 1=summary; 2=full debug)') parser.add_option( '-i', '--input', type='str', dest='input_file', default=None, help= 'Input file. If specified, reads from this file instead of from stdin') parser.add_option('-u', '--len-unit', type='str', dest='len_unit', default=im.PhaseParam().len_unit, help='segment length to look for [cm|mbp]') parser.add_option('-l', '--min-len', type='str', dest='min_len', default=im.PhaseParam().min_len, help='Minimum segment length to look for') options, args = parser.parse_args(sys.argv[1:]) if len(args) != 2: print usage sys.exit(1) phased_data_file, kinship_file = args try:
#!/usr/bin/env python
'''
============================================================
Test GERMLINE IBD on 507's ungenotyped family.

Runs the family sib comparison phaser for member 507 on one
copy of the 'hutt.stage3.npz' data set and prints where that
member's haplotypes differ from an untouched second copy.

Created on September 15, 2012
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im
import numpy as np

i = 507

# Two independent loads of the same file: p stays as the reference, q is phased
p = im.hutt('hutt.stage3.npz')
q = im.hutt('hutt.stage3.npz')

phaser = im.phase_distant.family_sib_comparison_phaser()
phaser.run(q, im.PhaseParam(single_member=i, debug=True))

# Indices at which member i's haplotype data changed
changed = np.where(p.haplotype.data[:, i, :] != q.haplotype.data[:, i, :])
print(changed)
'''
============================================================
Generate the family 7 identity coefficient file.

For every ordered pair of genotyped sample IDs in the family
7 problem, looks up the ID coefficients in the global ID
coefficient file and saves them to im.itu.FAMILY7 + '.id'.

Created on January 21, 2013
@author: Oren Livne <*****@*****.**>
============================================================
'''
import impute as im
import numpy as np
import db_gene
import itertools

# Temporarily comment out the frames field in im.io functions for this command to work
p = im.io.read_npz(im.itu.FAMILY7 + '.npz')
i = p.pedigree.genotyped_sample_id()
k = db_gene.snp.file_dao.IdCoefDao(im.PhaseParam().id_coef_file)

# One row per ordered ID pair: x, y, first element of the coefficient record,
# then the entries of its second element
rows = []
for x, y in itertools.product(i, i):
    m = k.id_coefs(x, y)
    rows.append((x, y, m[0]) + tuple(m[1].tolist()))
a = np.array(rows)

np.savetxt(im.itu.FAMILY7 + '.id', a,
           fmt='%d %d %e %e %e %e %e %e %e %e %e %e')
type='int', dest='num_processes', default=1, help='Number of processes to spawn') parser.add_option('-s', '--snp-step-size', type='int', dest='poo_snp_step_size', default=1, help='POO SNP downsampling step size') options, args = parser.parse_args(argv[1:]) if len(args) != 2: print usage sys.exit(1) options.input = args[0] options.output = args[1] return args, options #--------------------------------------------- # Main Program #--------------------------------------------- if __name__ == '__main__': args, options = __parse_command_line_args(sys.argv) params = im.PhaseParam() params.update_from_struct(options) problem = im.io.read_npz(options.input) poo_alignment(problem, params) im.io.write_npz(problem, options.output)
Created on January 11, 2012 @author: Oren Livne <*****@*****.**> ============================================================ ''' import impute as im, itertools, matplotlib.pyplot as P, sys from impute.color.hap_color_grouping import plot_hap_coloring print sys.argv generate_plots = True if (len(sys.argv) < 2) else bool(int(sys.argv[1])) p = im.io.read_npz(im.itu.FAMILY_TOO_ZEROED_STAGE4) haps = list( itertools.product(im.gt.genotyped_members(p, p.first_family), xrange(2))) children = im.gt.genotyped_children(p, p.first_family) child_haps = list(itertools.product(children, xrange(2))) params = im.PhaseParam() #debug=True) #------------------------------------------------------- # Nuclear family phasing #------------------------------------------------------- ibd = p.info.ibd print ibd if generate_plots: # Including parents P.figure(1) plot_hap_coloring(ibd, haps, pair_gap=10, linewidth=6, title='Family IBD Segments: Nuclear Family Phasing') #P.savefig(os.environ['OBER'] + '/doc/ibd/hmm/family_ibd_nuclear.png')