def haplotype_hut():
    '''Return the partially-phased haplotypes of the hutterites sample problem.

    The data set is large, so the loaded object is kept in
    Templates.HAPLOTYPE_HUT and handed back on later calls.'''
    cached = Templates.HAPLOTYPE_HUT
    if cached:
        return cached
    # Nothing usable is cached yet: read the haplotype TPED file and remember it.
    tped_file = GENOTYPE_SAMPLE + '.hap.tped'
    Templates.HAPLOTYPE_HUT = io_genotype.read('plink', 'haplotype', tped=tped_file, load_ids=False)
    return Templates.HAPLOTYPE_HUT
def test_nearest_snp(self):
    '''Check that a base pair location is mapped to its nearest SNP.'''
    message = 'Wrong nearest SNP location'
    genotype = io_genotype.read('plink', 'genotype', prefix=itu.GENOTYPE_SAMPLE)
    # Looking up every SNP's own base pair position must reproduce snp_range.
    nearest = list(genotype.nearest_snp_multiple(genotype.base_pair))
    assert_equal(nearest, genotype.snp_range, message)
    # A location nudged slightly above or below SNP 5 must still resolve to SNP 5.
    snp = 5
    for offset in (0.01, -0.01):
        assert_equal(genotype.nearest_snp(genotype.base_pair[snp] + offset), snp, message)
def problem_hut():
    '''Return the hutterites data set as a Problem.

    The data set is large, so the constructed Problem is kept in
    Templates.PROBLEM_HUT and handed back on later calls.'''
    if Templates.PROBLEM_HUT:
        return Templates.PROBLEM_HUT
    # Nothing usable is cached yet: build the problem from its pedigree and genotypes.
    hut_pedigree = Templates.pedigree_hut()
    hut_genotype = io_genotype.read('plink', 'genotype', prefix=GENOTYPE_SAMPLE, load_ids=False)
    Templates.PROBLEM_HUT = Problem(hut_pedigree, hut_genotype)
    return Templates.PROBLEM_HUT
def setUp(self):
    '''Load the duo test problem, the phaser under test and the expected solution.'''
    unittest.TestCase.setUp(self)
    # The way to load a pedigree in conjunction with a genotype set is to recode
    # its sample IDs to consecutive values, for easier access by phasers.
    duo_prefix = itu.GENOTYPE_DUO
    self.problem = io.read_plink(prefix=duo_prefix, haplotype=None, pedigree=duo_prefix + '.tfam')
    self.phaser = trivial_phaser()
    # Expected results: the loaded pedigree paired with the solution genotypes.
    pedigree = self.problem.pedigree
    expected_genotype = io_genotype.read('plink', 'genotype', prefix=itu.GENOTYPE_DUO_SOLUTION)
    self.solution = Problem(pedigree, expected_genotype)
def test_nearest_snp(self):
    '''Check that a base pair location is mapped to its nearest SNP.'''
    failure_text = 'Wrong nearest SNP location'
    genotype_set = io_genotype.read('plink', 'genotype', prefix=itu.GENOTYPE_SAMPLE)
    # Looking up every SNP's own base pair position must reproduce snp_range.
    found = list(genotype_set.nearest_snp_multiple(genotype_set.base_pair))
    assert_equal(found, genotype_set.snp_range, failure_text)
    # A location nudged slightly above or below SNP 5 must still resolve to SNP 5.
    target = 5
    above = genotype_set.base_pair[target] + 0.01
    assert_equal(genotype_set.nearest_snp(above), target, failure_text)
    below = genotype_set.base_pair[target] - 0.01
    assert_equal(genotype_set.nearest_snp(below), target, failure_text)
def setUp(self):
    '''Load the trio test problem, the phaser under test and the expected solution.'''
    unittest.TestCase.setUp(self)
    # The way to load a pedigree in conjunction with a genotype set is to recode
    # its sample IDs to consecutive values, for easier access by phasers.
    trio_prefix = itu.GENOTYPE_TRIO
    self.problem = io.read_plink(prefix=trio_prefix, haplotype=None, pedigree=trio_prefix + '.tfam')
    self.phaser = trivial_phaser()
    # Expected results: the loaded pedigree paired with the solution genotypes.
    pedigree = self.problem.pedigree
    expected_genotype = io_genotype.read('plink', 'genotype', prefix=itu.GENOTYPE_TRIO_SOLUTION)
    self.solution = Problem(pedigree, expected_genotype)
def read_plink(**kwargs):
    '''Load a problem from the following PLINK files:

    Default          Override Option     Data                                Format
    ======================================================================================
    prefix.pdg.tfam  pedigree            Pedigree adjacency                  PLINK TFAM
                                         (genotyped+nongenotyped samples)
    prefix.tfam      pedigree_genotyped  Genotyped sample pedigree           PLINK TFAM
                                         (sub-graph of the pedigree)         corresponding to prefix.tped
    prefix.tped      genotype            Genotype data                       PLINK TPED
    prefix.hap.tped  haplotype*          Haplotype data                      PLINK TPED
    prefix.err       error**             Genotype errors flagged             Integer array (snps x samples)
    prefix.info      info                Problem info                        pickle (binary)
    prefix.frm       frames              LD-independent SNP frames           text file
    prefix.lam       lam***              Haplotype est. recombination rate   text file

    *   - hap data not loaded if this option is None.
    **  - error data not loaded (error=None is passed to Problem) if this file
          is not found or this option is set to None.
    *** - data not loaded if this file is not found.

    Keyword arguments:
    prefix  - common file name prefix from which the default locations above are
              derived. Without a prefix every location defaults to None.
    verbose - if true, print the location of each file before it is read.
    Any of the override options in the table replaces the corresponding default.

    Returns the loaded Problem. The problem info is read only when both the
    haplotype and info locations are set.

    Raises ValueError if the pedigree, pedigree_genotyped or genotype location
    is None, i.e. neither a prefix nor that override was given.
    '''
    # Read input options
    verbose = kwargs.get('verbose', False)
    # Treat prefix=None like a missing prefix instead of failing on None + str.
    prefix = kwargs.get('prefix') or ''

    def option(name, suffix):
        # An explicit override wins; the prefix-derived default exists only with a prefix.
        return kwargs.get(name, prefix + suffix if prefix else None)

    def report(template, *locations):
        # Progress output; unset locations are shown as '-'.
        if verbose:
            print(template % tuple(loc if loc else '-' for loc in locations))

    pedigree = option('pedigree', '.pdg.tfam')
    pedigree_genotyped = option('pedigree_genotyped', '.tfam')
    genotype = option('genotype', '.tped')
    haplotype = option('haplotype', '.hap.tped')
    error_file = option('error', '.err')
    info = option('info', '.info')
    frames_file = option('frames', '.frm')
    lam_file = option('lam', '.lam')

    # Each required location is tested on its own. The previous check compared
    # the list object itself with None, which is always true, so it never fired.
    # The error file is not required: it is documented as optional and guarded below.
    if any(location is None for location in (pedigree, pedigree_genotyped, genotype)):
        raise ValueError('Must specify a prefix or pedigree, pedigree_genotyped, genotype files')

    # Load data
    report('Reading pedigree from %s, %s ...', pedigree, pedigree_genotyped)
    p = io_pedigree.read(pedigree, genotyped_id_file=pedigree_genotyped)

    report('Reading genotype data from %s ...', genotype)
    g = io_genotype.read('plink', 'genotype', tped=genotype, load_ids=False)

    report('Reading haplotype data from %s ...', haplotype)
    h = io_genotype.read('plink', 'haplotype', tped=haplotype, load_ids=False) if haplotype else None

    report('Reading error data from %s ...', error_file)
    error = np.loadtxt(error_file) if error_file and os.path.isfile(error_file) else None

    report('Reading frame data from %s ...', frames_file)
    # Unlike the error and lam files, the frames file is read whenever a location is set.
    frames = db_gene.snp.ld_graph.read_frames(frames_file) if frames_file else None
    lam = np.loadtxt(lam_file) if lam_file and os.path.isfile(lam_file) else None

    problem = Problem(p, g, haplotype=h, error=error, frames=frames, lam=lam)
    if haplotype and info:
        report('Reading problem info from %s ...', info)
        # NOTE(review): pickle.load can execute arbitrary code; only point the
        # info option at files from a trusted source.
        with open(info, 'rb') as info_stream:
            problem.info = pickle.load(info_stream)
    return problem
if options.recode: # First, compute allele frequencies with PLINK util.run_command('%s --nonfounders --freq --out %s' % (plink_cmd_base, out_base_name)) # Convert frequencies file that to a reference allele recoding # file (a file containing the list of SNPs and their minor allele letter) bu.frq_to_minor_file(out_base_name + '.frq', out_base_name + '.mnr') # Then convert binary PLINK to a recoded 12-recoded TPED, where 1=minor allele for each SNP out_recoded = out_base_name + '.recoded' util.run_command('%s --transpose --recode12 --reference-allele %s.mnr --out %s' % \ (plink_cmd_base, out_base_name, out_recoded)) # Reload the recoded problem for ext in ('nof', 'tped', 'tfam'): os.rename(out_recoded + '.' + ext, out_base_name + '.' + ext) genotype = io_genotype.read('plink', 'genotype', tped=out_base_name + '.tped', load_ids=False) else: genotype = problem.genotype # Write problem to file in our (npz) io.write_npz(problem, out_base_name + '.npz') # Write genotypes Gaixin formats; she uses those separate files io_genotype.write('gaixin', genotype, options.out_gxn + '.gxn', sample_id=problem.pedigree.sample_id_genotyped) # Convert plink tped to bed; delete the tped set util.run_command('%s --make-bed --out %s' % (plink_cmd_base, out_base_name)) for ext in ('nof', 'pdg.tfam', 'tped', 'tfam', 'info'): os.remove(out_base_name + '.' + ext) except: traceback.print_exc(file=sys.stdout)
def test_create_from_file_no_sample_ids(self):
    '''Read the sample TPED file with load_ids=False and check the genotype set statistics.'''
    tped_file = itu.GENOTYPE_SAMPLE + '.tped'
    genotype = io_genotype.read('plink', 'genotype', tped=tped_file, load_ids=False)
    self.__assert_genotype_set_stats_correct(genotype)
def test_create_from_file(self):
    '''Read the sample genotype set by prefix and check its statistics and one SNP record.'''
    genotype = io_genotype.read('plink', 'genotype', prefix=itu.GENOTYPE_SAMPLE)
    self.__assert_genotype_set_stats_correct(genotype)
    # Spot-check the record stored at SNP index 7.
    expected_snp = (22, 'rs1654', 0, 17596388)
    self.__assert_snp_equals(expected_snp, genotype.snp[7])
def test_create_from_file_no_sample_ids(self):
    '''Read the sample TPED file with load_ids=False and check the genotype set statistics.'''
    sample_tped = itu.GENOTYPE_SAMPLE + '.tped'
    loaded = io_genotype.read('plink', 'genotype', tped=sample_tped, load_ids=False)
    self.__assert_genotype_set_stats_correct(loaded)
def read_plink(**kwargs):
    '''Load a problem from the following PLINK files:

    Default          Override Option     Data                                Format
    ======================================================================================
    prefix.pdg.tfam  pedigree            Pedigree adjacency                  PLINK TFAM
                                         (genotyped+nongenotyped samples)
    prefix.tfam      pedigree_genotyped  Genotyped sample pedigree           PLINK TFAM
                                         (sub-graph of the pedigree)         corresponding to prefix.tped
    prefix.tped      genotype            Genotype data                       PLINK TPED
    prefix.hap.tped  haplotype*          Haplotype data                      PLINK TPED
    prefix.err       error**             Genotype errors flagged             Integer array (snps x samples)
    prefix.info      info                Problem info                        pickle (binary)
    prefix.frm       frames              LD-independent SNP frames           text file
    prefix.lam       lam***              Haplotype est. recombination rate   text file

    *   - hap data not loaded if this option is None.
    **  - error data not loaded (error=None is passed to Problem) if this file
          is not found or this option is set to None.
    *** - data not loaded if this file is not found.

    Keyword arguments:
    prefix  - common file name prefix from which the default locations above are
              derived. Without a prefix every location defaults to None.
    verbose - if true, print the location of each file before it is read.
    Any of the override options in the table replaces the corresponding default.

    Returns the loaded Problem. The problem info is read only when both the
    haplotype and info locations are set.

    Raises ValueError if the pedigree, pedigree_genotyped or genotype location
    is None, i.e. neither a prefix nor that override was given.
    '''
    # Read input options
    verbose = kwargs.get('verbose', False)
    # Treat prefix=None like a missing prefix instead of failing on None + str.
    prefix = kwargs.get('prefix') or ''

    def option(name, suffix):
        # An explicit override wins; the prefix-derived default exists only with a prefix.
        return kwargs.get(name, prefix + suffix if prefix else None)

    def report(template, *locations):
        # Progress output; unset locations are shown as '-'.
        if verbose:
            print(template % tuple(loc if loc else '-' for loc in locations))

    pedigree = option('pedigree', '.pdg.tfam')
    pedigree_genotyped = option('pedigree_genotyped', '.tfam')
    genotype = option('genotype', '.tped')
    haplotype = option('haplotype', '.hap.tped')
    error_file = option('error', '.err')
    info = option('info', '.info')
    frames_file = option('frames', '.frm')
    lam_file = option('lam', '.lam')

    # Each required location is tested on its own. The previous check compared
    # the list object itself with None, which is always true, so it never fired.
    # The error file is not required: it is documented as optional and guarded below.
    if any(location is None for location in (pedigree, pedigree_genotyped, genotype)):
        raise ValueError('Must specify a prefix or pedigree, pedigree_genotyped, genotype files')

    # Load data
    report('Reading pedigree from %s, %s ...', pedigree, pedigree_genotyped)
    p = io_pedigree.read(pedigree, genotyped_id_file=pedigree_genotyped)

    report('Reading genotype data from %s ...', genotype)
    g = io_genotype.read('plink', 'genotype', tped=genotype, load_ids=False)

    report('Reading haplotype data from %s ...', haplotype)
    h = io_genotype.read('plink', 'haplotype', tped=haplotype, load_ids=False) if haplotype else None

    report('Reading error data from %s ...', error_file)
    error = np.loadtxt(error_file) if error_file and os.path.isfile(error_file) else None

    report('Reading frame data from %s ...', frames_file)
    # Unlike the error and lam files, the frames file is read whenever a location is set.
    frames = db_gene.snp.ld_graph.read_frames(frames_file) if frames_file else None
    lam = np.loadtxt(lam_file) if lam_file and os.path.isfile(lam_file) else None

    problem = Problem(p, g, haplotype=h, error=error, frames=frames, lam=lam)
    if haplotype and info:
        report('Reading problem info from %s ...', info)
        # NOTE(review): pickle.load can execute arbitrary code; only point the
        # info option at files from a trusted source.
        with open(info, 'rb') as info_stream:
            problem.info = pickle.load(info_stream)
    return problem