def test_some_std(self):
    """Check that Unit-standardized kernels agree across several read paths.

    The kernel from the in-memory data is compared against a second read of
    the same data and against a block-wise read from the Bed file.  The
    method then calls ``str`` on each standardizer for both float widths.
    """
    k0 = self.snpdata.read_kernel(standardizer=Unit()).val
    from pysnptools.kernelreader import SnpKernel
    k1 = self.snpdata.read_kernel(standardizer=Unit())
    np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

    from pysnptools.snpreader import SnpData
    snpdata2 = SnpData(iid=self.snpdata.iid,sid=self.snpdata.sid,pos=self.snpdata.pos,val=np.array(self.snpdata.val))
    s = str(snpdata2)  # exercise __str__ before and after standardizing
    snpdata2.standardize()
    s = str(snpdata2)

    snpreader = Bed(self.currentFolder + "/examples/toydata",count_A1=False)
    k2 = snpreader.read_kernel(standardizer=Unit(),block_size=500).val
    np.testing.assert_array_almost_equal(k0, k2, decimal=10)

    from pysnptools.standardizer.identity import Identity
    from pysnptools.standardizer.diag_K_to_N import DiagKtoN

    # Use NumPy's own dtypes: the ``sp.float64``/``sp.float32`` aliases are
    # no longer provided by SciPy.
    for dtype in [np.float64,np.float32]:
        for std in [Unit(),Beta(1,25),Identity(),DiagKtoN()]:
            s = str(std)
            np.random.seed(0)
            x = np.array(np.random.randint(3,size=[60,100]),dtype=dtype)
            x2 = x[:,::2]
            x2b = np.array(x2)
            #LATER what's this about? It doesn't do non-contiguous?
            #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] #set up to test non contiguous
            #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS'] #set up to test non contiguous
            #a,b = std.standardize(x2b),std.standardize(x2)
            #np.testing.assert_array_almost_equal(a,b)
    logging.info("done")
def divideData(filename,direct,num=5,mph=3,delet=True):
    """Run ``varRes`` on ``num`` random, contiguous partitions of a dataset.

    The individuals are permuted, split into ``num`` slices, and each slice
    is written to ``direct/tempFile_<i>`` (Bed plus ``.phen``) before being
    passed to ``varRes``.

    Parameters
    ----------
    filename : passed through to ``getData``.
    direct : directory that receives the temporary files.
    num : number of partitions.
    mph : passed through to ``getData``.
    delet : when true, remove each partition's temporary files after use.

    Returns
    -------
    list with one ``varRes`` result per partition.
    """
    import glob  # local import: only needed for the temp-file cleanup

    print("Estimating heritability using "+str(num)+" components")
    [yFil,sFil]=getData(filename,mph=mph)
    n=sFil.iid_count
    reOrd=perm(n)
    yFil=yFil[reOrd,:]
    sFil=sFil[reOrd,:]
    # Slice boundaries: div[i]..div[i+1] delimits partition i.
    div=[int(math.ceil(i*n/float(num))) for i in range(0,num+1)]
    varEsts=[]
    for i in range(0,num):
        print("For component "+str(i))
        sFilTemp=sFil[div[i]:div[i+1],:]
        yFilTemp=yFil[div[i]:div[i+1],:]
        fileTemp=direct+"/tempFile_"+str(i)
        Bed.write(fileTemp,sFilTemp.read())
        Pheno.write(fileTemp+".phen",yFilTemp.read())
        varEsts.append(varRes(fileTemp,direct))
        if delet:
            # Remove the files directly instead of shelling out to ``rm``,
            # so paths with spaces work and no shell is involved.
            for leftover in glob.glob(fileTemp+"*"):
                os.remove(leftover)
    return varEsts
def test_match_cpp(self):
    r'''
    match
        FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

    Runs ``single_snp`` with ``h2=.5`` and with ``log_delta=0``, for both
    assignments of the similarity SNPs to G0/G1, and checks every p-value
    against the stored reference within a relative tolerance of 0.035.
    '''
    logging.info("TestSingleSnp test_match_cpp")
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"), count_A1=False)
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")
    sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
    sim_idx = snps.sid_to_index(sim_sid)
    test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
    test_idx = snps.sid_to_index(test_sid)

    # The reference table does not depend on the loop variables; load it once.
    referenceOutfile = TestFeatureSelection.reference_file("single_snp/topsnps.single.txt")
    reference = pd.read_table(referenceOutfile,sep="\t") # We've manually remove all comments and blank lines from this file

    for G0,G1 in [(snps[:,sim_idx],KernelIdentity(snps.iid)),(KernelIdentity(snps.iid),snps[:,sim_idx])]:
        frame_h2 = single_snp(test_snps=snps[:,test_idx], pheno=pheno, G0=G0,G1=G1, covar=covar,h2=.5,leave_out_one_chrom=False,count_A1=False)
        frame_log_delta = single_snp(test_snps=snps[:,test_idx], pheno=pheno, G0=G0,G1=G1, covar=covar,log_delta=0,leave_out_one_chrom=False,count_A1=False)
        for frame in [frame_h2, frame_log_delta]:
            assert len(frame) == len(reference)
            for _, row in reference.iterrows():
                sid = row.SNP
                pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                reldiff = abs(row.Pvalue - pvalue)/row.Pvalue
                assert reldiff < .035, "'{0}' pvalue_list differ too much {4} -- {2} vs {3}".format(sid,None,row.Pvalue,pvalue,reldiff)
def setUpClass(self):
    """Load the mouse genotypes and phenotype and cache them as read-only arrays."""
    here = os.path.dirname(os.path.realpath(__file__))
    self.snp_fn = here + "/../../tests/datasets/mouse/alldata"
    self.pheno_fn = here + "/../../tests/datasets/mouse/pheno_10_causals.txt"

    # Keep only the individuals present in both the SNP file and the phenotype.
    reader, phenotype = pysnptools.util.intersect_apply(
        [Bed(self.snp_fn), pstpheno.loadOnePhen(self.pheno_fn)])

    # Unit-standardized genotype matrix, frozen so tests cannot mutate it.
    genotypes = reader.read(order='C').val
    self.G = stdizer.Unit().standardize(genotypes)
    self.G.flags.writeable = False

    # First phenotype column.
    self.y = phenotype['vals'][:,0]
    self.y.flags.writeable = False

    # No covariate file is loaded; a single column of ones is used instead.
    self.G_cov = np.ones((len(self.y), 1))
    self.G_cov.flags.writeable = False
def __init__(self,args):
    """Select SNPs from ``args.bfile`` and compute scores for them.

    SNPs are kept when their allele frequency lies strictly between
    ``args.maf`` and ``1 - args.maf``; optionally they are also restricted
    to the open interval (``args.from_bp``, ``args.to_bp``) and to the ids
    listed in ``args.extract``.

    Raises
    ------
    ValueError
        If ``args.window_type`` is neither ``'BP'`` nor ``'SNP'``.
    """
    if args.window_type not in ['BP','SNP']:
        raise ValueError('Window type not supported')
    bed_1 = Bed(args.bfile)
    af1 = self.get_allele_frequency(bed_1,args)
    print(len(af1), "SNPs in file 1")
    snps_1 = (af1>args.maf)&(af1<1-args.maf)
    print(np.sum(snps_1), "SNPs in file 1 after MAF filter")
    if (args.from_bp is not None) and (args.to_bp is not None):
        k = (bed_1.pos[:,2]>args.from_bp)&(bed_1.pos[:,2]<args.to_bp)
        snps_1 = snps_1&k
    snps_to_use = bed_1.sid[snps_1]
    if args.extract is not None:
        # Close the extract file deterministically instead of leaking it.
        with open(args.extract,'r') as extract_file:
            keep = np.array([l.strip() for l in extract_file])
        snps_to_use = np.intersect1d(snps_to_use,keep)
        print(len(snps_to_use),"SNPs remaining after extraction")
    bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
    pos = bed_1.pos[bed_1_index]
    bim_1=pd.read_table(bed_1.filename+'.bim',header=None,
                        names=['chm','id','pos_mb','pos_bp','a1','a2'])
    af = af1[bed_1_index]
    if args.afile is not None:
        # Raw string: same pattern as before, without the invalid '\s' escape.
        a1 = pd.read_table(args.afile,header=None,sep=r'\s*',
                           names=['id1','id2','theta'])
    else:
        a1 = None
    self.af = af
    self.M = len(bed_1_index)
    self.windows = self.get_windows(pos,args)
    self.chr = pos[:,0]
    self.pos = pos[:,2]
    self.id = bed_1.sid[bed_1_index]
    self.A1 = bim_1['a1'].loc[bed_1_index]
    self.A2 = bim_1['a2'].loc[bed_1_index]
    self.scores = self.compute(bed_1,bed_1_index,af,a1,args)
def setUpClass(self):
    """Read the toy dataset once (pure-Python reader, Fortran order) for all tests."""
    self.currentFolder = os.path.dirname(os.path.realpath(__file__))
    #TODO: get data set with NANs!
    toy_reader = Bed(self.currentFolder + "/examples/toydata",count_A1=False)
    self.pheno_fn = self.currentFolder + "/examples/toydata.phe"
    self.snpdata = toy_reader.read(order='F',force_python_only=True)
    # Convenience alias for the raw value matrix.
    self.snps = self.snpdata.val
def read_plink(self, fn_plink = None):
    """Read a PLINK Bed file set and store its contents on the instance.

    Sets ``GT`` to the value matrix, ``POS`` to the first two position
    columns, ``SID`` to the second column of ``iid``, and marks the data
    as not normalised.
    """
    loaded = Bed(fn_plink).read()
    self.GT = loaded.val
    self.POS = loaded.pos[:,[0,1]]
    self.SID = loaded.iid[:,1]
    self.isNormalised = False
def factory(snpreader, num_snps_in_memory, standardizer, blocksize, count_A1=None):
    """Build an in-memory or from-disk SNP source depending on the memory budget.

    Parameters
    ----------
    snpreader : a reader object, or a ``str`` path that is opened with ``Bed``.
    num_snps_in_memory : if at least ``snpreader.sid_count``, all SNPs are
        read, standardized and wrapped in ``InMemory``; otherwise a
        ``FromDisk`` is returned.
    standardizer, blocksize : passed to the returned object.
    count_A1 : forwarded to ``Bed`` when ``snpreader`` is a path.  New
        optional parameter; existing four-argument calls are unaffected.
    """
    if isinstance(snpreader, str):
        snpreader = Bed(snpreader, count_A1=count_A1)

    if num_snps_in_memory >= snpreader.sid_count:
        in_memory = InMemory(snpreader.read(order='C').standardize(standardizer), standardizer, blocksize)
        # Freeze the standardized values and expose them directly.
        in_memory._snpreader.val.flags.writeable = False
        in_memory._val = in_memory._snpreader.val
        return in_memory
    else:
        return FromDisk(snpreader, num_snps_in_memory, standardizer, blocksize, None)
def test_write_x_x_cpp(self):
    """Write toydata as Bed for every order/dtype pair and check it reads back the same."""
    toy = Bed(self.currentFolder + "/examples/toydata")
    for order in ['C','F']:
        for dtype in [np.float32,np.float64]:
            written = toy.read(order=order,dtype=dtype)
            # Plant one missing value so NaN is part of the round trip.
            written.val[-1,0] = float("NAN")
            bits = "32" if dtype==np.float32 else "64"
            output = "tempdir/toydata.{0}{1}.cpp".format(order,bits)
            create_directory_if_necessary(output)
            Bed.write(written, output)
            reread = Bed(output).read()
            assert TestLoader.is_same(written, reread) #!!!define an equality method on snpdata?
def process_data(input_path, output_path, name):
    """Preprocess one Bed dataset and save the result to ``<output_path>/<name>.h5py``.

    Returns a dict with the subject and SNP counts and ids, plus the path
    and key under which the preprocessed matrix was stored.
    """
    data = Bed(os.path.join(input_path, name)).read()
    preproc_vals = pysnp_genpreproc(data.val)
    # The preprocessed matrix must not contain NaN.
    assert(np.any(np.isnan(preproc_vals)) == False)

    target = os.path.join(output_path, name + ".h5py")
    path, keys = h5_save(path=target, data_obj={name:preproc_vals}, dt='f')

    location = {'path':path, 'key':keys}
    return {'n_subjects':data.iid_count,
            'subject_ids':data.iid,
            'n_snps':data.sid_count,
            'snp_ids':data.sid,
            'data_preprocessed_location': location}
def test_write_x_x_cpp(self):
    """Round-trip toydata through ``Bed.write`` for each memory order and float width."""
    source = Bed(self.currentFolder + "/examples/toydata")
    for order in ['C','F']:
        for dtype in [np.float32,np.float64]:
            original = source.read(order=order,dtype=dtype)
            # One missing value makes sure NaN survives the write/read cycle.
            original.val[-1,0] = float("NAN")
            width = "32" if dtype==np.float32 else "64"
            output = "tempdir/toydata.{0}{1}.cpp".format(order,width)
            create_directory_if_necessary(output)
            Bed.write(output, original)
            reloaded = Bed(output).read()
            np.testing.assert_array_almost_equal(original.val, reloaded.val, decimal=10)
def test_subset_view(self):
    """Check when ``read`` shares memory with its source (``view_ok=True``) and when it copies."""
    sliced_reader = Bed(self.currentFolder + "/examples/toydata",count_A1=False)[:,:]
    base = sliced_reader.read(view_ok=True)
    self.assertFalse(sliced_reader is base)

    # A plain read of a full slice is expected to copy ...
    copied = base[:,:].read()
    self.assertFalse(sp.may_share_memory(copied.val,base.val))

    # ... while view_ok=True is expected to share the buffer.
    viewed = base[:,:].read(view_ok=True)
    self.assertTrue(sp.may_share_memory(viewed.val,base.val))

    # The same two expectations when reading in-memory data directly.
    copied_again = viewed.read()
    self.assertFalse(sp.may_share_memory(copied_again.val,viewed.val))
    viewed_again = copied_again.read(view_ok=True)
    self.assertTrue(sp.may_share_memory(copied_again.val,viewed_again.val))
def test_npz(self):
    """Write a Unit-standardized kernel with ``KernelNpz`` and check it reads back unchanged."""
    logging.info("in test_npz")
    toy = Bed(self.currentFolder + "/../examples/toydata",count_A1=False)
    written = toy.read_kernel(standardizer=stdizer.Unit())
    s = str(written)  # exercise __str__

    output = "tempdir/kernelreader/toydata.kernel.npz"
    create_directory_if_necessary(output)
    KernelNpz.write(output,written)

    reread = KernelNpz(output).read()
    np.testing.assert_array_almost_equal(written.val, reread.val, decimal=10)
    logging.info("done with test")
def main(args):
    """Compute, for each seed SNP, the length of the shared segment around it and an IBD flag.

    Reads the seed SNP list, the typed SNP list and the genotypes named in
    ``args``, fills the ``ibs_length`` and ``ibd`` columns for every seed
    SNP, prints the first 100 rows and writes the table to ``args.outfile``
    (tab-separated).
    """
    print('reading seed snps')
    seed_snps = pd.read_csv(args.seed_snps, header=None, names=['SNP'], index_col='SNP')
    seed_snps['ibs_length'] = 0
    seed_snps['ibd'] = 0
    print('reading typed snps')
    typed_snps = pd.read_csv(args.typed_snps, header=None, names=['SNP'])
    print('reading genotypes')
    data = Bed(args.bfile)
    X = data.read().val
    typed_snps_indices = np.sort(data.sid_to_index(typed_snps.SNP))
    typed_snps_bp = data.col_property[typed_snps_indices,2]

    print(len(seed_snps), 'snps in list')
    print(data.iid_count, data.sid_count, 'are dimensions of X')

    def analyze_snp(i):
        """Return ``(length, ibd_flag)`` for the SNP at column ``i``."""
        # find first typed snp after query snp
        snp_bp = data.col_property[i,2]
        v = np.where(typed_snps_bp > snp_bp)[0]
        if len(v) > 0:
            typed_i = v[0]
        else:
            typed_i = len(typed_snps_indices)-1

        # Unpacking requires exactly two individuals with value 1 at this SNP.
        n1, n2 = np.where(X[:,i] == 1)[0]

        # Values differing by 2 at the anchor typed SNP: report no sharing.
        if (X[n1,typed_snps_indices[typed_i]] - X[n2, typed_snps_indices[typed_i]])**2 == 4:
            return 0, 0

        typed_il, typed_ir = fis.find_boundaries(
                X[n1,typed_snps_indices],
                X[n2,typed_snps_indices],
                typed_i)
        typed_ir -= 1

        il = typed_snps_indices[typed_il]
        ir = typed_snps_indices[typed_ir]
        cM = data.col_property[ir, 1] - \
                data.col_property[il, 1]
        ibd = (np.mean(X[n1,il:ir] == X[n2,il:ir]) > 0.99)
        return cM, int(ibd)

    # ``zip`` works on Python 2 and 3 (``it.izip`` is Python-2-only).
    for (i, snp) in iter.show_progress(
            zip(data.sid_to_index(seed_snps.index), seed_snps.index),
            total=len(seed_snps)):
        # ``.loc`` is the label-based replacement for the removed ``.ix``.
        seed_snps.loc[snp, ['ibs_length', 'ibd']] = analyze_snp(i)

    print(seed_snps.iloc[:100])
    seed_snps.to_csv(args.outfile, sep='\t')
def test_subset(self):
    """Subsetting a ``SnpKernel`` must match subsetting the kernel read from the same reader."""
    logging.info("in test_subset")
    toy = Bed(self.currentFolder + "/../examples/toydata",count_A1=False)
    kernel = SnpKernel(toy,stdizer.Unit())

    # Two-index form of the subset.
    two_index = kernel[::2,::2].read()
    expected = toy.read_kernel(stdizer.Unit())[::2].read()
    np.testing.assert_array_almost_equal(two_index.val, expected.val, decimal=10)

    # Single-index form is checked against the same expected kernel.
    one_index = kernel[::2].read()
    np.testing.assert_array_almost_equal(one_index.val, expected.val, decimal=10)
    logging.info("done with test")
def too_slow_test_write_bedbig(self):
    """Write a 100000 x 50000 all-zero ``SnpData`` to Bed and check it reads back the same.

    Not collected by default (the name lacks the ``test_`` prefix).
    """
    iid_count = 100000
    sid_count = 50000
    from pysnptools.snpreader.snpdata import SnpData #!!! promote on level up innamespace
    iid = np.array([[str(i),str(i)] for i in range(iid_count)])
    sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
    pos = np.array([[i,i,i] for i in range(sid_count)])
    # Call the seeding function; assigning to ``np.random.seed`` would
    # replace it for the rest of the process and seed nothing.
    np.random.seed(0)
    snpdata = SnpData(iid,sid,pos,np.zeros((iid_count,sid_count))) #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))
    output = "tempdir/bedbig.{0}.{1}".format(iid_count,sid_count)
    create_directory_if_necessary(output)
    Bed.write(snpdata, output)
    snpdata2 = Bed(output).read()
    assert TestLoader.is_same(snpdata, snpdata2) #!!!define an equality method on snpdata?
def too_slow_test_write_bedbig(self):
    """Round-trip a very large all-zero ``SnpData`` through ``Bed.write`` (disabled by name)."""
    from pysnptools.snpreader import SnpData
    iid_count = 100000
    sid_count = 50000

    # Synthetic ids and positions derived from the running index.
    iid = np.array([[str(i)] * 2 for i in range(iid_count)])
    sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
    pos = np.array([[i] * 3 for i in range(sid_count)])
    np.random.seed(0)
    values = np.zeros((iid_count,sid_count))
    snpdata = SnpData(iid,sid,values,pos=pos)

    output = "tempdir/bedbig.{0}.{1}".format(iid_count,sid_count)
    create_directory_if_necessary(output)
    Bed.write(output, snpdata, count_A1=False)
    reread = Bed(output,count_A1=False).read()
    np.testing.assert_array_almost_equal(snpdata.val, reread.val, decimal=10)
def main():
    """
    example that compares output to fastlmmc

    Runs ``WindowingGwas`` and ``GwasTest`` on the same toy data with the
    same ``delta``, plots the log p-values against each other, asserts
    they agree to 3 decimals, and draws a Manhattan plot.
    """
    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom"
    #chrom_count = 5

    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)
    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:,0]
    # Freeze y like G; the bare ``y.flags.writeable`` expression was a no-op.
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn, phen_fn, snp_pos_sim, snp_pos_test, delta, REML=REML, excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    pylab.plot(np.arange(-18, 0), np.arange(-18,0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv), np.log(gwas_c.p_values), decimal=3)

    simple_manhattan_plot(pv)
def test_SNC(self):
    """Run ``single_snp`` when one SNP column is constant and compare with the stored result."""
    logging.info("TestSNC")
    pheno = pstpheno.loadOnePhen(self.phen_fn,vectorize=True)
    covar = pstpheno.loadPhen(self.cov_fn)

    snc = Bed(self.bedbase, count_A1=False).read()
    constant_column = [0] * snc.iid_count
    snc.val[:,2] = constant_column # make SNP #2 have constant values (aka a SNC)

    output_file_name = self.file_name("snc")
    frame = single_snp(test_snps=snc[:,:10], pheno=pheno, G0=snc,
                       mixing=0,leave_out_one_chrom=False,
                       covar=covar, output_file_name=output_file_name,
                       count_A1=False)
    self.compare_files(frame,"snc")
def test_write_bed_f64cpp_5_python(self):
    """Write the first 5 individuals with the pure-Python Bed writer and read them back."""
    toy = Bed(self.currentFolder + "/examples/toydata")
    iid_index = 5
    logging.info("iid={0}".format(iid_index))
    # (An older, disabled variant trimmed the reader so that the individual
    # count was not divisible by 4.)
    head = toy[0:iid_index,:].read(order='F',dtype=np.float64)
    if head.iid_count > 0:
        head.val[-1,0] = float("NAN")
    output = "tempdir/toydata.F64python.{0}".format(iid_index)
    create_directory_if_necessary(output)
    Bed.write(head, output,force_python_only=True)
    reread = Bed(output).read()
    assert TestLoader.is_same(head, reread) #!!!define an equality method on snpdata?
def test_write_bed_f64cpp_5_python(self):
    """Round-trip a 5-individual slice of toydata through the pure-Python Bed writer."""
    reader = Bed(self.currentFolder + "/examples/toydata",count_A1=False)
    iid_index = 5
    logging.info("iid={0}".format(iid_index))
    # (A disabled variant of this test avoided individual counts divisible by 4.)
    subset = reader[0:iid_index,:].read(order='F',dtype=np.float64)
    if subset.iid_count > 0:
        # Plant a missing value in the last row.
        subset.val[-1,0] = float("NAN")
    output = "tempdir/toydata.F64python.{0}".format(iid_index)
    create_directory_if_necessary(output)
    Bed.write(output,subset, force_python_only=True)
    roundtrip = Bed(output,count_A1=False).read()
    np.testing.assert_array_almost_equal(subset.val, roundtrip.val, decimal=10)
def setUpClass(self):
    """Create the temp output directory and open the synthetic SNP, covariate and phenotype readers."""
    from fastlmm.util.util import create_directory_if_necessary
    create_directory_if_necessary(self.tempout_dir, isfile=False)

    # Repository root: three levels above this file's directory.
    this_dir = os.path.dirname(os.path.realpath(__file__))
    self.pythonpath = os.path.abspath(os.path.join(this_dir,"..","..",".."))

    synth = self.pythonpath + "/tests/datasets/synth"
    self.snpreader_whole = Bed(synth + "/all",count_A1=False)
    self.covariate_whole = Pheno(synth + "/cov.txt")
    self.pheno_whole = Pheno(synth + "/pheno_10_causals.txt")
def genPheno(filename="../thinFam", per=.5, savename="fakePheno.txt", c=2.0, num=5):
    """Simulate a 0/1 phenotype from ``num`` randomly chosen SNP columns.

    Each individual's probability of being 1.0 is ``p0 * c**k``, where ``k``
    is the sum of that individual's cleaned values over the chosen SNPs and
    ``p0`` is scaled so the probabilities sum to ``n * per``.

    Parameters
    ----------
    filename : Bed file to read values from.
    per : target fraction of 1.0 outcomes.
    savename : output path, one value per line; an empty string means
        "return the list instead of writing".
    c : multiplicative factor applied per unit of ``k``.
    num : number of SNP columns to draw (with replacement).

    Returns
    -------
    The list of phenotypes when ``savename`` is empty, otherwise ``None``.
    """
    sFil = Bed(filename)
    D = sFil.read().val
    m = len(D[0])
    n = len(D)
    print(m)
    print(n)
    I = [rand.randint(0, m - 1) for i in range(0, num)]
    SNP = [[D[j][i] for j in range(0, n)] for i in I]
    print(len(I))
    print(len(SNP))
    print(len(SNP[0]))
    print(n)
    print(min([len(s) for s in SNP]))
    print(SNP)
    SNP = [[max(i, 0.0) for i in s] for s in SNP]
    # Anything that is not exactly 0, 1 or 2 is treated as 0.
    for i in range(0, num):
        for j in range(0, n):
            if not SNP[i][j] in [1.0, 0.0, 2.0]:
                SNP[i][j] = 0.0
    print([list(set(s)) for s in SNP])

    # Per-individual sum over the chosen SNPs, computed once and reused.
    lst = [sum([SNP[j][i] for j in range(0, num)]) for i in range(0, n)]
    weights = [c**k for k in lst]
    total = sum(weights)
    print(total)
    p0 = n * per / total
    print(p0)
    y = [float(rand.uniform(0, 1) < p0 * w) for w in weights]
    if len(savename) == 0:
        return y
    # ``with`` guarantees the file is closed even if a write fails.
    with open(savename, "w") as fil:
        for i in y:
            fil.write(str(i) + "\n")
def load_plink_bed_bim_fam_dataset(path_dataset, snp_ids=None, subject_ids=None, count_A1=True):
    """
    Load a Plink bed/bim/fam dataset as a SnpData instance.

    Optionally restrict the load to a given set of snps and/or subjects,
    so that only the requested part is read into memory.

    Parameters
    ----------
    path_dataset: str
        Path to the Plink bed/bim/fam dataset, with or without .bed
        extension.
    snp_ids: list/set of str, default None
        Snps to extract, when present in the dataset. None loads all snps.
    subject_ids: list of str, default None
        Subjects to extract, when present in the dataset (matched against
        the second column of the dataset's iid). None loads all subjects.
    count_A1: bool, default True
        Genotypes are provided as allele counts, A1 if True else A2.

    Return
    ------
    snp_data: pysnptools object
        PLINK data loaded by the 'pysnptools' library.
    """
    # Opening the reader does not read the genotypes yet.
    reader = Bed(path_dataset, count_A1=count_A1)

    if snp_ids is not None:
        wanted_snps = set(snp_ids)
        snp_mask = [(sid in wanted_snps) for sid in reader.sid]
        reader = reader[:, snp_mask]

    if subject_ids is not None:
        wanted_subjects = set(subject_ids)
        subject_mask = [(iid in wanted_subjects) for iid in reader.iid[:, 1]]
        reader = reader[subject_mask, :]

    # Only now are the (filtered) genotypes read.
    return reader.read()
def gen_and_compare(self, output_file, **kwargs):
    """Generate SNP data with ``snp_gen(**kwargs)`` and assert it equals the stored expectation.

    Returns the generated data so callers can inspect it further.
    """
    generated = snp_gen(**kwargs)
    # To regenerate the expected files, write ``generated`` under
    # currentFolder + "/tempdir/" with Bed.write (kept disabled).
    expected = Bed(self.currentFolder + "/expected/" + output_file).read()
    assert TestSnpGen.is_same(generated, expected), "Failure on " + output_file
    return generated
def test_load_and_standardize_hdf5(self):
    """Compare the snp-major HDF5 reader against the iid-major HDF5 reader and against Bed."""
    snp_major = SnpHdf5(self.currentFolder + "/examples/toydata.snpmajor.snp.hdf5")
    iid_major = SnpHdf5(self.currentFolder + "/examples/toydata.iidmajor.snp.hdf5")
    self.load_and_standardize(snp_major, iid_major)

    bed_reference = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
    self.load_and_standardize(snp_major, bed_reference)
def too_slow_test_write_bedbig(self):
    """Write and re-read a 100000 x 50000 zero matrix as Bed; kept out of the default run by its name."""
    from pysnptools.snpreader import SnpData

    iid_count, sid_count = 100000, 50000

    iid_rows = [[str(i), str(i)] for i in range(iid_count)]
    sid_names = ["sid_{0}".format(i) for i in range(sid_count)]
    pos_rows = [[i, i, i] for i in range(sid_count)]
    iid = np.array(iid_rows)
    sid = np.array(sid_names)
    pos = np.array(pos_rows)

    np.random.seed(0)
    snpdata = SnpData(iid, sid, np.zeros((iid_count, sid_count)), pos=pos)

    output = "tempdir/bedbig.{0}.{1}".format(iid_count, sid_count)
    create_directory_if_necessary(output)
    Bed.write(output, snpdata, count_A1=False)

    loaded_back = Bed(output, count_A1=False).read()
    np.testing.assert_array_almost_equal(snpdata.val, loaded_back.val, decimal=10)
def test_subset(self):
    """Every-other-individual subsets of a ``SnpKernel`` must equal the same subset of the read kernel."""
    logging.info("in test_subset")
    reader = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed", count_A1=False)
    snpkernel = SnpKernel(reader, stdizer.Unit())

    from_two_slices = snpkernel[::2, ::2].read()
    expected = reader.read_kernel(stdizer.Unit())[::2].read()
    np.testing.assert_array_almost_equal(from_two_slices.val, expected.val, decimal=10)

    from_one_slice = snpkernel[::2].read()
    np.testing.assert_array_almost_equal(from_one_slice.val, expected.val, decimal=10)
    logging.info("done with test")
def _run_once(self):
    """Open the underlying ``.bed`` file on first use; later calls return immediately.

    Stores a ``Bed`` reader in ``self.local`` and keeps the opened storage
    handle in ``self._file_dict["bed"]``.
    """
    if self._ran_once:
        return
    # Set the flag first, before touching the attributes below.
    self._ran_once = True
    self.row # get row info
    self.col # get col info
    # Name of the .bed file derived from self.path.
    _bed = SnpReader._name_of_other_file(self.path, remove_suffix="bed", add_suffix="bed")
    local_bed = self._storage.open_read(_bed)
    # The context is entered by hand and the handle kept in _file_dict;
    # presumably it is exited elsewhere - TODO confirm.
    self.local = Bed(local_bed.__enter__(), count_A1=True, iid=self.row, sid=self.col, pos=self.col_property, skip_format_check=True)
    self._file_dict["bed"] = local_bed
def test_write_bed_f64cpp_5_python(self):
    """Check that 5 individuals written with ``force_python_only=True`` read back unchanged."""
    iid_index = 5
    full_reader = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
    logging.info("iid={0}".format(iid_index))
    # (Historic note: a disabled variant made sure the individual count
    # was not a multiple of 4.)
    first_rows = full_reader[0:iid_index, :].read(order='F', dtype=np.float64)
    has_rows = first_rows.iid_count > 0
    if has_rows:
        first_rows.val[-1, 0] = float("NAN")

    output = "tempdir/toydata.F64python.{0}".format(iid_index)
    create_directory_if_necessary(output)
    Bed.write(output, first_rows, force_python_only=True)

    from_disk = Bed(output, count_A1=False).read()
    np.testing.assert_array_almost_equal(first_rows.val, from_disk.val, decimal=10)
def __init__(self, path, shape, dtype=np.int8, count_A1=True):
    """Wrap a Bed file as an array-like object of shape ``(n_sid, n_iid, 2)``.

    ``shape`` is ``(n_sid, n_iid)``: number of variants, then number of samples.
    """
    n_sid, n_iid = shape

    # Placeholder axis arrays are handed to Bed so that it does not load
    # the bim/map/fam files entirely into memory (it does not do
    # out-of-core for those).
    placeholder_iid = np.empty((n_iid, 2), dtype="str")  # (n_sample, 2): FID and IID
    placeholder_sid = np.empty((n_sid, ), dtype="str")   # (n_variants,): SNP ids
    placeholder_pos = np.empty((n_sid, 3), dtype="int")  # (n_variants, 3): contig and positions

    self.bed = Bed(
        str(path),
        count_A1=count_A1,
        iid=placeholder_iid,
        sid=placeholder_sid,
        pos=placeholder_pos,
    )
    self.shape = (n_sid, n_iid, 2)
    self.dtype = dtype
    self.ndim = 3
def factory(snpreader, num_snps_in_memory, standardizer, blocksize, count_A1=None):
    """Return ``InMemory`` when all SNPs fit within ``num_snps_in_memory``, else ``FromDisk``.

    A ``str`` ``snpreader`` is treated as a Bed path and opened with ``count_A1``.
    """
    if isinstance(snpreader, str):
        snpreader = Bed(snpreader, count_A1=count_A1)

    fits_in_memory = num_snps_in_memory >= snpreader.sid_count
    if not fits_in_memory:
        return FromDisk(snpreader, num_snps_in_memory, standardizer, blocksize, None)

    standardized = snpreader.read(order='C').standardize(standardizer)
    in_memory = InMemory(standardized, standardizer, blocksize)
    # Freeze the values and expose them directly on the wrapper.
    in_memory._snpreader.val.flags.writeable = False
    in_memory._val = in_memory._snpreader.val
    return in_memory
def getData(filename, mph=3):
    """Open a Bed file and read one column of its ``.fam`` file as the phenotype.

    Parameters
    ----------
    filename : Bed file prefix; ``filename + ".fam"`` is read with ``Pheno``.
    mph : column index into the values read from the ``.fam`` file.
        Defaults to 3, the value that was previously hard-coded.

    Returns
    -------
    ``[y, sFil]``: the selected column with 1 subtracted from each entry,
    and the (unread) ``Bed`` object.
    """
    sFil = Bed(filename, count_A1=False)  # Bed object
    yFil = Pheno(filename + ".fam")
    y = yFil.read().val[:, mph]
    # the last column of .fam file is the disease states of data owners
    y = [i - 1 for i in y]
    return [y, sFil]
def setUpClass(self):
    """Create the temp output directory and open the synthetic SNP and phenotype readers."""
    from fastlmm.util.util import create_directory_if_necessary
    create_directory_if_necessary(self.tempout_dir, isfile=False)

    # Three directories up from this file.
    file_dir = os.path.dirname(os.path.realpath(__file__))
    root = os.path.join(file_dir, "..", "..", "..")
    self.pythonpath = os.path.abspath(root)

    self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all")
    self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")
def _run_once(self):
    """Resolve file-name inputs into loaded objects and finish setup; no-op once completed.

    ``_ran_once`` is ``None`` while setup is in progress and ``True`` when done.
    """
    if self._ran_once:
        return
    self._ran_once = None

    # Inputs given as file names are replaced by their loaded objects.
    if isinstance(self.test_snps, str):
        self.test_snps = Bed(self.test_snps)
    if isinstance(self.G0, str):
        self.G0 = Bed(self.G0)
    if isinstance(self.pheno, str):
        self.pheno = pstpheno.loadOnePhen(self.pheno,vectorize=True) #!! what about missing=-9?
    if isinstance(self.covar, str):
        self.covar = pstpheno.loadPhen(self.covar)#!! what about missing=-9?
    if isinstance(self.G1_or_none, str):
        self.G1_or_none = Bed(self.G1_or_none)

    # Default both sid lists to every test SNP.
    if self.sid_list_0 is None:
        self.sid_list_0 = self.test_snps.sid
    if self.sid_list_1 is None:
        self.sid_list_1 = self.test_snps.sid
    self.set_sid_sets()

    #!!Should fix up to add only of no constant columns - will need to add a test case for this
    bias = np.ones((self.test_snps.iid_count, 1))
    self.covar = bias if self.covar is None else np.hstack((self.covar['vals'],bias))
    self.n_cov = self.covar.shape[1]

    if self.output_file_or_none is None:
        self.__tempdirectory = ".working"
    else:
        self.__tempdirectory = self.output_file_or_none + ".working"

    self._ran_once = True
def gen_and_compare(self, output_file, **kwargs):
    """Generate SNP data from ``kwargs`` and assert equality with the stored reference file.

    Returns the generated data.
    """
    from pysnptools.snpreader import Bed

    produced = snp_gen(**kwargs)
    # To refresh the reference files, write ``produced`` under
    # currentFolder + "/tempdir/" with Bed.write (kept disabled).
    reference_path = self.currentFolder + "/../../tests/datasets/generate/" + output_file
    reference = Bed(reference_path, count_A1=False).read()
    assert produced == reference, "Failure on " + output_file
    return produced
def shuffle_bed(bed_file):
    """
    shuffle the genotypes of individuals snp-by-snp

    Each SNP column is permuted independently in place and the result is
    written to ``bed_file + '_shuffle'``.

    :param bed_file: the prefix for plink binary file
    :return: 1 after the shuffled plink binary file is written; 0 if
        pysnptools cannot be imported
    """
    try:
        from pysnptools.snpreader import Bed
    except Exception as e:
        print(e)
        return 0
    # ``logging.info`` is the logging call; ``logging.INFO`` is the integer
    # level constant and raises TypeError when called.
    logging.info('Read the plink file')
    data = Bed(bed_file, count_A1=False).read()
    num_snp = data.val.shape[1]
    logging.info("Start shuffle the genotypes snp-by-snp")
    for i in tqdm(range(num_snp)):
        # The column slice is shuffled in place.
        np.random.shuffle(data.val[:, i])
    logging.info('Write the shuffled plink file')
    Bed.write(bed_file + '_shuffle', data, count_A1=False)
    return 1
def test_SNC(self):
    """``single_snp`` must reproduce the stored result when SNP #2 is constant."""
    logging.info("TestSNC")
    phenotype = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    covariates = pstpheno.loadPhen(self.cov_fn)

    snc = Bed(self.bedbase, count_A1=False).read()
    snc.val[:, 2] = 0  # make SNP #2 have constant values (aka a SNC)

    frame = single_snp(
        test_snps=snc[:, :10],
        pheno=phenotype,
        G0=snc,
        mixing=0,
        leave_out_one_chrom=False,
        covar=covariates,
        output_file_name=self.file_name("snc"),
        count_A1=False)
    self.compare_files(frame, "snc")
def test_underscore_read2(self):
    """An every-other-row/column subset of an ``Identity`` kernel equals the same subset of ``np.identity``."""
    logging.info("in test_underscore_read2")
    reader = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed", count_A1=False)
    # ``iid`` must be the very same object as ``row``.
    assert reader.iid is reader.row

    identity_kernel = Identity(reader.row)
    subset = identity_kernel[::2, ::2]
    expected = np.identity(identity_kernel.iid_count)[::2, ::2]
    np.testing.assert_array_almost_equal(subset.read().val, expected)
    logging.info("done with test")
def gen_reference(self, load_path):
    """Open ``load_path`` with the reader that matches ``self.gen_type``.

    ``".bed"`` yields a ``Bed`` reader; ``".bgen"`` yields ``Bgen`` when
    ``self._snp_tools`` is truthy and ``BgenObject`` otherwise.

    Raises ``Exception`` for any other ``gen_type``.
    """
    kind = self.gen_type
    if kind == ".bed":
        return Bed(load_path, count_A1=True)
    if kind == ".bgen":
        opener = Bgen if self._snp_tools else BgenObject
        return opener(load_path)
    raise Exception("Unknown load type set")
def sim_zsc(bfile, nsample, start_chrom, end_chrom, pheno, legend, standardize, freq, nblock=40):
    """Compute per-SNP z-scores chromosome by chromosome, reading genotypes in blocks.

    Parameters
    ----------
    bfile : path prefix; chromosome ``i`` is read from ``'{bfile}{i}.bed'``.
    nsample : number of leading individuals to use.
    start_chrom, end_chrom : inclusive chromosome range.
    pheno : phenotype vector that is dotted with each genotype block.
    legend : table with ``shape[0]`` SNPs and a ``'CHR'`` column.
    standardize : if ``True`` genotypes are Unit-standardized; if ``False``
        they are mean-centered and the z-score is scaled by the column
        variance.
    freq : per-SNP frequency, aligned with ``legend``.
    nblock : number of blocks passed to ``create_block``.

    Returns
    -------
    float32 array of z-scores for the SNPs with ``freq > 0.01``.
    """
    zsc_maf_thres = 0.01
    nindv = nsample
    nsnp_all = legend.shape[0]
    zsc = np.zeros(nsnp_all, dtype=np.float32)

    # ``range`` instead of ``xrange`` so this runs on Python 3 as well.
    for i in range(start_chrom, end_chrom + 1):
        snpdata = Bed('{}{}.bed'.format(bfile, i), count_A1=False)
        nsnp = snpdata.sid_count
        blocks = create_block(0, nsnp - 1, nblock)

        snp_idx = np.where(legend['CHR'] == i)[0]
        zsc_chrom = np.zeros(snp_idx.shape[0])
        freq_chrom = freq[snp_idx]

        # Only SNPs above the frequency threshold get a z-score.
        mask_chrom = np.zeros(nsnp, dtype=bool)
        mask_chrom[freq_chrom > zsc_maf_thres] = True

        for blk in blocks:
            mask_chrom_blk = mask_chrom[blk]
            use_idx = blk[mask_chrom_blk == True]
            snpdata_blk = snpdata[0:nindv, use_idx]

            if standardize == False:
                snpdata_blk = snpdata_blk.read(dtype=np.float32).val
            else:
                snpdata_blk = snpdata_blk.read(dtype=np.float32)\
                    .standardize(Unit()).val

            if standardize == False:
                snpdata_blk -= snpdata_blk.mean(axis=0)

            if standardize == True:
                zsc_chrom[use_idx] = np.dot(snpdata_blk.T, pheno) / np.sqrt(nindv)
            else:
                sigmasq = snpdata_blk.var(axis=0)
                zsc_chrom[use_idx] = np.dot(snpdata_blk.T, pheno)
                zsc_chrom[use_idx] /= np.sqrt(nindv * sigmasq)

        zsc[snp_idx] = zsc_chrom

    return zsc[freq > zsc_maf_thres]
def __init__(self, gene, iso, sim):
    """Seed NumPy's RNG, load ``<gene>_AFR.clean`` and store the scaled genotype matrix.

    Each column is centered by twice its allele frequency ``f`` and
    divided by ``sqrt(2 f (1 - f))``.
    """
    np.random.seed(124)
    self.gene = gene
    self.num_iso = iso
    self.num_sim = sim

    raw = Bed(gene + "_AFR.clean", count_A1=False).read().val
    self.n_ind, self.n_SNP = raw.shape
    print(raw.shape)

    # Column sum divided by twice the number of individuals.
    allele_freq = np.sum(raw, axis=0) / (2 * self.n_ind)
    centered = raw - 2 * allele_freq
    self.geno = centered / np.sqrt(2 * allele_freq * (1 - allele_freq))
def test_intersection_Snp2Dist(self):
    """Check the reader types that result from intersecting and slicing a SNP-backed dist reader."""
    from pysnptools.distreader._snp2dist import _Snp2Dist
    from pysnptools.snpreader import Pheno, Bed
    from pysnptools.distreader._subset import _DistSubset
    from pysnptools.snpreader._subset import _SnpSubset
    from pysnptools.util import intersect_apply

    all_snps = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed",count_A1=True)
    dist = all_snps.as_dist(max_weight=2)

    # To test intersection we remove a iid from pheno
    pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")[1:,:]

    intersected,pheno = intersect_apply([dist,pheno])
    # The subsetting must land on the wrapped SNP reader, not on the dist reader.
    assert isinstance(intersected.snpreader,_SnpSubset) and not isinstance(intersected,_DistSubset)

    #What happens with fancy selection?
    sliced = dist[::2,:]
    assert isinstance(sliced,_Snp2Dist)

    logging.info("Done with test_intersection")
def getData(filename="", mph=3, UseCov=False):
    """Open the Bed and phenotype readers for ``filename`` and align their individuals.

    The phenotype comes from ``filename + ".phen"`` when that file exists,
    otherwise from ``filename + ".fam"``.  ``mph`` is accepted but unused.

    Returns ``[yFil, sFil]`` (phenotype reader, SNP reader).
    """
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")

    # When requested and available, the covariates take part in the
    # intersection, but the covariate reader itself is not returned.
    if isfile(filename + ".cov") and UseCov:
        covariates = Pheno(filename + ".cov")
        sFil, yFil, covariates = intersect_apply([sFil, yFil, covariates])

    # A dedicated phenotype file replaces the .fam phenotype.
    if isfile(filename + ".phen"):
        yFil = Pheno(filename + ".phen")
        sFil, yFil = intersect_apply([sFil, yFil])

    return [yFil, sFil]
def test_SNC(self):
    """Check ``single_snp`` output against the stored file when one SNP column is constant."""
    logging.info("TestSNC")
    from pysnptools.snpreader import Bed

    pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    covar = pstpheno.loadPhen(self.cov_fn)

    snc = Bed(self.bedbase).read()
    zeros = [0] * snc.iid_count
    snc.val[:, 2] = zeros  # make SNP #2 have constant values (aka a SNC)

    out_name = self.file_name("snc")
    frame = single_snp(test_snps=snc[:, :10],
                       pheno=pheno,
                       G0=snc,
                       mixing=0,
                       covar=covar,
                       output_file_name=out_name)
    self.compare_files(frame, "snc")
def __init__(self, args):
    """Select SNPs from ``args.bfile`` and compute scores for them.

    A SNP is kept when its allele frequency lies strictly between
    ``args.maf`` and ``1 - args.maf``, when it falls inside the open
    interval (``args.from_bp``, ``args.to_bp``) if both are given, and when
    it is listed in ``args.extract`` if that file is given.

    Raises
    ------
    ValueError
        If ``args.window_type`` is neither ``'BP'`` nor ``'SNP'``.
    """
    if args.window_type not in ['BP', 'SNP']:
        raise ValueError('Window type not supported')

    bed_1 = Bed(args.bfile)
    af1 = self.get_allele_frequency(bed_1, args)
    print(len(af1), "SNPs in file 1")
    snps_1 = (af1 > args.maf) & (af1 < 1 - args.maf)
    print(np.sum(snps_1), "SNPs in file 1 after MAF filter")

    if (args.from_bp is not None) and (args.to_bp is not None):
        k = (bed_1.pos[:, 2] > args.from_bp) & (bed_1.pos[:, 2] < args.to_bp)
        snps_1 = snps_1 & k
    snps_to_use = bed_1.sid[snps_1]

    if args.extract is not None:
        # Read the id list inside ``with`` so the file is always closed.
        with open(args.extract, 'r') as extract_file:
            keep = np.array([l.strip() for l in extract_file])
        snps_to_use = np.intersect1d(snps_to_use, keep)
        print(len(snps_to_use), "SNPs remaining after extraction")

    bed_1_index = np.sort(bed_1.sid_to_index(snps_to_use))
    pos = bed_1.pos[bed_1_index]
    bim_1 = pd.read_table(
        bed_1.filename + '.bim',
        header=None,
        names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    af = af1[bed_1_index]

    if args.afile is not None:
        # Raw string keeps the pattern identical and avoids the invalid
        # '\s' escape.
        a1 = pd.read_table(args.afile,
                           header=None,
                           sep=r'\s*',
                           names=['id1', 'id2', 'theta'])
    else:
        a1 = None

    self.af = af
    self.M = len(bed_1_index)
    self.windows = self.get_windows(pos, args)
    self.chr = pos[:, 0]
    self.pos = pos[:, 2]
    self.id = bed_1.sid[bed_1_index]
    self.A1 = bim_1['a1'].loc[bed_1_index]
    self.A2 = bim_1['a2'].loc[bed_1_index]
    self.scores = self.compute(bed_1, bed_1_index, af, a1, args)
def __init__(self, args):
    """Load genotypes, covariates and phenotype, fit the null model, and run ``compute``.

    SNPs are kept when their allele frequency lies strictly between
    ``args.maf`` and ``1 - args.maf``, optionally restricted to the open
    interval (``args.from_bp``, ``args.to_bp``) on column 2 of ``pos`` and to
    the IDs listed one per line in ``args.extract``.

    The phenotype is read from ``args.phenofile`` or, failing that, from
    ``<args.bfile>.pheno``; if that file cannot be read the process exits
    with status 1.
    """
    self.bed = Bed(args.bfile)
    self.N = self.bed.iid_count

    if args.covfile is not None:
        cov = pd.read_table(args.covfile, header=None)
        self.cov = sm.add_constant(ju._reorder(cov, self.bed.iid))
        self.ncov = self.cov.shape[1]  # + constant
    else:
        self.cov = np.ones((self.N, 1))
        self.ncov = 1  # Constant

    if args.phenofile is not None:
        Y = pd.read_table(args.phenofile, header=None, na_values='-9')
    else:
        try:
            Y = pd.read_table(args.bfile + '.pheno', header=None, na_values='-9')
        except IOError:
            print("Phenotype file not found.")
            exit(1)
    self.Y = ju._reorder(Y, self.bed.iid)

    af = ju.get_allele_frequency(self.bed, args)
    snps = (af > args.maf) & (af < 1 - args.maf)
    if (args.from_bp is not None) and (args.to_bp is not None):
        # Previously this referenced an undefined name `bed` and stored the
        # result in an unused variable, so the range filter never applied.
        k = (self.bed.pos[:, 2] > args.from_bp) & (self.bed.pos[:, 2] < args.to_bp)
        snps = snps & k
    snps_to_use = self.bed.sid[snps]

    if args.extract is not None:
        # Use a context manager so the extract file is closed promptly.
        with open(args.extract, 'r') as extract_file:
            keep = np.array([l.strip() for l in extract_file])
        snps_to_use = np.intersect1d(snps_to_use, keep)

    # Sorted so the retained SNPs stay in file order.
    self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use))
    pos = self.bed.pos[self.bed_index]
    bim = pd.read_table(self.bed.filename + '.bim', header=None,
                        names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    self.af = af[self.bed_index]
    self.M = len(self.bed_index)
    self.windows = ju.get_windows(pos, self.M, args.window_size, args.window_type)
    self.pos = pos[:, 2]
    self.chr = pos[:, 0]
    self.id = self.bed.sid[self.bed_index]
    self.A1 = bim['a1'].loc[self.bed_index]
    self.A2 = bim['a2'].loc[self.bed_index]
    self.logistic = False
    self.chimin = stats.chi2.ppf(1 - args.minp, 2)

    # Fit null: logistic when not forced linear and the phenotype lies in [0, 1].
    if (not args.linear) and (self.Y.min() >= 0 and self.Y.max() <= 1):
        self.null = sm.Logit(self.Y, self.cov, missing='drop').fit(disp=0)
        self.logistic = True
    else:
        self.null = sm.OLS(self.Y, self.cov, missing='drop').fit(disp=0)

    # With real covariates, replace them by the null model's fitted values
    # (plus a constant column).
    if self.ncov > 1:
        self.cov = sm.add_constant(self.null.fittedvalues)

    self.marg_res, self.joint_res = self.compute(args)
def compute(pgs, bedfile=None, bgenfile=None, par_gts_f=None, ped=None, sib=False, compute_controls=False, verbose=True):
    """Compute a polygenic score (PGS) for the individuals with observed genotypes and observed/imputed parental genotypes.

    Args:
        pgs : :class:`snipar.pgs`
            the PGS, defined by the weights for a set of SNPs and the alleles of those SNPs
        bedfile : :class:`str`
            path to bed file with observed genotypes. At least one of ``bedfile`` and ``bgenfile`` must be given.
        bgenfile : :class:`str`
            path to bgen file with observed genotypes. When given, its variant IDs are the ones checked for
            overlap with the weights (rsids are used if all IDs are identical).
        par_gts_f : :class:`str`
            path to HDF5 file with imputed parental genotypes
        ped
            passed through unchanged to ``get_gts_matrix``
        sib : :class:`bool`
            Compute the PGS for genotyped individuals with at least one genotyped sibling and observed/imputed parental genotypes. Default False.
        compute_controls : :class:`bool`
            Compute polygenic scores for control families (families with observed parental genotypes set to missing). Default False.
        verbose : :class:`bool`
            passed through unchanged to ``get_gts_matrix``

    Returns:
        pg : :class:`snipar.gtarray`
            Return the polygenic score as a genotype array with columns: individual's PGS, mean of their siblings' PGS, observed/imputed paternal PGS, observed/imputed maternal PGS.
            When ``compute_controls`` is set, a list of four such results is returned instead.
            ``None`` is returned when no variant in the weights is present in the observed genotypes.

    Raises:
        ValueError: if neither ``bedfile`` nor ``bgenfile`` is provided.
    """
    if bedfile is None and bgenfile is None:
        # Previously this fell through to an unbound `snp_ids` below.
        raise ValueError('Must provide one of bedfile or bgenfile')

    # Check for SNP overlap
    if bedfile is not None:
        bed = Bed(bedfile, count_A1=True)
        snp_ids = bed.sid
    if bgenfile is not None:
        bgen = open_bgen(bgenfile)
        snp_ids = bgen.ids
        # Fall back to rsids when the ids field carries a single repeated value.
        if np.unique(snp_ids).shape[0] == 1:
            snp_ids = bgen.rsids
    snp_set = set(snp_ids)
    if not any(x in snp_set for x in pgs.snp_ids):
        print('No overlap between variants in weights file and observed genotypes')
        return None

    # Get genotype matrix
    G = get_gts_matrix(bedfile=bedfile, bgenfile=bgenfile, par_gts_f=par_gts_f, ped=ped,
                       snp_ids=pgs.snp_ids, sib=sib, compute_controls=compute_controls, verbose=verbose)
    if sib:
        cols = np.array(['proband', 'sibling', 'paternal', 'maternal'])
    else:
        cols = np.array(['proband', 'paternal', 'maternal'])

    if not compute_controls:
        return pgs.compute(G, cols)

    # With controls, G is indexed as a sequence: the first three entries use
    # the regular columns and the fourth uses a combined 'parental' column.
    pgs_out = [pgs.compute(x, cols) for x in G[0:3]]
    if sib:
        o_cols = np.array(['proband', 'sibling', 'parental'])
    else:
        o_cols = np.array(['proband', 'parental'])
    pgs_out.append(pgs.compute(G[3], o_cols))
    return pgs_out
def get_beta_tildes(bed_file, mean_std_file, betas1, betas2, h1, h2, chunk_size_snp):
    """Rescale two sets of effect sizes so their genetic scores have variance h1 and h2.

    The genotype matrix is read ``chunk_size_snp`` SNPs at a time, missing
    values are replaced by the per-SNP mean, each SNP is standardized with
    the mean/std columns of ``mean_std_file`` (tab-delimited), and the
    standardized genotypes are multiplied with each beta vector. Each beta
    vector is then multiplied by ``sqrt(h / var(score))``.

    Returns:
        Tuple ``(beta_tildes1, beta_tildes2)`` of re-scaled effect sizes.
    """
    genotypes = Bed(bed_file, count_A1=False)
    # Per-SNP means and standard deviations.
    snp_stats = pd.read_csv(mean_std_file, delimiter='\t')

    n_individuals = genotypes.row_count
    n_snps = genotypes.col_count

    # Running dot products of the standardized genotypes with each beta vector.
    score1 = np.zeros(n_individuals)
    score2 = np.zeros(n_individuals)

    for start in range(0, n_snps, chunk_size_snp):
        chunk = slice(start, start + chunk_size_snp)

        block = genotypes[:, chunk].read().val
        means = snp_stats['mean'][chunk].values
        stds = snp_stats['std'][chunk].values

        # Replace missing genotypes with the mean of their SNP (column).
        missing_rows, missing_cols = np.where(np.isnan(block))
        block[missing_rows, missing_cols] = means[missing_cols]

        standardized = np.nan_to_num((block - means) / stds)

        score1 += np.dot(standardized, betas1[chunk])
        score2 += np.dot(standardized, betas2[chunk])

    # Scale so that the variance of each score equals the heritability.
    k1 = h1 / np.var(score1)
    k2 = h2 / np.var(score2)
    return math.sqrt(k1) * betas1, math.sqrt(k2) * betas2
def _snps_fixup(snp_input, iid_if_none=None): if isinstance(snp_input, str): return Bed(snp_input) if snp_input is None: assert iid_if_none is not None, "snp_input cannot be None here" return SnpData( iid_if_none, sid=np.empty((0), dtype='str'), val=np.empty((len(iid_if_none), 0)), pos=np.empty((0, 3)), parent_string="") #todo: make a static factory method on SnpData return snp_input
def test_gen5(self):
    """Generate "gen5" with seed 5 and check it differs from the stored "gen2" data."""
    generation_args = dict(fst=.1,
                           dfr=.5,
                           iid_count=200,
                           sid_count=20,
                           maf_low=.05,
                           maf_high=.4,
                           seed=5)
    gen_snpdata = self.gen_and_compare("gen5", **generation_args)

    reference_path = self.currentFolder + "/expected/gen2"
    ref_snpdata = Bed(reference_path).read()

    matches_reference = TestSnpGen.is_same(gen_snpdata, ref_snpdata)
    assert not matches_reference, "Expect different seeds to produce different results"
def gen_Test_Bed(filename, n0, n1, m):
    """Write a synthetic Bed file set and then set the sixth column of its .fam file.

    The first ``n1`` individuals get genotype 2.0 at every one of the ``m``
    SNPs and value "2" in the sixth .fam column; the remaining ``n0``
    individuals get genotype 0.0 and value "1".

    Args:
        filename: path passed to ``Bed.write``; ``filename + ".fam"`` is rewritten.
        n0: number of individuals with all-0.0 genotypes.
        n1: number of individuals with all-2.0 genotypes.
        m: number of SNPs.
    """
    n = n0 + n1
    iid = [["fam_" + str(i), "iid_" + str(i)] for i in range(n)]
    sid = ["snp_" + str(i) for i in range(m)]

    X = [[2.0] * m for _ in range(n1)]
    X.extend([0.0] * m for _ in range(n0))

    dat = SnpData(iid=iid, sid=sid, val=X)
    Bed.write(filename, dat)

    fam_path = filename + ".fam"
    # Context managers make sure the handles are closed even if a line is
    # malformed and the rewrite fails part-way.
    with open(fam_path) as fil:
        lines = fil.readlines()

    with open(fam_path, "w") as fil:
        for i, line in enumerate(lines):
            fields = line.strip().split()
            fields[5] = "2" if i < n1 else "1"
            fil.write(" ".join(fields) + "\n")
def cluster_data(snpreader):
    """
    Hierarchically cluster the rows of a SNP data set and show the result.

    ``snpreader`` may be a reader object or a string, in which case it is
    opened with ``Bed``. The figure shows a dendrogram on the left, the same
    dendrogram on top, and the pairwise Euclidean distance matrix with rows
    and columns reordered by the left dendrogram's leaves.
    """
    from sklearn.metrics.pairwise import euclidean_distances

    def _hide_ticks(axes):
        axes.set_xticks([])
        axes.set_yticks([])

    if isinstance(snpreader, str):
        snpreader = Bed(snpreader)
    standardized = snpreader.read().standardize().val

    # Pairwise distances between all rows.
    distances = euclidean_distances(standardized, standardized)

    fig = pylab.figure(figsize=(8, 8))

    # Left dendrogram.
    left_axes = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    linkage = fc.linkage(distances, method='average')  #method="centroid" is cubic!
    left_dendrogram = sch.dendrogram(linkage, orientation='right')
    _hide_ticks(left_axes)

    # Top dendrogram, drawn from the same linkage.
    top_axes = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    sch.dendrogram(linkage)
    _hide_ticks(top_axes)

    # Distance matrix, with both axes in the left dendrogram's leaf order.
    matrix_axes = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    leaf_order = left_dendrogram['leaves']
    reordered = distances[leaf_order, :][:, leaf_order]
    matrix_axes.matshow(reordered, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
    _hide_ticks(matrix_axes)

    pylab.show()
def test1(self):
    """Exercise SnpMemMap: create, fill, flush, reopen, and write from Bed and SnpData.

    The working directory is switched to this file's directory for the
    duration of the test and restored afterwards, even if an assertion fails.
    """
    from pysnptools.snpreader import Bed, SnpMemMap
    from pysnptools.util import example_file  # Download and return local file name

    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        filename2 = "tempdir/tiny.snp.memmap"
        pstutil.create_directory_if_necessary(filename2)
        snpreader2 = SnpMemMap.empty(iid=[['fam0', 'iid0'], ['fam0', 'iid1']],
                                     sid=['snp334', 'snp349', 'snp921'],
                                     filename=filename2,
                                     order="F",
                                     dtype=np.float64)
        assert isinstance(snpreader2.val, np.memmap)
        snpreader2.val[:, :] = [[0., 2., 0.], [0., 1., 2.]]
        assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        snpreader2.flush()
        # The values must still be memory-mapped and readable after a flush.
        assert isinstance(snpreader2.val, np.memmap)
        assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        snpreader2.flush()

        # Reopen the same file through the constructor.
        snpreader3 = SnpMemMap(filename2)
        assert np.array_equal(snpreader3[[1], [1]].read(view_ok=True).val,
                              np.array([[1.]]))
        assert isinstance(snpreader3.val, np.memmap)

        logging.info("in TestSnpMemMap test1")
        snpreader = SnpMemMap('tempdir/tiny.snp.memmap')
        assert snpreader.iid_count == 2
        assert snpreader.sid_count == 3
        assert isinstance(snpreader.val, np.memmap)

        snpdata = snpreader.read(view_ok=True)
        assert isinstance(snpdata.val, np.memmap)

        bed_file = example_file("pysnptools/examples/toydata.5chrom.*", "*.bed")
        bed = Bed(bed_file)
        pstutil.create_directory_if_necessary(
            "tempdir/toydata.5chrom.snp.memmap"
        )  #LATER should we just promise to create directories?
        SnpMemMap.write("tempdir/toydata.5chrom.snp.memmap",
                        bed)  # Write bed in SnpMemMap format
        SnpMemMap.write(
            "tempdir/toydata.5chromsnpdata.snp.memmap",
            bed[:, ::2].read())  # Write snpdata in SnpMemMap format
    finally:
        os.chdir(old_dir)
def factory_iterator():
    """Yield NaNCNCTestCases for every combination of index lists, standardizer, reader, dtype, order and force_python_only.

    For each (iid index list, snp index list, standardizer) a reference
    result is computed from the Bed reader with float64 / "C" order; every
    yielded test case carries that reference.

    The working directory is switched to this file's directory while the
    generator runs and restored when it finishes or is closed.
    """
    snp_reader_factory_bed = lambda: Bed("examples/toydata", count_A1=False)
    snp_reader_factory_snpmajor_hdf5 = lambda: SnpHdf5(
        "examples/toydata.snpmajor.snp.hdf5")
    snp_reader_factory_iidmajor_hdf5 = lambda: SnpHdf5(
        "examples/toydata.iidmajor.snp.hdf5")
    snp_reader_factory_dat = lambda: Dat("examples/toydata.dat")

    reader_factories = [
        snp_reader_factory_bed, snp_reader_factory_snpmajor_hdf5,
        snp_reader_factory_iidmajor_hdf5, snp_reader_factory_dat
    ]

    previous_wd = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    try:
        snpreader0 = snp_reader_factory_bed()
        S_original = snpreader0.sid_count
        N_original = snpreader0.iid_count
        snps_to_read_count = min(S_original, 100)

        # Floor division keeps the range() arguments integral on Python 3.
        iid_index_lists = [
            list(range(N_original)),
            list(range(N_original // 2)),
            list(range(N_original - 1, 0, -2))
        ]
        snp_index_lists = [
            list(range(snps_to_read_count)),
            list(range(snps_to_read_count // 2)),
            list(range(snps_to_read_count - 1, 0, -2))
        ]

        for iid_index_list in iid_index_lists:
            for snp_index_list in snp_index_lists:
                for standardizer in [Unit(), Beta(1, 25)]:
                    # NOTE(review): force_python_only is passed as the string
                    # "False", which is truthy -- confirm whether the boolean
                    # False was intended.
                    reference_snps, reference_dtype = NaNCNCTestCases(
                        iid_index_list, snp_index_list, standardizer,
                        snp_reader_factory_bed(), sp.float64, "C", "False",
                        None, None).read_and_standardize()
                    for snpreader_factory in reader_factories:
                        for dtype in [sp.float64, sp.float32]:
                            for order in ["C", "F"]:
                                for force_python_only in [False, True]:
                                    snpreader = snpreader_factory()
                                    test_case = NaNCNCTestCases(
                                        iid_index_list, snp_index_list,
                                        standardizer, snpreader, dtype, order,
                                        force_python_only, reference_snps,
                                        reference_dtype)
                                    yield test_case
    finally:
        os.chdir(previous_wd)
def __init__(self, snp_fn, out_prefix):
    """Store the SNP file name and output prefix and open the SNP file with ``Bed``.

    ``eigen_fn`` is the SNP file name with ``"_pcs.pickle"`` appended;
    ``force_recompute`` starts out False.
    """
    from pysnptools.snpreader import Bed

    self.force_recompute = False
    self.out_prefix = out_prefix

    self.snp_fn = snp_fn
    self.snp_reader = Bed(snp_fn)
    self.eigen_fn = self.snp_fn + "_pcs.pickle"
def genPheno(filename="../thinFam", per=.5, savename="fakePheno.txt", c=2.0, num=5):
    """Pick ``num`` random SNP columns from a Bed file, clean them and print diagnostics.

    Values are clamped below at 0.0 and anything that is then not exactly
    0.0, 1.0 or 2.0 is replaced by 0.0.

    ``per``, ``savename`` and ``c`` are accepted but not used in this body.
    Nothing is returned; the function only prints.
    """
    def _clean(value):
        # Clamp negatives to 0.0, then zero out any remaining non-genotype value.
        value = max(value, 0.0)
        return value if value in (1.0, 0.0, 2.0) else 0.0

    sFil = Bed(filename)
    D = sFil.read().val
    m = len(D[0])
    n = len(D)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(m)
    print(n)

    # Column indices drawn with replacement from [0, m-1].
    I = [rand.randint(0, m - 1) for _ in range(num)]
    SNP = [[D[j][i] for j in range(n)] for i in I]

    print(len(I))
    print(len(SNP))
    print(len(SNP[0]))
    print(n)
    print(min([len(s) for s in SNP]))
    print(SNP)

    SNP = [[_clean(value) for value in s] for s in SNP]
    print([list(set(s)) for s in SNP])
def test_match_cpp(self):
    r'''
    match
        FaSTLMM.207\Data\DemoData>fastlmmc -snpPairs -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.pairs.txt -logDelta 0 -verbose 100

    Every SNP pair in the reference file must appear in the Python result
    (in either order) with a p-value whose relative difference is below 0.035.
    '''
    logging.info("TestEpistasis test_match_cpp")
    from pysnptools.snpreader import Bed
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")
    sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
    sim_idx = snps.sid_to_index(sim_sid)
    test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
    test_idx = snps.sid_to_index(test_sid)

    frame = epistasis(snps[:,test_idx], pheno,covar=covar, G0 = snps[:,sim_idx],log_delta=0)
    sid0,sid1,pvalue_list = np.array(frame['SNP0']),np.array(frame['SNP1']),np.array(frame['PValue'])

    referenceOutfile = TestFeatureSelection.reference_file("epistasis/topsnps.pairs.txt")

    import pandas as pd
    table = pd.read_table(referenceOutfile,sep="\t") # We've manually remove all comments and blank lines from this file
    assert len(pvalue_list) == len(table)
    for row in table.iterrows():
        snp0cpp,snp1cpp,pvaluecpp,i1,i2 = row[1]
        found = False
        # zip replaces the Python-2-only xrange index loop.
        for snp0py, snp1py, pvaluepy in zip(sid0, sid1, pvalue_list):
            # The pair may be listed in either order.
            if (snp0py == snp0cpp and snp1py == snp1cpp) or (snp0py == snp1cpp and snp1py == snp0cpp):
                found = True
                diff = abs(pvaluecpp - pvaluepy)/pvaluecpp
                assert diff < .035, "'{0}' '{1}' pvalue_list differ too much {4} -- {2} vs {3}".format(snp0cpp,snp1cpp,pvaluecpp,pvaluepy,diff)
                break
        assert found
def __init__(self, args):
    """Load genotypes and covariates, select SNPs, build windows and run ``sample``.

    SNPs are kept when their allele frequency lies strictly between
    ``args.maf`` and ``1 - args.maf``, optionally restricted to the open
    interval (``args.from_bp``, ``args.to_bp``) on column 2 of ``pos`` and to
    the IDs listed one per line in ``args.extract``.

    After sampling, ``JMinP`` and ``ZMinP`` hold chi-square survival values
    of the sampled maximum statistics (2 and 1 degrees of freedom) and
    ``minP`` is their element-wise minimum.
    """
    self.bed = Bed(args.bfile)
    self.N = self.bed.iid_count

    if args.covfile is not None:
        cov = pd.read_table(args.covfile, header=None)
        self.cov = sm.add_constant(ju._reorder(cov, self.bed.iid))
        self.ncov = self.cov.shape[1]  # + constant
    else:
        self.cov = np.ones((self.N, 1))
        self.ncov = 1  # Constant

    af = ju.get_allele_frequency(self.bed, args)
    snps = (af > args.maf) & (af < 1 - args.maf)
    if (args.from_bp is not None) and (args.to_bp is not None):
        # Previously this referenced an undefined name `bed` and stored the
        # result in an unused variable, so the range filter never applied.
        k = (self.bed.pos[:, 2] > args.from_bp) & (self.bed.pos[:, 2] < args.to_bp)
        snps = snps & k
    snps_to_use = self.bed.sid[snps]

    if args.extract is not None:
        # Use a context manager so the extract file is closed promptly.
        with open(args.extract, 'r') as extract_file:
            keep = np.array([l.strip() for l in extract_file])
        snps_to_use = np.intersect1d(snps_to_use, keep)

    # Sorted so the retained SNPs stay in file order.
    self.bed_index = np.sort(self.bed.sid_to_index(snps_to_use))
    pos = self.bed.pos[self.bed_index]
    bim = pd.read_table(self.bed.filename + '.bim', header=None,
                        names=['chm', 'id', 'pos_mb', 'pos_bp', 'a1', 'a2'])
    self.af = af[self.bed_index]
    self.M = len(self.bed_index)
    self.windows = ju.get_windows(pos, self.M, args.window_size, args.window_type)
    self.sample_windows = ju.get_windows(pos, self.M, args.sample_window_size,
                                         args.sample_window_type)
    self.pos = pos[:, 2]
    self.chr = pos[:, 0]
    self.id = self.bed.sid[self.bed_index]
    self.A1 = bim['a1'].loc[self.bed_index]
    self.A2 = bim['a2'].loc[self.bed_index]
    self.numSamples = args.numSamples

    self.JMaxStats, self.ZMaxStats = self.sample(args)
    self.JMinP = stats.chi2.sf(self.JMaxStats, 2)
    # The Z statistics are squared before the 1-df chi-square lookup.
    self.ZMinP = stats.chi2.sf(self.ZMaxStats**2, 1)
    self.minP = np.minimum(self.JMinP, self.ZMinP)