def test_two(self):
    '''
    Lock in results on arbitrary data -- because meaningful runs take too long to run.

    Runs heritability_spatial_correction on a 10-entry slice of
    self.snpreader_whole, writes the resulting frame to a tab-separated
    file and compares that file with the stored reference
    "heritability_spatial_correction/two.txt".
    '''
    fn = "two.txt"
    logging.info(fn)
    tmpOutfile = self.file_name(fn)

    # Keep only the first 10 entries along the reader's first axis.
    snpreader = self.snpreader_whole[:10, :]
    # `range` rather than the Python-2-only `xrange`: works on both versions.
    spatial_coor = [[i, -i] for i in range(snpreader.iid_count)]
    # Two log-spaced values spanning 10**2 .. 4000, truncated to int.
    alpha_list = [int(v) for v in np.logspace(2, np.log10(4000), 2)]

    dataframe = heritability_spatial_correction(
        snpreader, spatial_coor, snpreader.iid, alpha_list, 2, self.pheno_whole,
        jackknife_count=2, permute_plus_count=1, permute_times_count=1,
        just_testing=False)
    dataframe.to_csv(tmpOutfile, sep="\t", index=False)

    referenceOutfile = TestFeatureSelection.reference_file(
        "heritability_spatial_correction/" + fn)
    # `tolerance` is not defined in this method; it must come from an enclosing scope.
    out, msg = ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
    self.assertTrue(
        out,
        "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))
def test_match_cpp(self):
    r'''
    match
    FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

    single_snp is run with the similarity SNPs passed once as G0 (identity
    kernel as G1) and once as G1 (identity kernel as G0), each time with
    h2=.5 and with log_delta=0. Every resulting frame must agree with the
    reference p-values to within a relative difference of 3.5%.
    '''
    # The docstring is a raw string because it contains a Windows path: in a
    # normal literal "\b" would become a backspace and "\D", "\w", "\c", "\."
    # are invalid escape sequences.
    logging.info("TestSingleSnp test_match_cpp")
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"), count_A1=False)
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")

    sim_sid = [
        "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28", "snp63751_m0_.23m1_.23",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp11253_m0_.2m1_.2",
        "snp86250_m0_.33m1_.33", "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
        "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19", "snp67501_m0_.15m1_.15",
        "snp63750_m0_.28m1_.28", "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35",
        "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37", "snp15002_m0_.11m1_.11",
        "snp3751_m0_.34m1_.34", "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
        "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11", "isnp23145_m0_.2m1_.2",
        "snp60001_m0_.39m1_.39", "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2",
        "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13",
    ]
    sim_idx = snps.sid_to_index(sim_sid)

    test_sid = [
        "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23", "snp82500_m0_.28m1_.28",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp86250_m0_.33m1_.33",
        "snp15002_m0_.11m1_.11", "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
        "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2", "snp67501_m0_.15m1_.15",
        "snp3753_m0_.23m1_.23", "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39",
        "snp30002_m0_.25m1_.25",
    ]
    test_idx = snps.sid_to_index(test_sid)

    # The reference table does not depend on the loop variables, so read it once.
    referenceOutfile = TestFeatureSelection.reference_file("single_snp/topsnps.single.txt")
    reference = pd.read_table(referenceOutfile, sep="\t")  # We've manually remove all comments and blank lines from this file

    for G0, G1 in [(snps[:, sim_idx], KernelIdentity(snps.iid)),
                   (KernelIdentity(snps.iid), snps[:, sim_idx])]:
        frame_h2 = single_snp(test_snps=snps[:, test_idx], pheno=pheno, G0=G0, G1=G1,
                              covar=covar, h2=.5, leave_out_one_chrom=False, count_A1=False)
        frame_log_delta = single_snp(test_snps=snps[:, test_idx], pheno=pheno, G0=G0, G1=G1,
                                     covar=covar, log_delta=0, leave_out_one_chrom=False, count_A1=False)
        for frame in [frame_h2, frame_log_delta]:
            assert len(frame) == len(reference), \
                "row count differs from file '{0}'".format(referenceOutfile)
            for _, row in reference.iterrows():
                sid = row.SNP
                # The reference column is spelled "Pvalue", the computed one "PValue".
                pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                reldiff = abs(row.Pvalue - pvalue) / row.Pvalue
                assert reldiff < .035, "'{0}' pvalue_list differ too much {4} -- {2} vs {3}".format(sid, None, row.Pvalue, pvalue, reldiff)
def compare_files(self, sid0_list, sid1_list, pvalue_list, ref_base):
    '''
    Check pairwise p-values against the reference file "epistasis/<ref_base>.txt".

    sid0_list, sid1_list and pvalue_list are parallel sequences: entry i
    gives the two SNP ids of a pair and that pair's p-value. In the
    reference file the first row is skipped; columns 0 and 4 hold the two
    SNP ids and column 8 the p-value. Pairs are matched regardless of the
    order of their two ids.

    Fails with AssertionError if the number of pairs differs or a p-value
    is off by 1e-5 or more.
    '''
    def ordered_pair(first, second):
        # Orientation-independent key: the smaller id goes first.
        return (first, second) if first < second else (second, first)

    reffile = TestFeatureSelection.reference_file("epistasis/" + ref_base + ".txt")

    pvalue_by_pair = {
        ordered_pair(first, sid1_list[pos]): pvalue_list[pos]
        for pos, first in enumerate(sid0_list)
    }

    # Every field is loaded as a string; '#' is not treated as a comment marker.
    reference = sp.loadtxt(reffile, dtype='str', comments=None, skiprows=1)
    expected_count = len(reference)
    assert len(pvalue_list) == expected_count, "# of pairs differs from file '{0}'".format(reffile)

    for fields in reference:
        key = ordered_pair(fields[0], fields[4])
        delta = abs(float(fields[8]) - pvalue_by_pair[key])
        assert delta < 1e-5, "pair {0} differs too much from file '{1}'".format(key, reffile)
def compare_files(self, frame, ref_base):
    '''
    Compare the p-values in `frame` with the reference file "single_snp/<ref_base>.txt".

    `frame` must have 'SNP' and 'PValue' columns; the reference file is read
    with the single-whitespace regex separator and must provide the same
    columns. For every reference row, the first row of `frame` with the
    same SNP id is looked up and its p-value compared.

    Raises AssertionError if the row counts differ, and Exception if a
    p-value is not within 1e-5 of the reference (this includes NaN).
    '''
    reffile = TestFeatureSelection.reference_file("single_snp/" + ref_base + ".txt")

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    reference = pd.read_csv(reffile, delimiter=r'\s', comment=None, engine='python')
    assert len(frame) == len(reference), "# of pairs differs from file '{0}'".format(reffile)

    for _, row in reference.iterrows():
        sid = row.SNP
        pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
        diff = abs(row.PValue - pvalue)
        # A single negated comparison covers "too large", "exactly at the
        # tolerance" and NaN (NaN < x is always False), so every failure
        # carries the same informative message.
        if not diff < 1e-5:
            raise Exception(
                "pair {0} differs too much from file '{1}'".format(sid, reffile))
def compare_files(self, frame, ref_base):
    '''
    Compare the p-values in `frame` with the reference file "single_snp_select/<ref_base>.txt".

    `frame` must have 'SNP' and 'PValue' columns. The reference file may
    spell its p-value column either 'PValue' or 'Pvalue'. For every
    reference row, the first row of `frame` with the same SNP id is looked
    up and its p-value compared.

    Fails with AssertionError if the row counts differ or a p-value is off
    by 1e-5 or more.
    '''
    reffile = TestFeatureSelection.reference_file("single_snp_select/" + ref_base + ".txt")

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    reference = pd.read_csv(reffile, delimiter=r'\s', comment=None, engine='python')
    if 'Pvalue' in reference.columns:
        # add a new column with different capitalization if it is there
        reference['PValue'] = reference.Pvalue
    assert len(frame) == len(reference), "# of pairs differs from file '{0}'".format(reffile)

    for _, row in reference.iterrows():
        sid = row.SNP
        pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
        assert abs(row.PValue - pvalue) < 1e-5, "pair {0} differs too much from file '{1}'".format(sid, reffile)
def compare_files(self,answer,ref_base):
    '''
    Check `answer` against the reference data stored in "fastlmm/<ref_base>.dat".

    The `col` and `row` attributes must equal those of the reference, and
    every entry of `answer.val` must lie within 1e-4 of the matching
    reference entry. Any violation fails with AssertionError.
    '''
    reffile = TestFeatureSelection.reference_file("fastlmm/"+ref_base+".dat") #Uses same results folder as lmm_train
    expected = Dat(reffile).read()

    assert np.array_equal(answer.col,expected.col), "sid differs. File '{0}'".format(reffile)
    assert np.array_equal(answer.row,expected.row), "iid differs. File '{0}'".format(reffile)

    # Walk all (row, column) positions of the reference in row-major order.
    for iid_index, sid_index in np.ndindex(expected.row_count, expected.col_count):
        gap = abs(answer.val[iid_index,sid_index] - expected.val[iid_index,sid_index])
        assert gap < 1e-4, "Value at {0},{1} differs too much from file '{2}'".format(iid_index,sid_index,reffile)
def compare_files(self,answer,ref_base):
    '''
    Check `answer` against the reference data stored in "fastlmm/<ref_base>.dat".

    The `col` and `row` attributes of `answer` must equal those of the
    reference, and every value must be within 1e-4 of the corresponding
    reference value. Any violation fails with AssertionError.
    '''
    reffile = TestFeatureSelection.reference_file("fastlmm/"+ref_base+".dat") #Uses same results folder as lmm_train
    reference=Dat(reffile).read()
    assert np.array_equal(answer.col,reference.col), "sid differs. File '{0}'".format(reffile)
    assert np.array_equal(answer.row,reference.row), "iid differs. File '{0}'".format(reffile)
    # `range` rather than the Python-2-only `xrange`: works on both versions.
    for iid_index in range(reference.row_count):
        for sid_index in range(reference.col_count):
            a_v = answer.val[iid_index,sid_index]
            r_v = reference.val[iid_index,sid_index]
            assert abs(a_v - r_v) < 1e-4, "Value at {0},{1} differs too much from file '{2}'".format(iid_index,sid_index,reffile)
def compare_files(self,frame,ref_base):
    '''
    Compare the p-values in `frame` with the reference file "single_snp/<ref_base>.txt".

    `frame` and the reference must both have 'SNP' and 'PValue' columns.
    For every reference row, the first row of `frame` with the same SNP id
    is looked up and its p-value compared.

    Fails with AssertionError if the row counts differ or a p-value is off
    by 1e-5 or more.
    '''
    reffile = TestFeatureSelection.reference_file("single_snp/"+ref_base+".txt")

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    reference=pd.read_csv(reffile,delimiter=r'\s',comment=None,engine='python')
    assert len(frame) == len(reference), "# of pairs differs from file '{0}'".format(reffile)
    for _, row in reference.iterrows():
        sid = row.SNP
        pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
        assert abs(row.PValue - pvalue) < 1e-5, "pair {0} differs too much from file '{1}'".format(sid,reffile)
def compare_files(self,frame,ref_base):
    '''
    Compare the p-values in `frame` with a stored reference table.

    The reference is read from "single_snp/<ref_base>.txt". Both tables
    must have 'SNP' and 'PValue' columns. For every reference row, the
    first row of `frame` with the same SNP id is looked up and its p-value
    compared.

    Fails with AssertionError if the row counts differ or a p-value is off
    by 1e-5 or more.
    '''
    reffile = TestFeatureSelection.reference_file("single_snp/"+ref_base+".txt") #Results are in single_snp, not single_snp_lin_reg

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    reference=pd.read_csv(reffile,delimiter=r'\s',comment=None,engine='python')
    assert len(frame) == len(reference), "# of pairs differs from file '{0}'".format(reffile)
    for _, row in reference.iterrows():
        sid = row.SNP
        pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
        assert abs(row.PValue - pvalue) < 1e-5, "pair {0} differs too much from file '{1}'".format(sid,reffile)
def compare_files(self,frame,ref_base):
    '''
    Compare the p-values in `frame` with the reference file "single_snp/<ref_base>.txt".

    Both tables must have 'SNP' and 'PValue' columns. They are indexed by
    SNP id so the p-values are subtracted SNP-by-SNP, whatever the row
    order. `frame` itself is left unmodified.

    Raises AssertionError if the row counts differ, and Exception listing
    the offending SNPs if any difference is not within 1e-5. That includes
    NaN differences, e.g. from a SNP id present in only one of the tables.
    '''
    reffile = TestFeatureSelection.reference_file("single_snp/"+ref_base+".txt")

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    reference=pd.read_csv(reffile,delimiter=r'\s',comment=None,engine='python')
    assert len(frame) == len(reference), "# of pairs differs from file '{0}'".format(reffile)

    # Non-inplace set_index returns re-indexed copies, so the caller's frame
    # keeps its 'SNP' column.
    diff = frame.set_index('SNP').PValue - reference.set_index('SNP').PValue
    # Negate the pass condition instead of testing "> 1e-5": NaN compares
    # False either way, and a NaN difference must count as a mismatch.
    bad = diff[~(np.abs(diff) <= 1e-5)]
    if len(bad) > 0:
        raise Exception("snps differ too much from file '{0}' at these snps {1}".format(reffile,bad))
def compare_files(self,frame,ref_base):
    '''
    Compare the p-values in `frame` with the reference file "single_snp_all_plus_select/<ref_base>.txt".

    `frame` must have 'SNP' and 'PValue' columns. The reference file may
    spell its p-value column either 'PValue' or 'Pvalue'. For every
    reference row, the first row of `frame` with the same SNP id is looked
    up and its p-value compared.

    Fails with AssertionError if the row counts differ or a p-value is off
    by 1e-5 or more.
    '''
    reffile = TestFeatureSelection.reference_file("single_snp_all_plus_select/"+ref_base+".txt")

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    reference=pd.read_csv(reffile,delimiter=r'\s',comment=None,engine='python')
    if 'Pvalue' in reference.columns:
        reference['PValue']=reference.Pvalue #add a new column with different capitalization if it is there
    assert len(frame) == len(reference), "# of pairs differs from file '{0}'".format(reffile)
    for _, row in reference.iterrows():
        sid = row.SNP
        pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
        assert abs(row.PValue - pvalue) < 1e-5, "pair {0} differs too much from file '{1}'".format(sid,reffile)
def test_two(self):
    '''
    Lock in results on arbitrary data -- because meaningful runs take too long to run.

    The frame returned by heritability_spatial_correction for a 10-entry
    slice of self.snpreader_whole is written as tab-separated text and
    compared with the reference "heritability_spatial_correction/two.txt".
    '''
    fn = "two.txt"
    logging.info(fn)
    tmpOutfile = self.file_name(fn)

    # Keep only the first 10 entries along the reader's first axis.
    snpreader = self.snpreader_whole[:10,:]
    # `range` rather than the Python-2-only `xrange`: works on both versions.
    spatial_coor = [[i,-i] for i in range(snpreader.iid_count)]
    # Two log-spaced values spanning 10**2 .. 4000, truncated to int.
    alpha_list = [int(v) for v in np.logspace(2,np.log10(4000), 2)]

    dataframe = heritability_spatial_correction(snpreader,spatial_coor,snpreader.iid,alpha_list,self.pheno_whole,
                                                jackknife_count=2,permute_plus_count=1,permute_times_count=1,just_testing=False)
    dataframe.to_csv(tmpOutfile,sep="\t",index=False)

    referenceOutfile = TestFeatureSelection.reference_file("heritability_spatial_correction/"+fn)
    # `tolerance` is not defined in this method; it must come from an enclosing scope.
    out,msg=ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
    self.assertTrue(out, "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))
def compare_files(self,sid0_list,sid1_list,pvalue_list,ref_base):
    '''
    Verify pairwise p-values against "epistasis/<ref_base>.txt".

    The three list arguments are parallel: position i holds the two SNP
    ids of a pair and its p-value. The reference file is read as strings
    with its first row skipped; fields 0 and 4 are the SNP ids and field 8
    is the p-value. A pair matches no matter which of its ids comes first.

    An AssertionError is raised when the pair counts differ or when a
    p-value deviates from the reference by 1e-5 or more.
    '''
    reffile = TestFeatureSelection.reference_file("epistasis/"+ref_base+".txt")

    # Computed p-values keyed by the pair with its smaller id first.
    computed = {}
    for position, left in enumerate(sid0_list):
        right = sid1_list[position]
        pair = (left, right) if left < right else (right, left)
        computed[pair] = pvalue_list[position]

    reference=sp.loadtxt(reffile,dtype='str',comments=None,skiprows=1)
    assert len(pvalue_list) == len(reference), "# of pairs differs from file '{0}'".format(reffile)

    for record in reference:
        left, right = record[0], record[4]
        key = (left, right) if left < right else (right, left)
        assert abs(float(record[8])-computed[key]) < 1e-5, "pair {0} differs too much from file '{1}'".format(key,reffile)
def test_match_cpp(self):
    r'''
    match
    FaSTLMM.207\Data\DemoData>fastlmmc -snpPairs -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.pairs.txt -logDelta 0 -verbose 100

    Runs epistasis on the test SNPs with the similarity SNPs as G0 and
    log_delta=0. Every pair in the reference table must then be present in
    the result, in either orientation, with a p-value whose relative
    difference is below 3.5%.
    '''
    # The docstring is a raw string because the Windows path in it contains
    # backslash sequences ("\D") that are invalid escapes in a normal literal.
    logging.info("TestEpistasis test_match_cpp")
    from pysnptools.snpreader import Bed
    import pandas as pd

    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")

    sim_sid = [
        "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28", "snp63751_m0_.23m1_.23",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp11253_m0_.2m1_.2",
        "snp86250_m0_.33m1_.33", "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
        "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19", "snp67501_m0_.15m1_.15",
        "snp63750_m0_.28m1_.28", "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35",
        "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37", "snp15002_m0_.11m1_.11",
        "snp3751_m0_.34m1_.34", "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
        "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11", "isnp23145_m0_.2m1_.2",
        "snp60001_m0_.39m1_.39", "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2",
        "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13",
    ]
    sim_idx = snps.sid_to_index(sim_sid)

    test_sid = [
        "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23", "snp82500_m0_.28m1_.28",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp86250_m0_.33m1_.33",
        "snp15002_m0_.11m1_.11", "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
        "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2", "snp67501_m0_.15m1_.15",
        "snp3753_m0_.23m1_.23", "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39",
        "snp30002_m0_.25m1_.25",
    ]
    test_idx = snps.sid_to_index(test_sid)

    frame = epistasis(snps[:,test_idx], pheno,covar=covar, G0 = snps[:,sim_idx],log_delta=0)
    sid0,sid1,pvalue_list = np.array(frame['SNP0']),np.array(frame['SNP1']),np.array(frame['PValue'])

    referenceOutfile = TestFeatureSelection.reference_file("epistasis/topsnps.pairs.txt")
    table = pd.read_table(referenceOutfile,sep="\t") # We've manually remove all comments and blank lines from this file
    assert len(pvalue_list) == len(table), \
        "# of pairs differs from file '{0}'".format(referenceOutfile)

    for _index, values in table.iterrows():
        # The reference table has exactly five columns; the last two are not compared.
        snp0cpp,snp1cpp,pvaluecpp,_i1,_i2 = values
        # The three arrays come from the same frame, so they are equally long.
        for snp0py, snp1py, pvaluepy in zip(sid0, sid1, pvalue_list):
            if (snp0py == snp0cpp and snp1py == snp1cpp) or (snp0py == snp1cpp and snp1py == snp0cpp):
                diff = abs(pvaluecpp - pvaluepy)/pvaluecpp
                assert diff < .035, "'{0}' '{1}' pvalue_list differ too much {4} -- {2} vs {3}".format(snp0cpp,snp1cpp,pvaluecpp,pvaluepy,diff)
                break
        else:
            # No break: the reference pair is missing from the computed results.
            raise AssertionError("pair '{0}' '{1}' from file '{2}' not found in results".format(snp0cpp,snp1cpp,referenceOutfile))
def compare_files(self, frame, ref_base):
    '''
    Compare the p-values in `frame` with the reference file "single_snp/<ref_base>.txt".

    Both tables must have 'SNP' and 'PValue' columns. They are indexed by
    SNP id so the p-values are subtracted SNP-by-SNP, whatever the row
    order. `frame` itself is left unmodified.

    Raises AssertionError if the row counts differ, and Exception listing
    the offending SNPs if any difference is not within 1e-5. That includes
    NaN differences, e.g. from a SNP id present in only one of the tables.
    '''
    reffile = TestFeatureSelection.reference_file("single_snp/" + ref_base + ".txt")

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    reference = pd.read_csv(reffile, delimiter=r'\s', comment=None, engine='python')
    assert len(frame) == len(reference), "# of pairs differs from file '{0}'".format(reffile)

    # Work on re-indexed copies so the caller's frame is not altered.
    frame_pvalues = frame.set_index('SNP').PValue
    reference_pvalues = reference.set_index('SNP').PValue
    diff = frame_pvalues - reference_pvalues

    # "abs(diff) > 1e-5" is False for NaN, which would hide a mismatch;
    # select everything that is NOT within tolerance instead.
    within_tolerance = np.abs(diff) <= 1e-5
    bad = diff[~within_tolerance]
    if len(bad) > 0:
        raise Exception(
            "snps differ too much from file '{0}' at these snps {1}".format(reffile, bad))
def test_match_cpp(self):
    r'''
    match
    FaSTLMM.207\Data\DemoData>..\.cd.\bin\windows\cpp_mkl\fastlmmc -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.singlesnp.txt -logDelta 0 -verbose 100

    single_snp is run four times: with the similarity SNPs as G0 and an
    identity kernel as G1, then the other way round, each with h2=.5 and
    with log_delta=0. Each result must match the reference p-values to
    within a relative difference of 3.5%.
    '''
    # Raw docstring: the Windows path above would otherwise turn "\b" into a
    # backspace and trigger invalid-escape warnings for "\D", "\w", "\c", "\.".
    logging.info("TestSingleSnp test_match_cpp")
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")

    sim_sid = [
        "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28", "snp63751_m0_.23m1_.23",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp11253_m0_.2m1_.2",
        "snp86250_m0_.33m1_.33", "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
        "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19", "snp67501_m0_.15m1_.15",
        "snp63750_m0_.28m1_.28", "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35",
        "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37", "snp15002_m0_.11m1_.11",
        "snp3751_m0_.34m1_.34", "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
        "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11", "isnp23145_m0_.2m1_.2",
        "snp60001_m0_.39m1_.39", "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2",
        "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13",
    ]
    sim_idx = snps.sid_to_index(sim_sid)

    test_sid = [
        "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23", "snp82500_m0_.28m1_.28",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp86250_m0_.33m1_.33",
        "snp15002_m0_.11m1_.11", "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
        "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2", "snp67501_m0_.15m1_.15",
        "snp3753_m0_.23m1_.23", "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39",
        "snp30002_m0_.25m1_.25",
    ]
    test_idx = snps.sid_to_index(test_sid)

    # Loop-invariant: load the reference p-values once, not once per frame.
    referenceOutfile = TestFeatureSelection.reference_file(
        "single_snp/topsnps.single.txt")
    reference = pd.read_table(
        referenceOutfile, sep="\t"
    )  # We've manually remove all comments and blank lines from this file

    for G0, G1 in [(snps[:, sim_idx], KernelIdentity(snps.iid)),
                   (KernelIdentity(snps.iid), snps[:, sim_idx])]:
        frame_h2 = single_snp(test_snps=snps[:, test_idx],
                              pheno=pheno,
                              G0=G0,
                              G1=G1,
                              covar=covar,
                              h2=.5,
                              leave_out_one_chrom=False)
        frame_log_delta = single_snp(test_snps=snps[:, test_idx],
                                     pheno=pheno,
                                     G0=G0,
                                     G1=G1,
                                     covar=covar,
                                     log_delta=0,
                                     leave_out_one_chrom=False)
        for frame in [frame_h2, frame_log_delta]:
            assert len(frame) == len(reference), \
                "row count differs from file '{0}'".format(referenceOutfile)
            for _, row in reference.iterrows():
                sid = row.SNP
                # The reference column is spelled "Pvalue", the computed one "PValue".
                pvalue = frame[frame['SNP'] == sid].iloc[0].PValue
                reldiff = abs(row.Pvalue - pvalue) / row.Pvalue
                assert reldiff < .035, "'{0}' pvalue_list differ too much {4} -- {2} vs {3}".format(
                    sid, None, row.Pvalue, pvalue, reldiff)