def test_one(self):
    """Run epistasis on two overlapping SNP lists and check the "one" case.

    The Bed reader and the epistasis call are both given count_A1=False.
    The returned frame's SNP0/SNP1/PValue columns are checked with
    self.compare_files under the "one" tag, written out again with
    write() to the "one_again" file, and then checked a second time.
    """
    logging.info("TestEpistasis test_one")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase, count_A1=False)
    phenotype = self.phen_fn
    covariates = self.cov_fn
    result_path = self.file_name("one")

    scan_args = dict(
        G0=bed,
        covar=covariates,
        sid_list_0=bed.sid[:10],   # first 10 snps
        sid_list_1=bed.sid[5:15],  # skip 5 snps, use next 10
        output_file_name=result_path,
        count_A1=False)
    frame = epistasis(bed, phenotype, **scan_args)

    sid0, sid1, pvalue_list = [
        np.array(frame[column]) for column in ("SNP0", "SNP1", "PValue")]

    # Check the output file
    self.compare_files(sid0, sid1, pvalue_list, "one")

    # Check the values returned
    rewritten_path = self.file_name("one_again")
    write(sid0, sid1, pvalue_list, rewritten_path)
    self.compare_files(sid0, sid1, pvalue_list, "one")
def test_unknown_sid(self):
    """Check that epistasis raises when sid_list_0 contains an unknown SNP id.

    sid_list_0 mixes two ids ('1_4', '1_9') with 'bogus sid'; the test
    passes only if the epistasis call raises an exception.
    """
    logging.info("TestEpistasis test_unknown_sid")
    from pysnptools.snpreader import Bed
    test_snps = Bed(self.bedbase)
    pheno = self.phen_fn
    covar = self.cov_fn
    try:
        epistasis(test_snps, pheno, covar=covar,
                  sid_list_0=['1_4', 'bogus sid', '1_9'],
                  sid_list_1=test_snps.sid[5:15])  # Skip 5 snps, use next 10
        failed = False
    except Exception:
        # Only ordinary errors count as the expected failure; a bare
        # ``except:`` would also swallow KeyboardInterrupt/SystemExit and
        # turn an aborted run into a passing test.
        failed = True

    assert failed, "epistasis accepted an unknown sid in sid_list_0"
def test_no_cov(self):
    """Run epistasis without a covar argument and check the "no_cov" case."""
    logging.info("TestEpistasis test_no_cov")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phenotype = self.phen_fn
    result_path = self.file_name("no_cov")

    leading_sids = bed.sid[:10]      # first 10 snps
    following_sids = bed.sid[5:15]   # skip 5 snps, use next 10
    frame = epistasis(bed, phenotype, G0=bed,
                      sid_list_0=leading_sids,
                      sid_list_1=following_sids,
                      output_file_name=result_path)

    sid0 = np.array(frame['SNP0'])
    sid1 = np.array(frame['SNP1'])
    pvalue_list = np.array(frame['PValue'])
    self.compare_files(sid0, sid1, pvalue_list, "no_cov")
def test_no_sid_list_0(self):
    """Run epistasis with only one sid list and check "no_sid_list_0".

    The call passes sid_list_0=['1_4'] and leaves sid_list_1 unspecified.
    """
    logging.info("TestEpistasis test_no_sid_list_0")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phenotype, covariates = self.phen_fn, self.cov_fn
    result_path = self.file_name("no_sid_list_0")

    frame = epistasis(bed, phenotype,
                      G0=bed,
                      covar=covariates,
                      sid_list_0=['1_4'],
                      output_file_name=result_path)

    sid0, sid1, pvalue_list = [
        np.array(frame[column]) for column in ('SNP0', 'SNP1', 'PValue')]
    self.compare_files(sid0, sid1, pvalue_list, "no_sid_list_0")
def test_preload_files(self):
    """Run epistasis with pre-loaded pheno/covar and check the "one" case.

    The phenotype and covariates are loaded up front with pstpheno, while
    the SNPs are handed to epistasis as the bed base name; a Bed reader is
    opened separately only to obtain the sid lists.
    """
    logging.info("TestEpistasis test_preload_files")
    from pysnptools.snpreader import Bed

    snp_base = self.bedbase
    loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    loaded_covar = pstpheno.loadPhen(self.cov_fn)
    sid_source = Bed(snp_base)
    result_path = self.file_name("preload_files")

    frame = epistasis(
        snp_base, loaded_pheno,
        G0=snp_base,
        covar=loaded_covar,
        sid_list_0=sid_source.sid[:10],   # first 10 snps
        sid_list_1=sid_source.sid[5:15],  # skip 5 snps, use next 10
        output_file_name=result_path)

    columns = [np.array(frame[name]) for name in ('SNP0', 'SNP1', 'PValue')]
    sid0, sid1, pvalue_list = columns
    self.compare_files(sid0, sid1, pvalue_list, "one")
def test_no_cov_b(self):
    """Run epistasis with a zero-column covar and check the "no_cov" case.

    The covariate data is loaded and then every column of its 'vals'
    array is deleted before it is passed to epistasis.
    """
    logging.info("TestEpistasis test_no_cov_b")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phenotype = self.phen_fn
    result_path = self.file_name("no_cov_b")

    empty_covar = pstpheno.loadPhen(self.cov_fn)
    # Remove all the columns
    empty_covar['vals'] = np.delete(empty_covar['vals'], np.s_[:], 1)

    call_kwargs = {
        'G0': bed,
        'covar': empty_covar,
        'sid_list_0': bed.sid[:10],    # first 10 snps
        'sid_list_1': bed.sid[5:15],   # skip 5 snps, use next 10
        'output_file_name': result_path,
    }
    frame = epistasis(bed, phenotype, **call_kwargs)

    sid0 = np.array(frame['SNP0'])
    sid1 = np.array(frame['SNP1'])
    pvalue_list = np.array(frame['PValue'])
    self.compare_files(sid0, sid1, pvalue_list, "no_cov")
def test_G1_mixing(self):
    """Run epistasis with G1 supplied and mixing=0; check the "one" case.

    The same Bed reader is used for the test SNPs, G0 and G1.
    """
    logging.info("TestEpistasis test_G1_mixing")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase, count_A1=False)
    phenotype, covariates = self.phen_fn, self.cov_fn
    result_path = self.file_name("G1_mixing")

    frame = epistasis(
        bed, phenotype,
        G0=bed,
        G1=bed,
        mixing=0,
        covar=covariates,
        sid_list_0=bed.sid[:10],    # first 10 snps
        sid_list_1=bed.sid[5:15],   # skip 5 snps, use next 10
        output_file_name=result_path,
        count_A1=False)

    sid0, sid1, pvalue_list = [
        np.array(frame[column]) for column in ('SNP0', 'SNP1', 'PValue')]
    self.compare_files(sid0, sid1, pvalue_list, "one")
def test_cid_intersect(self):
    """Run epistasis with a reordered, padded phenotype; check the "one" case.

    The loaded phenotype has its iid rows and values reversed and one
    extra ('Bogus', 'Bogus') individual with value -34343 appended before
    it is passed to epistasis.
    """
    logging.info("TestEpistasis test_cid_intersect")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)

    shuffled_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    reversed_iid = shuffled_pheno['iid'][::-1]
    shuffled_pheno['iid'] = np.vstack([reversed_iid, [['Bogus', 'Bogus']]])
    reversed_vals = shuffled_pheno['vals'][::-1]
    shuffled_pheno['vals'] = np.hstack([reversed_vals, [-34343]])

    covariates = self.cov_fn
    result_path = self.file_name("cid_intersect")

    frame = epistasis(bed, shuffled_pheno,
                      G0=bed,
                      covar=covariates,
                      sid_list_0=bed.sid[:10],    # first 10 snps
                      sid_list_1=bed.sid[5:15],   # skip 5 snps, use next 10
                      output_file_name=result_path)

    sid0 = np.array(frame['SNP0'])
    sid1 = np.array(frame['SNP1'])
    pvalue_list = np.array(frame['PValue'])
    self.compare_files(sid0, sid1, pvalue_list, "one")
def test_match_cpp(self):
    r"""Check Python epistasis p-values against a stored fastlmmc result.

    match
        FaSTLMM.207\Data\DemoData>fastlmmc -snpPairs -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.pairs.txt -logDelta 0 -verbose 100

    Every row of the reference table must have a matching SNP pair (in
    either order) in the Python output, with a relative p-value
    difference below 0.035.
    """
    logging.info("TestEpistasis test_match_cpp")
    from pysnptools.snpreader import Bed
    snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
    pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
    covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")

    # SNPs passed as G0.
    sim_sid = [
        "snp26250_m0_.19m1_.19", "snp82500_m0_.28m1_.28", "snp63751_m0_.23m1_.23",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp11253_m0_.2m1_.2",
        "snp86250_m0_.33m1_.33", "snp3753_m0_.23m1_.23", "snp75003_m0_.32m1_.32",
        "snp30002_m0_.25m1_.25", "snp26252_m0_.19m1_.19", "snp67501_m0_.15m1_.15",
        "snp63750_m0_.28m1_.28", "snp30001_m0_.28m1_.28", "snp52502_m0_.35m1_.35",
        "snp33752_m0_.31m1_.31", "snp37503_m0_.37m1_.37", "snp15002_m0_.11m1_.11",
        "snp3751_m0_.34m1_.34", "snp7502_m0_.18m1_.18", "snp52503_m0_.3m1_.3",
        "snp30000_m0_.39m1_.39", "isnp4457_m0_.11m1_.11", "isnp23145_m0_.2m1_.2",
        "snp60001_m0_.39m1_.39", "snp33753_m0_.16m1_.16", "isnp60813_m0_.2m1_.2",
        "snp82502_m0_.34m1_.34", "snp11252_m0_.13m1_.13",
    ]
    sim_idx = snps.sid_to_index(sim_sid)

    # SNPs whose pairs are tested.
    test_sid = [
        "snp26250_m0_.19m1_.19", "snp63751_m0_.23m1_.23", "snp82500_m0_.28m1_.28",
        "snp48753_m0_.4m1_.4", "snp45001_m0_.26m1_.26", "snp52500_m0_.05m1_.05",
        "snp75002_m0_.39m1_.39", "snp41253_m0_.07m1_.07", "snp86250_m0_.33m1_.33",
        "snp15002_m0_.11m1_.11", "snp33752_m0_.31m1_.31", "snp26252_m0_.19m1_.19",
        "snp30001_m0_.28m1_.28", "snp11253_m0_.2m1_.2", "snp67501_m0_.15m1_.15",
        "snp3753_m0_.23m1_.23", "snp52502_m0_.35m1_.35", "snp30000_m0_.39m1_.39",
        "snp30002_m0_.25m1_.25",
    ]
    test_idx = snps.sid_to_index(test_sid)

    frame = epistasis(snps[:, test_idx], pheno, covar=covar, G0=snps[:, sim_idx], log_delta=0)
    sid0, sid1, pvalue_list = np.array(frame['SNP0']), np.array(frame['SNP1']), np.array(frame['PValue'])

    referenceOutfile = TestFeatureSelection.reference_file("epistasis/topsnps.pairs.txt")

    import pandas as pd
    # We've manually removed all comments and blank lines from this file
    table = pd.read_table(referenceOutfile, sep="\t")
    assert len(pvalue_list) == len(table)
    for row in table.iterrows():
        # iterrows() yields (index, row); the row is expected to have
        # exactly five fields, the last two of which are not used here.
        snp0cpp, snp1cpp, pvaluecpp, i1, i2 = row[1]
        found = False
        # zip() instead of xrange indexing: works on Python 2 and 3.
        for snp0py, snp1py, pvaluepy in zip(sid0, sid1, pvalue_list):
            if (snp0py == snp0cpp and snp1py == snp1cpp) or (snp0py == snp1cpp and snp1py == snp0cpp):
                found = True
                diff = abs(pvaluecpp - pvaluepy) / pvaluecpp
                assert diff < .035, "'{0}' '{1}' pvalue_list differ too much {4} -- {2} vs {3}".format(snp0cpp, snp1cpp, pvaluecpp, pvaluepy, diff)
                break
        assert found
def test_one(self):
    """Run epistasis on two overlapping SNP lists and check the "one" case.

    The returned frame's SNP0/SNP1/PValue columns are checked with
    self.compare_files under the "one" tag, written out again with
    write() to the "one_again" file, and then checked a second time.
    """
    # NOTE(review): another definition named test_one appears earlier in
    # this source (it passes count_A1=False). If both live in the same
    # class, this later definition replaces the earlier one -- confirm.
    logging.info("TestEpistasis test_one")
    from pysnptools.snpreader import Bed

    bed = Bed(self.bedbase)
    phenotype, covariates = self.phen_fn, self.cov_fn
    result_path = self.file_name("one")

    frame = epistasis(bed, phenotype,
                      G0=bed,
                      covar=covariates,
                      sid_list_0=bed.sid[:10],    # first 10 snps
                      sid_list_1=bed.sid[5:15],   # skip 5 snps, use next 10
                      output_file_name=result_path)

    sid0, sid1, pvalue_list = [
        np.array(frame[column]) for column in ('SNP0', 'SNP1', 'PValue')]

    # Check the output file
    self.compare_files(sid0, sid1, pvalue_list, "one")

    # Check the values returned
    rewritten_path = self.file_name("one_again")
    write(sid0, sid1, pvalue_list, rewritten_path)
    self.compare_files(sid0, sid1, pvalue_list, "one")
def run_fastlmmc(dataset, output_dir, process_id, group_size, covFile=None, species='mouse', maxthreads=1, featsel=False, exclude=False, condition=None):
    """Run one job of a group-partitioned pairwise epistasis scan.

    The SNPs of ``<dataset>.FILTERED`` are split into consecutive groups of
    ``group_size`` (the last group may be shorter). Jobs are numbered so
    that the first ``groupNum*(groupNum-1)//2`` ids each scan one pair of
    *different* groups, and the remaining ids each scan up to two groups
    against themselves. Rows with ``PValue`` at or below the module-level
    ``p_value_threshold`` are written, restricted to the module-level
    ``final_columns``, to ``<output_dir>/<dataset>_<process_id>.gwas``
    (tab separated).

    Args:
        dataset: Base name; ``<dataset>.FILTERED`` and ``<dataset>.FULL``
            are opened with ``Bed`` and ``<dataset>.pheno.txt`` is passed
            as the phenotype.
        output_dir: Directory part of the output path.
        process_id: Zero-based job number selecting which group pair(s)
            to scan.
        group_size: Number of SNPs per group.
        covFile: Covariate argument forwarded to ``epistasis`` as
            ``covar``; omitted from the call when falsy.
        species: Key into the module-level ``species_chroms`` mapping.
        maxthreads, featsel, exclude, condition: Accepted but not used by
            this function body.

    Raises:
        SystemExit: code 1 if ``group_size`` exceeds the SNP count or
            ``process_id`` is past the last job, 2 if ``group_size`` is 0,
            3 if fewer than two groups result.
    """
    # commands from fastlmmc:
    #   maxthreads
    #   condition
    #   exclude by position
    # if condition:
    #     condition = '-SnpId1 %s' % condition[0]
    # else:
    #     condition = ''

    bfile = dataset
    filtered_snp_reader = Bed('%s.FILTERED' % bfile)
    full_snp_reader = Bed('%s.FULL' % bfile)
    pheno = '%s.pheno.txt' % dataset

    # NOTE(review): ``v`` is the live module namespace, so each
    # ``v.update(locals())`` publishes this function's locals as module
    # globals. Kept as-is because other code may rely on it; ``v`` is used
    # below to format the output path.
    v = globals()
    chroms = map(str, range(1, species_chroms[species] + 1))
    v.update(locals())
    n = len(filtered_snp_reader.sid)

    # case checking
    if group_size > n:
        print("trying to group more than the number of existing snps:\nprogram ended!")
        raise SystemExit(1)
    if group_size == 0:
        print("grouping size is 0:\nprogram ended!")
        raise SystemExit(2)

    groupNum = n // group_size
    if n % group_size != 0:
        groupNum += 1
    if groupNum < 2:
        print("group number should be at least two, please decrease the size of snps in each group")
        raise SystemExit(3)

    th = groupNum - 1
    rest = process_id + 1
    base = 0

    # Jobs pairing two different groups come first ...
    hetero_num = groupNum * (groupNum - 1) // 2
    maxjobnum = hetero_num
    # ... followed by same-group jobs, each covering up to two groups.
    homo_num = (groupNum // 2) if (groupNum % 2 == 0) else (groupNum // 2 + 1)
    maxjobnum += homo_num

    if process_id >= maxjobnum:
        print("job number exceeds the total number of jobs that epstasis could do")
        raise SystemExit(1)

    list_1_idx_start = 0
    list_1_idx_end = 0
    list_2_idx_start = 0
    list_2_idx_end = 0
    single_homo = False

    if process_id < hetero_num:
        # Walk the rows of the upper-triangular group-pair table until the
        # job number falls inside row ``base``; the pair is then
        # (base, base + rest).
        while rest > th:
            rest -= th
            th -= 1
            base += 1
        list_1_idx_start = group_size * base
        list_1_idx_end = list_1_idx_start + group_size
        list_2_idx_start = group_size * (base + rest)
        # Slice ends are exclusive, so the last (possibly short) group must
        # end at n. The previous ``n - 1`` silently dropped the final SNP
        # from every cross-group job involving the last group.
        list_2_idx_end = min(list_2_idx_start + group_size, n)
    else:
        # homogeneous computing: same group
        offset = (process_id - hetero_num) * 2
        list_1_idx_start = group_size * offset
        if offset == groupNum - 1:
            # last homogeneous job with only one group left
            list_1_idx_end = n
            single_homo = True
        else:
            list_1_idx_end = list_1_idx_start + group_size
            list_2_idx_start = list_1_idx_end
            list_2_idx_end = min(list_2_idx_start + group_size, n)

    # Only forward ``covar`` when a covariate file was supplied.
    covar_kwargs = {'covar': covFile} if covFile else {}

    def _scan(start_0, end_0, start_1, end_1):
        """Run epistasis between sid[start_0:end_0] and sid[start_1:end_1]."""
        return epistasis(filtered_snp_reader, pheno, G0=full_snp_reader,
                         sid_list_0=filtered_snp_reader.sid[start_0:end_0],
                         sid_list_1=filtered_snp_reader.sid[start_1:end_1],
                         **covar_kwargs)

    df2 = None
    if process_id < hetero_num:
        df = _scan(list_1_idx_start, list_1_idx_end, list_2_idx_start, list_2_idx_end)
    else:
        df = _scan(list_1_idx_start, list_1_idx_end, list_1_idx_start, list_1_idx_end)
        if not single_homo:
            df2 = _scan(list_2_idx_start, list_2_idx_end, list_2_idx_start, list_2_idx_end)

    def format_results(df, final_columns, threshold):
        final = df.loc[:, final_columns]
        final = final[final['PValue'] <= threshold]
        return final

    v.update(locals())

    # output to csv
    final = format_results(df, final_columns, p_value_threshold)
    final.to_csv('%(output_dir)s/%(dataset)s_%(process_id)s.gwas' % v, sep='\t', index=False)
    if df2 is not None:
        final = format_results(df2, final_columns, p_value_threshold)
        # Appending to the file written above: suppress the header so the
        # column names are not repeated in the middle of the table.
        final.to_csv('%(output_dir)s/%(dataset)s_%(process_id)s.gwas' % v, mode='a', sep='\t', index=False, header=False)