Пример #1
0
    def test_one(self):
        """Run epistasis on two overlapping SNP lists and check the result twice.

        The SNP0/SNP1/PValue columns of the returned frame are handed to
        ``self.compare_files`` under the tag "one", written out with ``write``
        to the "one_again" file, and then handed to ``compare_files`` again.
        """
        logging.info("TestEpistasis test_one")
        from pysnptools.snpreader import Bed

        snp_reader = Bed(self.bedbase, count_A1=False)
        phenotype = self.phen_fn
        covariates = self.cov_fn
        result_path = self.file_name("one")

        result = epistasis(snp_reader, phenotype, G0=snp_reader, covar=covariates,
                           sid_list_0=snp_reader.sid[:10],   # SNPs at positions 0..9
                           sid_list_1=snp_reader.sid[5:15],  # SNPs at positions 5..14
                           output_file_name=result_path, count_A1=False)
        sid0 = np.array(result['SNP0'])
        sid1 = np.array(result['SNP1'])
        pvalue_list = np.array(result['PValue'])

        # First check, right after the run that also produced the output file.
        self.compare_files(sid0, sid1, pvalue_list, "one")

        # Second check, after writing the returned values to a separate file.
        second_path = self.file_name("one_again")
        write(sid0, sid1, pvalue_list, second_path)
        self.compare_files(sid0, sid1, pvalue_list, "one")
Пример #2
0
    def test_unknown_sid(self):
        """Check that epistasis raises when sid_list_0 names a SNP that is not in the data.

        Only ordinary exceptions count as the expected failure; SystemExit and
        KeyboardInterrupt are allowed to propagate instead of being mistaken
        for a passing result.
        """
        logging.info("TestEpistasis test_unknown_sid")

        from pysnptools.snpreader import Bed
        test_snps = Bed(self.bedbase)
        pheno = self.phen_fn
        covar = self.cov_fn

        try:
            epistasis(test_snps, pheno, covar=covar,
                      sid_list_0=['1_4', 'bogus sid', '1_9'],
                      sid_list_1=test_snps.sid[5:15])  # Skip 5 snps, use next 10
        except Exception:
            failed = True
        else:
            failed = False

        assert failed, "epistasis did not raise for an unknown sid in sid_list_0"
Пример #3
0
    def test_no_cov(self):
        """Run epistasis without a covar argument and check the result under the "no_cov" tag."""
        logging.info("TestEpistasis test_no_cov")
        from pysnptools.snpreader import Bed

        snp_reader = Bed(self.bedbase)
        phenotype = self.phen_fn
        result_path = self.file_name("no_cov")

        result = epistasis(
            snp_reader,
            phenotype,
            G0=snp_reader,
            sid_list_0=snp_reader.sid[:10],   # SNPs at positions 0..9
            sid_list_1=snp_reader.sid[5:15],  # SNPs at positions 5..14
            output_file_name=result_path,
        )

        sid0, sid1, pvalue_list = [np.array(result[column]) for column in ('SNP0', 'SNP1', 'PValue')]
        self.compare_files(sid0, sid1, pvalue_list, "no_cov")
Пример #4
0
    def test_no_sid_list_0(self):
        """Run epistasis with a one-element sid_list_0 and no sid_list_1.

        The resulting columns are passed to ``self.compare_files`` under the
        tag "no_sid_list_0".
        """
        logging.info("TestEpistasis test_no_sid_list_0")
        from pysnptools.snpreader import Bed

        snp_reader = Bed(self.bedbase)
        phenotype = self.phen_fn
        covariates = self.cov_fn
        result_path = self.file_name("no_sid_list_0")

        result = epistasis(
            snp_reader,
            phenotype,
            G0=snp_reader,
            covar=covariates,
            sid_list_0=['1_4'],
            output_file_name=result_path,
        )

        sid0 = np.array(result['SNP0'])
        sid1 = np.array(result['SNP1'])
        pvalue_list = np.array(result['PValue'])
        self.compare_files(sid0, sid1, pvalue_list, "no_sid_list_0")
Пример #5
0
    def test_preload_files(self):
        """Run epistasis with pre-loaded phenotype/covariate data and a SNP file name.

        The SNP data is passed as the ``self.bedbase`` value itself rather than
        a ``Bed`` reader; a ``Bed`` is opened only to obtain the sid lists.
        Results are checked under the "one" tag.
        """
        logging.info("TestEpistasis test_preload_files")
        from pysnptools.snpreader import Bed

        snp_source = self.bedbase
        phenotype = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        covariates = pstpheno.loadPhen(self.cov_fn)
        sid_reader = Bed(snp_source)

        result_path = self.file_name("preload_files")

        result = epistasis(
            snp_source,
            phenotype,
            G0=snp_source,
            covar=covariates,
            sid_list_0=sid_reader.sid[:10],   # SNPs at positions 0..9
            sid_list_1=sid_reader.sid[5:15],  # SNPs at positions 5..14
            output_file_name=result_path,
        )

        sid0, sid1, pvalue_list = [np.array(result[column]) for column in ('SNP0', 'SNP1', 'PValue')]
        self.compare_files(sid0, sid1, pvalue_list, "one")
Пример #6
0
    def test_no_cov_b(self):
        """Run epistasis with a covariate structure whose value columns were all removed.

        Results are checked under the same "no_cov" tag that ``test_no_cov`` uses.
        """
        logging.info("TestEpistasis test_no_cov_b")
        from pysnptools.snpreader import Bed

        snp_reader = Bed(self.bedbase)
        phenotype = self.phen_fn
        result_path = self.file_name("no_cov_b")

        # Load the covariates, then drop every column of the value matrix.
        covariates = pstpheno.loadPhen(self.cov_fn)
        covariates['vals'] = np.delete(covariates['vals'], np.s_[:], axis=1)

        result = epistasis(
            snp_reader,
            phenotype,
            G0=snp_reader,
            covar=covariates,
            sid_list_0=snp_reader.sid[:10],   # SNPs at positions 0..9
            sid_list_1=snp_reader.sid[5:15],  # SNPs at positions 5..14
            output_file_name=result_path,
        )

        sid0 = np.array(result['SNP0'])
        sid1 = np.array(result['SNP1'])
        pvalue_list = np.array(result['PValue'])
        self.compare_files(sid0, sid1, pvalue_list, "no_cov")
Пример #7
0
    def test_G1_mixing(self):
        """Run epistasis with G1 supplied and mixing=0, then check under the "one" tag."""
        logging.info("TestEpistasis test_G1_mixing")
        from pysnptools.snpreader import Bed

        snp_reader = Bed(self.bedbase, count_A1=False)
        phenotype = self.phen_fn
        covariates = self.cov_fn
        result_path = self.file_name("G1_mixing")

        result = epistasis(
            snp_reader,
            phenotype,
            G0=snp_reader,
            covar=covariates,
            sid_list_0=snp_reader.sid[:10],   # SNPs at positions 0..9
            sid_list_1=snp_reader.sid[5:15],  # SNPs at positions 5..14
            G1=snp_reader,
            mixing=0,
            output_file_name=result_path,
            count_A1=False,
        )

        sid0, sid1, pvalue_list = [np.array(result[column]) for column in ('SNP0', 'SNP1', 'PValue')]
        self.compare_files(sid0, sid1, pvalue_list, "one")
Пример #8
0
    def test_G1_mixing(self):
        """Call epistasis with the same reader as G0 and G1 and mixing=0; check under the "one" tag."""
        logging.info("TestEpistasis test_G1_mixing")
        from pysnptools.snpreader import Bed

        reader = Bed(self.bedbase, count_A1=False)
        phen_source = self.phen_fn
        cov_source = self.cov_fn
        out_path = self.file_name("G1_mixing")

        table = epistasis(reader, phen_source, G0=reader, covar=cov_source,
                          sid_list_0=reader.sid[:10],   # SNPs at positions 0..9
                          sid_list_1=reader.sid[5:15],  # SNPs at positions 5..14
                          G1=reader, mixing=0,
                          output_file_name=out_path, count_A1=False)

        sid0 = np.array(table['SNP0'])
        sid1 = np.array(table['SNP1'])
        pvalue_list = np.array(table['PValue'])
        self.compare_files(sid0, sid1, pvalue_list, "one")
Пример #9
0
    def test_cid_intersect(self):
        """Run epistasis with a reordered phenotype that carries one extra individual.

        The loaded phenotype's ``iid`` and ``vals`` are reversed and a
        ('Bogus', 'Bogus') entry with value -34343 is appended before the run.
        Results are checked under the "one" tag.
        """
        logging.info("TestEpistasis test_cid_intersect")
        from pysnptools.snpreader import Bed

        snp_reader = Bed(self.bedbase)
        phenotype = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)

        # Reverse the individuals and tack on an id/value pair named 'Bogus'.
        reversed_iid = phenotype['iid'][::-1]
        phenotype['iid'] = np.vstack([reversed_iid, [['Bogus', 'Bogus']]])
        reversed_vals = phenotype['vals'][::-1]
        phenotype['vals'] = np.hstack([reversed_vals, [-34343]])

        covariates = self.cov_fn
        result_path = self.file_name("cid_intersect")
        result = epistasis(
            snp_reader,
            phenotype,
            G0=snp_reader,
            covar=covariates,
            sid_list_0=snp_reader.sid[:10],   # SNPs at positions 0..9
            sid_list_1=snp_reader.sid[5:15],  # SNPs at positions 5..14
            output_file_name=result_path,
        )

        sid0, sid1, pvalue_list = [np.array(result[column]) for column in ('SNP0', 'SNP1', 'PValue')]
        self.compare_files(sid0, sid1, pvalue_list, "one")
Пример #10
0
    def test_match_cpp(self):
        r'''
        Check epistasis p-values against a reference table from the C++ program.

        match
            FaSTLMM.207\Data\DemoData>fastlmmc -snpPairs -bfile snps -extract topsnps.txt -bfileSim snps -extractSim ASout.snps.txt -pheno pheno.txt -covar covariate.txt -out topsnps.pairs.txt -logDelta 0 -verbose 100

        Every row of the reference table must have a matching SNP pair (in
        either order) in the Python result, and the two p-values must agree
        to within a relative difference of 0.035.
        '''
        logging.info("TestEpistasis test_match_cpp")
        from pysnptools.snpreader import Bed
        snps = Bed(os.path.join(self.pythonpath, "tests/datasets/selecttest/snps"))
        pheno = os.path.join(self.pythonpath, "tests/datasets/selecttest/pheno.txt")
        covar = os.path.join(self.pythonpath, "tests/datasets/selecttest/covariate.txt")
        sim_sid = ["snp26250_m0_.19m1_.19","snp82500_m0_.28m1_.28","snp63751_m0_.23m1_.23","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp11253_m0_.2m1_.2","snp86250_m0_.33m1_.33","snp3753_m0_.23m1_.23","snp75003_m0_.32m1_.32","snp30002_m0_.25m1_.25","snp26252_m0_.19m1_.19","snp67501_m0_.15m1_.15","snp63750_m0_.28m1_.28","snp30001_m0_.28m1_.28","snp52502_m0_.35m1_.35","snp33752_m0_.31m1_.31","snp37503_m0_.37m1_.37","snp15002_m0_.11m1_.11","snp3751_m0_.34m1_.34","snp7502_m0_.18m1_.18","snp52503_m0_.3m1_.3","snp30000_m0_.39m1_.39","isnp4457_m0_.11m1_.11","isnp23145_m0_.2m1_.2","snp60001_m0_.39m1_.39","snp33753_m0_.16m1_.16","isnp60813_m0_.2m1_.2","snp82502_m0_.34m1_.34","snp11252_m0_.13m1_.13"]
        sim_idx = snps.sid_to_index(sim_sid)
        test_sid = ["snp26250_m0_.19m1_.19","snp63751_m0_.23m1_.23","snp82500_m0_.28m1_.28","snp48753_m0_.4m1_.4","snp45001_m0_.26m1_.26","snp52500_m0_.05m1_.05","snp75002_m0_.39m1_.39","snp41253_m0_.07m1_.07","snp86250_m0_.33m1_.33","snp15002_m0_.11m1_.11","snp33752_m0_.31m1_.31","snp26252_m0_.19m1_.19","snp30001_m0_.28m1_.28","snp11253_m0_.2m1_.2","snp67501_m0_.15m1_.15","snp3753_m0_.23m1_.23","snp52502_m0_.35m1_.35","snp30000_m0_.39m1_.39","snp30002_m0_.25m1_.25"]
        test_idx = snps.sid_to_index(test_sid)

        frame = epistasis(snps[:,test_idx], pheno,covar=covar, G0 = snps[:,sim_idx],log_delta=0)
        sid0,sid1,pvalue_list =np.array(frame['SNP0']),np.array(frame['SNP1']),np.array(frame['PValue'])

        referenceOutfile = TestFeatureSelection.reference_file("epistasis/topsnps.pairs.txt")

        import pandas as pd
        table = pd.read_table(referenceOutfile,sep="\t") # We've manually remove all comments and blank lines from this file
        assert len(pvalue_list) == len(table)
        for _, row in table.iterrows():
            # The reference row must have exactly five columns; the last two are not used here.
            snp0cpp, snp1cpp, pvaluecpp, _i1, _i2 = row
            found = False
            # zip replaces the Python-2-only xrange index loop and runs on Python 2 and 3 alike.
            for snp0py, snp1py, pvaluepy in zip(sid0, sid1, pvalue_list):
                # A pair matches regardless of which SNP is listed first.
                if (snp0py == snp0cpp and snp1py == snp1cpp) or (snp0py == snp1cpp and snp1py == snp0cpp):
                    found = True
                    diff = abs(pvaluecpp - pvaluepy)/pvaluecpp
                    assert diff < .035, "'{0}' '{1}' pvalue_list differ too much {4} -- {2} vs {3}".format(snp0cpp,snp1cpp,pvaluecpp,pvaluepy,diff)
                    break
            assert found, "reference pair '{0}' '{1}' not found in epistasis output".format(snp0cpp, snp1cpp)
Пример #11
0
    def test_one(self):
        """Run epistasis on two overlapping SNP lists and check the result twice.

        The SNP0/SNP1/PValue columns of the returned frame are handed to
        ``self.compare_files`` under the tag "one", written out with ``write``
        to the "one_again" file, and then handed to ``compare_files`` again.
        """
        logging.info("TestEpistasis test_one")
        from pysnptools.snpreader import Bed

        snp_reader = Bed(self.bedbase)
        phenotype = self.phen_fn
        covariates = self.cov_fn
        result_path = self.file_name("one")

        result = epistasis(
            snp_reader,
            phenotype,
            G0=snp_reader,
            covar=covariates,
            sid_list_0=snp_reader.sid[:10],   # SNPs at positions 0..9
            sid_list_1=snp_reader.sid[5:15],  # SNPs at positions 5..14
            output_file_name=result_path,
        )
        sid0, sid1, pvalue_list = [np.array(result[column]) for column in ('SNP0', 'SNP1', 'PValue')]

        # First check, right after the run that also produced the output file.
        self.compare_files(sid0, sid1, pvalue_list, "one")

        # Second check, after writing the returned values to a separate file.
        second_path = self.file_name("one_again")
        write(sid0, sid1, pvalue_list, second_path)
        self.compare_files(sid0, sid1, pvalue_list, "one")
def run_fastlmmc(dataset, output_dir, process_id, group_size, covFile=None, species='mouse', maxthreads=1, featsel=False, exclude=False, condition=None):
	"""Run one job of a group-partitioned pairwise epistasis scan and write its hits.

	The SNPs of ``<dataset>.FILTERED`` are split into consecutive groups of
	``group_size`` SNPs (the last group may be shorter). ``process_id`` selects
	one job:

	* ids ``0 .. hetero_num - 1`` test one pair of different groups against
	  each other;
	* the remaining ids each test up to two adjacent groups, each against itself.

	Rows with ``PValue <= p_value_threshold``, restricted to ``final_columns``,
	are written tab-separated to ``<output_dir>/<dataset>_<process_id>.gwas``.
	``Bed``, ``epistasis``, ``species_chroms``, ``final_columns`` and
	``p_value_threshold`` are resolved from the enclosing module.

	Args:
		dataset: Path prefix; ``.FILTERED`` and ``.FULL`` Bed inputs and the
			``.pheno.txt`` phenotype file name are derived from it.
		output_dir: Directory part of the output file name.
		process_id: Zero-based job index.
		group_size: Number of SNPs per group.
		covFile: Optional covariate input forwarded to ``epistasis`` as
			``covar``; when falsy, no ``covar`` argument is passed.
		species: Key into ``species_chroms``.
		maxthreads, featsel, exclude, condition: Accepted but not used by this
			function body.

	Raises:
		SystemExit: code 1 if ``group_size`` exceeds the SNP count or
			``process_id`` is not a valid job index, 2 if ``group_size`` is 0,
			3 if the SNPs form fewer than two groups.
	"""

	# commands from fastlmmc:
	# maxthreads
	# condition
	# exclude by position

	bfile = dataset
	filtered_snp_reader = Bed('%s.FILTERED' % bfile)
	full_snp_reader = Bed('%s.FULL' % bfile)
	pheno = '%s.pheno.txt' % dataset

	# NOTE(review): `v` is the module's globals() dict, so each update(locals())
	# below copies this function's local names into module scope. Kept unchanged
	# because other code in the module may read those names -- confirm and remove.
	v = globals()
	chroms = map(str, range(1, species_chroms[species] + 1))
	v.update(locals())

	n = len(filtered_snp_reader.sid)

	# case checking
	if group_size > n:
		print("trying to group more than the number of existing snps:\nprogram ended!")
		raise SystemExit(1)
	if group_size == 0:
		print("grouping size is 0:\nprogram ended!")
		raise SystemExit(2)
	groupNum = n // group_size
	if n % group_size != 0:
		groupNum += 1
	if groupNum < 2:
		print("group number should be at least two, please decrease the size of snps in each group")
		raise SystemExit(3)
	th = groupNum - 1
	rest = process_id + 1
	base = 0
	# Jobs that pair two different groups: one per unordered pair of groups.
	hetero_num = groupNum * (groupNum - 1) // 2
	maxjobnum = hetero_num
	# Jobs that pair a group with itself: two groups per job, rounded up.
	homo_num = (groupNum + 1) // 2
	maxjobnum += homo_num
	if process_id >= maxjobnum:
		print("job number exceeds the total number of jobs that epstasis could do")
		raise SystemExit(1)

	list_1_idx_start = 0
	list_1_idx_end = 0
	list_2_idx_start = 0
	list_2_idx_end = 0
	single_homo = False
	if process_id < hetero_num:
		# Decode process_id into the group pair (base, base + rest): group 0 has
		# groupNum-1 partners, group 1 has groupNum-2, and so on.
		while rest > th:
			rest -= th
			th -= 1
			base += 1

		list_1_idx_start = group_size * base
		list_1_idx_end = list_1_idx_start + group_size

		list_2_idx_start = group_size * (base + rest)
		# Slice ends are exclusive, so the last (possibly short) group must end
		# at n; ending at n - 1 would drop the final SNP from every cross-group job.
		list_2_idx_end = min(list_2_idx_start + group_size, n)

	# homogenous computing: same group
	else:
		# Each such job covers groups `offset` and `offset + 1`.
		offset = (process_id - hetero_num) * 2
		list_1_idx_start = group_size * offset

		if offset == groupNum - 1:
			# Odd number of groups: the final job has only the last group.
			list_1_idx_end = n
			single_homo = True
		else:
			list_1_idx_end = list_1_idx_start + group_size
			list_2_idx_start = list_1_idx_end
			list_2_idx_end = min(list_2_idx_start + group_size, n)

	def run_epistasis(start_0, end_0, start_1, end_1):
		# Test SNPs [start_0:end_0] against SNPs [start_1:end_1]. covar is only
		# forwarded when a covariate input was given, so epistasis() otherwise
		# uses its own default.
		sids = filtered_snp_reader.sid
		extra = {'covar': covFile} if covFile else {}
		return epistasis(filtered_snp_reader, pheno, G0=full_snp_reader,
			sid_list_0=sids[start_0:end_0], sid_list_1=sids[start_1:end_1], **extra)

	# epistasis on the SNP lists selected above
	df2 = None
	if process_id < hetero_num:
		df = run_epistasis(list_1_idx_start, list_1_idx_end, list_2_idx_start, list_2_idx_end)
	else:
		df = run_epistasis(list_1_idx_start, list_1_idx_end, list_1_idx_start, list_1_idx_end)
		if not single_homo:
			df2 = run_epistasis(list_2_idx_start, list_2_idx_end, list_2_idx_start, list_2_idx_end)

	def format_results(df, final_columns, threshold):
		final = df.loc[:, final_columns]
		final = final[final['PValue'] <= threshold]
		return final

	v.update(locals())
	# output to csv
	out_path = '%(output_dir)s/%(dataset)s_%(process_id)s.gwas' % v
	final = format_results(df, final_columns, p_value_threshold)
	final.to_csv(out_path, sep='\t', index=False)
	if df2 is not None:
		# Append the second group's rows without repeating the header line.
		final = format_results(df2, final_columns, p_value_threshold)
		final.to_csv(out_path, mode='a', header=False, sep='\t', index=False)