def test_c_reader_pheno(self):
    """Exercise ``Pheno`` reading, writing and its alternative constructors.

    Steps, in order:
      * read ``toydata.phe`` and check that the values are ``float64``;
      * write it back out with one missing value and check the round trip;
      * build ``Pheno`` from the dictionaries returned by ``loadOnePhen``
        (2-d values, then vectorized 1-d values) and compare the values;
      * build a ``Pheno`` from ``iid_if_none`` only and check it has no columns;
      * read ``toydata.id.phe`` and ``toydata.fid.phe`` and compare their
        values with those of ``toydata.phe``.
    """
    toydata_fn = self.currentFolder + "/examples/toydata.phe"

    snpdata1 = Pheno(toydata_fn).read()
    self.assertEqual(np.float64, snpdata1.val.dtype)

    # Inject a missing value to test writing and reading missing values.
    # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in NumPy 2.
    snpdata1.val[1, 0] = np.nan
    output = "tempdir/snpreader/toydata.phe"
    create_directory_if_necessary(output)
    Pheno.write(output, snpdata1)
    snpreader = Pheno(output)
    _fortesting_JustCheckExists().input(snpreader)
    str(snpreader)  # only checks that the string conversion does not raise
    snpdata2 = snpreader.read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

    # Re-read the untouched file to get a reference without the injected NaN.
    snpdata1 = Pheno(toydata_fn).read()
    import pysnptools.util.pheno as pstpheno

    pheno_dict = pstpheno.loadOnePhen(toydata_fn, missing="")
    snpdata3 = Pheno(pheno_dict).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

    pheno_dict = pstpheno.loadOnePhen(toydata_fn, missing="", vectorize=True)
    assert len(pheno_dict['vals'].shape) == 1, "test 1-d array of values"
    snpdata3 = Pheno(pheno_dict).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

    snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
    assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

    snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)
    snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
def test_c_reader_pheno(self):
    """Check ``Pheno`` round-trips data and agrees with the other loaders.

    The test reads ``toydata.phe``, writes it to a temporary location with a
    missing value injected, reads it back, and then compares the reference
    values against ``Pheno`` objects built from ``loadOnePhen`` dictionaries,
    from ``iid_if_none`` alone, and from the ``.id.phe`` / ``.fid.phe`` files.
    """
    examples = self.currentFolder + "/examples/"
    phe_path = examples + "toydata.phe"

    snpdata1 = Pheno(phe_path).read()
    self.assertEqual(np.float64, snpdata1.val.dtype)

    # Inject a missing value to test writing and reading missing values
    # (``np.nan``: the ``np.NaN`` spelling was removed in NumPy 2.0).
    snpdata1.val[1, 0] = np.nan
    output = "tempdir/snpreader/toydata.phe"
    create_directory_if_necessary(output)
    Pheno.write(output, snpdata1)

    snpreader = Pheno(output)
    _fortesting_JustCheckExists().input(snpreader)
    str(snpreader)  # string conversion must not raise; the text is not checked
    snpdata2 = snpreader.read()
    np.testing.assert_array_almost_equal(
        snpdata1.val, snpdata2.val, decimal=10)

    # Fresh reference copy, without the injected missing value.
    snpdata1 = Pheno(phe_path).read()
    import pysnptools.util.pheno as pstpheno

    # Dictionary input with 2-d values, then with vectorized (1-d) values.
    loaded = pstpheno.loadOnePhen(phe_path, missing="")
    snpdata3 = Pheno(loaded).read()
    np.testing.assert_array_almost_equal(
        snpdata1.val, snpdata3.val, decimal=10)

    loaded = pstpheno.loadOnePhen(phe_path, missing="", vectorize=True)
    assert len(loaded['vals'].shape) == 1, "test 1-d array of values"
    snpdata3 = Pheno(loaded).read()
    np.testing.assert_array_almost_equal(
        snpdata1.val, snpdata3.val, decimal=10)

    # No input at all: rows come from ``iid_if_none`` and there are no columns.
    snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
    assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

    snpdata5 = Pheno(examples + "toydata.id.phe").read()
    np.testing.assert_array_almost_equal(
        snpdata1.val, snpdata5.val, decimal=10)
    snpdata6 = Pheno(examples + "toydata.fid.phe").read()
    np.testing.assert_array_almost_equal(
        snpdata1.val, snpdata6.val, decimal=10)
def load_data(self):
    """load data

    Loads the phenotype (``self.pheno_fn`` / ``self.mpheno``), the covariates
    (via ``self.load_covariates``) and the SNPs (via ``GClass.factory``),
    intersects the three by individual id and stores the results on the
    instance as ``self.y``, ``self.X`` and ``self.G``. Also sets ``self.sid``
    and ``self.ind_iid``. ``self.X`` and ``self.y`` are made read-only.

    Raises
    ------
    Exception
        If ``self.num_snps_in_memory`` is not strictly greater than
        ``self.snpreader.iid_count``.
    """
    tt0 = time.time()
    logging.info("loading data...")

    iid_count = self.snpreader.iid_count
    if self.num_snps_in_memory <= iid_count:
        # Report the value that was actually compared; the message labels it
        # as self.snpreader.iid_count.
        raise Exception(
            "Expect self.num_snps_in_memory, {0} > self.snpreader.iid_count, {1}".format(
                self.num_snps_in_memory, iid_count))

    self.sid = pd.Series(self.snpreader.sid)

    # load phenotype
    pheno = pstpheno.loadOnePhen(self.pheno_fn, self.mpheno, vectorize=True)
    #!!LATER: bug? It looks like we record the pre-intersect iids only to
    # write out the pcs later? Why?
    self.ind_iid = pheno['iid']

    # load covariates
    self.X, cov_iid = self.load_covariates(pheno)

    # Set up the snps
    # G is the standardized snps. The GClass.factory will either load them
    # into memory or will note their file and read them as needed.
    self.G = GClass.factory(self.snpreader, self.num_snps_in_memory,
                            self.standardizer, self.blocksize)

    #!!LATER Should we give preference to self.G since reordering it is the
    # most expensive?
    (self.y, yiid), (self.X, xiid), self.G = pstutil.intersect_apply(
        [(pheno['vals'], pheno['iid']), (self.X, cov_iid), self.G],
        sort_by_dataset=False)

    # make sure input data isn't modified
    self.X.flags.writeable = False
    self.y.flags.writeable = False

    logging.info("...done. Loading time %.2f s" % (float(time.time() - tt0)))
def setUpClass(self):
    """Load the mouse SNP and phenotype fixtures shared by the tests.

    Sets ``self.snp_fn`` and ``self.pheno_fn`` (paths relative to this file),
    ``self.G`` (unit-standardized SNP values), ``self.y`` (first phenotype
    column) and ``self.G_cov`` (a single column of ones). All three arrays
    are marked read-only.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    self.snp_fn = here + "/../../tests/datasets/mouse/alldata"
    self.pheno_fn = here + "/../../tests/datasets/mouse/pheno_10_causals.txt"

    # Load both inputs and keep only the sample ids they share.
    bed_reader = Bed(self.snp_fn)
    pheno_dict = pstpheno.loadOnePhen(self.pheno_fn)
    bed_reader, pheno_dict = pysnptools.util.intersect_apply(
        [bed_reader, pheno_dict])

    raw_snps = bed_reader.read(order='C').val
    self.G = stdizer.Unit().standardize(raw_snps)
    self.G.flags.writeable = False

    self.y = pheno_dict['vals'][:, 0]
    self.y.flags.writeable = False

    # No covariate file is loaded here; a constant column is used instead.
    n_samples = len(self.y)
    self.G_cov = np.ones((n_samples, 1))
    self.G_cov.flags.writeable = False
def load_data(self):
    """load data

    Runs with the ``ARRAY_MODULE`` environment variable temporarily set to
    ``'numpy'``. Loads the phenotype (``self.pheno_fn`` / ``self.mpheno``),
    the covariates (via ``self.load_covariates``) and the SNPs (via
    ``GClass.factory``), intersects the three by individual id and stores the
    results as ``self.y``, ``self.X`` and ``self.G``. Also sets ``self.sid``
    and ``self.ind_iid``. ``self.X`` and ``self.y`` are made read-only.

    Raises
    ------
    Exception
        If ``self.num_snps_in_memory`` is not strictly greater than
        ``self.snpreader.iid_count``.
    """
    # NOTE(review): the whole body is assumed to belong inside the ``with``.
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        tt0 = time.time()
        logging.info("loading data...")

        iid_count = self.snpreader.iid_count
        if self.num_snps_in_memory <= iid_count:
            # Report the value that was actually compared; the message labels
            # it as self.snpreader.iid_count.
            raise Exception(
                "Expect self.num_snps_in_memory, {0} > self.snpreader.iid_count, {1}".format(
                    self.num_snps_in_memory, iid_count))

        self.sid = pd.Series(self.snpreader.sid)

        # load phenotype
        pheno = pstpheno.loadOnePhen(self.pheno_fn, self.mpheno, vectorize=True)
        #!!LATER: bug? It looks like we record the pre-intersect iids only to
        # write out the pcs later? Why?
        self.ind_iid = pheno['iid']

        # load covariates
        self.X, cov_iid = self.load_covariates(pheno)

        # Set up the snps
        # G is the standardized snps. The GClass.factory will either load them
        # into memory or will note their file and read them as needed.
        self.G = GClass.factory(self.snpreader, self.num_snps_in_memory,
                                self.standardizer, self.blocksize,
                                count_A1=self.count_A1)

        #!!LATER Should we give preference to self.G since reordering it is
        # the most expensive?
        (self.y, yiid), (self.X, xiid), self.G = pstutil.intersect_apply(
            [(pheno['vals'], pheno['iid']), (self.X, cov_iid), self.G],
            sort_by_dataset=False)

        # make sure input data isn't modified
        self.X.flags.writeable = False
        self.y.flags.writeable = False

        logging.info("...done. Loading time %.2f s" % (float(time.time() - tt0)))
def loadPheno(bed, phenoFile, missingPhenotype="-9", keepDict=False):
    """Load one phenotype from ``phenoFile`` and intersect it with ``bed``.

    The phenotype is loaded with ``missingPhenotype`` as the missing-value
    marker, checked against ``bed`` by ``checkIntersection`` and then both
    are passed through ``pstutil.intersect_apply``.

    Returns ``(bed, pheno)``. ``pheno`` is the whole phenotype dictionary when
    ``keepDict`` is true, otherwise only its ``"vals"`` entry.
    """
    phenoDict = phenoUtils.loadOnePhen(
        phenoFile, missing=missingPhenotype, vectorize=True)
    checkIntersection(bed, phenoDict, "phenotypes")
    bed, phenoDict = pstutil.intersect_apply([bed, phenoDict])
    if keepDict:
        return bed, phenoDict
    return bed, phenoDict["vals"]
def setUpClass(self):
    """Prepare the mouse dataset used by the tests of this class.

    Stores the fixture paths in ``self.snp_fn`` / ``self.pheno_fn`` and
    read-only arrays in ``self.G`` (unit-standardized SNP values), ``self.y``
    (first phenotype column) and ``self.G_cov`` (one column of ones).
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    self.snp_fn = base_dir + "/../../tests/datasets/mouse/alldata"
    self.pheno_fn = base_dir + "/../../tests/datasets/mouse/pheno_10_causals.txt"

    # load data, then intersect sample ids
    readers = [Bed(self.snp_fn), pstpheno.loadOnePhen(self.pheno_fn)]
    snps, phenotype = pysnptools.util.intersect_apply(readers)

    unstandardized = snps.read(order='C').val
    self.G = stdizer.Unit().standardize(unstandardized)
    self.G.flags.writeable = False

    self.y = phenotype['vals'][:, 0]
    self.y.flags.writeable = False

    # Covariates are not read from disk; use a constant column instead.
    self.G_cov = np.ones((len(self.y), 1))
    self.G_cov.flags.writeable = False
def load_snp_data(snpreader, pheno_fn, cov_fn=None, offset=True, mpheno=0, standardizer=Unit()):
    """Load plink files

    Parameters
    ----------
    snpreader : snpreader object
        object to read in binary SNP file
    pheno_fn : str
        File name of phenotype file
    cov_fn : str
        File name of covariates file. When None, a single column of ones is
        used as covariates.
    offset : bool, default=True
        Adds offset to the covariates specified in cov_fn, if necessary
    mpheno : int, default=0
        Passed to ``loadOnePhen`` as its second argument (presumably the
        index of the phenotype to load -- confirm against ``loadOnePhen``).
    standardizer : standardizer object, default=Unit()
        Applied to the SNP data right after it is read.

    Returns
    -------
    G : result of ``read(order='C', view_ok=True)`` on the intersected SNPs
        SNP matrix
    X : array, shape = [n_samples, n_covariates]
        Matrix of covariates (e.g. age, gender)
    y : array, shape = [n_samples]
        Phenotype (target) vector
    """
    #TODO: completely remove this
    pheno = pstpheno.loadOnePhen(pheno_fn, mpheno, vectorize=True)
    geno = snpreader.read(order='C').standardize(standardizer)

    # load covariates or generate vector of ones (for bias)
    if cov_fn is None:
        cov = {'vals': np.ones((len(pheno['iid']), 1)), 'iid': pheno['iid']}
    else:
        cov = pstpheno.loadPhen(cov_fn)

    (y, _), G, (X, _) = pstutil.intersect_apply(
        [(pheno['vals'], pheno['iid']), geno, (cov['vals'], cov['iid'])],
        sort_by_dataset=False)
    G = G.read(order='C', view_ok=True)

    # add bias column if not present (no covariate column is constant).
    # The column length comes from X itself; the previous code referenced an
    # undefined name here and used NumPy aliases that SciPy no longer exports.
    if offset and np.all(X.std(0) != 0):
        X = np.hstack((X, np.ones((X.shape[0], 1))))

    return G, X, y
def loadPheno(bed, phenoFile, missingPhenotype='-9', keepDict=False):
    """Read a phenotype file and align it with ``bed``.

    ``phenoFile`` is loaded with ``loadOnePhen`` (vectorized, using
    ``missingPhenotype`` as the missing marker), validated against ``bed``
    with ``checkIntersection`` and intersected with it.

    Returns a ``(bed, pheno)`` pair; ``pheno`` is the full dictionary if
    ``keepDict`` is true and just its ``'vals'`` entry otherwise.
    """
    loaded = phenoUtils.loadOnePhen(
        phenoFile, missing=missingPhenotype, vectorize=True)
    checkIntersection(bed, loaded, 'phenotypes')
    bed, loaded = pstutil.intersect_apply([bed, loaded])
    result = loaded if keepDict else loaded['vals']
    return bed, result
def loadCovars(bed, covarFile):
    """Load covariates from ``covarFile``, align them with ``bed`` and standardize.

    The covariates are checked against ``bed`` (``checkSuperSet=True``),
    intersected with it, and each column is then shifted by its mean and
    divided by its standard deviation in place.

    Returns the standardized ``"vals"`` array.
    """
    loaded = phenoUtils.loadOnePhen(covarFile, vectorize=False)
    checkIntersection(bed, loaded, "covariates", checkSuperSet=True)
    aligned = pstutil.intersect_apply([bed, loaded])[1]

    values = aligned["vals"]
    # In-place operations, applied per column (axis 0).
    column_means = np.mean(values, axis=0)
    values -= column_means
    column_stds = np.std(values, axis=0)
    values /= column_stds
    return values
def loadCovars(bed, covarFile):
    """Return the covariates in ``covarFile``, aligned to ``bed`` and standardized.

    After ``checkIntersection`` (with ``checkSuperSet=True``) and
    ``intersect_apply``, every column of the ``'vals'`` array is centred on
    its mean and scaled by its standard deviation, modifying the array in
    place.
    """
    covarsDict = phenoUtils.loadOnePhen(covarFile, vectorize=False)
    checkIntersection(bed, covarsDict, 'covariates', checkSuperSet=True)
    _unused_bed, covarsDict = pstutil.intersect_apply([bed, covarsDict])

    standardized = covarsDict['vals']
    standardized -= np.mean(standardized, axis=0)  # centre each column
    standardized /= np.std(standardized, axis=0)   # scale each column
    return standardized
def loadRelatedFile(bed, relFile):
    """Load a relatedness file and flag the individuals to keep.

    The values in ``relFile`` are checked against ``bed`` (with
    ``checkSuperSet=True``) and intersected with it. An individual is kept
    when its value is below 0.5.

    Prints how many individuals will be removed and returns the boolean
    ``keepArr`` mask.
    """
    relatedDict = phenoUtils.loadOnePhen(relFile, vectorize=True)
    checkIntersection(bed, relatedDict, "relatedness", checkSuperSet=True)
    _, relatedDict = pstutil.intersect_apply([bed, relatedDict])
    related = relatedDict["vals"]
    keepArr = related < 0.5
    # Single-argument print: same output on Python 2 and Python 3.
    print("%s individuals will be removed due to high relatedness" % np.sum(~keepArr))
    return keepArr
def loadRelatedFile(bed, relFile):
    """Return a boolean mask of individuals whose relatedness value is below 0.5.

    ``relFile`` is loaded with ``loadOnePhen``, validated against ``bed`` by
    ``checkIntersection`` (``checkSuperSet=True``) and intersected with it.
    The number of individuals that fail the threshold is printed.
    """
    relatedDict = phenoUtils.loadOnePhen(relFile, vectorize=True)
    checkIntersection(bed, relatedDict, 'relatedness', checkSuperSet=True)
    _, relatedDict = pstutil.intersect_apply([bed, relatedDict])
    related = relatedDict['vals']
    keepArr = (related < 0.5)
    numRemoved = np.sum(~keepArr)
    # Formatted into one string so the call prints identically on Python 2
    # and Python 3 (the old print statement is a syntax error on Python 3).
    print('%s individuals will be removed due to high relatedness' % numRemoved)
    return keepArr
def main():
    """
    example that compares output to fastlmmc

    Runs ``WindowingGwas`` on the toy dataset, runs ``GwasTest`` on the same
    files, plots the two sets of log p-values against each other, asserts
    that they agree to 3 decimals and finally draws a Manhattan plot.
    """
    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom.bed"
    #chrom_count = 5

    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:, 0]
    # Previously a bare attribute access with no effect; make y read-only
    # like G above.
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn, phen_fn, snp_pos_sim, snp_pos_test, delta,
                      REML=REML, excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    pylab.plot(np.arange(-18, 0), np.arange(-18, 0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv), np.log(gwas_c.p_values), decimal=3)

    simple_manhattan_plot(pv)
def _fixup_pheno(pheno, bed=None, missingPhenotype='-9'):
    """Turn a phenotype file name into loaded phenotype data, if needed.

    If ``pheno`` is not a string it is passed through unchanged. If it is a
    string it is treated as a file name: with a ``bed`` it is loaded through
    ``loadPheno`` (``keepDict=True``), without one through ``loadOnePhen``.

    Returns ``(bed, pheno)`` when ``bed`` is given, otherwise just ``pheno``.
    """
    has_bed = bed is not None

    # Already-loaded phenotype: nothing to do.
    if not isinstance(pheno, str):
        return (bed, pheno) if has_bed else pheno

    if not has_bed:
        return phenoUtils.loadOnePhen(
            pheno, missing=missingPhenotype, vectorize=True)

    bed, pheno = loadPheno(bed, pheno, missingPhenotype, keepDict=True)
    return bed, pheno
def load_snp_data(snpreader, pheno_fn, cov_fn=None, offset=True, mpheno=0, standardizer=Unit()):
    """Load plink files

    Parameters
    ----------
    snpreader : snpreader object
        object to read in binary SNP file
    pheno_fn : str
        File name of phenotype file
    cov_fn : str
        File name of covariates file. When None, a single column of ones is
        used as covariates.
    offset : bool, default=True
        Adds offset to the covariates specified in cov_fn, if necessary
    mpheno : int, default=0
        Passed to ``loadOnePhen`` as its second argument (presumably the
        index of the phenotype to load -- confirm against ``loadOnePhen``).
    standardizer : standardizer object, default=Unit()
        Applied to the SNP data right after it is read.

    Returns
    -------
    G : result of ``read(order='C', view_ok=True)`` on the intersected SNPs
        SNP matrix
    X : array, shape = [n_samples, n_covariates]
        Matrix of covariates (e.g. age, gender)
    y : array, shape = [n_samples]
        Phenotype (target) vector
    """
    #TODO: completely remove this
    pheno = pstpheno.loadOnePhen(pheno_fn, mpheno, vectorize=True)
    geno = snpreader.read(order='C').standardize(standardizer)

    # load covariates or generate vector of ones (for bias)
    if cov_fn is None:
        n_pheno = len(pheno['iid'])
        cov = {'vals': np.ones((n_pheno, 1)), 'iid': pheno['iid']}
    else:
        cov = pstpheno.loadPhen(cov_fn)

    datasets = [(pheno['vals'], pheno['iid']), geno, (cov['vals'], cov['iid'])]
    (y, _), G, (X, _) = pstutil.intersect_apply(datasets, sort_by_dataset=False)
    G = G.read(order='C', view_ok=True)

    # add bias column if not present, i.e. when no covariate column is
    # constant. The row count is taken from X; the earlier code used an
    # undefined name for it and NumPy aliases that SciPy no longer provides.
    if offset and np.all(X.std(0) != 0):
        bias = np.ones((X.shape[0], 1))
        X = np.hstack((X, bias))

    return G, X, y
def test_preload_files(self):
    """Run ``single_snp`` with phenotype and covariates loaded beforehand.

    The first 10 SNPs of the Bed file are tested, the Bed file name is passed
    as ``G0``, and the resulting frame is compared with the "one" reference.
    """
    logging.info("TestSingleSnp test_preload_files")
    bed_fn = self.bedbase
    loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    loaded_covar = pstpheno.loadPhen(self.cov_fn)
    bed_reader = Bed(bed_fn, count_A1=False)

    out_fn = self.file_name("preload_files")
    result = single_snp(
        test_snps=bed_reader[:, :10],
        pheno=loaded_pheno,
        G0=bed_fn,
        mixing=0,
        leave_out_one_chrom=False,
        covar=loaded_covar,
        output_file_name=out_fn,
        count_A1=False,
    )
    self.compare_files(result, "one")
def main():
    """
    example that compares output to fastlmmc

    Computes p-values with ``WindowingGwas`` and with ``GwasTest`` for the
    toy dataset, plots their logs against each other, asserts agreement to
    3 decimals and shows a Manhattan plot of the ``WindowingGwas`` p-values.
    """
    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom"
    #chrom_count = 5

    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:, 0]
    # The assignment was missing, which left this line without any effect;
    # y is now protected against modification just like G.
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn, phen_fn, snp_pos_sim, snp_pos_test, delta,
                      REML=REML, excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    log_pv = np.log(pv)
    pylab.plot(log_pv, np.log(gwas_c.p_values), "+")
    pylab.plot(np.arange(-18, 0), np.arange(-18, 0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv), np.log(gwas_c.p_values), decimal=3)

    simple_manhattan_plot(pv)
def test_preload_files(self):
    """Check ``single_snp`` when phenotype and covariates are given as loaded data.

    Tests the first 10 SNPs of the Bed file with the Bed file name as ``G0``
    and compares the result with the "one" reference.
    """
    logging.info("TestSingleSnp test_preload_files")
    from pysnptools.snpreader import Bed

    snp_file = self.bedbase
    preloaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    preloaded_covar = pstpheno.loadPhen(self.cov_fn)
    reader = Bed(snp_file)

    target = self.file_name("preload_files")
    results = single_snp(
        test_snps=reader[:, :10],
        pheno=preloaded_pheno,
        G0=snp_file,
        mixing=0,
        covar=preloaded_covar,
        output_file_name=target,
    )
    self.compare_files(results, "one")
def load_intersect(snp_reader, pheno_fn_or_none, snp_set=AllSnps()):
    """
    load SNPs and phenotype, intersect ids
    ----------------------------------------------------------------------
    Input:
    snp_reader       : SnpReader object (e.g. BedReader)
    pheno_fn_or_none : str, file name of phenotype file, or None to load
                       the SNPs only
    snp_set          : passed to ``snp_reader.read``, default AllSnps()
    ----------------------------------------------------------------------
    Output:
    G         : numpy array containing the unit-standardized SNP data
    y         : numpy (1d) containing phenotype, or None when no phenotype
                file was given
    snp_names : the 'rs' entry of the data that was read
    chr_ids   : first column of the 'pos' entry of the data that was read
    ----------------------------------------------------------------------
    """
    standardizer = stdizer.Unit()

    geno = snp_reader.read(order='C', snp_set=snp_set)
    G = geno['snps']
    G = standardizer.standardize(G)

    snp_names = geno['rs']
    chr_ids = geno['pos'][:, 0]

    if pheno_fn_or_none is not None:
        # load phenotype
        pheno = pstpheno.loadOnePhen(pheno_fn_or_none, 0)
        y = pheno['vals'][:, 0]

        # load covariates and intersect ids
        import warnings
        warnings.warn(
            "This intersect_ids is deprecated. Pysnptools includes newer versions of intersect_ids",
            DeprecationWarning)
        indarr = util.intersect_ids([pheno['iid'], snp_reader.original_iids])

        if not (indarr[:, 0] == indarr[:, 1]).all():
            assert False, "ERROR: this code assumes the same order for snp and phen file"

            # NOTE(review): the lines below only run when assertions are
            # disabled (python -O); confirm that reindexing is really wanted.
            print("reindexing")
            y = y[indarr[:, 0]]
            G = G[indarr[:, 1]]
    else:
        y = None

    return G, y, snp_names, chr_ids
def load_intersect(snp_reader, pheno_fn_or_none,snp_set=AllSnps()):
    """
    load SNPs and phenotype, intersect ids
    ----------------------------------------------------------------------
    Input:
    snp_reader       : SnpReader object (e.g. BedReader)
    pheno_fn_or_none : str, file name of phenotype file; None skips the
                       phenotype and returns y = None
    snp_set          : forwarded to ``snp_reader.read`` (default AllSnps())
    ----------------------------------------------------------------------
    Output:
    G         : numpy array containing SNP data, after unit standardization
    y         : numpy (1d) containing phenotype, or None
    snp_names : 'rs' entry returned by the reader
    chr_ids   : column 0 of the 'pos' entry returned by the reader
    ----------------------------------------------------------------------
    """
    geno = snp_reader.read(order='C', snp_set=snp_set)
    G = stdizer.Unit().standardize(geno['snps'])
    snp_names = geno['rs']
    chr_ids = geno['pos'][:, 0]

    if pheno_fn_or_none is None:
        return G, None, snp_names, chr_ids

    # load phenotype
    pheno = pstpheno.loadOnePhen(pheno_fn_or_none, 0)
    y = pheno['vals'][:, 0]

    # load covariates and intersect ids
    import warnings
    warnings.warn("This intersect_ids is deprecated. Pysnptools includes newer versions of intersect_ids", DeprecationWarning)
    indarr = util.intersect_ids([pheno['iid'], snp_reader.original_iids])

    same_order = (indarr[:, 0] == indarr[:, 1]).all()
    if not same_order:
        assert False, "ERROR: this code assumes the same order for snp and phen file"

        # NOTE(review): reachable only when assertions are stripped
        # (python -O); verify that silently reindexing is intended then.
        print("reindexing")
        y = y[indarr[:, 0]]
        G = G[indarr[:, 1]]

    return G, y, snp_names, chr_ids
def test_SNC(self):
    """Run ``single_snp`` on data in which one SNP column is constant.

    Column 2 of the in-memory SNP values is overwritten with zeros, the
    first 10 SNPs are tested with the modified data as ``G0``, and the
    result is compared with the "snc" reference.
    """
    logging.info("TestSNC")
    bed_fn = self.bedbase
    loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    loaded_covar = pstpheno.loadPhen(self.cov_fn)

    snc = Bed(bed_fn, count_A1=False).read()
    # make SNP #2 have constant values (aka a SNC)
    zeros = [0] * snc.iid_count
    snc.val[:, 2] = zeros

    out_fn = self.file_name("snc")
    result = single_snp(
        test_snps=snc[:, :10],
        pheno=loaded_pheno,
        G0=snc,
        mixing=0,
        leave_out_one_chrom=False,
        covar=loaded_covar,
        output_file_name=out_fn,
        count_A1=False,
    )
    self.compare_files(result, "snc")
def test_cid_intersect(self):
    """Check that ``single_snp`` aligns a reordered phenotype with an extra individual.

    The loaded phenotype is reversed and a 'Bogus' individual with value
    -34343 is appended before calling ``single_snp``; the result is compared
    with the same "one" reference used for unmodified input.
    """
    logging.info("TestSingleSnp test_cid_intersect")
    snps = Bed(self.bedbase, count_A1=False)
    pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)

    # Reverse the order and add one individual that exists nowhere else.
    reversed_iid = pheno['iid'][::-1]
    reversed_vals = pheno['vals'][::-1]
    pheno['iid'] = np.vstack([reversed_iid, [['Bogus', 'Bogus']]])
    pheno['vals'] = np.hstack([reversed_vals, [-34343]])

    covar_fn = self.cov_fn
    out_fn = self.file_name("cid_intersect")
    result = single_snp(
        test_snps=snps[:, :10],
        pheno=pheno,
        G0=snps,
        leave_out_one_chrom=False,
        covar=covar_fn,
        mixing=0,
        output_file_name=out_fn,
        count_A1=False,
    )
    self.compare_files(result, "one")
def test_preload_files(self):
    """Run ``single_snp`` with preloaded phenotype and covariate data.

    Uses the first 10 SNPs of the Bed file as test SNPs and the Bed file
    name as ``G0``; the output is compared with the "one" reference.
    """
    logging.info("TestSingleSnp test_preload_files")
    snp_path = self.bedbase
    pheno_data = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    covar_data = pstpheno.loadPhen(self.cov_fn)
    snp_reader = Bed(snp_path)

    destination = self.file_name("preload_files")
    options = dict(
        test_snps=snp_reader[:, :10],
        pheno=pheno_data,
        G0=snp_path,
        mixing=0,
        leave_out_one_chrom=False,
        covar=covar_data,
        output_file_name=destination,
    )
    self.compare_files(single_snp(**options), "one")
def test_preload_files(self):
    """Run ``epistasis`` with phenotype and covariates loaded beforehand.

    The two SNP lists overlap: the first is SNPs 0-9 and the second is SNPs
    5-14 of the Bed file. The SNP0 / SNP1 / PValue columns of the result are
    compared with the "one" reference.
    """
    logging.info("TestEpistasis test_preload_files")
    from pysnptools.snpreader import Bed

    bed_fn = self.bedbase
    loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    loaded_covar = pstpheno.loadPhen(self.cov_fn)
    reader = Bed(bed_fn)

    out_fn = self.file_name("preload_files")
    result = epistasis(
        bed_fn,
        loaded_pheno,
        G0=bed_fn,
        covar=loaded_covar,
        sid_list_0=reader.sid[:10],   # first 10 snps
        sid_list_1=reader.sid[5:15],  # Skip 5 snps, use next 10
        output_file_name=out_fn,
    )

    sid0 = np.array(result['SNP0'])
    sid1 = np.array(result['SNP1'])
    pvalue_list = np.array(result['PValue'])
    self.compare_files(sid0, sid1, pvalue_list, "one")
def test_cid_intersect(self):
    """Verify ``single_snp`` handles a phenotype whose ids need intersecting.

    The phenotype rows are reversed and one 'Bogus' individual (value
    -34343) is appended; covariates are passed as a file name. The result
    must still match the "one" reference.
    """
    logging.info("TestSingleSnp test_cid_intersect")
    bed_reader = Bed(self.bedbase)
    shuffled = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)

    extra_iid = [['Bogus', 'Bogus']]
    extra_val = [-34343]
    shuffled['iid'] = np.vstack([shuffled['iid'][::-1], extra_iid])
    shuffled['vals'] = np.hstack([shuffled['vals'][::-1], extra_val])

    covariate_file = self.cov_fn
    target = self.file_name("cid_intersect")
    options = dict(
        test_snps=bed_reader[:, :10],
        pheno=shuffled,
        G0=bed_reader,
        leave_out_one_chrom=False,
        covar=covariate_file,
        mixing=0,
        output_file_name=target,
    )
    self.compare_files(single_snp(**options), "one")
def _run_once(self):
    """Resolve the inputs stored on ``self``; does nothing after a completed run.

    String inputs are treated as file names and replaced by loaded objects:
    ``test_snps``, ``G0`` and ``G1_or_none`` become ``Bed`` readers, while
    ``pheno`` and ``covar`` are loaded with ``missing='NaN'``. Missing sid
    lists default to all test SNPs, ``set_sid_sets`` is called, a column of
    ones is appended to (or used as) the covariates, and ``n_cov`` and the
    temporary directory name are set.
    """
    if self._ran_once:
        return
    # Falsy placeholder; replaced by True only when this method completes.
    self._ran_once = None

    # File names -> loaded objects. None is never a str, so no separate
    # None check is needed for the optional inputs.
    if isinstance(self.test_snps, str):
        self.test_snps = Bed(self.test_snps)
    if isinstance(self.G0, str):
        self.G0 = Bed(self.G0)
    if isinstance(self.pheno, str):
        self.pheno = pstpheno.loadOnePhen(
            self.pheno, vectorize=True, missing='NaN')
    if isinstance(self.covar, str):
        self.covar = pstpheno.loadPhen(self.covar, missing='NaN')
    if isinstance(self.G1_or_none, str):
        self.G1_or_none = Bed(self.G1_or_none)

    all_sids = self.test_snps.sid
    if self.sid_list_0 is None:
        self.sid_list_0 = all_sids
    if self.sid_list_1 is None:
        self.sid_list_1 = all_sids
    self.set_sid_sets()

    #!!Should fix up to add only of no constant columns - will need to add
    # a test case for this
    ones_column = np.ones((self.test_snps.iid_count, 1))
    if self.covar is None:
        self.covar = ones_column
    else:
        self.covar = np.hstack((self.covar['vals'], ones_column))
    self.n_cov = self.covar.shape[1]

    prefix = "" if self.output_file_or_none is None else self.output_file_or_none
    self.__tempdirectory = prefix + ".working"

    self._ran_once = True
def _run_once(self):
    """Prepare the inputs held on ``self``; a completed run is never repeated.

    Inputs given as strings are treated as file names: ``test_snps``, ``G0``
    and ``G1_or_none`` are wrapped in ``Bed``; ``pheno`` and ``covar`` are
    loaded from file. Unset sid lists fall back to every test SNP, then
    ``set_sid_sets`` runs. The covariates get an extra column of ones (or
    become that column when none were given). Finally ``n_cov`` and the
    temporary directory name are recorded.
    """
    if self._ran_once:
        return
    # Stays falsy until the very end, where it is switched to True.
    self._ran_once = None

    # None is not a str, so isinstance alone also covers the optional inputs.
    if isinstance(self.test_snps, str):
        self.test_snps = Bed(self.test_snps)
    if isinstance(self.G0, str):
        self.G0 = Bed(self.G0)
    if isinstance(self.pheno, str):
        #!! what about missing=-9?
        self.pheno = pstpheno.loadOnePhen(self.pheno, vectorize=True)
    if isinstance(self.covar, str):
        #!! what about missing=-9?
        self.covar = pstpheno.loadPhen(self.covar)
    if isinstance(self.G1_or_none, str):
        self.G1_or_none = Bed(self.G1_or_none)

    default_sids = self.test_snps.sid
    if self.sid_list_0 is None:
        self.sid_list_0 = default_sids
    if self.sid_list_1 is None:
        self.sid_list_1 = default_sids
    self.set_sid_sets()

    #!!Should fix up to add only of no constant columns - will need to add
    # a test case for this
    bias = np.ones((self.test_snps.iid_count, 1))
    self.covar = bias if self.covar is None else np.hstack((self.covar['vals'], bias))
    self.n_cov = self.covar.shape[1]

    if self.output_file_or_none is not None:
        self.__tempdirectory = self.output_file_or_none + ".working"
    else:
        self.__tempdirectory = ".working"

    self._ran_once = True
def test_SNC(self):
    """Check ``single_snp`` when one of the SNPs has no variation.

    SNP column 2 of the data read from the Bed file is set to 0 everywhere;
    the modified data is used both for the test SNPs (first 10) and as
    ``G0``. The result is compared with the "snc" reference.
    """
    logging.info("TestSNC")
    snp_path = self.bedbase
    pheno_data = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    covar_data = pstpheno.loadPhen(self.cov_fn)

    reader = Bed(snp_path, count_A1=False)
    constant_snp_data = reader.read()
    # make SNP #2 have constant values (aka a SNC)
    constant_snp_data.val[:, 2] = 0

    destination = self.file_name("snc")
    options = dict(
        test_snps=constant_snp_data[:, :10],
        pheno=pheno_data,
        G0=constant_snp_data,
        mixing=0,
        leave_out_one_chrom=False,
        covar=covar_data,
        output_file_name=destination,
        count_A1=False,
    )
    self.compare_files(single_snp(**options), "snc")
def test_cid_intersect(self):
    """Check that ``epistasis`` aligns a reordered phenotype with an extra individual.

    The phenotype is reversed and a 'Bogus' individual (value -34343) is
    appended. SNPs 0-9 are paired with SNPs 5-14, and the SNP0 / SNP1 /
    PValue columns of the result are compared with the "one" reference.
    """
    logging.info("TestEpistasis test_cid_intersect")
    from pysnptools.snpreader import Bed

    snps = Bed(self.bedbase)
    pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)

    # Reverse the order and add one individual that exists nowhere else.
    reversed_iid = pheno['iid'][::-1]
    reversed_vals = pheno['vals'][::-1]
    pheno['iid'] = np.vstack([reversed_iid, [['Bogus', 'Bogus']]])
    pheno['vals'] = np.hstack([reversed_vals, [-34343]])

    covar_fn = self.cov_fn
    out_fn = self.file_name("cid_intersect")
    result = epistasis(
        snps,
        pheno,
        G0=snps,
        covar=covar_fn,
        sid_list_0=snps.sid[:10],   # first 10 snps
        sid_list_1=snps.sid[5:15],  # Skip 5 snps, use next 10
        output_file_name=out_fn,
    )

    columns = [np.array(result[name]) for name in ('SNP0', 'SNP1', 'PValue')]
    sid0, sid1, pvalue_list = columns
    self.compare_files(sid0, sid1, pvalue_list, "one")
def test_SNC(self):
    """Run ``single_snp`` on SNP data that contains a constant column.

    After reading the Bed file, column 2 is overwritten with zeros. The
    first 10 SNPs are tested against the modified data as ``G0`` and the
    output is compared with the "snc" reference.
    """
    logging.info("TestSNC")
    from pysnptools.snpreader import Bed

    bed_fn = self.bedbase
    loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
    loaded_covar = pstpheno.loadPhen(self.cov_fn)

    snc = Bed(bed_fn).read()
    # make SNP #2 have constant values (aka a SNC)
    constant_column = [0] * snc.iid_count
    snc.val[:, 2] = constant_column

    out_fn = self.file_name("snc")
    result = single_snp(
        test_snps=snc[:, :10],
        pheno=loaded_pheno,
        G0=snc,
        mixing=0,
        covar=loaded_covar,
        output_file_name=out_fn,
    )
    self.compare_files(result, "snc")
def run_fastlmmc(dataset, output_dir, pheno_index, covFile=None, species='mouse', maxthreads=1, featsel=False, exclude=False, condition=None):
    """Run a leave-one-chromosome-out ``single_snp`` scan and write the results.

    For every chromosome ``1 .. species_chroms[species]`` the SNPs of
    ``<dataset>.FILTERED`` on that chromosome are tested, with the SNPs of
    ``<dataset>.FULL`` on all other chromosomes passed as ``K0``. The
    per-chromosome results are stacked and written as a tab-separated file
    ``<output_dir>/<phenotype_name>.gwas`` with the columns
    SNP, CHR, BP, P and Beta.

    Parameters
    ----------
    dataset : str
        Common prefix of the ``.FILTERED`` / ``.FULL`` Bed files and of
        ``.pheno.txt``.
    output_dir : str
        Directory the ``.gwas`` file is written to.
    pheno_index : int
        Passed to ``loadOnePhen`` as ``i_pheno``.
    covFile : optional
        Passed to ``single_snp`` as ``covar`` when truthy.
    species : str
        Key into ``species_chroms``.
    maxthreads, featsel, exclude, condition
        Accepted for interface compatibility; not used by this function.

    Raises
    ------
    ValueError
        From ``pandas.concat`` if there is no chromosome to process.
    """
    # Imported locally so the block does not depend on a module-level alias.
    import pandas as pd

    # temporary kludge because -excludeByPosition option is slow
    # (at least for v2.05 and v2.06)
    bfile = dataset
    filtered_snp_reader = Bed('%s.FILTERED' % bfile)
    full_snp_reader = Bed('%s.FULL' % bfile)
    pheno_file = loadOnePhen('%s.pheno.txt' % dataset, i_pheno=pheno_index)
    phenotype_name = pheno_file['header'][0]

    # ``covar`` is only passed on when a covariate file was supplied.
    extra_args = {'covar': covFile} if covFile else {}

    # loop through chromosomes and run
    frames = []
    for chrom in range(1, species_chroms[species] + 1):
        # separate by chromosome for LOOCV
        test_snps = filtered_snp_reader[:, filtered_snp_reader.pos[:, 0] == chrom]
        matrix_snps = full_snp_reader[:, full_snp_reader.pos[:, 0] != chrom]

        df = single_snp(test_snps=test_snps, pheno=pheno_file,
                        K0=matrix_snps, **extra_args)

        # format outputs
        out_df = df.loc[:, ['SNP', 'Chr', 'ChrPos', 'PValue', 'SnpWeight']]
        out_df.columns = ['SNP', 'CHR', 'BP', 'P', 'Beta']
        frames.append(out_df)

    # One concat instead of repeated DataFrame.append, which pandas 2.0
    # removed; row indexes are kept exactly as append kept them.
    final = pd.concat(frames)

    # output to csv; the path is built from the two values it needs rather
    # than by copying every local into the module globals.
    final.to_csv('%s/%s.gwas' % (output_dir, phenotype_name), sep='\t', index=False)