def test_c_reader_npz(self):
    distreader = DistNpz(self.currentFolder + "/../examples/toydata10.dist.npz")
    distdata = distreader.read(order='F', force_python_only=False)
    snp_c = distdata.val
    self.assertEqual(np.float64, snp_c.dtype)
    self.assertTrue(np.allclose(self.dist_values[:, :10], snp_c, rtol=1e-05, atol=1e-05))

    distreader1 = DistNpz(self.currentFolder + "/../examples/toydata10.dist.npz")
    distreader2 = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")[:, :10]
    self.assertTrue(np.allclose(distreader1.read().val, distreader2.read().val, rtol=1e-05, atol=1e-05))

    distdata.val[1, 2] = np.NaN  # Inject a missing value to test writing and reading missing values
    output = "tempdir/distreader/toydata10.dist.npz"
    create_directory_if_necessary(output)
    DistNpz.write(output, distdata)
    snpdata2 = DistNpz(output).read()
    np.testing.assert_array_almost_equal(distdata.val, snpdata2.val, decimal=10)

    snpdata3 = distdata[:, 0:0].read()  # create distdata with no sids
    output = "tempdir/distreader/toydata0.dist.npz"
    DistNpz.write(output, snpdata3)
    snpdata4 = DistNpz(output).read()
    assert snpdata3 == snpdata4
def test_str(self):
    logging.info("TestLmmTrain test_str")

    G0_train = self.pythonpath + "/tests/datasets/synth/all"
    covariate_train = None
    pheno_train = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"

    fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train, y=pheno_train, count_A1=False)
    filename = self.tempout_dir + "/model_str.flm.p"
    pstutil.create_directory_if_necessary(filename)
    joblib.dump(fastlmm1, filename)
    fastlmm2 = joblib.load(filename)

    # predict on same
    G0_test = G0_train
    covariate_test = covariate_train

    predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test, count_A1=False)

    output_file = self.file_name("str")
    Dat.write(output_file, predicted_pheno)

    #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]
    #pylab.plot(pheno_actual, predicted_pheno.val,".")
    #pylab.show()

    self.compare_files(predicted_pheno, "str")
def test_twoK(self):
    logging.info("TestLmmTrain test_twoK")

    train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    G0_train = self.snpreader_whole[train_idx, :]
    covariate_train = self.covariate_whole[train_idx, :]
    pheno_train = self.pheno_whole[train_idx, :]

    fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, K1_train=G0_train, X=covariate_train, y=pheno_train)
    filename = self.tempout_dir + "/model_one.flm.p"
    pstutil.create_directory_if_necessary(filename)
    joblib.dump(fastlmm1, filename)
    fastlmm2 = joblib.load(filename)

    # predict on test set
    G0_test = self.snpreader_whole[test_idx, :]
    covariate_test = self.covariate_whole[test_idx, :]

    predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G0_test, X=covariate_test, count_A1=False)

    output_file = self.file_name("one")
    Dat.write(output_file, predicted_pheno)

    pheno_actual = self.pheno_whole[test_idx, :].read().val[:, 0]
    #pylab.plot(pheno_actual, predicted_pheno.val,".")
    #pylab.show()

    self.compare_files(predicted_pheno, "one")
def test_one(self):
    logging.info("test_one")
    import filecmp

    azure_path = "/chrom1/azurecopypy/test/one/little.txt"
    local_path0 = self._temp_dir() + "/little0.txt"
    pstutil.create_directory_if_necessary(local_path0)
    shutil.copy(os.path.realpath(__file__), local_path0)
    local_path = self._temp_dir() + "/little.txt"

    if self.container.file_exists(azure_path):
        self.container.remove(azure_path)
    assert not self.container.file_exists(azure_path)

    self.container.upload(local_path0, azure_path)
    assert self.container.file_exists(azure_path)

    self.container.download(azure_path, local_path, as_needed=True)
    self.container.download(azure_path, local_path, as_needed=True)  # Manually testing: see that it doesn't download again
    assert self.container.getmdate(azure_path) == datetime.datetime.utcfromtimestamp(os.path.getmtime(local_path)).replace(tzinfo=pytz.utc)
    assert filecmp.cmp(local_path0, local_path)

    self.container.remove(azure_path)
    self.container.rmtree('/'.join(azure_path.split('/')[:3]))
    os.remove(local_path)
def _simple_open_write(self, simple_file_name, size=0, updater=None):
    import pysnptools.util as pstutil

    logging.info("open_write('{0}',size={1})".format(simple_file_name, size))

    # Register the file name in the directory
    file_name = self.directory + "/" + simple_file_name
    if os.path.exists(file_name):
        # This is OK only if the existing path is a directory containing no files (and thus can safely be removed)
        assert not os.path.isfile(file_name), "Can't open a file for write if it already exists."
        assert not self._at_least_one(self.walk(simple_file_name)), "Can't open a file for write if a directory with files already has the same name ({0},{1})".format(self, simple_file_name)
        shutil.rmtree(file_name)
    else:
        pstutil.create_directory_if_necessary(file_name, isfile=True)

    yield file_name

    logging.info("close('{0}')".format(simple_file_name))
    assert os.path.exists(file_name), "File doesn't exist in LocalCache. File is '{0}'".format(file_name)
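# A minimal sketch (not part of the original file) of how a generator like
# _simple_open_write above is typically consumed: wrapped as a context manager so the
# code before `yield` runs on entry and the code after `yield` runs on exit.
# The contextlib wrapping and the name _open_write_demo are assumptions for
# illustration; the real FileCache class may wrap its generator differently.
import contextlib
import os

@contextlib.contextmanager
def _open_write_demo(path):
    # "enter": make sure the parent directory exists
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    yield path  # the caller writes to `path` inside the `with` block
    # "exit": verify the caller really created the file
    assert os.path.exists(path), "File was not written"

# with _open_write_demo("tempdir/demo.txt") as file_name:
#     with open(file_name, "w") as f:
#         f.write("hello")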
def _run_one_task(original_distributable, taskindex, taskcount, workdirectory):
    '''
    Does a fraction of the work (e.g. 1 of every 1000 work items) and then saves the results to a single file.
    If taskindex == taskcount, does the reduce step instead.
    '''
    if not 0 < taskcount:
        raise Exception("Expect taskcount to be positive")
    if not (0 <= taskindex and taskindex < taskcount + 1):
        raise Exception("Expect taskindex to be between 0 and taskcount (both inclusive)")

    shaped_distributable = _shape_to_desired_workcount(original_distributable, taskcount)

    if shaped_distributable.work_count != taskcount:
        raise Exception("Assert: expect workcount == taskcount")

    pstutil.create_directory_if_necessary(workdirectory, isfile=False, robust=True)

    if taskindex < taskcount:
        doMainWorkForOneIndex(shaped_distributable, taskcount, taskindex, workdirectory)
        return None
    else:
        result_sequence = work_sequence_from_disk(workdirectory, taskcount)
        return shaped_distributable.reduce(result_sequence)
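# Hedged sketch (not from the original source) of how a runner might drive
# _run_one_task, based on its docstring: taskindex values 0..taskcount-1 each do a
# slice of the work and save results under `workdirectory`; a final call with
# taskindex == taskcount performs the reduce step over those saved results.
# The helper name _run_all_tasks_locally is hypothetical.
def _run_all_tasks_locally(distributable, taskcount, workdirectory):
    for taskindex in range(taskcount):  # the "map" tasks
        _run_one_task(distributable, taskindex, taskcount, workdirectory)
    # the "reduce" task, which gathers the per-task files and returns the result
    return _run_one_task(distributable, taskcount, taskcount, workdirectory)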
def _fasttwoK(self, force_low_rank, GB_goal):
    train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    G0_train = self.snpreader_whole[train_idx, :]
    G1_train = SnpData(iid=G0_train.iid, sid=[item + "_1" for item in G0_train.sid], val=G0_train.read().val, pos=G0_train.pos, name="Different SNP names for {0}".format(G0_train))
    covariate_train = self.covariate_whole[train_idx, :]
    pheno_train = self.pheno_whole[train_idx, :]

    logging.info("force_low_rank = {0}".format(force_low_rank))
    fastlmm1 = FastLMM(force_low_rank=force_low_rank, GB_goal=GB_goal).fit(K0_train=G0_train, K1_train=G1_train, X=covariate_train, y=pheno_train, mixing=.1)

    filename = self.tempout_dir + "/model_fasttwoK.flm.p"
    pstutil.create_directory_if_necessary(filename)
    joblib.dump(fastlmm1, filename)
    fastlmm2 = joblib.load(filename)

    # predict on test set
    G0_test = self.snpreader_whole[test_idx, :]
    G1_test = SnpData(iid=G0_test.iid, sid=[item + "_1" for item in G0_test.sid], val=G0_test.read().val, pos=G0_test.pos, name="Different SNP names for {0}".format(G0_test))
    covariate_test = self.covariate_whole[test_idx, :]

    predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G1_test, X=covariate_test, count_A1=False)

    output_file = self.file_name("fasttwoK" + ("_force_low" if force_low_rank else "") + ("GB{0}".format(GB_goal) if GB_goal is not None else ""))
    Dat.write(output_file, predicted_pheno)

    pheno_actual = self.pheno_whole[test_idx, :].read().val[:, 0]
    #pylab.plot(pheno_actual, predicted_pheno.val,".")
    #pylab.show()

    self.compare_files(predicted_pheno, "one")
def test_c_reader_dat(self):
    snpreader = Dat(self.currentFolder + "/examples/toydata.dat")[:, ::100]
    _fortesting_JustCheckExists().input(snpreader)

    snpdata1 = snpreader.read()
    self.assertEqual(np.float64, snpdata1.val.dtype)
    self.assertTrue(np.allclose(self.snps[:, ::100], snpdata1.val, rtol=1e-05, atol=1e-05))

    snpdata1.val[1, 2] = np.NaN  # Inject a missing value to test writing and reading missing values
    output = "tempdir/snpreader/toydata.dat"
    create_directory_if_necessary(output)
    Dat.write(output, snpdata1)
    snpdata2 = Dat(output).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

    snpdata3 = snpdata1[:, 0:0].read()  # create snpdata with no sids
    output = "tempdir/snpreader/toydata3.dat"
    Dat.write(output, snpdata3)
    snpdata4 = Dat(output).read()
    assert snpdata3 == snpdata4
def run(self, distributable):
    if self.temp_dir is not None:
        tempdir = self.temp_dir
    else:
        tempdir = os.path.join(self.run_dir, distributable.tempdirectory)
    tempdir = os.path.realpath(tempdir)

    with patch.dict('os.environ', {'MKL_NUM_THREADS': str(self.mkl_num_threads)} if self.mkl_num_threads is not None else {}) as _:
        if self.taskindex != self.taskcount:
            _JustCheckExists().input(distributable)
            return _run_one_task(distributable, self.taskindex, self.taskcount, tempdir, weights=self.weights, environ=self.environ)
        else:
            result = _run_one_task(distributable, self.taskindex, self.taskcount, tempdir, weights=self.weights, environ=self.environ)
            if self.result_file is not None:
                create_directory_if_necessary(self.result_file)
                with open(self.result_file, mode='wb') as f:
                    pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
            return result
def save_hashdown(self, filename):
    """
    Save a Hashdown object to a json file.

    :param filename: name of file to save to.
    :type filename: string

    >>> from pysnptools.util.filecache import Hashdown
    >>> file_to_hash = {'pysnptools/examples/toydata.5chrom.bed': '766f55aa716bc7bc97cad4de41a50ec3',
    ...                 'pysnptools/examples/toydata.5chrom.bim': '6a07f96e521f9a86df7bfd7814eebcd6',
    ...                 'pysnptools/examples/toydata.5chrom.fam': 'f4eb01f67e0738d4865fad2014af8537'}
    >>> hashdown1 = Hashdown('https://github.com/fastlmm/PySnpTools/raw/cf248cbf762516540470d693532590a77c76fba2',
    ...                      file_to_hash=file_to_hash)
    >>> hashdown1.save_hashdown('tempdir/demo.hashdown.json')
    >>> hashdown2 = Hashdown.load_hashdown('tempdir/demo.hashdown.json')
    >>> hashdown2.file_exists('pysnptools/examples/toydata.5chrom.bed')
    True
    >>> hashdown2.load('pysnptools/examples/toydata.5chrom.fam').split('\\n')[0]
    'per0 per0 0 0 2 0.408848'

    """
    pstutil.create_directory_if_necessary(filename)
    dict0 = dict(self.__dict__)
    del dict0["directory"]
    del dict0["_relative_directory"]
    del dict0["allow_unknown_files"]
    del dict0["trust_local_files"]
    with open(filename, "w") as json_file:
        json.dump(dict0, json_file)
def test1(self):
    logging.info("in TestPstMemMap test1")

    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    filename2 = "tempdir/tiny.pst.memmap"
    pstutil.create_directory_if_necessary(filename2)
    pstreader2 = PstMemMap.empty(row=['a', 'b', 'c'], col=['y', 'z'], filename=filename2, row_property=['A', 'B', 'C'], order="F", dtype=np.float64)
    assert isinstance(pstreader2.val, np.memmap)
    pstreader2.val[:, :] = [[1, 2], [3, 4], [np.nan, 6]]
    assert np.array_equal(pstreader2[[0], [0]].read(view_ok=True).val, np.array([[1.]]))
    pstreader2.flush()
    assert isinstance(pstreader2.val, np.memmap)
    assert np.array_equal(pstreader2[[0], [0]].read(view_ok=True).val, np.array([[1.]]))
    pstreader2.flush()

    pstreader3 = PstMemMap(filename2)
    assert np.array_equal(pstreader3[[0], [0]].read(view_ok=True).val, np.array([[1.]]))
    assert isinstance(pstreader3.val, np.memmap)

    pstreader = PstMemMap('../examples/tiny.pst.memmap')
    assert pstreader.row_count == 3
    assert pstreader.col_count == 2
    assert isinstance(pstreader.val, np.memmap)

    pstdata = pstreader.read(view_ok=True)
    assert isinstance(pstdata.val, np.memmap)

    os.chdir(old_dir)
def test1(self):
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    filename2 = "tempdir/tiny.snp.memmap"
    pstutil.create_directory_if_necessary(filename2)
    snpreader2 = SnpMemMap.empty(iid=[['fam0', 'iid0'], ['fam0', 'iid1']], sid=['snp334', 'snp349', 'snp921'], filename=filename2, order="F", dtype=np.float64)
    assert isinstance(snpreader2.val, np.memmap)
    snpreader2.val[:, :] = [[0., 2., 0.], [0., 1., 2.]]
    assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val, np.array([[1.]]))
    snpreader2.flush()
    assert isinstance(snpreader2.val, np.memmap)
    assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val, np.array([[1.]]))
    snpreader2.flush()

    snpreader3 = SnpMemMap(filename2)
    assert np.array_equal(snpreader3[[1], [1]].read(view_ok=True).val, np.array([[1.]]))
    assert isinstance(snpreader3.val, np.memmap)

    logging.info("in TestSnpMemMap test1")
    snpreader = SnpMemMap('../examples/tiny.snp.memmap')
    assert snpreader.iid_count == 2
    assert snpreader.sid_count == 3
    assert isinstance(snpreader.val, np.memmap)

    snpdata = snpreader.read(view_ok=True)
    assert isinstance(snpdata.val, np.memmap)

    os.chdir(old_dir)
def test_kernel_one(self):
    logging.info("TestLmmTrain test_kernel_one")

    train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    K0_train = SnpKernel(self.snpreader_whole[train_idx, :], standardizer=Unit())
    covariate_train = self.covariate_whole[train_idx, :]
    pheno_train = self.pheno_whole[train_idx, :]
    assert np.array_equal(K0_train.iid, covariate_train.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"
    assert np.array_equal(K0_train.iid, pheno_train.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"

    fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
    filename = self.tempout_dir + "/model_kernel_one.flm.p"
    pstutil.create_directory_if_necessary(filename)
    joblib.dump(fastlmm1, filename)
    fastlmm2 = joblib.load(filename)

    # predict on test set
    G0_test = self.snpreader_whole[test_idx, :]
    covariate_test = self.covariate_whole[test_idx, :]

    predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test, count_A1=False)

    output_file = self.file_name("kernel_one")
    Dat.write(output_file, predicted_pheno)

    pheno_actual = self.pheno_whole[test_idx, :].read().val[:, 0]
    #pylab.plot(pheno_actual, predicted_pheno.val,".")
    #pylab.show()

    self.compare_files(predicted_pheno, "one")  # Expect same results as SNPs "one"
def setUpClass(self):
    from pysnptools.util import create_directory_if_necessary
    create_directory_if_necessary(self.tempout_dir, isfile=False)
    self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", ".."))
    self.bedbase = os.path.join(self.pythonpath, 'tests/datasets/all_chr.maf0.001.N300')
    self.phen_fn = os.path.join(self.pythonpath, 'tests/datasets/phenSynthFrom22.23.N300.randcidorder.txt')
    self.cov_fn = os.path.join(self.pythonpath, 'tests/datasets/all_chr.maf0.001.covariates.N300.txt')
def test_c_reader_pheno(self):
    snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()
    self.assertEqual(np.float64, snpdata1.val.dtype)

    snpdata1.val[1, 0] = np.NaN  # Inject a missing value to test writing and reading missing values
    output = "tempdir/snpreader/toydata.phe"
    create_directory_if_necessary(output)
    Pheno.write(output, snpdata1)
    snpreader = Pheno(output)
    _fortesting_JustCheckExists().input(snpreader)
    s = str(snpreader)
    snpdata2 = snpreader.read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

    snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()
    import pysnptools.util.pheno as pstpheno
    dict = pstpheno.loadOnePhen(self.currentFolder + "/examples/toydata.phe", missing="")
    snpdata3 = Pheno(dict).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

    dict = pstpheno.loadOnePhen(self.currentFolder + "/examples/toydata.phe", missing="", vectorize=True)
    assert len(dict['vals'].shape) == 1, "test 1-d array of values"
    snpdata3 = Pheno(dict).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

    snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
    assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

    snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)

    snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
def run(self, distributable):
    # Check that the local machine has the python path set
    localpythonpath = os.environ.get("PYTHONPATH")  #!!should it be able to work without pythonpath being set (e.g. if there was just one file)? Also, is None really the return, or is it an exception?
    if localpythonpath is None:
        raise Exception("Expect local machine to have 'pythonpath' set")

    remotepythoninstall = self.check_remote_pythoninstall()

    remotewd, run_dir_abs, run_dir_rel, nodelocalwd = self.create_run_dir()
    pstutil.create_directory_if_necessary(os.path.join(remotewd, distributable.tempdirectory), isfile=False)  # create temp directory now so that cluster tasks won't try to create it many times at once
    result_remote = os.path.join(run_dir_abs, "result.p")

    self.copy_python_settings(run_dir_abs)

    inputOutputCopier = HPCCopier(remotewd, skipinput=self.skipinputcopy)  # Create the object that copies input and output files to where they are needed
    inputOutputCopier.input(distributable)  # copy the input files to where they are needed (i.e. the cluster)

    remotepythonpath = self.FindOrCreateRemotePythonPath(localpythonpath, run_dir_abs)

    batfilename_rel = self.create_bat_file(distributable, remotepythoninstall, remotepythonpath, remotewd, run_dir_abs, run_dir_rel, result_remote, nodelocalwd, distributable)
    self.submit_to_cluster(batfilename_rel, distributable, remotewd, run_dir_abs, run_dir_rel, nodelocalwd)

    inputOutputCopier.output(distributable)  # copy the output files from where they were created (i.e. the cluster) to the local computer

    assert os.path.exists(result_remote), "The HPC job produced no result (and, thus, likely failed)"
    with open(result_remote, mode='rb') as f:
        result = pickle.load(f)

    #logging.info('Done: HPC runner is running a distributable. Returns {0}'.format(result))
    return result
def test_writes(self):
    #===================================
    # Defining sub functions
    #===================================
    def _oned_int(c):
        return list(range(c))

    def _oned_str(c):
        return [str(i) for i in range(c)]

    def _twooned_int(c):
        return [[i] for i in range(c)]

    def _twooned_str(c):
        return [[str(i)] for i in range(c)]

    def _twotwod_int(c):
        return [[i, i] for i in range(c)]

    def _twotwod_str(c):
        return [[str(i), "hello"] for i in range(c)]

    def _none(c):
        return None

    def _zero(c):
        return np.empty([c, 0])

    #===================================
    # Starting main function
    #===================================
    logging.info("starting 'test_writes'")
    np.random.seed(0)
    output_template = "tempdir/pstreader/writes.{0}.{1}"
    create_directory_if_necessary(output_template.format(0, "npz"))

    i = 0
    for row_count in [5, 2, 1, 0]:
        for col_count in [4, 2, 1, 0]:
            val = np.random.normal(.5, 2, size=(row_count, col_count))
            for row_or_col_gen in [_oned_int, _oned_str, _twooned_int, _twooned_str, _twotwod_int, _twotwod_str]:
                row = row_or_col_gen(row_count)
                col = row_or_col_gen(col_count)
                for prop_gen in [_oned_int, _oned_str, _twooned_int, _twooned_str, _twotwod_int, _twotwod_str, _none, _zero]:
                    row_prop = prop_gen(row_count)
                    col_prop = prop_gen(col_count)
                    pstdata = PstData(row, col, val, row_prop, col_prop, str(i))
                    for the_class, suffix in [(PstNpz, "npz"), (PstHdf5, "hdf5")]:
                        filename = output_template.format(i, suffix)
                        logging.info(filename)
                        i += 1
                        the_class.write(filename, pstdata)
                        for subsetter in [None, sp.s_[::2, ::3]]:
                            reader = the_class(filename)
                            _fortesting_JustCheckExists().input(reader)
                            subreader = reader if subsetter is None else reader[subsetter[0], subsetter[1]]
                            readdata = subreader.read(order='C')
                            expected = pstdata if subsetter is None else pstdata[subsetter[0], subsetter[1]].read()
                            assert np.array_equal(readdata.val, expected.val)
                            assert np.array_equal(readdata.row, expected.row)
                            assert np.array_equal(readdata.col, expected.col)
                            assert np.array_equal(readdata.row_property, expected.row_property)
                            assert np.array_equal(readdata.col_property, expected.col_property)
                        try:
                            os.remove(filename)
                        except:
                            pass
    logging.info("done with 'test_writes'")
def reduce(self, result_sequence):
    '''
    Save each task's result (the pcs) to its own output file.
    '''
    for i, pcs in result_sequence:
        out_fn = self.create_out_fn(self.cache_prefix, i)
        pstutil.create_directory_if_necessary(out_fn)
        save(out_fn, pcs)
    return None
def setUpClass(self):
    from pysnptools.util import create_directory_if_necessary
    create_directory_if_necessary(self.tempout_dir, isfile=False)
    self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", ".."))
    self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all", count_A1=False)
    self.covariate_whole = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
    self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")
def test3(self):
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    filename = "tempdir/x.pst.memmap"
    pstutil.create_directory_if_necessary(filename)
    a = PstMemMap.empty(row=['a', 'b', 'c'], col=['y', 'z'], filename=filename, row_property=['A', 'B', 'C'], order="F", dtype=np.float64)
    pstdata = a.read(order='C', view_ok=True)

    os.chdir(old_dir)
def test_read1(self):
    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    file_from = "../examples/example.bgen"
    file_to = "temp/example.bgen"
    pstutil.create_directory_if_necessary(file_to)
    if os.path.exists(file_to + ".metadata"):
        os.remove(file_to + ".metadata")
    meta = open_bgen._metadata_path_from_filename(file_to, samples_filepath=None)
    if os.path.exists(meta):
        os.remove(meta)
    shutil.copy(file_from, file_to)

    for loop_index in range(2):
        bgen = Bgen(file_to)
        assert np.array_equal(bgen.iid[0], ["0", "sample_001"])
        assert bgen.sid[0] == "SNPID_2,RSID_2"

    # Use the bgen_sample_id for both parts of iid
    def iid_dup(bgen_sample_id):
        return (bgen_sample_id, bgen_sample_id)

    iid_function = iid_dup
    bgen = Bgen(file_to, iid_function=iid_function, sid_function="id")
    assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
    assert bgen.sid[0] == "SNPID_2"

    bgen = Bgen(file_to, iid_function=iid_function, sid_function="rsid")
    assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
    assert bgen.sid[0] == "RSID_2"

    sid_function = lambda id, rsid: "{0},{1}".format(id, rsid)
    bgen = Bgen(file_to, iid_function, sid_function=sid_function)
    assert bgen.sid[0] == "SNPID_2,RSID_2"
    metafile = bgen._open_bgen._metadata_path_from_filename(file_to, samples_filepath=None)
    del bgen
    os.remove(metafile)

    sid_function = lambda id, rsid: "{0},{1}".format(id, rsid)
    bgen = Bgen(file_to, iid_function, sid_function=sid_function)
    assert bgen.sid[0] == "SNPID_2,RSID_2"
    metafile = bgen._open_bgen._metadata_path_from_filename(file_to, samples_filepath=None)
    del bgen
    os.remove(metafile)

    bgen = Bgen(file_to, iid_function, sid_function="rsid")
    assert np.array_equal(bgen.iid[0], ["sample_001", "sample_001"])
    assert bgen.sid[0] == "RSID_2"

    os.chdir(old_dir)
def test_c_reader_dense(self):
    snpdata1 = self.snpdata[:, ::100].read()
    snpdata1.val[1, 2] = np.NaN  # Inject a missing value to test writing and reading missing values
    output = "tempdir/snpreader/toydata.dense.txt"
    create_directory_if_necessary(output)
    Dense.write(output, snpdata1)
    snpreader = Dense(output)
    _fortesting_JustCheckExists().input(snpreader)
    snpdata2 = snpreader.read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)
def create_distributablep(self, distributable, run_dir_abs, run_dir_rel):
    logging.info('Hadoop runner is pickling distributable')
    distributablep_filename_rel = os.path.join(run_dir_rel, "distributable.p")
    #distributablep_filename_abs = os.path.join(run_dir_abs, "distributable.p")
    pstutil.create_directory_if_necessary(distributablep_filename_rel)
    with open(distributablep_filename_rel, mode='wb') as f:
        pickle.dump(distributable, f, pickle.HIGHEST_PROTOCOL)
    logging.info('Done: Hadoop runner is pickling distributable')
    return distributablep_filename_rel
def fill_in_cache_file(self):
    self._run_once()

    logging.info("filling in the cache_file and log_delta, as needed")

    if self.G1_or_none is None:
        self.G1val_or_none = None
    else:
        self.G1val_or_none = self.G1_or_none.read().standardize().val

    # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
    if self.cache_file is None:
        self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
        if os.path.exists(self.cache_file):  # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
            os.remove(self.cache_file)

    lmm = None
    if not os.path.exists(self.cache_file):
        logging.info("Precomputing eigen")
        lmm = LMM()
        G0_standardized = self.G0.read().standardize()
        lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
        logging.info("Saving precomputation to {0}".format(self.cache_file))
        pstutil.create_directory_if_necessary(self.cache_file)
        np.savez(self.cache_file, lmm.U, lmm.S)  # using np.savez instead of pickle because it seems to be faster to read and write

    if self.external_log_delta is None:
        if lmm is None:
            lmm = self.lmm_from_cache_file()

        logging.info("searching for internal delta")
        lmm.setX(self.covar)
        lmm.sety(self.pheno['vals'])
        # Log delta is used here. It might be better to use findH2, but if so we would need to normalize G so that its K's diagonal sums to iid_count.
        # As per the paper, we optimize delta with REML=True, but
        # we will later optimize beta and find the log likelihood with ML (REML=False).
        result = lmm.find_log_delta(REML=True, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta)  #!!what about findA2H2? minH2=0.00001
        self.external_log_delta = result['log_delta']

    self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
    logging.info("internal_delta={0}".format(self.internal_delta))
    logging.info("external_log_delta={0}".format(self.external_log_delta))
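# A small self-contained illustration (an assumption for demonstration, not original
# code) of the np.savez caching pattern used above: arrays passed positionally to
# np.savez are stored under the names 'arr_0', 'arr_1', ..., which is why the cache
# is read back elsewhere in this codebase as data['arr_0'] and data['arr_1'].
import os
import numpy as np

os.makedirs("tempdir", exist_ok=True)
U = np.eye(3)
S = np.ones(3)
np.savez("tempdir/cache_demo.npz", U, S)  # saved as arr_0 and arr_1
with np.load("tempdir/cache_demo.npz") as data:
    assert np.array_equal(data["arr_0"], U)
    assert np.array_equal(data["arr_1"], S)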
def test_write_x_x_cpp(self):
    distreader = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")
    for order in ['C', 'F']:
        for dtype in [np.float32, np.float64]:
            distdata = distreader.read(order=order, dtype=dtype)
            distdata.val[-1, 0] = float("NAN")
            output = "tempdir/toydata.{0}{1}.cpp.dist.npz".format(order, "32" if dtype == np.float32 else "64")
            create_directory_if_necessary(output)
            DistNpz.write(output, distdata)
            snpdata2 = DistNpz(output).read()
            np.testing.assert_array_almost_equal(distdata.val, snpdata2.val, decimal=10)
def test_write_x_x_cpp(self):
    snpreader = Bed(self.currentFolder + "/examples/toydata")
    for order in ['C', 'F']:
        for dtype in [np.float32, np.float64]:
            snpdata = snpreader.read(order=order, dtype=dtype)
            snpdata.val[-1, 0] = float("NAN")
            output = "tempdir/toydata.{0}{1}.cpp".format(order, "32" if dtype == np.float32 else "64")
            create_directory_if_necessary(output)
            Bed.write(output, snpdata)
            snpdata2 = Bed(output).read()
            np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
def _create_directory(local):
    import pysnptools.util as pstutil  # put here to avoid recursive nesting

    if os.path.exists(local):
        if os.path.isfile(local):
            os.remove(local)
        else:
            shutil.rmtree(local)
    directory_name = os.path.dirname(local)
    if os.path.exists(directory_name) and os.path.isfile(directory_name):
        os.remove(directory_name)
    pstutil.create_directory_if_necessary(local, isfile=True)
def test_npz(self):
    logging.info("in test_npz")
    snpreader = Bed(self.currentFolder + "/../examples/toydata", count_A1=False)
    kerneldata1 = snpreader.read_kernel(standardizer=stdizer.Unit())
    s = str(kerneldata1)
    output = "tempdir/kernelreader/toydata.kernel.npz"
    create_directory_if_necessary(output)
    KernelNpz.write(output, kerneldata1)
    kernelreader2 = KernelNpz(output)
    kerneldata2 = kernelreader2.read()
    np.testing.assert_array_almost_equal(kerneldata1.val, kerneldata2.val, decimal=10)
    logging.info("done with test")
def reducer_closure(result_sequence):
    if output_file_name is not None:
        create_directory_if_necessary(output_file_name)

    frame = pd.concat(result_sequence)
    frame.sort_values(by="PValue", inplace=True)
    frame.index = np.arange(len(frame))

    if output_file_name is not None:
        frame.to_csv(output_file_name, sep="\t", index=False)

    return frame
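# Tiny self-contained illustration (not original code; the frames here are made up)
# of the reduce pattern above: concatenate the per-task result frames, sort by
# p-value, and renumber the index so it runs 0..n-1 again.
import numpy as np
import pandas as pd

part1 = pd.DataFrame({"SNP": ["snp1", "snp2"], "PValue": [0.04, 0.001]})
part2 = pd.DataFrame({"SNP": ["snp3"], "PValue": [0.02]})
frame = pd.concat([part1, part2])
frame.sort_values(by="PValue", inplace=True)
frame.index = np.arange(len(frame))
# frame["SNP"] is now ordered ['snp2', 'snp3', 'snp1']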
def test_npz(self):
    logging.info("in test_npz")
    distreader = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")
    snpdata1 = distreader.as_snp(max_weight=1.0).read()
    s = str(snpdata1)
    output = "tempdir/distreader/toydata.snp.npz"
    create_directory_if_necessary(output)
    SnpNpz.write(output, snpdata1)
    snpreader2 = SnpNpz(output)
    snpdata2 = snpreader2.read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)
    logging.info("done with test")
def FindOrCreateRemotePythonPath(self, localpythonpath, run_dir_abs):
    if self.remote_python_parent is None:
        remotepythonpath = self.CopySource(localpythonpath, run_dir_abs)
    else:
        pstutil.create_directory_if_necessary(self.remote_python_parent, isfile=False)
        path_list = []
        for rel in os.listdir(self.remote_python_parent):
            path_list.append(os.path.join(self.remote_python_parent, rel))
        remotepythonpath = ";".join(path_list)
        if self.update_remote_python_parent:
            remotepythonpath = self.CopySource(localpythonpath, run_dir_abs)

    return remotepythonpath
def setUpClass(self):
    from pysnptools.util import create_directory_if_necessary
    create_directory_if_necessary(self.tempout_dir, isfile=False)
    self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", ".."))
    self.bedbase = os.path.join(self.pythonpath, 'fastlmm/feature_selection/examples/toydata.5chrom')
    self.phen_fn = os.path.join(self.pythonpath, 'fastlmm/feature_selection/examples/toydata.phe')
    self.cov_fn = os.path.join(self.pythonpath, 'fastlmm/feature_selection/examples/toydata.cov')
def test1(self):
    from pysnptools.snpreader import Bed, SnpMemMap
    from pysnptools.util import example_file  # Download and return local file name

    old_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    filename2 = "tempdir/tiny.snp.memmap"
    pstutil.create_directory_if_necessary(filename2)
    snpreader2 = SnpMemMap.empty(iid=[['fam0', 'iid0'], ['fam0', 'iid1']], sid=['snp334', 'snp349', 'snp921'], filename=filename2, order="F", dtype=np.float64)
    assert isinstance(snpreader2.val, np.memmap)
    snpreader2.val[:, :] = [[0., 2., 0.], [0., 1., 2.]]
    assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val, np.array([[1.]]))
    snpreader2.flush()
    assert isinstance(snpreader2.val, np.memmap)
    assert np.array_equal(snpreader2[[1], [1]].read(view_ok=True).val, np.array([[1.]]))
    snpreader2.flush()

    snpreader3 = SnpMemMap(filename2)
    assert np.array_equal(snpreader3[[1], [1]].read(view_ok=True).val, np.array([[1.]]))
    assert isinstance(snpreader3.val, np.memmap)

    logging.info("in TestSnpMemMap test1")
    snpreader = SnpMemMap('tempdir/tiny.snp.memmap')
    assert snpreader.iid_count == 2
    assert snpreader.sid_count == 3
    assert isinstance(snpreader.val, np.memmap)

    snpdata = snpreader.read(view_ok=True)
    assert isinstance(snpdata.val, np.memmap)

    bed_file = example_file("pysnptools/examples/toydata.5chrom.*", "*.bed")
    bed = Bed(bed_file)
    pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.snp.memmap")  #LATER should we just promise to create directories?
    SnpMemMap.write("tempdir/toydata.5chrom.snp.memmap", bed)  # Write bed in SnpMemMap format
    SnpMemMap.write("tempdir/toydata.5chromsnpdata.snp.memmap", bed[:, ::2].read())  # Write snpdata in SnpMemMap format

    os.chdir(old_dir)
def setUpClass(self):
    from pysnptools.util import create_directory_if_necessary
    import fastlmm

    create_directory_if_necessary(self.tempout_dir, isfile=False)
    self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(fastlmm.__file__)), ".."))
    self.bed = Bed(os.path.join(self.pythonpath, 'tests/datasets/synth/all.bed'), count_A1=True)[:, ::10]
    self.phen_fn = os.path.join(self.pythonpath, 'tests/datasets/synth/pheno_10_causals.txt')
    self.cov_fn = os.path.join(self.pythonpath, 'tests/datasets/synth/cov.txt')
def output(self, item):
    if isinstance(item, str):
        itemnorm = os.path.normpath(item)
        pstutil.create_directory_if_necessary(itemnorm)
        remote_file_name = os.path.join(self.remotewd, itemnorm)
        local_dir_name, ignore = os.path.split(itemnorm)
        assert os.path.exists(remote_file_name), "Don't see expected file '{0}'. Did the HPC job fail?".format(remote_file_name)
        #xcopycommand = "xcopy /d /e /s /c /h /y {0} {1}".format(remote_file_name, local_dir_name)
        xcopycommand = "xcopy /d /c /y {0} {1}".format(remote_file_name, local_dir_name)  # we copy to the local dir instead of the local file so that xcopy won't ask 'file or dir?'
        logging.info(xcopycommand)
        rc = os.system(xcopycommand)
        if rc != 0:
            logging.info("xcopy cmd failed with return value={0}, from cmd {1}".format(rc, xcopycommand))
    elif hasattr(item, "copyoutputs"):
        item.copyoutputs(self)
def too_slow_test_write_bedbig(self):
    iid_count = 100000
    sid_count = 50000
    from pysnptools.snpreader import SnpData

    iid = np.array([[str(i), str(i)] for i in range(iid_count)])
    sid = np.array(["sid_{0}".format(i) for i in range(sid_count)])
    pos = np.array([[i, i, i] for i in range(sid_count)])
    np.random.seed(0)
    snpdata = SnpData(iid, sid, np.zeros((iid_count, sid_count)), pos=pos)  #random.choice((0.0,1.0,2.0,float("nan")),size=(iid_count,sid_count)))

    output = "tempdir/bedbig.{0}.{1}".format(iid_count, sid_count)
    create_directory_if_necessary(output)
    Bed.write(output, snpdata, count_A1=False)
    snpdata2 = Bed(output, count_A1=False).read()
    np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
def test_write_bed_f64cpp_5_python(self):
    snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
    iid_index = 5
    logging.info("iid={0}".format(iid_index))
    #if snpreader.iid_count % 4 == 0:  # divisible by 4 isn't a good test
    #    snpreader = snpreader[0:-1,:]
    #assert snpreader.iid_count % 4 != 0
    snpdata = snpreader[0:iid_index, :].read(order='F', dtype=np.float64)
    if snpdata.iid_count > 0:
        snpdata.val[-1, 0] = float("NAN")
    output = "tempdir/toydata.F64python.{0}".format(iid_index)
    create_directory_if_necessary(output)
    Bed.write(output, snpdata, force_python_only=True)
    snpdata2 = Bed(output, count_A1=False).read()
    np.testing.assert_array_almost_equal(snpdata.val, snpdata2.val, decimal=10)
def test_npz(self):
    logging.info("in test_npz")
    snpreader = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed", count_A1=False)
    kerneldata1 = snpreader.read_kernel(standardizer=stdizer.Unit())
    s = str(kerneldata1)
    output = "tempdir/kernelreader/toydata.kernel.npz"
    create_directory_if_necessary(output)
    KernelNpz.write(output, kerneldata1)
    kernelreader2 = KernelNpz(output)
    kerneldata2 = kernelreader2.read()
    np.testing.assert_array_almost_equal(kerneldata1.val, kerneldata2.val, decimal=10)
    logging.info("done with test")
def test_write_distnpz_f64cpp_0(self):
    distreader = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")
    iid_index = 0
    logging.info("iid={0}".format(iid_index))
    #if distreader.iid_count % 4 == 0:  # divisible by 4 isn't a good test
    #    distreader = distreader[0:-1,:]
    #assert distreader.iid_count % 4 != 0
    distdata = distreader[0:iid_index, :].read(order='F', dtype=np.float64)
    if distdata.iid_count > 0:
        distdata.val[-1, 0] = float("NAN")
    output = "tempdir/toydata.F64cpp.{0}.dist.npz".format(iid_index)
    create_directory_if_necessary(output)
    DistNpz.write(output, distdata)
    snpdata2 = DistNpz(output).read()
    np.testing.assert_array_almost_equal(distdata.val, snpdata2.val, decimal=10)
def test_lr(self):
    import matplotlib.pyplot as plt
    import pylab

    logging.info("TestLmmTrain test_lr")

    train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    G0_train = self.snpreader_whole[train_idx, :]
    covariate_train3 = self.covariate_whole[train_idx, :].read()
    covariate_train3.val = np.array([[float(num)] for num in range(covariate_train3.iid_count)])
    pheno_train3 = self.pheno_whole[train_idx, :].read()
    np.random.seed(0)
    pheno_train3.val = covariate_train3.val * 2.0 + 100 + np.random.normal(size=covariate_train3.val.shape)  # y = 2*x+100+normal(0,1)

    ##Plot training x and y
    #pylab.plot(covariate_train3.val, pheno_train3.val,".")
    #pylab.show()

    for force_full_rank, force_low_rank in [(True, False), (False, True)]:
        # Learn model, save, load
        fastlmm3x = FastLMM(force_full_rank=force_full_rank, force_low_rank=force_low_rank, GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3)
        filename = self.tempout_dir + "/model_lr.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm3x, filename)
        fastlmm3 = joblib.load(filename)

        # Predict with model (test on train)
        predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3, count_A1=False)  # test on train
        output_file = self.file_name("lr")
        Dat.write(output_file, predicted_pheno)

        ## Plot training x and y, and training x with predicted y
        #do_plot = True
        #if do_plot:
        #    pylab.plot(covariate_train3.val, pheno_train3.val,covariate_train3.val,predicted_pheno.val,".")
        #    pylab.show()
        #    # Plot y and predicted y (test on train)
        #    pheno_actual = pheno_train3.val[:,0]
        #    pylab.plot(pheno_actual,predicted_pheno.val,".")
        #    pylab.show()

        self.compare_files(predicted_pheno, "lr")
def test_c_reader_ped(self):
    if False:  # Too slow for routine testing
        snpdata1 = Ped(self.currentFolder + "/examples/toydata.ped")[::25, ::1000].read()
        self.assertEqual(np.float64, snpdata1.val.dtype)
        TestPySnpTools.assert_match_012_210(self.snpdata[::25, ::1000].read(), snpdata1)
    else:
        snpdata1 = self.snpdata[::25, ::1000].read()

    output = "tempdir/snpreader/toydata.ped"
    create_directory_if_necessary(output)
    snpdata1.val[1, 2] = np.NaN  # Inject a missing value to test writing and reading missing values
    Ped.write(output, snpdata1)
    snpreader = Ped(output)
    _fortesting_JustCheckExists().input(snpreader)
    s = str(snpreader)
    snpdata2 = snpreader.read()
    TestPySnpTools.assert_match_012_210(snpdata1, snpdata2)
def cmktest_big_npz(self):
    logging.info("in test_big_npz")
    n = 1000
    pstdata = PstData(row=range(n - 1), col=range(n + 1), val=np.zeros([n - 1, n + 1]))
    output = "tempdir/pstreader/big.npz"
    create_directory_if_necessary(output)
    PstNpz.write(output, pstdata)
    pstnpz = PstNpz(output)
    pstdata1 = pstnpz[::2, ::4].read()
    pstdata2 = pstnpz.read(order='A')
    assert pstdata2.val.flags['C_CONTIGUOUS']

    pstdata = PstData(row=range(n - 1), col=range(n + 1), val=np.zeros([n - 1, n + 1], order='F'))
    PstNpz.write(output, pstdata)
    pstnpz = PstNpz(output)
    pstdata2 = pstnpz.read(order='A')
    assert pstdata2.val.flags['F_CONTIGUOUS']
    print("done")
def test_kernel(self):
    logging.info("TestLmmTrain test_kernel")

    train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    # Show it using the snps
    K0_train = self.snpreader_whole[train_idx, :].read_kernel(Unit())
    covariate_train3 = self.covariate_whole[train_idx, :].read()
    pheno_train3 = self.pheno_whole[train_idx, :].read()
    pheno_train3.val = self.snpreader_whole[train_idx, 0:1].read().val * 2
    assert np.array_equal(K0_train.iid, covariate_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"
    assert np.array_equal(K0_train.iid, pheno_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"

    #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val[:,0],".")
    #pylab.show()

    # Learn model, save, load
    fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train3, y=pheno_train3)
    filename = self.tempout_dir + "/model_snps.flm.p"
    pstutil.create_directory_if_necessary(filename)
    joblib.dump(fastlmm3x, filename)
    fastlmm3 = joblib.load(filename)

    # Predict with model (test on train)
    predicted_pheno, covar = fastlmm3.predict(K0_whole_test=K0_train, X=covariate_train3, count_A1=False)  # test on train
    output_file = self.file_name("kernel")
    Dat.write(output_file, predicted_pheno)

    #### Plot training x and y, and training x with predicted y
    #pylab.plot(self.snpreader_whole[train_idx,0:1].read().val[:,0], pheno_train3.val,".",self.snpreader_whole[train_idx,0:1].read().val[:,0],predicted_pheno.val,".")
    #pylab.show()

    #### Plot y and predicted y (test on train)
    #pheno_actual = pheno_train3.val[:,0]
    #pylab.plot(pheno_actual,predicted_pheno.val,".")
    #pylab.show()

    self.compare_files(predicted_pheno, "snps")  # The "kernel" and "snps" test cases should give the same results
def test_snps(self):
    logging.info("TestLmmTrain test_snps")

    train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    # Show it using the snps
    G0_train = self.snpreader_whole[train_idx, :]
    covariate_train3 = self.covariate_whole[train_idx, :].read()
    pheno_train3 = self.pheno_whole[train_idx, :].read()
    pheno_train3.val = G0_train[:, 0:1].read().val * 2

    #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val[:,0],".")
    #pylab.show()

    # Learn model, save, load
    fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3)
    filename = self.tempout_dir + "/model_snps.flm.p"
    pstutil.create_directory_if_necessary(filename)
    joblib.dump(fastlmm3x, filename)
    fastlmm3 = joblib.load(filename)

    # Predict with model (test on train)
    predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3, count_A1=False)  # test on train
    output_file = self.file_name("snps")
    Dat.write(output_file, predicted_pheno)

    ### Plot training x and y, and training x with predicted y
    #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val,".",G0_train[:,0:1].read().val[:,0],predicted_pheno.val,".")
    #pylab.show()

    ### Plot y and predicted y (test on train)
    #pheno_actual = pheno_train3.val[:,0]
    #pylab.plot(pheno_actual,predicted_pheno.val,".")
    #pylab.show()

    self.compare_files(predicted_pheno, "snps")
def test_str2(self):
    logging.info("TestLmmTrain test_str2")

    # Standardize train and test together
    whole_kernel = self.snpreader_whole.read_kernel(Unit())

    train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    covariate_train = self.covariate_whole[train_idx, :]
    pheno_train = self.pheno_whole[train_idx, :]

    K0_train_filename = self.tempout_dir + "/model_str2.kernel.npz"
    pstutil.create_directory_if_necessary(K0_train_filename)
    from pysnptools.kernelreader import KernelNpz
    KernelNpz.write(K0_train_filename, whole_kernel[train_idx].read(order='A', view_ok=True))

    fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=K0_train_filename, X=covariate_train, y=pheno_train)
    filename = self.tempout_dir + "/model_str2.flm.p"
    pstutil.create_directory_if_necessary(filename)
    joblib.dump(fastlmm1, filename)
    fastlmm2 = joblib.load(filename)

    # predict on test set
    G0_test = self.snpreader_whole[test_idx, :]
    covariate_test = self.covariate_whole[test_idx, :]

    predicted_pheno, covar = fastlmm2.predict(K0_whole_test=whole_kernel[:, test_idx].read(order='A', view_ok=True), X=covariate_test, count_A1=False)

    output_file = self.file_name("str2")
    Dat.write(output_file, predicted_pheno)

    #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]
    #pylab.plot(pheno_actual, predicted_pheno.val,".")
    #pylab.show()

    self.compare_files(predicted_pheno, "str2")
def test_lmm(self):
    do_plot = False
    iid_count = 500
    seed = 0

    import pylab
    logging.info("TestLmmTrain test_lmm")

    iid = [["cid{0}P{1}".format(iid_index, iid_index // 250)] * 2 for iid_index in range(iid_count)]
    train_idx = np.r_[10:iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    # Every person is 100% related to everyone in one of 5 families
    K0a = KernelData(iid=iid, val=np.empty([iid_count, iid_count]), name="related by distance")
    for iid_index0 in range(iid_count):
        for iid_index1 in range(iid_count):
            K0a.val[iid_index0, iid_index1] = 1 if iid_index0 % 5 == iid_index1 % 5 else 0
            if iid_index1 < iid_index0:
                assert K0a.val[iid_index0, iid_index1] == K0a.val[iid_index1, iid_index0]

    # Every person lives on a line from 0 to 1.
    # They are related to every other person as a function of distance on the line.
    np.random.seed(seed)
    home = np.random.random([iid_count])
    K0b = KernelData(iid=iid, val=np.empty([iid_count, iid_count]), name="related by distance")
    for iid_index in range(iid_count):
        K0b.val[iid_index, :] = 1 - np.abs(home - home[iid_index]) ** .1

    # Make covar just numbers 0,1,...
    covar = SnpData(iid=iid, sid=["x"], val=np.array([[float(num)] for num in range(iid_count)]))
    covariate_train = covar[train_idx, :].read()
    covariate_test = covar[test_idx, :].read()

    for name, h2, K0 in [("clones", 1, K0a), ("line_world", .75, K0b)]:
        sigma2x = 100
        varg = sigma2x * h2
        vare = sigma2x * (1 - h2)

        #######################################################################
        # make pheno
        # pheno = 2*covar+100+normal(0,1)*2.5+normal(0,K)*7.5
        #######################################################################
        # random.multivariate_normal is sensitive to mkl_num_thread, so we control it.
        if 'MKL_NUM_THREADS' in os.environ:
            mkl_num_thread = os.environ['MKL_NUM_THREADS']
        else:
            mkl_num_thread = None
        os.environ['MKL_NUM_THREADS'] = '1'
        np.random.seed(seed)
        p1 = covar.val * 2.0 + 100
        p2 = np.random.normal(size=covar.val.shape) * np.sqrt(vare)
        p3 = (np.random.multivariate_normal(np.zeros(iid_count), K0.val) * np.sqrt(varg)).reshape(-1, 1)
        if mkl_num_thread is not None:
            os.environ['MKL_NUM_THREADS'] = mkl_num_thread
        else:
            del os.environ['MKL_NUM_THREADS']
        pheno = SnpData(iid=iid, sid=["pheno0"], val=p1 + p2 + p3)
        pheno_train = pheno[train_idx, :].read()
        pheno_test = pheno[test_idx, :].read()

        if do_plot:
            # Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".", covariate_test.val, pheno_test.val, ".")
            pylab.suptitle(name + ": Plot training x and y, testing x and y")
            pylab.show()

        Xtrain = np.c_[covariate_train.val, np.ones((covariate_train.iid_count, 1))]
        Xtest = np.c_[covariate_test.val, np.ones((covariate_test.iid_count, 1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
        bs = lsqSol[0]  # weights
        r2 = lsqSol[1]  # squared residuals
        D = lsqSol[2]   # rank of design matrix
        N = pheno_train.iid_count

        REML = False
        if not REML:
            sigma2 = float(r2 / N)
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
        else:
            sigma2 = float(r2 / (N - D))
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
            nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)  # REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
            pylab.suptitle(name + ": real linear regression: actual to prediction")
            pylab.show()

        for factor in [1, 100, .02]:
            K0 = K0.read()
            K0.val *= factor

            K0_train = K0[train_idx]
            K0_whole_test = K0[:, test_idx]

            # Learn model, save, load
            fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
            v2 = np.var(p2)
            v3 = np.var(p3)
            logging.debug("Original h2 of {0}. Generated h2 of {1}. Learned h2 of {2}".format(h2, v3 / (v2 + v3), fastlmmx.h2raw))

            filename = self.tempout_dir + "/model_lmm.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(fastlmmx, filename)
            fastlmm = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                # Predict with model (test on train)
                predicted_pheno, covar_pheno = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train, count_A1=False)  # test on train
                output_file = self.file_name("lmma_" + name)
                Dat.write(output_file, predicted_pheno)
                covar2 = SnpData(iid=covar_pheno.row, sid=covar_pheno.col[:, 1], val=covar_pheno.val)  # kludge to write kernel to text format
                output_file = self.file_name("lmma.cov_" + name)
                Dat.write(output_file, covar2)

                yerr = np.sqrt(np.diag(covar_pheno.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.", covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted, yerr, linestyle='None')
                    pylab.suptitle(name + ": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_pheno, "lmma_" + name)
                self.compare_files(covar2, "lmma.cov_" + name)

                predicted_pheno0, covar_pheno0 = fastlmm.predict(K0_whole_test=K0_train[:, 0], X=covariate_train[0, :], count_A1=False)  # test on train #0
                assert np.abs(predicted_pheno0.val[0, 0] - predicted_pheno.val[0, 0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                assert np.abs(covar_pheno0.val[0, 0] - covar_pheno.val[0, 0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"

            # Predict with model (test on test)
            predicted_phenoB, covar_phenoB = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test, count_A1=False)  # test on test
            output_file = self.file_name("lmmb_" + name)
            Dat.write(output_file, predicted_phenoB)
            covar2 = SnpData(iid=covar_phenoB.row, sid=covar_phenoB.col[:, 1], val=covar_phenoB.val)  # kludge to write kernel to text format
            output_file = self.file_name("lmmb.cov_" + name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covar_phenoB.val))
            predicted = predicted_phenoB.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
                pylab.suptitle(name + ": test on test: test X to true target (green) and prediction (red)")
                pylab.show()

            self.compare_files(predicted_phenoB, "lmmb_" + name)
            self.compare_files(covar2, "lmmb.cov_" + name)

            predicted_phenoB0, covar_phenoB0 = fastlmm.predict(K0_whole_test=K0_whole_test[:, 0], X=covariate_test[0, :], count_A1=False)  # test on a single test case
            assert np.abs(predicted_phenoB0.val[0, 0] - predicted_phenoB.val[0, 0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
            assert np.abs(covar_phenoB0.val[0, 0] - covar_phenoB.val[0, 0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"

            # Predict with model (test on some train and some test)
            some_idx = list(range(covar.iid_count))
            some_idx.remove(train_idx[0])
            some_idx.remove(test_idx[0])
            covariate_some = covar[some_idx, :]
            K0_whole_some = K0[:, some_idx]
            predicted_phenoC, covar_phenoC = fastlmm.predict(K0_whole_test=K0_whole_some, X=covariate_some, count_A1=False)
            for idxC, iidC in enumerate(predicted_phenoC.iid):
                meanC = predicted_phenoC.val[idxC]
                varC = covar_phenoC.val[idxC, idxC]
                if iidC in predicted_pheno.iid:
                    predicted_pheno_ref = predicted_pheno
                    covar_pheno_ref = covar_pheno
                else:
                    assert iidC in predicted_phenoB.iid
                    predicted_pheno_ref = predicted_phenoB
                    covar_pheno_ref = covar_phenoB
                idx_ref = predicted_pheno_ref.iid_to_index([iidC])[0]
                mean_ref = predicted_pheno_ref.val[idx_ref]
                var_ref = covar_pheno_ref.val[idx_ref, idx_ref]
                assert np.abs(meanC - mean_ref) < 1e-6
                assert np.abs(varC - var_ref) < 1e-6
def _internal_single(G0_standardized, test_snps, pheno, covar, G1_standardized, mixing, #!!test mixing and G1
                     h2, log_delta, cache_file):
    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.hstack((covar['vals'],np.ones((test_snps.iid_count, 1)))) #We always add 1's to the end.
    y = pheno['vals']

    from pysnptools.standardizer import DiagKtoN

    assert mixing is None or 0.0 <= mixing <= 1.0

    if cache_file is not None and os.path.exists(cache_file):
        lmm = fastLMM(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
    else:
        # combine two kernels (normalize kernels to diag(K)=N)
        G0_standardized_val = DiagKtoN(G0_standardized.val.shape[0]).standardize(G0_standardized.val)
        G1_standardized_val = DiagKtoN(G1_standardized.val.shape[0]).standardize(G1_standardized.val)

        if mixing == 0.0 or G1_standardized.sid_count == 0:
            G = G0_standardized.val
        elif mixing == 1.0 or G0_standardized.sid_count == 0:
            G = G1_standardized.val
        else:
            G = np.empty((G0_standardized.iid_count, G0_standardized.sid_count+G1_standardized.sid_count))
            if mixing is None:
                mixing, h2 = _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y)
            _mix(G, G0_standardized_val, G1_standardized_val, mixing)

        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)

    if h2 is None:
        result = lmm.findH2()
        h2 = result['h2']
    logging.info("h2={0}".format(h2))

    snps_read = test_snps.read().standardize()
    res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=snps_read.val)

    if cache_file is not None and not os.path.exists(cache_file):
        pstutil.create_directory_if_necessary(cache_file)
        np.savez(cache_file, lmm.U, lmm.S) #using np.savez instead of pickle because it seems to be faster to read and write

    beta = res['beta']
    chi2stats = beta*beta/res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    if G0_standardized is not None:
        assert G.shape[0] == lmm.U.shape[0]
    p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0]-3)[:,0] #note that G.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates+SNP)

    items = [('SNP', snps_read.sid),
             ('Chr', snps_read.pos[:,0]),
             ('GenDist', snps_read.pos[:,1]),
             ('ChrPos', snps_read.pos[:,2]),
             ('PValue', p_values),
             ('SnpWeight', beta[:,0]),
             ('SnpWeightSE', np.sqrt(res['variance_beta'][:,0])),
             ('SnpFractVarExpl', np.sqrt(res['fraction_variance_explained_beta'][:,0])),
             ('Nullh2', np.zeros((snps_read.sid_count)) + h2)]
    frame = pd.DataFrame.from_items(items)

    return frame
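# Both _internal_single variants accept either h2 or log_delta, but not both.
# They are two parameterizations of the same quantity: with delta defined as the
# noise-to-genetic variance ratio sigma_e^2/sigma_g^2, heritability is
# h2 = sigma_g^2/(sigma_g^2+sigma_e^2) = 1/(exp(log_delta)+1), which is exactly
# the conversion used above. A quick round-trip check of that identity
# (standalone and illustrative only, not part of the library):
import numpy as np

def h2_from_log_delta(log_delta):
    return 1.0 / (np.exp(log_delta) + 1.0)

def log_delta_from_h2(h2):
    return np.log((1.0 - h2) / h2)

for h2 in [0.1, 0.5, 0.9]:
    assert abs(h2_from_log_delta(log_delta_from_h2(h2)) - h2) < 1e-12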
def _internal_single(K0, test_snps, pheno, covar, K1,
                     mixing, h2, log_delta,
                     cache_file, force_full_rank, force_low_rank,
                     output_file_name, block_size, interact_with_snp, runner):
    assert K0 is not None, "real assert"
    assert K1 is not None, "real assert"
    assert block_size is not None, "real assert"
    assert mixing is None or 0.0 <= mixing <= 1.0
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.c_[covar.read(view_ok=True,order='A').val, np.ones((test_snps.iid_count, 1))] #view_ok because np.c_ will allocate new memory
    y = pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values

    if cache_file is not None and os.path.exists(cache_file):
        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]
            mixing = data['arr_2'][1]
    else:
        K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2,
                                                   force_full_rank=force_full_rank, force_low_rank=force_low_rank,
                                                   kernel_standardizer=DiagKtoN())
        mixing = mixer.mixing

        if mixer.do_g:
            lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True)
        else:
            lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True)

        if h2 is None:
            result = lmm.findH2()
            h2 = result['h2']
        logging.info("h2={0}".format(h2))

        if cache_file is not None and not os.path.exists(cache_file):
            pstutil.create_directory_if_necessary(cache_file)
            lmm.getSU()
            np.savez(cache_file, lmm.U, lmm.S, np.array([h2,mixing])) #using np.savez instead of pickle because it seems to be faster to read and write

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        assert 0 <= interact_with_snp and interact_with_snp < covar.shape[1]-1, "interact_with_snp is out of range"
        interact = covar[:,interact_with_snp].copy()
        interact -= interact.mean()
        interact /= interact.std()
    else:
        interact = None

    work_count = -(test_snps.sid_count // -block_size) #Find the work count based on block size (rounding up)

    # We define three closures, that is, functions defined inside a function,
    # so that the inner functions have access to the local variables of the outer function.
    def debatch_closure(work_index):
        return test_snps.sid_count * work_index // work_count

    def mapper_closure(work_index):
        if work_count > 1:
            logging.info("single_snp: Working on part {0} of {1}".format(work_index, work_count))
        do_work_time = time.time()
        start = debatch_closure(work_index)
        end = debatch_closure(work_index+1)

        snps_read = test_snps[:,start:end].read().standardize()
        if interact_with_snp is not None:
            variables_to_test = snps_read.val * interact[:,np.newaxis]
        else:
            variables_to_test = snps_read.val
        res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

        beta = res['beta']
        chi2stats = beta*beta/res['variance_beta']
        #p_values = stats.chi2.sf(chi2stats,1)[:,0]
        assert test_snps.iid_count == lmm.U.shape[0]
        p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0]-3)[:,0] #note that lmm.U.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates+SNP)

        dataframe = _create_dataframe(snps_read.sid_count)
        dataframe['sid_index'] = np.arange(start, end)
        dataframe['SNP'] = snps_read.sid
        dataframe['Chr'] = snps_read.pos[:,0]
        dataframe['GenDist'] = snps_read.pos[:,1]
        dataframe['ChrPos'] = snps_read.pos[:,2]
        dataframe['PValue'] = p_values
        dataframe['SnpWeight'] = beta[:,0]
        dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:,0])
        dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:,0])
        dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing
        dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2

        logging.info("time={0}".format(time.time()-do_work_time))
        #logging.info(dataframe)
        return dataframe

    def reducer_closure(result_sequence):
        if output_file_name is not None:
            create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame

    frame = map_reduce(xrange(work_count),
                       mapper=mapper_closure, reducer=reducer_closure,
                       input_files=[test_snps], output_files=[output_file_name],
                       name="single_snp(output_file={0})".format(output_file_name),
                       runner=runner)
    return frame
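# work_count above relies on the "ceiling division" identity -(a // -b) == ceil(a/b),
# and debatch_closure partitions the sid_count SNPs into work_count nearly equal,
# contiguous half-open [start, end) ranges with no gaps or overlaps. A small
# standalone sketch of that batching logic (the numbers are illustrative):
sid_count = 10
block_size = 4
work_count = -(sid_count // -block_size)  # ceil(10/4) == 3

def debatch(work_index):
    return sid_count * work_index // work_count

batches = [(debatch(i), debatch(i+1)) for i in range(work_count)]
assert batches == [(0, 3), (3, 6), (6, 10)]  # covers 0..10 exactly once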
def heritability_spatial_correction(G_kernel, spatial_coor, spatial_iid, alpha_list, alpha_power, pheno,
                                    map_function=map, cache_folder=None,
                                    jackknife_count=500, permute_plus_count=10000, permute_times_count=10000,
                                    seed=0, just_testing=False, always_remote=False, allow_gxe2=True):
    """
    Function measuring heritability with correction for spatial location.

    :param G_kernel: A kernel that tells the genetic similarity between all pairs of individuals. The kernel can
        be given explicitly, for example with a :class:`.KernelData`. The kernel can also be given implicitly by
        providing a set of SNPs or the name of a BED file.
    :type G_kernel: a :class:`.KernelReader`, :class:`.SnpReader` or a string

    :param spatial_coor: The position of each individual given by two coordinates. Any units are allowed, but the
        two values must be compatible so that distance can be determined via Pythagoras' theorem. (So, longitude
        and latitude should not be used unless the locations are near the Equator.)
    :type spatial_coor: an iid_count x 2 array

    :param spatial_iid: A ndarray of the iids. Each iid is a ndarray of two strings (a family ID and a case ID)
        that identifies an individual.
    :type spatial_iid: array of strings with shape [iid_count,2]

    :param alpha_list: a list of numbers to search to find the best alpha, which is the similarity scale. The
        similarity of two individuals is here defined as exp(-(distance_between/alpha)**alpha_power). If the
        closest individuals are 100 units apart and the farthest individuals are 4e6 units apart, a reasonable
        alpha_list might be: [int(v) for v in np.logspace(np.log10(100),np.log10(1e10), 100)]
        The function reports on the alphas chosen. If an extreme alpha is picked, change alpha_list to cover more range.
    :type alpha_list: list of numbers

    :param alpha_power: 2 (a good choice) means that similarity goes with area. 1 means with distance.
    :type alpha_power: number

    :param pheno: The target value(s) to predict. It can be a file name readable via :class:`SnpReader.Pheno` or
        any :class:`.SnpReader`.
    :type pheno: a :class:`.SnpReader` or string

    :param cache_folder: (default 'None') The name of a directory in which to save intermediate results. If 'None',
        then no intermediate results are saved.
    :type cache_folder: a string

    :param map_function: (default 'map') A function with the same inputs and functionality as Python's 'map'
        function. Can be used to run 'heritability_spatial_correction' on a cluster.
    :type map_function: a function

    :param jackknife_count: (default 500) The number of jackknife groups to use when calculating standard errors
        (SE). Changing to a small number, 2, speeds up calculation at the cost of unusable SEs.
    :type jackknife_count: number

    :param permute_plus_count: (default 10000) The number of permutations used when calculating P values. Changing
        to a small number, 1, speeds up calculation at the cost of unusable P values.
    :type permute_plus_count: number

    :param permute_times_count: (default 10000) The number of permutations used when calculating P values. Changing
        to a small number, 1, speeds up calculation at the cost of unusable P values.
    :type permute_times_count: number

    :param seed: (default 0) The random seed used by jackknifing and permutation.
    :type seed: number

    :param just_testing: (default False) If true, skips the actual LMM-related search and calculation.
    :type just_testing: bool

    :rtype: Pandas dataframe with one row per phenotype. Columns include "h2uncorr", "h2corr", etc.
""" ###################### # Prepare the inputs ###################### from fastlmm.inference.fastlmm_predictor import _kernel_fixup, _pheno_fixup G_kernel = _kernel_fixup(G_kernel, iid_if_none=None, standardizer=Unit()) # Create a kernel from an in-memory kernel, some snps, or a text file. pheno = _pheno_fixup(pheno,iid_if_none=G_kernel.iid, missing='NA') # Create phenotype data from in-memory data or a text file. if cache_folder is not None: pstutil.create_directory_if_necessary(cache_folder,isfile=False) jackknife_seed = seed or 1954692566L permute_plus_seed = seed or 2372373100L permute_times_seed = seed or 2574440128L ###################### # Find 'alpha', the scale for distance ###################### # create the alpha table (unless it is already there) alpha_table_fn = "{0}/alpha_table.{1}.txt".format(cache_folder,pheno.sid_count) # create a name for the alpha_table cache file if cache_folder is not None and os.path.exists(alpha_table_fn): alpha_table = pd.read_csv(alpha_table_fn, delimiter = '\t',index_col=False, comment=None) else: # create the list of arguments to run arg_list = [] for phen_target in pheno.sid: pheno_one = pheno[:,pheno.col_to_index([phen_target])] # Look at only this pheno_target for alpha in alpha_list: #pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife_index, jackknife_count, jackknife_seed), arg_tuple = (pheno_one, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (-1, 0, None), # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2 (-1, 0, None), (-1, 0, None), just_testing, False, True and allow_gxe2, None) arg_list.append(arg_tuple) # Run "run_line" on each set of arguments and save to file return_list = map_function(work_item, arg_list) if len(arg_list)>1 or always_remote else map(work_item, arg_list) return_list = [line for line in return_list if line is not None] #Remove 'None' results alpha_table = pd.DataFrame(return_list) if cache_folder is not None: _write_csv(alpha_table,False,alpha_table_fn) # read the alpha table and find the best values grouped = alpha_table.groupby("phen") alpha_dict = {} for phen, phen_table in grouped: best_index_corr = phen_table['nLLcorr'].idxmin() # with Pandas, this returns the index in the parent table, not the group table best_index_gxe2 = phen_table['nLL_gxe2'].idxmin() if allow_gxe2 else 0 alpha_corr = alpha_table.iloc[best_index_corr]['alpha'] alpha_gxe2 = alpha_table.iloc[best_index_gxe2]['alpha'] alpha_dict[phen] = alpha_corr, alpha_gxe2 logging.info(alpha_dict) ###################### # Use jackknifing to compute h2uncorr, SE, h2corr, SE, e2, SE, gxe2, SE ###################### jackknife_count_actual = min(jackknife_count,G_kernel.iid_count) # Set up the run and do it (unless it has already been run) jackknife_table_fn = "{0}/jackknife.{1}.count{2}.txt".format(cache_folder, pheno.sid_count, jackknife_count_actual) if cache_folder is not None and os.path.exists(jackknife_table_fn): jackknife_table = pd.read_csv(jackknife_table_fn, delimiter = '\t',index_col=False, comment=None) else: arg_list = [] for phen_target in pheno.sid: pheno_one = pheno[:,pheno.col_to_index([phen_target])] # Look at only this pheno_target alpha_corr, alpha_gxe2 = alpha_dict[phen_target] alpha_set = set([alpha_corr, alpha_gxe2]) #If these are the same, then only need to do half the work for alpha in alpha_set: logging.debug(alpha) do_uncorr = (alpha == alpha_corr) do_gxe2 = (alpha == alpha_gxe2) and 
allow_gxe2 for jackknife in range(-1, jackknife_count_actual): # pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife_index, jackknife_count, jackknife_seed), arg_tuple = (pheno_one, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife, jackknife_count_actual, jackknife_seed), # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2 (-1,0,None), (-1,0,None), just_testing, do_uncorr, do_gxe2, None) arg_list.append(arg_tuple) # Run "run_line" on each set of arguments and save to file return_list = map_function(work_item, arg_list) if len(arg_list)>1 or always_remote else map(work_item, arg_list) return_list = [line for line in return_list if line is not None] #Remove 'None' results jackknife_table = pd.DataFrame(return_list) if cache_folder is not None: _write_csv(jackknife_table, False, jackknife_table_fn) # get the real (that is, unjackknifed) values jackknife_table["diff"] = jackknife_table.h2uncorr-jackknife_table.h2corr # Compute the diff = h2uncorr-h2corr column results_both = jackknife_table[jackknife_table.jackknife_index==-1] # Create a table of the real (non-jackknifed) results for both alphas (which may be the same) del results_both["jackknife_index"] results_corr = results_both[results_both.alpha == [alpha_dict[phen][0] for phen in results_both.phen]] #Create version for g+e's alpha results_gxe2 = results_both[results_both.alpha == [alpha_dict[phen][1] for phen in results_both.phen]] #Create version for gxe's alpha #remove unwanted columns for delcol in ["a2_gxe2","gxe2","nLL_gxe2","permute_plus_count","permute_plus_index","permute_plus_seed","permute_times_count","permute_times_index","permute_times_seed","jackknife_count","jackknife_seed"]: del results_corr[delcol] for delcol in ["a2","e2","h2corr","h2uncorr","nLLcorr","nLLuncorr","diff","permute_plus_count","permute_plus_index","permute_plus_seed","permute_times_count","permute_times_index","permute_times_seed","jackknife_count","jackknife_seed"]: del results_gxe2[delcol] #Use a pivottable to compute the jackknifed SE's corr_rows = np.logical_and(jackknife_table.jackknife_index!=-1,jackknife_table.alpha==[alpha_dict[phen][0] for phen in jackknife_table.phen]) jk_table_corr = pd.pivot_table(jackknife_table[corr_rows], values=['h2uncorr','h2corr','diff','e2'], index=['phen'], columns=[], aggfunc=np.std) jk_table_corr["h2uncorr SE"] = jk_table_corr["h2uncorr"] * np.sqrt(jackknife_count_actual-1) jk_table_corr["h2corr SE"] = jk_table_corr["h2corr"] * np.sqrt(jackknife_count_actual-1) jk_table_corr["diff SE"] = jk_table_corr["diff"] * np.sqrt(jackknife_count_actual-1) jk_table_corr["e2 SE"] = jk_table_corr["e2"] * np.sqrt(jackknife_count_actual-1) del jk_table_corr["h2uncorr"] del jk_table_corr["h2corr"] del jk_table_corr["diff"] del jk_table_corr["e2"] gxe2_rows = np.logical_and(jackknife_table.jackknife_index!=-1,jackknife_table.alpha==[alpha_dict[phen][1] for phen in jackknife_table.phen]) jk_table_gxe2 = pd.pivot_table(jackknife_table[gxe2_rows], values=['gxe2'], index=['phen'], columns=[], aggfunc=np.std) jk_table_gxe2["gxe2 SE"] = jk_table_gxe2["gxe2"] * np.sqrt(jackknife_count_actual-1) del jk_table_gxe2["gxe2"] #Join the SE's to the main results table results_corr = results_corr.join(jk_table_corr, on='phen') results_gxe2 = results_gxe2.join(jk_table_gxe2, on='phen') #compute pValue columns results_corr["P (diff=0)"] = stats.t.sf(results_corr["diff"]/results_corr["diff 
SE"],df=jackknife_count_actual-1)*2 #two sided results_corr["from SE, one-sided, P (e2=0)"] = stats.t.sf(results_corr["e2"]/results_corr["e2 SE"],df=jackknife_count_actual-1) results_gxe2["from SE, one-sided, P (gxe2=0)"] = stats.t.sf(results_gxe2["gxe2"]/results_gxe2["gxe2 SE"],df=jackknife_count_actual-1) #one sided if cache_folder is not None: _write_csv(results_corr, False, "{0}/jackknife_corr_summary.{1}.jackknife{2}.txt".format(cache_folder, pheno.sid_count, jackknife_count_actual)) _write_csv(results_gxe2, False, "{0}/jackknife_gxe2_summary.{1}.jackknife{2}.txt".format(cache_folder, pheno.sid_count, jackknife_count_actual)) ###################### # compute p(e2=0) via permutation ###################### permplus_table_fn = "{0}/permutation.GPlusE.{1}.count{2}.txt".format(cache_folder, pheno.sid_count, permute_plus_count) if cache_folder is not None and os.path.exists(permplus_table_fn): permplus_table = pd.read_csv(permplus_table_fn, delimiter = '\t',index_col=False, comment=None) else: arg_list = [] for phen_target in pheno.sid: pheno_one = pheno[:,pheno.col_to_index([phen_target])] # Look at only this pheno_target alpha_corr, alpha_gxe2 = alpha_dict[phen_target] for jackknife_index in range(-1,permute_plus_count): # pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_power, (jackknife_index, jackknife_count, jackknife_seed), arg_tuple = (pheno_one, G_kernel, spatial_coor, spatial_iid, alpha_corr, alpha_power, (-1,0,None), # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2 (jackknife_index, permute_plus_count,permute_plus_seed), (-1,0,None), just_testing, False, False, None) arg_list.append(arg_tuple) # Run "run_line" on each set of arguments and save to file return_list = map_function(work_item, arg_list) if len(arg_list)>1 or always_remote else map(work_item, arg_list) return_list = [line for line in return_list if line is not None] #Remove 'None' results permplus_table = pd.DataFrame(return_list) if cache_folder is not None: _write_csv(permplus_table, False, permplus_table_fn) #Create a table of the real nLL for each pheno real_result_permplus = permplus_table[permplus_table.permute_plus_index==-1][['phen','nLLcorr']] real_result_permplus.rename(columns={'nLLcorr':'nLLcorr_real'},inplace=True) real_result_permplus.set_index(['phen'],inplace=True) # Create a table of the permutation runs and add the real nLL to each row perm_table = permplus_table[permplus_table.permute_plus_index!=-1] result = perm_table.join(real_result_permplus, on='phen') result['P(e2)'] = [1.0 if b else 0.0 for b in result.nLLcorr <= result.nLLcorr_real] # create a column showing where the perm is better (or as good) as the real # Use pivottable to find the fraction of of times when permutation is better pivot_table_plus = pd.pivot_table(result, values=['P(e2)'], index=['phen'], columns=[], aggfunc=np.mean) if cache_folder is not None: summary_permplus_table_fn = "{0}/summary.permutation.GPlusE.{1}.count{2}.txt".format(cache_folder, pheno.sid_count, permute_plus_count) _write_csv(pivot_table_plus, True, summary_permplus_table_fn) ################################################ # compute p(gxe2=0) via permutation ################################################ #Only process phenos for which gxe2 is not 0 nonzero = set(results_gxe2[results_gxe2.gxe2 !=0].phen) permtimes_phenotypes = set(pheno.sid) & nonzero #intersection permtimes_table_list = [] for phen_target in permtimes_phenotypes: 
permtimes_table_fn = "{0}/permutation.GxE/{1}.count{2}.txt".format(cache_folder, phen_target, permute_times_count) if cache_folder is not None and os.path.exists(permtimes_table_fn): permtime_results = pd.read_csv(permtimes_table_fn, delimiter = '\t',index_col=False, comment=None) else: arg_list = [] pheno_one = pheno[:,pheno.col_to_index([phen_target])] # Look at only this pheno_target alpha_corr, alpha_gxe2 = alpha_dict[phen_target] a2 = float(permplus_table[permplus_table.phen==phen_target][permplus_table.permute_plus_index == -1]['a2']) for permute_index in range(-1,permute_times_count): # pheno, G_kernel, spatial_coor, spatial_iid, alpha, alpha_powerm (permute_index, permute_count, permute_seed), arg_tuple = (pheno_one, G_kernel, spatial_coor, spatial_iid, alpha_gxe2, alpha_power, (-1,0,None), # (permute_plus_index, permute_plus_count, permute_plus_seed), (permute_times_index, permute_times_count, permute_times_seed) ,just_testing, do_uncorr, do_gxe2, a2 (-1,0,None), (permute_index, permute_times_count,permute_times_seed), just_testing, False, allow_gxe2, a2) arg_list.append(arg_tuple) # Run "run_line" on each set of arguments and save to file return_list = map_function(work_item, arg_list) if len(arg_list)>1 or always_remote else map(work_item, arg_list) return_list = [line for line in return_list if line is not None] #Remove 'None' results permtime_results = pd.DataFrame(return_list) if cache_folder is not None: pstutil.create_directory_if_necessary(permtimes_table_fn) _write_csv(permtime_results,False,permtimes_table_fn) permtimes_table_list.append(permtime_results) if permtimes_table_list: #not empty permtimes_table = pd.concat(permtimes_table_list) logging.info(permtimes_table.head()) #Create a table of the real nLL for each pheno real_result_permtimes = permtimes_table[permtimes_table.permute_times_index==-1][['phen','nLL_gxe2']] real_result_permtimes.rename(columns={'nLL_gxe2':'nLL_gxe2_real'},inplace=True) real_result_permtimes.set_index(['phen'],inplace=True) # Create a table of the permutation runs and add the real nLL to reach row summary_permtimes_table_fn = "{0}/summary.permutation.GxE.{1}.count{2}.txt".format(cache_folder,len(permtimes_phenotypes), permute_times_count) perm_table = permtimes_table[permtimes_table.permute_times_index!=-1] resultx = perm_table.join(real_result_permtimes, on='phen') resultx['P(gxe2)'] = [1.0 if b else 0.0 for b in resultx.nLL_gxe2 <= resultx.nLL_gxe2_real] # create a column showing where the perm is better (or as good) as the real # Use pivottable to find the fraction of of times when permutation is better pivot_table_times = pd.pivot_table(resultx, values=['P(gxe2)'], index=['phen'], columns=[], aggfunc=np.mean) if cache_folder is not None: _write_csv(pivot_table_times,True,summary_permtimes_table_fn) ####################### # Create final table of results by combining the summary tables ####################### #Rename some columns results_corr.rename(columns={"h2uncorr SE":"SE (h2uncorr)","h2corr SE":"SE (h2corr)","e2 SE":"SE (e2)"}, inplace=True) #Rename some columns and join results results_gxe2.rename(columns={"alpha":"alpha_gxe2","gxe2 SE":"SE (gxe2)"}, inplace=True) del results_gxe2['alpha_power'] results_gxe2.set_index(["phen"],inplace=True) final0 = results_corr.join(results_gxe2, on='phen') #Rename some columns and join results pivot_table_plus.rename(columns={"P(e2)":"P(e2=0)"}, inplace=True) final1 = final0.join(pivot_table_plus, on='phen') #Rename some columns and join results if permtimes_table_list: #not empty 
pivot_table_times.rename(columns={"P(gxe2)":"P(gxe2=0)"}, inplace=True) final2 = final1.join(pivot_table_times, on='phen') else: final2 = final1.copy() final2["P(gxe2=0)"] = np.nan #Rename 'phen' and select final columns final2.rename(columns={"phen":"phenotype"}, inplace=True) final3 = final2[["phenotype","h2uncorr","SE (h2uncorr)","h2corr","SE (h2corr)","P (diff=0)","e2","SE (e2)","P(e2=0)","alpha","alpha_gxe2","gxe2","SE (gxe2)","P(gxe2=0)"]].copy() #Rename sort the phenotypes final3['lower'] = [pheno_one.lower() for pheno_one in final3.phenotype] final3.sort(['lower'],inplace=True) del final3['lower'] if cache_folder is not None: summary_final_table_fn = "{0}/summary.final.{1}.{2}.{3}.{4}.txt".format(cache_folder, pheno.sid_count, jackknife_count_actual,permute_plus_count,permute_times_count) _write_csv(final3,False,summary_final_table_fn) return final3
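# The docstring above defines spatial similarity as exp(-(distance/alpha)**alpha_power).
# A minimal sketch (illustrative only, not the library's internal kernel builder)
# of turning 2-D coordinates into such a similarity matrix:
import numpy as np

def spatial_similarity(spatial_coor, alpha, power):
    # pairwise Euclidean distances via Pythagoras' theorem
    diff = spatial_coor[:, np.newaxis, :] - spatial_coor[np.newaxis, :, :]
    distance = np.sqrt((diff**2).sum(-1))
    return np.exp(-(distance / alpha) ** power)

coor = np.array([[0.0, 0.0], [3.0, 4.0], [6.0, 8.0]])
K = spatial_similarity(coor, alpha=5.0, power=2)
assert np.allclose(np.diag(K), 1.0)       # an individual is maximally similar to itself
assert np.isclose(K[0, 1], np.exp(-1.0))  # distance 5, alpha 5, power 2 -> exp(-1)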
        val[1::4,SNPsIndex:SNPsIndex+1][bytes>=4] = np.nan
        val[1::4,SNPsIndex:SNPsIndex+1][bytes>=8] = 1
        val[1::4,SNPsIndex:SNPsIndex+1][bytes>=12] = byteThree
        bytes = np.mod(bytes,4)
        val[0::4,SNPsIndex:SNPsIndex+1] = byteZero
        val[0::4,SNPsIndex:SNPsIndex+1][bytes>=1] = np.nan
        val[0::4,SNPsIndex:SNPsIndex+1][bytes>=2] = 1
        val[0::4,SNPsIndex:SNPsIndex+1][bytes>=3] = byteThree
        val = val[iid_index_out,:] #reorder or trim any extra allocation
        #!!LATER this can fail because the trim statement above messes up the order
        #assert(SnpReader._array_properties_are_ok(val, order, dtype)) #!!
        self._close_bed()
        return val

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    from pysnptools.snpreader import Pheno, Bed
    import pysnptools.util as pstutil
    snpdata = Pheno('../examples/toydata.phe').read()          # Read data from Pheno format
    pstutil.create_directory_if_necessary("tempdir/toydata.bed")
    Bed.write("tempdir/toydata.bed", snpdata, count_A1=False)  # Write data in Bed format

    import doctest
    doctest.testmod()
    # There is also a unit test case in 'pysnptools\test.py' that calls this doc test
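# The slicing above unpacks PLINK .bed genotype bytes: each byte packs four
# samples, two bits per sample. The 2-bit codes map to a genotype value, a
# missing marker, or a heterozygote; which homozygote becomes 0 and which
# becomes 2 depends on the allele-counting convention (byteZero/byteThree in
# the reader above, controlled by count_A1). A purely illustrative one-byte
# decoder under one fixed convention (0b00 -> 0, 0b01 -> missing, 0b10 -> 1,
# 0b11 -> 2):
import numpy as np

def decode_bed_byte(b):
    # returns the four genotype values packed in byte b, lowest two bits first
    lookup = {0: 0.0, 1: np.nan, 2: 1.0, 3: 2.0}
    return [lookup[(b >> shift) & 0b11] for shift in (0, 2, 4, 6)]

# 0b11011000 holds, from the low bits up: 0b00, 0b10, 0b01, 0b11
assert decode_bed_byte(0b11011000)[0] == 0.0
assert np.isnan(decode_bed_byte(0b11011000)[2])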
    def test_lr_real(self):
        do_plot = False
        import pylab
        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx = np.r_[0:10] # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()
        K0_test_test = KernelIdentity(covariate_test.iid)

        #make pheno: pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

        pheno_train = pheno[train_idx,:].read()
        pheno_test = pheno[test_idx,:].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".", covariate_test.val, pheno_test.val, ".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        Xtrain = np.c_[covariate_train.val, np.ones((covariate_train.iid_count,1))]
        Xtest = np.c_[covariate_test.val, np.ones((covariate_test.iid_count,1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0])
        bs = lsqSol[0] #weights
        r2 = lsqSol[1] #squared residuals
        D = lsqSol[2]  #rank of design matrix
        N = pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2/N)
            nLL = N*0.5*np.log(2*np.pi*sigma2) + N*0.5
        else:
            sigma2 = float(r2 / (N-D))
            nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2
            nLL -= 0.5*D*np.log(2*np.pi*sigma2) #REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name, K0_train, K0_whole_test in [("Identity Kernel", None, None)]:
            first_name = first_name or name

            #Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename)
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar = model.predict(K0_whole_test=K0_train, X=covariate_train) #test on train
                output_file = self.file_name("lr_reala_"+name)
                Dat.write(output_file, predicted_pheno)
                covar2 = SnpData(iid=covar.row, sid=covar.col[:,1], val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr_reala.cov_"+name)
                Dat.write(output_file, covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.", covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted, yerr, linestyle='None')
                    pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()
                self.compare_files(predicted_pheno, "lr2a_"+first_name)
                self.compare_files(covar2, "lr2a.cov_"+first_name)

            #Predict with model (test on test)
            predicted_pheno, covar = model.predict(K0_whole_test=K0_whole_test, X=covariate_test) #test on test
            output_file = self.file_name("lr_realb_"+name)
            Dat.write(output_file, predicted_pheno)
            covar2 = SnpData(iid=covar.row, sid=covar.col[:,1], val=covar.val) #kludge to write kernel to text format
            output_file = self.file_name("lr_realb.cov_"+name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
                pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                pylab.show()

            ## Plot y and predicted y (test on train)
            #pylab.plot(pheno_test.val,predicted_pheno.val,".")
            #pylab.suptitle(name+": test on test: true target to prediction")
            #pylab.show()

            self.compare_files(predicted_pheno, "lr2b_"+first_name)
            self.compare_files(covar2, "lr2b.cov_"+first_name)
    def test_lr_as_lmm(self):
        do_plot = False #later why does this test case generate two intersect info messages instead of just one?
        import pylab
        logging.info("TestLmmTrain test_lr_as_lmm")

        ###############################################################
        # Create a linear data set with just a little noise
        ###############################################################
        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx = np.r_[0:10] # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
        covar._name = 'np.array([[float(num)] for num in xrange(covar.iid_count)])'
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()

        #make pheno: pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

        pheno_train = pheno[train_idx,:].read()
        pheno_test = pheno[test_idx,:].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".", covariate_test.val, pheno_test.val, ".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        ###############################################################
        # Show that linear regression does a good job predicting
        ###############################################################
        Xtrain = np.c_[covariate_train.val, np.ones((covariate_train.iid_count,1))]
        Xtest = np.c_[covariate_test.val, np.ones((covariate_test.iid_count,1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0], rcond=-1)
        bs = lsqSol[0] #weights
        r2 = lsqSol[1] #squared residuals
        D = lsqSol[2]  #rank of design matrix
        N = pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2/N)
            nLL = N*0.5*np.log(2*np.pi*sigma2) + N*0.5
        else:
            sigma2 = float(r2 / (N-D))
            nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2
            nLL -= 0.5*D*np.log(2*np.pi*sigma2) #REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        ###############################################################
        # Use LMM as LR and apply test on train
        ###############################################################
        for force_full_rank in [True, False]:
            #Learn model, save, load
            fastlmmx = FastLMM(GB_goal=2, force_full_rank=force_full_rank).fit(K0_train=covariate_train, X=None, y=pheno_train)
            filename = self.tempout_dir + "/model_lr_as_lmm.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(fastlmmx, filename)
            fastlmm = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar = fastlmm.predict(K0_whole_test=covariate_train, X=None, count_A1=False) #test on train
                output_file = self.file_name("lr_as_lmma_")
                Dat.write(output_file, predicted_pheno)
                covar2 = SnpData(iid=covar.row, sid=covar.col[:,1], val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr_as_lmma.cov_")
                Dat.write(output_file, covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.", covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted, yerr, linestyle='None')
                    pylab.suptitle("test on train: train X to true target (green) and prediction (red)")
                    pylab.show()
                self.compare_files(predicted_pheno, "lr_as_lmma_")
                self.compare_files(covar2, "lr_as_lmma.cov_")

            ###############################################################
            # Use LMM as LR and apply test on test
            ###############################################################

            #Predict with model (test on test)
            predicted_pheno, covar = fastlmm.predict(K0_whole_test=covariate_test, X=None, count_A1=False) #test on test
            output_file = self.file_name("lr_as_lmmb_")
            Dat.write(output_file, predicted_pheno)
            covar2 = SnpData(iid=covar.row, sid=covar.col[:,1], val=covar.val) #kludge to write kernel to text format
            output_file = self.file_name("lr_as_lmmb.cov_")
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
                pylab.suptitle("test on test: test X to true target (green) and prediction (red)")
                pylab.show()

            ## Plot y and predicted y (test on train)
            #pylab.plot(pheno_test.val,predicted_pheno.val,".")
            #pylab.suptitle(name+": test on test: true target to prediction")
            #pylab.show()

            self.compare_files(predicted_pheno, "lr_as_lmmb_")
            self.compare_files(covar2, "lr_as_lmmb.cov_")
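# test_lr_as_lmm works because an LMM whose only kernel is the linear kernel over
# the covariates reduces to regularized linear regression: the kernel-space
# prediction with K = X X' equals the weight-space ridge prediction
# X_test w with w = (X'X + delta*I)^-1 X'y. A minimal numpy sketch of that
# kernel-vs-weight-space identity (all names and numbers here are illustrative,
# not FastLMM's internals):
import numpy as np

np.random.seed(0)
Xtr = np.random.normal(size=(20, 3))
Xte = np.random.normal(size=(5, 3))
y = np.random.normal(size=20)
delta = 0.5  # ratio of noise variance to signal variance

# weight-space ridge solution
w = np.linalg.solve(Xtr.T.dot(Xtr) + delta*np.eye(3), Xtr.T.dot(y))
pred_weight = Xte.dot(w)

# kernel-space solution with the linear kernel K = X X'
K_tr = Xtr.dot(Xtr.T)
K_te = Xte.dot(Xtr.T)
pred_kernel = K_te.dot(np.linalg.solve(K_tr + delta*np.eye(20), y))

assert np.allclose(pred_weight, pred_kernel)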