def mapper_gather_lots(i_fold_and_pair):
    """
    Run a single-SNP scan on one fold's training rows and score each candidate k.

    Relies on names from the enclosing scope (``G_for_chrom``, ``pheno``,
    ``covar``, ``k_list``, ``max_k``, ``n_folds``, ``test_chr``, ...).

    :param i_fold_and_pair: ``(i_fold, (train_idx, test_idx))`` -- the fold
        number and the index arrays selecting its train and test rows.

    :rtype: tuple ``(k_list_in, top_snps, k_index_to_nLL)``:

        * ``k_list_in`` -- ``[0]`` followed by the values of ``k_list`` that are
          strictly between 0 and the number of scan results; ``None`` when
          ``i_fold > 0``.
        * ``top_snps`` -- the first ``max_k`` SNP names of the scan when this is
          the "all" fold (``i_fold == n_folds``, or ``n_folds <= 1``), else ``None``.
        * ``k_index_to_nLL`` -- one ``FastLMM.score`` value per entry of the k
          list (presumably a negative log likelihood -- confirm against
          ``FastLMM.score``); ``None`` when ``i_fold == n_folds``.
    """
    i_fold, (train_idx, test_idx) = i_fold_and_pair
    logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

    train_snps = G_for_chrom[train_idx, :]

    # Precompute whole x whole standardized on train
    from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
    min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
    block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
    whole_reader = _SnpWholeWithTrain(whole=G_for_chrom,
                                      train_idx=train_idx,
                                      standardizer=Unit(),
                                      block_size=block_size)
    K_whole_unittrain = whole_reader.read()

    assert np.array_equal(K_whole_unittrain.iid, G_for_chrom.iid), "real assert"
    K_train = K_whole_unittrain[train_idx]

    # iid intersection means we can give the whole covariate and pheno
    scan = single_snp(test_snps=train_snps, K0=K_train, pheno=pheno,
                      covar=covar, leave_out_one_chrom=False,
                      GB_goal=GB_goal, force_full_rank=force_full_rank,
                      force_low_rank=force_low_rank, mixing=mixing, h2=h2)

    is_all = n_folds <= 1 or i_fold == n_folds

    scan_len = len(scan)
    k_list_in = [0]
    k_list_in.extend(int(k) for k in k_list if 0 < k < scan_len)

    top_snps = list(scan.SNP[:max_k]) if is_all else None

    k_index_to_nLL = None
    if i_fold != n_folds:
        k_index_to_nLL = []
        for k in k_list_in:
            top_k = G_for_chrom[:, G_for_chrom.sid_to_index(scan.SNP[:k])]
            logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr, i_fold, k))

            # With k == 0 there is no second kernel at all.
            k1_train = None
            if k > 0:
                k1_train = top_k[train_idx, :]
            lmm = FastLMM(force_full_rank=force_full_rank,
                          force_low_rank=force_low_rank,
                          GB_goal=GB_goal)
            # iid intersection means we can give the whole covariate and pheno
            lmm.fit(K0_train=K_train, K1_train=k1_train, X=covar, y=pheno,
                    mixing=mixing, h2raw=h2)

            k1_test = None
            if k > 0:
                k1_test = top_k[test_idx, :]
            k0_whole_test = K_whole_unittrain[:, test_idx]
            # iid intersection means we can give the whole covariate and pheno
            k_index_to_nLL.append(
                lmm.score(K0_whole_test=k0_whole_test, K1_whole_test=k1_test, X=covar, y=pheno))

    # Only fold 0 reports the k list.
    return (None if i_fold > 0 else k_list_in), top_snps, k_index_to_nLL
def mapper_gather_lots(i_fold_and_pair):
    """
    Run a single-SNP scan on one fold's training rows and score each candidate k.

    Relies on names from the enclosing scope (``G_for_chrom``, ``pheno``,
    ``covar``, ``k_list``, ``max_k``, ``n_folds``, ``test_chr``, ...).

    :param i_fold_and_pair: ``(i_fold, (train_idx, test_idx))`` -- the fold
        number and the index arrays selecting its train and test rows.

    :rtype: tuple ``(k_list_in, top_snps, k_index_to_nLL)``:

        * ``k_list_in`` -- ``[0]`` followed by the values of ``k_list`` that are
          strictly between 0 and the number of scan results; ``None`` when
          ``i_fold > 0``.
        * ``top_snps`` -- the first ``max_k`` SNP names of the scan when this is
          the "all" fold (``i_fold == n_folds``, or ``n_folds <= 1``), else ``None``.
        * ``k_index_to_nLL`` -- one ``FastLMM.score`` value per entry of the k
          list (presumably a negative log likelihood -- confirm against
          ``FastLMM.score``); ``None`` when ``i_fold == n_folds``.
    """
    i_fold, (train_idx, test_idx) = i_fold_and_pair
    logging.info("Working on GWAS_1K and k search, chrom={0}, i_fold={1}".format(test_chr, i_fold))

    train_snps = G_for_chrom[train_idx, :]

    # Precompute whole x whole standardized on train
    from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
    min_count = _internal_determine_block_size(G_for_chrom, None, None, force_full_rank, force_low_rank)
    block_size = _block_size_from_GB_goal(GB_goal, G_for_chrom.iid_count, min_count)
    whole_reader = _SnpWholeWithTrain(whole=G_for_chrom,
                                      train_idx=train_idx,
                                      standardizer=Unit(),
                                      block_size=block_size)
    K_whole_unittrain = whole_reader.read()

    assert np.array_equal(K_whole_unittrain.iid, G_for_chrom.iid), "real assert"
    K_train = K_whole_unittrain[train_idx]

    # iid intersection means we can give the whole covariate and pheno
    scan = single_snp(test_snps=train_snps, K0=K_train, pheno=pheno,
                      covar=covar, leave_out_one_chrom=False,
                      GB_goal=GB_goal, force_full_rank=force_full_rank,
                      force_low_rank=force_low_rank, mixing=mixing, h2=h2)

    is_all = n_folds <= 1 or i_fold == n_folds

    scan_len = len(scan)
    k_list_in = [0]
    k_list_in.extend(int(k) for k in k_list if 0 < k < scan_len)

    top_snps = list(scan.SNP[:max_k]) if is_all else None

    k_index_to_nLL = None
    if i_fold != n_folds:
        k_index_to_nLL = []
        for k in k_list_in:
            top_k = G_for_chrom[:, G_for_chrom.sid_to_index(scan.SNP[:k])]
            logging.info("Working on chr={0}, i_fold={1}, and K_{2}".format(test_chr, i_fold, k))

            # With k == 0 there is no second kernel at all.
            k1_train = None
            if k > 0:
                k1_train = top_k[train_idx, :]
            lmm = FastLMM(force_full_rank=force_full_rank,
                          force_low_rank=force_low_rank,
                          GB_goal=GB_goal)
            # iid intersection means we can give the whole covariate and pheno
            lmm.fit(K0_train=K_train, K1_train=k1_train, X=covar, y=pheno,
                    mixing=mixing, h2=h2)

            k1_test = None
            if k > 0:
                k1_test = top_k[test_idx, :]
            k0_whole_test = K_whole_unittrain[:, test_idx]
            # iid intersection means we can give the whole covariate and pheno
            k_index_to_nLL.append(
                lmm.score(K0_whole_test=k0_whole_test, K1_whole_test=k1_test, X=covar, y=pheno))

    # Only fold 0 reports the k list.
    return (None if i_fold > 0 else k_list_in), top_snps, k_index_to_nLL
def k_index_to_nLL_mapper(k):
    """
    Sum, across the cross-validation folds, the test score of a model built
    from each fold's top ``k`` SNPs.

    Relies on names from the enclosing scope (``test_snps``, ``G``, ``pheno``,
    ``covar``, ``fold_index_to_top_snps``, ``n_folds``, ``seed``, ``h2``, ...).

    :param k: how many of each fold's top SNPs to use; with ``k == 0`` the model
        is fit with no kernel and scored against a :class:`KernelIdentity`.
    :type k: int

    :rtype: the sum over folds of the first value returned by ``FastLMM.score``
        (named nLL here). The summed value and the mean MSE are also logged.
    """
    _, G_in, pheno_in, covar_in = _fixup(test_snps, G, pheno, covar, count_A1=count_A1)
    folds = _kfold(G.iid_count, n_folds, seed,
                   end_with_all=False, iid_to_index=G.iid_to_index)

    total_nll = 0
    total_mse = 0
    folds_seen = 0
    for fold_index, (train_idx, test_idx) in folds:
        assert set(train_idx).isdisjoint(set(test_idx)), "real assert"

        fold_snps = fold_index_to_top_snps[fold_index][:k]
        fold_sid_idx = G_in.sid_to_index(fold_snps)

        if k > 0:
            train_kernel = G_in[train_idx, fold_sid_idx]
        else:
            train_kernel = None
        lmm = FastLMM(force_full_rank=force_full_rank,
                      force_low_rank=force_low_rank,
                      GB_goal=GB_goal)
        # iid intersection means we can give the whole covariate and pheno
        lmm.fit(K0_train=train_kernel,
                X=covar_in[train_idx, :],
                y=pheno_in[train_idx, :],
                h2raw=h2)

        if k > 0:
            test_kernel = G_in[test_idx, fold_sid_idx]
        else:
            #!!! instead of this, which blows up when # of iids is large,
            # should switch to linear regression model when k is 0
            test_kernel = KernelIdentity(G_in.iid, G_in.iid[test_idx])
        # iid intersection means we can give the whole covariate and pheno
        fold_nll, fold_mse = lmm.score(K0_whole_test=test_kernel,
                                       X=covar_in[test_idx, :],
                                       y=pheno_in[test_idx, :],
                                       return_mse_too=True)
        total_nll += fold_nll
        total_mse += fold_mse
        folds_seen += 1

    logging.info("k={0},nLL={1},average mse={2}".format(k, total_nll, total_mse / folds_seen))
    return total_nll
def test_api(self):
    """
    Walk through the train/test standardization and kernel APIs, then check
    FastLMM predictions against the stored "whole" and "one" reference files.

    Several intermediate results are never asserted on; the calls are kept
    because running them is what exercises the API.
    """
    snps = self.snpreader_whole
    covariates = self.covariate_whole

    train_idx = np.r_[10:snps.iid_count]  # iids 10 and on
    test_idx = np.r_[0:10]  # the first 10 iids

    #####################################################
    # Train and standardize cov and then apply to test
    #####################################################
    cov_train, unit_trained = covariates[train_idx, :].read().standardize(Unit(), return_trained=True)
    cov_test = covariates[test_idx, :].read().standardize(unit_trained)

    #####################################################
    # standardize whole kernel from snps (both ways) and then pull out the 3 parts
    #####################################################
    # NOTE(review): this kernel is built from the covariates, not from the SNP reader -- confirm intended.
    whole_kernel = SnpKernel(covariates, Unit()).read().standardize(DiagKtoN())
    train_kernel = whole_kernel[train_idx].read(order='A', view_ok=True)
    test_kernel = whole_kernel[train_idx, test_idx].read(order='A', view_ok=True)
    test_test_kernel = whole_kernel[test_idx, test_idx].read(order='A', view_ok=True)

    #####################################################
    # create train_train, train_test, and test_test based on just the training snps (both standardizations)
    #####################################################
    K_train = SnpKernel(snps[train_idx, :], Unit(), block_size=100)
    train_train_kernel, snp_trained, kernel_trained = K_train._read_with_standardizing(
        to_kerneldata=True,
        kernel_standardizer=DiagKtoN(),
        return_trained=True)

    K_whole_test = _SnpWholeTest(train=snps[train_idx, :],
                                 test=snps[test_idx, :],
                                 standardizer=snp_trained,
                                 block_size=100)

    # The new reader may have the iids in a different order than the original reader
    train_idx2 = K_whole_test.iid0_to_index(snps.iid[train_idx])
    train_test_kernel = K_whole_test[train_idx2, :].read().standardize(kernel_trained)

    test_idx2 = K_whole_test.iid0_to_index(snps.iid[test_idx])
    test_test_kernel = K_whole_test[test_idx2, :].read().standardize(kernel_trained)

    #####################################################
    # How does predict look with whole_test as input?
    #####################################################

    # a. - standardize whole up front
    whole_kernel = SnpKernel(snps, Unit(), block_size=100).read().standardize()
    train_kernel = whole_kernel[train_idx].read(order='A', view_ok=True)
    whole_test_kernel = whole_kernel[:, test_idx].read(order='A', view_ok=True)

    fastlmm1 = FastLMM(snp_standardizer=SS_Identity(), kernel_standardizer=KS_Identity())
    # iid intersection means we won't really be using whole covar or pheno
    fastlmm1.fit(K0_train=train_kernel, X=covariates, y=self.pheno_whole)
    predicted_pheno, covar = fastlmm1.predict(K0_whole_test=whole_test_kernel,
                                              X=covariates,
                                              count_A1=False)
    output_file = self.file_name("whole")
    Dat.write(output_file, predicted_pheno)
    self.compare_files(predicted_pheno, "whole")

    # b -- just files
    fastlmm2 = FastLMM()
    # iid intersection means we won't really be using whole covar
    fastlmm2.fit(K0_train=snps[train_idx, :],
                 X=covariates,
                 y=self.pheno_whole[train_idx, :])
    predicted_pheno, covar = fastlmm2.predict(K0_whole_test=snps[test_idx, :],
                                              X=covariates,
                                              count_A1=False)
    self.compare_files(predicted_pheno, "one")
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None):
    """
    Method for predicting from a fitted linear-regression predictor.

    Only the covariates enter the computation; the two kernel arguments are
    checked and otherwise unused.

    :param X: testing covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
      After a column of 1's is appended, its sids must equal those seen during training.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_whole_test: Must be None or a :class:`KernelIdentity`. Represents the identity similarity matrix.
    :type K0_whole_test: None or :class:`KernelIdentity`

    :param K1_whole_test: Must be None or a :class:`KernelIdentity`. Represents the identity similarity matrix.
    :type K1_whole_test: None or :class:`KernelIdentity`

    :param iid_if_none: Examples to predict for if no X is provided (handed to ``_pheno_fixup``).
    :type iid_if_none: an ndarray of two strings

    :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
        (the identity scaled by ``ssres / iid_count`` from training).
    """
    assert self.is_fitted, "Can only predict after predictor has been fitted"
    for kernel in (K0_whole_test, K1_whole_test):
        assert kernel is None or isinstance(kernel, KernelIdentity)  # could also accept no snps

    covar = _pheno_fixup(X, iid_if_none=iid_if_none)
    covar = covar.read().standardize(self.covar_unit_trained)

    # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
    design = SnpData(iid=covar.iid,
                     sid=FastLMM._new_snp_name(covar),
                     val=np.c_[covar.read().val, np.ones((covar.iid_count, 1))])
    assert np.array_equal(design.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

    means = design.val.dot(self.beta).reshape(-1, 1)
    #!!!replace 'parent_string' with 'name'
    ret0 = SnpData(iid=design.iid,
                   sid=self.pheno_sid,
                   val=means,
                   pos=np.array([[np.nan, np.nan, np.nan]]),
                   name="linear regression Prediction")

    from pysnptools.kernelreader import KernelData
    covariance = np.eye(design.iid_count) * self.ssres / self.iid_count
    ret1 = KernelData(iid=design.iid, val=covariance)
    return ret0, ret1
def k_index_to_nLL_mapper(k):
    """
    Sum, across the cross-validation folds, the test score of a model built
    from each fold's top ``k`` SNPs.

    Relies on names from the enclosing scope (``test_snps``, ``G``, ``pheno``,
    ``covar``, ``fold_index_to_top_snps``, ``n_folds``, ``seed``, ``h2``, ...).

    :param k: how many of each fold's top SNPs to use; with ``k == 0`` the model
        is fit with no kernel and scored against a :class:`KernelIdentity`.
    :type k: int

    :rtype: the sum over folds of the first value returned by ``FastLMM.score``
        (named nLL here). The summed value and the mean MSE are also logged.
    """
    _, G_in, pheno_in, covar_in = _fixup(test_snps, G, pheno, covar, count_A1=count_A1)
    folds = _kfold(G.iid_count, n_folds, seed,
                   end_with_all=False, iid_to_index=G.iid_to_index)

    total_nll = 0
    total_mse = 0
    folds_seen = 0
    for fold_index, (train_idx, test_idx) in folds:
        assert set(train_idx).isdisjoint(set(test_idx)), "real assert"

        fold_snps = fold_index_to_top_snps[fold_index][:k]
        fold_sid_idx = G_in.sid_to_index(fold_snps)

        if k > 0:
            train_kernel = G_in[train_idx, fold_sid_idx]
        else:
            train_kernel = None
        lmm = FastLMM(force_full_rank=force_full_rank,
                      force_low_rank=force_low_rank,
                      GB_goal=GB_goal)
        # iid intersection means we can give the whole covariate and pheno
        lmm.fit(K0_train=train_kernel,
                X=covar_in[train_idx, :],
                y=pheno_in[train_idx, :],
                h2raw=h2)

        if k > 0:
            test_kernel = G_in[test_idx, fold_sid_idx]
        else:
            #!!! instead of this, which blows up when # of iids is large,
            # should switch to linear regression model when k is 0
            test_kernel = KernelIdentity(G_in.iid, G_in.iid[test_idx])
        # iid intersection means we can give the whole covariate and pheno
        fold_nll, fold_mse = lmm.score(K0_whole_test=test_kernel,
                                       X=covar_in[test_idx, :],
                                       y=pheno_in[test_idx, :],
                                       return_mse_too=True)
        total_nll += fold_nll
        total_mse += fold_mse
        folds_seen += 1

    logging.info("k={0},nLL={1},average mse={2}".format(k, total_nll, total_mse / folds_seen))
    return total_nll
def test_notebook1(self):
    """
    Replay the notebook example: fit on the first 400 individuals of the synth
    data set, then predict on the training rows and on rows 400-499, checking
    that predictions come back in phenotype iid order. Plotting is off by default.
    """
    do_plot = False
    import matplotlib.pyplot as plt
    from pysnptools.snpreader import Pheno, Bed

    def show(actual, predicted, cov_kernel, xlabel, ylabel):
        # Scatter of actual vs. predicted with a y=x guide line and error bars
        # taken from the diagonal of the predicted covariance.
        plt.plot(actual, predicted.val, "b.", [-5, 5], [-5, 5], "-r")
        plt.errorbar(actual, predicted.val, yerr=np.sqrt(np.diag(cov_kernel.val)), fmt='.')
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.show()

    bed = Bed(self.pythonpath + "/tests/datasets/synth/all", count_A1=False)
    cov = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
    pheno = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt").read()

    # Now we learn from the first 400 students.
    #!!!later: the learning code doesn't like it if there are two instances of bed[:400] that are not "is -equal"
    training = bed[:400, :]
    fastlmm2 = FastLMM(GB_goal=2).fit(K0_train=training, X=cov[:400, :], y=pheno[:400, :])

    # Predict on training data:
    predicted_score, covariance = fastlmm2.predict(K0_whole_test=training,
                                                   X=cov[:400, :],
                                                   count_A1=False)
    assert np.array_equal(pheno.iid[:400], predicted_score.iid), "for plots to make sense, the iids must be in the order"
    if do_plot:
        show(pheno.val[:400, :], predicted_score, covariance,
             'score (actual train)', 'predicted (test on train with stdev)')

    # How well does this model predict the (unseen) TEST data?
    predicted_score, covariance = fastlmm2.predict(K0_whole_test=bed[400:500, :],
                                                   X=cov[400:500, :],
                                                   count_A1=False)
    assert np.array_equal(pheno.iid[400:500], predicted_score.iid), "for plots to make sense, the iids must be in the order"
    if do_plot:
        show(pheno.val[400:500, :], predicted_score, covariance,
             'score (actual test)', 'predicted')
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, count_A1=None):
    """
    Method for predicting from a fitted linear-regression predictor.

    Only the covariates enter the computation; the two kernel arguments are
    checked and otherwise unused.

    :param X: testing covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
      After a column of 1's is appended, its sids must equal those seen during training.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_whole_test: Must be None or a :class:`KernelIdentity`. Represents the identity similarity matrix.
    :type K0_whole_test: None or :class:`KernelIdentity`

    :param K1_whole_test: Must be None or a :class:`KernelIdentity`. Represents the identity similarity matrix.
    :type K1_whole_test: None or :class:`KernelIdentity`

    :param iid_if_none: Examples to predict for if no X is provided (handed to ``_pheno_fixup``).
    :type iid_if_none: an ndarray of two strings

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: A :class:`SnpData` of the means and a :class:`KernelData` of the covariance
        (the identity scaled by ``ssres / iid_count`` from training).
    """
    assert self.is_fitted, "Can only predict after predictor has been fitted"
    for kernel in (K0_whole_test, K1_whole_test):
        assert kernel is None or isinstance(kernel, KernelIdentity)  # could also accept no snps

    covar = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1)
    covar = covar.read().standardize(self.covar_unit_trained)

    # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
    design = SnpData(iid=covar.iid,
                     sid=FastLMM._new_snp_name(covar),
                     val=np.c_[covar.read().val, np.ones((covar.iid_count, 1))])
    assert np.array_equal(design.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

    means = design.val.dot(self.beta).reshape(-1, 1)
    #!!!replace 'parent_string' with 'name'
    ret0 = SnpData(iid=design.iid,
                   sid=self.pheno_sid,
                   val=means,
                   pos=np.array([[np.nan, np.nan, np.nan]]),
                   name="linear regression Prediction")

    from pysnptools.kernelreader import KernelData
    covariance = np.eye(design.iid_count) * self.ssres / self.iid_count
    ret1 = KernelData(iid=design.iid, val=covariance)
    return ret0, ret1
def fit(self, X=None, y=None, K0_train=None, K1_train=None, h2=None, mixing=None, count_A1=None):
    """
    Method for training a linear-regression predictor. If the examples in X and y are not the same, they will be
    reordered and intersected.

    :param X: training covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as
      `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or
      `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param y: training phenotype (exactly one column), required:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type y: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as
      `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or
      `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param K0_train: Must be None. Represents the identity similarity matrix.
    :type K0_train: None

    :param K1_train: Must be None. Represents the identity similarity matrix.
    :type K1_train: None

    :param h2: Ignored. Optional.
    :type h2: number

    :param mixing: Ignored. Optional.
    :type mixing: number

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: self, the fitted Linear Regression predictor
    """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        assert K0_train is None  # could also accept that ID or no snps
        assert K1_train is None  # could also accept that ID or no snps

        assert y is not None, "y must be given"

        y = _pheno_fixup(y, count_A1=count_A1)
        assert y.sid_count == 1, "Expect y to be just one variable"
        X = _pheno_fixup(X, iid_if_none=y.iid, count_A1=count_A1)

        X, y = intersect_apply([X, y])
        y = y.read()
        X, covar_unit_trained = X.read().standardize(self.covariate_standardizer, return_trained=True)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        X = SnpData(iid=X.iid,
                    sid=FastLMM._new_snp_name(X),
                    val=np.c_[X.val, np.ones((X.iid_count, 1))])

        target = y.val[:, 0]
        # lstsq returns (weights, squared residuals, rank of design matrix, singular values)
        beta, residuals, _rank, _singular = np.linalg.lstsq(X.val, target, rcond=-1)
        if residuals.size > 0:
            ssres = float(residuals[0])
        else:
            # lstsq returns an empty residual array when the design matrix is
            # rank deficient or not over-determined; compute the sum of squares directly.
            resid = target - X.val.dot(beta)
            ssres = float(resid.dot(resid))

        self.beta = beta
        self.ssres = ssres
        self.sstot = ((y.val - y.val.mean())**2).sum()
        self.covar_unit_trained = covar_unit_trained
        self.iid_count = X.iid_count
        self.covar_sid = X.sid
        self.pheno_sid = y.sid
        # Flag as fitted only once every learned attribute is in place, so a
        # failed fit cannot leave the predictor claiming to be usable.
        self.is_fitted = True
        return self
def predict(self, X=None, K0_whole_test=None, K1_whole_test=None, iid_if_none=None, count_A1=None):
    """
    Method for predicting from a fitted linear-regression predictor.

    Only the covariates enter the computation; the two kernel arguments are
    checked and otherwise unused.

    :param X: testing covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
      After a column of 1's is appended, its sids must equal those seen during training.
    :type X: a PySnpTools `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ (such as
      `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or
      `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__) or string.

    :param K0_whole_test: Must be None or a :class:`KernelIdentity`. Represents the identity similarity matrix.
    :type K0_whole_test: None or :class:`KernelIdentity`

    :param K1_whole_test: Must be None or a :class:`KernelIdentity`. Represents the identity similarity matrix.
    :type K1_whole_test: None or :class:`KernelIdentity`

    :param iid_if_none: Examples to predict for if no X is provided (handed to ``_pheno_fixup``).
    :type iid_if_none: an ndarray of two strings

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: A `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__ of the means and a
        :class:`KernelData` of the covariance (the identity scaled by ``ssres / iid_count`` from training).
    """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        assert self.is_fitted, "Can only predict after predictor has been fitted"
        for kernel in (K0_whole_test, K1_whole_test):
            assert kernel is None or isinstance(kernel, KernelIdentity)  # could also accept no snps

        covar = _pheno_fixup(X, iid_if_none=iid_if_none, count_A1=count_A1)
        covar = covar.read().standardize(self.covar_unit_trained)

        # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
        design = SnpData(iid=covar.iid,
                         sid=FastLMM._new_snp_name(covar),
                         val=np.c_[covar.read().val, np.ones((covar.iid_count, 1))])
        assert np.array_equal(design.sid, self.covar_sid), "Expect covar sids to be the same in train and test."

        means = design.val.dot(self.beta).reshape(-1, 1)
        #!!!replace 'parent_string' with 'name'
        ret0 = SnpData(iid=design.iid,
                       sid=self.pheno_sid,
                       val=means,
                       pos=np.array([[np.nan, np.nan, np.nan]]),
                       name="linear regression Prediction")

        from pysnptools.kernelreader import KernelData
        covariance = np.eye(design.iid_count) * self.ssres / self.iid_count
        ret1 = KernelData(iid=design.iid, val=covariance)
        return ret0, ret1
def run_fastlmm(args):
    """
    Train and evaluate one FastLMM model per trait and save the results.

    The phenotype table (``args.phenotype_file``) must have an ``id`` column,
    the columns ``trait1``..``trait3`` and, unless ``args.cvindex_file`` is
    given, a ``type`` column with values ``training`` / ``test``.

    For every trait this writes into ``args.output_dir``:

    * ``pheno.<trait>.txt`` -- tab-separated fid/iid/value rows of the training set,
    * ``predictions.<trait>`` -- HDF5 file with datasets ``y_mean``, ``y_var``,
      ``y_true``, ``h2raw`` and ``sel_snps``,
    * ``model.fastlmm.<trait>`` -- the fitted model serialized with ``dill``.

    :param args: parsed command-line options. Attributes read here:
        ``phenotype_file``, ``cvindex_file``, ``snp_file``, ``n_snps``,
        ``transpose_x``, ``k0_file``, ``transpose_k0``, ``seed``,
        ``output_dir``, ``GB_goal`` and ``penalty``.
    """
    from pysnptools.snpreader import Pheno
    from utils import prepare_output_file, read_cvindex
    from fastlmm.inference import FastLMM
    import dill as pickle

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # the same id is used for both the family id and the individual id column
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis], 2, axis=1)

    if args.cvindex_file is not None:
        logger.info('read indices from file: ' + args.cvindex_file)
        train_index, test_index = read_cvindex(args.cvindex_file)
    else:
        train_index = np.nonzero((phenotypes['type'] == 'training').values)[0]
        test_index = np.nonzero((phenotypes['type'] == 'test').values)[0]

    # Seed before any random draw so that the SNP sampling below is
    # reproducible. (This used to call the non-existent ``np.seed`` and only
    # after the sampling had already happened.)
    if args.seed:
        logger.info('set random seed for numpy: %d' % args.seed)
        np.random.seed(args.seed)

    n_snps_total = get_num_snps(args.snp_file)
    n_snps_sel = min(n_snps_total, args.n_snps)
    logger.info('number of sampled SNPs: %d' % n_snps_sel)
    # NOTE(review): np.random.choice samples WITH replacement by default, so
    # sel_snps may contain duplicates -- confirm whether replace=False was intended.
    sel_snps = np.random.choice(n_snps_total, size=n_snps_sel)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file,
                            transpose=args.transpose_x,
                            snp_indices=sel_snps,
                            std_filter_indices=train_index)
    logger.info('number of sampled SNPs after filtering by std: %d' % test_snps.shape[1])
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file, transpose=args.transpose_k0)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    df_pheno = phenotypes.copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno.loc[train_index, ['fid', 'iid', trait]].to_csv(
            pheno_file, index=False, sep='\t', header=False)
        pheno = Pheno(pheno_file)

        logger.info('train FastLMM model for %s' % trait)
        model = FastLMM(GB_goal=args.GB_goal, force_low_rank=True)
        model.fit(X=test_snps[train_index, :], y=pheno, K0_train=K0,
                  penalty=args.penalty, Smin=1.0)
        logger.info('fitted h2: %f' % model.h2raw)

        logger.info('predict using the FastLMM model for %s' % trait)
        y_mean, y_var = model.predict(X=test_snps[test_index, :],
                                      K0_whole_test=K0[test_index, :])
        y_true = phenotypes[trait][test_index].values

        result_file = os.path.join(args.output_dir, 'predictions.%s' % trait)
        logger.info('save predictions to file: ' + result_file)
        prepare_output_file(result_file)
        with h5py.File(result_file, 'w') as f:
            f.create_dataset('y_mean', data=y_mean.val)
            f.create_dataset('y_var', data=y_var.val)
            f.create_dataset('y_true', data=y_true)
            f.create_dataset('h2raw', data=model.h2raw)
            f.create_dataset('sel_snps', data=sel_snps)

        model_file = os.path.join(args.output_dir, 'model.fastlmm.%s' % trait)
        logger.info('save model to file: ' + model_file)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
def fit(self, X=None, y=None, K0_train=None, K1_train=None, h2=None, mixing=None):
    """
    Method for training a linear-regression predictor. If the examples in X and y are not the same, they will be
    reordered and intersected.

    :param X: training covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param y: training phenotype (exactly one column), required:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_train: Must be None. Represents the identity similarity matrix.
    :type K0_train: None

    :param K1_train: Must be None. Represents the identity similarity matrix.
    :type K1_train: None

    :param h2: Ignored. Optional.
    :type h2: number

    :param mixing: Ignored. Optional.
    :type mixing: number

    :rtype: self, the fitted Linear Regression predictor
    """
    assert K0_train is None  # could also accept that ID or no snps
    assert K1_train is None  # could also accept that ID or no snps

    assert y is not None, "y must be given"

    y = _pheno_fixup(y)
    assert y.sid_count == 1, "Expect y to be just one variable"
    X = _pheno_fixup(X, iid_if_none=y.iid)

    X, y = intersect_apply([X, y])
    y = y.read()
    X, covar_unit_trained = X.read().standardize(self.covariate_standardizer, return_trained=True)

    # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
    X = SnpData(iid=X.iid,
                sid=FastLMM._new_snp_name(X),
                val=np.c_[X.val, np.ones((X.iid_count, 1))])

    target = y.val[:, 0]
    # rcond=-1 pins the historical cutoff explicitly: without it NumPy 1.14+
    # emits a FutureWarning and NumPy 2.x silently switches to a different default.
    # lstsq returns (weights, squared residuals, rank of design matrix, singular values)
    beta, residuals, _rank, _singular = np.linalg.lstsq(X.val, target, rcond=-1)
    if residuals.size > 0:
        ssres = float(residuals[0])
    else:
        # lstsq returns an empty residual array when the design matrix is
        # rank deficient or not over-determined; compute the sum of squares directly.
        resid = target - X.val.dot(beta)
        ssres = float(resid.dot(resid))

    self.beta = beta
    self.ssres = ssres
    self.sstot = ((y.val - y.val.mean())**2).sum()
    self.covar_unit_trained = covar_unit_trained
    self.iid_count = X.iid_count
    self.covar_sid = X.sid
    self.pheno_sid = y.sid
    # Flag as fitted only once every learned attribute is in place, so a
    # failed fit cannot leave the predictor claiming to be usable.
    self.is_fitted = True
    return self
def fit(self, X=None, y=None, K0_train=None, K1_train=None, h2=None, mixing=None, count_A1=None):
    """
    Method for training a linear-regression predictor. If the examples in X and y are not the same, they will be
    reordered and intersected.

    :param X: training covariate information, optional:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type X: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param y: training phenotype (exactly one column), required:
      If you give a string, it should be the file name of a PLINK phenotype-formatted file.
    :type y: a PySnpTools :class:`SnpReader` (such as :class:`Pheno` or :class:`SnpData`) or string.

    :param K0_train: Must be None. Represents the identity similarity matrix.
    :type K0_train: None

    :param K1_train: Must be None. Represents the identity similarity matrix.
    :type K1_train: None

    :param h2: Ignored. Optional.
    :type h2: number

    :param mixing: Ignored. Optional.
    :type mixing: number

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: self, the fitted Linear Regression predictor
    """
    assert K0_train is None  # could also accept that ID or no snps
    assert K1_train is None  # could also accept that ID or no snps

    assert y is not None, "y must be given"

    y = _pheno_fixup(y, count_A1=count_A1)
    assert y.sid_count == 1, "Expect y to be just one variable"
    X = _pheno_fixup(X, iid_if_none=y.iid, count_A1=count_A1)

    X, y = intersect_apply([X, y])
    y = y.read()
    X, covar_unit_trained = X.read().standardize(self.covariate_standardizer, return_trained=True)

    # add a column of 1's to cov to increase DOF of model (and accuracy) by allowing a constant offset
    X = SnpData(iid=X.iid,
                sid=FastLMM._new_snp_name(X),
                val=np.c_[X.val, np.ones((X.iid_count, 1))])

    target = y.val[:, 0]
    # lstsq returns (weights, squared residuals, rank of design matrix, singular values)
    beta, residuals, _rank, _singular = np.linalg.lstsq(X.val, target, rcond=-1)
    if residuals.size > 0:
        ssres = float(residuals[0])
    else:
        # lstsq returns an empty residual array when the design matrix is
        # rank deficient or not over-determined; compute the sum of squares directly.
        resid = target - X.val.dot(beta)
        ssres = float(resid.dot(resid))

    self.beta = beta
    self.ssres = ssres
    self.sstot = ((y.val - y.val.mean())**2).sum()
    self.covar_unit_trained = covar_unit_trained
    self.iid_count = X.iid_count
    self.covar_sid = X.sid
    self.pheno_sid = y.sid
    # Flag as fitted only once every learned attribute is in place, so a
    # failed fit cannot leave the predictor claiming to be usable.
    self.is_fitted = True
    return self