# Imports assumed by the standalone snippets below; in fastlmm they originate
# from several different modules.
import logging
import time

import numpy as np
import scipy as sp
from sklearn.metrics import mean_squared_error


def _find_mixing_from_Ks(K, covar, K0_val, K1_val, h2, y):
    logging.info("starting _find_mixing_from_Ks")
    import fastlmm.util.mingrid as mingrid
    assert h2 is None, "if mixing is None, expect h2 to also be None"
    resmin = [None]  # mutable cell so the closure below can record the best result

    def f(mixing, K0_val=K0_val, K1_val=K1_val, covar=covar, y=y, **kwargs):
        # minimize1D may pass a length-1 array; unwrap it to a scalar
        if not isinstance(mixing, (int, float, complex)):
            assert mixing.ndim == 1 and mixing.shape[0] == 1
            mixing = mixing[0]
        # _mix_from_Ks and lmm_cov are defined elsewhere in this module
        _mix_from_Ks(K, K0_val, K1_val, mixing)
        lmm = lmm_cov(X=covar, Y=y, G=None, K=K, inplace=True)
        result = lmm.findH2()
        if (resmin[0] is None) or (result['nLL'] < resmin[0]['nLL']):
            resmin[0] = result
        logging.debug("mixing_from_Ks\t{0}\th2\t{1}\tnLL\t{2}".format(
            mixing, result['h2'], result['nLL']))
        assert not np.isnan(result['nLL']), "nLL should be a number (not a NaN)"
        return result['nLL']

    mixing, nLL = mingrid.minimize1D(f=f, nGrid=10, minval=0.0, maxval=1.0, verbose=False)
    if not isinstance(mixing, (int, float, complex)):
        assert mixing.ndim == 1 and mixing.shape[0] == 1
        mixing = mixing[0]
    h2 = resmin[0]['h2']
    return mixing, h2
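
# The body of _mix_from_Ks is not shown here. Given the [0, 1] search bounds
# above, it is presumably the convex combination sketched below, written into
# the preallocated K in place; treat this as a hypothetical reconstruction,
# not the canonical fastlmm implementation.
def _mix_from_Ks_sketch(K, K0_val, K1_val, mixing):
    # weight the two kernels by (1 - mixing) and mixing, storing the result in K
    K[:, :] = K0_val * (1.0 - mixing) + K1_val * mixing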
def estimate_tau(self, beta, ste):
    # MetaAnalysis is defined elsewhere in this module
    meta = MetaAnalysis(beta=beta, ste=ste, tau=0)

    def f(x):
        # negate so that maximizing the (restricted) likelihood becomes a minimization
        return -meta.log_likelihood(tau=x, mean_beta=None, reml=self.reml)

    # mean(beta**2) upper-bounds the between-study variance, so it caps the search
    tau = mingrid.minimize1D(f, evalgrid=None, nGrid=10, minval=0.0,
                             maxval=(beta * beta).mean(), verbose=False,
                             brent=True, check_boundaries=True,
                             resultgrid=None, return_grid=False)
    return tau[0]  # minimize1D returns (argmin, min); keep only the tau estimate
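
# A minimal, self-contained illustration of the objective-negation idiom used in
# estimate_tau: fit a zero-mean Gaussian variance by minimizing its negative
# log-likelihood with the same minimize1D call. The toy model and bounds are
# assumptions for demonstration; only the minimize1D usage mirrors the code above.
def _example_variance_fit(y):
    import numpy as np
    import fastlmm.util.mingrid as mingrid

    def neg_loglik(s2):
        s2 = max(s2, 1e-12)  # guard the s2 == 0 edge of the search interval
        return 0.5 * (len(y) * np.log(2 * np.pi * s2) + (y * y).sum() / s2)

    # the analytic minimum is at s2 = mean(y**2), so 4 * mean(y**2) brackets it
    s2_hat, _ = mingrid.minimize1D(f=neg_loglik, nGrid=10, minval=0.0,
                                   maxval=4 * (y * y).mean(), verbose=False)
    return s2_hat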
def fit_scale_logP(self, dof=None):
    '''
    Extracts the top qmax lrt values to do the fit.
    '''
    if dof is None:
        dof = self.dof
    resmin = [None]

    def f(x):
        scale = x
        err, imax = self.scale_dof_obj(scale, dof)
        if (resmin[0] is None) or (resmin[0]['mse'] > err):
            resmin[0] = {  # bookkeeping for mingrid.minimize1D
                'mse': err,
                'dof': dof,
                'scale': scale,
                'imax': imax,
            }
        return err

    mingrid.minimize1D(f=f, nGrid=10, minval=self.scalemin, maxval=self.scalemax)
    return resmin[0]
def _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y):
    import fastlmm.util.mingrid as mingrid
    assert h2 is None, "if mixing is None, expect h2 to also be None"
    resmin = [None]

    def f(mixing, G0_standardized_val=G0_standardized_val,
          G1_standardized_val=G1_standardized_val, covar=covar, y=y, **kwargs):
        _mix(G, G0_standardized_val, G1_standardized_val, mixing)
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)
        result = lmm.findH2()
        if (resmin[0] is None) or (result['nLL'] < resmin[0]['nLL']):
            resmin[0] = result
        return result['nLL']

    mixing, nLL = mingrid.minimize1D(f=f, nGrid=10, minval=0.0, maxval=1.0, verbose=False)
    h2 = resmin[0]['h2']
    return mixing, h2
def fit_params_Qreg(self):
    '''
    Fit the scale and dof parameters of the model by minimizing the squared
    error between the model log quantiles and the log P-values obtained on the
    lrt values. Only the top qmax quantile is used for the fit (self.qmax is
    used in fit_scale_logP).
    '''
    if self.isortlrt is None:
        self.isortlrt = self.lrt.argsort()[::-1]
        self.qnulllrtsort = (0.5 + sp.arange(self.mixture * self.isortlrt.shape[0])) \
            / (self.mixture * self.isortlrt.shape[0])
        self.lrtsort = self.lrt[self.isortlrt]
    # a mutable cell lets the closures below record the best fit; a plain local
    # variable could not be rebound from inside them
    resmin = [None]
    if self.fitdof:  # fit both scale and dof
        def f(x):
            res = self.fit_scale_logP(dof=x)
            if (resmin[0] is None) or (res['mse'] < resmin[0]['mse']):
                resmin[0] = res
            return res['mse']
    else:
        def f(x):  # fit only scale
            scale = x
            mse, imax = self.scale_dof_obj(scale, self.dof)
            if (resmin[0] is None) or (resmin[0]['mse'] > mse):
                resmin[0] = {  # bookkeeping for mingrid.minimize1D
                    'mse': mse,
                    'dof': self.dof,
                    'scale': scale,
                    'imax': imax,
                }
            return mse

    mingrid.minimize1D(f=f, nGrid=10, minval=self.dofmin, maxval=self.dofmax)
    self.dof = resmin[0]['dof']
    self.scale = resmin[0]['scale']
    self.imax = resmin[0]['imax']
    return resmin[0]
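
# Skeleton of the nested search that fit_params_Qreg and fit_scale_logP perform
# together: an outer minimize1D over dof whose objective runs an inner
# minimize1D over scale. err(scale, dof) stands in for self.scale_dof_obj and
# the bounds are placeholders; the mutable cells mirror the resmin bookkeeping
# above, since minimize1D only returns the argmin and its objective value.
def _example_nested_fit(err, dofmin=1.0, dofmax=10.0, scalemin=0.1, scalemax=5.0):
    import fastlmm.util.mingrid as mingrid
    best = [None]

    def fit_scale(dof):
        local = [None]

        def inner(scale):
            e = err(scale, dof)
            if local[0] is None or e < local[0]['mse']:
                local[0] = {'mse': e, 'dof': dof, 'scale': scale}
            return e

        mingrid.minimize1D(f=inner, nGrid=10, minval=scalemin, maxval=scalemax)
        return local[0]

    def outer(dof):
        res = fit_scale(dof)
        if best[0] is None or res['mse'] < best[0]['mse']:
            best[0] = res
        return res['mse']

    mingrid.minimize1D(f=outer, nGrid=10, minval=dofmin, maxval=dofmax)
    return best[0]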
def dowork(self, fold_idx):
    self.feature_selection_strategy.run_once()

    # clip each requested k to the number of available SNPs
    for i_k, k in enumerate(self.k_values):
        self.k_values[i_k] = min(self.k_values[i_k], self.feature_selection_strategy.snpreader.sid_count)
    max_k = max([1] + [k for k in self.k_values if k != self.feature_selection_strategy.snpreader.sid_count])

    split_iterator = self.feature_selection_strategy.setup_linear_regression(max_k, start=fold_idx, stop=None)
    fold_data = next(split_iterator)

    tt0 = time.time()

    if self.strategy == "lmm_full_cv":
        mse_cv1 = np.zeros((len(self.k_values), len(self.delta_values)))
        ll_cv1 = np.zeros((len(self.k_values), len(self.delta_values)))
        best_delta_for_k_1 = None
    elif self.strategy == "insample_cv":
        mse_cv1 = np.zeros((len(self.k_values)))
        ll_cv1 = np.zeros((len(self.k_values)))
        best_delta_for_k_1 = np.zeros((len(self.k_values)))
    else:
        raise NotImplementedError("not implemented")

    logging.info("reporter:counter:PerformSelectionDistributable,foldcount,1")

    for k_idx, k in enumerate(self.k_values):
        logging.info("processing fold={0}, k={1}".format(fold_idx, k))
        logging.info("reporter:status:processing fold={0}, k={1}".format(fold_idx, k))
        logging.info("reporter:counter:PerformSelectionDistributable,k,1")

        # `fastlmm` here is presumably the fastlmm.inference module in the original file
        model = fastlmm.getLMM()

        # compute the kernel externally when it cannot be built from cached features
        if k == self.feature_selection_strategy.snpreader.sid_count or k >= self.feature_selection_strategy.num_snps_in_memory:
            if k == self.feature_selection_strategy.snpreader.sid_count:
                # use precomputed kernel on all snps
                logging.info("using precomputed kernel on all snps")
                K = self.feature_selection_strategy.K
            else:
                # build kernel in blocks from snpreader (from file)
                logging.info("building kernel in blocks")
                top_k_feat_idx = fold_data["feat_idx"][0:int(k)]
                subset = self.feature_selection_strategy.snpreader[:, top_k_feat_idx]
                K = subset.kernel(self.feature_selection_strategy.standardizer,
                                  blocksize=self.feature_selection_strategy.blocksize)

            train_idx = fold_data["train_idx"]
            test_idx = fold_data["test_idx"]

            K_train_lhs = K[train_idx]
            K_train = K_train_lhs[:, train_idx]
            K_train_test = K_train_lhs[:, test_idx].T
            K_test_test = K[test_idx][:, test_idx]

            model.setK(K_train)
            model.setTestData(Xstar=fold_data["X_test"], K0star=K_train_test)
        else:
            # use precomputed features as before
            logging.info("using cached data to build kernel")
            outer_G_train = fold_data["G_train"][:, 0:k]
            outer_G_test = fold_data["G_test"][:, 0:k]
            model.setG(outer_G_train.val)
            model.setTestData(Xstar=fold_data["X_test"], G0star=outer_G_test.val)
            K_test_test = None

        model.sety(fold_data["y_train"])
        model.setX(fold_data["X_train"])

        if self.strategy == "lmm_full_cv":
            REML = True  # TODO: confirm whether REML should be True or False here
            for delta_idx, delta_act in enumerate(self.delta_values):
                delta = delta_act * k if k else delta_act  # rescale delta by k
                # fit on the training fold, then predict on the held-out test set
                res = model.nLLeval(delta=delta, REML=REML, penalty=self.penalty)
                out = model.predictMean(beta=res["beta"], delta=delta)
                mse_cv1[k_idx, delta_idx] = mean_squared_error(fold_data["y_test"], out)
                ll_cv1[k_idx, delta_idx] = model.nLLeval_test(fold_data["y_test"], res["beta"],
                                                              sigma2=res["sigma2"], delta=delta,
                                                              Kstar_star=K_test_test)
        elif self.strategy == "insample_cv":
            REML = True
            # With brent=True the optimizer refines off the grid, so the best
            # delta generally deviates from the values in self.delta_values.
            brent = True

            # evaluate the negative log-likelihood over the grid of delta values
            import fastlmm.util.mingrid as mingrid
            resmin = [None]

            def f(x):
                delta_corr = x * k if k else x  # rescale delta by k
                myres = model.nLLeval(delta=delta_corr, REML=REML, penalty=self.penalty)
                if (resmin[0] is None) or (myres['nLL'] < resmin[0]['nLL']):
                    resmin[0] = myres
                    resmin[0]["delta_corr"] = delta_corr
                    resmin[0]["delta"] = x
                return myres["nLL"]

            mingrid.minimize1D(f, evalgrid=self.delta_values, brent=brent)

            out = model.predictMean(beta=resmin[0]["beta"], delta=resmin[0]["delta_corr"])
            mse_cv1[k_idx] = mean_squared_error(fold_data["y_test"], out)
            ll_cv1[k_idx] = model.nLLeval_test(fold_data["y_test"], resmin[0]["beta"],
                                               sigma2=resmin[0]["sigma2"],
                                               delta=resmin[0]["delta_corr"],
                                               Kstar_star=K_test_test)
            best_delta_for_k_1[k_idx] = resmin[0]["delta"]

    logging.info("crossval time %.2f s" % (float(time.time() - tt0)))

    return fold_idx, mse_cv1, ll_cv1, best_delta_for_k_1
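
# Sketch of the evalgrid-plus-Brent pattern from the insample_cv branch above:
# minimize1D first evaluates f on the caller-supplied grid, then (with
# brent=True) refines around the best grid point, so the returned optimum
# generally lies off the grid. The grid and log-quadratic objective below are
# illustrative assumptions, not values from the selection code.
def _example_grid_then_brent():
    import numpy as np
    import fastlmm.util.mingrid as mingrid

    delta_values = np.logspace(-3, 3, 7)  # hypothetical delta grid

    def f(delta):
        return np.log(delta) ** 2  # toy objective with its minimum at delta = 1

    delta_opt, f_opt = mingrid.minimize1D(f, evalgrid=delta_values, brent=True)
    return delta_opt, f_opt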