Example #1
def _find_mixing_from_Ks(K, covar, K0_val, K1_val, h2, y):
    logging.info("starting _find_mixing_from_Ks")
    import fastlmm.util.mingrid as mingrid
    assert h2 is None, "if mixing is None, expect h2 to also be None"
    resmin = [None]  # one-element list so the nested objective can record its best result
    def f(mixing,K0_val=K0_val,K1_val=K1_val,covar=covar,y=y,**kwargs):

        if not isinstance(mixing, (int, float, complex)):
            # minimize1D may hand back a length-1 array; unpack it to a scalar
            assert mixing.ndim == 1 and mixing.shape[0] == 1
            mixing = mixing[0]

        _mix_from_Ks(K, K0_val, K1_val, mixing)
        lmm = lmm_cov(X=covar, Y=y, G=None, K=K, inplace=True)
        result = lmm.findH2()
        if (resmin[0] is None) or (result['nLL'] < resmin[0]['nLL']):
            resmin[0] = result
        logging.debug("mixing_from_Ks\t{0}\th2\t{1}\tnLL\t{2}".format(mixing, result['h2'], result['nLL']))
        #logging.info("reporter:counter:single_snp,find_mixing_from_Ks_count,1")
        assert not np.isnan(result['nLL']), "nLL should be a number (not a NaN)"
        return result['nLL']
    mixing, nLL = mingrid.minimize1D(f=f, nGrid=10, minval=0.0, maxval=1.0, verbose=False)

    if not isinstance(mixing, (int, float, complex)):
        assert mixing.ndim == 1 and mixing.shape[0] == 1
        mixing = mixing[0]
    h2 = resmin[0]['h2']
    return mixing, h2
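
All of the examples on this page share one calling convention: mingrid.minimize1D minimizes a scalar objective over [minval, maxval] by grid search, optionally refined with Brent's method, and returns the pair (xopt, fopt); note how examples #1 and #2 also guard against xopt coming back as a length-1 array. Below is a minimal sketch of that pattern, assuming fastlmm is installed; the quadratic objective and its target value 0.3 are illustrative stand-ins for the LMM negative log-likelihood:

import fastlmm.util.mingrid as mingrid

resmin = [None]  # mutable cell so the objective can record its best evaluation

def f(x):
    # stand-in objective with its minimum at x = 0.3 (illustrative only)
    nll = (x - 0.3) ** 2
    if resmin[0] is None or nll < resmin[0]:
        resmin[0] = nll
    return nll

xopt, fopt = mingrid.minimize1D(f=f, nGrid=10, minval=0.0, maxval=1.0, verbose=False)
print(xopt, fopt)  # xopt should land near 0.3
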
Example #2
def _find_mixing_from_Ks(K, covar, K0_val, K1_val, h2, y):
    logging.info("starting _find_mixing_from_Ks")
    import fastlmm.util.mingrid as mingrid
    assert h2 is None, "if mixing is None, expect h2 to also be None"
    resmin=[None]
    def f(mixing,K0_val=K0_val,K1_val=K1_val,covar=covar,y=y,**kwargs):

        if not isinstance(mixing, (int, float, complex)):
            assert mixing.ndim == 1 and mixing.shape[0] == 1
            mixing = mixing[0]

        _mix_from_Ks(K, K0_val,K1_val,mixing)
        lmm = lmm_cov(X=covar, Y=y, G=None, K=K, inplace=True)
        result = lmm.findH2()
        if (resmin[0] is None) or (result['nLL']<resmin[0]['nLL']):
            resmin[0]=result
        logging.debug("mixing_from_Ks\t{0}\th2\t{1}\tnLL\t{2}".format(mixing,result['h2'],result['nLL']))
        #logging.info("reporter:counter:single_snp,find_mixing_from_Ks_count,1")
        assert not np.isnan(result['nLL']), "nLL should be a number (not a NaN)"
        return result['nLL']
    mixing,nLL = mingrid.minimize1D(f=f, nGrid=10, minval=0.0, maxval=1.0,verbose=False)

    if not isinstance(mixing, (int, float, complex)):
        assert mixing.ndim == 1 and mixing.shape[0] == 1
        mixing = mixing[0]

    h2 = resmin[0]['h2']
    return mixing, h2
Example #3
    def estimate_tau(self, beta, ste):
        meta = MetaAnalysis(beta=beta, ste=ste, tau=0)
        def f(x):
            return -meta.log_likelihood(tau=x, mean_beta=None, reml=self.reml)

        # minimize1D returns the pair (xopt, fopt); keep only the minimizer
        tau = mingrid.minimize1D(f, evalgrid=None, nGrid=10, minval=0.0,
                                 maxval=(beta * beta).mean(), verbose=False,
                                 brent=True, check_boundaries=True,
                                 resultgrid=None, return_grid=False)
        return tau[0]
Example #4
    def fit_scale_logP(self, dof=None):
        '''
        Extracts the top qmax lrt values to do the fit.
        '''

        if dof is None:
            dof = self.dof
        resmin = [None]

        def f(x):
            scale = x
            err, imax = self.scale_dof_obj(scale, dof)
            if (resmin[0] is None) or (resmin[0]['mse'] > err):
                resmin[0] = {  # bookkeeping for CL's mingrid.minimize1D
                    'mse': err,
                    'dof': dof,
                    'scale': scale,
                    'imax': imax,
                }
            return err

        # the best fit is recorded in resmin by f; the return value is unused,
        # so avoid binding it to the built-in name `min`
        mingrid.minimize1D(f=f,
                           nGrid=10,
                           minval=self.scalemin,
                           maxval=self.scalemax)
        return resmin[0]
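
The resmin = [None] idiom that runs through these examples exists because a nested function cannot rebind a name in its enclosing scope under Python 2, but it can mutate the object that name points to. On Python 3 the same bookkeeping can be written with nonlocal; a sketch under that assumption, with a toy stand-in for scale_dof_obj:

import fastlmm.util.mingrid as mingrid

def fit_scale_sketch(scalemin=0.0, scalemax=1.0):
    best = None  # plays the role of resmin[0]

    def f(scale):
        nonlocal best  # Python 3: rebind the enclosing variable directly
        err = (scale - 0.5) ** 2  # illustrative stand-in for scale_dof_obj
        if best is None or err < best['mse']:
            best = {'mse': err, 'scale': scale}
        return err

    mingrid.minimize1D(f=f, nGrid=10, minval=scalemin, maxval=scalemax)
    return best
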
Example #5
def _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y):
    import fastlmm.util.mingrid as mingrid
    assert h2 is None, "if mixing is None, expect h2 to also be None"
    resmin = [None]

    def f(mixing,
          G0_standardized_val=G0_standardized_val,
          G1_standardized_val=G1_standardized_val,
          covar=covar,
          y=y,
          **kwargs):
        _mix(G, G0_standardized_val, G1_standardized_val, mixing)
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)
        result = lmm.findH2()
        if (resmin[0] is None) or (result['nLL'] < resmin[0]['nLL']):
            resmin[0] = result
        return result['nLL']

    mixing, nLL = mingrid.minimize1D(f=f,
                                     nGrid=10,
                                     minval=0.0,
                                     maxval=1.0,
                                     verbose=False)
    h2 = resmin[0]['h2']
    return mixing, h2
Example #6
def _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y):
    import fastlmm.util.mingrid as mingrid
    assert h2 is None, "if mixing is None, expect h2 to also be None"
    resmin=[None]
    def f(mixing,G0_standardized_val=G0_standardized_val,G1_standardized_val=G1_standardized_val,covar=covar,y=y,**kwargs):
        _mix(G, G0_standardized_val,G1_standardized_val,mixing)
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)
        result = lmm.findH2()
        if (resmin[0] is None) or (result['nLL']<resmin[0]['nLL']):
            resmin[0]=result
        return result['nLL']
    mixing,nLL = mingrid.minimize1D(f=f, nGrid=10, minval=0.0, maxval=1.0,verbose=False)
    h2 = resmin[0]['h2']
    return mixing, h2
Example #7
    def fit_params_Qreg(self):
        '''
        Fit the scale and dof parameters of the model by minimizing the squared error between
        the model log quantiles and the log P-values obtained on the lrt values.

        Only the top qmax quantile is used for the fit (self.qmax is applied in fit_scale_logP).
        '''
        #imin= sp.argsort(self.lrt[~self.i0])
        #ntests = self.lrt.shape[0]
        if self.isortlrt is None:
            self.isortlrt = self.lrt.argsort()[::-1]
            self.qnulllrtsort = (
                0.5 + sp.arange(self.mixture * self.isortlrt.shape[0])) / (
                    self.mixture * self.isortlrt.shape[0])
            self.lrtsort = self.lrt[self.isortlrt]
        resmin = [None]  # a mutable one-element list: the nested f cannot rebind an enclosing local, but it can mutate one
        if self.fitdof:  #fit both scale and dof

            def f(x):
                res = self.fit_scale_logP(dof=x)
                if (resmin[0] is None) or (res['mse'] < resmin[0]['mse']):
                    resmin[0] = res
                return res['mse']
        else:

            def f(x):  #fit only scale
                scale = x
                mse, imax = self.scale_dof_obj(scale, self.dof)
                if (resmin[0] is None) or (resmin[0]['mse'] > mse):
                    resmin[0] = { #bookkeeping for CL's mingrid.minimize1D
                        'mse':mse,
                        'dof':self.dof,
                        'scale':scale,
                        'imax':imax,
                    }
                return mse

        # the best fit is recorded in resmin by f; the return value is unused,
        # so avoid binding it to the built-in name `min`
        mingrid.minimize1D(f=f,
                           nGrid=10,
                           minval=self.dofmin,
                           maxval=self.dofmax)
        self.dof = resmin[0]['dof']
        self.scale = resmin[0]['scale']
        self.imax = resmin[0]['imax']
        return resmin[0]
Example #8
    def fit_scale_logP(self, dof=None):
        '''
        Extracts the top qmax lrt values to do the fit.
        '''

        if dof is None:
            dof = self.dof
        resmin = [None]

        def f(x):
            scale = x
            err, imax = self.scale_dof_obj(scale, dof)
            if (resmin[0] is None) or (resmin[0]['mse'] > err):
                resmin[0] = {  # bookkeeping for CL's mingrid.minimize1D
                    'mse': err,
                    'dof': dof,
                    'scale': scale,
                    'imax': imax,
                }
            return err

        # the best fit is recorded in resmin by f; the return value is unused
        mingrid.minimize1D(f=f, nGrid=10, minval=self.scalemin, maxval=self.scalemax)
        return resmin[0]
Example #9
    def fit_params_Qreg(self):
        '''
        Fit the scale and dof parameters of the model by minimizing the squared error between
        the model log quantiles and the log P-values obtained on the lrt values.

        Only the top qmax quantile is used for the fit (self.qmax is applied in fit_scale_logP).
        '''
        #imin= sp.argsort(self.lrt[~self.i0])
        #ntests = self.lrt.shape[0]  
        if self.isortlrt is None:
            self.isortlrt = self.lrt.argsort()[::-1]            
            self.qnulllrtsort = (0.5+sp.arange(self.mixture*self.isortlrt.shape[0]))/(self.mixture*self.isortlrt.shape[0])   
            self.lrtsort = self.lrt[self.isortlrt]      
        resmin = [None]  # a mutable one-element list: the nested f cannot rebind an enclosing local, but it can mutate one
        if self.fitdof: #fit both scale and dof
            def f(x):
                res = self.fit_scale_logP(dof=x)
                if (resmin[0] is None) or (res['mse']<resmin[0]['mse']):
                    resmin[0]=res
                return res['mse']                   
        else:
            def f(x): #fit only scale                
                scale = x                        
                mse,imax=self.scale_dof_obj(scale,self.dof)
                if (resmin[0] is None) or (resmin[0]['mse']>mse):
                    resmin[0] = { #bookkeeping for CL's mingrid.minimize1D
                        'mse':mse,
                        'dof':self.dof,
                        'scale':scale,
                        'imax':imax,
                    }                
                return mse 
        # the best fit is recorded in resmin by f; the return value is unused
        mingrid.minimize1D(f=f, nGrid=10, minval=self.dofmin, maxval=self.dofmax)
        self.dof = resmin[0]['dof']
        self.scale = resmin[0]['scale']
        self.imax=resmin[0]['imax']
        return resmin[0]       
Example #10
    def dowork(self, fold_idx):
        self.feature_selection_strategy.run_once()
        for i_k,k in enumerate(self.k_values):
            self.k_values[i_k]=min(self.k_values[i_k],self.feature_selection_strategy.snpreader.sid_count)
        max_k = max([1]+[k for k in self.k_values if k != self.feature_selection_strategy.snpreader.sid_count])

        split_iterator = self.feature_selection_strategy.setup_linear_regression(max_k, start=fold_idx, stop=None)
        fold_data = next(split_iterator)

        tt0 = time.time()

        if self.strategy == "lmm_full_cv":
            mse_cv1 = np.zeros((len(self.k_values), len(self.delta_values)))
            ll_cv1 = np.zeros((len(self.k_values), len(self.delta_values)))
            best_delta_for_k_1 = None
        elif self.strategy=="insample_cv":
            mse_cv1 = np.zeros((len(self.k_values)))
            ll_cv1 = np.zeros((len(self.k_values)))
            best_delta_for_k_1 = np.zeros((len(self.k_values)))
        else:
            raise NotImplementedError("not implemented")
        
        logging.info("reporter:counter:PerformSelectionDistributable,foldcount,1")
        for k_idx, k in enumerate(self.k_values):
            logging.info("processing fold={0}, k={1}".format(fold_idx,k))
            logging.info("reporter:status:processing fold={0}, k={1}".format(fold_idx,k))
            logging.info("reporter:counter:PerformSelectionDistributable,k,1")

            model = fastlmm.getLMM()

            # compute kernel externally
            if k == self.feature_selection_strategy.snpreader.sid_count or k >= self.feature_selection_strategy.num_snps_in_memory:
                if k == self.feature_selection_strategy.snpreader.sid_count:
                    # use precomputed kernel
                    logging.info("using precomputed kernel on all snps")
                    K = self.feature_selection_strategy.K
                else:
                    # build kernel in blocks from snpreader (from file)
                    logging.info("building kernel in blocks")
                    top_k_feat_idx = fold_data["feat_idx"][0:int(k)]
                    subset = self.feature_selection_strategy.snpreader[:,top_k_feat_idx]
                    K = subset.kernel(self.feature_selection_strategy.standardizer,blocksize=self.feature_selection_strategy.blocksize)

                train_idx = fold_data["train_idx"]
                test_idx = fold_data["test_idx"]
 
                K_train_lhs = K[train_idx]
                K_train = K_train_lhs[:,train_idx]
                K_train_test = K_train_lhs[:,test_idx].T
                K_test_test = K[test_idx][:,test_idx]

                model.setK(K_train)
                model.setTestData(Xstar=fold_data["X_test"], K0star=K_train_test)

                #np.testing.assert_array_almost_equal(model.K, K_train, decimal=4)
                #np.testing.assert_array_almost_equal(model.Kstar, K_train_test, decimal=4)

            # use precomputed features as before
            else:
                logging.info("using cached data to build kernel")
                outer_G_train = fold_data["G_train"][:,0:k]
                outer_G_test = fold_data["G_test"][:,0:k]
                model.setG(outer_G_train.val)
                model.setTestData(Xstar=fold_data["X_test"], G0star=outer_G_test.val)
                K_test_test = None


            model.sety(fold_data["y_train"])
            model.setX(fold_data["X_train"])

            if self.strategy == "lmm_full_cv":

                for delta_idx, delta_act in enumerate(self.delta_values):
                    if k:
                        delta = delta_act * k
                    else:
                        delta = delta_act
                    REML = True  # TODO: confirm whether REML should be True or False here
                    
                    # predict on test set
                    res = model.nLLeval(delta=delta, REML=REML,penalty=self.penalty)
                    out = model.predictMean(beta=res["beta"], delta=delta)
                    mse_cv1[k_idx, delta_idx] = mean_squared_error(fold_data["y_test"], out)
                    ll_cv1[k_idx, delta_idx] = model.nLLeval_test(fold_data["y_test"], res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=K_test_test)

            elif self.strategy == "insample_cv":

                best_res = None
                best_delta = None
                best_nLL = float("inf")
                REML = True

                #Note that for brent = True there will always be many unique delta values, as these deviate from the grid.
                #brent = False
                brent = True

                # evaluate negative log-likelihood for different values of delta
                import fastlmm.util.mingrid as mingrid
                resmin = [None]
                def f(x):
                    if k:
                        delta_corr = x * k
                    else:
                        delta_corr = x
                    myres = model.nLLeval(delta=delta_corr, REML=REML, penalty=self.penalty)
                    if (resmin[0] is None) or (myres['nLL'] < resmin[0]['nLL']):
                        resmin[0] = myres
                        resmin[0]["delta_corr"] = delta_corr
                        resmin[0]["delta"] = x
                    return myres["nLL"]
                res = mingrid.minimize1D(f, evalgrid=self.delta_values, brent=brent)

                if 0:  # old code without brent search
                    for delta_idx, delta_act in enumerate(self.delta_values):
                        delta = delta_act * k #rescale delta for X val.
                        res = model.nLLeval(delta=delta,REML=REML,penalty=self.penalty)
                        #TODO: check if we need scale
                    
                        if res["nLL"] < best_nLL:
                            best_res = res
                            best_delta_act = delta_act
                            best_delta = delta
                            best_nLL = res["nLL"]
                out = model.predictMean(beta=resmin[0]["beta"], delta=resmin[0]["delta_corr"])
                mse_cv1[k_idx] = mean_squared_error(fold_data["y_test"], out)
                ll_cv1[k_idx] = model.nLLeval_test(fold_data["y_test"], resmin[0]["beta"], sigma2=resmin[0]["sigma2"], delta=resmin[0]["delta_corr"], Kstar_star=K_test_test)
                best_delta_for_k_1[k_idx] = resmin[0]["delta"]

        logging.info("crossval time %.2f s" % (float(time.time() - tt0)))

        return fold_idx, mse_cv1, ll_cv1, best_delta_for_k_1
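
The kernel bookkeeping in dowork reduces to plain fancy indexing on a square kernel matrix: the rows for the training samples are taken once, then sliced by column into the train-train and train-test blocks, while the test-test block is cut directly from K. A self-contained numpy illustration; the sizes and index sets are made up for the demo:

import numpy as np

rng = np.random.default_rng(0)
G = rng.standard_normal((6, 3))
K = G @ G.T  # a valid (PSD) kernel over all 6 samples

train_idx = np.array([0, 1, 2, 3])
test_idx = np.array([4, 5])

K_train_lhs = K[train_idx]                 # rows for the training samples
K_train = K_train_lhs[:, train_idx]        # (4, 4) train-train block
K_train_test = K_train_lhs[:, test_idx].T  # (2, 4) test-by-train cross block
K_test_test = K[test_idx][:, test_idx]     # (2, 2) test-test block

assert K_train.shape == (4, 4) and K_train_test.shape == (2, 4)
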
Example #11
    def dowork(self, fold_idx):
        self.feature_selection_strategy.run_once()
        for i_k,k in enumerate(self.k_values):
            self.k_values[i_k]=min(self.k_values[i_k],self.feature_selection_strategy.snpreader.sid_count)
        max_k = max([1]+[k for k in self.k_values if k != self.feature_selection_strategy.snpreader.sid_count])

        split_iterator = self.feature_selection_strategy.setup_linear_regression(max_k, start=fold_idx, stop=None)
        fold_data = next(split_iterator)

        tt0 = time.time()

        if self.strategy == "lmm_full_cv":
            mse_cv1 = np.zeros((len(self.k_values), len(self.delta_values)))
            ll_cv1 = np.zeros((len(self.k_values), len(self.delta_values)))
            best_delta_for_k_1 = None
        elif self.strategy=="insample_cv":
            mse_cv1 = np.zeros((len(self.k_values)))
            ll_cv1 = np.zeros((len(self.k_values)))
            best_delta_for_k_1 = np.zeros((len(self.k_values)))
        else:
            raise NotImplementedError("not implemented")
        
        logging.info("reporter:counter:PerformSelectionDistributable,foldcount,1")
        for k_idx, k in enumerate(self.k_values):
            logging.info("processing fold={0}, k={1}".format(fold_idx,k))
            logging.info("reporter:status:processing fold={0}, k={1}".format(fold_idx,k))
            logging.info("reporter:counter:PerformSelectionDistributable,k,1")

            model = fastlmm.getLMM()

            # compute kernel externally
            if k == self.feature_selection_strategy.snpreader.sid_count or k >= self.feature_selection_strategy.num_snps_in_memory:
                if k == self.feature_selection_strategy.snpreader.sid_count:
                    # use precomputed kernel
                    logging.info("using precomputed kernel on all snps")
                    K = self.feature_selection_strategy.K
                else:
                    # build kernel in blocks from snpreader (from file)
                    logging.info("building kernel in blocks")
                    top_k_feat_idx = fold_data["feat_idx"][0:int(k)]
                    subset = self.feature_selection_strategy.snpreader[:,top_k_feat_idx]
                    K = subset.kernel(self.feature_selection_strategy.standardizer,blocksize=self.feature_selection_strategy.blocksize)

                train_idx = fold_data["train_idx"]
                test_idx = fold_data["test_idx"]
 
                K_train_lhs = K[train_idx]
                K_train = K_train_lhs[:,train_idx]
                K_train_test = K_train_lhs[:,test_idx].T
                K_test_test = K[test_idx][:,test_idx]

                model.setK(K_train)
                model.setTestData(Xstar=fold_data["X_test"], K0star=K_train_test)

                #np.testing.assert_array_almost_equal(model.K, K_train, decimal=4)
                #np.testing.assert_array_almost_equal(model.Kstar, K_train_test, decimal=4)

            # use precomputed features as before
            else:
                logging.info("using cached data to build kernel")
                outer_G_train = fold_data["G_train"][:,0:k]
                outer_G_test = fold_data["G_test"][:,0:k]
                model.setG(outer_G_train.val)
                model.setTestData(Xstar=fold_data["X_test"], G0star=outer_G_test.val)
                K_test_test = None


            model.sety(fold_data["y_train"])
            model.setX(fold_data["X_train"])

            if self.strategy == "lmm_full_cv":

                for delta_idx, delta_act in enumerate(self.delta_values):
                    if k:
                        delta = delta_act * k
                    else:
                        delta = delta_act
                    REML = True  # TODO: confirm whether REML should be True or False here
                    
                    # predict on test set
                    res = model.nLLeval(delta=delta, REML=REML,penalty=self.penalty)
                    out = model.predictMean(beta=res["beta"], delta=delta)
                    mse_cv1[k_idx, delta_idx] = mean_squared_error(fold_data["y_test"], out)
                    ll_cv1[k_idx, delta_idx] = model.nLLeval_test(fold_data["y_test"], res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=K_test_test)

            elif self.strategy == "insample_cv":

                best_res = None
                best_delta = None
                best_nLL = float("inf")
                REML = True

                #Note that for brent = True there will always be many unique delta values, as these deviate from the grid.
                #brent = False
                brent = True

                # evaluate negative log-likelihood for different values of delta
                import fastlmm.util.mingrid as mingrid
                resmin = [None]
                def f(x):
                    if k:
                        delta_corr = x * k
                    else:
                        delta_corr = x
                    myres = model.nLLeval(delta=delta_corr, REML=REML, penalty=self.penalty)
                    if (resmin[0] is None) or (myres['nLL'] < resmin[0]['nLL']):
                        resmin[0] = myres
                        resmin[0]["delta_corr"] = delta_corr
                        resmin[0]["delta"] = x
                    return myres["nLL"]
                res = mingrid.minimize1D(f, evalgrid=self.delta_values, brent=brent)

                if 0:  # old code without brent search
                    for delta_idx, delta_act in enumerate(self.delta_values):
                        delta = delta_act * k #rescale delta for X val.
                        res = model.nLLeval(delta=delta,REML=REML,penalty=self.penalty)
                        #TODO: check if we need scale
                    
                        if res["nLL"] < best_nLL:
                            best_res = res
                            best_delta_act = delta_act
                            best_delta = delta
                            best_nLL = res["nLL"]
                out = model.predictMean(beta=resmin[0]["beta"], delta=resmin[0]["delta_corr"])
                mse_cv1[k_idx] = mean_squared_error(fold_data["y_test"], out)
                ll_cv1[k_idx] = model.nLLeval_test(fold_data["y_test"], resmin[0]["beta"], sigma2=resmin[0]["sigma2"], delta=resmin[0]["delta_corr"], Kstar_star=K_test_test)
                best_delta_for_k_1[k_idx] = resmin[0]["delta"]

        logging.info("crossval time %.2f s" % (float(time.time() - tt0)))

        return fold_idx, mse_cv1, ll_cv1, best_delta_for_k_1
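
The insample_cv branch above rescales each grid value by the number of selected SNPs (delta_corr = x * k) so that a single grid in delta_values stays comparable across k, and it records both the raw and the rescaled value in resmin. A stripped-down sketch of that bookkeeping, with a quadratic stand-in for model.nLLeval (an assumption made for illustration):

import fastlmm.util.mingrid as mingrid

def best_delta_sketch(k, delta_values):
    resmin = [None]

    def f(x):
        delta_corr = x * k if k else x
        nLL = (delta_corr - 1.0) ** 2  # illustrative stand-in for model.nLLeval
        if resmin[0] is None or nLL < resmin[0]['nLL']:
            resmin[0] = {'nLL': nLL, 'delta': x, 'delta_corr': delta_corr}
        return nLL

    mingrid.minimize1D(f, evalgrid=delta_values, brent=True)
    return resmin[0]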