Example #1
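# Context not shown in the listing: this is a method of a unittest test class
# from fastlmm's GWAS tests. Besides the imports done inline below, it assumes
# these names are already in scope: os, np (numpy), Bed (the legacy fastlmm SNP
# reader, which exposes .rs), load_intersect, GwasTest, GwasPrototype and
# FastGwas.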
    def test_results_identical_with_fastlmmc(self):
        """
        make sure the python GWAS implementations yield the same results as fastlmmc
        """

        currentFolder = os.path.dirname(os.path.realpath(__file__))

        #prefix = r"C:\Users\chwidmer\Documents\Projects\sandbox\data\test"
        #bed_fn = prefix + "/jax_gt.up.filt.M"
        #dat_fn = prefix + "/jax_M_expression.1-18.dat"
        #pheno_fn = prefix + "/jax_M_expression.19.phe.txt"
        
        bed_fn = os.path.join(currentFolder, "../../feature_selection/examples/toydata")
        pheno_fn = os.path.join(currentFolder, "../../feature_selection/examples/toydata.phe")

        #prefix = "../../../tests\datasets\mouse"
        #bed_fn = os.path.join(prefix, "alldata")
        #pheno_fn = os.path.join(prefix, "pheno.txt")

        snp_reader = Bed(bed_fn)
        G, y, _, _ = load_intersect(snp_reader, pheno_fn)

        snp_pos = snp_reader.rs

        
        idx_sim = np.arange(0, 5000)
        idx_test = np.arange(5000, 10000)

        snp_pos_sim = snp_pos[idx_sim]
        snp_pos_test = snp_pos[idx_test]

        G_chr1, G_chr2 = G[:,idx_sim], G[:,idx_test]
        delta = 1.0

        ###################################
        # REML IN lmm.py is BROKEN!!

        # we compare REML=False in lmm.py to fastlmmc
        REML = False
        gwas_c_reml = GwasTest(bed_fn, pheno_fn, snp_pos_sim, snp_pos_test, delta, REML=REML)
        gwas_c_reml.run_gwas()

        gwas = GwasPrototype(G_chr1, G_chr2, y, delta, REML=False)
        gwas.run_gwas()

        # check p-values in log-space!
        np.testing.assert_array_almost_equal(np.log(gwas.p_values), np.log(gwas_c_reml.p_values), decimal=3)
        if False:  # disabled debug plotting
            import pylab
            pylab.plot(np.log(gwas_c_reml.p_values), np.log(gwas.p_values), "x")
            pylab.plot(range(-66, 0, 1), range(-66, 0, 1))
            pylab.show()

        # we compare lmm_cov.py to fastlmmc with REML=True
        gwas_c = GwasTest(bed_fn, pheno_fn, snp_pos_sim, snp_pos_test, delta, REML=True)
        gwas_c.run_gwas()
        gwas_f = FastGwas(G_chr1, G_chr2, y, delta, findh2=False)
        gwas_f.run_gwas()
        np.testing.assert_array_almost_equal(np.log(gwas_c.p_values), np.log(gwas_f.p_values_F), decimal=2)

        # additional testing code for the new wrapper functions

        # Fix delta
        from pysnptools.snpreader import Bed as BedSnpReader
        from fastlmm.association.single_snp import single_snp
        snpreader = BedSnpReader(bed_fn, count_A1=False)
        frame = single_snp(test_snps=snpreader[:, idx_test], pheno=pheno_fn,
                           G0=snpreader[:, idx_sim], h2=1.0 / (delta + 1.0),
                           leave_out_one_chrom=False, count_A1=False)
        sid_list, pvalue_list = frame['SNP'].values, frame['PValue'].values
        np.testing.assert_allclose(gwas_f.sorted_p_values_F, pvalue_list, rtol=1e-10)
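        # note on the h2 argument above: single_snp is parameterized by
        # h2 = sigma_g^2 / (sigma_g^2 + sigma_e^2) rather than by
        # delta = sigma_e^2 / sigma_g^2, hence h2 = 1/(1 + delta)
        # (delta = 1.0 corresponds to h2 = 0.5)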

        p_vals_by_genomic_pos = frame.sort_values(["Chr", "ChrPos"])["PValue"].tolist()
        np.testing.assert_allclose(gwas_c_reml.p_values, p_vals_by_genomic_pos, rtol=.1)
        np.testing.assert_allclose(gwas_c_reml.p_values, gwas_f.p_values_F, rtol=.1)
        np.testing.assert_allclose(gwas_f.sorted_p_values_F, gwas_c_reml.sorted_p_values, rtol=.1)


        # Search over delta
        gwas_c_reml_search = GwasTest(bed_fn, pheno_fn, snp_pos_sim, snp_pos_test, delta=None, REML=True)
        gwas_c_reml_search.run_gwas()

        frame_search = single_snp(test_snps=snpreader[:, idx_test], pheno=pheno_fn,
                                  G0=snpreader[:, idx_sim], h2=None,
                                  leave_out_one_chrom=False, count_A1=False)
        _, pvalue_list_search = frame_search['SNP'].values, frame_search['PValue'].values

        p_vals_by_genomic_pos = frame_search.sort_values(["Chr", "ChrPos"])["PValue"].tolist()
        np.testing.assert_allclose(gwas_c_reml_search.p_values, p_vals_by_genomic_pos, rtol=.001)
        np.testing.assert_allclose(gwas_c_reml_search.sorted_p_values, pvalue_list_search, rtol=.001)

    def run_select(self, G0, G_bg, y, cov=None):
        """set up and run two-kernel feature selection

        Parameters
        ----------
        G0 : numpy array of shape (num_ind, num_snps)
            Data matrix from which foreground SNPs will be selected

        G_bg : numpy array of shape (num_ind, num_snps)
            Data matrix containing background SNPs on which the model will be conditioned

        y : numpy vector of shape (num_ind, )
            Vector of phenotypes

        cov : numpy array of shape (num_ind, num_covariates) or None
            Covariates to be used as fixed effects

        Returns
        -------
        best_k, feat_idx, best_mix, best_delta : tuple(int, np.array(int), float, float)
            best_k is the selected number of SNPs,
            feat_idx is an np.array of integers giving the indices of these SNPs,
            best_mix is the best mixing coefficient between foreground and background kernel,
            best_delta is the best regularization coefficient
        """

        num_ind = len(y)

        if cov is None:
            cov = np.ones((num_ind, 1))
        else:
            logging.info("normalizing covariates")
            cov = cov.copy()
            cov = 1. / np.sqrt((cov**2).sum() / float(cov.shape[0])) * cov
        cov.flags.writeable = False
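        # after this, (cov**2).sum() equals num_ind in both branches, matching
        # the diag(K) = N scale used for the kernels below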

        # normalize to diag(K) = N
        norm_factor = 1. / np.sqrt((G_bg**2).sum() / float(G_bg.shape[0]))

        # we copy in case G and G_bg are pointing to the same object
        G_bg = norm_factor * G_bg

        K_bg_full = G_bg.dot(G_bg.T)
        K_bg_full.flags.writeable = False

        # some asserts
        np.testing.assert_almost_equal(sum(np.diag(K_bg_full)), G_bg.shape[0])
        if self.debug:
            norm_factor_check = 1. / np.sqrt(G_bg.shape[1])
            np.testing.assert_array_almost_equal(norm_factor,
                                                 norm_factor_check,
                                                 decimal=1)
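        # for roughly unit-variance SNP columns, (G_bg**2).sum() is close to
        # num_ind * num_snps, so norm_factor is close to 1/sqrt(num_snps);
        # that is exactly what the decimal=1 debug check above verifies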

        # KFold below uses the legacy sklearn.cross_validation signature; on
        # sklearn >= 0.18 the equivalent is
        # KFold(n_splits=self.n_folds, shuffle=True,
        #       random_state=self.random_state).split(np.arange(num_ind))
        for kfold_idx, (train_idx, test_idx) in enumerate(
                KFold(num_ind,
                      n_folds=self.n_folds,
                      random_state=self.random_state,
                      shuffle=True)):

            t0_fold = time.time()
            logging.info("running fold: %i" % kfold_idx)

            y_train = y.take(train_idx, axis=0)
            y_test = y.take(test_idx, axis=0)
            G0_train = G0.take(train_idx, axis=0)
            G0_test = G0.take(test_idx, axis=0)

            G_bg_train = G_bg.take(train_idx, axis=0)
            G_bg_test = G_bg.take(test_idx, axis=0)

            cov_train = cov.take(train_idx, axis=0)
            cov_test = cov.take(test_idx, axis=0)

            # write protect data
            y_train.flags.writeable = False
            y_test.flags.writeable = False
            G0_train.flags.writeable = False
            G0_test.flags.writeable = False
            G_bg_train.flags.writeable = False
            G_bg_test.flags.writeable = False
            cov_train.flags.writeable = False
            cov_test.flags.writeable = False

            # precompute background kernel
            K_bg_train = K_bg_full.take(train_idx, axis=0).take(train_idx,
                                                                axis=1)
            K_bg_train.flags.writeable = False

            if self.measure != "mse":
                K_bg_test = K_bg_full.take(test_idx, axis=0).take(test_idx,
                                                                  axis=1)
                K_bg_test.flags.writeable = False

            # rank features
            if self.order_by_lmm:
                logging.info("using linear mixed model to rank features")
                t0 = time.time()
                gwas = FastGwas(G_bg_train,
                                G0_train,
                                y_train,
                                delta=None,
                                train_pcs=None,
                                mixing=0.0,
                                cov=cov_train)
                gwas.run_gwas()
                _pval = gwas.p_values
                logging.info("time taken: %s" % (str(time.time() - t0)))
            else:
                logging.info("using linear regression to rank features")
                _F, _pval = lin_reg.f_regression_block(
                    lin_reg.f_regression_cov_alt,
                    G0_train,
                    y_train,
                    blocksize=10000,
                    C=cov_train)

            feat_idx = np.argsort(_pval)
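            # np.argsort is ascending, so the smallest (most significant)
            # p-values come first; the top-k slice below therefore keeps the
            # strongest SNPs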

            for k_idx, max_k in enumerate(self.grid_k):

                feat_idx_subset = feat_idx[0:max_k]
                G_fs_train = G0_train.take(feat_idx_subset, axis=1)
                G_fs_test = G0_test.take(feat_idx_subset, axis=1)

                # normalize to sum(diag)=N
                norm_factor = 1. / np.sqrt(
                    (G_fs_train**2).sum() / float(G_fs_train.shape[0]))

                G_fs_train *= norm_factor
                G_fs_test *= norm_factor

                G_fs_train.flags.writeable = False
                G_fs_test.flags.writeable = False

                # asserts
                if self.debug:
                    norm_factor_check = 1.0 / np.sqrt(max_k)
                    np.testing.assert_array_almost_equal(norm_factor,
                                                         norm_factor_check,
                                                         decimal=1)
                    np.testing.assert_almost_equal(
                        sum(np.diag(G_fs_train.dot(G_fs_train.T))),
                        G_fs_train.shape[0])

                logging.info("k: %i" % (max_k))

                # use LMM
                from fastlmm.inference.lmm_cov import LMM as fastLMM

                if G_bg_train.shape[1] <= G_bg_train.shape[0]:
                    lmm = fastLMM(X=cov_train,
                                  Y=y_train[:, np.newaxis],
                                  G=G_bg_train)
                else:
                    lmm = fastLMM(X=cov_train,
                                  Y=y_train[:, np.newaxis],
                                  K=K_bg_train)
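                # with at most as many background SNPs as individuals, passing
                # the factor G lets lmm_cov use the cheaper low-rank path;
                # otherwise the precomputed N x N kernel K is used directly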

                W = G_fs_train.copy()
                UGup, UUGup = lmm.rotate(W)

                i_up = np.zeros(G_fs_train.shape[1], dtype=bool)
                i_G1 = np.ones(G_fs_train.shape[1], dtype=bool)
                t0 = time.time()
                res = lmm.findH2_2K(nGridH2=10,
                                    minH2=0.0,
                                    maxH2=0.99999,
                                    i_up=i_up,
                                    i_G1=i_G1,
                                    UW=UGup,
                                    UUW=UUGup)
                logging.info("time taken for k=%i: %s" %
                             (max_k, str(time.time() - t0)))

                # recover a2 from alternate parameterization
                a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])
                h2 = res["h2"] + res["h2_1"]
                delta = (1 - h2) / h2
                #res_cov = res
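                # sketch of the algebra: findH2_2K returns variance fractions
                # h2 (background kernel) and h2_1 (selected-SNP kernel); the
                # combined genetic fraction is h2 + h2_1, the foreground mixing
                # weight is a2 = h2_1 / (h2 + h2_1), and
                # delta = sigma_e^2 / sigma_g^2 = (1 - h2) / h2 with the
                # combined h2, which is what setG/nLLeval below expect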

                # do final prediction using lmm.py
                from fastlmm.inference import LMM
                lmm = LMM(forcefullrank=False)
                lmm.setG(G0=G_bg_train, G1=G_fs_train, a2=a2)
                lmm.setX(cov_train)
                lmm.sety(y_train)

                # additional step to estimate the betas of the covariates
                # (they are not provided by the new model above)
                res = lmm.nLLeval(delta=delta, REML=True)

                # predict on test set
                lmm.setTestData(Xstar=cov_test,
                                G0star=G_bg_test,
                                G1star=G_fs_test)
                out = lmm.predictMean(beta=res["beta"], delta=delta)

                mse = mean_squared_error(y_test, out)
                logging.info("mse: %f" % (mse))

                self.mse[kfold_idx, k_idx] = mse

                self.mixes[kfold_idx, k_idx] = a2
                self.deltas[kfold_idx, k_idx] = delta

                if self.measure != "mse":
                    K_test_test = a2 * G_fs_test.dot(
                        G_fs_test.T) + (1.0 - a2) * K_bg_test
                    ll = lmm.nLLeval_test(y_test,
                                          res["beta"],
                                          sigma2=res["sigma2"],
                                          delta=delta,
                                          Kstar_star=K_test_test,
                                          robust=True)

                    if self.debug:
                        ll2 = lmm.nLLeval_test(y_test,
                                               res["beta"],
                                               sigma2=res["sigma2"],
                                               delta=delta,
                                               Kstar_star=None,
                                               robust=True)
                        np.testing.assert_almost_equal(ll, ll2, decimal=4)

                    logging.info("ll: %f" % (ll))
                    self.ll[kfold_idx, k_idx] = ll

            logging.info("time taken for fold: %s" % str(time.time() - t0))

        best_k, best_mix, best_delta = self.select_best_k()

        logging.info("best_k: %i, best_mix: %f, best_delta: %f" %
                     (best_k, best_mix, best_delta))

        # final scan
        if self.order_by_lmm:
            logging.info("final scan using LMM")
            gwas = FastGwas(G_bg,
                            G0,
                            y,
                            delta=None,
                            train_pcs=None,
                            mixing=0.0,
                            cov=cov)
            gwas.run_gwas()
            _pval = gwas.p_values
            feat_idx = np.argsort(_pval)[0:best_k]
        else:
            logging.info("final scan using LR")
            _F, _pval = lin_reg.f_regression_block(
                lin_reg.f_regression_cov_alt, G0, y, C=cov, blocksize=10000)
            feat_idx = np.argsort(_pval)[0:best_k]

        logging.info("number of snps selected: %i" % (best_k))

        return best_k, feat_idx, best_mix, best_delta
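
A minimal usage sketch for run_select (illustrative, not from the original
listing): it assumes fastlmm and a compatible scikit-learn are importable, and
it wraps the method in a hypothetical TwoKernelSelect class supplying the
attributes run_select reads (n_folds, random_state, grid_k, order_by_lmm,
measure, debug), the result arrays it fills (mse, mixes, deltas, ll) and the
select_best_k helper it calls.

import numpy as np

class TwoKernelSelect(object):
    """hypothetical harness; paste the run_select method above into this body"""

    def __init__(self, grid_k, n_folds=5, random_state=42):
        self.grid_k = grid_k              # candidate numbers of foreground SNPs
        self.n_folds = n_folds
        self.random_state = random_state
        self.order_by_lmm = True          # rank features with the LMM
        self.measure = "mse"              # select on test-set MSE
        self.debug = False
        shape = (n_folds, len(grid_k))
        self.mse = np.zeros(shape)        # filled per (fold, k) by run_select
        self.mixes = np.zeros(shape)
        self.deltas = np.zeros(shape)
        self.ll = np.zeros(shape)         # only used when measure != "mse"

    def select_best_k(self):
        # simplest plausible rule: the k with the lowest fold-averaged MSE,
        # reporting the fold-averaged mixing weight and delta for that k
        k_idx = int(np.argmin(self.mse.mean(axis=0)))
        return (self.grid_k[k_idx],
                self.mixes[:, k_idx].mean(),
                self.deltas[:, k_idx].mean())

# synthetic data with a sparse signal in the first 10 foreground SNPs
rng = np.random.RandomState(0)
num_ind = 200
G0 = rng.randn(num_ind, 500)              # candidate foreground SNPs
G_bg = rng.randn(num_ind, 300)            # background SNPs
y = G0[:, :10].sum(axis=1) + rng.randn(num_ind)

selector = TwoKernelSelect(grid_k=[10, 50, 100])
best_k, feat_idx, best_mix, best_delta = selector.run_select(G0, G_bg, y)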