Exemplo n.º 1
0
    def test_old(self):
        do_plot = False
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        logging.info("TestSingleSnpAllPlusSelect test_old")

        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        #load data
        ###################################################################
        snp_reader = Bed(bed_fn, count_A1=False)
        pheno = Pheno(pheno_fn)
        cov = Pheno(cov_fn)

        # intersect sample ids
        snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

        # read in snps

        # partition snps on chr5 vs rest
        test_chr = 5
        G0 = snp_reader[:, snp_reader.pos[:, 0] != test_chr].read(
            order='C').standardize()
        test_snps = snp_reader[:, snp_reader.pos[:, 0] == test_chr].read(
            order='C').standardize()

        y = pheno.read().val[:, 0]
        y -= y.mean()
        y /= y.std()

        # load covariates
        X_cov = cov.read().val
        X_cov.flags.writeable = False

        # invoke feature selection to learn which SNPs to use to build G1
        logging.info(
            "running feature selection conditioned on background kernel")
        # partition data into the first 50 SNPs on chr1 and all but chr1

        select = FeatureSelectionInSample(max_log_k=7,
                                          n_folds=7,
                                          order_by_lmm=True,
                                          measure="ll",
                                          random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(G0.val,
                                                                   G0.val,
                                                                   y,
                                                                   cov=X_cov)

        # plot out of sample error
        if do_plot: select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

        # print results
        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))

        ###############################
        # use selected SNPs to build G1
        logging.info(feat_idx)
        G1 = G0[:, feat_idx]

        output_file_name = self.file_name("old")
        results_df = single_snp(test_snps,
                                pheno,
                                G0=G0,
                                G1=G1,
                                mixing=best_mix,
                                h2=None,
                                leave_out_one_chrom=False,
                                output_file_name=output_file_name,
                                count_A1=False)

        logging.info("results:")
        logging.info("#" * 40)
        logging.info(results_df.head())
        self.compare_files(results_df, "old")
    def test_old(self):
        do_plot = False
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        logging.info("TestSingleSnpAllPlusSelect test_old")

        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        #load data
        ###################################################################
        snp_reader = Bed(bed_fn)
        pheno = Pheno(pheno_fn)
        cov = Pheno(cov_fn)

        # intersect sample ids
        snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

        # read in snps

        # partition snps on chr5 vs rest
        test_chr = 5
        G0 = snp_reader[:,snp_reader.pos[:,0] != test_chr].read(order='C').standardize()
        test_snps = snp_reader[:,snp_reader.pos[:,0] == test_chr].read(order='C').standardize()


        y = pheno.read().val[:,0]
        y -= y.mean()
        y /= y.std()

        # load covariates
        X_cov = cov.read().val
        X_cov.flags.writeable = False

        # invoke feature selection to learn which SNPs to use to build G1
        logging.info("running feature selection conditioned on background kernel")
        # partition data into the first 50 SNPs on chr1 and all but chr1

        select = FeatureSelectionInSample(max_log_k=7, n_folds=7, order_by_lmm=True, measure="ll", random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(G0.val, G0.val, y, cov=X_cov)    

        # plot out of sample error
        if do_plot: select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

        # print results
        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))


        ###############################
        # use selected SNPs to build G1
        logging.info(feat_idx)
        G1 = G0[:,feat_idx]

        output_file_name = self.file_name("old")
        results_df = single_snp(test_snps, pheno, G0=G0, G1=G1, mixing=best_mix, h2=None,leave_out_one_chrom=False,output_file_name=output_file_name)

        logging.info("results:")
        logging.info("#"*40)
        logging.info(results_df.head())
        self.compare_files(results_df,"old")