Exemplo n.º 1
0
    def dowork(self, i, train_snp_idx, test_snp_idx, result, G, y):
        """
        Run FastGwas for one fold: the SNP columns in test_snp_idx are tested,
        the similarity is built from the SNP columns in train_snp_idx.

        i             -- fold identifier; only used to build the file name of
                         the precomputed PCs
        train_snp_idx -- column indices of G that may enter the similarity;
                         restricted to self.selected_snps when that is set
        test_snp_idx  -- column indices of G that are tested
        result        -- returned unchanged as third element of the return value
        G             -- 2-d array whose columns are SNPs; must not be
                         Fortran-ordered
        y             -- passed to FastGwas unchanged (presumably the phenotype)

        Returns the tuple (test_snp_idx, gwas.p_values, result).

        Raises AssertionError if G is Fortran-ordered, if PCs are requested
        (self.num_pcs != 0) while self.pc_prefix is None, or if the number of
        p-values differs from the number of tested SNPs.
        """
        logging.info("{0}, {1}".format(len(train_snp_idx), len(test_snp_idx)))

        # intersect selected SNPs with train snps
        if self.selected_snps is not None:
            # intersect snp names
            logging.info("intersecting train snps with selected snps for LOCO")
            int_snp_idx = argintersect_left(self.snp_reader.rs[train_snp_idx],
                                            self.selected_snps)
            sim_keeper_idx = np.array(train_snp_idx)[int_snp_idx]
        else:
            sim_keeper_idx = train_snp_idx

        # subset data
        # fast indexing (needs to be C-order)
        assert not np.isfortran(G)
        G_sim = G.take(sim_keeper_idx, axis=1)
        G_test = G.take(test_snp_idx, axis=1)

        t0 = time.time()

        if self.num_pcs == 0:
            pcs = None
        elif self.pc_prefix is not None:
            out_fn = PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
            logging.info("loading pc from file: %s" % out_fn)
            pcs = load(out_fn)[:, 0:self.num_pcs]
            logging.info("..done")
            # this timing message used to sit behind "assert False" and was
            # therefore never emitted
            logging.info("done after %.4f seconds" % (time.time() - t0))
        else:
            # raised explicitly so the check survives "python -O"; a stripped
            # assert would let execution continue with pcs undefined
            raise AssertionError("please precompute PCs")

        # only use PCs
        if self.pcs_only:
            G_sim = None
            logging.info("Using PCs only in LocoGWAS")

        gwas = FastGwas(G_sim,
                        G_test,
                        y,
                        self.delta,
                        train_pcs=pcs,
                        mixing=self.mixing)
        gwas.run_gwas()

        # one p-value per tested SNP is expected
        assert len(gwas.p_values) == len(test_snp_idx)

        # wrap up results
        return test_snp_idx, gwas.p_values, result
Exemplo n.º 2
0
    def dowork(self, i, train_snp_idx, test_snp_idx, result, G, y):
        """
        Run FastGwas for one fold: the SNP columns in test_snp_idx are tested,
        the similarity is built from the SNP columns in train_snp_idx.

        i             -- fold identifier; only used to build the file name of
                         the precomputed PCs
        train_snp_idx -- column indices of G that may enter the similarity;
                         restricted to self.selected_snps when that is set
        test_snp_idx  -- column indices of G that are tested
        result        -- returned unchanged as third element of the return value
        G             -- 2-d array whose columns are SNPs; must not be
                         Fortran-ordered
        y             -- passed to FastGwas unchanged (presumably the phenotype)

        Returns the tuple (test_snp_idx, gwas.p_values, result).

        Raises AssertionError if G is Fortran-ordered, if PCs are requested
        (self.num_pcs != 0) while self.pc_prefix is None, or if the number of
        p-values differs from the number of tested SNPs.
        """
        logging.info("{0}, {1}".format(len(train_snp_idx), len(test_snp_idx)))

        # intersect selected SNPs with train snps
        if self.selected_snps is not None:
            # intersect snp names
            logging.info("intersecting train snps with selected snps for LOCO")
            int_snp_idx = argintersect_left(self.snp_reader.rs[train_snp_idx], self.selected_snps)
            sim_keeper_idx = np.array(train_snp_idx)[int_snp_idx]
        else:
            sim_keeper_idx = train_snp_idx

        # subset data
        # fast indexing (needs to be C-order)
        assert not np.isfortran(G)
        G_sim = G.take(sim_keeper_idx, axis=1)
        G_test = G.take(test_snp_idx, axis=1)

        t0 = time.time()

        if self.num_pcs == 0:
            pcs = None
        elif self.pc_prefix is not None:
            out_fn = PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
            logging.info("loading pc from file: %s" % out_fn)
            pcs = load(out_fn)[:, 0:self.num_pcs]
            logging.info("..done")
            # this timing message used to sit behind "assert False" and was
            # therefore never emitted
            logging.info("done after %.4f seconds" % (time.time() - t0))
        else:
            # raised explicitly so the check survives "python -O"; a stripped
            # assert would let execution continue with pcs undefined
            raise AssertionError("please precompute PCs")

        # only use PCs
        if self.pcs_only:
            G_sim = None
            logging.info("Using PCs only in LocoGWAS")

        gwas = FastGwas(G_sim, G_test, y, self.delta, train_pcs=pcs, mixing=self.mixing)
        gwas.run_gwas()

        # one p-value per tested SNP is expected
        assert len(gwas.p_values) == len(test_snp_idx)

        # wrap up results
        return test_snp_idx, gwas.p_values, result
Exemplo n.º 3
0
    def run_sim_and_compare(self, name, method):
        """
        Run the simulation for a single method and compare the combined
        output with the stored expected result.

        name   -- used in the run description and to locate the expected
                  result file "<name>.bzip" in the "expected" folder next to
                  this file
        method -- the only method handed to run_simulation
        """
        logging.info('in test_all')
        # NOTE: the alias bound by this import is overwritten by the Local()
        # instance below; the import statement itself is kept as it was
        import fastlmm.util.runner as runner

        here = os.path.dirname(os.path.realpath(__file__))
        genotype_path = os.path.realpath(here + "/../../data/mouse/alldata")
        tmp_prefix = here + "/tempdir/mouse_"

        run_label = "test_run_{0}".format(name)
        runner = Local()

        expected_dir = here + "/expected/"

        simulated = run_simulation(
            genotype_path,
            tmp_prefix,
            [method],
            500,  # number of causal SNPs
            1,    # number of repeats
            5,    # number of PCs
            run_label,
            runner,
            plot_fn="out.png",
            seed=42)

        from fastlmm.util.pickle_io import load
        reference = load("%s%s.bzip" % (expected_dir, name))
        compare_nested(simulated, reference)
def compute_core(input_tuple):
    """
    Leave-two-chromosome-out evaluation scheme:
    Chr1: no causals, used for T1-error evaluation
    Chr2: has causals, not conditioned on, used for power evaluation
    Rest: has causals, conditioned on
    
      T1   Pow  [     cond     ] 
    ===== ===== ===== .... =====
            x x   x x      xx

    input_tuple is (methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id):
      methods    -- callables invoked as f(test_snps, pheno, G0, covar), each
                    returning a pair (result, fs_result) usable with dict.update
      snp_fn     -- file name handed to pysnptools' Bed reader
      eigen_fn   -- file read with load(); must provide the key "pcs"
      num_causal -- number of causal SNPs drawn from chr2 and the rest
      num_pcs    -- number of leading PC columns used as covariates
      seed       -- seed for numpy's global RNG, or None to leave it untouched
      sim_id     -- not used in this function

    Returns (result, indices): the merged method results and a dict with the
    keys "causal_idx", "chr1_idx", "chr2_idx", "input_tuple" and "fs_result".
    """

    methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id = input_tuple

    # partially load bed file
    from pysnptools.snpreader import Bed
    snp_reader = Bed(snp_fn)

    # determine indices for generation and evaluation
    ##################################################################
    chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(snp_reader.pos)

    causal_candidates_idx = np.concatenate((chr2_idx, rest_idx))
    # only compute t1-error (condition on all chr with causals on them)
    #causal_candidates_idx = rest_idx
    test_idx = np.concatenate((chr1_idx, chr2_idx))

    if seed is not None:
        # np.random.seed only accepts values in [0, 2**32 - 1]; the former
        # modulus sys.maxint exceeds that on 64-bit builds and does not exist
        # on Python 3
        np.random.seed(int(seed % 2**32))

    causal_idx = np.random.permutation(causal_candidates_idx)[0:num_causal]

    # generate phenotype
    ###################################################################
    genetic_var = 0.5
    noise_var = 0.5

    y = generate_phenotype(Bed(snp_fn).read(order='C').standardize(), causal_idx, genetic_var, noise_var)
    # guard against accidental modification by the methods under test
    y.flags.writeable = False

    ############### only alter part until here --> modularize this

    # load pcs
    ###################################################################
    logging.info("loading eigendecomp from file %s" % eigen_fn)
    eig_dec = load(eigen_fn)
    G_pc = eig_dec["pcs"]
    G_pc.flags.writeable = False

    G_pc_ = G_pc[:, 0:num_pcs]
    # standardize a copy because G_pc was just made read-only
    G_pc_norm = DiagKtoN(G_pc_.shape[0]).standardize(G_pc_.copy())
    G_pc_norm.flags.writeable = False

    # run feature selection
    #########################################################

    # generate pheno data structure
    pheno = {"iid": snp_reader.iid, "vals": y, "header": []}
    covar = {"iid": snp_reader.iid, "vals": G_pc_norm, "header": []}

    # subset readers
    G0 = snp_reader[:, rest_idx]
    test_snps = snp_reader[:, test_idx]

    result = {}
    fs_result = {}

    # additional methods can be defined and included in the benchmark
    for method_function in methods:
        result_, fs_result_ = method_function(test_snps, pheno, G0, covar)
        result.update(result_)
        fs_result.update(fs_result_)

    # save indices
    indices = {"causal_idx": causal_idx, "chr1_idx": chr1_idx, "chr2_idx": chr2_idx, "input_tuple": input_tuple, "fs_result": fs_result}
    #test_idx

    return result, indices
def combine_results(input_tuple):
    """
    compute performance statistics from p-values of method

    input_tuple is (method, results_fn):
      method     -- key under which each result dict stores the p-values
      results_fn -- file read with load(); yields an iterable of
                    (result, idx) pairs, idx providing "chr1_idx", "chr2_idx"
                    and "causal_idx"

    Returns a dict with the keys "roc", "prc", "t1err", "power", "method"
    and "num_trials" (the number of collected chr1 p-values).
    """

    method, results_fn = input_tuple

    logging.info("reading file: %s" % results_fn)
    output_list = load(results_fn)

    p_values_all = []
    p_values_chr1 = []
    p_values_chr2 = []
    mask_all = []

    t0 = time.time()
    logging.info("concatenating p-values")
    for result, idx in output_list:
        causals_chr2_idx = np.intersect1d(idx["chr2_idx"], idx["causal_idx"])

        assert len(result[method]) == len(idx["chr1_idx"]) + len(idx["chr2_idx"])

        # NOTE(review): the p-value vector is indexed directly with the stored
        # SNP indices; presumably these coincide with positions in the
        # p-value vector -- confirm against the code that produced the file
        p_vals_t1_err = result[method][idx["chr1_idx"]]
        p_vals_power = result[method][causals_chr2_idx]

        p_values_chr1.extend(p_vals_t1_err)
        p_values_chr2.extend(p_vals_power)

        p_values_all.extend(p_vals_t1_err)
        p_values_all.extend(p_vals_power)

        # builtin bool replaces the np.bool alias, which no longer exists in
        # current NumPy releases; the resulting dtype is the same
        mask_t1_err = np.zeros(len(idx["chr1_idx"]), dtype=bool)
        mask_power = np.ones(len(causals_chr2_idx), dtype=bool)

        mask_all.extend(mask_t1_err)
        mask_all.extend(mask_power)

    logging.info("done concatenating p-values (%s)" % (str(time.time()-t0)))
    result = {}

    # p-values are negated before being passed on as scores for roc/prc
    t0 = time.time()
    result["roc"] = gw.compute_roc_data(np.array(mask_all, dtype=bool), -np.array(p_values_all))
    logging.info("computed roc in (%s)" % (str(time.time()-t0)))

    t0 = time.time()
    result["prc"] = gw.compute_prc_data(np.array(mask_all, dtype=bool), -np.array(p_values_all))
    logging.info("computed prc in (%s)" % (str(time.time()-t0)))

    t0 = time.time()
    result["t1err"] = gw.compute_t1err_data(np.array(p_values_chr1), np.zeros(len(p_values_chr1), dtype=bool))
    logging.info("computed t1err in (%s)" % (str(time.time()-t0)))

    t0 = time.time()
    result["power"] = gw.compute_power_data(np.array(p_values_chr2), np.ones(len(p_values_chr2), dtype=bool))
    logging.info("computed power in (%s)" % (str(time.time()-t0)))

    result["method"] = method
    result["num_trials"] = len(p_values_chr1)

    return result
Exemplo n.º 6
0
def merge_results(results_dir, fn_filter_list, mindist):
    """
    visualize gwas results based on results file names

    results_dir    -- directory scanned for files whose name ends in "pickle"
    fn_filter_list -- one substring per method; files whose name contains the
                      substring are pooled and plotted under that label
    mindist        -- passed to cut_snps_close_to_causals as mindist

    Draws PRC, ROC, type-1-error and power panels into one figure and shows
    it; the "lin" curves are drawn for the first filter only. Returns None.
    """

    files = [fn for fn in os.listdir(results_dir) if fn.endswith("pickle")]

    import pylab
    pylab.figure()

    for fn_idx, fn_filter in enumerate(fn_filter_list):

        method_files = [fn for fn in files if fn_filter in fn]

        p_values = []
        p_values_lin = []
        i_causal = []

        for method_fn in method_files:
            tmp_fn = results_dir + "/" + method_fn
            # single-argument print(...) behaves the same on Python 2 and 3
            print(tmp_fn)
            dat = load(tmp_fn)

            pv_m, i_causal_m = cut_snps_close_to_causals(dat["p_values_uncut"],
                                                         dat["pos"],
                                                         dat["causal_idx"],
                                                         mindist=mindist)
            pv_lin_m, i_causal_m2 = cut_snps_close_to_causals(
                dat["p_values_lin_uncut"],
                dat["pos"],
                dat["causal_idx"],
                mindist=mindist)

            # both cuts must yield the same causal indicator
            np.testing.assert_array_equal(i_causal_m, i_causal_m2)

            p_values.extend(pv_m)
            p_values_lin.extend(pv_lin_m)
            i_causal.extend(i_causal_m)

        p_values = np.array(p_values)
        p_values_lin = np.array(p_values_lin)
        i_causal = np.array(i_causal)

        method_label = fn_filter.replace("_",
                                         "")  # underscore prefix hides label
        pylab.subplot(221)
        plot_prc_noshow(i_causal, -p_values, label=method_label)
        if fn_idx == 0:
            plot_prc_noshow(i_causal, -p_values_lin, label="lin")

        pylab.subplot(222)
        plot_roc_noshow(i_causal, -p_values, label=method_label)
        if fn_idx == 0:
            plot_roc_noshow(i_causal, -p_values_lin, label="lin")

        pylab.subplot(223)
        plot_t1err_noshow(p_values, i_causal, label=method_label)
        if fn_idx == 0:
            plot_t1err_noshow(p_values_lin, i_causal, label="lin")

        pylab.subplot(224)
        plot_power_noshow(p_values, i_causal, label=method_label)
        if fn_idx == 0:
            plot_power_noshow(p_values_lin, i_causal, label="lin")

        print(p_values)
        print(i_causal)

    pylab.show()
Exemplo n.º 7
0
def merge_results(results_dir, fn_filter_list, mindist):
    """
    visualize gwas results based on results file names

    results_dir    -- directory scanned for files whose name ends in "pickle"
    fn_filter_list -- one substring per method; files whose name contains the
                      substring are pooled and plotted under that label
    mindist        -- passed to cut_snps_close_to_causals as mindist

    Draws PRC, ROC, type-1-error and power panels into one figure and shows
    it; the "lin" curves are drawn for the first filter only. Returns None.
    """

    files = [fn for fn in os.listdir(results_dir) if fn.endswith("pickle")]

    import pylab
    pylab.figure()

    for fn_idx, fn_filter in enumerate(fn_filter_list):

        method_files = [fn for fn in files if fn_filter in fn]

        p_values = []
        p_values_lin = []
        i_causal = []

        for method_fn in method_files:
            tmp_fn = results_dir + "/" + method_fn
            # single-argument print(...) behaves the same on Python 2 and 3,
            # whereas the print statement is a syntax error on Python 3
            print(tmp_fn)
            dat = load(tmp_fn)

            pv_m, i_causal_m = cut_snps_close_to_causals(dat["p_values_uncut"], dat["pos"], dat["causal_idx"], mindist=mindist)
            pv_lin_m, i_causal_m2 = cut_snps_close_to_causals(dat["p_values_lin_uncut"], dat["pos"], dat["causal_idx"], mindist=mindist)

            # both cuts must yield the same causal indicator
            np.testing.assert_array_equal(i_causal_m, i_causal_m2)

            p_values.extend(pv_m)
            p_values_lin.extend(pv_lin_m)
            i_causal.extend(i_causal_m)

        p_values = np.array(p_values)
        p_values_lin = np.array(p_values_lin)
        i_causal = np.array(i_causal)

        method_label = fn_filter.replace("_", "")  # underscore prefix hides label
        pylab.subplot(221)
        plot_prc_noshow(i_causal, -p_values, label=method_label)
        if fn_idx == 0:
            plot_prc_noshow(i_causal, -p_values_lin, label="lin")

        pylab.subplot(222)
        plot_roc_noshow(i_causal, -p_values, label=method_label)
        if fn_idx == 0:
            plot_roc_noshow(i_causal, -p_values_lin, label="lin")

        pylab.subplot(223)
        plot_t1err_noshow(p_values, i_causal, label=method_label)
        if fn_idx == 0:
            plot_t1err_noshow(p_values_lin, i_causal, label="lin")

        pylab.subplot(224)
        plot_power_noshow(p_values, i_causal, label=method_label)
        if fn_idx == 0:
            plot_power_noshow(p_values_lin, i_causal, label="lin")

        print(p_values)
        print(i_causal)

    pylab.show()
def compute_core(input_tuple):
    """
    Leave-two-chromosome-out evaluation scheme:
    Chr1: no causals, used for T1-error evaluation
    Chr2: has causals, not conditioned on, used for power evaluation
    Rest: has causals, conditioned on
    
      T1   Pow  [     cond     ] 
    ===== ===== ===== .... =====
            x x   x x      xx

    input_tuple is (methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id):
      methods    -- callables invoked as f(test_snps, pheno, G0, covar), each
                    returning a pair (result, fs_result) usable with dict.update
      snp_fn     -- file name handed to pysnptools' Bed reader
      eigen_fn   -- file read with load(); must provide the key "pcs"
      num_causal -- number of causal SNPs drawn from chr2 and the rest
      num_pcs    -- number of leading PC columns used as covariates
      seed       -- seed for numpy's global RNG, or None to leave it untouched
      sim_id     -- not used in this function

    Returns (result, indices): the merged method results and a dict with the
    keys "causal_idx", "chr1_idx", "chr2_idx", "input_tuple" and "fs_result".
    """

    methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id = input_tuple

    # partially load bed file
    from pysnptools.snpreader import Bed
    snp_reader = Bed(snp_fn)

    # determine indices for generation and evaluation
    ##################################################################
    chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(
        snp_reader.pos)

    causal_candidates_idx = np.concatenate((chr2_idx, rest_idx))
    # only compute t1-error (condition on all chr with causals on them)
    #causal_candidates_idx = rest_idx
    test_idx = np.concatenate((chr1_idx, chr2_idx))

    if seed is not None:
        # np.random.seed only accepts values in [0, 2**32 - 1]; the former
        # modulus sys.maxint exceeds that on 64-bit builds and does not exist
        # on Python 3
        np.random.seed(int(seed % 2**32))

    causal_idx = np.random.permutation(causal_candidates_idx)[0:num_causal]

    # generate phenotype
    ###################################################################
    genetic_var = 0.5
    noise_var = 0.5

    y = generate_phenotype(
        Bed(snp_fn).read(order='C').standardize(), causal_idx, genetic_var,
        noise_var)
    # guard against accidental modification by the methods under test
    y.flags.writeable = False

    ############### only alter part until here --> modularize this

    # load pcs
    ###################################################################
    logging.info("loading eigendecomp from file %s" % eigen_fn)
    eig_dec = load(eigen_fn)
    G_pc = eig_dec["pcs"]
    G_pc.flags.writeable = False

    G_pc_ = G_pc[:, 0:num_pcs]
    # standardize a copy because G_pc was just made read-only
    G_pc_norm = DiagKtoN(G_pc_.shape[0]).standardize(G_pc_.copy())
    G_pc_norm.flags.writeable = False

    # run feature selection
    #########################################################

    # generate pheno data structure
    pheno = {"iid": snp_reader.iid, "vals": y, "header": []}
    covar = {"iid": snp_reader.iid, "vals": G_pc_norm, "header": []}

    # subset readers
    G0 = snp_reader[:, rest_idx]
    test_snps = snp_reader[:, test_idx]

    result = {}
    fs_result = {}

    # additional methods can be defined and included in the benchmark
    for method_function in methods:
        result_, fs_result_ = method_function(test_snps, pheno, G0, covar)
        result.update(result_)
        fs_result.update(fs_result_)

    # save indices
    indices = {
        "causal_idx": causal_idx,
        "chr1_idx": chr1_idx,
        "chr2_idx": chr2_idx,
        "input_tuple": input_tuple,
        "fs_result": fs_result
    }
    #test_idx

    return result, indices
def combine_results(input_tuple):
    """
    compute performance statistics from p-values of method

    input_tuple is (method, results_fn):
      method     -- key under which each result dict stores the p-values
      results_fn -- file read with load(); yields an iterable of
                    (result, idx) pairs, idx providing "chr1_idx", "chr2_idx"
                    and "causal_idx"

    Returns a dict with the keys "roc", "prc", "t1err", "power", "method"
    and "num_trials" (the number of collected chr1 p-values).
    """

    method, results_fn = input_tuple

    logging.info("reading file: %s" % results_fn)
    output_list = load(results_fn)

    p_values_all = []
    p_values_chr1 = []
    p_values_chr2 = []
    mask_all = []

    t0 = time.time()
    logging.info("concatenating p-values")
    for result, idx in output_list:
        causals_chr2_idx = np.intersect1d(idx["chr2_idx"], idx["causal_idx"])

        assert len(
            result[method]) == len(idx["chr1_idx"]) + len(idx["chr2_idx"])

        # NOTE(review): the p-value vector is indexed directly with the stored
        # SNP indices; presumably these coincide with positions in the
        # p-value vector -- confirm against the code that produced the file
        p_vals_t1_err = result[method][idx["chr1_idx"]]
        p_vals_power = result[method][causals_chr2_idx]

        p_values_chr1.extend(p_vals_t1_err)
        p_values_chr2.extend(p_vals_power)

        p_values_all.extend(p_vals_t1_err)
        p_values_all.extend(p_vals_power)

        # builtin bool replaces the np.bool alias, which no longer exists in
        # current NumPy releases; the resulting dtype is the same
        mask_t1_err = np.zeros(len(idx["chr1_idx"]), dtype=bool)
        mask_power = np.ones(len(causals_chr2_idx), dtype=bool)

        mask_all.extend(mask_t1_err)
        mask_all.extend(mask_power)

    logging.info("done concatenating p-values (%s)" % (str(time.time() - t0)))
    result = {}

    # p-values are negated before being passed on as scores for roc/prc
    t0 = time.time()
    result["roc"] = gw.compute_roc_data(np.array(mask_all, dtype=bool),
                                        -np.array(p_values_all))
    logging.info("computed roc in (%s)" % (str(time.time() - t0)))

    t0 = time.time()
    result["prc"] = gw.compute_prc_data(np.array(mask_all, dtype=bool),
                                        -np.array(p_values_all))
    logging.info("computed prc in (%s)" % (str(time.time() - t0)))

    t0 = time.time()
    result["t1err"] = gw.compute_t1err_data(
        np.array(p_values_chr1), np.zeros(len(p_values_chr1), dtype=bool))
    logging.info("computed t1err in (%s)" % (str(time.time() - t0)))

    t0 = time.time()
    result["power"] = gw.compute_power_data(
        np.array(p_values_chr2), np.ones(len(p_values_chr2), dtype=bool))
    logging.info("computed power in (%s)" % (str(time.time() - t0)))

    result["method"] = method
    result["num_trials"] = len(p_values_chr1)

    return result