def dowork(self, i, train_snp_idx, test_snp_idx, result, G, y):
    """Run the GWAS for one train/test SNP split and return its p-values.

    Args:
        i: Identifier of the split. Only used to build the name of the
            precomputed PC file via
            ``PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)``.
        train_snp_idx: Column indices of ``G`` that form the training side
            of the split. If ``self.selected_snps`` is set, only the
            training SNPs whose names appear in it are kept.
        test_snp_idx: Column indices of ``G`` that are tested.
        result: Passed through unchanged as the third return element.
        G: 2-D array whose columns are indexed by the SNP indices above.
            Must not be Fortran-ordered.
        y: Handed to ``FastGwas`` unchanged (presumably the phenotype --
            confirm against ``FastGwas``).

    Returns:
        Tuple ``(test_snp_idx, gwas.p_values, result)``.

    Raises:
        AssertionError: If ``self.num_pcs`` is non-zero but
            ``self.pc_prefix`` is ``None`` (PCs must be precomputed), if
            ``G`` is Fortran-ordered, or if the number of p-values differs
            from ``len(test_snp_idx)``. The last two are plain ``assert``
            statements and are skipped under ``python -O``.
    """
    logging.info("{0}, {1}".format(len(train_snp_idx), len(test_snp_idx)))

    # Intersect the selected SNPs with the training SNPs (by SNP name).
    if self.selected_snps is not None:
        logging.info("intersecting train snps with selected snps for LOCO")
        int_snp_idx = argintersect_left(self.snp_reader.rs[train_snp_idx], self.selected_snps)
        sim_keeper_idx = np.array(train_snp_idx)[int_snp_idx]
    else:
        sim_keeper_idx = train_snp_idx

    # Subset data; the original author notes that fast column indexing
    # needs C-order.
    assert not np.isfortran(G)

    # When only PCs are used the training matrix is discarded anyway, so
    # skip the (potentially large) copy made by take().
    G_sim = None if self.pcs_only else G.take(sim_keeper_idx, axis=1)
    G_test = G.take(test_snp_idx, axis=1)

    t0 = time.time()
    if self.num_pcs == 0:
        pcs = None
    elif self.pc_prefix is not None:
        out_fn = PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
        logging.info("loading pc from file: %s" % out_fn)
        pcs = load(out_fn)[:, 0:self.num_pcs]
        logging.info("..done")
    else:
        # Raised explicitly (instead of ``assert False``) so the check
        # survives ``python -O``; otherwise ``pcs`` would be left unbound.
        raise AssertionError("please precompute PCs")
    logging.info("done after %.4f seconds" % (time.time() - t0))

    if self.pcs_only:
        logging.info("Using PCs only in LocoGWAS")

    gwas = FastGwas(G_sim, G_test, y, self.delta, train_pcs=pcs, mixing=self.mixing)
    gwas.run_gwas()

    assert len(gwas.p_values) == len(test_snp_idx)

    # Wrap up results.
    return test_snp_idx, gwas.p_values, result
def execute_fs(test_snps, pheno, G0, covar):
    """Run feature selection on ``G0`` and test ``test_snps`` against the result.

    The phenotype and covariates are written to space-separated,
    header-less text files in the current working directory, feature
    selection is run locally on them, and ``single_snp`` is then run twice
    with the selected SNPs as ``G0``: once without and once with ``covar``.

    Args:
        test_snps: Passed to ``single_snp`` as the SNPs to test.
        pheno: Mapping with a ``"vals"`` entry holding the phenotype values.
        G0: SNP data exposing ``iid`` (two columns of individual ids),
            ``sid`` (SNP names) and column indexing ``G0[:, idx]``.
        covar: Mapping with a ``"vals"`` entry holding a 2-D array with one
            column per covariate.

    Returns:
        Tuple ``(result, fs_result)``. ``result`` maps ``"fs_all"`` and
        ``"fs_all_cov"`` to arrays of p-values ordered by
        ``("Chr", "ChrPos")``; ``fs_result`` maps ``"result_uncond_all"``
        to the raw output of the feature-selection run.
    """
    result = {}
    fs_result = {}

    # fs unconditioned
    ########################
    # NOTE(review): the tmp_* files below are left on disk after the run;
    # confirm whether they may be removed once feature selection is done.
    tmp_uuid = str(uuid.uuid4())[0:13]
    out_fn = "tmp_pheno_%s.txt" % (tmp_uuid)
    # The file has no header, so the column order is the file format:
    # pin it explicitly instead of relying on dict ordering.
    out_data = pd.DataFrame(
        {"id1": G0.iid[:, 0], "id2": G0.iid[:, 1], "y": pheno["vals"]},
        columns=["id1", "id2", "y"],
    )
    out_data.to_csv(out_fn, sep=" ", header=False, index=False)

    # Write out covariates: two id columns followed by one column per
    # covariate.
    cov_columns = [
        ("id1", G0.iid[:, 0]),
        ("id2", G0.iid[:, 1]),
    ]
    cov_columns += [
        ("pc_%i" % i, covar["vals"][:, i])
        for i in range(covar["vals"].shape[1])
    ]
    # DataFrame.from_items was removed from pandas; an explicit ``columns``
    # list keeps the same column order on every pandas version.
    cov_df = pd.DataFrame(
        dict(cov_columns),
        columns=[name for name, _ in cov_columns],
    )
    cov_fn = "tmp_cov_%s.txt" % (tmp_uuid)
    cov_df.to_csv(cov_fn, sep=" ", header=False, index=False)

    # TODO: fix include_all!!
    fsd = create_feature_selection_distributable(
        G0, out_fn, None, 0, "fs_out", include_all=False, cov_fn=cov_fn)
    fs_result["result_uncond_all"] = Local().run(fsd)
    # Only the selected SNP names are needed here; the unpacking still
    # checks that the run returned four values.
    _best_k, _best_delta, _best_obj, best_snps = fs_result["result_uncond_all"]
    fs_idx = argintersect_left(G0.sid, best_snps)
    G_fs = G0[:, fs_idx]

    # DataFrame.sort / .as_matrix() were removed from pandas; sort_values
    # and .values are the equivalents (assumes single_snp returns a pandas
    # DataFrame, as the original chained calls imply).
    sort_keys = ["Chr", "ChrPos"]
    result["fs_all"] = single_snp(
        test_snps, pheno, G0=G_fs).sort_values(sort_keys)["PValue"].values
    result["fs_all_cov"] = single_snp(
        test_snps, pheno, G0=G_fs, covar=covar).sort_values(sort_keys)["PValue"].values

    return result, fs_result
def dowork(self, i, train_snp_idx, test_snp_idx, result, G, y):
    """Run the GWAS for one train/test SNP split and return its p-values.

    Args:
        i: Identifier of the split. Only used to build the name of the
            precomputed PC file via
            ``PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)``.
        train_snp_idx: Column indices of ``G`` that form the training side
            of the split. If ``self.selected_snps`` is set, only the
            training SNPs whose names appear in it are kept.
        test_snp_idx: Column indices of ``G`` that are tested.
        result: Passed through unchanged as the third return element.
        G: 2-D array whose columns are indexed by the SNP indices above.
            Must not be Fortran-ordered.
        y: Handed to ``FastGwas`` unchanged (presumably the phenotype --
            confirm against ``FastGwas``).

    Returns:
        Tuple ``(test_snp_idx, gwas.p_values, result)``.

    Raises:
        AssertionError: If ``self.num_pcs`` is non-zero but
            ``self.pc_prefix`` is ``None`` (PCs must be precomputed), if
            ``G`` is Fortran-ordered, or if the number of p-values differs
            from ``len(test_snp_idx)``. The last two are plain ``assert``
            statements and are skipped under ``python -O``.
    """
    logging.info("{0}, {1}".format(len(train_snp_idx), len(test_snp_idx)))

    # Intersect the selected SNPs with the training SNPs (by SNP name).
    if self.selected_snps is not None:
        logging.info("intersecting train snps with selected snps for LOCO")
        int_snp_idx = argintersect_left(self.snp_reader.rs[train_snp_idx], self.selected_snps)
        sim_keeper_idx = np.array(train_snp_idx)[int_snp_idx]
    else:
        sim_keeper_idx = train_snp_idx

    # Subset data; the original author notes that fast column indexing
    # needs C-order.
    assert not np.isfortran(G)

    # When only PCs are used the training matrix is discarded anyway, so
    # skip the (potentially large) copy made by take().
    G_sim = None if self.pcs_only else G.take(sim_keeper_idx, axis=1)
    G_test = G.take(test_snp_idx, axis=1)

    t0 = time.time()
    if self.num_pcs == 0:
        pcs = None
    elif self.pc_prefix is not None:
        out_fn = PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
        logging.info("loading pc from file: %s" % out_fn)
        pcs = load(out_fn)[:, 0:self.num_pcs]
        logging.info("..done")
    else:
        # Raised explicitly (instead of ``assert False``) so the check
        # survives ``python -O``; otherwise ``pcs`` would be left unbound.
        raise AssertionError("please precompute PCs")
    logging.info("done after %.4f seconds" % (time.time() - t0))

    if self.pcs_only:
        logging.info("Using PCs only in LocoGWAS")

    gwas = FastGwas(G_sim, G_test, y, self.delta, train_pcs=pcs, mixing=self.mixing)
    gwas.run_gwas()

    assert len(gwas.p_values) == len(test_snp_idx)

    # Wrap up results.
    return test_snp_idx, gwas.p_values, result
def execute_fs(test_snps, pheno, G0, covar):
    """Run feature selection on ``G0`` and test ``test_snps`` against the result.

    The phenotype and covariates are written to space-separated,
    header-less text files in the current working directory, feature
    selection is run locally on them, and ``single_snp`` is then run twice
    with the selected SNPs as ``G0``: once without and once with ``covar``.

    Args:
        test_snps: Passed to ``single_snp`` as the SNPs to test.
        pheno: Mapping with a ``"vals"`` entry holding the phenotype values.
        G0: SNP data exposing ``iid`` (two columns of individual ids),
            ``sid`` (SNP names) and column indexing ``G0[:, idx]``.
        covar: Mapping with a ``"vals"`` entry holding a 2-D array with one
            column per covariate.

    Returns:
        Tuple ``(result, fs_result)``. ``result`` maps ``"fs_all"`` and
        ``"fs_all_cov"`` to arrays of p-values ordered by
        ``("Chr", "ChrPos")``; ``fs_result`` maps ``"result_uncond_all"``
        to the raw output of the feature-selection run.
    """
    result = {}
    fs_result = {}

    # fs unconditioned
    ########################
    # NOTE(review): the tmp_* files below are left on disk after the run;
    # confirm whether they may be removed once feature selection is done.
    tmp_uuid = str(uuid.uuid4())[0:13]
    out_fn = "tmp_pheno_%s.txt" % (tmp_uuid)
    # The file has no header, so the column order is the file format:
    # pin it explicitly instead of relying on dict ordering.
    out_data = pd.DataFrame(
        {"id1": G0.iid[:, 0], "id2": G0.iid[:, 1], "y": pheno["vals"]},
        columns=["id1", "id2", "y"],
    )
    out_data.to_csv(out_fn, sep=" ", header=False, index=False)

    # Write out covariates: two id columns followed by one column per
    # covariate.
    cov_columns = [
        ("id1", G0.iid[:, 0]),
        ("id2", G0.iid[:, 1]),
    ]
    cov_columns += [
        ("pc_%i" % i, covar["vals"][:, i])
        for i in range(covar["vals"].shape[1])
    ]
    # DataFrame.from_items was removed from pandas; an explicit ``columns``
    # list keeps the same column order on every pandas version.
    cov_df = pd.DataFrame(
        dict(cov_columns),
        columns=[name for name, _ in cov_columns],
    )
    cov_fn = "tmp_cov_%s.txt" % (tmp_uuid)
    cov_df.to_csv(cov_fn, sep=" ", header=False, index=False)

    # TODO: fix include_all!!
    fsd = create_feature_selection_distributable(
        G0, out_fn, None, 0, "fs_out", include_all=False, cov_fn=cov_fn)
    fs_result["result_uncond_all"] = Local().run(fsd)
    # Only the selected SNP names are needed here; the unpacking still
    # checks that the run returned four values.
    _best_k, _best_delta, _best_obj, best_snps = fs_result["result_uncond_all"]
    fs_idx = argintersect_left(G0.sid, best_snps)
    G_fs = G0[:, fs_idx]

    # DataFrame.sort / .as_matrix() were removed from pandas; sort_values
    # and .values are the equivalents (assumes single_snp returns a pandas
    # DataFrame, as the original chained calls imply).
    sort_keys = ["Chr", "ChrPos"]
    result["fs_all"] = single_snp(
        test_snps, pheno, G0=G_fs).sort_values(sort_keys)["PValue"].values
    result["fs_all_cov"] = single_snp(
        test_snps, pheno, G0=G_fs, covar=covar).sort_values(sort_keys)["PValue"].values

    return result, fs_result