def precompute_pca(self):
    """ compute PCs on the training SNPs (chromosomes other than 1 and 2) and cache them to self.eigen_fn """
    logging.info("computing PCA on train set")
    t0 = time.time()

    if not os.path.isfile(self.eigen_fn) or self.force_recompute:
        G = self.snp_reader.read(order='C').standardize().val
        G.flags.writeable = False

        # hold out chromosomes 1 and 2; PCs are computed on the remaining SNPs
        chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(self.snp_reader.pos)
        G_train = G.take(rest_idx, axis=1)

        from sklearn.decomposition import PCA
        pca = PCA()
        pcs = pca.fit_transform(G_train)

        logging.info("saving eigendecomp to file %s" % self.eigen_fn)
        eig_dec = {"pcs": pcs}
        save(self.eigen_fn, eig_dec)

        logging.info("time taken for pc computation: " + str(time.time() - t0))
    else:
        logging.info("pc file already exists: %s" % self.eigen_fn)
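# A minimal sketch of how the cached eigendecomposition could be reused later
# (hedged: it assumes fastlmm.util.pickle_io provides a `load` counterpart to the
# `save` used above; variable names are illustrative):
#
#     from fastlmm.util.pickle_io import load
#     eig_dec = load(self.eigen_fn)      # {"pcs": array of shape (iid_count, n_components)}
#     pcs = eig_dec["pcs"][:, :num_pcs]  # keep the leading num_pcs principal components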
def run(self, methods, num_causal, num_repeats, num_pcs, description, runner, seed=None, plot_fn=None):
    """ map each simulation repeat over the runner, then combine the per-method results """
    self.precompute_pca()

    input_files = [self.snp_fn + ext for ext in [".bed", ".fam", ".bim"]] + [self.eigen_fn]
    input_args = [(methods, self.snp_fn, self.eigen_fn, num_causal, num_pcs, seed, sim_id)
                  for sim_id in range(num_repeats)]
    output_list = distributed_map.d_map(semisynth_simulations.compute_core, input_args,
                                        runner, input_files=input_files)

    ############################################
    results_fn = "%s_results.runs_%i.causals_%i.pickle.bzip" % (description, num_repeats, num_causal)
    reduced_results_fn = results_fn.replace("runs", "reduced.runs")

    save(results_fn, output_list)

    methods = output_list[0][0].keys()
    arg_list = [(method, results_fn) for method in methods]

    #reduce_runner = Hadoop(len(methods), mapmemory=90*1024, reducememory=90*1024, mkl_num_threads=1, queue="shared")
    reduce_runner = Local()
    combine_output = distributed_map.d_map(semisynth_simulations.combine_results, arg_list,
                                           reduce_runner, input_files=[results_fn])

    save(reduced_results_fn, combine_output)

    title = "%i causal, %i repeats" % (num_causal, num_repeats)
    visualize_reduced_results(methods, combine_output, title=title, plot_fn=plot_fn)

    return combine_output
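# A minimal usage sketch for run() (hedged: the class name, constructor arguments,
# and method functions below are illustrative assumptions, not names from this file;
# Local is the runner class already used above):
#
#     sim = SemisynthSimulation(snp_fn="data/mouse/alldata")  # hypothetical constructor
#     combine_output = sim.run(methods=[linreg_method, lmm_method],
#                              num_causal=500, num_repeats=10, num_pcs=5,
#                              description="mouse_test", runner=Local(),
#                              seed=42, plot_fn="mouse_test.png")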
def reduce(self, result_sequence):
    ''' save each (index, pcs) pair from the result sequence to its own cache file '''
    for i, pcs in result_sequence:
        out_fn = self.create_out_fn(self.cache_prefix, i)
        util.create_directory_if_necessary(out_fn)
        save(out_fn, pcs)
    return None
def eval_gwas(self, causal_idx, out_fn=None, plot=False, mindist=10.0):
    """ wrapper around the module-level eval_gwas that attaches run metadata and saves the result """
    res = eval_gwas(self.p_values, self.pos, causal_idx, mindist=mindist, out_fn=out_fn, plot=plot)

    # attach run metadata to the evaluation result
    res["delta"] = self.delta
    res["num_pcs"] = self.num_pcs
    res["num_selected_snps"] = self.selected_snps
    res["mixing"] = self.mixing

    if out_fn is not None:
        from fastlmm.util.pickle_io import save
        save(out_fn, res)

    return res
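# A minimal usage sketch for the wrapper above (hedged: `gwas` stands for whatever
# object of this class holds the GWAS results; the output file name is illustrative):
#
#     res = gwas.eval_gwas(causal_idx, out_fn="eval_result.pickle.bzip", plot=False)
#     print(res["delta"], res["num_pcs"], res["mixing"])  # metadata added by the wrapper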
def simulate_ascertained(methods, prevalence, iid_count, num_causal, num_repeats,
                         description, snp_args, phenotype_args, runner=Local(),
                         seed=None, plot_fn=None):
    """
    run a synthetic simulation using ascertained data

    :param methods: A list of functions implementing methods to be compared.
    :type methods: list<function>

    :param prevalence: Prior probability of a case, e.g. .1
    :type prevalence: a float between 0.0 and 1.0 (exclusive)

    :param iid_count: The number of individuals to generate.
    :type iid_count: int

    :param num_causal: The number of causal SNPs in the simulation.
    :type num_causal: int

    :param num_repeats: The number of repeats in the simulation.
    :type num_repeats: int

    :param description: Short description string of experiment (for output)
    :type description: str

    :param snp_args: arguments for an internal call to :func:`GWAS_benchmark.snp_gen`. Do not include 'iid_count' or 'seed'
    :type snp_args: dictionary

    :param phenotype_args: arguments for an internal call to :func:`.generate_phenotype`. Do not include 'snp_count' or 'seed'
    :type phenotype_args: dictionary

    :param runner: a Runner object (e.g. Local, Hadoop, HPC)
    :type runner: Runner

    :param seed: a random seed to control random number generation
    :type seed: int

    :param plot_fn: filename under which to save the output figure
    :type plot_fn: str
    """
    input_args = [(methods, num_causal, prevalence, iid_count, snp_args, phenotype_args, seed, sim_id)
                  for sim_id in range(num_repeats)]
    output_list = distributed_map.d_map(semisynth_simulations.compute_core_ascertained, input_args, runner)

    ############################################
    results_fn = "%s_ascertained_results.runs_%i.causals_%i.pickle.bzip" % (description, num_repeats, num_causal)
    reduced_results_fn = results_fn.replace("runs", "reduced.runs")

    save(results_fn, output_list)

    methods = output_list[0][0].keys()
    arg_list = [(method, results_fn) for method in methods]

    combine_output = distributed_map.d_map(semisynth_simulations.combine_results, arg_list,
                                           Local(), input_files=[results_fn])

    save(reduced_results_fn, combine_output)

    title = "%i causal, %i repeats" % (num_causal, num_repeats)
    visualize_reduced_results(methods, combine_output, title=title, plot_fn=plot_fn)

    return combine_output
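# A minimal usage sketch for simulate_ascertained (hedged: the method functions and
# the snp_args / phenotype_args keys below are illustrative assumptions; consult
# GWAS_benchmark.snp_gen and generate_phenotype for the exact argument names):
#
#     snp_args = {"fst": 0.1, "dfr": 0.5, "sid_count": 5000}    # forwarded to snp_gen
#     phenotype_args = {"genetic_var": 0.5, "noise_var": 0.5}   # forwarded to generate_phenotype
#     results = simulate_ascertained(methods=[linreg_method, lmm_method],
#                                    prevalence=0.1, iid_count=2000,
#                                    num_causal=50, num_repeats=10,
#                                    description="ascertained_test",
#                                    snp_args=snp_args, phenotype_args=phenotype_args,
#                                    runner=Local(), seed=42,
#                                    plot_fn="ascertained_test.png")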