def precompute_pca(self):
    """
    Compute principal components on the training SNPs (all chromosomes
    except 1 and 2) and cache the result to self.eigen_fn.
    """

    logging.info("computing PCA on train set")
    t0 = time.time()

    if not os.path.isfile(self.eigen_fn) or self.force_recompute:

        # read genotypes, standardize, and freeze the array against
        # accidental modification
        G = self.snp_reader.read(order='C').standardize().val
        G.flags.writeable = False

        # hold out chromosomes 1 and 2; fit the PCA on the remaining SNPs
        chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(self.snp_reader.pos)
        G_train = G.take(rest_idx, axis=1)

        from sklearn.decomposition import PCA
        pca = PCA()
        pcs = pca.fit_transform(G_train)

        logging.info("saving eigendecomp to file %s" % self.eigen_fn)
        eig_dec = {"pcs": pcs}
        save(self.eigen_fn, eig_dec)

        logging.info("time taken for pc computation: " + str(time.time() - t0))
    else:
        logging.info("pc file already exists: %s" % self.eigen_fn)
def run(self, methods, num_causal, num_repeats, num_pcs, description, runner, seed=None, plot_fn=None):
    """
    Map one simulation job per repeat over the given runner, save the raw
    results, then combine them per method and visualize the summary.
    """

    self.precompute_pca()

    # map phase: one compute_core job per simulation repeat
    input_files = [self.snp_fn + ext for ext in [".bed", ".fam", ".bim"]] + [self.eigen_fn]
    input_args = [(methods, self.snp_fn, self.eigen_fn, num_causal, num_pcs, seed, sim_id) for sim_id in range(num_repeats)]
    output_list = distributed_map.d_map(semisynth_simulations.compute_core, input_args, runner, input_files=input_files)

    results_fn = "%s_results.runs_%i.causals_%i.pickle.bzip" % (description, num_repeats, num_causal)
    reduced_results_fn = results_fn.replace("runs", "reduced.runs")

    save(results_fn, output_list)

    # reduce phase: combine results per method (method names are read from
    # the first run's output; renamed to avoid shadowing the parameter)
    method_names = output_list[0][0].keys()
    arg_list = [(method, results_fn) for method in method_names]

    #reduce_runner = Hadoop(len(methods), mapmemory=90*1024, reducememory=90*1024, mkl_num_threads=1, queue="shared")
    reduce_runner = Local()
    combine_output = distributed_map.d_map(semisynth_simulations.combine_results, arg_list, reduce_runner, input_files=[results_fn])

    save(reduced_results_fn, combine_output)
    title = "%i causal, %i repeats" % (num_causal, num_repeats)
    visualize_reduced_results(method_names, combine_output, title=title, plot_fn=plot_fn)

    return combine_output
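A minimal sketch of the map/reduce shape that run() follows, with an in-process loop standing in for distributed_map.d_map; every name below (d_map_local, compute_stub) is illustrative, not part of the library:

def d_map_local(func, arg_list):
    # in-process stand-in for distributed_map.d_map
    return [func(args) for args in arg_list]

def compute_stub(args):
    method, sim_id = args
    return {method: sim_id}                             # per-repeat result

per_run = d_map_local(compute_stub, [("lmm", i) for i in range(3)])   # map
combined = {}                                                         # reduce
for result in per_run:
    for method, value in result.items():
        combined.setdefault(method, []).append(value)
print(combined)   # {'lmm': [0, 1, 2]}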
Example No. 4
def reduce(self, result_sequence):
    '''
    Persist each (index, pcs) pair produced by the map phase to its own
    cache file.
    '''
    for i, pcs in result_sequence:
        out_fn = self.create_out_fn(self.cache_prefix, i)
        util.create_directory_if_necessary(out_fn)
        save(out_fn, pcs)
    return None
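A self-contained stand-in for the reduce() pattern above, with pickle and an explicit path template replacing save() and create_out_fn (the function name and path scheme are illustrative assumptions):

import os
import pickle

def reduce_to_cache(result_sequence, cache_prefix="cache/pcs"):
    for i, pcs in result_sequence:
        out_fn = "%s.%d.pickle" % (cache_prefix, i)
        os.makedirs(os.path.dirname(out_fn), exist_ok=True)  # ensure dir exists
        with open(out_fn, "wb") as f:
            pickle.dump(pcs, f)                              # one file per result
    return None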
Example No. 6
    def eval_gwas(self, causal_idx, out_fn=None, plot=False, mindist=10.0):
        """
        Wrapper around the module-level eval_gwas function: evaluate the
        stored p-values against the causal SNPs, attach the model's
        metadata, and save the result when out_fn is given.
        """

        res = eval_gwas(self.p_values, self.pos, causal_idx, mindist=mindist, out_fn=out_fn, plot=plot)
        res["delta"] = self.delta
        res["num_pcs"] = self.num_pcs
        res["num_selected_snps"] = self.selected_snps
        res["mixing"] = self.mixing

        if out_fn is not None:  # guard: save() would fail on a None path
            from fastlmm.util.pickle_io import save
            save(out_fn, res)

        return res
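The attach-metadata-then-save pattern used by this wrapper, reduced to plain Python (annotate_and_save and all values are illustrative; pickle stands in for fastlmm's save):

import pickle

def annotate_and_save(res, meta, out_fn=None):
    res.update(meta)                       # attach model metadata to the results
    if out_fn is not None:                 # persist only when a path is given
        with open(out_fn, "wb") as f:
            pickle.dump(res, f)
    return res

print(annotate_and_save({"p_values": [0.01, 0.5]}, {"delta": 1.0, "num_pcs": 5}))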
def simulate_ascertained(methods, prevalence, iid_count, num_causal, num_repeats, description, snp_args, phenotype_args, runner=Local(), seed=None, plot_fn=None):
    """
    run a synthetic simulation using ascertained data

    :param methods: A list of functions implementing methods to be compared.
    :type methods: list<function>

    :param prevalence: Prior probability of a case, e.g. 0.1
    :type prevalence: a float between 0.0 and 1.0 (exclusive)

    :param iid_count: The number of individuals to generate.
    :type iid_count: int

    :param num_causal: The number of causal SNPs in the simulation.
    :type num_causal: int

    :param num_repeats: The number of repeats in the simulation.
    :type num_repeats: int

    :param description: Short description string of the experiment (used to name output files)
    :type description: str

    :param snp_args: arguments for an internal call to :func:`GWAS_benchmark.snp_gen`. Do not include
        'iid_count' or 'seed'
    :type snp_args: dictionary

    :param phenotype_args: arguments for an internal call to :func:`.generate_phenotype`. Do not include
        'snp_count' or 'seed'
    :type phenotype_args: dictionary

    :param runner: a Runner object (e.g. Local, Hadoop, HPC)
    :type runner: Runner

    :param seed: a random seed to control random number generation
    :type seed: int

    :param plot_fn: filename under which to save the output figure
    :type plot_fn: str
    """

    # map phase: one compute_core_ascertained job per simulation repeat
    input_args = [(methods, num_causal, prevalence, iid_count, snp_args, phenotype_args, seed, sim_id) for sim_id in range(num_repeats)]
    output_list = distributed_map.d_map(semisynth_simulations.compute_core_ascertained, input_args, runner)

    results_fn = "%s_ascertained_results.runs_%i.causals_%i.pickle.bzip" % (description, num_repeats, num_causal)
    reduced_results_fn = results_fn.replace("runs", "reduced.runs")

    save(results_fn, output_list)

    # reduce phase: combine results per method, always locally
    method_names = output_list[0][0].keys()
    arg_list = [(method, results_fn) for method in method_names]

    combine_output = distributed_map.d_map(semisynth_simulations.combine_results, arg_list, Local(), input_files=[results_fn])

    save(reduced_results_fn, combine_output)
    title = "%i causal, %i repeats" % (num_causal, num_repeats)
    visualize_reduced_results(method_names, combine_output, title=title, plot_fn=plot_fn)

    return combine_output
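A quick, runnable check of the results-file naming scheme used above (the example values are illustrative):

description, num_repeats, num_causal = "demo", 10, 50
results_fn = "%s_ascertained_results.runs_%i.causals_%i.pickle.bzip" % (
    description, num_repeats, num_causal)
print(results_fn)
# demo_ascertained_results.runs_10.causals_50.pickle.bzip
print(results_fn.replace("runs", "reduced.runs"))
# demo_ascertained_results.reduced.runs_10.causals_50.pickle.bzip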