Пример #1
0
    def test_one(self):
        """Check that Local and LocalFromRanges runners return "A" for
        hand-picked A work items, then fuzz LocalFromRanges with 1000
        seeded random range configurations."""
        # fixed, hand-picked cases
        assert "A" == Local().run(A(1, 0))
        assert "A" == Local().run(A(10, 0))
        assert "A" == Local().run(A(3, 10))
        assert "A" == LocalFromRanges([2, 4]).run(A(10, 0))
        assert "A" == LocalFromRanges([2, 9, 10]).run(A(3, 0))
        assert "A" == LocalFromRanges([2, 9, 10]).run(A(3, 10))

        np.random.seed(0)  # deterministic fuzzing across runs
        for i in xrange(1000):
            end = int(np.random.uniform(low=1, high=15))
            a_work_count = int(np.random.uniform(low=1, high=15))
            b_work_count = int(np.random.uniform(low=0, high=3))
            extra_steps = int(np.random.uniform(low=0, high=end - 1))
            if end > 1:
                # 'steps' (was 'list', which shadowed the builtin) holds the
                # sorted intermediate range boundaries plus the final 'end'.
                # NOTE(review): np.random.random_integers is deprecated in
                # modern NumPy (use randint); kept to preserve the seeded
                # random sequence.
                steps = sorted(
                    np.random.random_integers(low=1,
                                              high=end - 1,
                                              size=extra_steps))
            else:
                steps = []
            steps.append(end)
            if i % 100 == 0:
                logging.info("random test case # {0}".format(i))
            assert "A" == LocalFromRanges(steps).run(
                A(a_work_count, b_work_count))
Пример #2
0
def execute_fs(test_snps, pheno, G0, covar):
    """
    run feature selection

    Writes the phenotype and covariates to temporary whitespace-separated
    files, runs an unconditioned feature-selection distributable through a
    Local runner, then re-tests the selected SNP subset with single_snp,
    both with and without covariates.

    :param test_snps: SNPs to test after selection
    :param pheno: phenotype; a mapping with a "vals" entry
    :param G0: SNP data with .iid (n x 2 id columns) and .sid (SNP names)
    :param covar: covariates; a mapping whose "vals" is a 2-D array
    :returns: (result, fs_result) — p-value arrays and raw selection output
    """

    result = {}
    fs_result = {}

    # fs unconditioned
    ########################
    # short uuid fragment keeps concurrent runs from clobbering each other's
    # temp files; NOTE(review): these temp files are never removed — confirm
    # whether cleanup is expected elsewhere
    tmp_uuid = str(uuid.uuid4())[0:13]
    out_fn = "tmp_pheno_%s.txt" % (tmp_uuid)
    out_data = pd.DataFrame({
        "id1": G0.iid[:, 0],
        "id2": G0.iid[:, 1],
        "y": pheno["vals"]
    })
    out_data.to_csv(out_fn, sep=" ", header=False, index=False)

    # write out covariates
    items = [
        ('id1', G0.iid[:, 0]),
        ('id2', G0.iid[:, 1]),
    ]

    items += [("pc_%i" % i, covar["vals"][:, i])
              for i in xrange(covar["vals"].shape[1])]
    # NOTE(review): DataFrame.from_items is removed in modern pandas
    # (use pd.DataFrame.from_dict with an OrderedDict); kept for the
    # pinned legacy version this file targets
    cov_df = pd.DataFrame.from_items(items)
    cov_fn = "tmp_cov_%s.txt" % (tmp_uuid)
    cov_df.to_csv(cov_fn, sep=" ", header=False, index=False)

    #TODO: fix include_all!!
    fsd = create_feature_selection_distributable(G0,
                                                 out_fn,
                                                 None,
                                                 0,
                                                 "fs_out",
                                                 include_all=False,
                                                 cov_fn=cov_fn)
    fs_result["result_uncond_all"] = Local().run(fsd)
    best_k, best_delta, best_obj, best_snps = fs_result["result_uncond_all"]
    # indices (into G0.sid) of the SNPs chosen by feature selection
    fs_idx = argintersect_left(G0.sid, best_snps)

    G_fs = G0[:, fs_idx]

    # p-values on the selected SNPs, sorted by genomic position
    # NOTE(review): DataFrame.sort and .as_matrix are removed in modern
    # pandas (sort_values / .values) — kept for the pinned legacy version
    result["fs_all"] = single_snp(test_snps, pheno,
                                  G0=G_fs).sort(["Chr", "ChrPos"
                                                 ])["PValue"].as_matrix()
    result["fs_all_cov"] = single_snp(
        test_snps, pheno, G0=G_fs,
        covar=covar).sort(["Chr", "ChrPos"])["PValue"].as_matrix()

    return result, fs_result
Пример #3
0
    def test_local_single(self):
        """Run d_map through a 4-way Local runner and verify the expected
        outputs come back in order."""
        local_runner = Local(4)
        expected = [
            '', 'A', 'AA', 'AAA', 'AAAB', 'AAABB', 'AAABBB', 'AAABBBC',
            'AAABBBCC', 'AAABBBCCC'
        ]
        actual = d_map(dummy, self.args, local_runner, input_files=[self.fn])
        assert expected == actual
    def run(self,
            methods,
            num_causal,
            num_repeats,
            num_pcs,
            description,
            runner,
            seed=None,
            plot_fn=None):
        """Distribute the simulation repeats, save raw and reduced results,
        and visualize the combined output.

        :param methods: methods to compare, forwarded to each map worker
        :param num_causal: number of causal SNPs per simulation
        :param num_repeats: number of simulation repeats (one map task each)
        :param num_pcs: number of principal components
        :param description: experiment label used in the output file names
        :param runner: runner object used for the map phase
        :param seed: random seed forwarded to each worker
        :param plot_fn: optional filename for the result figure
        :returns: the combined (reduced) output list
        """

        self.precompute_pca()

        # files every worker needs: the PLINK bed/fam/bim triple plus the
        # precomputed eigen decomposition
        input_files = [self.snp_fn + ext
                       for ext in [".bed", ".fam", ".bim"]] + [self.eigen_fn]
        input_args = [(methods, self.snp_fn, self.eigen_fn, num_causal,
                       num_pcs, seed, sim_id) for sim_id in range(num_repeats)]
        output_list = distributed_map.d_map(semisynth_simulations.compute_core,
                                            input_args,
                                            runner,
                                            input_files=input_files)

        ############################################
        results_fn = "%s_results.runs_%i.causals_%i.pickle.bzip" % (
            description, num_repeats, num_causal)
        reduced_results_fn = results_fn.replace("runs", "reduced.runs")

        save(results_fn, output_list)

        # rebinds the 'methods' parameter to the method names found in the
        # first result; each name becomes one reduce work item
        methods = output_list[0][0].keys()
        arg_list = [(method, results_fn) for method in methods]

        #reduce_runner = Hadoop(len(methods), mapmemory=90*1024, reducememory=90*1024, mkl_num_threads=1, queue="shared")
        reduce_runner = Local()
        combine_output = distributed_map.d_map(
            semisynth_simulations.combine_results,
            arg_list,
            reduce_runner,
            input_files=[results_fn])

        save(reduced_results_fn, combine_output)
        title = "%i causal, %i repeats" % (num_causal, num_repeats)
        visualize_reduced_results(methods,
                                  combine_output,
                                  title=title,
                                  plot_fn=plot_fn)

        return combine_output
Пример #5
0
    def run_sim_and_compare(self, name, method):
        """Run one simulation with *method* on the mouse data and compare
        the combined output against the stored expected pickle for *name*.

        :param name: basename of the expected-results pickle (without .bzip)
        :param method: a single benchmark method function to run
        """
        logging.info('in test_all')
        # NOTE: removed 'import fastlmm.util.runner as runner' — the name
        # was immediately shadowed by 'runner = Local()' below, so the
        # import was dead code.

        current_folder = os.path.dirname(os.path.realpath(__file__))
        snp_fn = os.path.realpath(current_folder + "/../../data/mouse/alldata")
        out_prefix = current_folder + "/tempdir/mouse_"

        description = "test_run_{0}".format(name)
        runner = Local()

        num_causals = 500
        num_repeats = 1
        num_pcs = 5

        expected_prefix = current_folder + "/expected/"
        methods = [method]
        # seed is fixed so the simulation is reproducible against the
        # stored expected result
        combine_output = run_simulation(snp_fn, out_prefix, methods,
                                        num_causals, num_repeats, num_pcs,
                                        description, runner,
                                        plot_fn="out.png", seed=42)
        from fastlmm.util.pickle_io import load
        filename = "%s%s.bzip" % (expected_prefix, name)
        co = load(filename)
        compare_nested(combine_output, co)
def main():
    """Run the benchmark simulation on the mouse data set with a Local
    runner; all configuration is hard-coded below."""
    logging.basicConfig(level=logging.INFO)

    #snp_fn = "data/toydata.5chrom"
    snp_fn = "data/mouse/alldata"
    out_prefix = "results/mouse_"

    description = "test_run"
    # 'queue' is only consumed by the commented-out Hadoop2 runner below
    queue = "shared"
    #runner = Hadoop2(200, mapmemory=40*1024, reducememory=90*1024, mkl_num_threads=4, queue=queue)
    print "using snps", snp_fn
    #runner = LocalMultiProc(20)
    runner = Local()

    num_causals = 500
    num_repeats = 3
    num_pcs = 5

    # make this a tuple of function and kwargs
    from GWAS_benchmark.methods import execute_lmm, execute_linear_regression, execute_dual_fs, execute_fs
    methods = [execute_fs, execute_linear_regression]

    run_simulation(snp_fn, out_prefix, methods, num_causals, num_repeats,
                   num_pcs, description, runner)
Пример #7
0

if __name__ == '__main__':

    from fastlmm.association.tests.testepistasis import TestEpistasis
    suites = unittest.TestSuite([getTestSuite()])

    if False:  #Standard test run
        r = unittest.TextTestRunner(failfast=False)
        r.run(suites)
    else:  #Cluster test run
        from fastlmm.util.distributabletest import DistributableTest

        # NOTE(review): this HPC runner is constructed and then immediately
        # replaced by Local() below — presumably a debugging toggle; confirm
        # which runner is actually intended
        runner = HPC(
            10,
            'RR1-N13-09-H44',
            r'\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\Redmond',
            remote_python_parent=
            r"\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\REDMOND\carlk\Source\carlk\july_7_14\tests\runs\2014-07-24_15_02_02_554725991686\pythonpath",
            update_remote_python_parent=True,
            priority="AboveNormal",
            mkl_num_threads=1)
        runner = Local()
        #runner = LocalMultiProc(taskcount=20,mkl_num_threads=5)
        #runner = LocalInParts(1,2,mkl_num_threads=1) # For debugging the cluster runs
        #runner = Hadoop(100, mapmemory=8*1024, reducememory=8*1024, mkl_num_threads=1, queue="default")
        distributable_test = DistributableTest(suites, "temp_test")
        print runner.run(distributable_test)

    logging.info("done with testing")
Пример #8
0
    # this import is needed for the runner
    from fastlmm.association.tests.test_single_snp_select import TestSingleSnpSelect
    suites = unittest.TestSuite([getTestSuite()])

    if True: #Standard test run
        r = unittest.TextTestRunner(failfast=False)
        r.run(suites)
    else: #Cluster test run
        # NOTE(review): this branch is unreachable while the condition above
        # is the literal True; it is kept as a manual toggle for cluster runs



        from fastlmm.util.runner import Local, HPC, LocalMultiProc
        logging.basicConfig(level=logging.INFO)

        from fastlmm.util.distributabletest import DistributableTest


        #runner = HPC(10, 'RR1-N13-09-H44',r'\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\Redmond',
        #                remote_python_parent=r"\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\REDMOND\carlk\Source\carlk\july_7_14\tests\runs\2014-07-24_15_02_02_554725991686\pythonpath",
        #                update_remote_python_parent=True,
        #                priority="AboveNormal",mkl_num_threads=1)
        runner = Local()
        #runner = LocalMultiProc(taskcount=20,mkl_num_threads=5)
        #runner = LocalInParts(1,2,mkl_num_threads=1) # For debugging the cluster runs
        #runner = Hadoop(100, mapmemory=8*1024, reducememory=8*1024, mkl_num_threads=1, queue="default")
        distributable_test = DistributableTest(suites,"temp_test")
        print runner.run(distributable_test)


    logging.info("done with testing")
Пример #9
0
    # this import is needed for the runner
    from fastlmm.association.tests.test_single_snp_select import TestSingleSnpSelect
    suites = unittest.TestSuite([getTestSuite()])

    if True: #Standard test run
        r = unittest.TextTestRunner(failfast=False)
        r.run(suites)
    else: #Cluster test run
        # NOTE(review): this branch is unreachable while the condition above
        # is the literal True; it is kept as a manual toggle for cluster runs



        from fastlmm.util.runner import Local, HPC, LocalMultiProc
        logging.basicConfig(level=logging.INFO)

        from fastlmm.util.distributabletest import DistributableTest


        #runner = HPC(10, 'RR1-N13-09-H44',r'\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\Redmond',
        #                remote_python_parent=r"\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\REDMOND\carlk\Source\carlk\july_7_14\tests\runs\2014-07-24_15_02_02_554725991686\pythonpath",
        #                update_remote_python_parent=True,
        #                priority="AboveNormal",mkl_num_threads=1)
        runner = Local()
        #runner = LocalMultiProc(taskcount=20,mkl_num_threads=5)
        #runner = LocalInParts(1,2,mkl_num_threads=1) # For debugging the cluster runs
        #runner = Hadoop(100, mapmemory=8*1024, reducememory=8*1024, mkl_num_threads=1, queue="default")
        distributable_test = DistributableTest(suites,"temp_test")
        print(runner.run(distributable_test))


    logging.info("done with testing")
def mf_to_runner_function(mf):
    """Map the runner mnemonic *mf* to a runner factory.

    The returned factory takes a task count and returns a configured runner
    (LocalInParts, Local, LocalMultiProc, HPC, or Hadoop2).  Construction of
    the runner is deferred until the factory is called.

    :param mf: mnemonic such as "local", "lmp", "nodeP", "coreH", ...
    :returns: a one-argument factory function (taskcount -> runner)
    :raises Exception: if *mf* is not a known mnemonic
    """
    excluded_nodes = [
    ]  #'GCRCM07B20','GCRCM11B05','GCRCM10B06','GCRCM02B07']#'GCRCM02B11','GCRCM03B07'] #'GCRCM22B06','GCRCN0383','GCRCM02B07','GCRCN0179','GCRCM37B13','GCRCN0376','GCRCN0456']#'gcrcn0231']#"MSR-HDP-DN0316","MSR-HDP-DN0321","MSR-HDP-DN0336","MSR-HDP-DN0377","MSR-HDP-DN0378","MSR-HDP-DN0314","MSR-HDP-DN0335","MSRQC073","MSRQC002","MSRQC015"]
    remote_python_parent = r"\\GCR\Scratch\RR1\escience\carlk\data\carlk\pythonpath10262016"
    clean_up = False

    # local/debug runners
    if mf == "debug":
        runner_function = lambda ignore: LocalInParts(
            215,
            215,
            mkl_num_threads=20,
            result_file="result.p",
            run_dir=r"C:\deldir\test\outputx")
    elif mf == "local":
        runner_function = lambda ignore: Local()
    elif mf == "local1":
        runner_function = lambda ignore: Local(1)
    elif mf == "lmp":
        runner_function = lambda ignore: LocalMultiProc(22, 5)
    elif mf == "lmp4":
        runner_function = lambda ignore: LocalMultiProc(4, 5)
    elif mf == "lmpl":
        runner_function = lambda taskcount: LocalMultiProc(
            taskcount, taskcount, just_one_process=True)
    # HPC cluster runners (preemptable / long-run / express / admin queues)
    elif mf == "nodeP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="Preemptable",
            runtime="0:11:0",  # day:hour:min
            #min = 10 #max(1,min(taskcount,110)//20)
            #max = min(taskcount,500),
            clean_up=clean_up,
        )
    elif mf == "nodeP99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="Preemptable,B99",
            runtime="0:11:0",  # day:hour:min
            #min = 10 #max(1,min(taskcount,110)//20)
            #max = min(taskcount,500),
            clean_up=clean_up,
        )
    elif mf == "nodeL99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="LongRunQ",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="LongRunQ,B99",
            runtime="11:0:0",  # day:hour:min
            #min = 10 #max(1,min(taskcount,110)//20)
            #max = min(taskcount,500),
            clean_up=clean_up,
        )
    elif mf == "socketP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='socket',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=10,
            nodegroups="Preemptable",
            runtime="0:11:0",  # day:hour:min
            #min = max(1,min(taskcount,110)//20),
            clean_up=clean_up,
        )
    elif mf == "coreP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=1,
            runtime="0:11:0",  # day:hour:min
            nodegroups="Preemptable",
            #min = min(taskcount,1100)
            min=1,
            max=200 * 20,
            clean_up=clean_up,
        )
    elif mf == "coreP99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=1,
            runtime="0:11:0",  # day:hour:min
            nodegroups="Preemptable,B99",
            #min = min(taskcount,1100)
            min=1,
            max=200 * 20,
            clean_up=clean_up,
        )
    elif mf == "coreAz":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000),
            'GCR',
            r"\\GCR\Scratch\AZ-USCentral\escience",
            remote_python_parent=
            r"\\GCR\Scratch\AZ-USCentral\escience\carlk\data\carlk\pythonpath",
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="Azure IaaS USCentral",
            mkl_num_threads=1,
            runtime="0:8:0",  # day:hour:min,
            clean_up=clean_up,
        )
    elif mf == "nodeE":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 10100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            #node_local = False,
            #mkl_num_threads=20,
            runtime="0:4:0",  # day:hour:min
            #min = min(taskcount,100),
            clean_up=clean_up,
        )
    elif mf == "50tasks":
        runner_function = lambda taskcount: HPC(
            50,
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            #mkl_num_threads=20,
            runtime="0:4:0",  # day:hour:min
            #min = min(taskcount,100),
            clean_up=clean_up,
        )
    elif mf == "coreE":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 10100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            mkl_num_threads=1,
            runtime="0:4:0",  # day:hour:min
            #min = min(taskcount,100),
            clean_up=clean_up,
        )
    elif mf == "nodeA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    elif mf == "socketA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='socket',  #core, socket, node
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    elif mf == "coreA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    # Hadoop runners
    elif mf == "nodeH":
        runner_function = lambda taskcount: Hadoop2(min(taskcount, 100000),
                                                    mapmemory=58 * 1024,
                                                    reducememory=8 * 1024,
                                                    min_alloc=2048,
                                                    xmx=3072,
                                                    mkl_num_threads=14,
                                                    queue="shared",
                                                    skipdatacheck=True,
                                                    skipsourcecheck=True)
    elif mf == "coreH":
        runner_function = lambda taskcount: Hadoop2(min(taskcount, 100000),
                                                    mapmemory=8 * 1024,
                                                    reducememory=8 * 1024,
                                                    min_alloc=2048,
                                                    xmx=3072,
                                                    mkl_num_threads=1,
                                                    queue="shared",
                                                    skipdatacheck=True,
                                                    skipsourcecheck=True)
    else:
        raise Exception("don't find mf=" + mf)
    return runner_function
def simulate_ascertained(methods,
                         prevalence,
                         iid_count,
                         num_causal,
                         num_repeats,
                         description,
                         snp_args,
                         phenotype_args,
                         runner=None,
                         seed=None,
                         plot_fn=None):
    """
    run a synthetic simulation using ascertained data
    
    :param methods: A list of functions implementing methods to be compared.
    :type methods: list<function>
    
    :param prevalence: Prior probability of a case, e.g. .1
    :type prevalence: a float between 0.0 and 1.0 (exclusive)
       
    :param iid_count: The number of individuals to generate.
    :type iid_count: int
     
    :param num_causal: The number causal SNPs in the simulation.
    :type num_causal: int

    :param num_repeats: The number of repeats in the simulation.
    :type num_repeats: int

    :param description: Short description string of experiment (for output)
    :type description: str

    :param snp_args: arguments for an internal call to :func:`GWAS_benchmark.snp_gen`. Do not include
    'iid_count' or 'seed'
    :type snp_args: dictionary

    :param phenotype_args: arguments for an internal call to :func:`.generate_phenotype`. Do not include
    'snp_count' or 'seed'
    :type phenotype_args: dictionary

    :param runner: a Runner object (e.g. Local, Hadoop, HPC); defaults to a
    fresh Local() instance when omitted
    :type runner: Runner

    :param seed: a random seed to control random number generation
    :type seed: int

    :param plot_fn: filename under which to save the output figure
    :type plot_fn: str

    """
    # The previous default 'runner=Local()' was evaluated once at import
    # time, so every caller shared a single instance (and the module could
    # not even be imported without constructing a runner).  Build it per
    # call instead.
    if runner is None:
        runner = Local()

    # one argument tuple per repeat; every worker gets the same configuration
    # plus its own sim_id
    input_args = [(methods, num_causal, prevalence, iid_count, snp_args,
                   phenotype_args, seed, sim_id)
                  for sim_id in range(num_repeats)]
    output_list = distributed_map.d_map(
        semisynth_simulations.compute_core_ascertained, input_args, runner)

    ############################################
    results_fn = "%s_ascertained_results.runs_%i.causals_%i.pickle.bzip" % (
        description, num_repeats, num_causal)
    reduced_results_fn = results_fn.replace("runs", "reduced.runs")

    save(results_fn, output_list)

    # rebinds 'methods' to the per-method keys of the first result; each
    # key becomes one reduce work item
    methods = output_list[0][0].keys()
    arg_list = [(method, results_fn) for method in methods]

    # the reduce step always runs locally
    combine_output = distributed_map.d_map(
        semisynth_simulations.combine_results,
        arg_list,
        Local(),
        input_files=[results_fn])

    save(reduced_results_fn, combine_output)
    title = "%i causal, %i repeats" % (num_causal, num_repeats)
    visualize_reduced_results(methods,
                              combine_output,
                              title=title,
                              plot_fn=plot_fn)

    return combine_output