def test_one(self):
    assert "A" == Local().run(A(1, 0))
    assert "A" == Local().run(A(10, 0))
    assert "A" == Local().run(A(3, 10))
    assert "A" == LocalFromRanges([2, 4]).run(A(10, 0))
    assert "A" == LocalFromRanges([2, 9, 10]).run(A(3, 0))
    assert "A" == LocalFromRanges([2, 9, 10]).run(A(3, 10))

    np.random.seed(0)
    for i in xrange(1000):
        end = int(np.random.uniform(low=1, high=15))
        a_work_count = int(np.random.uniform(low=1, high=15))
        b_work_count = int(np.random.uniform(low=0, high=3))
        extra_steps = int(np.random.uniform(low=0, high=end - 1))
        if end > 1:
            steps = sorted(
                np.random.random_integers(low=1, high=end - 1, size=extra_steps))
        else:
            steps = []
        steps.append(end)
        if i % 100 == 0:
            logging.info("random test case # {0}".format(i))
        assert "A" == LocalFromRanges(steps).run(A(a_work_count, b_work_count))
def execute_fs(test_snps, pheno, G0, covar):
    """
    run feature selection
    """
    result = {}
    fs_result = {}

    # fs unconditioned
    ########################
    tmp_uuid = str(uuid.uuid4())[0:13]
    out_fn = "tmp_pheno_%s.txt" % (tmp_uuid)
    out_data = pd.DataFrame({
        "id1": G0.iid[:, 0],
        "id2": G0.iid[:, 1],
        "y": pheno["vals"]
    })
    out_data.to_csv(out_fn, sep=" ", header=False, index=False)

    # write out covariates
    items = [
        ('id1', G0.iid[:, 0]),
        ('id2', G0.iid[:, 1]),
    ]
    items += [("pc_%i" % i, covar["vals"][:, i])
              for i in xrange(covar["vals"].shape[1])]
    cov_df = pd.DataFrame.from_items(items)
    cov_fn = "tmp_cov_%s.txt" % (tmp_uuid)
    cov_df.to_csv(cov_fn, sep=" ", header=False, index=False)

    # TODO: fix include_all!!
    fsd = create_feature_selection_distributable(G0,
                                                 out_fn,
                                                 None,
                                                 0,
                                                 "fs_out",
                                                 include_all=False,
                                                 cov_fn=cov_fn)
    fs_result["result_uncond_all"] = Local().run(fsd)
    best_k, best_delta, best_obj, best_snps = fs_result["result_uncond_all"]
    fs_idx = argintersect_left(G0.sid, best_snps)
    G_fs = G0[:, fs_idx]

    result["fs_all"] = single_snp(test_snps, pheno,
                                  G0=G_fs).sort(["Chr", "ChrPos"])["PValue"].as_matrix()
    result["fs_all_cov"] = single_snp(test_snps, pheno, G0=G_fs,
                                      covar=covar).sort(["Chr", "ChrPos"])["PValue"].as_matrix()

    return result, fs_result
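# A minimal usage sketch, not part of the original source. The Bed file
# prefixes below are placeholders, and the dict layout for pheno/covar
# ({"iid": ..., "vals": ...}) is an assumption inferred from how execute_fs
# indexes them above.
from pysnptools.snpreader import Bed
import numpy as np

G0 = Bed("data/all_snps").read()      # assumed PLINK bed prefix
test_snps = Bed("data/test_snps")     # assumed PLINK bed prefix
pheno = {"iid": G0.iid, "vals": np.random.randn(G0.iid_count)}     # toy phenotype
covar = {"iid": G0.iid, "vals": np.random.randn(G0.iid_count, 2)}  # two toy covariates
result, fs_result = execute_fs(test_snps, pheno, G0, covar)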
def test_local_single(self):
    """
    run d_map on the test arguments with a Local runner
    """
    # run locally on 4 cores
    runner = Local(4)

    result = d_map(dummy, self.args, runner, input_files=[self.fn])
    expect = [
        '', 'A', 'AA', 'AAA', 'AAAB', 'AAABB', 'AAABBB', 'AAABBBC',
        'AAABBBCC', 'AAABBBCCC'
    ]
    assert expect == result
def run(self, methods, num_causal, num_repeats, num_pcs, description, runner,
        seed=None, plot_fn=None):

    self.precompute_pca()

    input_files = [self.snp_fn + ext
                   for ext in [".bed", ".fam", ".bim"]] + [self.eigen_fn]
    input_args = [(methods, self.snp_fn, self.eigen_fn, num_causal, num_pcs,
                   seed, sim_id) for sim_id in range(num_repeats)]
    output_list = distributed_map.d_map(semisynth_simulations.compute_core,
                                        input_args,
                                        runner,
                                        input_files=input_files)

    ############################################
    results_fn = "%s_results.runs_%i.causals_%i.pickle.bzip" % (
        description, num_repeats, num_causal)
    reduced_results_fn = results_fn.replace("runs", "reduced.runs")

    save(results_fn, output_list)

    methods = output_list[0][0].keys()
    arg_list = [(method, results_fn) for method in methods]

    #reduce_runner = Hadoop(len(methods), mapmemory=90*1024, reducememory=90*1024, mkl_num_threads=1, queue="shared")
    reduce_runner = Local()
    combine_output = distributed_map.d_map(
        semisynth_simulations.combine_results,
        arg_list,
        reduce_runner,
        input_files=[results_fn])

    save(reduced_results_fn, combine_output)

    title = "%i causal, %i repeats" % (num_causal, num_repeats)
    visualize_reduced_results(methods, combine_output, title=title, plot_fn=plot_fn)

    return combine_output
def run_sim_and_compare(self, name, method):
    logging.info('in test_all')

    currentFolder = os.path.dirname(os.path.realpath(__file__))
    snp_fn = os.path.realpath(currentFolder + "/../../data/mouse/alldata")
    out_prefix = currentFolder + "/tempdir/mouse_"
    description = "test_run_{0}".format(name)

    runner = Local()

    num_causals = 500
    num_repeats = 1
    num_pcs = 5

    expected_prefix = currentFolder + "/expected/"

    methods = [method]
    combine_output = run_simulation(snp_fn,
                                    out_prefix,
                                    methods,
                                    num_causals,
                                    num_repeats,
                                    num_pcs,
                                    description,
                                    runner,
                                    plot_fn="out.png",
                                    seed=42)

    from fastlmm.util.pickle_io import load
    filename = "%s%s.bzip" % (expected_prefix, name)
    co = load(filename)
    compare_nested(combine_output, co)
def main():
    logging.basicConfig(level=logging.INFO)

    #snp_fn = "data/toydata.5chrom"
    snp_fn = "data/mouse/alldata"
    out_prefix = "results/mouse_"

    description = "test_run"
    queue = "shared"

    #runner = Hadoop2(200, mapmemory=40*1024, reducememory=90*1024, mkl_num_threads=4, queue=queue)
    print "using snps", snp_fn

    #runner = LocalMultiProc(20)
    runner = Local()

    num_causals = 500
    num_repeats = 3
    num_pcs = 5

    # make this a tuple of function and kwargs
    from GWAS_benchmark.methods import execute_lmm, execute_linear_regression, execute_dual_fs, execute_fs
    methods = [execute_fs, execute_linear_regression]

    run_simulation(snp_fn, out_prefix, methods, num_causals, num_repeats,
                   num_pcs, description, runner)
if __name__ == '__main__':

    from fastlmm.association.tests.testepistasis import TestEpistasis
    suites = unittest.TestSuite([getTestSuite()])

    if False:  #Standard test run
        r = unittest.TextTestRunner(failfast=False)
        r.run(suites)
    else:  #Cluster test run
        from fastlmm.util.distributabletest import DistributableTest

        runner = HPC(10, 'RR1-N13-09-H44',
                     r'\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\Redmond',
                     remote_python_parent=r"\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\REDMOND\carlk\Source\carlk\july_7_14\tests\runs\2014-07-24_15_02_02_554725991686\pythonpath",
                     update_remote_python_parent=True,
                     priority="AboveNormal",
                     mkl_num_threads=1)
        runner = Local()
        #runner = LocalMultiProc(taskcount=20,mkl_num_threads=5)
        #runner = LocalInParts(1,2,mkl_num_threads=1) # For debugging the cluster runs
        #runner = Hadoop(100, mapmemory=8*1024, reducememory=8*1024, mkl_num_threads=1, queue="default")

        distributable_test = DistributableTest(suites, "temp_test")
        print runner.run(distributable_test)

    logging.info("done with testing")
# this import is needed for the runner
from fastlmm.association.tests.test_single_snp_select import TestSingleSnpSelect

suites = unittest.TestSuite([getTestSuite()])

if True:  #Standard test run
    r = unittest.TextTestRunner(failfast=False)
    r.run(suites)
else:  #Cluster test run
    from fastlmm.util.runner import Local, HPC, LocalMultiProc
    logging.basicConfig(level=logging.INFO)

    from fastlmm.util.distributabletest import DistributableTest

    #runner = HPC(10, 'RR1-N13-09-H44',r'\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\Redmond',
    #             remote_python_parent=r"\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\REDMOND\carlk\Source\carlk\july_7_14\tests\runs\2014-07-24_15_02_02_554725991686\pythonpath",
    #             update_remote_python_parent=True,
    #             priority="AboveNormal",mkl_num_threads=1)
    runner = Local()
    #runner = LocalMultiProc(taskcount=20,mkl_num_threads=5)
    #runner = LocalInParts(1,2,mkl_num_threads=1) # For debugging the cluster runs
    #runner = Hadoop(100, mapmemory=8*1024, reducememory=8*1024, mkl_num_threads=1, queue="default")

    distributable_test = DistributableTest(suites, "temp_test")
    print runner.run(distributable_test)

logging.info("done with testing")
# this import is needed for the runner
from fastlmm.association.tests.test_single_snp_select import TestSingleSnpSelect

suites = unittest.TestSuite([getTestSuite()])

if True:  #Standard test run
    r = unittest.TextTestRunner(failfast=False)
    r.run(suites)
else:  #Cluster test run
    from fastlmm.util.runner import Local, HPC, LocalMultiProc
    logging.basicConfig(level=logging.INFO)

    from fastlmm.util.distributabletest import DistributableTest

    #runner = HPC(10, 'RR1-N13-09-H44',r'\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\Redmond',
    #             remote_python_parent=r"\\msr-arrays\Scratch\msr-pool\Scratch_Storage4\REDMOND\carlk\Source\carlk\july_7_14\tests\runs\2014-07-24_15_02_02_554725991686\pythonpath",
    #             update_remote_python_parent=True,
    #             priority="AboveNormal",mkl_num_threads=1)
    runner = Local()
    #runner = LocalMultiProc(taskcount=20,mkl_num_threads=5)
    #runner = LocalInParts(1,2,mkl_num_threads=1) # For debugging the cluster runs
    #runner = Hadoop(100, mapmemory=8*1024, reducememory=8*1024, mkl_num_threads=1, queue="default")

    distributable_test = DistributableTest(suites, "temp_test")
    print(runner.run(distributable_test))

logging.info("done with testing")
def mf_to_runner_function(mf):
    excluded_nodes = [
    ]  #'GCRCM07B20','GCRCM11B05','GCRCM10B06','GCRCM02B07']#'GCRCM02B11','GCRCM03B07'] #'GCRCM22B06','GCRCN0383','GCRCM02B07','GCRCN0179','GCRCM37B13','GCRCN0376','GCRCN0456']#'gcrcn0231']#"MSR-HDP-DN0316","MSR-HDP-DN0321","MSR-HDP-DN0336","MSR-HDP-DN0377","MSR-HDP-DN0378","MSR-HDP-DN0314","MSR-HDP-DN0335","MSRQC073","MSRQC002","MSRQC015"]
    remote_python_parent = r"\\GCR\Scratch\RR1\escience\carlk\data\carlk\pythonpath10262016"
    clean_up = False

    if mf == "debug":
        runner_function = lambda ignore: LocalInParts(
            215,
            215,
            mkl_num_threads=20,
            result_file="result.p",
            run_dir=r"C:\deldir\test\outputx")
    elif mf == "local":
        runner_function = lambda ignore: Local()
    elif mf == "local1":
        runner_function = lambda ignore: Local(1)
    elif mf == "lmp":
        runner_function = lambda ignore: LocalMultiProc(22, 5)
    elif mf == "lmp4":
        runner_function = lambda ignore: LocalMultiProc(4, 5)
    elif mf == "lmpl":
        runner_function = lambda taskcount: LocalMultiProc(
            taskcount, taskcount, just_one_process=True)
    elif mf == "nodeP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="Preemptable",
            runtime="0:11:0",  # day:hour:min
            #min = 10 #max(1,min(taskcount,110)//20)
            #max = min(taskcount,500),
            clean_up=clean_up,
        )
    elif mf == "nodeP99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="Preemptable,B99",
            runtime="0:11:0",  # day:hour:min
            #min = 10 #max(1,min(taskcount,110)//20)
            #max = min(taskcount,500),
            clean_up=clean_up,
        )
    elif mf == "nodeL99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="LongRunQ",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            #mkl_num_threads=20,
            nodegroups="LongRunQ,B99",
            runtime="11:0:0",  # day:hour:min
            #min = 10 #max(1,min(taskcount,110)//20)
            #max = min(taskcount,500),
            clean_up=clean_up,
        )
    elif mf == "socketP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='socket',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=10,
            nodegroups="Preemptable",
            runtime="0:11:0",  # day:hour:min
            #min = max(1,min(taskcount,110)//20),
            clean_up=clean_up,
        )
    elif mf == "coreP":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=1,
            runtime="0:11:0",  # day:hour:min
            nodegroups="Preemptable",
            #min = min(taskcount,1100)
            min=1,
            max=200 * 20,
            clean_up=clean_up,
        )
    elif mf == "coreP99":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="Preemptable",
            priority="Lowest",
            excluded_nodes=excluded_nodes,
            mkl_num_threads=1,
            runtime="0:11:0",  # day:hour:min
            nodegroups="Preemptable,B99",
            #min = min(taskcount,1100)
            min=1,
            max=200 * 20,
            clean_up=clean_up,
        )
    elif mf == "coreAz":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 1000),
            'GCR',
            r"\\GCR\Scratch\AZ-USCentral\escience",
            remote_python_parent=r"\\GCR\Scratch\AZ-USCentral\escience\carlk\data\carlk\pythonpath",
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="Azure IaaS USCentral",
            mkl_num_threads=1,
            runtime="0:8:0",  # day:hour:min,
            clean_up=clean_up,
        )
    elif mf == "nodeE":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 10100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            #node_local = False,
            #mkl_num_threads=20,
            runtime="0:4:0",  # day:hour:min
            #min = min(taskcount,100),
            clean_up=clean_up,
        )
    elif mf == "50tasks":
        runner_function = lambda taskcount: HPC(
            50,
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            #mkl_num_threads=20,
            runtime="0:4:0",  # day:hour:min
            #min = min(taskcount,100),
            clean_up=clean_up,
        )
    elif mf == "coreE":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 10100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="ExpressQ",
            priority="Normal",
            mkl_num_threads=1,
            runtime="0:4:0",  # day:hour:min
            #min = min(taskcount,100),
            clean_up=clean_up,
        )
    elif mf == "nodeA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='node',  #core, socket, node
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    elif mf == "socketA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='socket',  #core, socket, node
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    elif mf == "coreA":
        runner_function = lambda taskcount: HPC(
            min(taskcount, 30100),
            'GCR',
            r"\\GCR\Scratch\RR1\escience",
            remote_python_parent=remote_python_parent,
            unit='core',  #core, socket, node
            update_remote_python_parent=True,
            template="Admin Template",
            clean_up=clean_up,
        )
    elif mf == "nodeH":
        runner_function = lambda taskcount: Hadoop2(min(taskcount, 100000),
                                                    mapmemory=58 * 1024,
                                                    reducememory=8 * 1024,
                                                    min_alloc=2048,
                                                    xmx=3072,
                                                    mkl_num_threads=14,
                                                    queue="shared",
                                                    skipdatacheck=True,
                                                    skipsourcecheck=True)
    elif mf == "coreH":
        runner_function = lambda taskcount: Hadoop2(min(taskcount, 100000),
                                                    mapmemory=8 * 1024,
                                                    reducememory=8 * 1024,
                                                    min_alloc=2048,
                                                    xmx=3072,
                                                    mkl_num_threads=1,
                                                    queue="shared",
                                                    skipdatacheck=True,
                                                    skipsourcecheck=True)
    else:
        raise Exception("don't find mf=" + mf)
    return runner_function
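# A short usage sketch, not part of the original source: pick a runner factory
# by keyword and call it with a task count. The "local" keyword and the task
# count of 22 are arbitrary examples; 'some_distributable' is hypothetical.
runner_function = mf_to_runner_function("local")
runner = runner_function(22)  # the Local runner ignores the task count
#result = runner.run(some_distributable)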
def simulate_ascertained(methods,
                         prevalence,
                         iid_count,
                         num_causal,
                         num_repeats,
                         description,
                         snp_args,
                         phenotype_args,
                         runner=Local(),
                         seed=None,
                         plot_fn=None):
    """
    run a synthetic simulation using ascertained data

    :param methods: A list of functions implementing methods to be compared.
    :type methods: list<function>

    :param prevalence: Prior probability of a case, e.g. 0.1
    :type prevalence: a float between 0.0 and 1.0 (exclusive)

    :param iid_count: The number of individuals to generate.
    :type iid_count: int

    :param num_causal: The number of causal SNPs in the simulation.
    :type num_causal: int

    :param num_repeats: The number of repeats in the simulation.
    :type num_repeats: int

    :param description: Short description string of the experiment (for output).
    :type description: str

    :param snp_args: arguments for an internal call to :func:`GWAS_benchmark.snp_gen`. Do not include 'iid_count' or 'seed'.
    :type snp_args: dictionary

    :param phenotype_args: arguments for an internal call to :func:`.generate_phenotype`. Do not include 'snp_count' or 'seed'.
    :type phenotype_args: dictionary

    :param runner: a Runner object (e.g. Local, Hadoop, HPC)
    :type runner: Runner

    :param seed: a random seed to control random number generation
    :type seed: int

    :param plot_fn: filename under which to save the output figure
    :type plot_fn: str
    """
    input_args = [(methods, num_causal, prevalence, iid_count, snp_args,
                   phenotype_args, seed, sim_id) for sim_id in range(num_repeats)]
    output_list = distributed_map.d_map(
        semisynth_simulations.compute_core_ascertained, input_args, runner)

    ############################################
    results_fn = "%s_ascertained_results.runs_%i.causals_%i.pickle.bzip" % (
        description, num_repeats, num_causal)
    reduced_results_fn = results_fn.replace("runs", "reduced.runs")

    save(results_fn, output_list)

    methods = output_list[0][0].keys()
    arg_list = [(method, results_fn) for method in methods]

    combine_output = distributed_map.d_map(
        semisynth_simulations.combine_results,
        arg_list,
        Local(),
        input_files=[results_fn])

    save(reduced_results_fn, combine_output)

    title = "%i causal, %i repeats" % (num_causal, num_repeats)
    visualize_reduced_results(methods, combine_output, title=title, plot_fn=plot_fn)

    return combine_output
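# A hedged usage sketch, not part of the original source. The keys inside
# snp_args and phenotype_args are illustrative placeholders, not the documented
# keyword arguments of snp_gen/generate_phenotype; check those functions for
# the real parameter names.
from GWAS_benchmark.methods import execute_linear_regression

snp_args = {"fst": 0.1, "sid_count": 5000}               # placeholder arguments
phenotype_args = {"genetic_var": 0.5, "noise_var": 0.5}  # placeholder arguments
combine_output = simulate_ascertained(
    methods=[execute_linear_regression],
    prevalence=0.1,
    iid_count=2000,
    num_causal=10,
    num_repeats=3,
    description="ascertained_demo",
    snp_args=snp_args,
    phenotype_args=phenotype_args,
    runner=Local(),
    seed=42,
    plot_fn="ascertained_demo.png")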