def test_examples_local_engine(self):
    list_all_examples = get_wf_example_classes()
    for example in list_all_examples:
        # Build three identical workflows: one run in-process as the
        # reference, one run through LocalEngine, one through
        # SomaWorkflowEngine.
        wf = example().get_workflow()
        local_engine_wf = example().get_workflow()
        sfw_engine_wf = example().get_workflow()
        wf.run(X=self.X, y=self.y)
        local_engine = LocalEngine(tree_root=local_engine_wf,
                                   num_processes=self.n_cores)
        local_engine_wf = local_engine.run(X=self.X, y=self.y)
        sfw_engine = SomaWorkflowEngine(tree_root=sfw_engine_wf,
                                        num_processes=self.n_cores,
                                        # resource_id="ed203246@gabriel",
                                        # login="******",
                                        remove_finished_wf=False,
                                        remove_local_tree=False)
        sfw_engine_wf = sfw_engine.run(X=self.X, y=self.y)
        # Both engines must reproduce the in-process results, node by
        # node and after reduction.
        self.assertTrue(compare_two_node(wf, local_engine_wf))
        self.assertTrue(compare_two_node(wf, sfw_engine_wf))
        self.assertTrue(comp_2wf_reduce_res(wf, local_engine_wf))
        self.assertTrue(comp_2wf_reduce_res(wf, sfw_engine_wf))
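# The test above checks that EPAC's execution engines reproduce an
# in-process run exactly. As a minimal standalone sketch (assuming only
# names already used in this file: LocalEngine, CV, Methods, sklearn's
# SVC and datasets), the same engine-vs-in-process round trip looks like
# this; the dataset sizes and num_processes=2 are illustrative choices.
def _example_local_engine_roundtrip():
    from sklearn import datasets
    from sklearn.svm import SVC
    from epac import CV, Methods, LocalEngine

    X, y = datasets.make_classification(n_samples=20, n_features=10,
                                        n_informative=2)
    # In-process reference run
    wf = CV(Methods(SVC(kernel="linear"), SVC(kernel="rbf")), n_folds=3)
    wf.run(X=X, y=y)
    # Same workflow executed through LocalEngine with two processes
    engine_wf = CV(Methods(SVC(kernel="linear"), SVC(kernel="rbf")),
                   n_folds=3)
    engine = LocalEngine(tree_root=engine_wf, num_processes=2)
    engine_wf = engine.run(X=X, y=y)
    # The reduced results of both runs should agree
    return wf.reduce(), engine_wf.reduce()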
def test_prev_state_methods(self):
    ## 1) Build dataset
    ## ================================================
    X, y = datasets.make_classification(n_samples=5, n_features=20,
                                        n_informative=2)
    Xy = {"X": X, "y": y}
    ## 2) Run the same classifiers with and without warm starting;
    ##    both must yield identical trees and reduced results
    methods = Methods(*[TOY_CLF(v_lambda=v_lambda)
                        for v_lambda in [2, 1]])
    methods.run(**Xy)
    ps_methods = WarmStartMethods(*[TOY_CLF(v_lambda=v_lambda)
                                    for v_lambda in [2, 1]])
    ps_methods.run(**Xy)
    self.assertTrue(compare_two_node(methods, ps_methods))
    self.assertTrue(comp_2wf_reduce_res(methods, ps_methods))
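# TOY_CLF is imported from the test helpers and not defined in this
# file. For readers, here is a rough sketch of the kind of estimator the
# warm-start test assumes: an sklearn-style classifier whose single
# hyper-parameter is "v_lambda". The body (a lambda-shrunk
# nearest-centroid rule) is purely illustrative, not the real helper,
# and makes no claim about what EPAC's node wrapper requires.
class ToyClfSketch:
    def __init__(self, v_lambda=1.0):
        self.v_lambda = v_lambda

    def fit(self, X, y):
        import numpy as np
        # One centroid per class, shrunk by v_lambda (illustrative)
        self.centroids_ = {c: X[y == c].mean(axis=0) / self.v_lambda
                           for c in np.unique(y)}
        return self

    def predict(self, X):
        import numpy as np
        labels = list(self.centroids_)
        dists = np.stack([np.linalg.norm(X - self.centroids_[c], axis=1)
                          for c in labels], axis=1)
        return np.asarray(labels)[dists.argmin(axis=1)]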
def test_cv_best_search_refit_parallel(self):
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    n_samples = 500
    n_features = 10000
    n_cores = 2
    X, y = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features,
                                        n_informative=5)
    # EPAC workflow run through SomaWorkflowEngine for parallel computing
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               Methods(*[SVC(kernel="linear", C=C)
                                         for C in C_values]))
                          for k in k_values])
    pipeline = CVBestSearchRefitParallel(pipelines,
                                         n_folds=n_folds_nested)
    wf = CV(pipeline, n_folds=n_folds)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=n_cores,
                                    remove_finished_wf=False,
                                    remove_local_tree=False)
    sfw_engine_wf = sfw_engine.run(X=X, y=y)
    # Identical EPAC workflow run in-process, as the reference
    pipelines2 = Methods(*[Pipe(SelectKBest(k=k),
                                Methods(*[SVC(kernel="linear", C=C)
                                          for C in C_values]))
                           for k in k_values])
    pipeline2 = CVBestSearchRefitParallel(pipelines2,
                                          n_folds=n_folds_nested)
    wf2 = CV(pipeline2, n_folds=n_folds)
    wf2.run(X=X, y=y)
    self.assertTrue(compare_two_node(sfw_engine_wf, wf2))
    self.assertTrue(comp_2wf_reduce_res(sfw_engine_wf, wf2))
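# The test above is heavy (10,000 features, a SOMA-workflow engine run).
# A scaled-down, purely in-process version of the same nested model
# selection, runnable on a laptop; the sizes below are illustrative, and
# the epac imports are assumed to match those used elsewhere in this
# test suite.
def _example_nested_cv_small():
    from sklearn import datasets
    from sklearn.svm import SVC
    from sklearn.feature_selection import SelectKBest
    from epac import CV, Methods, Pipe, CVBestSearchRefitParallel

    X, y = datasets.make_classification(n_samples=50, n_features=20,
                                        n_informative=5)
    # Outer CV around an inner grid search (SelectKBest k x SVC C)
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               Methods(*[SVC(kernel="linear", C=C)
                                         for C in [1, 2]]))
                          for k in [1, 2]])
    wf = CV(CVBestSearchRefitParallel(pipelines, n_folds=3), n_folds=2)
    wf.run(X=X, y=y)
    return wf.reduce()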
def test_memmapping(self):
    ## 1) Building dataset
    ## ============================================================
    if self.memmap:
        # If n_proc is 1, always generate the matrix;
        # otherwise, load it if it exists, or create it if it doesn't
        writing_mode = (self.n_proc == 1)
        X = create_mmat(self.n_samples, self.n_features,
                        dir=self.directory,
                        writing_mode=writing_mode)
        y = create_array(self.n_samples, [0, 1],
                         dir=self.directory,
                         writing_mode=writing_mode)
        Xy = dict(X=X, y=y)
    else:
        X, y = datasets.make_classification(n_samples=self.n_samples,
                                            n_features=self.n_features,
                                            n_informative=2,
                                            random_state=1)
        Xy = dict(X=X, y=y)
    ## 2) Building workflow
    ## =======================================================
    from sklearn.svm import SVC
    from epac import CV, Methods
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    cv_svm = None
    if self.is_swf:
        # Running on the cluster
        from epac import SomaWorkflowEngine
        mmap_mode = None
        if self.memmap:
            mmap_mode = "r+"
        swf_engine = SomaWorkflowEngine(cv_svm_local,
                                        num_processes=self.n_proc,
                                        resource_id="jl237561@gabriel",
                                        login="******",
                                        # remove_finished_wf=False,
                                        # remove_local_tree=False,
                                        mmap_mode=mmap_mode,
                                        queue="Global_long")
        cv_svm = swf_engine.run(**Xy)
        # Printing information about the jobs
        time.sleep(2)
        print()
        sum_memory = 0
        max_time_cost = 0
        for job_info in swf_engine.engine_info:
            print("mem_cost = {0}, vmem_cost = {1}, "
                  "time_cost = {2}".format(job_info.mem_cost,
                                           job_info.vmem_cost,
                                           job_info.time_cost))
            sum_memory += job_info.mem_cost
            if max_time_cost < job_info.time_cost:
                max_time_cost = job_info.time_cost
        print("sum_memory =", sum_memory)
        print("max_time_cost =", max_time_cost)
    else:
        # Running on the local machine
        from epac import LocalEngine
        local_engine = LocalEngine(cv_svm_local,
                                   num_processes=self.n_proc)
        cv_svm = local_engine.run(**Xy)
    cv_svm_reduce = cv_svm.reduce()
    print("\n -> Reducing results")
    print(cv_svm_reduce)
    ## 3) Creating the directory to save results, if it doesn't exist
    ## ===================================================
    dirname = 'tmp_save_tree/'
    if self.directory is None:
        directory = '/tmp'
    else:
        directory = self.directory
    if not os.path.isdir(directory):
        os.mkdir(directory)
    dirpath = os.path.join(directory, dirname)
    if not os.path.isdir(dirpath):
        os.mkdir(dirpath)
    if self.n_proc == 1:
        ## 4.1) Saving results on the disk for one process
        ## ===================================================
        store = StoreFs(dirpath=dirpath, clear=True)
        cv_svm.save_tree(store=store)
        # Pickle needs a binary-mode file ('wb', not 'w+')
        with open(os.path.join(directory, "tmp_save_results"),
                  'wb') as filename:
            print(filename.name)
            pickle.dump(cv_svm_reduce, filename)
    else:
        ## 4.2) Loading the results for one process
        ## ===================================================
        try:
            store = StoreFs(dirpath=dirpath, clear=False)
            cv_svm_one_proc = store.load()
            with open(os.path.join(directory, "tmp_save_results"),
                      'rb') as filename:
                cv_svm_reduce_one_proc = pickle.load(filename)
            ## 5.2) Comparing results to the results for one process
            ## ===================================================
            print("\nComparing %i proc with one proc" % self.n_proc)
            self.assertTrue(compare_two_node(cv_svm, cv_svm_one_proc))
            self.assertTrue(isequal(cv_svm_reduce,
                                    cv_svm_reduce_one_proc))
        except KeyError:
            print("Warning: ")
            print("No previous tree detected, no possible "
                  "comparison of results")
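# create_mmat and create_array come from the test utilities and are not
# defined here. A rough sketch of what create_mmat is assumed to do,
# built on numpy.memmap: write the matrix to disk when writing_mode is
# True (or the file is missing), then hand back a shared, memory-mapped
# view. The file name, dtype, and random fill are illustrative guesses,
# not the real helper's contract; create_array would do the analogous
# thing for the label vector.
def create_mmat_sketch(n_samples, n_features, dir="/tmp",
                       writing_mode=True):
    import os
    import numpy as np
    path = os.path.join(dir, "x_mmap_%ix%i.npy" % (n_samples, n_features))
    if writing_mode or not os.path.exists(path):
        # Create (or overwrite) the on-disk matrix
        mmat = np.memmap(path, dtype="float64", mode="w+",
                         shape=(n_samples, n_features))
        mmat[:] = np.random.random((n_samples, n_features))
        mmat.flush()
    # Reopen read/write so every worker shares the same on-disk buffer,
    # which is what mmap_mode="r+" in the engine relies on
    return np.memmap(path, dtype="float64", mode="r+",
                     shape=(n_samples, n_features))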