def test_constructor_avoid_collision_level2(self):
    # Test that level 2 collisions are avoided
    pm = Methods(*[Pipe(SelectKBest(k=2), SVC(kernel="linear", C=C))
                   for C in [1, 10]])
    leaves_key = [l.get_key() for l in pm.walk_leaves()]
    self.assertTrue(len(leaves_key) == len(set(leaves_key)),
                    u'Collision could not be avoided')
def test_twomethods(self):
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                         n_informative=2)
    # = With EPAC
    wf = Methods(LDA(), SVC(kernel="linear"))
    r_epac = wf.run(X=X, y=y)
    # = With SKLEARN
    lda = LDA()
    svm = SVC(kernel="linear")
    lda.fit(X, y)
    svm.fit(X, y)
    r_sklearn = [lda.predict(X), svm.predict(X)]
    # Comparison
    for i_cls in range(2):
        comp = np.all(np.asarray(r_epac[i_cls][key_y_pred]) ==
                      np.asarray(r_sklearn[i_cls]))
        self.assertTrue(comp, u'Diff Methods')
    # Test reduce
    r_epac_reduce = [wf.reduce().values()[0][key_y_pred],
                     wf.reduce().values()[1][key_y_pred]]
    comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
    self.assertTrue(comp, u'Diff Perm / CV: EPAC reduce')
def test_constructor_avoid_collision_level2(self):
    # Test that level 2 collisions are avoided
    pm = Methods(*[Pipe(SelectKBest(k=2), SVC(kernel="linear", C=C))
                   for C in [1, 10]])
    leaves_key = [l.get_key() for l in pm.walk_leaves()]
    self.assertTrue(len(leaves_key) == len(set(leaves_key)),
                    u'Collision could not be avoided')
def get_workflow(self):
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               Methods(*[SVC(kernel="linear", C=C)
                                         for C in C_values]))
                          for k in k_values])
    pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
    wf = CV(pipeline, n_folds=n_folds)
    return wf
def get_workflow(self):
    ####################################################################
    ## EPAC WORKFLOW
    # -------------------------------------
    #             Perms                 Perm (Splitter)
    #          /    |    \
    #         0     1     2             Samples (Slicer)
    #               |
    #               CV                  CV (Splitter)
    #           /   |   \
    #          0    1    2              Folds (Slicer)
    #          |    |    |
    #    Pipeline Pipeline Pipeline     Sequence
    #               |
    #               2                   SelectKBest (Estimator)
    #               |
    #            Methods
    #           |       \
    # SVM(linear,C=1)  SVM(linear,C=10) Classifiers (Estimator)
    pipeline = Pipe(SelectKBest(k=2),
                    Methods(*[SVC(kernel="linear", C=C) for C in [1, 3]]))
    wf = Perms(CV(pipeline, n_folds=3),
               n_perms=3, permute="y", random_state=1)
    return wf
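# (Added illustration, not part of the original file: a minimal usage sketch of
# the workflow built above, following the run/reduce pattern used throughout
# this suite; the X, y names are assumed to come from
# datasets.make_classification.)
#
#     wf = self.get_workflow()
#     wf.run(X=X, y=y)
#     print(wf.reduce())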
def func_memm_local():
    print("memm_local pt1")
    ## 1) Build a dataset and convert it to np.memmap (for big matrices)
    ## ============================================================
    X, y = datasets.make_classification(n_samples=500,
                                         n_features=5000,
                                         n_informative=2,
                                         random_state=1)
    print("memm_local pt2")
    X = convert2memmap(X)
    y = convert2memmap(y)
    Xy = dict(X=X, y=y)
    ## 2) Build the workflow
    ## =======================================================
    print("memm_local pt3")
    from sklearn.svm import SVC
    from epac import CV, Methods
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    print("memm_local pt4")
    # from epac import LocalEngine
    # local_engine = LocalEngine(cv_svm_local, num_processes=2)
    # cv_svm = local_engine.run(**Xy)
    cv_svm_local.run(**Xy)
    print(cv_svm_local.reduce())
    print("memm_local pt5")
def get_workflow(self, n_features=int(1E03)):
    random_state = 0
    C_values = [1, 10]
    k_values = 0
    k_max = "auto"
    n_folds_nested = 5
    n_folds = 10
    n_perms = 10
    if k_max != "auto":
        k_values = range_log2(np.minimum(int(k_max), n_features),
                              add_n=True)
    else:
        k_values = range_log2(n_features, add_n=True)
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=n_folds),
               n_perms=n_perms,
               permute="y",
               random_state=random_state)
    return wf
def test_cvbestsearchrefit(self):
    X, y = datasets.make_classification(n_samples=12, n_features=10,
                                         n_informative=2)
    n_folds_nested = 2
    # random_state = 0
    C_values = [.1, 0.5, 1, 2, 5]
    kernels = ["linear", "rbf"]
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    # With EPAC
    methods = Methods(*[SVC(C=C, kernel=kernel)
                        for C in C_values for kernel in kernels])
    wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
    wf.run(X=X, y=y)
    r_epac = wf.reduce().values()[0]
    # - Without EPAC
    r_sklearn = dict()
    clf = SVC(kernel="linear")
    parameters = {'C': C_values, 'kernel': kernels}
    cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
    gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
    gscv.fit(X, y)
    r_sklearn[key_y_pred] = gscv.predict(X)
    r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
    # - Comparisons
    comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
    self.assertTrue(comp, u'Diff CVBestSearchRefitParallel: prediction')
    for key_param in r_epac[conf.BEST_PARAMS][0]:
        if key_param in r_sklearn[conf.BEST_PARAMS]:
            comp = r_sklearn[conf.BEST_PARAMS][key_param] == \
                   r_epac[conf.BEST_PARAMS][0][key_param]
            self.assertTrue(
                comp, u'Diff CVBestSearchRefitParallel: best parameters')
def test_prev_state_methods(self):
    ## 1) Build dataset
    ## ================================================
    X, y = datasets.make_classification(n_samples=5,
                                         n_features=20,
                                         n_informative=2)
    Xy = {"X": X, "y": y}
    methods = Methods(*[TOY_CLF(v_lambda=v_lambda) for v_lambda in [2, 1]])
    methods.run(**Xy)
    ps_methods = WarmStartMethods(*[TOY_CLF(v_lambda=v_lambda)
                                    for v_lambda in [2, 1]])
    ps_methods.run(**Xy)
    self.assertTrue(compare_two_node(methods, ps_methods))
    self.assertTrue(comp_2wf_reduce_res(methods, ps_methods))
def test_engine_info(self):
    n_samples = 20
    n_features = 100
    n_proc = 2
    X, y = datasets.make_classification(n_samples=n_samples,
                                         n_features=n_features,
                                         n_informative=2,
                                         random_state=1)
    Xy = dict(X=X, y=y)
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    swf_engine = SomaWorkflowEngine(cv_svm_local,
                                    num_processes=n_proc,
                                    resource_id="jl237561@gabriel",
                                    login="******",
                                    remove_finished_wf=False,
                                    remove_local_tree=False,
                                    queue="Global_long")
    swf_engine.run(**Xy)
    print("engine_info ================")
    for job_info in swf_engine.engine_info:
        print("  job_info=================")
        print("  mem_cost= ", job_info.mem_cost)
        print("  vmem_cost= ", job_info.vmem_cost)
        print("  time_cost= ", job_info.time_cost)
        self.assertTrue(job_info.time_cost > 0)
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    # print options
    # sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                         n_features=options.n_features,
                                         n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time elapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on the local machine
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    ## Run on a cluster
    # sfw_engine = SomaWorkflowEngine(
    #     tree_root=wf,
    #     num_processes=options.n_cores,
    #     resource_id="jl237561@gabriel",
    #     login="******")
    wf = sfw_engine.run(X=X, y=y)
    print "Time elapsed, fit predict:", time.time() - time_fit_predict

    time_reduce = time.time()
    ## 4) Reduce Workflow
    ## ==================
    print wf.reduce()
    print "Time elapsed, reduce:", time.time() - time_reduce
def test_peristence_load_and_fit_predict(self):
    X, y = datasets.make_classification(n_samples=20, n_features=10,
                                         n_informative=2)
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               Methods(*[SVC(kernel="linear", C=C)
                                         for C in C_values]))
                          for k in k_values])
    pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
    tree_mem = CV(pipeline, n_folds=n_folds,
                  reducer=ClassificationReport(keep=False))
    # Save Tree
    import tempfile
    store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
    tree_mem.save_tree(store=store)
    tree_mem.run(X=X, y=y)
    res_mem = tree_mem.reduce().values()[0]
    # Reload Tree
    tree_fs_noresults = store.load()
    tree_fs_noresults.run(X=X, y=y)
    res_fs_noresults = tree_fs_noresults.reduce().values()[0]
    # Save with results
    tree_fs_noresults.save_tree(store=store)
    tree_fs_withresults = store.load()
    res_fs_withresults = tree_fs_withresults.reduce().values()[0]
    # Compare
    comp = np.all([np.all(np.asarray(res_mem[k]) ==
                          np.asarray(res_fs_noresults[k]))
                   and np.all(np.asarray(res_fs_noresults[k]) ==
                              np.asarray(res_fs_withresults[k]))
                   for k in res_mem])
    self.assertTrue(comp)
def test_cv_best_search_refit_parallel(self):
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    n_samples = 500
    n_features = 10000
    n_cores = 2
    X, y = datasets.make_classification(n_samples=n_samples,
                                         n_features=n_features,
                                         n_informative=5)
    # EPAC workflow for parallel computing
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               Methods(*[SVC(kernel="linear", C=C)
                                         for C in C_values]))
                          for k in k_values])
    pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
    wf = CV(pipeline, n_folds=n_folds)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=n_cores,
                                    remove_finished_wf=False,
                                    remove_local_tree=False)
    sfw_engine_wf = sfw_engine.run(X=X, y=y)
    # EPAC workflow for normal (single-node) computing
    pipelines2 = Methods(*[Pipe(SelectKBest(k=k),
                                Methods(*[SVC(kernel="linear", C=C)
                                          for C in C_values]))
                           for k in k_values])
    pipeline2 = CVBestSearchRefitParallel(pipelines2, n_folds=n_folds_nested)
    wf2 = CV(pipeline2, n_folds=n_folds)
    wf2.run(X=X, y=y)
    self.assertTrue(compare_two_node(sfw_engine_wf, wf2))
    self.assertTrue(comp_2wf_reduce_res(sfw_engine_wf, wf2))
def test_peristence_perm_cv_parmethods_pipe_vs_sklearn(self):
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    X, y = datasets.make_classification(n_samples=12, n_features=10,
                                         n_informative=2)
    n_folds_nested = 2
    # random_state = 0
    C_values = [.1, 0.5, 1, 2, 5]
    kernels = ["linear", "rbf"]
    # With EPAC
    methods = Methods(*[SVC(C=C, kernel=kernel)
                        for C in C_values for kernel in kernels])
    wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
    # Save workflow
    # -------------
    import tempfile
    # store = StoreFs("/tmp/toto", clear=True)
    store = StoreFs(tempfile.mktemp())
    wf.save_tree(store=store)
    wf = store.load()
    wf.run(X=X, y=y)
    ## Save results
    wf.save_tree(store=store)
    wf = store.load()
    r_epac = wf.reduce().values()[0]
    # - Without EPAC
    r_sklearn = dict()
    clf = SVC(kernel="linear")
    parameters = {'C': C_values, 'kernel': kernels}
    cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
    gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
    gscv.fit(X, y)
    r_sklearn[key_y_pred] = gscv.predict(X)
    r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
    r_sklearn[conf.BEST_PARAMS]['name'] = 'SVC'
    # - Comparisons
    comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
    self.assertTrue(comp, u'Diff CVBestSearchRefitParallel: prediction')
    comp = np.all([r_epac[conf.BEST_PARAMS][0][p] ==
                   r_sklearn[conf.BEST_PARAMS][p]
                   for p in r_sklearn[conf.BEST_PARAMS]])
    self.assertTrue(comp,
                    u'Diff CVBestSearchRefitParallel: best parameters')
def test_cvbestsearchrefit_select_k_best(self):
    list_C_value = range(2, 10, 1)
    # print repr(list_C_value)
    for C_value in list_C_value:
        # C_value = 2
        # print C_value
        X, y = datasets.make_classification(n_samples=100,
                                             n_features=500,
                                             n_informative=5)
        n_folds_nested = 2
        # random_state = 0
        k_values = [2, 3, 4, 5, 6]
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        # With EPAC
        methods = Methods(*[Pipe(SelectKBest(k=k),
                                 SVC(C=C_value, kernel="linear"))
                            for k in k_values])
        wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
        wf.run(X=X, y=y)
        r_epac = wf.reduce().values()[0]
        # - Without EPAC
        from sklearn.pipeline import Pipeline
        r_sklearn = dict()
        clf = Pipeline([('anova', SelectKBest(k=3)),
                        ('svm', SVC(C=C_value, kernel="linear"))])
        parameters = {'anova__k': k_values}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        r_sklearn[conf.BEST_PARAMS]['k'] = \
            r_sklearn[conf.BEST_PARAMS]['anova__k']
        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp,
                        u'Diff CVBestSearchRefitParallel: prediction')
        for key_param in r_epac[conf.BEST_PARAMS][0]:
            if key_param in r_sklearn[conf.BEST_PARAMS]:
                comp = r_sklearn[conf.BEST_PARAMS][key_param] == \
                       r_epac[conf.BEST_PARAMS][0][key_param]
                self.assertTrue(
                    comp,
                    u'Diff CVBestSearchRefitParallel: best parameters')
def get_workflow(self):
    ####################################################################
    ## EPAC WORKFLOW
    # -------------------------------------
    #               CV                  CV (Splitter)
    #           /   |   \
    #          0    1    2              Folds (Slicer)
    #          |    |    |
    #    Pipeline Pipeline Pipeline     Sequence
    #               |
    #               2                   SelectKBest (Estimator)
    #               |
    #            Methods
    #           |       \
    # SVM(linear,C=1)  SVM(linear,C=10) Classifiers (Estimator)
    pipeline = Pipe(SelectKBest(k=2),
                    Methods(*[SVC(kernel="linear", C=C) for C in [1, 3]]))
    wf = CV(pipeline, n_folds=3, reducer=ClassificationReport(keep=True))
    return wf
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 15:21:35 2013

@author: ed203246
"""
from sklearn import datasets
from sklearn.svm import LinearSVC as SVM
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SelectKBest
from epac.map_reduce.reducers import PvalPerms
import numpy

X, y = datasets.make_classification(n_samples=100,
                                     n_features=200,
                                     n_informative=2)
X = numpy.random.rand(*X.shape)

from epac import Perms, CV, Methods
perms_cv_svm = Perms(CV(Methods(SVM(loss="l1"), SVM(loss="l2"))),
                     n_perms=100)
perms_cv_svm.run(X=X, y=y)
perms_cv_svm.reduce()

self = perms_cv_svm
key = 'LinearSVC(loss=l1)'
self = PvalPerms()
# To save the results of the top-down operation (run) to disk, it is
# possible to convert them to CSV format
from epac import export_leaves_csv
export_leaves_csv(pipe, 'my_result_run.csv')

## Parallelization
## ===============

# Multi-classifiers
# -----------------
#        Methods           Methods (Splitter)
#       /       \
#  SVM(C=1)  SVM(C=10)     Classifiers (Estimator)
from epac import Methods
multi = Methods(SVM(C=1), SVM(C=10))
multi.run(X=X, y=y)
print multi.reduce()
# reduce() formats its outputs into a "ResultSet", a dict-like structure
# whose keys identify the methods that have been used.

# You can also export the results of the bottom-up operation (reduce) to CSV
from epac import export_resultset_csv
export_resultset_csv(multi.reduce(), 'my_result_reduce.csv')

#               Methods                            Methods (Splitter)
#           /           \
# SVM(l1, C=1)  SVM(l1, C=10) ..... SVM(l2, C=10)  Classifiers (Estimator)
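# (Added illustration, not from the original tutorial: since multi.reduce()
# returns a dict-like ResultSet, one way to inspect it is to iterate over its
# keys; the exact key strings, e.g. 'LinearSVC(C=1)', are assumed from the key
# patterns shown elsewhere in this document.)
# res = multi.reduce()
# for key in res:
#     print key, res[key]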
n_folds = 10
anova_svm = Pipe(SelectKBest(k=5),
                 preprocessing.StandardScaler(),
                 SVM(class_weight='auto'))
cv = CV(anova_svm, n_folds=n_folds)
cv.run(X=X, y=y)
res_cv_anova_svm = cv.reduce()
res_cv_anova_svm["SelectKBest/StandardScaler/LinearSVC"]['y/test/score_recall']

##############################################################################
## Multimethods, "Methods": SVM l1 vs l2
from epac import Methods, CV
svms = Methods(SVM(penalty="l1", class_weight='auto', dual=False),
               SVM(penalty="l2", class_weight='auto', dual=False))
cv = CV(svms, n_folds=n_folds)
cv.run(X=X, y=y)
res_cv_svms = cv.reduce()
# print res_cv_svms
print res_cv_svms["LinearSVC(penalty=l1)"]['y/test/score_recall']
print res_cv_svms["LinearSVC(penalty=l2)"]['y/test/score_recall']

# !!! BIASED RESULT !!!
# Re-fit on all the data to see which model is chosen. Warning!!! This is
# biased since all the data have been used. It is for information only; no
# score should be taken from it. We only look at the weight maps.
svms.run(X=X, y=y)
print svms.children[0]
# Predict can return an array. In this case EPAC will put the prediction in a
# Result (a dictionary) with the key "y/pred", where "y" is the argument that
# appears in fit's signature but not in predict's. The true y will also appear
# in the result under the key "y/true".


class MySVM:
    def __init__(self, C=1.0):
        self.C = C

    def fit(self, X, y):
        from sklearn.svm import SVC
        self.svc = SVC(C=self.C)
        self.svc.fit(X, y)

    def predict(self, X):
        return self.svc.predict(X)


svms = Methods(MySVM(C=1.0), MySVM(C=2.0))
cv = CV(svms, cv_key="y", cv_type="stratified", n_folds=2, reducer=None)
cv.run(X=X, y=y)  # top-down process to call transform
cv.reduce()       # bottom-up process

from sklearn.decomposition import PCA


class MyPCA(PCA):
    """PCA with a predict method"""
    def predict(self, X):
        """Project X onto the PCs, then project back to the original space.

        If X is not singular, self.fit(X).predict(X) == X.
        """
        return np.dot(self.transform(X), self.components_) + self.mean_


pcas = Methods(MyPCA(n_components=1), MyPCA(n_components=2))
cv = CV(pcas, n_folds=2, reducer=None)
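# (Added sketch, not from the original snippet: as with the MySVM example
# above, the PCA tree would then be run top-down and reduced bottom-up; the
# same run/reduce API is assumed.)
# cv.run(X=X, y=y)
# cv.reduce()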
X, y = datasets.make_classification(n_samples=500,
                                     n_features=10000,
                                     n_informative=2,
                                     random_state=1)
X = convert2memmap(X)
y = convert2memmap(y)
Xy = dict(X=X, y=y)

## 2) Build two workflows respectively
## =======================================================
from sklearn.svm import SVC
from epac import CV, Methods
cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                            SVC(kernel="rbf")]),
                  n_folds=3)
cv_svm_swf = CV(Methods(*[SVC(kernel="linear"),
                          SVC(kernel="rbf")]),
                n_folds=3)

## 3) Run the two workflows using the local engine and soma-workflow
## =========================================================
from epac import LocalEngine
local_engine = LocalEngine(cv_svm_local, num_processes=2)
cv_svm = local_engine.run(X=X, y=y)
print(cv_svm.reduce())

from epac import SomaWorkflowEngine
swf_engine = SomaWorkflowEngine(cv_svm_swf,
                                num_processes=2,
                                # resource_id="jl237561@gabriel",
# Each node calls the "transform" method, which takes a dictionary as input
# and produces a dictionary as output. The output is passed to the next node.
# The return value of run is simply the aggregation of the outputs (dicts) of
# the leaf nodes.

## Parallelization
## ===============

# Multi-classifiers
# -----------------
#        Methods           Methods (Splitter)
#       /       \
#  SVM(C=1)  SVM(C=10)     Classifiers (Estimator)
from epac import Methods
multi = Methods(SVM(C=1), SVM(C=10))
multi.run(X=X, y=y)
print multi.reduce()
# reduce() formats its outputs into a "ResultSet", a dict-like structure
# whose keys identify the methods that have been used.

#               Methods                            Methods (Splitter)
#           /           \
# SVM(l1, C=1)  SVM(l1, C=10) ..... SVM(l2, C=10)  Classifiers (Estimator)
svms = Methods(*[SVM(loss=loss, C=C)
                 for loss in ("l1", "l2") for C in [1, 10]])
svms.run(X=X, y=y)
print svms.reduce()

# Parallelize a sequential Pipeline: Anova (k-best selection) + SVM,
# as sketched below.
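# (Added illustration of the parallelized Anova + SVM pipelines announced
# above; not from the original tutorial. The pattern mirrors the
# Methods-of-Pipe construction used in other snippets of this document; the
# k and C values are arbitrary examples.)
# from epac import Pipe
# from sklearn.feature_selection import SelectKBest
# anova_svms = Methods(*[Pipe(SelectKBest(k=k), SVM(C=C))
#                        for k in [5, 10] for C in [1, 10]])
# anova_svms.run(X=X, y=y)
# print anova_svms.reduce()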
from epac import Methods
import numpy as np
from sklearn import datasets
from sklearn.svm import SVC

X, y = datasets.make_classification(n_samples=500,
                                     n_features=200000,
                                     n_informative=2,
                                     random_state=1)
methods = Methods(*[SVC(C=1, kernel='linear'), SVC(C=1, kernel='rbf')])
data = {"X": X, 'y': y, "methods": methods}
# X = np.random.random((500, 200000))


def map_func(data):
    from sklearn.cross_validation import StratifiedKFold
    from sklearn import svm, cross_validation
    kfold = StratifiedKFold(y=data['y'], n_folds=3)
    # kfold = cross_validation.KFold(n=data.X.shape[0], n_folds=3)
    # svc = SVC(C=1, kernel='linear')
    for train, test in kfold:
        # svc.fit(data['X'][train], data['y'][train])
        # svc.predict(data['X'][test])
        data['methods'].run(X=data["X"][train], y=data['y'][train])
# To save the results of the top-down operation (run) to disk, it is
# possible to convert them to CSV format
from epac import export_leaves_csv
export_leaves_csv(pipe, 'my_result_run.csv')

## Parallelization
## ===============

# Multi-classifiers
# -----------------
#        Methods           Methods (Splitter)
#       /       \
#  SVM(C=1)  SVM(C=10)     Classifiers (Estimator)
from epac import Methods
multi = Methods(SVM(C=1), SVM(C=10))
multi.run(X=X, y=y)
print(multi.reduce())
# reduce() formats its outputs into a "ResultSet", a dict-like structure
# whose keys identify the methods that have been used.

# You can also export the results of the bottom-up operation (reduce) to CSV
from epac import export_resultset_csv
export_resultset_csv(multi.reduce(), 'my_result_reduce.csv')

#               Methods                            Methods (Splitter)
#           /           \
# SVM(l1, C=1)  SVM(l1, C=10) ..... SVM(l2, C=10)  Classifiers (Estimator)
svms = Methods(
def get_workflow(self):
    wf = Methods(*[SVC(kernel="linear", C=C) for C in [1, 3]])
    return wf
return {"y/pred": pred_y, "y/true": y, "best_beta": self.v_beta} if __name__ == "__main__": ## 1) Build dataset ## ================================================ X, y = datasets.make_classification(n_samples=10, n_features=5, n_informative=2, random_state=1) Xy = {"X": X, "y": y} ## 2) Build Methods ## ================================================ print("Methods ===================================") methods = Methods(*[TOY_CLF(v_lambda=v_lambda) for v_lambda in [2, 1]]) print(methods.run(**Xy)) ## 3) Build WarmStartMethods like Methods ## ================================================ ## WarmStartMethods ## / \ ## TOY_CLF(v_lambda=2) TOY_CLF(v_lambda=1) ## ## 1. WarmStartMethods will look for different argumenets as signature ## For example, here is v_lambda, there are different for each leaf ## 2. And then run TOY_CLF(v_lambda=2).transform ## 3. Except v_lambda, WarmStartMethods copy all the other parameters ## from TOY_CLF(v_lambda=2) to TOY_CLF(v_lambda=1) as initialization ## 4. Finally call TOY_CLF(v_lambda=1).transform print("WarmStartMethods ==========================")
def test_memmapping(self):
    ## 1) Building dataset
    ## ============================================================
    if self.memmap:
        # If the number of processes is 1, always generate the matrix.
        # Otherwise, load it if it exists, or create it if it doesn't.
        writing_mode = (self.n_proc == 1)
        X = create_mmat(self.n_samples, self.n_features,
                        dir=self.directory,
                        writing_mode=writing_mode)
        y = create_array(self.n_samples, [0, 1],
                         dir=self.directory,
                         writing_mode=writing_mode)
        Xy = dict(X=X, y=y)
    else:
        X, y = datasets.make_classification(n_samples=self.n_samples,
                                             n_features=self.n_features,
                                             n_informative=2,
                                             random_state=1)
        Xy = dict(X=X, y=y)

    ## 2) Building workflow
    ## =======================================================
    from sklearn.svm import SVC
    from epac import CV, Methods
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    cv_svm = None
    if self.is_swf:
        # Running on the cluster
        from epac import SomaWorkflowEngine
        mmap_mode = None
        if self.memmap:
            mmap_mode = "r+"
        swf_engine = SomaWorkflowEngine(cv_svm_local,
                                        num_processes=self.n_proc,
                                        resource_id="jl237561@gabriel",
                                        login="******",
                                        # remove_finished_wf=False,
                                        # remove_local_tree=False,
                                        mmap_mode=mmap_mode,
                                        queue="Global_long")
        cv_svm = swf_engine.run(**Xy)

        # Printing information about the jobs
        time.sleep(2)
        print('')
        sum_memory = 0
        max_time_cost = 0
        for job_info in swf_engine.engine_info:
            print("mem_cost = {0}, vmem_cost = {1}, time_cost = {2}".format(
                job_info.mem_cost, job_info.vmem_cost, job_info.time_cost))
            sum_memory += job_info.mem_cost
            if max_time_cost < job_info.time_cost:
                max_time_cost = job_info.time_cost
        print("sum_memory = ", sum_memory)
        print("max_time_cost = ", max_time_cost)
    else:
        # Running on the local machine
        from epac import LocalEngine
        local_engine = LocalEngine(cv_svm_local, num_processes=self.n_proc)
        cv_svm = local_engine.run(**Xy)

    cv_svm_reduce = cv_svm.reduce()
    print("\n -> Reducing results")
    print(cv_svm_reduce)

    # Creating the directory to save results, if it doesn't exist
    dirname = 'tmp_save_tree/'
    if self.directory is None:
        directory = '/tmp'
    else:
        directory = self.directory
    if not os.path.isdir(directory):
        os.mkdir(directory)
    dirpath = os.path.join(directory, dirname)
    if not os.path.isdir(dirpath):
        os.mkdir(dirpath)

    if self.n_proc == 1:
        ## 4.1) Saving results on the disk for one process
        ## ===================================================
        store = StoreFs(dirpath=dirpath, clear=True)
        cv_svm.save_tree(store=store)
        with open(os.path.join(directory, "tmp_save_results"), 'w+') \
                as filename:
            print(filename.name)
            pickle.dump(cv_svm_reduce, filename)
    else:
        ## 4.2) Loading the results for one process
        ## ===================================================
        try:
            store = StoreFs(dirpath=dirpath, clear=False)
            cv_svm_one_proc = store.load()
            with open(os.path.join(directory, "tmp_save_results"), 'r+') \
                    as filename:
                cv_svm_reduce_one_proc = pickle.load(filename)

            ## 5.2) Comparing results to the results for one process
            ## ===================================================
            print("\nComparing %i proc with one proc" % self.n_proc)
            self.assertTrue(compare_two_node(cv_svm, cv_svm_one_proc))
            self.assertTrue(isequal(cv_svm_reduce, cv_svm_reduce_one_proc))
        except KeyError:
            print("Warning: ")
            print("No previous tree detected, no possible "
                  "comparison of results")
Xd, yd = IO.read_Xy(WD=WD)
Xd.PAS2gr[Xd.PAS2gr == 1] = -1
Xd.PAS2gr[Xd.PAS2gr == 2] = 1
Xd.CB_EXPO[Xd.CB_EXPO == 0] = -1
X = np.asarray(Xd)
y = np.asarray(yd)

C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======
svms = Methods(*[SVM(dual=False, class_weight='auto', penalty="l1", C=C)
                 for C in C_values])

cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# print cv_results
epac.export_csv(cv, cv_results,
                os.path.join(WD, "results",
                             "cv10_caarms+pas+canabis_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================
svms_cv = CVBestSearchRefit(svms, n_folds=10, cv_type="stratified")
def test_constructor_avoid_collision_level1(self):
    # Test that level 1 collisions are avoided
    pm = Methods(*[SVC(kernel="linear", C=C) for C in [1, 10]])
    leaves_key = [l.get_key() for l in pm.walk_leaves()]
    self.assertTrue(len(leaves_key) == len(set(leaves_key)),
                    u"Collision could not be avoided")
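# (Added note, not from the original test: the keys returned by get_key()
# embed the parameters that differ between sibling nodes, which is how
# collisions are avoided. Based on the key patterns used elsewhere in this
# document, e.g. 'LinearSVC(penalty=l1)' or 'MySVC(C=1.0)', the two leaves
# above would carry keys distinguished by C=1 vs C=10.)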
def test_mysvc_reducer(self):
    ## 1) Build dataset
    ## ===================================================================
    X, y = datasets.make_classification(n_samples=12,
                                        n_features=10,
                                        n_informative=2,
                                        random_state=1)

    ## 2) Run with Methods
    ## ===================================================================
    my_svc1 = MySVC(C=1.0)
    my_svc2 = MySVC(C=2.0)

    two_svc_single = Methods(my_svc1, my_svc2)
    two_svc_local = Methods(my_svc1, my_svc2)
    two_svc_swf = Methods(my_svc1, my_svc2)

    two_svc_single.reducer = MyReducer()
    two_svc_local.reducer = MyReducer()
    two_svc_swf.reducer = MyReducer()

    for leaf in two_svc_single.walk_leaves():
        print leaf.get_key()
    for leaf in two_svc_local.walk_leaves():
        print leaf.get_key()
    for leaf in two_svc_swf.walk_leaves():
        print leaf.get_key()

    # Top-down process to call transform
    two_svc_single.run(X=X, y=y)
    # Bottom-up process to compute scores
    res_single = two_svc_single.reduce()
    ### You should get results like:
    ### ==================================================================
    ### [{'MySVC(C=1.0)': array([ 1.,  1.])},
    ###  {'MySVC(C=2.0)': array([ 1.,  1.])}]

    ### 3) Run using local multi-processes
    ### ==================================================================
    from epac.map_reduce.engine import LocalEngine
    local_engine = LocalEngine(two_svc_local, num_processes=2)
    two_svc_local = local_engine.run(**dict(X=X, y=y))
    res_local = two_svc_local.reduce()

    ### 4) Run using soma-workflow
    ### ==================================================================
    from epac.map_reduce.engine import SomaWorkflowEngine
    sfw_engine = SomaWorkflowEngine(tree_root=two_svc_swf,
                                    num_processes=2)
    two_svc_swf = sfw_engine.run(**dict(X=X, y=y))
    res_swf = two_svc_swf.reduce()

    if not repr(res_swf) == repr(res_local):
        raise ValueError("Cannot dump class definition")
    if not repr(res_swf) == repr(res_single):
        raise ValueError("Cannot dump class definition")
def todo_perm_cv_grid_vs_sklearn(self):
    X, y = datasets.make_classification(n_samples=100,
                                        n_features=500,
                                        n_informative=5)
    n_perms = 3
    n_folds = 2
    n_folds_nested = 2
    random_state = 0
    k_values = [2, 3]
    C_values = [1, 10]
    # = With EPAC
    pipelines = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                          for C in C_values for k in k_values])
    # print [n for n in pipelines.walk_leaves()]
    pipelines_cv = CVBestSearchRefit(pipelines,
                                     n_folds=n_folds_nested,
                                     random_state=random_state)
    wf = Perms(CV(pipelines_cv, n_folds=n_folds,
                  reducer=ClassificationReport(keep=True)),
               n_perms=n_perms, permute="y",
               reducer=PvalPerms(keep=True),
               random_state=random_state)
    wf.fit_predict(X=X, y=y)
    r_epac = wf.reduce().values()[0]
    for key in r_epac:
        print("key=" + repr(key) + ", value=" + repr(r_epac[key]))
    # = With SKLEARN
    from sklearn.cross_validation import StratifiedKFold
    from epac.sklearn_plugins import Permutations
    from sklearn.pipeline import Pipeline
    from sklearn import grid_search
    clf = Pipeline([('anova', SelectKBest(k=3)),
                    ('svm', SVC(kernel="linear"))])
    parameters = {'anova__k': k_values, 'svm__C': C_values}
    r_sklearn = dict()
    r_sklearn['pred_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['true_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['score_tr'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['score_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['mean_score_te'] = [None] * n_perms
    r_sklearn['mean_score_tr'] = [None] * n_perms
    perm_nb = 0
    perms = Permutations(n=y.shape[0],
                         n_perms=n_perms,
                         random_state=random_state)
    for idx in perms:
        # idx = perms.__iter__().next()
        y_p = y[idx]
        cv = StratifiedKFold(y=y_p, n_folds=n_folds)
        fold_nb = 0
        for idx_train, idx_test in cv:
            # idx_train, idx_test = cv.__iter__().next()
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            y_p_train = y_p[idx_train, :]
            y_p_test = y_p[idx_test, :]
            # Nested CV
            cv_nested = StratifiedKFold(y=y_p_train,
                                        n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
            gscv.fit(X_train, y_p_train)
            r_sklearn['pred_te'][perm_nb][fold_nb] = gscv.predict(X_test)
            r_sklearn['true_te'][perm_nb][fold_nb] = y_p_test
            r_sklearn['score_tr'][perm_nb][fold_nb] = \
                gscv.score(X_train, y_p_train)
            r_sklearn['score_te'][perm_nb][fold_nb] = \
                gscv.score(X_test, y_p_test)
            fold_nb += 1
        # Average over folds
        r_sklearn['mean_score_te'][perm_nb] = \
            np.mean(np.asarray(r_sklearn['score_te'][perm_nb]), axis=0)
        r_sklearn['mean_score_tr'][perm_nb] = \
            np.mean(np.asarray(r_sklearn['score_tr'][perm_nb]), axis=0)
        # np.mean(R2[key]['score_tr'][perm_nb])
        perm_nb += 1
    print(repr(r_sklearn))
    # - Comparisons
    shared_keys = set(r_epac.keys()).intersection(set(r_sklearn.keys()))
    comp = {k: np.all(np.asarray(r_epac[k]) == np.asarray(r_sklearn[k]))
            for k in shared_keys}
    print("comp=" + repr(comp))
    # return comp
    for key in comp:
        self.assertTrue(comp[key], u'Diff for attribute: "%s"' % key)
Xd, yd = IO.read_Xy(WD=WD)
X = np.asarray(Xd)
y = np.asarray(yd)
# DO NOT scale
# X -= X.mean(axis=0)
# X /= X.std(axis=0)

# k_values = [1, 2, 3, 4, 5, 10, 15, 20, 25, 27]
C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======
svms = Methods(*[SVM(dual=False, class_weight='auto', penalty="l1", C=C)
                 for C in C_values])
# anova_svms = Methods(*[Pipe(SelectKBest(k=k),
#                             preprocessing.StandardScaler(),
#                             Methods(*[SVM(C=C, penalty=penalty,
#                                           class_weight='auto', dual=False)
#                                       for C in C_values
#                                       for penalty in ['l1', 'l2']]))
#                        for k in k_values])

cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# print cv_results
epac.export_csv(cv, cv_results,
                os.path.join(WD, "results", "cv10_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================
""" from sklearn import datasets X, y = datasets.make_classification(n_samples=500, n_features=200000, n_informative=2, random_state=1) Xy = dict(X=X, y=y) ## 2) Building workflow ## ======================================================= print " -> Pt2 : X and y created, building workflow" from sklearn import svm, cross_validation #kfold = cross_validation.KFold(n=len(X), n_folds=3) #svc = svm.SVC(C=1, kernel='linear') #print [svc.fit(X[train], y[train]).score(X[test], y[test]) for train, test in kfold] from epac import CV, Methods cv_svm_local = CV(Methods(*[svm.SVC(kernel="linear"), svm.SVC(kernel="rbf")]), n_folds=3) print " -> Pt3 : Workflow built, defining local engine" cv_svm = None n_proc = 2 # Running on the local machine from epac import LocalEngine local_engine = LocalEngine(cv_svm_local, num_processes=n_proc) print " -> Pt4 : Running" cv_svm = local_engine.run(**Xy) print " -> Success with %i procs!" % n_proc
return {"y/pred": pred_y, "y/true": y, "best_beta": self.v_beta} if __name__ == "__main__": ## 1) Build dataset ## ================================================ X, y = datasets.make_classification(n_samples=10, n_features=5, n_informative=2, random_state=1) Xy = {"X": X, "y": y} ## 2) Build Methods ## ================================================ print "Methods ===================================" methods = Methods(*[TOY_CLF(v_lambda=v_lambda) for v_lambda in [2, 1]]) print methods.run(**Xy) ## 3) Build WarmStartMethods like Methods ## ================================================ ## WarmStartMethods ## / \ ## TOY_CLF(v_lambda=2) TOY_CLF(v_lambda=1) ## ## 1. WarmStartMethods will look for different argumenets as signature ## For example, here is v_lambda, there are different for each leaf ## 2. And then run TOY_CLF(v_lambda=2).transform ## 3. Except v_lambda, WarmStartMethods copy all the other parameters ## from TOY_CLF(v_lambda=2) to TOY_CLF(v_lambda=1) as initialization ## 4. Finally call TOY_CLF(v_lambda=1).transform print "WarmStartMethods =========================="
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    # print options
    # sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                         n_features=options.n_features,
                                         n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time elapsed, tree construction:", time.time() - time_start

    ## 3) Export Workflow to soma_workflow_gui
    ## ========================================
    time_fit_predict = time.time()
    if os.path.isdir(options.soma_workflow_dir):
        shutil.rmtree(options.soma_workflow_dir)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    sfw_engine.export_to_gui(options.soma_workflow_dir, X=X, y=y)
    print "Time elapsed, fit predict:", time.time() - time_fit_predict

    ## 4) Load EPAC tree & Reduce
    ## ==========================
    reduce_filename = os.path.join(options.soma_workflow_dir, "reduce.py")
    f = open(reduce_filename, 'w')
    reduce_str = """from epac.map_reduce.engine import SomaWorkflowEngine
wf = SomaWorkflowEngine.load_from_gui("%s")
print wf.reduce()
""" % options.soma_workflow_dir
    f.write(reduce_str)
    f.close()
    print "#First run\n"\
          "soma_workflow_gui\n"\
          "\t(1)Open %s\n"\
          "\t(2)Submit\n"\
          "\t(3)Transfer Input Files\n"\
          "\t...wait...\n"\
          "\t(4)Transfer Output Files\n"\
          "#When done run:\npython %s" % (
              os.path.join(options.soma_workflow_dir,
                           sfw_engine.open_me_by_soma_workflow_gui),
              reduce_filename)
@author: ed203246
"""
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.lda import LDA
from sklearn.feature_selection import SelectKBest

X, y = datasets.make_classification(n_samples=12,
                                    n_features=10,
                                    n_informative=2)

from epac import Methods, Pipe
self = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel=kernel, C=C))
                 for kernel in ("linear", "rbf")
                 for C in [1, 10]
                 for k in [1, 2]])
self = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C))
                 for C in [1, 10] for k in [1, 2]])
import copy
self.fit_predict(X=X, y=y)
self.reduce()

[l.get_key() for l in svms.walk_nodes()]
[l.get_key(2) for l in svms.walk_nodes()]  # intermediary key collisions: trig aggregation
"""

# Model selection using CV: CV + Grid
# -----------------------------------------
from epac import CVBestSearchRefit
# CV + Grid search of a simple classifier