def get_workflow(self, n_features=int(1E03)):
    random_state = 0
    C_values = [1, 10]
    k_max = "auto"
    n_folds_nested = 5
    n_folds = 10
    n_perms = 10
    if k_max != "auto":
        k_values = range_log2(np.minimum(int(k_max), n_features),
                              add_n=True)
    else:
        k_values = range_log2(n_features, add_n=True)
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls, n_folds=n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=n_folds), n_perms=n_perms,
               permute="y", random_state=random_state)
    return wf
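# Usage sketch (hypothetical driver; the engine API is the one exercised in
# do_all() elsewhere in this collection): the returned tree can be run
# locally or handed to SomaWorkflowEngine for parallel execution.
#   wf = self.get_workflow()
#   engine = SomaWorkflowEngine(tree_root=wf, num_processes=2)
#   wf = engine.run(X=X, y=y)
#   print wf.reduce()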
def get_workflow(self):
    ####################################################################
    ## EPAC WORKFLOW
    # -------------------------------------
    #             Perms                     Perm (Splitter)
    #         /     |     \
    #        0      1      2                Samples (Slicer)
    #               |
    #               CV                      CV (Splitter)
    #         /     |     \
    #        0      1      2                Folds (Slicer)
    #        |      |      |
    #    Pipeline Pipeline Pipeline         Sequence
    #               |
    #               2                       SelectKBest (Estimator)
    #               |
    #            Methods
    #           /       \
    #  SVM(linear,C=1) SVM(linear,C=3)      Classifiers (Estimator)
    pipeline = Pipe(SelectKBest(k=2),
                    Methods(*[SVC(kernel="linear", C=C) for C in [1, 3]]))
    wf = Perms(CV(pipeline, n_folds=3), n_perms=3, permute="y",
               random_state=1)
    return wf
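# Usage sketch (assuming the run/reduce API used throughout these examples):
#   wf = self.get_workflow()
#   wf.run(X=X, y=y)
#   print wf.reduce()  # results aggregated over the 3 permutations x 3 folds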
def test_pipeline(self):
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    # = With EPAC
    wf = Pipe(SelectKBest(k=2), SVC(kernel="linear"))
    r_epac = wf.top_down(X=X, y=y)
    # = With SKLEARN
    pipe = sklearn.pipeline.Pipeline([("anova", SelectKBest(k=2)),
                                      ("svm", SVC(kernel="linear"))])
    r_sklearn = pipe.fit(X, y).predict(X)
    key2cmp = "y" + conf.SEP + conf.PREDICTION
    # = Comparison
    self.assertTrue(np.all(r_epac[key2cmp] == r_sklearn),
                    u"Diff in Pipe: EPAC vs sklearn")
    # test reduce
    r_epac_reduce = wf.reduce().values()[0][key2cmp]
    self.assertTrue(np.all(r_epac_reduce == r_sklearn),
                    u"Diff in Pipe: EPAC reduce")
def test_constructor_avoid_collision_level2(self):
    # Test that level 2 collisions are avoided
    pm = Methods(*[Pipe(SelectKBest(k=2), SVC(kernel="linear", C=C))
                   for C in [1, 10]])
    leaves_key = [l.get_key() for l in pm.walk_leaves()]
    self.assertTrue(len(leaves_key) == len(set(leaves_key)),
                    u'Collision could not be avoided')
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(
        n_samples=options.n_samples,
        n_features=options.n_features,
        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls, n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms, permute="y",
               random_state=random_state)
    print "Time elapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on the local machine
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    ## Run on a cluster
    # sfw_engine = SomaWorkflowEngine(
    #     tree_root=wf,
    #     num_processes=options.n_cores,
    #     resource_id="jl237561@gabriel",
    #     login="******")
    wf = sfw_engine.run(X=X, y=y)
    print "Time elapsed, fit predict:", time.time() - time_fit_predict

    time_reduce = time.time()
    ## 4) Reduce Workflow
    ## ==================
    print wf.reduce()
    print "Time elapsed, reduce:", time.time() - time_reduce
def get_workflow(self):
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               Methods(*[SVC(kernel="linear", C=C)
                                         for C in C_values]))
                          for k in k_values])
    pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
    wf = CV(pipeline, n_folds=n_folds)
    return wf
def test_cv_best_search_refit_parallel(self):
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    n_samples = 500
    n_features = 10000
    n_cores = 2
    X, y = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features,
                                        n_informative=5)
    # EPAC workflow for parallel computing
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               Methods(*[SVC(kernel="linear", C=C)
                                         for C in C_values]))
                          for k in k_values])
    pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
    wf = CV(pipeline, n_folds=n_folds)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=n_cores,
                                    remove_finished_wf=False,
                                    remove_local_tree=False)
    sfw_engine_wf = sfw_engine.run(X=X, y=y)
    # EPAC workflow for local, single-process computing
    pipelines2 = Methods(*[Pipe(SelectKBest(k=k),
                                Methods(*[SVC(kernel="linear", C=C)
                                          for C in C_values]))
                           for k in k_values])
    pipeline2 = CVBestSearchRefitParallel(pipelines2,
                                          n_folds=n_folds_nested)
    wf2 = CV(pipeline2, n_folds=n_folds)
    wf2.run(X=X, y=y)
    self.assertTrue(compare_two_node(sfw_engine_wf, wf2))
    self.assertTrue(comp_2wf_reduce_res(sfw_engine_wf, wf2))
def test_cvbestsearchrefit_select_k_best(self):
    list_C_value = range(2, 10, 1)
    for C_value in list_C_value:
        X, y = datasets.make_classification(n_samples=100,
                                            n_features=500,
                                            n_informative=5)
        n_folds_nested = 2
        k_values = [2, 3, 4, 5, 6]
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        # With EPAC
        methods = Methods(*[Pipe(SelectKBest(k=k),
                                 SVC(C=C_value, kernel="linear"))
                            for k in k_values])
        wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
        wf.run(X=X, y=y)
        r_epac = wf.reduce().values()[0]
        # - Without EPAC
        from sklearn.pipeline import Pipeline
        r_sklearn = dict()
        clf = Pipeline([('anova', SelectKBest(k=3)),
                        ('svm', SVC(C=C_value, kernel="linear"))])
        parameters = {'anova__k': k_values}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        r_sklearn[conf.BEST_PARAMS]['k'] = \
            r_sklearn[conf.BEST_PARAMS]['anova__k']
        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp,
                        u'Diff CVBestSearchRefitParallel: prediction')
        for key_param in r_epac[conf.BEST_PARAMS][0]:
            if key_param in r_sklearn[conf.BEST_PARAMS]:
                comp = r_sklearn[conf.BEST_PARAMS][key_param] == \
                    r_epac[conf.BEST_PARAMS][0][key_param]
                self.assertTrue(
                    comp,
                    u'Diff CVBestSearchRefitParallel: best parameters')
def test_persistence_load_and_fit_predict(self):
    X, y = datasets.make_classification(n_samples=20, n_features=10,
                                        n_informative=2)
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               Methods(*[SVC(kernel="linear", C=C)
                                         for C in C_values]))
                          for k in k_values])
    pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
    tree_mem = CV(pipeline, n_folds=n_folds,
                  reducer=ClassificationReport(keep=False))
    # Save tree
    import tempfile
    store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
    tree_mem.save_tree(store=store)
    tree_mem.run(X=X, y=y)
    res_mem = tree_mem.reduce().values()[0]
    # Reload tree
    tree_fs_noresults = store.load()
    tree_fs_noresults.run(X=X, y=y)
    res_fs_noresults = tree_fs_noresults.reduce().values()[0]
    # Save with results
    tree_fs_noresults.save_tree(store=store)
    tree_fs_withresults = store.load()
    res_fs_withresults = tree_fs_withresults.reduce().values()[0]
    # Compare
    comp = np.all([np.all(np.asarray(res_mem[k]) ==
                          np.asarray(res_fs_noresults[k]))
                   and np.all(np.asarray(res_fs_noresults[k]) ==
                              np.asarray(res_fs_withresults[k]))
                   for k in res_mem])
    self.assertTrue(comp)
def get_workflow(self):
    ####################################################################
    ## EPAC WORKFLOW
    # -------------------------------------
    #               CV                      CV (Splitter)
    #         /     |     \
    #        0      1      2                Folds (Slicer)
    #        |      |      |
    #    Pipeline Pipeline Pipeline         Sequence
    #               |
    #               2                       SelectKBest (Estimator)
    #               |
    #            Methods
    #           /       \
    #  SVM(linear,C=1) SVM(linear,C=3)      Classifiers (Estimator)
    pipeline = Pipe(SelectKBest(k=2),
                    Methods(*[SVC(kernel="linear", C=C) for C in [1, 3]]))
    wf = CV(pipeline, n_folds=3, reducer=ClassificationReport(keep=True))
    return wf
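# Usage sketch: with ClassificationReport(keep=True), reduce() keeps the
# per-fold predictions in addition to the scores (result keys follow the
# 'y/test/...' layout shown in the other examples):
#   wf = self.get_workflow()
#   wf.run(X=X, y=y)
#   res = wf.reduce()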
def _search_best(self, **Xy):
    # Fit/predict CV grid search
    self.cv.store = StoreMem()  # local store erased at each fit
    from epac.workflow.pipeline import Pipe
    self.cv.top_down(**Xy)
    # Pump-up results
    cv_result_set = self.cv.reduce(store_results=False)
    key_val = [(result.key(), result[self.score])
               for result in cv_result_set]
    scores = np.asarray(zip(*key_val)[1])
    scores_opt = np.max(scores) if self.arg_max else np.min(scores)
    idx_best = np.where(scores == scores_opt)[0][0]
    best_key = key_val[idx_best][0]
    # Find the nodes that match the best key
    nodes_dict = {n.get_signature(): n
                  for n in self.cv.walk_true_nodes()
                  if n.get_signature() in key_split(best_key)}
    to_refit = Pipe(*[nodes_dict[k].wrapped_node
                      for k in key_split(best_key)])
    best_params = [dict(sig) for sig in key_split(best_key, eval=True)]
    return to_refit, best_params
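# Illustration (hypothetical call sequence; the public API is the one
# exercised in the tests): _search_best() runs when a CVBestSearchRefit
# node is executed, before the refit on the whole training set.
#   wf = CVBestSearchRefit(Methods(...), n_folds=5)
#   wf.run(X=X, y=y)            # inner CV + _search_best() + refit
#   wf.reduce().values()[0][conf.BEST_PARAMS]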
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(
        n_samples=options.n_samples,
        n_features=options.n_features,
        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls, n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms, permute="y",
               random_state=random_state)
    print "Time elapsed, tree construction:", time.time() - time_start

    ## 3) Export Workflow to soma_workflow_gui
    ## =======================================
    time_export = time.time()
    if os.path.isdir(options.soma_workflow_dir):
        shutil.rmtree(options.soma_workflow_dir)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    sfw_engine.export_to_gui(options.soma_workflow_dir, X=X, y=y)
    print "Time elapsed, export:", time.time() - time_export

    ## 4) Load EPAC tree & Reduce
    ## ==========================
    reduce_filename = os.path.join(options.soma_workflow_dir, "reduce.py")
    f = open(reduce_filename, 'w')
    reduce_str = """from epac.map_reduce.engine import SomaWorkflowEngine
wf = SomaWorkflowEngine.load_from_gui("%s")
print wf.reduce()
""" % options.soma_workflow_dir
    f.write(reduce_str)
    f.close()
    print "# First run\n"\
        "soma_workflow_gui\n"\
        "\t(1) Open %s\n"\
        "\t(2) Submit\n"\
        "\t(3) Transfer Input Files\n"\
        "\t...wait...\n"\
        "\t(4) Transfer Output Files\n"\
        "# When done, run:\npython %s" % (
            os.path.join(options.soma_workflow_dir,
                         sfw_engine.open_me_by_soma_workflow_gui),
            reduce_filename)
def test_constructor_cannot_avoid_collision_level2(self):
    # This should raise an exception since the collision cannot be avoided
    self.assertRaises(ValueError, Methods,
                      *[Pipe(SelectKBest(k=2), SVC(kernel="linear", C=C))
                        for C in [1, 1]])
def todo_perm_cv_grid_vs_sklearn(self):
    X, y = datasets.make_classification(n_samples=100,
                                        n_features=500,
                                        n_informative=5)
    n_perms = 3
    n_folds = 2
    n_folds_nested = 2
    random_state = 0
    k_values = [2, 3]
    C_values = [1, 10]
    # = With EPAC
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               SVC(C=C, kernel="linear"))
                          for C in C_values for k in k_values])
    pipelines_cv = CVBestSearchRefit(pipelines, n_folds=n_folds_nested,
                                     random_state=random_state)
    wf = Perms(CV(pipelines_cv, n_folds=n_folds,
                  reducer=ClassificationReport(keep=True)),
               n_perms=n_perms, permute="y",
               reducer=PvalPerms(keep=True),
               random_state=random_state)
    wf.fit_predict(X=X, y=y)
    r_epac = wf.reduce().values()[0]
    for key in r_epac:
        print("key=" + repr(key) + ", value=" + repr(r_epac[key]))
    # = With SKLEARN
    from sklearn.cross_validation import StratifiedKFold
    from epac.sklearn_plugins import Permutations
    from sklearn.pipeline import Pipeline
    from sklearn import grid_search
    clf = Pipeline([('anova', SelectKBest(k=3)),
                    ('svm', SVC(kernel="linear"))])
    parameters = {'anova__k': k_values, 'svm__C': C_values}
    r_sklearn = dict()
    r_sklearn['pred_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['true_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['score_tr'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['score_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['mean_score_te'] = [None] * n_perms
    r_sklearn['mean_score_tr'] = [None] * n_perms
    perm_nb = 0
    perms = Permutations(n=y.shape[0], n_perms=n_perms,
                         random_state=random_state)
    for idx in perms:
        y_p = y[idx]
        cv = StratifiedKFold(y=y_p, n_folds=n_folds)
        fold_nb = 0
        for idx_train, idx_test in cv:
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            y_p_train = y_p[idx_train]
            y_p_test = y_p[idx_test]
            # Nested CV
            cv_nested = StratifiedKFold(y=y_p_train,
                                        n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, parameters,
                                            cv=cv_nested)
            gscv.fit(X_train, y_p_train)
            r_sklearn['pred_te'][perm_nb][fold_nb] = gscv.predict(X_test)
            r_sklearn['true_te'][perm_nb][fold_nb] = y_p_test
            r_sklearn['score_tr'][perm_nb][fold_nb] = \
                gscv.score(X_train, y_p_train)
            r_sklearn['score_te'][perm_nb][fold_nb] = \
                gscv.score(X_test, y_p_test)
            fold_nb += 1
        # Average over folds
        r_sklearn['mean_score_te'][perm_nb] = \
            np.mean(np.asarray(r_sklearn['score_te'][perm_nb]), axis=0)
        r_sklearn['mean_score_tr'][perm_nb] = \
            np.mean(np.asarray(r_sklearn['score_tr'][perm_nb]), axis=0)
        perm_nb += 1
    print(repr(r_sklearn))
    # - Comparisons
    shared_keys = set(r_epac.keys()).intersection(set(r_sklearn.keys()))
    comp = {k: np.all(np.asarray(r_epac[k]) == np.asarray(r_sklearn[k]))
            for k in shared_keys}
    print("comp=" + repr(comp))
    for key in comp:
        self.assertTrue(comp[key], u'Diff for attribute: "%s"' % key)
Xy = np.load(datasets_filepath)
X = Xy["X"]
y = Xy["y"]

from sklearn.svm import LinearSVC as SVM
from sklearn.feature_selection import SelectKBest
from sklearn import preprocessing

##############################################################################
## Pipeline, "Pipe": SelectKBest + StandardScaler + SVM l1 vs l2
from epac import Pipe, CV
n_folds = 10

anova_svm = Pipe(SelectKBest(k=5),
                 preprocessing.StandardScaler(),
                 SVM(class_weight='auto'))
cv = CV(anova_svm, n_folds=n_folds)
cv.run(X=X, y=y)
res_cv_anova_svm = cv.reduce()
res_cv_anova_svm["SelectKBest/StandardScaler/LinearSVC"]['y/test/score_recall']

##############################################################################
## Multimethods, "Methods": SVM l1 vs l2
from epac import Methods, CV
svms = Methods(SVM(penalty="l1", class_weight='auto', dual=False),
               SVM(penalty="l2", class_weight='auto', dual=False))
cv = CV(svms, n_folds=n_folds)
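# The Methods section above stops after building the CV; a minimal sketch of
# the remaining steps (same run/reduce API as the Pipe example above; the
# result name is an assumed one):
cv.run(X=X, y=y)
res_cv_svms = cv.reduce()  # hypothetical name for the reduced results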
""" from sklearn import datasets from sklearn.svm import LinearSVC as SVM from sklearn.lda import LDA from sklearn.feature_selection import SelectKBest X, y = datasets.make_classification(n_samples=12, n_features=10, n_informative=2, random_state=1) # Build sequential Pipeline # ------------------------- # 2 SelectKBest (Estimator) # | # SVM Classifier (Estimator) from epac import Pipe pipe = Pipe(SelectKBest(k=2), SVM()) pipe.run(X=X, y=y) # The downstream data-flow is a keyword arguments (dict) containing X and y. # It will pass through each processing node, SelectKBest(k=2) and SVM. # Each node calls the "transform" method, that take a dictionary as input # and produces a dictionary as output. The output is passed to the next node. # The return value of the run is simply agregation of the outputs (dict) of # the leaf nodes for leaf in pipe.walk_leaves(): print leaf.load_results() # The result of each branch of the tree is stored in the corresponding leaf. # An iteration on all the leaves of a tree can return all the results
""" from sklearn import datasets from sklearn.svm import LinearSVC as SVM from sklearn.lda import LDA from sklearn.feature_selection import SelectKBest X, y = datasets.make_classification(n_samples=12, n_features=10, n_informative=2, random_state=1) # Build sequential Pipeline # ------------------------- # 2 SelectKBest (Estimator) # | # SVM Classifier (Estimator) from epac import Pipe pipe = Pipe(SelectKBest(k=2), SVM()) pipe.run(X=X, y=y) # The downstream data-flow is a keyword arguments (dict) containing X and y. # It will pass through each processing node, SelectKBest(k=2) and SVM. # Each node call the "transform" method, that take a dictionnary as input # and produces a dictionnary as output. The output is passed to the next node. # The return value of the run is simply agregation of the outputs (dict) of # the leaf nodes ## Parallelization ## =============== # Multi-classifiers # -----------------
from sklearn import datasets
from sklearn.svm import LinearSVC as SVM
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SelectKBest

X, y = datasets.make_classification(n_samples=12, n_features=10,
                                    n_informative=2, random_state=1)

# Build sequential Pipeline
# -------------------------
# 2    SelectKBest (Estimator)
# |
# SVM  Classifier (Estimator)
from epac import Pipe
pipe = Pipe(SelectKBest(k=2), SVM())
pipe.run(X=X, y=y)

# The downstream data-flow is a set of keyword arguments (a dict) containing
# X and y. It passes through each processing node, SelectKBest(k=2) then SVM.
# Each node calls its "transform" method, which takes a dictionary as input
# and produces a dictionary as output. The output is passed to the next node.
# The return value of run() is simply the aggregation of the outputs (dicts)
# of the leaf nodes.

for leaf in pipe.walk_leaves():
    print(leaf.load_results())

# The result of each branch of the tree is stored in the corresponding leaf.
# Iterating over all the leaves of a tree thus returns all the results.
Xd.PAS2gr[Xd.PAS2gr == 1] = -1
Xd.PAS2gr[Xd.PAS2gr == 2] = 1
Xd.CB_EXPO[Xd.CB_EXPO == 0] = -1
X = np.asarray(Xd)
y = np.asarray(yd)

#k_values = [1, 2, 3, 4, 5, 10, 15, 20, 25, 27]
C_values = [0.01, 0.05, .1, .5, 1, 5, 10, 100, 1000]

# ANOVA + SVM L1
# ==============
anova_svms = Pipe(mylib.SelectPvalue(alpha=1e-1),  # P < 0.1
                  Methods(*[SVM(C=C, penalty="l1", class_weight='auto',
                                dual=False) for C in C_values]))
cv = CV(anova_svms, cv_type="stratified", n_folds=10)
a = cv.run(X=X, y=y)
cv_results = cv.reduce()
print cv_results
epac.export_csv(cv, cv_results,
                os.path.join(WD, "results",
                             "cv10_caarms+pas+canabis_anova(p<0.05)_svmsl1.csv"))
# recall_mean: 67%
@author: ed203246
"""
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.lda import LDA
from sklearn.feature_selection import SelectKBest

X, y = datasets.make_classification(n_samples=12, n_features=10,
                                    n_informative=2)

from epac import Methods, Pipe
self = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel=kernel, C=C))
                 for kernel in ("linear", "rbf")
                 for C in [1, 10] for k in [1, 2]])
self = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C))
                 for C in [1, 10] for k in [1, 2]])
self.fit_predict(X=X, y=y)
self.reduce()
[l.get_key() for l in self.walk_nodes()]
[l.get_key(2) for l in self.walk_nodes()]  # intermediary key collisions: trigger aggregation

# Model selection using CV: CV + Grid
# -----------------------------------------
from epac import CVBestSearchRefit
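# A minimal sketch for this section (CVBestSearchRefit usage as in the other
# examples of this repository; best_svc is an assumed name): an inner CV
# selects the best (k, C) pair, which is then refitted on the whole
# training set.
best_svc = CVBestSearchRefit(
    Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
              for C in [1, 10] for k in [1, 2]]),
    n_folds=3)
best_svc.run(X=X, y=y)
print(best_svc.reduce())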