def do_all(options):
    """Build, run and reduce a permutation / CV / grid-search workflow.

    A synthetic classification dataset is generated, then a workflow made
    of permutations wrapping a cross-validation wrapping a nested
    ``CVBestSearchRefit`` over ``SelectKBest`` + linear ``SVC`` pipelines
    is built, executed through ``SomaWorkflowEngine`` and reduced.  The
    time spent in each stage and the reduced results are printed.

    Parameters
    ----------
    options : object
        Attribute container (presumably parsed command-line options --
        TODO confirm against the caller).  The attributes read are
        ``k_max`` (``"auto"`` or a value accepted by ``int``),
        ``n_features``, ``n_informative``, ``n_samples``, ``n_folds``,
        ``n_folds_nested``, ``n_perms``, ``n_cores`` and ``trace``.
    """
    # With k_max == "auto" the grid of k goes up to the number of features,
    # otherwise it is capped by k_max.
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k),
                         SVC(kernel="linear", C=C))
                    for C in C_values
                    for k in k_values])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    # Single-argument print(...) produces the same output whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("Time ellapsed, tree construction: %s"
          % (time.time() - time_start))

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on local machine.  To run on a cluster instead, additionally
    ## pass the ``resource_id`` and ``login`` keyword arguments.
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    # You can use soma_workflow_gui to track your progress
    wf = sfw_engine.run(X=X, y=y)
    print("Time ellapsed, fit predict: %s"
          % (time.time() - time_fit_predict))
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print(wf.reduce())
    print("Time ellapsed, reduce: %s" % (time.time() - time_reduce))
def do_all(options):
    """Time the construction, execution and reduction of an EPAC workflow.

    The workflow is ``Perms(CV(CVBestSearchRefit(Methods(...))))`` where
    the methods are ``SelectKBest`` + linear ``SVC`` pipelines over a grid
    of ``k`` and ``C`` values.  It is fitted on a dataset produced by
    ``datasets.make_classification`` and executed with
    ``SomaWorkflowEngine``.  Timings and the reduced results are printed.

    Parameters
    ----------
    options : object
        Object exposing the attributes ``k_max`` (``"auto"`` or something
        ``int`` accepts), ``n_features``, ``n_informative``, ``n_samples``,
        ``n_folds``, ``n_folds_nested``, ``n_perms``, ``n_cores`` and
        ``trace``.  Presumably an option-parser result -- TODO confirm.
    """
    if options.k_max != "auto":
        # Never ask for more features than the dataset contains.
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
        for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    # The messages are formatted into one string so that print(...) gives
    # the same text under both the Python 2 statement and the Python 3
    # function.
    elapsed = time.time() - time_start
    print("Time ellapsed, tree construction: %s" % elapsed)

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on local machine.  A cluster run additionally needs the
    ## ``resource_id`` and ``login`` keyword arguments.
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    wf = sfw_engine.run(X=X, y=y)
    elapsed = time.time() - time_fit_predict
    print("Time ellapsed, fit predict: %s" % elapsed)
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print(wf.reduce())
    elapsed = time.time() - time_reduce
    print("Time ellapsed, reduce: %s" % elapsed)
def do_all(options):
    """Build an EPAC workflow, run it in-process and print its reduction.

    The workflow is permutations of a cross-validation of a nested
    ``CVBestSearchRefit`` over ``SelectKBest`` + linear ``SVC`` pipelines.
    It is run directly with ``wf.run`` on a dataset generated by
    ``datasets.make_classification``.  The time spent building, running
    and reducing the workflow is printed, together with the reduced
    results.

    Parameters
    ----------
    options : object
        Attribute container (presumably parsed command-line options --
        TODO confirm).  The attributes read are ``k_max`` (``"auto"`` or a
        value accepted by ``int``), ``n_features``, ``n_informative``,
        ``n_samples``, ``n_folds``, ``n_folds_nested``, ``n_perms`` and
        ``trace``.
    """
    # Grid of k: up to every feature when k_max is "auto", otherwise capped
    # by k_max.
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k),
                         SVC(kernel="linear", C=C))
                    for C in C_values
                    for k in k_values])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    # print(<one string>) behaves the same as a Python 2 statement and as
    # the Python 3 function.
    print("Time ellapsed, tree construction: %s"
          % (time.time() - time_start))

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    wf.run(X=X, y=y)
    print("Time ellapsed, fit predict: %s"
          % (time.time() - time_fit_predict))
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print(wf.reduce())
    print("Time ellapsed, reduce: %s" % (time.time() - time_reduce))
def test_perm(self):
    """Compare ``Perms`` predictions with a hand-written scikit-learn loop.

    A linear ``SVC`` is fitted on permuted labels, once through the EPAC
    ``Perms`` workflow and once manually for each permutation yielded by
    ``Permutations``.  The predictions must be equal, both in the value
    returned by ``top_down`` and in the value returned by ``reduce``.
    """
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_perms = 2
    rnd = 0
    # = With EPAC
    wf = Perms(SVC(kernel="linear"), n_perms=n_perms, permute="y",
               random_state=rnd, reducer=None)
    r_epac = wf.top_down(X=X, y=y)
    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for perm in Permutations(n=y.shape[0], n_perms=n_perms,
                             random_state=rnd):
        # Index along the first axis only: the label vector is
        # one-dimensional, so ``y[perm, :]`` raises IndexError.
        y_p = y[perm]
        clf.fit(X, y_p)
        r_sklearn.append(clf.predict(X))
    key2cmp = "y" + conf.SEP + conf.PREDICTION
    # = Comparison
    for iperm in range(n_perms):
        comp = np.all(np.asarray(r_epac[iperm][key2cmp]) ==
                      np.asarray(r_sklearn[iperm]))
        self.assertTrue(comp, u"Diff Perm: EPAC vs sklearn")
    # test reduce
    for iperm in range(n_perms):
        # list(...) keeps the positional lookup working when ``values()``
        # returns a view rather than a list.
        r_epac_reduce = list(wf.reduce().values())[iperm][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) ==
                      np.asarray(r_sklearn[iperm]))
        self.assertTrue(comp, u"Diff Perm: EPAC reduce")
def test_perm(self):
    """Check EPAC ``Perms`` against an explicit permutation loop.

    For every permutation produced by ``Permutations`` a linear ``SVC`` is
    fitted on the permuted labels and its predictions on ``X`` are stored.
    They must equal the predictions found in the ``top_down`` result of
    the ``Perms`` workflow, and those found in its ``reduce`` result.
    """
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_perms = 2
    rnd = 0
    # = With EPAC
    wf = Perms(SVC(kernel="linear"), n_perms=n_perms, permute="y",
               random_state=rnd, reducer=None)
    r_epac = wf.top_down(X=X, y=y)
    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    permutations = Permutations(n=y.shape[0], n_perms=n_perms,
                                random_state=rnd)
    for perm in permutations:
        # ``y`` is a 1-D label vector: a trailing ``, :`` index would raise
        # IndexError, so only the row index is used.
        y_p = y[perm]
        clf.fit(X, y_p)
        r_sklearn.append(clf.predict(X))
    key2cmp = 'y' + conf.SEP + conf.PREDICTION
    # = Comparison
    for iperm in range(n_perms):
        expected = np.asarray(r_sklearn[iperm])
        comp = np.all(np.asarray(r_epac[iperm][key2cmp]) == expected)
        self.assertTrue(comp, u'Diff Perm: EPAC vs sklearn')
    # test reduce
    for iperm in range(n_perms):
        # Materialise the values so that they can be indexed by position
        # even when ``values()`` returns a view.
        r_epac_reduce = list(wf.reduce().values())[iperm][key2cmp]
        expected = np.asarray(r_sklearn[iperm])
        comp = np.all(np.asarray(r_epac_reduce) == expected)
        self.assertTrue(comp, u'Diff Perm: EPAC reduce')
def test_perm_cv(self):
    """Compare ``Perms(CV(SVC))`` with nested scikit-learn loops.

    The EPAC workflow is run on a small generated dataset.  The same
    permutations (``Permutations``) and folds (``StratifiedKFold``) are
    then replayed by hand with a linear ``SVC``.  Test-set predictions
    must be equal for every permutation and fold, both in the ``run``
    result and in the ``reduce`` result.
    """
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_perms = 3
    n_folds = 2
    rnd = 0
    # = With EPAC
    wf = Perms(CV(SVC(kernel="linear"), n_folds=n_folds,
                  reducer=ClassificationReport(keep=True)),
               n_perms=n_perms, permute="y",
               random_state=rnd, reducer=None)
    r_epac = wf.run(X=X, y=y)
    # = With SKLEARN
    # NOTE(review): ``sklearn.cross_validation`` only exists in old
    # scikit-learn releases; kept as is because the replacement API has a
    # different call signature.
    from sklearn.cross_validation import StratifiedKFold
    clf = SVC(kernel="linear")
    r_sklearn = [[None] * n_folds for _ in range(n_perms)]
    perms = Permutations(n=y.shape[0], n_perms=n_perms, random_state=rnd)
    for perm_nb, perm in enumerate(perms):
        y_p = y[perm]
        folds = StratifiedKFold(y=y_p, n_folds=n_folds)
        for fold_nb, (idx_train, idx_test) in enumerate(folds):
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            # The labels are one-dimensional: indexing them with
            # ``[idx_train, :]`` raises IndexError.
            y_p_train = y_p[idx_train]
            clf.fit(X_train, y_p_train)
            r_sklearn[perm_nb][fold_nb] = clf.predict(X_test)
    cmp_key = 'y' + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
    # Comparison
    for iperm in range(n_perms):
        for icv in range(n_folds):
            comp = np.all(
                np.asarray(r_epac[iperm][icv][cmp_key]) ==
                np.asarray(r_sklearn[iperm][icv]))
            self.assertTrue(comp, u'Diff Perm / CV: EPAC vs sklearn')
    # test reduce
    for iperm in range(n_perms):
        for icv in range(n_folds):
            # list(...) allows positional access when ``values()`` returns
            # a view rather than a list.
            reduced = list(wf.reduce().values())[iperm][cmp_key][icv]
            comp = np.all(
                np.asarray(reduced) == np.asarray(r_sklearn[iperm][icv]))
            self.assertTrue(comp, u'Diff Perm / CV: EPAC reduce')
def test_perm_cv(self):
    """Check permutations of a cross-validated ``SVC`` against sklearn.

    ``Perms(CV(SVC(kernel="linear")))`` is run with EPAC.  The reference
    predictions are obtained by looping over ``Permutations`` and, for
    each permuted label vector, over ``StratifiedKFold``.  The test-set
    predictions stored by EPAC (from ``run`` and from ``reduce``) must
    equal the reference ones.
    """
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_perms = 3
    n_folds = 2
    rnd = 0
    # = With EPAC
    wf = Perms(CV(SVC(kernel="linear"), n_folds=n_folds,
                  reducer=ClassificationReport(keep=True)),
               n_perms=n_perms, permute="y",
               random_state=rnd, reducer=None)
    r_epac = wf.run(X=X, y=y)
    # = With SKLEARN
    # NOTE(review): legacy scikit-learn module; the newer cross-validation
    # API takes different arguments, so the import is left untouched.
    from sklearn.cross_validation import StratifiedKFold
    clf = SVC(kernel="linear")
    r_sklearn = [[None] * n_folds for _ in range(n_perms)]
    for perm_nb, perm in enumerate(Permutations(n=y.shape[0],
                                                n_perms=n_perms,
                                                random_state=rnd)):
        y_p = y[perm]
        for fold_nb, (idx_train, idx_test) in enumerate(
                StratifiedKFold(y=y_p, n_folds=n_folds)):
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            # Row index only: a 1-D label vector cannot take ``[idx, :]``
            # (IndexError).
            y_p_train = y_p[idx_train]
            clf.fit(X_train, y_p_train)
            r_sklearn[perm_nb][fold_nb] = clf.predict(X_test)
    cmp_key = 'y' + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
    # Comparison
    for iperm in range(n_perms):
        for icv in range(n_folds):
            comp = np.all(
                np.asarray(r_epac[iperm][icv][cmp_key])
                == np.asarray(r_sklearn[iperm][icv])
            )
            self.assertTrue(comp, u'Diff Perm / CV: EPAC vs sklearn')
    # test reduce
    for iperm in range(n_perms):
        for icv in range(n_folds):
            # Convert to a list so the values can be indexed by position
            # even when ``values()`` returns a view.
            comp = np.all(
                np.asarray(list(wf.reduce().values())[iperm][cmp_key][icv])
                == np.asarray(r_sklearn[iperm][icv])
            )
            self.assertTrue(comp, u'Diff Perm / CV: EPAC reduce')
def todo_perm_cv_grid_vs_sklearn(self):
    """Compare an EPAC permutation / CV / grid search with scikit-learn.

    The EPAC side builds ``Perms(CV(CVBestSearchRefit(Methods(...))))``
    over ``SelectKBest`` + linear ``SVC`` pipelines and reduces it.  The
    scikit-learn side replays the permutations and folds by hand and runs
    ``GridSearchCV`` with a nested ``StratifiedKFold``.  Every result key
    present on both sides must hold equal values.

    NOTE(review): the ``todo_`` prefix presumably keeps this method out of
    automatic test collection -- confirm before renaming.
    """
    X, y = datasets.make_classification(n_samples=100, n_features=500,
                                        n_informative=5)
    n_perms = 3
    n_folds = 2
    n_folds_nested = 2
    random_state = 0
    k_values = [2, 3]
    C_values = [1, 10]
    # = With EPAC
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               SVC(C=C, kernel="linear"))
                          for C in C_values
                          for k in k_values])
    pipelines_cv = CVBestSearchRefit(pipelines,
                                     n_folds=n_folds_nested,
                                     random_state=random_state)
    wf = Perms(CV(pipelines_cv, n_folds=n_folds,
                  reducer=ClassificationReport(keep=True)),
               n_perms=n_perms, permute="y",
               reducer=PvalPerms(keep=True),
               random_state=random_state)
    wf.fit_predict(X=X, y=y)
    # list(...) allows positional access when ``values()`` returns a view.
    r_epac = list(wf.reduce().values())[0]
    for key in r_epac:
        print("key=" + repr(key) + ", value=" + repr(r_epac[key]))
    # = With SKLEARN
    from sklearn.cross_validation import StratifiedKFold
    from epac.sklearn_plugins import Permutations
    from sklearn.pipeline import Pipeline
    from sklearn import grid_search
    clf = Pipeline([('anova', SelectKBest(k=3)),
                    ('svm', SVC(kernel="linear"))])
    parameters = {'anova__k': k_values, 'svm__C': C_values}
    r_sklearn = dict()
    r_sklearn['pred_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['true_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['score_tr'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['score_te'] = [[None] * n_folds for i in range(n_perms)]
    r_sklearn['mean_score_te'] = [None] * n_perms
    r_sklearn['mean_score_tr'] = [None] * n_perms
    perms = Permutations(n=y.shape[0], n_perms=n_perms,
                         random_state=random_state)
    for perm_nb, idx in enumerate(perms):
        y_p = y[idx]
        cv = StratifiedKFold(y=y_p, n_folds=n_folds)
        for fold_nb, (idx_train, idx_test) in enumerate(cv):
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            # The labels are one-dimensional: ``y_p[idx, :]`` raises
            # IndexError, so only the row index is used.
            y_p_train = y_p[idx_train]
            y_p_test = y_p[idx_test]
            # Nested CV
            cv_nested = StratifiedKFold(y=y_p_train,
                                        n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
            gscv.fit(X_train, y_p_train)
            r_sklearn['pred_te'][perm_nb][fold_nb] = gscv.predict(X_test)
            r_sklearn['true_te'][perm_nb][fold_nb] = y_p_test
            r_sklearn['score_tr'][perm_nb][fold_nb] = \
                gscv.score(X_train, y_p_train)
            r_sklearn['score_te'][perm_nb][fold_nb] = \
                gscv.score(X_test, y_p_test)
        # Average over folds
        r_sklearn['mean_score_te'][perm_nb] = \
            np.mean(np.asarray(r_sklearn['score_te'][perm_nb]), axis=0)
        r_sklearn['mean_score_tr'][perm_nb] = \
            np.mean(np.asarray(r_sklearn['score_tr'][perm_nb]), axis=0)
    print(repr(r_sklearn))
    # - Comparisons
    shared_keys = set(r_epac.keys()).intersection(set(r_sklearn.keys()))
    comp = {k: np.all(np.asarray(r_epac[k]) == np.asarray(r_sklearn[k]))
            for k in shared_keys}
    print("comp=" + repr(comp))
    for key in comp:
        self.assertTrue(comp[key], u'Diff for attribute: "%s"' % key)
# -*- coding: utf-8 -*- """ Created on Thu May 23 15:21:35 2013 @author: ed203246 """ from sklearn import datasets from sklearn.svm import LinearSVC as SVM from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA from sklearn.feature_selection import SelectKBest from epac.map_reduce.reducers import PvalPerms import numpy X, y = datasets.make_classification(n_samples=100, n_features=200, n_informative=2) X = numpy.random.rand(*X.shape) from epac import Perms, CV, Methods perms_cv_svm = Perms(CV(Methods(SVM(loss="l1"), SVM(loss="l2"))), n_perms=100) perms_cv_svm.run(X=X, y=y) perms_cv_svm.reduce() self = perms_cv_svm key = 'LinearSVC(loss=l1)' self = PvalPerms()
n_perms=n_perms, permute="y", random_state=random_state) # wf.run(X=X, y=y) # for leaf in wf.walk_leaves(): # print leaf.load_results() # wf.reduce() # from epac.map_reduce.engine import LocalEngine # local_engine = LocalEngine(tree_root=wf, num_processes=2) # wf = local_engine.run(X=X, y=y) # for leaf in wf.walk_leaves(): # print leaf.load_results() # wf.reduce() from epac.map_reduce.engine import SomaWorkflowEngine sfw_engine = SomaWorkflowEngine( tree_root=wf, num_processes=3, remove_finished_wf=False, remove_local_tree=False) wf = sfw_engine.run(X=X, y=y) #for leaf in wf.walk_leaves(): # print leaf.load_results() #for node in wf.walk_true_nodes(): # print node # print node.load_results() print wf.reduce()
def todo_perm_cv_grid_vs_sklearn(self):
    """Check EPAC's permuted, cross-validated grid search against sklearn.

    EPAC side: ``Perms(CV(CVBestSearchRefit(Methods(...))))`` built from
    ``SelectKBest`` + linear ``SVC`` pipelines, fitted and reduced.
    scikit-learn side: for every permutation and every outer
    ``StratifiedKFold`` fold, a ``GridSearchCV`` with a nested
    ``StratifiedKFold`` is fitted; predictions and scores are collected.
    Values stored under keys common to both results must be equal.

    NOTE(review): the ``todo_`` prefix presumably prevents this method
    from being collected as a test -- confirm before renaming.
    """
    X, y = datasets.make_classification(n_samples=100, n_features=500,
                                        n_informative=5)
    n_perms = 3
    n_folds = 2
    n_folds_nested = 2
    random_state = 0
    k_values = [2, 3]
    C_values = [1, 10]
    # = With EPAC
    pipelines = Methods(*[Pipe(SelectKBest(k=k),
                               SVC(C=C, kernel="linear"))
                          for C in C_values
                          for k in k_values])
    pipelines_cv = CVBestSearchRefit(pipelines,
                                     n_folds=n_folds_nested,
                                     random_state=random_state)
    wf = Perms(CV(pipelines_cv, n_folds=n_folds,
                  reducer=ClassificationReport(keep=True)),
               n_perms=n_perms, permute="y",
               reducer=PvalPerms(keep=True),
               random_state=random_state)
    wf.fit_predict(X=X, y=y)
    # Materialise the values so the first one can be taken by position even
    # when ``values()`` returns a view.
    r_epac = list(wf.reduce().values())[0]
    for key in r_epac:
        # Single-argument print(...) gives the same output as a Python 2
        # statement and as the Python 3 function.
        print("key=" + repr(key) + ", value=" + repr(r_epac[key]))
    # = With SKLEARN
    from sklearn.cross_validation import StratifiedKFold
    from epac.sklearn_plugins import Permutations
    from sklearn.pipeline import Pipeline
    from sklearn import grid_search
    clf = Pipeline([('anova', SelectKBest(k=3)),
                    ('svm', SVC(kernel="linear"))])
    parameters = {'anova__k': k_values, 'svm__C': C_values}
    r_sklearn = dict()
    # One slot per (permutation, fold) for the per-fold results.
    for name in ('pred_te', 'true_te', 'score_tr', 'score_te'):
        r_sklearn[name] = [[None] * n_folds for _ in range(n_perms)]
    r_sklearn['mean_score_te'] = [None] * n_perms
    r_sklearn['mean_score_tr'] = [None] * n_perms
    perms = Permutations(n=y.shape[0], n_perms=n_perms,
                         random_state=random_state)
    for perm_nb, idx in enumerate(perms):
        y_p = y[idx]
        cv = StratifiedKFold(y=y_p, n_folds=n_folds)
        for fold_nb, (idx_train, idx_test) in enumerate(cv):
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            # Row index only: the label vector is one-dimensional and
            # ``y_p[idx, :]`` raises IndexError.
            y_p_train = y_p[idx_train]
            y_p_test = y_p[idx_test]
            # Nested CV
            cv_nested = StratifiedKFold(y=y_p_train,
                                        n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
            gscv.fit(X_train, y_p_train)
            r_sklearn['pred_te'][perm_nb][fold_nb] = gscv.predict(X_test)
            r_sklearn['true_te'][perm_nb][fold_nb] = y_p_test
            r_sklearn['score_tr'][perm_nb][fold_nb] = \
                gscv.score(X_train, y_p_train)
            r_sklearn['score_te'][perm_nb][fold_nb] = \
                gscv.score(X_test, y_p_test)
        # Average over folds
        r_sklearn['mean_score_te'][perm_nb] = \
            np.mean(np.asarray(r_sklearn['score_te'][perm_nb]), axis=0)
        r_sklearn['mean_score_tr'][perm_nb] = \
            np.mean(np.asarray(r_sklearn['score_tr'][perm_nb]), axis=0)
    print(repr(r_sklearn))
    # - Comparisons
    shared_keys = set(r_epac.keys()).intersection(set(r_sklearn.keys()))
    comp = {k: np.all(np.asarray(r_epac[k]) == np.asarray(r_sklearn[k]))
            for k in shared_keys}
    print("comp=" + repr(comp))
    for key in comp:
        self.assertTrue(comp[key], u'Diff for attribute: "%s"' % key)
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 15:21:35 2013

@author: ed203246
"""
# Script: runs 100 permutations of a cross-validation over two LinearSVC
# variants (loss="l1" and loss="l2") and reduces the results.
import numpy  # used below for numpy.random.rand; was missing (NameError)
from sklearn import datasets
from sklearn.svm import LinearSVC as SVM
try:
    from sklearn.discriminant_analysis import (
        LinearDiscriminantAnalysis as LDA)
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn.lda import LDA
from sklearn.feature_selection import SelectKBest

from epac import Perms, CV, Methods
# PvalPerms is instantiated at the end of the script; was missing (NameError)
from epac.map_reduce.reducers import PvalPerms

X, y = datasets.make_classification(n_samples=100, n_features=200,
                                    n_informative=2)
# The generated features are discarded and replaced by random values of the
# same shape; only the labels ``y`` of the generated dataset are kept.
X = numpy.random.rand(*X.shape)

perms_cv_svm = Perms(CV(Methods(SVM(loss="l1"), SVM(loss="l2"))), n_perms=100)
perms_cv_svm.run(X=X, y=y)
perms_cv_svm.reduce()

# NOTE(review): the names below look like leftovers from stepping through a
# reducer interactively (``self`` is rebound twice and ``key`` is not used
# afterwards) -- confirm before removing.
self = perms_cv_svm
key = "LinearSVC(loss=l1)"
self = PvalPerms()