def get_workflow(self, n_features=int(1E03)):
    random_state = 0
    C_values = [1, 10]
    k_values = 0
    k_max = "auto"
    n_folds_nested = 5
    n_folds = 10
    n_perms = 10
    if k_max != "auto":
        k_values = range_log2(np.minimum(int(k_max), n_features), add_n=True)
    else:
        k_values = range_log2(n_features, add_n=True)
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls, n_folds=n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=n_folds), n_perms=n_perms,
               permute="y", random_state=random_state)
    return wf
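# Hedged usage sketch (not part of the original): how the workflow returned by
# get_workflow() could be exercised on synthetic data, following the run() /
# reduce() calls used in the local-machine example further down. Dataset sizes
# are illustrative only.
#
#     X, y = datasets.make_classification(n_samples=100,
#                                         n_features=int(1E03),
#                                         n_informative=2)
#     wf = self.get_workflow(n_features=X.shape[1])
#     wf.run(X=X, y=y)
#     print wf.reduce()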
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                         n_features=options.n_features,
                                         n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls, n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms, permute="y",
               random_state=random_state)
    print "Time elapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on local machine
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    ## Run on cluster
    # sfw_engine = SomaWorkflowEngine(
    #     tree_root=wf,
    #     num_processes=options.n_cores,
    #     resource_id="jl237561@gabriel",
    #     login="******")
    # You can use soma_workflow_gui to track your progress
    wf = sfw_engine.run(X=X, y=y)
    print "Time elapsed, fit predict:", time.time() - time_fit_predict
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print wf.reduce()
    print "Time elapsed, reduce:", time.time() - time_reduce
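# Minimal driver sketch (an assumption, not the original script's entry point):
# the attribute names match what do_all() reads above; the use of argparse and
# the default values are illustrative choices only.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_samples", type=int, default=100)
    parser.add_argument("--n_features", type=int, default=int(1E03))
    parser.add_argument("--n_informative", type=int, default=2)
    parser.add_argument("--k_max", default="auto")
    parser.add_argument("--n_folds", type=int, default=10)
    parser.add_argument("--n_folds_nested", type=int, default=5)
    parser.add_argument("--n_perms", type=int, default=10)
    parser.add_argument("--n_cores", type=int, default=2)
    parser.add_argument("--trace", action="store_true", default=False)
    do_all(parser.parse_args())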
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    # print options
    # sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                         n_features=options.n_features,
                                         n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls, n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms, permute="y",
               random_state=random_state)
    print "Time elapsed, tree construction:", time.time() - time_start

    ## 3) Export Workflow to soma_workflow_gui
    ## =======================================
    time_fit_predict = time.time()
    if os.path.isdir(options.soma_workflow_dir):
        shutil.rmtree(options.soma_workflow_dir)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    sfw_engine.export_to_gui(options.soma_workflow_dir, X=X, y=y)
    print "Time elapsed, fit predict:", time.time() - time_fit_predict

    ## 4) Load Epac tree & Reduce
    ## ==========================
    reduce_filename = os.path.join(options.soma_workflow_dir, "reduce.py")
    f = open(reduce_filename, "w")
    reduce_str = """from epac.map_reduce.engine import SomaWorkflowEngine
wf = SomaWorkflowEngine.load_from_gui("%s")
print wf.reduce()
""" % options.soma_workflow_dir
    f.write(reduce_str)
    f.close()
    print "#First run\n" \
          "soma_workflow_gui\n" \
          "\t(1)Open %s\n" \
          "\t(2)Submit\n" \
          "\t(3)Transfer Input Files\n" \
          "\t...wait...\n" \
          "\t(4)Transfer Output Files\n" \
          "#When done run:\npython %s" % (
              os.path.join(options.soma_workflow_dir,
                           sfw_engine.open_me_by_soma_workflow_gui),
              reduce_filename)
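# Hedged follow-up sketch (not from the original): this variant additionally
# requires options.soma_workflow_dir. Once the jobs have finished in
# soma_workflow_gui, the generated reduce.py (see reduce_str above) gathers
# the results; the directory below is a placeholder.
#
#     from epac.map_reduce.engine import SomaWorkflowEngine
#     wf = SomaWorkflowEngine.load_from_gui("/tmp/epac_workflow")
#     print wf.reduce()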
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                         n_features=options.n_features,
                                         n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C))
                    for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls, n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms, permute="y",
               random_state=random_state)
    print "Time elapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    wf.run(X=X, y=y)
    print "Time elapsed, fit predict:", time.time() - time_fit_predict
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print wf.reduce()
    print "Time elapsed, reduce:", time.time() - time_reduce
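# Hedged sketch (not from the original) of inspecting the reduced results:
# wf.reduce() returns a result collection that can be printed or indexed by
# node name and metric key, as in the SVM example at the end of this section;
# the exact keys depend on the workflow tree.
#
#     res = wf.reduce()
#     print res
#     # e.g. res['CVBestSearchRefit']['y/test/score_recall']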
n_features = int(1E03)
X, y = datasets.make_classification(n_samples=100,
                                    n_features=n_features,
                                    n_informative=2)
random_state = 0
C_values = [1, 10]
k_values = 0
k_max = "auto"
n_folds_nested = 5
n_folds = 10
n_perms = 10
if k_max != "auto":
    k_values = range_log2(np.minimum(int(k_max), n_features), add_n=True)
else:
    k_values = range_log2(n_features, add_n=True)
cls = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                for C in C_values for k in k_values])
pipeline = CVBestSearchRefitParallel(cls, n_folds=n_folds_nested,
                                     random_state=random_state)
wf = Perms(CV(pipeline, n_folds=n_folds), n_perms=n_perms,
           permute="y", random_state=random_state)
# wf.run(X=X, y=y)
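# Hedged follow-up (not from the original): the commented-out call above runs
# the tree sequentially; to parallelize on the local machine, the same engine
# used in the earlier examples can wrap the tree (num_processes is illustrative).
#
#     from epac.map_reduce.engine import SomaWorkflowEngine
#     engine = SomaWorkflowEngine(tree_root=wf, num_processes=2)
#     wf = engine.run(X=X, y=y)
#     print wf.reduce()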
# Reduce the cross-validated results of the auto-tuned SVMs
res_cv_svms_auto = cv.reduce()  # assumes cv.run(X=X, y=y) was executed above
print res_cv_svms_auto
print res_cv_svms_auto["CVBestSearchRefit"]['y/test/score_recall']

# Re-fit on all data. Warning: biased !!!
svms_auto.run(X=X, y=y)
print svms_auto.best_params
print svms_auto.refited.estimator.coef_

##############################################################################
# Put everything together
# Pipeline, "Pipe": SelectKBest + StandardScaler + SVM l1 vs l2
from epac import range_log2
from epac import CVBestSearchRefit, Pipe, Methods, CV

# SVM is assumed to be a linear SVM class accepting penalty= and dual=
# (e.g. an alias of sklearn's LinearSVC defined earlier in the tutorial).
k_values = range_log2(X.shape[1], add_n=True)
C_values = [.1, 1, 10, 100]
anova_svms = Methods(*[Pipe(SelectKBest(k=k),
                            preprocessing.StandardScaler(),
                            Methods(*[SVM(C=C, penalty=penalty,
                                          class_weight='auto', dual=False)
                                      for C in C_values
                                      for penalty in ['l1', 'l2']]))
                       for k in k_values])
# Take a look at the leaves of the tree
print [l for l in anova_svms.walk_leaves()]

## k and C selection based on CV
anova_svms_auto = CVBestSearchRefit(anova_svms)
#anova_svm_all = Methods(anova_svm, anova_svm_cv)
cv = CV(anova_svms_auto, n_folds=n_folds)
time_fit_predict = time.time()
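# Hedged continuation (the snippet above stops right after starting the
# timer): running the outer cross-validation and reducing, following the same
# run()/reduce() pattern used throughout this section.
#
#     cv.run(X=X, y=y)
#     print "Time elapsed, fit predict:", time.time() - time_fit_predict
#     res_cv_anova_svms_auto = cv.reduce()
#     print res_cv_anova_svms_auto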