def func_memm_local():
    """Run a 3-fold CV over a linear and an RBF SVC on converted data.

    The generated dataset is passed through ``convert2memmap`` before the
    workflow runs.  Progress markers ("memm_local pt1" .. "memm_local pt5")
    are printed between the steps and the reduced result of the workflow is
    printed before the last marker.  Nothing is returned.
    """
    # Single-argument ``print(...)`` produces the same output on Python 2 and
    # Python 3, whereas the former print statements only parsed on Python 2.
    print("memm_local pt1")
    ## 1) Build a dataset and convert to np.memmap (for big matrix)
    ## ============================================================
    X, y = datasets.make_classification(n_samples=500,
                                        n_features=5000,
                                        n_informative=2,
                                        random_state=1)
    print("memm_local pt2")
    X = convert2memmap(X)
    y = convert2memmap(y)
    Xy = dict(X=X, y=y)
    ## 2) Build the workflow: CV over two SVC kernels
    ## =======================================================
    print("memm_local pt3")
    from sklearn.svm import SVC
    from epac import CV, Methods
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    print("memm_local pt4")
    # Multi-process alternative kept by the original author for reference:
    #    from epac import LocalEngine
    #    local_engine = LocalEngine(cv_svm_local, num_processes=2)
    #    cv_svm = local_engine.run(**Xy)
    cv_svm_local.run(**Xy)
    print(cv_svm_local.reduce())
    print("memm_local pt5")
def func_memm_local():
    """Cross-validate two SVC kernels on data passed through convert2memmap.

    Prints the markers "memm_local pt1" through "memm_local pt5" as it
    advances, and prints the reduced result of the 3-fold CV workflow just
    before the final marker.  Returns nothing.
    """
    print("memm_local pt1")

    # Step 1: synthetic classification data, then conversion of both arrays
    # with convert2memmap (intended for big matrices).
    features, labels = datasets.make_classification(
        n_samples=500, n_features=5000, n_informative=2, random_state=1)
    print("memm_local pt2")
    features = convert2memmap(features)
    labels = convert2memmap(labels)
    run_kwargs = {"X": features, "y": labels}

    # Step 2: one CV node on top of a Methods node holding both classifiers.
    print("memm_local pt3")
    from sklearn.svm import SVC
    from epac import CV, Methods
    classifiers = [SVC(kernel=kernel_name) for kernel_name in ("linear", "rbf")]
    workflow = CV(Methods(*classifiers), n_folds=3)
    print("memm_local pt4")

    # The workflow is run in-process; a LocalEngine(workflow, num_processes=2)
    # variant was left disabled by the original author.
    workflow.run(**run_kwargs)
    print(workflow.reduce())
    print("memm_local pt5")
def test_mem():
    """Run a 10-fold CV of a linear and an RBF SVC on a large dataset.

    The dataset has 2000 samples and 10000 features.  The reduced result of
    the workflow is printed; nothing is returned.
    """
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10000,
                                        n_informative=2,
                                        random_state=1)
    # Size measurements left by the original author (disabled):
    #    f = open("/home/jinpeng/x.log", "w")
    #    pickle.dump(X, f)   # =>> 474 MB
    #    f.close()
    #    np.savez("/home/jinpeng/np_x.log", dict(X=X))   # ===> 160 MB
    cv_svm = CV(Methods(*[SVC(kernel="linear"),
                          SVC(kernel="rbf")]),
                n_folds=10)
    cv_svm.run(X=X, y=y)  # Top-down process: computing recognition rates, etc.
    # Multi-process alternative (disabled):
    #    local_engine = LocalEngine(cv_svm, num_processes=2)
    #    cv_svm = local_engine.run(X=X, y=y)
    # Bottom-up process: computing p-values, etc.
    # ``print(...)`` with one argument behaves the same on Python 2 and 3.
    print(cv_svm.reduce())
def test_peristence_load_and_fit_predict(self):
    """Check that reduced results are unchanged by saving/loading the tree.

    Three results are compared key by key:
      * the tree run in memory,
      * the tree saved before running, reloaded from the store, then run,
      * the tree saved after running and reloaded (reduced without re-running).
    """
    X, y = datasets.make_classification(n_samples=20, n_features=10,
                                        n_informative=2)
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    pipelines = Methods(*[
        Pipe(SelectKBest(k=k),
             Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
        for k in k_values])
    pipeline = CVBestSearchRefit(pipelines, n_folds=n_folds_nested)
    tree_mem = CV(pipeline, n_folds=n_folds,
                  reducer=ClassificationReport(keep=False))
    # Save Tree
    import tempfile
    store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
    tree_mem.save_tree(store=store)
    tree_mem.run(X=X, y=y)
    # ``values()`` is wrapped in ``list`` because on Python 3 a dict view
    # cannot be indexed; on Python 2 this yields the same first element.
    res_mem = list(tree_mem.reduce().values())[0]
    # Reload Tree
    tree_fs_noresults = store.load()
    tree_fs_noresults.run(X=X, y=y)
    res_fs_noresults = list(tree_fs_noresults.reduce().values())[0]
    # Save with results
    tree_fs_noresults.save_tree(store=store)
    tree_fs_withresults = store.load()
    res_fs_withresults = list(tree_fs_withresults.reduce().values())[0]
    # Compare
    comp = np.all([
        np.all(np.asarray(res_mem[k]) == np.asarray(res_fs_noresults[k]))
        and
        np.all(np.asarray(res_fs_noresults[k]) ==
               np.asarray(res_fs_withresults[k]))
        for k in res_mem])
    self.assertTrue(comp)
def test_peristence_load_and_fit_predict(self):
    """Check that saving/loading a CVBestSearchRefitParallel tree keeps results.

    The reduced result of the in-memory run, of the tree reloaded from the
    store and run, and of the tree reloaded after being saved with its
    results must agree for every key of the in-memory result.
    """

    def first_result(tree):
        """Return the first value of ``tree.reduce()``.

        ``list(...)`` makes the lookup work on Python 3, where ``values()``
        is a view that cannot be indexed; Python 2 gives the same element.
        """
        return list(tree.reduce().values())[0]

    X, y = datasets.make_classification(n_samples=20,
                                        n_features=10,
                                        n_informative=2)
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    pipelines = Methods(*[
        Pipe(SelectKBest(k=k),
             Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
        for k in k_values
    ])
    pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
    tree_mem = CV(pipeline,
                  n_folds=n_folds,
                  reducer=ClassificationReport(keep=False))
    # Save Tree
    import tempfile
    store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
    tree_mem.save_tree(store=store)
    tree_mem.run(X=X, y=y)
    res_mem = first_result(tree_mem)
    # Reload Tree
    tree_fs_noresults = store.load()
    tree_fs_noresults.run(X=X, y=y)
    res_fs_noresults = first_result(tree_fs_noresults)
    # Save with results
    tree_fs_noresults.save_tree(store=store)
    tree_fs_withresults = store.load()
    res_fs_withresults = first_result(tree_fs_withresults)
    # Compare
    comp = np.all([
        np.all(np.asarray(res_mem[k]) == np.asarray(res_fs_noresults[k]))
        and np.all(
            np.asarray(res_fs_noresults[k]) == np.asarray(
                res_fs_withresults[k])) for k in res_mem
    ])
    self.assertTrue(comp)
def test_cv_best_search_refit_parallel(self):
    """Compare a SomaWorkflowEngine run with a direct run of the same tree.

    Two identically configured CV(CVBestSearchRefitParallel(...)) workflows
    are built; one is executed through SomaWorkflowEngine, the other with a
    plain ``run``.  Both ``compare_two_node`` and ``comp_2wf_reduce_res``
    must report them as matching.
    """
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    n_samples = 500
    n_features = 10000
    n_cores = 2

    def make_workflow():
        """Build a fresh, un-run workflow tree."""
        branches = []
        for k in k_values:
            selector = SelectKBest(k=k)
            classifiers = Methods(
                *[SVC(kernel="linear", C=C) for C in C_values])
            branches.append(Pipe(selector, classifiers))
        searcher = CVBestSearchRefitParallel(Methods(*branches),
                                             n_folds=n_folds_nested)
        return CV(searcher, n_folds=n_folds)

    X, y = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features,
                                        n_informative=5)

    # Workflow executed through the soma-workflow engine.
    engine_tree = make_workflow()
    engine = SomaWorkflowEngine(tree_root=engine_tree,
                                num_processes=n_cores,
                                remove_finished_wf=False,
                                remove_local_tree=False)
    engine_result = engine.run(X=X, y=y)

    # Same workflow executed directly in this process.
    direct_tree = make_workflow()
    direct_tree.run(X=X, y=y)

    self.assertTrue(compare_two_node(engine_result, direct_tree))
    self.assertTrue(comp_2wf_reduce_res(engine_result, direct_tree))
def test_cv_best_search_refit_parallel(self):
    """Check that the engine-run and directly-run workflows agree.

    One CV(CVBestSearchRefitParallel(...)) tree goes through
    SomaWorkflowEngine, a second identically built tree is run in place, and
    the two are compared with ``compare_two_node`` and
    ``comp_2wf_reduce_res``.
    """
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    n_samples = 500
    n_features = 10000
    n_cores = 2
    X, y = datasets.make_classification(
        n_samples=n_samples, n_features=n_features, n_informative=5)

    def anova_svm_pipe(k):
        """Pipe a k-best feature selection into linear SVCs, one per C."""
        return Pipe(SelectKBest(k=k),
                    Methods(*[SVC(kernel="linear", C=C) for C in C_values]))

    # epac workflow for parallel computing
    parallel_methods = Methods(*[anova_svm_pipe(k) for k in k_values])
    parallel_search = CVBestSearchRefitParallel(parallel_methods,
                                                n_folds=n_folds_nested)
    parallel_cv = CV(parallel_search, n_folds=n_folds)
    soma_engine = SomaWorkflowEngine(
        tree_root=parallel_cv,
        num_processes=n_cores,
        remove_finished_wf=False,
        remove_local_tree=False,
    )
    soma_result = soma_engine.run(X=X, y=y)

    # epac workflow for normal node computing
    local_methods = Methods(*[anova_svm_pipe(k) for k in k_values])
    local_search = CVBestSearchRefitParallel(local_methods,
                                             n_folds=n_folds_nested)
    local_cv = CV(local_search, n_folds=n_folds)
    local_cv.run(X=X, y=y)

    self.assertTrue(compare_two_node(soma_result, local_cv))
    self.assertTrue(comp_2wf_reduce_res(soma_result, local_cv))
# ... difference between the input arguments of fit and predict.  The true y
# will also appear in the result under the key "y/true".
class MySVM:
    """Small user-defined estimator delegating to an SVC created at fit time."""

    def __init__(self, C=1.0):
        self.C = C

    def fit(self, X, y):
        """Create an SVC with the stored C and fit it on (X, y)."""
        from sklearn.svm import SVC
        self.svc = SVC(C=self.C)
        self.svc.fit(X, y)

    def predict(self, X):
        """Return the predictions of the SVC created by ``fit``."""
        return self.svc.predict(X)


# Two MySVM leaves (C=1.0 and C=2.0) under a stratified 2-fold CV on "y".
svms = Methods(*(MySVM(C=c_value) for c_value in (1.0, 2.0)))
cv = CV(svms, cv_key="y", cv_type="stratified", n_folds=2, reducer=None)
cv.run(X=X, y=y)  # top-down process
cv.reduce()  # bottom-up process

from sklearn.decomposition import PCA


class MyPCA(PCA):
    """PCA with predict method"""

    def predict(self, X):
        """Project X onto the components, then map back to the input space.

        Original author's note: if X is not singular,
        self.fit(X).predict(X) == X.
        """
        projected = self.transform(X)
        return np.dot(projected, self.components_) + self.mean_


# Same pattern with two MyPCA leaves (1 and 2 components), plain 2-fold CV.
pcas = Methods(*(MyPCA(n_components=n_comp) for n_comp in (1, 2)))
cv = CV(pcas, n_folds=2, reducer=None)
cv.run(X=X, y=y)  # top-down process
cv.reduce()  # bottom-up process
# Recode CB_EXPO: every 0 becomes -1.
# NOTE(review): if Xd is a pandas DataFrame this is a chained assignment and
# may not write through on recent pandas versions -- confirm and consider
# Xd.loc[Xd.CB_EXPO == 0, "CB_EXPO"] = -1.
Xd.CB_EXPO[Xd.CB_EXPO == 0] = -1
X = np.asarray(Xd)
y = np.asarray(yd)
C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======
# One L1-penalised SVM per value of C, evaluated with a stratified 10-fold CV.
svms = Methods(*[
    SVM(dual=False, class_weight='auto', penalty="l1", C=C)
    for C in C_values
])
cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# The per-C results are written to a CSV file instead of being printed.
epac.export_csv(
    cv, cv_results,
    os.path.join(WD, "results", "cv10_caarms+pas+canabis_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================
# The same SVMs wrapped in CVBestSearchRefit (stratified, 10 folds), itself
# wrapped in an outer stratified 10-fold CV.
svms_cv = CVBestSearchRefit(svms, n_folds=10, cv_type="stratified")
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# Single-argument print(...) gives the same output on Python 2 and 3; the
# former print statement was a SyntaxError on Python 3.
print(cv_results)
from epac import (Pipe, Methods, CV, Perms,
                  ClassificationReport, PvalPerms,
                  StoreFs,
                  CVBestSearchRefit)
from epac.sklearn_plugins import Permutations
from epac.configuration import conf
import tempfile

# Small synthetic classification problem.
X, y = datasets.make_classification(
    n_samples=20, n_features=10, n_informative=2)

# Outer / nested fold counts and the grids for SelectKBest's k and SVC's C.
n_folds, n_folds_nested = 2, 3
k_values = [1, 2]
C_values = [1, 2]

# For every k: k-best feature selection piped into one linear SVC per C.
pipelines = Methods(*[
    Pipe(
        SelectKBest(k=k),
        Methods(*[SVC(kernel="linear", C=C) for C in C_values]),
    )
    for k in k_values
])
pipeline = CVBestSearchRefit(pipelines, n_folds=n_folds_nested)
tree_mem = CV(
    pipeline,
    n_folds=n_folds,
    reducer=ClassificationReport(keep=False),
)

# The tree is saved into a fresh temporary directory before it is run.
store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
tree_mem.save_tree(store=store)
tree_mem.run(X=X, y=y)
tree_mem.reduce()
#k_values = [1, 2, 3, 4, 5, 10, 15, 20, 25, 27]
C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======
# One L1-penalised SVM per value of C.
svms = Methods(*[SVM(dual=False, class_weight='auto', penalty="l1", C=C)
                 for C in C_values])

# Disabled alternative: k-best feature selection piped into L1/L2 SVMs
# (it needs the k_values list that is commented out above).
#anova_svms = Methods(*[Pipe(SelectKBest(k=k), #preprocessing.StandardScaler(),
#    Methods(*[SVM(C=C, penalty=penalty, class_weight='auto', dual=False)
#              for C in C_values for penalty in ['l1', 'l2']]))
#    for k in k_values])

cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# The per-C results are written to a CSV file instead of being printed.
epac.export_csv(cv, cv_results, os.path.join(WD, "results", "cv10_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================
svms_cv = CVBestSearchRefit(svms, n_folds=10)
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# Single-argument print(...) gives the same output on Python 2 and 3; the
# former print statement was a SyntaxError on Python 3.
print(cv_results)
# Output recorded by the original author:
#[{'key': CVBestSearchRefit, 'y/test/score_f1': [ 0.84848485 0.76190476],
#  'y/test/recall_pvalues': [ 0.01086887 0.03000108],
#  'y/test/score_precision': [ 0.82352941 0.8 ],
#  'y/test/recall_mean_pvalue': 0.00592461228371,
#  'y/test/score_recall': [ 0.875 0.72727273],
#  'y/test/score_accuracy': 0.814814814815,
#  'y/test/score_recall_mean': 0.801136363636}])
#
anovas_svm.run(X=X, y=y)
print(anovas_svm.reduce())

# Cross-validation
# ----------------
# CV of LDA and SVM
#          CV                  (Splitter)
#      /   |   \
#     0    1    2   Folds      (Slicer)
#     |    |    |
#       Methods                (Splitter)
#       /     \
#     LDA     SVM   Classifier (Estimator)
from epac import CV, Methods

cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print(cv.reduce())

# Model selection using CV
# ------------------------
#      CVBestSearchRefit
#          Methods             (Splitter)
#         /       \
#   SVM(C=1)   SVM(C=10)  Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods

# CV + grid search over the C value of a simple classifier
wf = CVBestSearchRefit(Methods(*(SVM(C=c_value) for c_value in (1, 10))))
wf.run(X=X, y=y)
print(wf.reduce())

# Feature selection combined with SVM and LDA
# Single-argument print(...) is used throughout this section: it gives the
# same output on Python 2 and 3, whereas the former print statements were a
# SyntaxError on Python 3.
print(anovas_svm.reduce())

# Cross-validation
# ----------------
# CV of LDA and SVM
#          CV                  (Splitter)
#      /   |   \
#     0    1    2   Folds      (Slicer)
#     |    |    |
#       Methods                (Splitter)
#       /     \
#     LDA     SVM   Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print(cv.reduce())

# Model selection using CV
# ------------------------
#      CVBestSearchRefit
#          Methods             (Splitter)
#         /       \
#   SVM(C=1)   SVM(C=10)  Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())