Example #1
def func_memm_local():
    print("memm_local pt1")
    ## 1) Build a dataset and convert to np.memmap (for big matrix)
    ## ============================================================
    X, y = datasets.make_classification(n_samples=500,
                                        n_features=5000,
                                        n_informative=2,
                                        random_state=1)
    print("memm_local pt2")
    X = convert2memmap(X)
    y = convert2memmap(y)
    Xy = dict(X=X, y=y)
    ## 2) Build two workflows respectively
    ## =======================================================
    print("memm_local pt3")
    from sklearn.svm import SVC
    from epac import CV, Methods
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    print("memm_local pt4")
    #    from epac import LocalEngine
    #    local_engine = LocalEngine(cv_svm_local, num_processes=2)
    #    cv_svm = local_engine.run(**Xy)
    cv_svm_local.run(**Xy)
    print(cv_svm_local.reduce())
    print("memm_local pt5")
Example #2
    def test_cv(self):
        X, y = datasets.make_classification(n_samples=20, n_features=5,
                                            n_informative=2)
        n_folds = 2

        # = With EPAC
        wf = CV(SVC(kernel="linear"), n_folds=n_folds,
                reducer=ClassificationReport(keep=True))
        r_epac = wf.top_down(X=X, y=y)

        # = With SKLEARN
        clf = SVC(kernel="linear")
        r_sklearn = list()
        for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
            #idx_train, idx_test  = cv.__iter__().next()
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            y_train = y[idx_train]
            clf.fit(X_train, y_train)
            r_sklearn.append(clf.predict(X_test))

        # = Comparison
        key2cmp = 'y' + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
        for icv in range(n_folds):
            comp = np.all(np.asarray(r_epac[icv][key2cmp])
                          == np.asarray(r_sklearn[icv]))
            self.assertTrue(comp, u'Diff CV: EPAC vs sklearn')

        # test reduce
        r_epac_reduce = list(wf.reduce().values())[0][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
        self.assertTrue(comp, u'Diff CV: EPAC reduce')
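
The key2cmp string above is assembled from EPAC's conf constants. Judging from result keys that appear verbatim elsewhere on this page (e.g. 'y/test/score_recall' in Example #19), conf.SEP is presumably "/", conf.TEST "test" and conf.PREDICTION "pred"; the expansion below is inferred from those examples, not quoted from epac.configuration:

# Presumed expansion of the constants used above:
key2cmp = "y" + "/" + "test" + "/" + "pred"   # == "y/test/pred"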
Example #3
def func_memm_local():
    print "memm_local pt1"
    ## 1) Build a dataset and convert to np.memmap (for big matrix)
    ## ============================================================
    X, y = datasets.make_classification(n_samples=500,
                                        n_features=5000,
                                        n_informative=2,
                                        random_state=1)
    print "memm_local pt2"
    X = convert2memmap(X)
    y = convert2memmap(y)
    Xy = dict(X=X, y=y)
    ## 2) Build two workflows respectively
    ## =======================================================
    print "memm_local pt3"
    from sklearn.svm import SVC
    from epac import CV, Methods
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    print "memm_local pt4"
#    from epac import LocalEngine
#    local_engine = LocalEngine(cv_svm_local, num_processes=2)
#    cv_svm = local_engine.run(**Xy)
    cv_svm_local.run(**Xy)
    print(cv_svm_local.reduce())
    print("memm_local pt5")
Example #4
    def test_cv(self):
        X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
        n_folds = 2

        # = With EPAC
        wf = CV(SVC(kernel="linear"), n_folds=n_folds, reducer=ClassificationReport(keep=True))
        r_epac = wf.top_down(X=X, y=y)

        # = With SKLEARN
        clf = SVC(kernel="linear")
        r_sklearn = list()
        for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
            # idx_train, idx_test  = cv.__iter__().next()
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            y_train = y[idx_train]
            clf.fit(X_train, y_train)
            r_sklearn.append(clf.predict(X_test))

        # = Comparison
        key2cmp = "y" + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
        for icv in range(n_folds):
            comp = np.all(np.asarray(r_epac[icv][key2cmp])
                          == np.asarray(r_sklearn[icv]))
            self.assertTrue(comp, u"Diff CV: EPAC vs sklearn")

        # test reduce
        r_epac_reduce = list(wf.reduce().values())[0][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
        self.assertTrue(comp, u"Diff CV: EPAC reduce")
Example #5
 def get_workflow(self):
     ####################################################################
     ## EPAC WORKFLOW
     # -------------------------------------
     #             Perms                      Perm (Splitter)
     #         /     |       \
     #        0      1       2                Samples (Slicer)
     #        |
     #       CV                               CV (Splitter)
     #  /       |       \
     # 0        1       2                     Folds (Slicer)
     # |        |       |
     # Pipeline     Pipeline     Pipeline     Sequence
     # |
     # 2                                      SelectKBest (Estimator)
     # |
     # Methods
     # |                     \
     # SVM(linear,C=1)   SVM(linear,C=3)      Classifiers (Estimator)
     pipeline = Pipe(SelectKBest(k=2),
                     Methods(*[SVC(kernel="linear", C=C)
                               for C in [1, 3]]))
     wf = Perms(CV(pipeline, n_folds=3),
                n_perms=3,
                permute="y",
                random_state=1)
     return wf
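
get_workflow only builds the tree; running and reducing it follows the same pattern as the other examples on this page. A minimal usage sketch on a toy dataset (the run/reduce calls mirror the API used throughout; calling get_workflow on the test-case instance is assumed):

from sklearn import datasets

X, y = datasets.make_classification(n_samples=20, n_features=5,
                                    n_informative=2, random_state=1)
wf = self.get_workflow()  # assumed: invoked from within the test class
wf.run(X=X, y=y)          # top-down pass: fit/predict over perms and folds
print(wf.reduce())        # bottom-up pass: aggregate scores and p-values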
Example #6
def test_mem():
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10000,
                                        n_informative=2,
                                        random_state=1)
#    f = open("/home/jinpeng/x.log", "w")
#    pickle.dump(X, f) # =>> 474 MB
#    f.close()
#    np.savez ("/home/jinpeng/np_x.log", dict(X=X)) # ===> 160 MB
    
    cv_svm = CV(Methods(*[SVC(kernel="linear"), SVC(kernel="rbf")]),
                n_folds=10)
    cv_svm.run(X=X, y=y) # Top-down process: computing recognition rates, etc.
    # local_engine = LocalEngine(cv_svm, num_processes=2)
    # cv_svm = local_engine.run(X=X, y=y)
    print(cv_svm.reduce())  # Bottom-up process: computing p-values, etc.
Example #7
 def get_workflow(self, n_features=int(1E03)):
     random_state = 0
     C_values = [1, 10]
     k_values = 0
     k_max = "auto"
     n_folds_nested = 5
     n_folds = 10
     n_perms = 10
     if k_max != "auto":
         k_values = range_log2(np.minimum(int(k_max), n_features),
                               add_n=True)
     else:
         k_values = range_log2(n_features, add_n=True)
     cls = Methods(*[
         Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear")) for C in C_values
         for k in k_values
     ])
     pipeline = CVBestSearchRefit(cls,
                                  n_folds=n_folds_nested,
                                  random_state=random_state)
     wf = Perms(CV(pipeline, n_folds=n_folds),
                n_perms=n_perms,
                permute="y",
                random_state=random_state)
     return wf
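
range_log2 comes from EPAC's utilities and is not defined in this snippet. A plausible sketch of its behavior, assuming it returns the powers of two up to n and, with add_n=True, appends n itself (inferred from how k_values is consumed as SelectKBest sizes; the real implementation may differ):

import numpy as np

def range_log2(n, add_n=True):
    # Powers of two not exceeding n: 1, 2, 4, ..., optionally followed by n.
    values = [2 ** i for i in range(int(np.floor(np.log2(n))) + 1)]
    if add_n and values[-1] != n:
        values.append(n)
    return values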
Example #8
 def test_engine_info(self):
     n_samples = 20
     n_features = 100
     n_proc = 2
     X, y = datasets.make_classification(n_samples=n_samples,
                                         n_features=n_features,
                                         n_informative=2,
                                         random_state=1)
     Xy = dict(X=X, y=y)
     cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                 SVC(kernel="rbf")]),
                       n_folds=3)
     swf_engine = SomaWorkflowEngine(cv_svm_local,
                                     num_processes=n_proc,
                                     resource_id="jl237561@gabriel",
                                     login="******",
                                     remove_finished_wf=False,
                                     remove_local_tree=False,
                                     queue="Global_long")
     swf_engine.run(**Xy)
     print("engine_info ================")
     for job_info in swf_engine.engine_info:
         print("  job_info=================")
         print("  mem_cost= ", job_info.mem_cost)
         print("  vmem_cost= ", job_info.vmem_cost)
         print("  time_cost= ", job_info.time_cost)
         self.assertTrue(job_info.time_cost > 0)
Example #9
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features),
                              add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on local machine
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    ## Run on cluster
    #    sfw_engine = SomaWorkflowEngine(
    #                        tree_root=wf,
    #                        num_processes=options.n_cores,
    #                        resource_id="jl237561@gabriel",
    #                        login="******")
    wf = sfw_engine.run(X=X, y=y)
    print "Time ellapsed, fit predict:", time.time() - time_fit_predict
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print(wf.reduce())
    print("Time elapsed, reduce:", time.time() - time_reduce)
Example #10
    def test_perm_cv(self):
        X, y = datasets.make_classification(n_samples=20,
                                            n_features=5,
                                            n_informative=2)
        n_perms = 3
        n_folds = 2
        rnd = 0
        # = With EPAC
        wf = Perms(CV(SVC(kernel="linear"),
                      n_folds=n_folds,
                      reducer=ClassificationReport(keep=True)),
                   n_perms=n_perms,
                   permute="y",
                   random_state=rnd,
                   reducer=None)
        r_epac = wf.run(X=X, y=y)
        # = With SKLEARN
        from sklearn.cross_validation import StratifiedKFold
        clf = SVC(kernel="linear")
        r_sklearn = [[None] * n_folds for i in range(n_perms)]
        perm_nb = 0
        for perm in Permutations(n=y.shape[0],
                                 n_perms=n_perms,
                                 random_state=rnd):
            y_p = y[perm]
            fold_nb = 0
            for idx_train, idx_test in StratifiedKFold(y=y_p, n_folds=n_folds):
                X_train = X[idx_train, :]
                X_test = X[idx_test, :]
                y_p_train = y_p[idx_train]
                clf.fit(X_train, y_p_train)
                r_sklearn[perm_nb][fold_nb] = clf.predict(X_test)
                fold_nb += 1
            perm_nb += 1

        cmp_key = 'y' + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
        # Comparison
        for iperm in range(n_perms):
            for icv in range(n_folds):
                comp = np.all(
                    np.asarray(r_epac[iperm][icv][cmp_key]) == np.asarray(
                        r_sklearn[iperm][icv]))
                self.assertTrue(comp, u'Diff Perm / CV: EPAC vs sklearn')

        # test reduce
        for iperm in range(n_perms):
            for icv in range(n_folds):
                ## iperm = 0
                ## icv = 0
                comp = np.all(
                    np.asarray(list(wf.reduce().values())[iperm][cmp_key][icv])
                    == np.asarray(r_sklearn[iperm][icv]))
                self.assertTrue(comp, u'Diff Perm / CV: EPAC reduce')
Example #11
    def test_cv_best_search_refit_parallel(self):
        n_folds = 2
        n_folds_nested = 3
        k_values = [1, 2]
        C_values = [1, 2]
        n_samples = 500
        n_features = 10000
        n_cores = 2
        X, y = datasets.make_classification(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=5)
        # EPAC workflow for parallel computing
        pipelines = Methods(*[
            Pipe(SelectKBest(k=k),
                 Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
            for k in k_values
        ])
        pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
        wf = CV(pipeline, n_folds=n_folds)

        sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                        num_processes=n_cores,
                                        remove_finished_wf=False,
                                        remove_local_tree=False)
        sfw_engine_wf = sfw_engine.run(X=X, y=y)

        # EPAC workflow run sequentially, for comparison
        pipelines2 = Methods(*[
            Pipe(SelectKBest(k=k),
                 Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
            for k in k_values
        ])
        pipeline2 = CVBestSearchRefitParallel(pipelines2,
                                              n_folds=n_folds_nested)
        wf2 = CV(pipeline2, n_folds=n_folds)
        wf2.run(X=X, y=y)

        self.assertTrue(compare_two_node(sfw_engine_wf, wf2))
        self.assertTrue(comp_2wf_reduce_res(sfw_engine_wf, wf2))
Example #12
 def get_workflow(self):
     n_folds = 2
     n_folds_nested = 3
     k_values = [1, 2]
     C_values = [1, 2]
     pipelines = Methods(*[Pipe(SelectKBest(k=k),
                                Methods(*[SVC(kernel="linear", C=C)
                                          for C in C_values]))
                           for k in k_values])
     pipeline = CVBestSearchRefitParallel(pipelines,
                                          n_folds=n_folds_nested)
     wf = CV(pipeline, n_folds=n_folds)
     return wf
Example #13
    def test_cv_best_search_refit_parallel(self):
        n_folds = 2
        n_folds_nested = 3
        k_values = [1, 2]
        C_values = [1, 2]
        n_samples = 500
        n_features = 10000
        n_cores = 2
        X, y = datasets.make_classification(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=5)
        # EPAC workflow for parallel computing
        pipelines = Methods(*[Pipe(SelectKBest(k=k),
                                   Methods(*[SVC(kernel="linear", C=C)
                                             for C in C_values]))
                              for k in k_values])
        pipeline = CVBestSearchRefitParallel(pipelines,
                                             n_folds=n_folds_nested)
        wf = CV(pipeline, n_folds=n_folds)

        sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                        num_processes=n_cores,
                                        remove_finished_wf=False,
                                        remove_local_tree=False)
        sfw_engine_wf = sfw_engine.run(X=X, y=y)

        # EPAC workflow run sequentially, for comparison
        pipelines2 = Methods(*[Pipe(SelectKBest(k=k),
                                    Methods(*[SVC(kernel="linear", C=C)
                                              for C in C_values]))
                               for k in k_values])
        pipeline2 = CVBestSearchRefitParallel(pipelines2,
                                              n_folds=n_folds_nested)
        wf2 = CV(pipeline2, n_folds=n_folds)
        wf2.run(X=X, y=y)

        self.assertTrue(compare_two_node(sfw_engine_wf, wf2))
        self.assertTrue(comp_2wf_reduce_res(sfw_engine_wf, wf2))
Example #14
    def test_persistence_load_and_fit_predict(self):
        X, y = datasets.make_classification(n_samples=20,
                                            n_features=10,
                                            n_informative=2)
        n_folds = 2
        n_folds_nested = 3
        k_values = [1, 2]
        C_values = [1, 2]
        pipelines = Methods(*[
            Pipe(SelectKBest(k=k),
                 Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
            for k in k_values
        ])

        pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)

        tree_mem = CV(pipeline,
                      n_folds=n_folds,
                      reducer=ClassificationReport(keep=False))
        # Save Tree
        import tempfile
        store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
        tree_mem.save_tree(store=store)
        tree_mem.run(X=X, y=y)
        res_mem = list(tree_mem.reduce().values())[0]
        # Reload Tree
        tree_fs_noresults = store.load()
        tree_fs_noresults.run(X=X, y=y)
        res_fs_noresults = list(tree_fs_noresults.reduce().values())[0]
        # Save with results
        tree_fs_noresults.save_tree(store=store)
        tree_fs_withresults = store.load()
        res_fs_withresults = list(tree_fs_withresults.reduce().values())[0]
        # Compare
        comp = np.all([
            np.all(np.asarray(res_mem[k]) == np.asarray(res_fs_noresults[k]))
            and np.all(np.asarray(res_fs_noresults[k])
                       == np.asarray(res_fs_withresults[k]))
            for k in res_mem
        ])
        self.assertTrue(comp)
Example #15
 def get_workflow(self):
     ####################################################################
     ## EPAC WORKFLOW
     # -------------------------------------
     #             Perms                      Perm (Splitter)
     #         /     |       \
     #        0      1       2                Samples (Slicer)
     #        |
     #       CV                               CV (Splitter)
     #  /       |       \
     # 0        1       2                     Folds (Slicer)
     # |        |       |
     # Pipeline     Pipeline     Pipeline     Sequence
     # |
     # 2                                      SelectKBest (Estimator)
     # |
     # Methods
     # |                     \
     # SVM(linear,C=1)   SVM(linear,C=3)      Classifiers (Estimator)
     pipeline = Pipe(SelectKBest(k=2),
                     Methods(*[SVC(kernel="linear", C=C) for C in [1, 3]]))
     wf = CV(pipeline, n_folds=3, reducer=ClassificationReport(keep=True))
     return wf
Example #16
    def test_persistence_load_and_fit_predict(self):
        X, y = datasets.make_classification(n_samples=20, n_features=10,
                                            n_informative=2)
        n_folds = 2
        n_folds_nested = 3
        k_values = [1, 2]
        C_values = [1, 2]
        pipelines = Methods(*[Pipe(SelectKBest(k=k),
                                   Methods(*[SVC(kernel="linear", C=C)
                                             for C in C_values]))
                              for k in k_values])

        pipeline = CVBestSearchRefit(pipelines,
                                     n_folds=n_folds_nested)

        tree_mem = CV(pipeline, n_folds=n_folds,
                      reducer=ClassificationReport(keep=False))
        # Save Tree
        import tempfile
        store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
        tree_mem.save_tree(store=store)
        tree_mem.run(X=X, y=y)
        res_mem = list(tree_mem.reduce().values())[0]
        # Reload Tree
        tree_fs_noresults = store.load()
        tree_fs_noresults.run(X=X, y=y)
        res_fs_noresults = list(tree_fs_noresults.reduce().values())[0]
        # Save with results
        tree_fs_noresults.save_tree(store=store)
        tree_fs_withresults = store.load()
        res_fs_withresults = list(tree_fs_withresults.reduce().values())[0]
        # Compare
        comp = np.all([
            np.all(np.asarray(res_mem[k]) == np.asarray(res_fs_noresults[k]))
            and np.all(np.asarray(res_fs_noresults[k])
                       == np.asarray(res_fs_withresults[k]))
            for k in res_mem
        ])
        self.assertTrue(comp)
Example #17
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features),
                              add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Export Workflow to soma_workflow_gui
    ## ===============
    time_fit_predict = time.time()
    if os.path.isdir(options.soma_workflow_dir):
        shutil.rmtree(options.soma_workflow_dir)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    sfw_engine.export_to_gui(options.soma_workflow_dir, X=X, y=y)

    print "Time ellapsed, fit predict:", time.time() - time_fit_predict

    #    ## 6) Load Epac tree & Reduce
    #    ## ==========================
    reduce_filename = os.path.join(options.soma_workflow_dir, "reduce.py")
    f = open(reduce_filename, 'w')
    reduce_str = """from epac.map_reduce.engine import SomaWorkflowEngine
wf = SomaWorkflowEngine.load_from_gui("%s")
print(wf.reduce())
""" % options.soma_workflow_dir
    f.write(reduce_str)
    f.close()
    print "#First run\n"\
        "soma_workflow_gui\n"\
        "\t(1)Open %s\n"\
        "\t(2)Submit\n"\
        "\t(3)Transfer Input Files\n"\
        "\t...wait...\n"\
        "\t(4)Transfer Output Files\n"\
        "#When done run:\npython %s" % (
            os.path.join(options.soma_workflow_dir,
                         sfw_engine.open_me_by_soma_workflow_gui),
            reduce_filename)
Example #18
"""

from sklearn import datasets

X, y = datasets.make_classification(n_samples=500,
                                    n_features=200000,
                                    n_informative=2,
                                    random_state=1)

Xy = dict(X=X, y=y)
## 2) Building workflow
## =======================================================
print " -> Pt2 : X and y created, building workflow"
from sklearn import svm, cross_validation
#kfold = cross_validation.KFold(n=len(X), n_folds=3)
#svc = svm.SVC(C=1, kernel='linear')
#print [svc.fit(X[train], y[train]).score(X[test], y[test]) for train, test in kfold]
from epac import CV, Methods
cv_svm_local = CV(Methods(*[svm.SVC(kernel="linear"),
                            svm.SVC(kernel="rbf")]),
                  n_folds=3)
print " -> Pt3 : Workflow built, defining local engine"
cv_svm = None
n_proc = 2
# Running on the local machine
from epac import LocalEngine
local_engine = LocalEngine(cv_svm_local, num_processes=n_proc)
print " -> Pt4 : Running"
cv_svm = local_engine.run(**Xy)
print " -> Success with %i procs!" % n_proc
Example #19
from sklearn.svm import LinearSVC as SVM
from sklearn.feature_selection import SelectKBest
from sklearn import preprocessing
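# NOTE: this snippet uses X and y without defining them; a minimal, assumed
# setup so the pipelines below can run (dataset parameters are placeholders):
from sklearn import datasets
X, y = datasets.make_classification(n_samples=100, n_features=50,
                                    n_informative=5, random_state=0)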


##############################################################################
## Pipeline, "Pipe": SelectKBest + StandardScaler + SVM l1 vs l2
from epac import Pipe, CV
n_folds = 10

anova_svm = Pipe(SelectKBest(k=5), 
                 preprocessing.StandardScaler(), 
                 SVM(class_weight='auto'))

cv = CV(anova_svm, n_folds=n_folds)
cv.run(X=X, y=y)
#
res_cv_anova_svm = cv.reduce()
print(res_cv_anova_svm["SelectKBest/StandardScaler/LinearSVC"]['y/test/score_recall'])

##############################################################################
## Multimethods, "Methods": SVM l1 vs l2
from epac import Methods, CV
svms = Methods(SVM(penalty="l1", class_weight='auto', dual=False), 
               SVM(penalty="l2", class_weight='auto', dual=False))

cv = CV(svms, n_folds=n_folds)
cv.run(X=X, y=y)
res_cv_svms = cv.reduce()
#
Example #20
    def todo_perm_cv_grid_vs_sklearn(self):
        X, y = datasets.make_classification(n_samples=100,
                                            n_features=500,
                                            n_informative=5)
        n_perms = 3
        n_folds = 2
        n_folds_nested = 2
        random_state = 0
        k_values = [2, 3]
        C_values = [1, 10]
        # = With EPAC
        pipelines = Methods(*[Pipe(SelectKBest(k=k),
                                   SVC(C=C, kernel="linear"))
                              for C in C_values
                              for k in k_values])
        #print [n for n in pipelines.walk_leaves()]
        pipelines_cv = CVBestSearchRefit(pipelines,
                                         n_folds=n_folds_nested,
                                         random_state=random_state)
        wf = Perms(CV(pipelines_cv, n_folds=n_folds,
                      reducer=ClassificationReport(keep=True)),
                   n_perms=n_perms, permute="y",
                   reducer=PvalPerms(keep=True),
                   random_state=random_state)
        wf.fit_predict(X=X, y=y)
        r_epac = list(wf.reduce().values())[0]
        for key in r_epac:
            print("key=" + repr(key) + ", value=" + repr(r_epac[key]))

        # = With SKLEARN
        from sklearn.cross_validation import StratifiedKFold
        from epac.sklearn_plugins import Permutations
        from sklearn.pipeline import Pipeline
        from sklearn import grid_search

        clf = Pipeline([('anova', SelectKBest(k=3)),
                        ('svm', SVC(kernel="linear"))])
        parameters = {'anova__k': k_values, 'svm__C': C_values}

        r_sklearn = dict()
        r_sklearn['pred_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['true_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['score_tr'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['score_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['mean_score_te'] = [None] * n_perms
        r_sklearn['mean_score_tr'] = [None] * n_perms

        perm_nb = 0
        perms = Permutations(n=y.shape[0],
                             n_perms=n_perms,
                             random_state=random_state)
        for idx in perms:
            #idx = perms.__iter__().next()
            y_p = y[idx]
            cv = StratifiedKFold(y=y_p, n_folds=n_folds)
            fold_nb = 0
            for idx_train, idx_test in cv:
                #idx_train, idx_test  = cv.__iter__().next()
                X_train = X[idx_train, :]
                X_test = X[idx_test, :]
                y_p_train = y_p[idx_train]
                y_p_test = y_p[idx_test]
                # Nested CV
                cv_nested = StratifiedKFold(y=y_p_train,
                                            n_folds=n_folds_nested)
                gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
                gscv.fit(X_train, y_p_train)
                r_sklearn['pred_te'][perm_nb][fold_nb] = gscv.predict(X_test)
                r_sklearn['true_te'][perm_nb][fold_nb] = y_p_test
                r_sklearn['score_tr'][perm_nb][fold_nb] =\
                    gscv.score(X_train, y_p_train)
                r_sklearn['score_te'][perm_nb][fold_nb] =\
                    gscv.score(X_test, y_p_test)
                fold_nb += 1
            # Average over folds
            r_sklearn['mean_score_te'][perm_nb] = \
                np.mean(np.asarray(r_sklearn['score_te'][perm_nb]), axis=0)
            r_sklearn['mean_score_tr'][perm_nb] = \
                np.mean(np.asarray(r_sklearn['score_tr'][perm_nb]), axis=0)
            perm_nb += 1

        print(repr(r_sklearn))
        # - Comparisons
        shared_keys = set(r_epac.keys()).intersection(set(r_sklearn.keys()))
        comp = {k: np.all(np.asarray(r_epac[k]) == np.asarray(r_sklearn[k]))
                for k in shared_keys}
        print("comp=" + repr(comp))
        #return comp
        for key in comp:
            self.assertTrue(comp[key], u'Diff for attribute: "%s"' % key)
Example #21
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 15:21:35 2013

@author: ed203246
"""

from sklearn import datasets
from sklearn.svm import LinearSVC as SVM
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SelectKBest
from epac.map_reduce.reducers import PvalPerms
import numpy

X, y = datasets.make_classification(n_samples=100,
                                    n_features=200,
                                    n_informative=2)
X = numpy.random.rand(*X.shape)

from epac import Perms, CV, Methods
perms_cv_svm = Perms(CV(Methods(SVM(loss="l1"), SVM(loss="l2"))), n_perms=100)
perms_cv_svm.run(X=X, y=y)
perms_cv_svm.reduce()

self = perms_cv_svm
key = 'LinearSVC(loss=l1)'
self = PvalPerms()
Example #22
anovas_svm.run(X=X, y=y)
print(anovas_svm.reduce())


# Cross-validation
# ----------------
# CV of LDA
#      CV                 (Splitter)
#  /   |   \
# 0    1    2  Folds      (Slicer)
# |    |
#   Methods               (Splitter)
#    /   \
#  LDA  SVM    Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print(cv.reduce())


# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods       (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)   Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())
Example #23
anovas_svm = Methods(*[Pipe(SelectKBest(k=k), SVM()) for k in [1, 5, 10]])
anovas_svm.run(X=X, y=y)
print(anovas_svm.reduce())

# Cross-validation
# ----------------
# CV of LDA
#      CV                 (Splitter)
#  /   |   \
# 0    1    2  Folds      (Slicer)
# |    |
#   Methods               (Splitter)
#    /   \
#  LDA  SVM    Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print(cv.reduce())

# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods       (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)   Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())
Example #24
## 1) Build a dataset and convert to np.memmap (for big matrix)
## ============================================================
X, y = datasets.make_classification(n_samples=500,
                                    n_features=10000,
                                    n_informative=2,
                                    random_state=1)
X = convert2memmap(X)
y = convert2memmap(y)

Xy = dict(X=X, y=y)

## 2) Build two workflows respectively
## =======================================================

from sklearn.svm import SVC
from epac import CV, Methods
cv_svm_local = CV(
    Methods(*[SVC(kernel="linear"), SVC(kernel="rbf")]), n_folds=3)
cv_svm_swf = CV(Methods(*[SVC(kernel="linear"), SVC(kernel="rbf")]), n_folds=3)

## 3) Run two workflows using local engine and soma-workflow
## =========================================================

from epac import LocalEngine
local_engine = LocalEngine(cv_svm_local, num_processes=2)
cv_svm = local_engine.run(X=X, y=y)
print(cv_svm.reduce())

from epac import SomaWorkflowEngine
swf_engine = SomaWorkflowEngine(
    cv_svm_swf,
    num_processes=2,
    #resource_id="jl237561@gabriel",
Example #25
# Predict can return an array. In this case EPAC will put the prediction in a
# Result (a dictionary) with key "y/pred", "y" being the argument of fit that
# does not appear in predict. The true y will also figure in the result with
# key "y/true".
class MySVM:
    def __init__(self, C=1.0):
        self.C = C
    def fit(self, X, y):
        from sklearn.svm import SVC
        self.svc = SVC(C=self.C)
        self.svc.fit(X, y)
    def predict(self, X):
        return self.svc.predict(X)

svms = Methods(MySVM(C=1.0), MySVM(C=2.0))
cv = CV(svms, cv_key="y", cv_type="stratified", n_folds=2,
        reducer=None)
cv.run(X=X, y=y)  # top-down process to call transform
cv.reduce()       # bottom-up process

from sklearn.decomposition import PCA
class MyPCA(PCA):
    """PCA with predict method"""
    def predict(self, X):
        """Project to X PCs then project back to original space
        If X is not singular, self.fit(X).predict(X) == X"""
        return np.dot(self.transform(X), self.components_) + self.mean_

pcas = Methods(MyPCA(n_components=1), MyPCA(n_components=2))
cv = CV(pcas, n_folds=2, reducer=None)
cv.run(X=X, y=y)  # top-down process to call transform
cv.reduce()       # bottom-up process
Example #26

#k_values = [1, 2, 3, 4, 5, 10, 15, 20, 25, 27]
C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======

svms = Methods(*[SVM(dual=False, class_weight='auto', penalty="l1", C=C)
                 for C in C_values])

#
#anova_svms = Methods(*[Pipe(SelectKBest(k=k),       #preprocessing.StandardScaler(),
#                            Methods(*[SVM(C=C, penalty=penalty, class_weight='auto', dual=False) for C in C_values for penalty in  ['l1', 'l2']])) for k in k_values])


cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
#print cv_results

epac.export_csv(cv, cv_results, os.path.join(WD, "results", "cv10_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================

svms_cv = CVBestSearchRefit(svms, n_folds=10)
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
print(cv_results)
#[{'key': CVBestSearchRefit, 'y/test/score_f1': [ 0.84848485  0.76190476], 'y/test/recall_pvalues': [ 0.01086887  0.03000108], 'y/test/score_precision': [ 0.82352941  0.8       ], 'y/test/recall_mean_pvalue': 0.00592461228371, 'y/test/score_recall': [ 0.875       0.72727273], 'y/test/score_accuracy': 0.814814814815, 'y/test/score_recall_mean': 0.801136363636}])
Example #27
from epac import Pipe, Methods, CV, Perms
from epac import ClassificationReport, PvalPerms
from epac import StoreFs
from epac import CVBestSearchRefit
from epac.sklearn_plugins import Permutations
from epac.configuration import conf

X, y = datasets.make_classification(n_samples=20,
                                    n_features=10,
                                    n_informative=2)
n_folds = 2
n_folds_nested = 3
k_values = [1, 2]
C_values = [1, 2]
pipelines = Methods(*[Pipe(SelectKBest(k=k),
                           Methods(*[SVC(kernel="linear", C=C)
                                     for C in C_values]))
                      for k in k_values])

pipeline = CVBestSearchRefit(pipelines,
                             n_folds=n_folds_nested)

tree_mem = CV(pipeline, n_folds=n_folds,
              reducer=ClassificationReport(keep=False))
# Save Tree
import tempfile
store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
tree_mem.save_tree(store=store)
tree_mem.run(X=X, y=y)
tree_mem.reduce()
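
The snippet above saves the tree before running it; reloading follows the same StoreFs pattern as the persistence tests in Examples #14 and #16. A short follow-up sketch:

# Reload the saved tree from the filesystem store and reduce it again
tree_fs = store.load()
tree_fs.run(X=X, y=y)
print(list(tree_fs.reduce().values())[0])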
Example #28
Xd.PAS2gr[Xd.PAS2gr == 2] = 1
Xd.CB_EXPO[Xd.CB_EXPO == 0] = -1

X = np.asarray(Xd)
y = np.asarray(yd)

C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======

svms = Methods(*[
    SVM(dual=False, class_weight='auto', penalty="l1", C=C) for C in C_values
])

cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
#print cv_results

epac.export_csv(
    cv, cv_results,
    os.path.join(WD, "results", "cv10_caarms+pas+canabis_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================

svms_cv = CVBestSearchRefit(svms, n_folds=10, cv_type="stratified")
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
Example #29
    def test_memmapping(self):
        ## 1) Building dataset
        ## ============================================================
        if self.memmap:
            # If the proc is 1, always generate the matrix
            # Otherwise, load it if it exists, or create it if it doesn't
            writing_mode = (self.n_proc == 1)
            X = create_mmat(self.n_samples,
                            self.n_features,
                            dir=self.directory,
                            writing_mode=writing_mode)
            y = create_array(self.n_samples, [0, 1],
                             dir=self.directory,
                             writing_mode=writing_mode)
            Xy = dict(X=X, y=y)
        else:
            X, y = datasets.make_classification(n_samples=self.n_samples,
                                                n_features=self.n_features,
                                                n_informative=2,
                                                random_state=1)
            Xy = dict(X=X, y=y)
        ## 2) Building workflow
        ## =======================================================
        from sklearn.svm import SVC
        from epac import CV, Methods
        cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                    SVC(kernel="rbf")]),
                          n_folds=3)

        cv_svm = None
        if self.is_swf:
            # Running on the cluster
            from epac import SomaWorkflowEngine
            mmap_mode = None
            if self.memmap:
                mmap_mode = "r+"
            swf_engine = SomaWorkflowEngine(
                cv_svm_local,
                num_processes=self.n_proc,
                resource_id="jl237561@gabriel",
                login="******",
                # remove_finished_wf=False,
                # remove_local_tree=False,
                mmap_mode=mmap_mode,
                queue="Global_long")

            cv_svm = swf_engine.run(**Xy)

            # Printing information about the jobs
            time.sleep(2)
            print('')
            sum_memory = 0
            max_time_cost = 0
            for job_info in swf_engine.engine_info:
                print(
                    "mem_cost = {0}, vmem_cost = {1}, time_cost = {2}".format(
                        job_info.mem_cost, job_info.vmem_cost,
                        job_info.time_cost))
                sum_memory += job_info.mem_cost
                if max_time_cost < job_info.time_cost:
                    max_time_cost = job_info.time_cost
            print("sum_memory = ", sum_memory)
            print("max_time_cost = ", max_time_cost)
        else:
            # Running on the local machine
            from epac import LocalEngine
            local_engine = LocalEngine(cv_svm_local, num_processes=self.n_proc)
            cv_svm = local_engine.run(**Xy)

        cv_svm_reduce = cv_svm.reduce()
        print("\n -> Reducing results")
        print(cv_svm_reduce)

        # Creating the directory to save results, if it doesn't exist
        dirname = 'tmp_save_tree/'
        if self.directory is None:
            directory = '/tmp'
        else:
            directory = self.directory
        if not os.path.isdir(directory):
            os.mkdir(directory)
        dirpath = os.path.join(directory, dirname)
        if not os.path.isdir(dirpath):
            os.mkdir(dirpath)

        if self.n_proc == 1:
            ## 4.1) Saving results on the disk for one process
            ## ===================================================
            store = StoreFs(dirpath=dirpath, clear=True)
            cv_svm.save_tree(store=store)

            with open(os.path.join(directory, "tmp_save_results"), 'w+') \
                    as filename:
                print(filename.name)
                pickle.dump(cv_svm_reduce, filename)

        else:
            ## 4.2) Loading the results for one process
            ## ===================================================
            try:
                store = StoreFs(dirpath=dirpath, clear=False)
                cv_svm_one_proc = store.load()

                with open(os.path.join(directory, "tmp_save_results"), 'r+') \
                        as filename:
                    cv_svm_reduce_one_proc = pickle.load(filename)

                ## 5.2) Comparing results to the results for one process
                ## ===================================================
                print("\nComparing %i proc with one proc" % self.n_proc)
                self.assertTrue(compare_two_node(cv_svm, cv_svm_one_proc))
                self.assertTrue(isequal(cv_svm_reduce, cv_svm_reduce_one_proc))
            except KeyError:
                print("Warning: ")
                print("No previous tree detected, no possible "\
                    "comparison of results")