Example #1
    def test_cvbestsearchrefit(self):
        X, y = datasets.make_classification(n_samples=12, n_features=10,
                                            n_informative=2)
        n_folds_nested = 2
        #random_state = 0
        C_values = [.1, 0.5, 1, 2, 5]
        kernels = ["linear", "rbf"]
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        # With EPAC
        methods = Methods(*[SVC(C=C, kernel=kernel)
                            for C in C_values for kernel in kernels])
        wf = CVBestSearchRefit(methods, n_folds=n_folds_nested)
        wf.run(X=X, y=y)
        r_epac = wf.reduce().values()[0]
        # - Without EPAC
        r_sklearn = dict()
        clf = SVC(kernel="linear")
        parameters = {'C': C_values, 'kernel': kernels}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u'Diff CVBestSearchRefit: prediction')
        for key_param in r_epac[conf.BEST_PARAMS][0]:
            if key_param in r_sklearn[conf.BEST_PARAMS]:
                comp = (r_sklearn[conf.BEST_PARAMS][key_param] ==
                        r_epac[conf.BEST_PARAMS][0][key_param])
                self.assertTrue(comp,
                                u'Diff CVBestSearchRefit: best parameters')
Example #2
def test_mem():
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10000,
                                        n_informative=2,
                                        random_state=1)
    wf = CVBestSearchRefit(
                Methods(*[SVC(kernel="linear"), SVC(kernel="rbf")]),
                n_folds=10)
    wf.run(X=X, y=y) # Top-down process: computing recognition rates, etc.
    print wf.reduce() # Bottom-up process: computing p-values, etc.
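test_mem runs the whole nested search in-process on a 2000 x 10000 matrix. The same tree can instead be handed to the multi-process engine shown in Example #4; a hedged sketch (num_processes is illustrative, and the import path is the one used in Example #7's generated reduce script):

from epac.map_reduce.engine import SomaWorkflowEngine

sfw_engine = SomaWorkflowEngine(tree_root=wf, num_processes=4)
wf = sfw_engine.run(X=X, y=y)   # distributes the top-down pass
print(wf.reduce())              # the bottom-up reduce still runs in-process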
Example #3
    def get_workflow(self, n_features=int(1E03)):
        random_state = 0
        C_values = [1, 10]
        k_values = 0
        k_max = "auto"
        n_folds_nested = 5
        n_folds = 10
        n_perms = 10
        if k_max != "auto":
            k_values = range_log2(np.minimum(int(k_max), n_features),
                                  add_n=True)
        else:
            k_values = range_log2(n_features, add_n=True)
        cls = Methods(*[
            Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
            for C in C_values for k in k_values
        ])
        pipeline = CVBestSearchRefit(cls,
                                     n_folds=n_folds_nested,
                                     random_state=random_state)
        wf = Perms(CV(pipeline, n_folds=n_folds),
                   n_perms=n_perms,
                   permute="y",
                   random_state=random_state)
        return wf
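get_workflow only builds the tree; nothing is fitted until data is pushed through it. A minimal driver, reusing the dataset recipe from the other examples (sizes are illustrative, and datasets is assumed imported as elsewhere):

X, y = datasets.make_classification(n_samples=100,
                                    n_features=int(1E03),
                                    n_informative=5)
wf = self.get_workflow()
wf.run(X=X, y=y)     # 10 permutations x 10 folds, each with a nested 5-fold search
print(wf.reduce())   # permutation p-values and CV scores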
Example #4
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features),
                              add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on local machine
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    ## Run on cluster
    #    sfw_engine = SomaWorkflowEngine(
    #                        tree_root=wf,
    #                        num_processes=options.n_cores,
    #                        resource_id="jl237561@gabriel",
    #                        login="******")
    wf = sfw_engine.run(X=X, y=y)
    print "Time ellapsed, fit predict:", time.time() - time_fit_predict
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print wf.reduce()
    print "Time ellapsed, reduce:", time.time() - time_reduce
Example #5
    def test_cvbestsearchrefit_select_k_best(self):
        list_C_value = range(2, 10, 1)
#        print repr(list_C_value)
        for C_value in list_C_value:
#            C_value = 2
#            print C_value
            X, y = datasets.make_classification(n_samples=100,
                                                n_features=500,
                                                n_informative=5)
            n_folds_nested = 2
            #random_state = 0
            k_values = [2, 3, 4, 5, 6]
            key_y_pred = 'y' + conf.SEP + conf.PREDICTION
            # With EPAC
            methods = Methods(*[Pipe(SelectKBest(k=k),
                                     SVC(C=C_value, kernel="linear"))
                                for k in k_values])
            wf = CVBestSearchRefit(methods, n_folds=n_folds_nested)
            wf.run(X=X, y=y)
            r_epac = wf.reduce().values()[0]
            # - Without EPAC
            from sklearn.pipeline import Pipeline
            r_sklearn = dict()
            clf = Pipeline([('anova', SelectKBest(k=3)),
                            ('svm', SVC(C=C_value, kernel="linear"))])
            parameters = {'anova__k': k_values}
            cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
            gscv.fit(X, y)
            r_sklearn[key_y_pred] = gscv.predict(X)
            r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
            r_sklearn[conf.BEST_PARAMS]['k'] = \
                r_sklearn[conf.BEST_PARAMS]['anova__k']
            # - Comparisons
            comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
            self.assertTrue(comp, u'Diff CVBestSearchRefit: prediction')
            for key_param in r_epac[conf.BEST_PARAMS][0]:
                if key_param in r_sklearn[conf.BEST_PARAMS]:
                    comp = (r_sklearn[conf.BEST_PARAMS][key_param] ==
                            r_epac[conf.BEST_PARAMS][0][key_param])
                    self.assertTrue(comp,
                                    u'Diff CVBestSearchRefit: best parameters')
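The key-copying lines near the end of the sklearn block exist only because of naming: sklearn's Pipeline prefixes grid keys with the step name ('anova'), while EPAC reports the bare estimator parameter. In isolation (values hypothetical):

best = {'anova__k': 4}        # what GridSearchCV.best_params_ returns for the 'anova' step
best['k'] = best['anova__k']  # aliased so the comparison loop can match EPAC's plain 'k'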
Example #6
    def test_persistence_perm_cv_parmethods_pipe_vs_sklearn(self):
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        X, y = datasets.make_classification(n_samples=12, n_features=10,
                                            n_informative=2)
        n_folds_nested = 2
        #random_state = 0
        C_values = [.1, 0.5, 1, 2, 5]
        kernels = ["linear", "rbf"]
        # With EPAC
        methods = Methods(*[SVC(C=C, kernel=kernel)
                            for C in C_values for kernel in kernels])
        wf = CVBestSearchRefit(methods, n_folds=n_folds_nested)
        # Save workflow
        # -------------
        import tempfile
        #store = StoreFs("/tmp/toto", clear=True)
        store = StoreFs(tempfile.mktemp())
        wf.save_tree(store=store)
        wf = store.load()
        wf.run(X=X, y=y)
        ## Save results
        wf.save_tree(store=store)
        wf = store.load()
        r_epac = wf.reduce().values()[0]

        # - Without EPAC
        r_sklearn = dict()
        clf = SVC(kernel="linear")
        parameters = {'C': C_values, 'kernel': kernels}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        r_sklearn[conf.BEST_PARAMS]['name'] = 'SVC'

        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u'Diff CVBestSearchRefit: prediction')
        comp = np.all([r_epac[conf.BEST_PARAMS][0][p] ==
                       r_sklearn[conf.BEST_PARAMS][p]
                       for p in r_sklearn[conf.BEST_PARAMS]])
        self.assertTrue(comp, u'Diff CVBestSearchRefit: best parameters')
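The save/load round trip in this test is the general EPAC persistence pattern; a condensed sketch (the top-level StoreFs import is an assumption, and note the test above uses the racy tempfile.mktemp(), presumably because StoreFs wants to create the directory itself):

import tempfile
from epac import StoreFs  # import path is an assumption

store = StoreFs(tempfile.mktemp())
wf.save_tree(store=store)  # persist the tree, before or after run()
wf = store.load()          # reload it later, then run() or reduce() as usual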
Example #7
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features),
                              add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Export Workflow to soma_workflow_gui
    ## ======================================
    time_fit_predict = time.time()
    if os.path.isdir(options.soma_workflow_dir):
        shutil.rmtree(options.soma_workflow_dir)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    sfw_engine.export_to_gui(options.soma_workflow_dir, X=X, y=y)

    print "Time ellapsed, fit predict:", time.time() - time_fit_predict

    ## 4) Load Epac tree & Reduce
    ## ==========================
    reduce_filename = os.path.join(options.soma_workflow_dir, "reduce.py")
    f = open(reduce_filename, 'w')
    reduce_str = """from epac.map_reduce.engine import SomaWorkflowEngine
wf = SomaWorkflowEngine.load_from_gui("%s")
print wf.reduce()
""" % options.soma_workflow_dir
    f.write(reduce_str)
    f.close()
    print "#First run\n"\
        "soma_workflow_gui\n"\
        "\t(1)Open %s\n"\
        "\t(2)Submit\n"\
        "\t(3)Transfer Input Files\n"\
        "\t...wait...\n"\
        "\t(4)Transfer Output Files\n"\
        "#When done run:\npython %s" % (
            os.path.join(options.soma_workflow_dir,
                         sfw_engine.open_me_by_soma_workflow_gui),
            reduce_filename)
Example #8
    def todo_perm_cv_grid_vs_sklearn(self):
        X, y = datasets.make_classification(n_samples=100,
                                            n_features=500,
                                            n_informative=5)
        n_perms = 3
        n_folds = 2
        n_folds_nested = 2
        random_state = 0
        k_values = [2, 3]
        C_values = [1, 10]
        # = With EPAC
        pipelines = Methods(*[Pipe(SelectKBest(k=k),
                                   SVC(C=C, kernel="linear"))
                              for C in C_values
                              for k in k_values])
        #print [n for n in pipelines.walk_leaves()]
        pipelines_cv = CVBestSearchRefit(pipelines,
                                         n_folds=n_folds_nested,
                                         random_state=random_state)
        wf = Perms(CV(pipelines_cv, n_folds=n_folds,
                      reducer=ClassificationReport(keep=True)),
                   n_perms=n_perms, permute="y",
                   reducer=PvalPerms(keep=True),
                   random_state=random_state)
        wf.fit_predict(X=X, y=y)
        r_epac = wf.reduce().values()[0]
        for key in r_epac:
            print("key=" + repr(key) + ", value=" + repr(r_epac[key]))

        # = With SKLEARN
        from sklearn.cross_validation import StratifiedKFold
        from epac.sklearn_plugins import Permutations
        from sklearn.pipeline import Pipeline
        from sklearn import grid_search

        clf = Pipeline([('anova', SelectKBest(k=3)),
                        ('svm', SVC(kernel="linear"))])
        parameters = {'anova__k': k_values, 'svm__C': C_values}

        r_sklearn = dict()
        r_sklearn['pred_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['true_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['score_tr'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['score_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['mean_score_te'] = [None] * n_perms
        r_sklearn['mean_score_tr'] = [None] * n_perms

        perm_nb = 0
        perms = Permutations(n=y.shape[0],
                             n_perms=n_perms,
                             random_state=random_state)
        for idx in perms:
            #idx = perms.__iter__().next()
            y_p = y[idx]
            cv = StratifiedKFold(y=y_p, n_folds=n_folds)
            fold_nb = 0
            for idx_train, idx_test in cv:
                #idx_train, idx_test  = cv.__iter__().next()
                X_train = X[idx_train, :]
                X_test = X[idx_test, :]
                y_p_train = y_p[idx_train]
                y_p_test = y_p[idx_test]
                # Nested CV
                cv_nested = StratifiedKFold(y=y_p_train,
                                            n_folds=n_folds_nested)
                gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
                gscv.fit(X_train, y_p_train)
                r_sklearn['pred_te'][perm_nb][fold_nb] = gscv.predict(X_test)
                r_sklearn['true_te'][perm_nb][fold_nb] = y_p_test
                r_sklearn['score_tr'][perm_nb][fold_nb] =\
                    gscv.score(X_train, y_p_train)
                r_sklearn['score_te'][perm_nb][fold_nb] =\
                    gscv.score(X_test, y_p_test)
                fold_nb += 1
            # Average over folds
            r_sklearn['mean_score_te'][perm_nb] = \
                np.mean(np.asarray(r_sklearn['score_te'][perm_nb]), axis=0)
            r_sklearn['mean_score_tr'][perm_nb] = \
                np.mean(np.asarray(r_sklearn['score_tr'][perm_nb]), axis=0)
            perm_nb += 1

        print(repr(r_sklearn))
        # - Comparisons
        shared_keys = set(r_epac.keys()).intersection(set(r_sklearn.keys()))
        comp = {k: np.all(np.asarray(r_epac[k]) == np.asarray(r_sklearn[k]))
                for k in shared_keys}
        print("comp=" + repr(comp))
        #return comp
        for key in comp:
            self.assertTrue(comp[key], u'Diff for attribute: "%s"' % key)
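On the EPAC side the PvalPerms reducer produces the permutation p-values; with the per-permutation mean scores gathered above they can be approximated by hand. A sketch, under the assumption (not verified here) that the first permutation yielded by epac.sklearn_plugins.Permutations is the identity:

scores = np.asarray(r_sklearn['mean_score_te'])
# one-sided permutation p-value: fraction of permuted scores that reach
# or beat the unpermuted score
pval = np.mean(scores[1:] >= scores[0])
print("p ~= %s" % pval)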
Example #9
svms = Methods(*[
    SVM(dual=False, class_weight='auto', penalty="l1", C=C) for C in C_values
])

cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
#print cv_results

epac.export_csv(
    cv, cv_results,
    os.path.join(WD, "results", "cv10_caarms+pas+canabis_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================

svms_cv = CVBestSearchRefit(svms, n_folds=10, cv_type="stratified")
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
print cv_results

#[{'key': CVBestSearchRefit, 'y/test/score_f1': [ 0.82352941  0.7       ], 'y/test/recall_pvalues': [ 0.01086887  0.06790736], 'y/test/score_precision': [ 0.77777778  0.77777778], 'y/test/recall_mean_pvalue': 0.0191572904587, 'y/test/score_recall': [ 0.875       0.63636364], 'y/test/score_accuracy': 0.777777777778, 'y/test/score_recall_mean': 0.755681818182}])

#
# Of the 27 subjects, 11 made the transition and 16 did not.
# - Sensitivity (detection rate of transitions):
#   63.63%, i.e. 7 / 11 (p = 0.067)
#
# - Specificity (detection rate of non-transitions, i.e. 1 - false positives):
#   87.5%, i.e. 14 / 16 (p = 0.01)
#
Example #10
#
#anova_svms = Methods(*[Pipe(SelectKBest(k=k),  # preprocessing.StandardScaler(),
#                            Methods(*[SVM(C=C, penalty=penalty, class_weight='auto', dual=False)
#                                      for C in C_values for penalty in ['l1', 'l2']]))
#                       for k in k_values])


cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
#print cv_results

epac.export_csv(cv, cv_results, os.path.join(WD, "results", "cv10_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================

svms_cv = CVBestSearchRefit(svms, n_folds=10)
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
print cv_results
#[{'key': CVBestSearchRefit, 'y/test/score_f1': [ 0.84848485  0.76190476], 'y/test/recall_pvalues': [ 0.01086887  0.03000108], 'y/test/score_precision': [ 0.82352941  0.8       ], 'y/test/recall_mean_pvalue': 0.00592461228371, 'y/test/score_recall': [ 0.875       0.72727273], 'y/test/score_accuracy': 0.814814814815, 'y/test/score_recall_mean': 0.801136363636}])
#
# Of the 27 subjects, 11 made the transition and 16 did not.
# - Sensitivity (detection rate of transitions):
#   72.72%, i.e. 8 / 11 (p = 0.03)
#
# - Specificity (detection rate of non-transitions, i.e. 1 - false positives):
#   87.5%, i.e. 14 / 16 (p = 0.01)
#
# The mean correct classification rate is 81.4%.
#
Example #11
#    /   \
#  LDA  SVM    Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print(cv.reduce())

# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods       (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)   Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())

# Feature selection combined with SVM and LDA
# CVBestSearchRefit
#                     Methods          (Splitter)
#               /              \
#            KBest(1)         KBest(5) SelectKBest (Estimator)
#              |
#            Methods                   (Splitter)
#        /          \
#    LDA()          SVM() ...          Classifiers (Estimator)
pipelines = Methods(
    *[Pipe(SelectKBest(k=k), Methods(LDA(), SVM())) for k in [1, 5]])
print([n for n in pipelines.walk_leaves()])
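The comment tree above puts CVBestSearchRefit at the root, but the snippet stops after enumerating the leaves; a minimal completion, as Example #12 below also suggests (a sketch):

best_cv = CVBestSearchRefit(pipelines)
best_cv.run(X=X, y=y)
print(best_cv.reduce())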
Example #12
#  LDA  SVM    Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print cv.reduce()


# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods       (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)   Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print wf.reduce()

# Feature selection combined with SVM and LDA
# CVBestSearchRefit
#                     Methods          (Splitter)
#               /              \
#            KBest(1)         KBest(5) SelectKBest (Estimator)
#              |
#            Methods                   (Splitter)
#        /          \
#    LDA()          SVM() ...          Classifiers (Estimator)
pipelines = Methods(
    *[Pipe(SelectKBest(k=k), Methods(LDA(), SVM())) for k in [1, 5]])
print [n for n in pipelines.walk_leaves()]
best_cv = CVBestSearchRefit(pipelines)
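best_cv is constructed here but never executed in this snippet; presumably the intended continuation mirrors the wf calls above (a sketch, in this example's Python 2 style):

best_cv.run(X=X, y=y)
print best_cv.reduce()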
Example #13
print svms.children[0]
svms.children[0].estimator.coef_
print svms.children[1]
svms.children[1].estimator.coef_

print "Weights given by SVMs"
d = dict(var=imaging_variables,
         svm_weights_l1=svms.children[0].estimator.coef_.ravel(),
         svm_weights_l2=svms.children[1].estimator.coef_.ravel())
print pd.DataFrame(d).to_string()

##############################################################################
# Automatic model selection: "CVBestSearchRefit"
from epac import CVBestSearchRefit, Methods, CV

svms_auto = CVBestSearchRefit(svms)
cv = CV(svms_auto, n_folds=n_folds)
cv.run(X=X, y=y)
#
res_cv_svms_auto = cv.reduce()
print res_cv_svms_auto
print res_cv_svms_auto["CVBestSearchRefit"]['y/test/score_recall']

# Re-fit on all data. Warning: biased !!!
svms_auto.run(X=X, y=y)
print svms_auto.best_params
print svms_auto.refited.estimator.coef_

##############################################################################
# Put everything together
# Pipeline, "Pipe": SelectKBest + StandardScaler + SVM l1 vs l2