Example #1
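These examples assume the imports below. This is a minimal sketch inferred from the names used in this listing; the epac import paths match the ones shown further down and may differ across EPAC versions:

# Assumed imports for the examples on this page (sketch; paths may vary by version).
import os
import shutil
import time
import numpy as np
from sklearn import datasets, preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from epac import Methods, Pipe, CV, Perms, CVBestSearchRefit, range_log2
from epac.map_reduce.engine import SomaWorkflowEngine
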
    def get_workflow(self, n_features=int(1E03)):
        """Build a permutation/CV workflow: a nested grid search over
        SelectKBest's k and the SVM's C, wrapped in 10-fold CV and
        10 permutations of y."""
        random_state = 0
        C_values = [1, 10]
        k_max = "auto"
        n_folds_nested = 5
        n_folds = 10
        n_perms = 10
        if k_max != "auto":
            k_values = range_log2(np.minimum(int(k_max), n_features),
                                  add_n=True)
        else:
            k_values = range_log2(n_features, add_n=True)
        cls = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                        for C in C_values
                        for k in k_values])
        pipeline = CVBestSearchRefit(cls,
                                     n_folds=n_folds_nested,
                                     random_state=random_state)
        wf = Perms(CV(pipeline, n_folds=n_folds),
                   n_perms=n_perms,
                   permute="y",
                   random_state=random_state)
        return wf
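
A hypothetical call site for get_workflow: the enclosing class is not shown in this excerpt, so the instance name below is illustrative; the dataset parameters mirror the script at the end of Example #6.

# Hypothetical usage; builder stands for an instance of the (unshown)
# class that defines get_workflow.
X, y = datasets.make_classification(n_samples=100, n_features=int(1E03),
                                    n_informative=2)
wf = builder.get_workflow()
wf.run(X=X, y=y)
print wf.reduce()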
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k),
                      SVC(kernel="linear", C=C))
                      for C in C_values
                      for k in k_values])
    pipeline = CVBestSearchRefit(cls,
                  n_folds=options.n_folds_nested, random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
             n_perms=options.n_perms,
             permute="y",
             random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on local machine
    sfw_engine = SomaWorkflowEngine(
                        tree_root=wf,
                        num_processes=options.n_cores
                        )
    ## Run on cluster
#    sfw_engine = SomaWorkflowEngine(
#                        tree_root=wf,
#                        num_processes=options.n_cores,
#                        resource_id="jl237561@gabriel",
#                        login="******")
    # You can use soma_workflow_gui to track your progress
    wf = sfw_engine.run(X=X, y=y)
    print "Time ellapsed, fit predict:",  time.time() - time_fit_predict
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print wf.reduce()
    print "Time ellapsed, reduce:",   time.time() - time_reduce
Example #3
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features),
                              add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on local machine
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    ## Run on cluster
    #    sfw_engine = SomaWorkflowEngine(
    #                        tree_root=wf,
    #                        num_processes=options.n_cores,
    #                        resource_id="jl237561@gabriel",
    #                        login="******")
    wf = sfw_engine.run(X=X, y=y)
    print "Time ellapsed, fit predict:", time.time() - time_fit_predict
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print wf.reduce()
    print "Time ellapsed, reduce:", time.time() - time_reduce
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max), options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    # print options
    # sys.exit(0)
    if options.trace:
        from epac import conf

        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(
        n_samples=options.n_samples, n_features=options.n_features, n_informative=options.n_informative
    )

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values for k in k_values])
    pipeline = CVBestSearchRefit(cls, n_folds=options.n_folds_nested, random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds), n_perms=options.n_perms, permute="y", random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Export Workflow to soma_workflow_gui
    ## ===============
    time_fit_predict = time.time()
    if os.path.isdir(options.soma_workflow_dir):
        shutil.rmtree(options.soma_workflow_dir)
    sfw_engine = SomaWorkflowEngine(tree_root=wf, num_processes=options.n_cores)
    sfw_engine.export_to_gui(options.soma_workflow_dir, X=X, y=y)

    print "Time ellapsed, fit predict:", time.time() - time_fit_predict

    #    ## 6) Load Epac tree & Reduce
    #    ## ==========================
    reduce_filename = os.path.join(options.soma_workflow_dir, "reduce.py")
    f = open(reduce_filename, "w")
    reduce_str = (
        """from epac.map_reduce.engine import SomaWorkflowEngine
wf = SomaWorkflowEngine.load_from_gui("%s")
print wf.reduce()
"""
        % options.soma_workflow_dir
    )
    f.write(reduce_str)
    f.close()
    print "#First run\n" "soma_workflow_gui\n" "\t(1)Open %s\n" "\t(2)Submit\n" "\t(3)Transfer Input Files\n" "\t...wait...\n" "\t(4)Transfer Output Files\n" "#When done run:\npython %s" % (
        os.path.join(options.soma_workflow_dir, sfw_engine.open_me_by_soma_workflow_gui),
        reduce_filename,
    )
Example #5
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features), add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[Pipe(SelectKBest(k=k),
                         SVC(kernel="linear", C=C))
                    for C in C_values
                    for k in k_values])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    wf.run(X=X, y=y)
    print "Time ellapsed, fit predict:",  time.time() - time_fit_predict
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print wf.reduce()
    print "Time ellapsed, reduce:",   time.time() - time_reduce
Example #6
    def get_workflow(self, n_features=int(1E03)):
        random_state = 0
        C_values = [1, 10]
        k_max = "auto"
        n_folds_nested = 5
        n_folds = 10
        n_perms = 10
        if k_max != "auto":
            k_values = range_log2(np.minimum(int(k_max), n_features),
                                  add_n=True)
        else:
            k_values = range_log2(n_features, add_n=True)
        cls = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                        for C in C_values
                        for k in k_values])
        pipeline = CVBestSearchRefit(cls,
                                     n_folds=n_folds_nested,
                                     random_state=random_state)
        wf = Perms(CV(pipeline, n_folds=n_folds),
                   n_perms=n_perms,
                   permute="y",
                   random_state=random_state)
        return wf
def do_all(options):
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features),
                              add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    #print options
    #sys.exit(0)
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    print "Time ellapsed, tree construction:", time.time() - time_start

    ## 3) Export Workflow to soma_workflow_gui
    ## ===============
    time_fit_predict = time.time()
    if os.path.isdir(options.soma_workflow_dir):
        shutil.rmtree(options.soma_workflow_dir)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    sfw_engine.export_to_gui(options.soma_workflow_dir, X=X, y=y)

    print "Time ellapsed, fit predict:", time.time() - time_fit_predict

    #    ## 6) Load Epac tree & Reduce
    #    ## ==========================
    reduce_filename = os.path.join(options.soma_workflow_dir, "reduce.py")
    f = open(reduce_filename, 'w')
    reduce_str = """from epac.map_reduce.engine import SomaWorkflowEngine
wf = SomaWorkflowEngine.load_from_gui("%s")
print wf.reduce()
""" % options.soma_workflow_dir
    f.write(reduce_str)
    f.close()
    print "#First run\n"\
        "soma_workflow_gui\n"\
        "\t(1)Open %s\n"\
        "\t(2)Submit\n"\
        "\t(3)Transfer Input Files\n"\
        "\t...wait...\n"\
        "\t(4)Transfer Output Files\n"\
        "#When done run:\npython %s" % (
            os.path.join(options.soma_workflow_dir,
                         sfw_engine.open_me_by_soma_workflow_gui),
            reduce_filename)

n_features = int(1E03)
X, y = datasets.make_classification(n_samples=100,
                                    n_features=n_features,
                                    n_informative=2)
random_state = 0
C_values = [1, 10]
k_max = "auto"
n_folds_nested = 5
n_folds = 10
n_perms = 10

if k_max != "auto":
    k_values = range_log2(np.minimum(int(k_max), n_features),
                          add_n=True)
else:
    k_values = range_log2(n_features, add_n=True)

cls = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                for C in C_values
                for k in k_values])
pipeline = CVBestSearchRefitParallel(cls,
                                     n_folds=n_folds_nested,
                                     random_state=random_state)
wf = Perms(CV(pipeline, n_folds=n_folds),
           n_perms=n_perms,
           permute="y",
           random_state=random_state)

# wf.run(X=X, y=y)
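
The run is left commented out above; executing the workflow follows the same run/reduce pattern as Examples #1 and #5 (a minimal sketch):

# Run and reduce, following the pattern of the earlier examples.
wf.run(X=X, y=y)
print wf.reduce()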
Example #9
res_cv_svms_auto = cv.reduce()
print res_cv_svms_auto
print res_cv_svms_auto["CVBestSearchRefit"]['y/test/score_recall']

# Re-fit on all data. Warning: this estimate is biased!
svms_auto.run(X=X, y=y)
print svms_auto.best_params
print svms_auto.refited.estimator.coef_

##############################################################################
# Put everything together
# Pipeline, "Pipe": SelectKBest + StandardScaler + SVM l1 vs l2
from epac import range_log2
from epac import CVBestSearchRefit, Pipe, Methods, CV
k_values = range_log2(X.shape[1], add_n=True)
C_values = [.1, 1, 10, 100]
# Note: SVM here is presumably an alias for sklearn's LinearSVC
# (penalty= and dual= are LinearSVC parameters); the alias import is
# not shown in this excerpt.
anova_svms = Methods(*[Pipe(SelectKBest(k=k),
                            preprocessing.StandardScaler(),
                            Methods(*[SVM(C=C, penalty=penalty,
                                          class_weight='auto', dual=False)
                                      for C in C_values
                                      for penalty in ['l1', 'l2']]))
                       for k in k_values])

# Take a look
print [l for l in anova_svms.walk_leaves()]

## k and C selection based on CV
anova_svms_auto = CVBestSearchRefit(anova_svms)

#anova_svm_all = Methods(anova_svm, anova_svm_cv)
cv = CV(anova_svms_auto, n_folds=n_folds)
time_fit_predict = time.time()
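
The snippet is cut off at this point. Based on the run/reduce pattern used throughout this listing, a plausible continuation is:

# Plausible continuation (assumption): fit/predict over the CV tree,
# then reduce, mirroring the earlier examples.
cv.run(X=X, y=y)
print "Time elapsed, fit predict:", time.time() - time_fit_predict
print cv.reduce()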