Пример #1
0
    def test_cvbestsearchrefit_select_k_best(self):
        """Compare EPAC's SelectKBest + SVC search with sklearn's GridSearchCV.

        For each SVC ``C`` in 2..9 a fresh random classification problem
        (100 samples, 500 features, 5 informative) is generated and the
        number of selected features ``k`` is tuned twice: once with
        ``CVBestSearchRefitParallel`` over EPAC ``Pipe`` objects, once with
        a sklearn ``Pipeline`` inside ``GridSearchCV``.  The predictions on
        ``X`` must be identical, and every best parameter reported by both
        sides must have the same value.
        """
        # Loop-invariant setup (including the import) is done once, up front.
        from sklearn.pipeline import Pipeline

        n_folds_nested = 2
        k_values = [2, 3, 4, 5, 6]
        key_y_pred = "y" + conf.SEP + conf.PREDICTION
        for C_value in range(2, 10):
            # New random data on every iteration; no seed is fixed here.
            X, y = datasets.make_classification(n_samples=100, n_features=500, n_informative=5)
            # With EPAC
            methods = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C_value, kernel="linear")) for k in k_values])
            wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
            wf.run(X=X, y=y)
            r_epac = wf.reduce().values()[0]
            # - Without EPAC
            r_sklearn = dict()
            # k=3 is only a placeholder; the grid search overrides it.
            clf = Pipeline([("anova", SelectKBest(k=3)), ("svm", SVC(C=C_value, kernel="linear"))])
            parameters = {"anova__k": k_values}
            cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
            gscv.fit(X, y)
            r_sklearn[key_y_pred] = gscv.predict(X)
            r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
            # Alias the sklearn name under "k" so it can be matched below
            # (presumably EPAC reports the parameter as "k" -- TODO confirm).
            r_sklearn[conf.BEST_PARAMS]["k"] = r_sklearn[conf.BEST_PARAMS]["anova__k"]
            # - Comparisons
            comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
            self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: prediction")
            # Only parameters present on both sides are compared.
            for key_param in r_epac[conf.BEST_PARAMS][0]:
                if key_param in r_sklearn[conf.BEST_PARAMS]:
                    comp = r_sklearn[conf.BEST_PARAMS][key_param] == r_epac[conf.BEST_PARAMS][0][key_param]
                    self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: best parameters")
Пример #2
0
 def test_cvbestsearchrefit(self):
     """Check CVBestSearchRefitParallel against sklearn's GridSearchCV.

     Both approaches search the same grid of SVC ``C`` values and kernels
     on one randomly generated classification problem (12 samples,
     10 features).  The test asserts that the predictions on ``X`` agree
     and that every best parameter reported by both sides is equal.
     """
     X, y = datasets.make_classification(
         n_samples=12, n_features=10, n_informative=2)
     nested_fold_count = 2
     C_grid = [.1, 0.5, 1, 2, 5]
     kernel_grid = ["linear", "rbf"]
     pred_key = 'y' + conf.SEP + conf.PREDICTION

     # With EPAC: one SVC per (C, kernel) pair, C varying slowest.
     svc_candidates = []
     for C in C_grid:
         for kernel in kernel_grid:
             svc_candidates.append(SVC(C=C, kernel=kernel))
     wf = CVBestSearchRefitParallel(Methods(*svc_candidates),
                                    n_folds=nested_fold_count)
     wf.run(X=X, y=y)
     r_epac = wf.reduce().values()[0]

     # - Without EPAC: the same grid through sklearn.
     gscv = grid_search.GridSearchCV(
         SVC(kernel="linear"),
         {'C': C_grid, 'kernel': kernel_grid},
         cv=StratifiedKFold(y=y, n_folds=nested_fold_count))
     gscv.fit(X, y)
     r_sklearn = {pred_key: gscv.predict(X),
                  conf.BEST_PARAMS: gscv.best_params_}

     # - Comparisons
     same_pred = np.all(r_epac[pred_key] == r_sklearn[pred_key])
     self.assertTrue(same_pred,
                     u'Diff CVBestSearchRefitParallel: prediction')
     epac_best = r_epac[conf.BEST_PARAMS][0]
     sklearn_best = r_sklearn[conf.BEST_PARAMS]
     for name in epac_best:
         # Parameters known to only one side are skipped.
         if name not in sklearn_best:
             continue
         self.assertTrue(
             sklearn_best[name] == epac_best[name],
             u'Diff CVBestSearchRefitParallel: best parameters')
Пример #3
0
 def test_cvbestsearchrefit(self):
     """Verify that EPAC and sklearn pick the same SVC on a small problem.

     A grid of ``C`` values and kernels is searched with
     ``CVBestSearchRefitParallel`` and with ``GridSearchCV`` using two
     nested folds; predictions and shared best parameters must match.
     """
     X, y = datasets.make_classification(
         n_samples=12,
         n_features=10,
         n_informative=2,
     )
     inner_folds = 2
     grid_C = [0.1, 0.5, 1, 2, 5]
     grid_kernel = ["linear", "rbf"]
     y_pred_key = "y" + conf.SEP + conf.PREDICTION

     # With EPAC
     estimators = [
         SVC(C=C, kernel=kernel)
         for C in grid_C
         for kernel in grid_kernel
     ]
     epac_wf = CVBestSearchRefitParallel(Methods(*estimators), n_folds=inner_folds)
     epac_wf.run(X=X, y=y)
     r_epac = epac_wf.reduce().values()[0]

     # - Without EPAC
     base_svc = SVC(kernel="linear")
     search_space = {"C": grid_C, "kernel": grid_kernel}
     folds = StratifiedKFold(y=y, n_folds=inner_folds)
     searcher = grid_search.GridSearchCV(base_svc, search_space, cv=folds)
     searcher.fit(X, y)
     r_sklearn = dict()
     r_sklearn[y_pred_key] = searcher.predict(X)
     r_sklearn[conf.BEST_PARAMS] = searcher.best_params_

     # - Comparisons
     self.assertTrue(
         np.all(r_epac[y_pred_key] == r_sklearn[y_pred_key]),
         u"Diff CVBestSearchRefitParallel: prediction",
     )
     best_epac = r_epac[conf.BEST_PARAMS][0]
     best_sklearn = r_sklearn[conf.BEST_PARAMS]
     # Compare only the parameters that both result sets contain.
     for shared in [p for p in best_epac if p in best_sklearn]:
         self.assertTrue(
             best_sklearn[shared] == best_epac[shared],
             u"Diff CVBestSearchRefitParallel: best parameters",
         )
Пример #4
0
    def test_cvbestsearchrefit_select_k_best(self):
        """Check feature-count selection: EPAC versus sklearn.

        The SVC ``C`` value runs over 2..9.  For each value, random data
        (100 samples, 500 features, 5 informative) is drawn, and the best
        ``k`` for ``SelectKBest`` is searched with
        ``CVBestSearchRefitParallel`` and with sklearn's ``GridSearchCV``
        on an equivalent ``Pipeline``.  Both must give identical
        predictions on ``X`` and equal values for every best parameter
        they have in common.
        """
        # Imported once per call instead of once per loop iteration.
        from sklearn.pipeline import Pipeline

        # Settings shared by every iteration.
        n_folds_nested = 2
        k_values = [2, 3, 4, 5, 6]
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION

        for C_value in range(2, 10):
            # Data is regenerated for each C; no random seed is set.
            X, y = datasets.make_classification(n_samples=100,
                                                n_features=500,
                                                n_informative=5)
            # With EPAC
            methods = Methods(*[Pipe(SelectKBest(k=k),
                                     SVC(C=C_value, kernel="linear"))
                                for k in k_values])
            wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
            wf.run(X=X, y=y)
            r_epac = wf.reduce().values()[0]
            # - Without EPAC
            r_sklearn = dict()
            # The initial k=3 is replaced by the values in ``parameters``.
            clf = Pipeline([('anova', SelectKBest(k=3)),
                            ('svm', SVC(C=C_value, kernel="linear"))])
            parameters = {'anova__k': k_values}
            cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
            gscv.fit(X, y)
            r_sklearn[key_y_pred] = gscv.predict(X)
            r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
            # Expose 'anova__k' as 'k' as well so the loop below can match
            # it (presumably the key EPAC uses -- TODO confirm).
            r_sklearn[conf.BEST_PARAMS]['k'] = \
                r_sklearn[conf.BEST_PARAMS]['anova__k']
            # - Comparisons
            comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
            self.assertTrue(comp,
                            u'Diff CVBestSearchRefitParallel: prediction')
            # Keys missing from the sklearn result are ignored.
            for key_param in r_epac[conf.BEST_PARAMS][0]:
                if key_param in r_sklearn[conf.BEST_PARAMS]:
                    comp = r_sklearn[conf.BEST_PARAMS][key_param] == \
                        r_epac[conf.BEST_PARAMS][0][key_param]
                    self.assertTrue(
                        comp,
                        u'Diff CVBestSearchRefitParallel: best parameters')
Пример #5
0
    def test_peristence_perm_cv_parmethods_pipe_vs_sklearn(self):
        """Round-trip a workflow through StoreFs and compare with sklearn.

        A ``CVBestSearchRefitParallel`` over a grid of SVCs is saved before
        running, reloaded, run, saved again with its results and reloaded
        once more.  The reduced result of that final tree must give the
        same predictions and the same best parameters (including
        ``name == 'SVC'``) as sklearn's ``GridSearchCV`` on the same data.
        """
        import tempfile

        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        X, y = datasets.make_classification(n_samples=12,
                                            n_features=10,
                                            n_informative=2)
        n_folds_nested = 2
        C_values = [.1, 0.5, 1, 2, 5]
        kernels = ["linear", "rbf"]
        # With EPAC
        methods = Methods(
            *[SVC(C=C, kernel=kernel) for C in C_values for kernel in kernels])
        wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
        # Save workflow
        # -------------
        # mkdtemp() creates a private directory atomically; the deprecated
        # mktemp() only returns a name that another process could claim
        # first.  Same construction as test_peristence_load_and_fit_predict.
        store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
        wf.save_tree(store=store)
        wf = store.load()
        wf.run(X=X, y=y)
        # Save results
        wf.save_tree(store=store)
        wf = store.load()
        r_epac = wf.reduce().values()[0]

        # - Without EPAC
        r_sklearn = dict()
        clf = SVC(kernel="linear")
        parameters = {'C': C_values, 'kernel': kernels}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        # The EPAC best-params entry is also expected to carry the
        # estimator name, so add it to the reference for comparison.
        r_sklearn[conf.BEST_PARAMS]['name'] = 'SVC'

        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u'Diff CVBestSearchRefitParallel: prediction')
        # Every reference parameter must be present and equal on the EPAC side.
        comp = np.all([
            r_epac[conf.BEST_PARAMS][0][p] == r_sklearn[conf.BEST_PARAMS][p]
            for p in r_sklearn[conf.BEST_PARAMS]
        ])
        self.assertTrue(comp,
                        u'Diff CVBestSearchRefitParallel: best parameters')
    def test_cv_best_search_refit_parallel(self):
        """Compare a SomaWorkflowEngine run with a plain in-process run.

        Two structurally identical CV(CVBestSearchRefitParallel(...))
        workflows are built; one is executed through ``SomaWorkflowEngine``
        on two processes, the other via ``run``.  The resulting trees and
        their reduced results must compare equal.
        """
        n_cores = 2
        n_samples, n_features = 500, 10000
        k_values = [1, 2]
        C_values = [1, 2]
        n_folds, n_folds_nested = 2, 3

        def build_workflow():
            # Builds a fresh, independent tree: for each k, SelectKBest
            # feeding a Methods of linear SVCs, wrapped in nested CV.
            pipes = []
            for k in k_values:
                selector = SelectKBest(k=k)
                svms = Methods(*[SVC(kernel="linear", C=C) for C in C_values])
                pipes.append(Pipe(selector, svms))
            searcher = CVBestSearchRefitParallel(Methods(*pipes),
                                                 n_folds=n_folds_nested)
            return CV(searcher, n_folds=n_folds)

        X, y = datasets.make_classification(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=5)

        # epac workflow for parallel computing
        wf = build_workflow()
        engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=n_cores,
                                    remove_finished_wf=False,
                                    remove_local_tree=False)
        sfw_engine_wf = engine.run(X=X, y=y)

        # epac workflow for normal node computing
        wf2 = build_workflow()
        wf2.run(X=X, y=y)

        self.assertTrue(compare_two_node(sfw_engine_wf, wf2))
        self.assertTrue(comp_2wf_reduce_res(sfw_engine_wf, wf2))
Пример #7
0
 def get_workflow(self):
     """Build and return the nested cross-validation workflow.

     The tree is ``CV`` (2 folds) around ``CVBestSearchRefitParallel``
     (3 folds) around one ``Pipe`` per ``k`` in ``[1, 2]``; each pipe
     chains ``SelectKBest(k)`` with linear SVCs for ``C`` in ``[1, 2]``.
     """
     outer_folds, inner_folds = 2, 3
     candidate_k = [1, 2]
     candidate_C = [1, 2]

     pipes = []
     for k in candidate_k:
         selector = SelectKBest(k=k)
         svms = Methods(*[SVC(kernel="linear", C=C) for C in candidate_C])
         pipes.append(Pipe(selector, svms))

     best_search = CVBestSearchRefitParallel(Methods(*pipes),
                                             n_folds=inner_folds)
     return CV(best_search, n_folds=outer_folds)
Пример #8
0
    def test_peristence_perm_cv_parmethods_pipe_vs_sklearn(self):
        """Persist, reload and run a best-search workflow, then check it
        against sklearn.

        Sequence: save the untrained ``CVBestSearchRefitParallel`` tree to
        a ``StoreFs``, load it back, run it, save it with results, load it
        again and reduce.  Predictions and best parameters (plus the
        ``'name'`` entry set to ``'SVC'``) must equal those obtained from
        ``GridSearchCV`` on the same data and grid.
        """
        import tempfile

        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        X, y = datasets.make_classification(n_samples=12, n_features=10,
                                            n_informative=2)
        n_folds_nested = 2
        C_values = [.1, 0.5, 1, 2, 5]
        kernels = ["linear", "rbf"]
        # With EPAC
        methods = Methods(*[SVC(C=C, kernel=kernel)
                          for C in C_values for kernel in kernels])
        wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
        # Save workflow
        # -------------
        # tempfile.mktemp() is deprecated: it only returns a file name, so
        # another process may create that path first.  mkdtemp() creates
        # the directory securely; this mirrors the StoreFs call used by
        # test_peristence_load_and_fit_predict.
        store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
        wf.save_tree(store=store)
        wf = store.load()
        wf.run(X=X, y=y)
        # Save results
        wf.save_tree(store=store)
        wf = store.load()
        r_epac = wf.reduce().values()[0]

        # - Without EPAC
        r_sklearn = dict()
        clf = SVC(kernel="linear")
        parameters = {'C': C_values, 'kernel': kernels}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        # Add the estimator name so it is compared along with C and kernel.
        r_sklearn[conf.BEST_PARAMS]['name'] = 'SVC'

        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u'Diff CVBestSearchRefitParallel: prediction')
        # All keys of the sklearn reference are looked up in the EPAC result.
        comp = np.all([r_epac[conf.BEST_PARAMS][0][p] ==
                       r_sklearn[conf.BEST_PARAMS][p]
                       for p in r_sklearn[conf.BEST_PARAMS]])
        self.assertTrue(comp,
                        u'Diff CVBestSearchRefitParallel: best parameters')
Пример #9
0
    def test_peristence_load_and_fit_predict(self):
        """Check that results survive saving and reloading the tree.

        Three reduced results are compared key by key: the in-memory tree
        after running, a tree reloaded from the store (saved before any
        run) and then run, and that same tree saved with results and
        reloaded without running again.
        """
        import tempfile

        def first_reduced(tree):
            # The reduced output is consumed through its first value.
            return tree.reduce().values()[0]

        def identical(left, right):
            # Element-wise equality of two array-like results.
            return np.all(np.asarray(left) == np.asarray(right))

        X, y = datasets.make_classification(n_samples=20,
                                            n_features=10,
                                            n_informative=2)
        n_folds, n_folds_nested = 2, 3
        k_values = [1, 2]
        C_values = [1, 2]

        pipes = []
        for k in k_values:
            selector = SelectKBest(k=k)
            svms = Methods(*[SVC(kernel="linear", C=C) for C in C_values])
            pipes.append(Pipe(selector, svms))
        searcher = CVBestSearchRefitParallel(Methods(*pipes),
                                             n_folds=n_folds_nested)
        tree_mem = CV(searcher,
                      n_folds=n_folds,
                      reducer=ClassificationReport(keep=False))

        # Save the tree before it has been run, then run it in memory.
        store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
        tree_mem.save_tree(store=store)
        tree_mem.run(X=X, y=y)
        res_mem = first_reduced(tree_mem)

        # Reload the result-less tree and run it.
        tree_fs_noresults = store.load()
        tree_fs_noresults.run(X=X, y=y)
        res_fs_noresults = first_reduced(tree_fs_noresults)

        # Save with results, reload, and reduce without re-running.
        tree_fs_noresults.save_tree(store=store)
        tree_fs_withresults = store.load()
        res_fs_withresults = first_reduced(tree_fs_withresults)

        # Compare: every key of the in-memory result must match in all three.
        per_key = [
            identical(res_mem[key], res_fs_noresults[key])
            and identical(res_fs_noresults[key], res_fs_withresults[key])
            for key in res_mem
        ]
        self.assertTrue(np.all(per_key))
Пример #10
0
 def get_workflow(self, n_features=int(1E03), k_max="auto"):
     """Build the permutation / cross-validation / best-search workflow.

     Parameters
     ----------
     n_features : int
         Number of input features; upper bound for the candidate ``k``
         values given to ``SelectKBest``.
     k_max : "auto" or value accepted by ``int()``
         With ``"auto"`` (the default, matching the previous hard-coded
         behaviour) candidates are derived from ``n_features``.
         Otherwise they are derived from ``min(int(k_max), n_features)``.

     Returns
     -------
     ``Perms`` node (10 permutations of ``y``) wrapping a 10-fold ``CV``
     of a 5-fold ``CVBestSearchRefitParallel`` over every
     ``SelectKBest(k)`` + linear ``SVC(C)`` pipe, with ``C`` in
     ``[1, 10]``.
     """
     random_state = 0
     C_values = [1, 10]
     n_folds_nested = 5
     n_folds = 10
     n_perms = 10
     # range_log2 is defined elsewhere; presumably it yields log2-spaced
     # values up to its argument, with add_n=True including the argument
     # itself -- TODO confirm.
     if k_max != "auto":
         k_values = range_log2(np.minimum(int(k_max), n_features),
                               add_n=True)
     else:
         k_values = range_log2(n_features, add_n=True)
     # One pipe per (C, k) pair, C varying slowest.
     cls = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear"))
                     for C in C_values
                     for k in k_values])
     pipeline = CVBestSearchRefitParallel(cls,
                                          n_folds=n_folds_nested,
                                          random_state=random_state)
     wf = Perms(CV(pipeline, n_folds=n_folds),
                n_perms=n_perms,
                permute="y",
                random_state=random_state)
     return wf