Пример #1
0
def func_memm_local():
    """Run a 3-fold CV of a linear and an RBF SVC on memmap-converted data.

    Progress markers ("memm_local pt1" .. "memm_local pt5") are printed
    between the steps, and the reduced CV result is printed before the last
    marker.  The function takes no arguments and returns nothing.
    """
    # print(...) with a single argument behaves the same on Python 2 and
    # Python 3, whereas the bare print statement is a SyntaxError on Python 3.
    print("memm_local pt1")
    ## 1) Build a dataset and convert to np.memmap (for big matrix)
    ## ============================================================
    X, y = datasets.make_classification(n_samples=500,
                                        n_features=5000,
                                        n_informative=2,
                                        random_state=1)
    print("memm_local pt2")
    X = convert2memmap(X)
    y = convert2memmap(y)
    Xy = dict(X=X, y=y)
    ## 2) Build two workflows respectively
    ## =======================================================
    print("memm_local pt3")
    from sklearn.svm import SVC
    from epac import CV, Methods
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    print("memm_local pt4")
    # Alternative kept from the original author (multi-process run):
    #    from epac import LocalEngine
    #    local_engine = LocalEngine(cv_svm_local, num_processes=2)
    #    cv_svm = local_engine.run(**Xy)
    cv_svm_local.run(**Xy)
    print(cv_svm_local.reduce())
    print("memm_local pt5")
Пример #2
0
    def test_cv(self):
        """Check an EPAC CV workflow against a hand-written sklearn CV loop.

        The same linear SVC is cross-validated once through
        ``CV(...).top_down`` and once through an explicit ``StratifiedKFold``
        loop; the test predictions must agree fold by fold, and the reduced
        EPAC predictions must equal the collected sklearn predictions.
        """
        X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
        n_folds = 2

        # = With EPAC
        wf = CV(SVC(kernel="linear"), n_folds=n_folds, reducer=ClassificationReport(keep=True))
        r_epac = wf.top_down(X=X, y=y)

        # = With SKLEARN
        clf = SVC(kernel="linear")
        r_sklearn = list()
        for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            # y is a one-dimensional label vector, so it takes one subscript
            y_train = y[idx_train]
            clf.fit(X_train, y_train)
            r_sklearn.append(clf.predict(X_test))

        # = Comparison
        key2cmp = "y" + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
        for icv in range(n_folds):
            # Compare fold icv on both sides (not fold 0 on every iteration)
            comp = np.all(np.asarray(r_epac[icv][key2cmp]) == np.asarray(r_sklearn[icv]))
            self.assertTrue(comp, u"Diff CV: EPAC vs sklearn")

        # test reduce
        # list(...) keeps the [0] lookup valid when values() returns a view
        r_epac_reduce = list(wf.reduce().values())[0][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
        self.assertTrue(comp, u"Diff CV: EPAC reduce")
Пример #3
0
def func_memm_local():
    """Cross-validate two SVCs (linear and RBF kernels) on memmap-converted data.

    Prints the markers "memm_local pt1" .. "memm_local pt5" between the
    steps and prints the reduced result of the 3-fold CV workflow.
    Takes no arguments and returns nothing.
    """
    print("memm_local pt1")

    # Step 1: synthetic classification data, passed through convert2memmap
    # (per the original note: np.memmap, intended for big matrices).
    X, y = datasets.make_classification(
        n_samples=500, n_features=5000, n_informative=2, random_state=1)
    print("memm_local pt2")
    Xy = {"X": convert2memmap(X), "y": convert2memmap(y)}

    # Step 2: one CV workflow wrapping both classifiers.
    print("memm_local pt3")
    from sklearn.svm import SVC
    from epac import CV, Methods
    classifiers = [SVC(kernel=kernel_name) for kernel_name in ("linear", "rbf")]
    cv_svm_local = CV(Methods(*classifiers), n_folds=3)
    print("memm_local pt4")

    # A LocalEngine(cv_svm_local, num_processes=2).run(**Xy) variant was
    # sketched here by the original author; the workflow is run directly.
    cv_svm_local.run(**Xy)
    print(cv_svm_local.reduce())
    print("memm_local pt5")
Пример #4
0
def test_mem():
    """Run a 10-fold CV of a linear and an RBF SVC on a large synthetic matrix.

    Builds a 2000 x 10000 classification dataset, runs the CV workflow on it
    and prints the reduced result.  Takes no arguments and returns nothing.
    """
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10000,
                                        n_informative=2,
                                        random_state=1)
    # Size notes left by the original author for this X:
    #   pickle.dump(X, f)              =>> 474 MB
    #   np.savez(path, dict(X=X))      ===> 160 MB

    cv_svm = CV(Methods(*[SVC(kernel="linear"), SVC(kernel="rbf")]),
                n_folds=10)
    cv_svm.run(X=X, y=y)  # Top-down process: computing recognition rates, etc.
    # Multi-process alternative kept from the original author:
    # local_engine = LocalEngine(cv_svm, num_processes=2)
    # cv_svm = local_engine.run(X=X, y=y)
    # print(...) with one argument is valid on Python 2 and Python 3 alike.
    print(cv_svm.reduce())  # Bottom-up process: computing p-values, etc.
Пример #5
0
    def test_cv(self):
        """Check an EPAC CV workflow against a hand-written sklearn CV loop.

        The same linear SVC is cross-validated once through
        ``CV(...).top_down`` and once through an explicit ``StratifiedKFold``
        loop; the test predictions must agree fold by fold, and the reduced
        EPAC predictions must equal the collected sklearn predictions.
        """
        X, y = datasets.make_classification(n_samples=20, n_features=5,
                                            n_informative=2)
        n_folds = 2

        # = With EPAC
        wf = CV(SVC(kernel="linear"), n_folds=n_folds,
                reducer=ClassificationReport(keep=True))
        r_epac = wf.top_down(X=X, y=y)

        # = With SKLEARN
        clf = SVC(kernel="linear")
        r_sklearn = list()
        for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            # y is a one-dimensional label vector, so it takes one subscript
            y_train = y[idx_train]
            clf.fit(X_train, y_train)
            r_sklearn.append(clf.predict(X_test))

        # = Comparison
        key2cmp = 'y' + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
        for icv in range(n_folds):
            # Compare fold icv on both sides (not fold 0 on every iteration)
            comp = np.all(np.asarray(r_epac[icv][key2cmp])
                          == np.asarray(r_sklearn[icv]))
            self.assertTrue(comp, u'Diff CV: EPAC vs sklearn')

        # test reduce
        # list(...) keeps the [0] lookup valid when values() returns a view
        r_epac_reduce = list(wf.reduce().values())[0][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
        self.assertTrue(comp, u'Diff CV: EPAC reduce')
Пример #6
0
    def test_peristence_load_and_fit_predict(self):
        """Check that a workflow saved to and reloaded from a StoreFs agrees.

        Three reduced results are compared key by key: the in-memory tree,
        the tree reloaded without results and then run, and the tree saved
        with results and reloaded again.
        """
        import shutil
        import tempfile

        X, y = datasets.make_classification(n_samples=20, n_features=10,
                                            n_informative=2)
        n_folds = 2
        n_folds_nested = 3
        k_values = [1, 2]
        C_values = [1, 2]
        pipelines = Methods(*[
            Pipe(SelectKBest(k=k),
                 Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
            for k in k_values])

        pipeline = CVBestSearchRefit(pipelines,
                                     n_folds=n_folds_nested)

        tree_mem = CV(pipeline, n_folds=n_folds,
                      reducer=ClassificationReport(keep=False))
        # Save Tree
        dirpath = tempfile.mkdtemp()
        # mkdtemp() leaves deletion to the caller: remove the directory once
        # the test has finished, whether it passed or failed.
        self.addCleanup(shutil.rmtree, dirpath, ignore_errors=True)
        store = StoreFs(dirpath=dirpath, clear=True)
        tree_mem.save_tree(store=store)
        tree_mem.run(X=X, y=y)
        # list(...) keeps the [0] lookup valid when values() returns a view
        res_mem = list(tree_mem.reduce().values())[0]
        # Reload Tree
        tree_fs_noresults = store.load()
        tree_fs_noresults.run(X=X, y=y)
        res_fs_noresults = list(tree_fs_noresults.reduce().values())[0]
        # Save with results
        tree_fs_noresults.save_tree(store=store)
        tree_fs_withresults = store.load()
        res_fs_withresults = list(tree_fs_withresults.reduce().values())[0]
        # Compare: every key must agree across the three result sets
        comp = np.all([
            np.all(
                np.asarray(res_mem[k]) == np.asarray(res_fs_noresults[k]))
            and
            np.all(np.asarray(res_fs_noresults[k]) ==
                   np.asarray(res_fs_withresults[k]))
            for k in res_mem])
        self.assertTrue(comp)
Пример #7
0
    def test_peristence_load_and_fit_predict(self):
        """Check that a workflow saved to and reloaded from a StoreFs agrees.

        Three reduced results are compared key by key: the in-memory tree,
        the tree reloaded without results and then run, and the tree saved
        with results and reloaded again.
        """
        import shutil
        import tempfile

        X, y = datasets.make_classification(n_samples=20,
                                            n_features=10,
                                            n_informative=2)
        n_folds = 2
        n_folds_nested = 3
        k_values = [1, 2]
        C_values = [1, 2]
        pipelines = Methods(*[
            Pipe(SelectKBest(
                k=k), Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
            for k in k_values
        ])

        pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)

        tree_mem = CV(pipeline,
                      n_folds=n_folds,
                      reducer=ClassificationReport(keep=False))
        # Save Tree
        dirpath = tempfile.mkdtemp()
        # mkdtemp() leaves deletion to the caller: remove the directory once
        # the test has finished, whether it passed or failed.
        self.addCleanup(shutil.rmtree, dirpath, ignore_errors=True)
        store = StoreFs(dirpath=dirpath, clear=True)
        tree_mem.save_tree(store=store)
        tree_mem.run(X=X, y=y)
        # list(...) keeps the [0] lookup valid when values() returns a view
        res_mem = list(tree_mem.reduce().values())[0]
        # Reload Tree
        tree_fs_noresults = store.load()
        tree_fs_noresults.run(X=X, y=y)
        res_fs_noresults = list(tree_fs_noresults.reduce().values())[0]
        # Save with results
        tree_fs_noresults.save_tree(store=store)
        tree_fs_withresults = store.load()
        res_fs_withresults = list(tree_fs_withresults.reduce().values())[0]
        # Compare: every key must agree across the three result sets
        comp = np.all([
            np.all(np.asarray(res_mem[k]) == np.asarray(res_fs_noresults[k]))
            and np.all(
                np.asarray(res_fs_noresults[k]) == np.asarray(
                    res_fs_withresults[k])) for k in res_mem
        ])
        self.assertTrue(comp)
Пример #8
0
# in the result with key "y/true"
class MySVM:
    """Small fit/predict wrapper around sklearn's SVC with a configurable C."""

    def __init__(self, C=1.0):
        # C is stored as given and handed to SVC when fit() is called
        self.C = C

    def fit(self, X, y):
        """Create SVC(C=self.C), keep it on ``self.svc`` and fit it on X, y."""
        # sklearn is imported only when a model is actually fitted
        from sklearn import svm
        self.svc = svm.SVC(C=self.C)
        self.svc.fit(X, y)

    def predict(self, X):
        """Return what the SVC stored by fit() predicts for X."""
        predictions = self.svc.predict(X)
        return predictions

# Cross-validate the two MySVM variants (C=1.0 and C=2.0) in 2 folds,
# with cv_type="stratified" and cv_key="y"; no reducer is attached.
svms = Methods(*(MySVM(C=c_value) for c_value in (1.0, 2.0)))
cv = CV(svms, n_folds=2, cv_type="stratified", cv_key="y", reducer=None)
cv.run(X=X, y=y)  # top-down process to call transform
cv.reduce()       # bottom-up process

from sklearn.decomposition import PCA
class MyPCA(PCA):
    """PCA extended with a ``predict`` method."""

    def predict(self, X):
        """Project X onto the PCs, then project back to the original space.

        Computes ``transform(X) . components_ + mean_``.  Original author's
        note: if X is not singular, ``self.fit(X).predict(X) == X``.
        """
        projected = self.transform(X)
        return np.dot(projected, self.components_) + self.mean_

# Cross-validate the two MyPCA variants (1 and 2 components) in 2 folds;
# no reducer is attached.
pcas = Methods(*(MyPCA(n_components=n_comp) for n_comp in (1, 2)))
cv = CV(pcas, reducer=None, n_folds=2)
cv.run(X=X, y=y)  # top-down process to call transform
cv.reduce()       # bottom-up process

Пример #9
0
# Features and labels as arrays (Xd and yd are defined outside this excerpt)
X = np.asarray(Xd)
y = np.asarray(yd)

# Values of C tried for the L1-penalised SVM
C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======

svms = Methods(*[
    SVM(dual=False, class_weight='auto', penalty="l1", C=C) for C in C_values
])

cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()

# Write the per-C results of the 10-fold CV to a CSV file under WD/results
epac.export_csv(
    cv, cv_results,
    os.path.join(WD, "results", "cv10_caarms+pas+canabis_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================

svms_cv = CVBestSearchRefit(svms, n_folds=10, cv_type="stratified")
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# Parenthesised so the line is valid on Python 3 as well as Python 2
print(cv_results)
from epac import ClassificationReport, PvalPerms
from epac import StoreFs
from epac import CVBestSearchRefit
from epac.sklearn_plugins import Permutations
from epac.configuration import conf

# Small synthetic problem: 20 samples, 10 features, 2 of them informative
X, y = datasets.make_classification(
    n_samples=20, n_features=10, n_informative=2)
n_folds = 2
n_folds_nested = 3
k_values = [1, 2]
C_values = [1, 2]

# One Pipe per k: SelectKBest(k) followed by a linear SVC for every C
pipelines = Methods(*[
    Pipe(SelectKBest(k=k),
         Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
    for k in k_values
])

pipeline = CVBestSearchRefit(pipelines, n_folds=n_folds_nested)

tree_mem = CV(pipeline,
              n_folds=n_folds,
              reducer=ClassificationReport(keep=False))

# Save the tree into a fresh temporary directory, then run and reduce it
import tempfile
store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
tree_mem.save_tree(store=store)
tree_mem.run(X=X, y=y)
tree_mem.reduce()
Пример #11
0
#k_values = [1, 2, 3, 4, 5, 10, 15, 20, 25, 27]
# Values of C tried for the L1-penalised SVM
C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======

svms = Methods(*[
    SVM(dual=False, class_weight='auto', penalty="l1", C=C)
    for C in C_values
])

# Disabled alternative kept from the original author (ANOVA feature
# selection in front of l1/l2 SVMs):
#anova_svms = Methods(*[
#    Pipe(SelectKBest(k=k),       #preprocessing.StandardScaler(),
#         Methods(*[SVM(C=C, penalty=penalty, class_weight='auto', dual=False)
#                   for C in C_values for penalty in ['l1', 'l2']]))
#    for k in k_values])


cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()

# Write the per-C results of the 10-fold CV to a CSV file under WD/results
epac.export_csv(cv, cv_results, os.path.join(WD, "results", "cv10_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================

svms_cv = CVBestSearchRefit(svms, n_folds=10)
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# Parenthesised so the line is valid on Python 3 as well as Python 2
print(cv_results)
# Output recorded by the original author:
#[{'key': CVBestSearchRefit,
#  'y/test/score_f1': [ 0.84848485  0.76190476],
#  'y/test/recall_pvalues': [ 0.01086887  0.03000108],
#  'y/test/score_precision': [ 0.82352941  0.8       ],
#  'y/test/recall_mean_pvalue': 0.00592461228371,
#  'y/test/score_recall': [ 0.875       0.72727273],
#  'y/test/score_accuracy': 0.814814814815,
#  'y/test/score_recall_mean': 0.801136363636}])
#
# Of the 27, 11 made the transition and 16 did not.
Пример #12
0
# Print the reduced results of `anovas_svm` (defined outside this excerpt).
print(anovas_svm.reduce())

# Cross-validation
# ----------------
# CV of LDA and SVM: both classifiers are wrapped in Methods and handed to CV.
#      CV                 (Splitter)
#  /   |   \
# 0    1    2  Folds      (Slicer)
# |    |
#   Methods               (Splitter)
#    /   \
#  LDA  SVM    Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)  # X and y come from earlier in the script
print(cv.reduce())

# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods       (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)   Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier: the candidates are SVM(C=1)
# and SVM(C=10).
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())

# Feature selection combined with SVM and LDA
# CVBestSearchRefit
Пример #13
0

# Cross-validation
# ----------------
# CV of LDA and SVM: both classifiers are wrapped in Methods and handed to CV.
#      CV                 (Splitter)
#  /   |   \
# 0    1    2  Folds      (Slicer)
# |    |
#   Methods               (Splitter)
#    /   \
#  LDA  SVM    Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)  # X and y come from earlier in the script
# Parenthesised so the line is valid on Python 3 as well as Python 2
print(cv.reduce())


# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods       (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)   Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())

# Feature selection combined with SVM and LDA
Пример #14
0
from sklearn import preprocessing


##############################################################################
## Pipeline, "Pipe": SelectKBest + StandardScaler + SVM
from epac import Pipe, CV
n_folds = 10

anova_svm = Pipe(SelectKBest(k=5),
                 preprocessing.StandardScaler(),
                 SVM(class_weight='auto'))

cv = CV(anova_svm, n_folds=n_folds)
cv.run(X=X, y=y)  # X and y come from earlier in the script
#
res_cv_anova_svm = cv.reduce()
# Bare lookup: its value is only echoed in an interactive session
res_cv_anova_svm["SelectKBest/StandardScaler/LinearSVC"]['y/test/score_recall']

##############################################################################
## Multimethods, "Methods": SVM l1 vs l2
from epac import Methods, CV
svms = Methods(SVM(penalty="l1", class_weight='auto', dual=False),
               SVM(penalty="l2", class_weight='auto', dual=False))

cv = CV(svms, n_folds=n_folds)
cv.run(X=X, y=y)
res_cv_svms = cv.reduce()
#
# Parenthesised so these lines are valid on Python 3 as well as Python 2
print(res_cv_svms)
print(res_cv_svms["LinearSVC(penalty=l1)"]['y/test/score_recall'])
print(res_cv_svms["LinearSVC(penalty=l2)"]['y/test/score_recall'])