예제 #1
0
def train_svm(filename,
              X_train,
              X_test,
              y_train,
              y_test,
              solver='rbf',
              full_param=False,
              debug=False,
              numFolds=10,
              njobs=-1,
              scalar=1,
              make_graphs=False,
              pSVM=None):
    """Tune (or configure), fit and score a support vector classifier.

    When ``pSVM`` is empty a grid search over kernel-specific parameters is
    run, its results are handed to ``util.save_gridsearch_to_csv`` and the
    best parameters are used for the final model. Otherwise ``pSVM`` is
    applied to the model directly and no search is performed.

    Args:
        filename: Name used to label outputs; its last four characters are
            stripped before it is passed to the ``util`` helpers
            (presumably a file extension -- confirm with callers).
        X_train, X_test: Feature matrices.
        y_train, y_test: Target labels.
        solver: SVC kernel name ('rbf', 'sigmoid', 'poly' or 'linear').
        full_param: Use the larger, kernel-specific search grid.
        debug: Forwarded as ``verbose`` to the grid search and as ``debug``
            to the plotting helpers.
        numFolds: Number of cross-validation folds for the grid search.
        njobs: Parallel job count for the grid search and plot helpers.
        scalar: Forwarded unchanged to ``util.save_gridsearch_to_csv``.
        make_graphs: When true, produce learning/validation curve plots.
        pSVM: Optional dict of SVC parameters that bypasses the search.

    Returns:
        Tuple ``(elapsed, train_score, test_score)``. ``elapsed`` is the
        number of seconds since the test-scoring timer was started (so it
        also covers plot generation); both scores are weighted one-vs-rest
        ROC AUC values rounded to four decimals.
    """
    np.random.seed(1)
    algo = 'SVM'
    # A None default avoids sharing one dict object between calls.
    pSVM = {} if pSVM is None else pSVM
    out_name = filename[:-4]

    if not pSVM:
        if full_param:
            decades = [
                0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000,
                100000
            ]
            grid = {'kernel': [solver], 'random_state': [1]}
            if solver == 'rbf':
                grid['C'] = [0.001]
                grid['gamma'] = list(decades)
            elif solver == 'sigmoid':
                # The sigmoid gamma range stops one decade earlier.
                grid['gamma'] = decades[:-1]
                grid['coef0'] = list(range(11))
                grid['C'] = list(decades)
            elif solver == 'poly':
                grid['gamma'] = list(decades)
                grid['degree'] = list(range(1, 9))
                grid['coef0'] = list(range(11))
                grid['C'] = list(decades)
            elif solver == 'linear':
                grid['C'] = [1.0]
        else:
            if solver in ('poly', 'linear'):
                c_values = [0.001, 0.01, 0.1, 1., 10.]
            else:
                c_values = [0.01, 0.1, 1., 10., 100]
            grid = {
                'kernel': [solver],
                'C': c_values,
                'cache_size': [2000],
                'random_state': [1]
            }

        grid_search = GridSearchCV(svm.SVC(probability=True), [grid],
                                   cv=numFolds,
                                   scoring='roc_auc_ovr_weighted',
                                   return_train_score=True,
                                   n_jobs=njobs,
                                   verbose=debug)
        grid_search.fit(X_train, y_train)
        util.save_gridsearch_to_csv(grid_search.cv_results_, algo, out_name,
                                    scalar, solver)
        model_params = grid_search.best_params_
    else:
        model_params = pSVM

    # predict_proba is only available when the SVC was created with
    # probability=True; neither the search grid nor (necessarily) pSVM sets
    # it, so enable it here. An explicit 'probability' entry still wins.
    svm_classifier = svm.SVC(probability=True)
    svm_classifier.set_params(**model_params)

    start = time.time()
    svm_classifier.fit(X_train, y_train)
    print('SVM Fit Time: ', time.time() - start)

    start = time.time()
    train_score = roc_auc_score(y_train,
                                svm_classifier.predict_proba(X_train),
                                multi_class="ovr",
                                average="weighted")
    print('SVM Train Score Time: ', time.time() - start)

    start = time.time()
    test_score = roc_auc_score(y_test,
                               svm_classifier.predict_proba(X_test),
                               multi_class="ovr",
                               average="weighted")
    print('SVM Test Score Time: ', time.time() - start)

    test_class = svm.SVC()
    test_class.set_params(**pSVM)

    if make_graphs:

        def _validation_curve(param, values, **options):
            # Every validation curve shares the data, model and naming
            # arguments; only the parameter, its range and flags differ.
            util.compute_vc(algo, param, values, X_train, y_train, X_test,
                            y_test, svm_classifier, out_name, test_class,
                            pSVM, njobs=njobs, **options)

        kernels = ['rbf', 'sigmoid', 'poly', 'linear']
        wide_gamma = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

        util.plot_learning_curve(svm_classifier,
                                 algo,
                                 out_name,
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)
        _validation_curve('kernel', list(kernels), log=False, debug=debug,
                          smalllegend=True)
        util.svm_rbf_C_Gamma_viz(X_train, y_train, pSVM, njobs, out_name,
                                 train_score)

        # NOTE(review): the kernel curve is computed a second time with
        # default legend/debug options; kept as in the original.
        _validation_curve('kernel', list(kernels), log=False)
        _validation_curve('C',
                          [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000],
                          log=True, debug=debug)

        if solver == 'rbf':
            _validation_curve('gamma',
                              [0.0001, 0.001, 0.01, 0.1, 1.0, 5.0, 10.0],
                              log=True, debug=debug)
        elif solver in ('sigmoid', 'poly'):
            _validation_curve('gamma', list(wide_gamma), log=True,
                              debug=debug)
            _validation_curve('coef0', list(range(11)), log=False,
                              debug=debug)
            if solver == 'poly':
                _validation_curve('degree', list(range(1, 11)), log=False,
                                  debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
예제 #2
0
def train_kmeansNN(filename,
                   X_train,
                   X_test,
                   y_train,
                   y_test,
                   debug=False,
                   numFolds=10,
                   njobs=-1,
                   scalar=1,
                   make_graphs=False,
                   pNN=None,
                   nolegend=False,
                   random_seed=1,
                   num_clusts=4):
    """Train an MLP on features augmented with one-hot K-Means cluster ids.

    When ``num_clusts`` is not 1, K-Means is fitted on ``X_train`` and the
    predicted cluster of every row is added as a string column named
    'Cluster', which is then one-hot encoded. A (single-candidate) grid
    search is run, its results are handed to
    ``util.save_gridsearch_to_csv``, and a fresh classifier configured with
    the best parameters is fitted and scored.

    Args:
        filename: Name used to label outputs; its last four characters are
            stripped before use (presumably a file extension).
        X_train, X_test: Feature tables. They must be pandas DataFrames
            when ``num_clusts != 1`` because ``DataFrame.insert`` is used.
            The caller's objects are not modified.
        y_train, y_test: Target labels.
        debug: Forwarded as ``verbose`` to the grid search and as ``debug``
            to the plotting helper.
        numFolds: Number of cross-validation folds for the grid search.
        njobs: Parallel job count for the grid search and plot helper.
        scalar: Forwarded unchanged to ``util.save_gridsearch_to_csv``.
        make_graphs: When true, plot a learning curve.
        pNN: Accepted for signature compatibility; not used.
        nolegend: Accepted for signature compatibility; not used.
        random_seed: Seed for NumPy's global RNG and for K-Means.
        num_clusts: Number of K-Means clusters; 1 disables clustering.

    Returns:
        Tuple ``(elapsed, train_score, test_score)``. ``elapsed`` is the
        number of seconds since the test-scoring timer was started; both
        scores are weighted one-vs-rest ROC AUC rounded to four decimals.
    """
    np.random.seed(random_seed)
    algo = 'Neural Network'

    if num_clusts != 1:
        # Work on copies: inserting the column in place would alter the
        # caller's frames and make a second call on the same data fail
        # because 'Cluster' already exists.
        X_train = X_train.copy()
        X_test = X_test.copy()
        KClusters = KMeans(init='k-means++',
                           n_clusters=num_clusts,
                           n_init=100,
                           random_state=random_seed,
                           max_iter=100).fit(X_train)
        # Both predictions are taken before either frame gains the column.
        train_clusters = KClusters.predict(X_train)
        test_clusters = KClusters.predict(X_test)
        X_train.insert(0, 'Cluster', train_clusters)
        X_train['Cluster'] = X_train['Cluster'].apply(str)
        X_test.insert(0, 'Cluster', test_clusters)
        X_test['Cluster'] = X_test['Cluster'].apply(str)

    X_train = pd.get_dummies(X_train, prefix='Cluster')
    X_test = pd.get_dummies(X_test, prefix='Cluster')
    # A category missing from the test split would otherwise leave the two
    # frames with different columns and break prediction.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],
        'solver': ['adam'],
        'alpha': [0.01],
        'batch_size': ['auto'],
        'learning_rate_init': [0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    grid_search = GridSearchCV(MLPClassifier(),
                               param_grid,
                               cv=numFolds,
                               scoring='roc_auc_ovr_weighted',
                               return_train_score=True,
                               n_jobs=njobs,
                               verbose=debug)
    grid_search.fit(X_train, y_train)

    util.save_gridsearch_to_csv(grid_search.cv_results_, algo,
                                filename[:-4] + '-' + str(num_clusts), scalar,
                                '-kmeans')

    # Score the configuration the search selected. GridSearchCV fits clones,
    # so the estimator passed to it is left with its default parameters.
    nn_classifier = MLPClassifier()
    nn_classifier.set_params(**grid_search.best_params_)

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    train_score = roc_auc_score(y_train,
                                nn_classifier.predict_proba(X_train),
                                multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    test_score = roc_auc_score(y_test,
                               nn_classifier.predict_proba(X_test),
                               multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    if make_graphs:
        util.plot_learning_curve(nn_classifier,
                                 'K-Means',
                                 filename[:-4],
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
예제 #3
0
def train_NN_LLE(filename,
                 X_train,
                 X_test,
                 y_train,
                 y_test,
                 debug=False,
                 numFolds=10,
                 njobs=-1,
                 scalar=1,
                 make_graphs=False,
                 pNN=None,
                 nolegend=False,
                 random_seed=1,
                 num_dim=4):
    """Train an MLP on a Locally Linear Embedding of the features.

    An LLE with ``num_dim`` components is fitted on ``X_train`` and used to
    transform both splits. A grid search over ``alpha`` and
    ``learning_rate_init`` is run, its results are handed to
    ``util.save_gridsearch_to_csv``, and a fresh classifier configured with
    the best parameters is fitted and scored.

    Args:
        filename: Name used to label outputs; its last four characters are
            stripped before use (presumably a file extension).
        X_train, X_test: Feature matrices.
        y_train, y_test: Target labels.
        debug: Forwarded as ``verbose`` to the grid search and as ``debug``
            to the plotting helper.
        numFolds: Number of cross-validation folds for the grid search.
        njobs: Parallel job count for the embedding, the grid search and
            the plot helper.
        scalar: Forwarded unchanged to ``util.save_gridsearch_to_csv``.
        make_graphs: When true, plot a learning curve.
        pNN: Accepted for signature compatibility; not used.
        nolegend: Accepted for signature compatibility; not used.
        random_seed: Seed for NumPy's global RNG and for the embedding.
        num_dim: Number of embedding dimensions.

    Returns:
        Tuple ``(elapsed, train_score, test_score)``. ``elapsed`` is the
        number of seconds since the test-scoring timer was started; both
        scores are weighted one-vs-rest ROC AUC rounded to four decimals.
    """
    np.random.seed(random_seed)
    algo = 'LLE' + str(num_dim)

    lle = LocallyLinearEmbedding(n_neighbors=10,
                                 n_components=num_dim,
                                 random_state=random_seed,
                                 n_jobs=njobs)
    # Fit and transform are kept as separate steps so the training data goes
    # through the same transform() path as the test data.
    lle.fit(X_train)
    X_train = lle.transform(X_train)
    X_test = lle.transform(X_test)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    grid_search = GridSearchCV(MLPClassifier(),
                               param_grid,
                               cv=numFolds,
                               scoring='roc_auc_ovr_weighted',
                               return_train_score=True,
                               n_jobs=njobs,
                               verbose=debug)
    grid_search.fit(X_train, y_train)

    util.save_gridsearch_to_csv(grid_search.cv_results_, algo,
                                filename[:-4] + '-' + str(num_dim), scalar, '')

    # Score the configuration the search selected. GridSearchCV fits clones,
    # so the estimator passed to it is left with its default parameters.
    nn_classifier = MLPClassifier()
    nn_classifier.set_params(**grid_search.best_params_)

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    train_score = roc_auc_score(y_train,
                                nn_classifier.predict_proba(X_train),
                                multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    test_score = roc_auc_score(y_test,
                               nn_classifier.predict_proba(X_test),
                               multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    if make_graphs:
        util.plot_learning_curve(nn_classifier,
                                 algo,
                                 filename[:-4],
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
예제 #4
0
def train_BTree(filename, X_train, X_test, y_train, y_test, full_param=False, debug=False, numFolds=10, njobs=-1,
                scalar=1, make_graphs=False, pBTree=None):
    """Tune (or configure), fit and score an AdaBoost decision-tree ensemble.

    When ``pBTree`` is empty a grid search is run, its results are handed to
    ``util.save_gridsearch_to_csv`` and the best parameters are used for the
    final model. Otherwise ``pBTree`` is applied to the model directly.

    Args:
        filename: Name used to label outputs; its last four characters are
            stripped before use (presumably a file extension).
        X_train, X_test: Feature matrices.
        y_train, y_test: Target labels.
        full_param: Use the larger search grid.
        debug: Forwarded as ``verbose`` to the grid search and as ``debug``
            to the plotting helpers.
        numFolds: Number of cross-validation folds for the grid search.
        njobs: Parallel job count for the grid search and plot helpers.
        scalar: Forwarded unchanged to ``util.save_gridsearch_to_csv``.
        make_graphs: When true, produce learning/validation curve plots.
        pBTree: Optional dict of AdaBoostClassifier parameters (nested tree
            parameters use the ``base_estimator__`` prefix) that bypasses
            the search.

    Returns:
        Tuple ``(elapsed, train_score, test_score)``. ``elapsed`` is the
        number of seconds since the test-scoring timer was started (so it
        also covers plot generation); both scores are weighted one-vs-rest
        ROC AUC values rounded to four decimals.
    """
    np.random.seed(1)
    algo = 'Boosted Tree'
    # A None default avoids sharing one dict object between calls.
    pBTree = {} if pBTree is None else pBTree
    out_name = filename[:-4]

    if not pBTree:
        if full_param:
            param_grid = [{
                'base_estimator__criterion': ['gini', 'entropy'],
                'base_estimator__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100],
                'base_estimator__max_leaf_nodes': [10, 100],
                'base_estimator__ccp_alpha': [0.0, 0.005, 0.01],
                'base_estimator__splitter': ['best'],
                'n_estimators': [1, 50, 100, 150, 200, 250, 300],
                'learning_rate': [0.1, 0.5, 1],
                'random_state': [1],
            }]
        else:
            param_grid = [{
                'base_estimator__criterion': ['gini', 'entropy'],
                'base_estimator__max_depth': [3, 5, 7, 10],
                'base_estimator__ccp_alpha': [0.0, 0.005, 0.01, 0.035],
                'n_estimators': [1, 50, 100, 150],
                'random_state': [1],
            }]

        DTC = DecisionTreeClassifier(random_state=11)
        grid_search = GridSearchCV(AdaBoostClassifier(base_estimator=DTC), param_grid=param_grid, cv=numFolds,
                                   scoring='roc_auc_ovr_weighted',
                                   return_train_score=True, n_jobs=njobs, verbose=debug)
        grid_search.fit(X_train, y_train)

        util.save_gridsearch_to_csv(grid_search.cv_results_, algo, out_name, scalar)

        btree_classifier = AdaBoostClassifier(base_estimator=DTC)
        btree_classifier.set_params(**grid_search.best_params_)
    else:
        btree_classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
        btree_classifier.set_params(**pBTree)

    start = time.time()
    btree_classifier.fit(X_train, y_train)
    print('BTree Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = btree_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('BTree Train Score Time: ', time.time() - start)

    start = time.time()
    y_prob = btree_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('BTree Test Score Time: ', time.time() - start)

    test_class = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    test_class.set_params(**pBTree)

    if make_graphs:

        def _validation_curve(param, values, **options):
            # Every validation curve shares the data, model and naming
            # arguments; only the parameter, its range and flags differ.
            util.compute_vc(algo, param, values, X_train, y_train, X_test, y_test, btree_classifier, out_name,
                            test_class, pBTree, njobs=njobs, **options)

        n_estimators_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000]

        util.boost_lr_vs_nest(X_train, y_train, pBTree, njobs, out_name, train_score)
        _validation_curve('n_estimators', list(n_estimators_range), log=True, debug=debug, extraText='log')

        util.plot_learning_curve(btree_classifier, algo, out_name, X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)

        _validation_curve('base_estimator__max_depth',
                          list(range(1, 21)) + list(range(30, 101, 10)),
                          log=True, debug=debug)
        _validation_curve('base_estimator__max_leaf_nodes',
                          [2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 50, 75, 100, 200, 500, 1000, 10000],
                          log=True)
        _validation_curve('base_estimator__criterion', ['gini', 'entropy'], log=False)

        _validation_curve('n_estimators', list(n_estimators_range), log=False, debug=debug)
        # NOTE(review): this repeats the first n_estimators log-scale curve
        # with identical arguments; kept as in the original.
        _validation_curve('n_estimators', list(n_estimators_range), log=True, debug=debug, extraText='log')
        _validation_curve('learning_rate',
                          [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
                          log=True, debug=debug)

        _validation_curve('base_estimator__ccp_alpha',
                          [0.000001, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009,
                           0.0001, 0.00011, 0.00012, 0.00013, 0.00014, 0.00015, 0.00016, 0.00017, 0.00018, 0.00019,
                           0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.01, 0.1, 1],
                          log=True)
        _validation_curve('base_estimator__min_samples_split', [2, 3, 5, 6, 8, 10], log=False)
        _validation_curve('base_estimator__min_samples_leaf',
                          [1, 2, 3, 5, 6, 8, 10, 25, 50, 75, 100, 250, 500, 750, 1000],
                          log=True)
        _validation_curve('base_estimator__max_features',
                          [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 0.99999, 1.0],
                          log=False)
        _validation_curve('base_estimator__splitter', ["best", "random"], log=False)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
예제 #5
0
def train_NN(filename,
             X_train,
             X_test,
             y_train,
             y_test,
             solver='adam',
             full_param=False,
             debug=False,
             numFolds=10,
             njobs=-1,
             scalar=1,
             make_graphs=False,
             pNN=None,
             nolegend=False):
    """Tune (or configure), fit and score a multi-layer perceptron.

    When ``pNN`` is empty a grid search is run, its results are handed to
    ``util.save_gridsearch_to_csv`` and the best parameters are used for the
    final model. Otherwise ``pNN`` is applied to the model directly.

    Args:
        filename: Name used to label outputs; its last four characters are
            stripped before use (presumably a file extension).
        X_train, X_test: Feature matrices.
        y_train, y_test: Target labels.
        solver: MLPClassifier solver name used in the search grid.
        full_param: Use the larger search grid.
        debug: Forwarded as ``verbose`` to the grid search and as ``debug``
            to the plotting helpers.
        numFolds: Number of cross-validation folds for the grid search.
        njobs: Parallel job count for the grid search and plot helpers.
        scalar: Forwarded unchanged to ``util.save_gridsearch_to_csv``.
        make_graphs: When true, produce learning/validation curve plots.
        pNN: Optional dict of MLPClassifier parameters that bypasses the
            search.
        nolegend: Forwarded to the hidden-layer validation curve plots.

    Returns:
        Tuple ``(elapsed, train_score, test_score)``, returned whether or
        not graphs are produced. ``elapsed`` is the number of seconds since
        the test-scoring timer was started (so it also covers plot
        generation); both scores are weighted one-vs-rest ROC AUC values
        rounded to four decimals.
    """
    np.random.seed(1)
    algo = 'Neural Network'
    # A None default avoids sharing one dict object between calls.
    pNN = {} if pNN is None else pNN
    out_name = filename[:-4]

    if not pNN:
        if full_param:
            # Single-layer sizes are given as plain ints, as in the
            # original grid.
            grid = {
                'hidden_layer_sizes': [
                    8, 16, 32, (8, 8), (16, 16), (32, 32), (8, 8, 8),
                    (16, 16, 16), (32, 32, 32), (128, ), (128, 128),
                    (128, 128, 128), (128, 128, 128, 128), (256, ),
                    (256, 256), (512, ), (512, 512), (256, 256, 256),
                    (256, 256, 256, 256), (512, 512, 512),
                    (512, 512, 512, 512)
                ],
                'activation': ['logistic', 'tanh', 'relu'],
                'solver': [solver],
                'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1],
                'batch_size': ['auto'],
                'learning_rate_init': [0.001, 0.01],
                'max_iter': [10000],
                'warm_start': [True],
                'early_stopping': [True],
                'random_state': [1]
            }
            if solver == 'sgd':
                # The learning-rate schedule only applies to solver='sgd'.
                grid['learning_rate'] = [
                    'constant', 'invscaling', 'adaptive'
                ]
        else:
            grid = {
                'hidden_layer_sizes': [
                    8, 16, 32, (8, 8), (16, 16), (32, 32), (8, 16), (8, 32),
                    (16, 32), (128, ), (128, 128), (128, 128, 128),
                    (128, 128, 128, 128)
                ],
                'solver': [solver],
                'activation': ['identity', 'relu'],
                'max_iter': [10000],
                'early_stopping': [True],
                'random_state': [1]
            }

        grid_search = GridSearchCV(MLPClassifier(), [grid],
                                   cv=numFolds,
                                   scoring='roc_auc_ovr_weighted',
                                   return_train_score=True,
                                   n_jobs=njobs,
                                   verbose=debug)
        grid_search.fit(X_train, y_train)

        util.save_gridsearch_to_csv(grid_search.cv_results_, algo, out_name,
                                    scalar, solver)
        model_params = grid_search.best_params_
    else:
        model_params = pNN

    nn_classifier = MLPClassifier()
    nn_classifier.set_params(**model_params)

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    train_score = roc_auc_score(y_train,
                                nn_classifier.predict_proba(X_train),
                                multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', time.time() - start)

    start = time.time()
    test_score = roc_auc_score(y_test,
                               nn_classifier.predict_proba(X_test),
                               multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', time.time() - start)

    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:

        def _validation_curve(param, values, **options):
            # Every validation curve shares the data, model and naming
            # arguments; only the parameter, its range and flags differ.
            # pNN is passed on every call, including the 'solver' and
            # 'learning_rate' curves where it had been left out.
            util.compute_vc(algo, param, values, X_train, y_train, X_test,
                            y_test, nn_classifier, out_name, test_class, pNN,
                            njobs=njobs, **options)

        util.plot_learning_curve(nn_classifier,
                                 algo,
                                 out_name,
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)
        _validation_curve('activation',
                          ['identity', 'logistic', 'tanh', 'relu'],
                          log=False, debug=debug)

        # 1..9, 10..90, 100..900, 1000..9000 and finally 10000.
        iteration_counts = [
            digit * 10**exponent
            for exponent in range(4)
            for digit in range(1, 10)
        ] + [10000]
        _validation_curve('max_iter', iteration_counts, log=True,
                          debug=debug)

        widths = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
        for depth in (1, 2, 3, 4):
            # Single-layer sizes stay plain ints, as in the original.
            if depth == 1:
                layer_sizes = list(widths)
            else:
                layer_sizes = [(width, ) * depth for width in widths]
            _validation_curve('hidden_layer_sizes', layer_sizes,
                              log=False,
                              debug=debug,
                              fString=True,
                              extraText=f' {depth}-Layer',
                              rotatex=True,
                              nolegend=nolegend)

        _validation_curve('solver', ['adam', 'sgd', 'lbfgs'], log=False)
        _validation_curve('alpha', [
            0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1,
            0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 100000, 1000000
        ], log=True, debug=debug)

        if solver == 'sgd':
            _validation_curve('learning_rate',
                              ['constant', 'invscaling', 'adaptive'],
                              log=True)

    # Returned at function level so callers get the scores even when no
    # graphs are requested.
    return time.time() - start, round(train_score, 4), round(test_score, 4)
예제 #6
0
def _dtree_param_grid(full_param):
    """Return the GridSearchCV parameter grid for the decision tree.

    The full grid tries more ``ccp_alpha`` values and one extra
    ``min_samples_split`` value than the reduced grid.
    """
    if full_param:
        return [{
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 5, 7, 10, 100],
            'min_samples_split': [2, 3, 5, 7, 8, 10],
            'ccp_alpha': [0.0, 0.00001, 0.0001, 0.001, 0.005, 0.01, 0.015],
            'random_state': [1],
        }]
    return [{
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, 5, 7, 10, 100],
        'min_samples_split': [2, 3, 5, 7, 10],
        'ccp_alpha': [0, .01, .02],
        'random_state': [1],
    }]


def _weighted_ovr_auc(classifier, X, y):
    """Return the weighted one-vs-rest ROC AUC of ``classifier`` on (X, y)."""
    y_prob = classifier.predict_proba(X)
    return roc_auc_score(y, y_prob, multi_class="ovr", average="weighted")


def train_DTree(filename,
                X_train,
                X_test,
                y_train,
                y_test,
                full_param=False,
                debug=False,
                numFolds=10,
                njobs=-1,
                scalar=1,
                make_graphs=False,
                pDTree=None):
    """Train a decision tree, score it, and optionally plot its curves.

    When ``pDTree`` is empty (or ``None``) a grid search selects the
    hyper-parameters and its results are handed to
    ``util.save_gridsearch_to_csv``; otherwise ``pDTree`` is used as given.
    The resulting classifier is fitted on the training data and scored with
    weighted one-vs-rest ROC AUC on both the training and the test set.

    Args:
        filename: Name of the data file; its last four characters are
            dropped before it is passed to the ``util`` helpers
            (presumably to strip an extension such as ``.csv`` -- confirm).
        X_train, X_test: Training / test features.
        y_train, y_test: Training / test labels.
        full_param: Use the larger grid-search parameter grid.
        debug: Forwarded as ``verbose`` to ``GridSearchCV`` and as ``debug``
            to the ``util`` plotting helpers.
        numFolds: Number of cross-validation folds for the grid search.
        njobs: Parallel job count forwarded to scikit-learn / ``util``.
        scalar: Forwarded to ``util.save_gridsearch_to_csv``.
        make_graphs: Also produce the learning curve and validation curves.
        pDTree: Optional dict of ``DecisionTreeClassifier`` parameters. It is
            copied, so the caller's dict is never modified.

    Returns:
        Tuple ``(elapsed, train_score, test_score)`` with both scores rounded
        to four decimals.
        NOTE(review): ``elapsed`` is measured from the start of test-set
        scoring (and therefore includes graph generation), exactly as
        before; it does not cover fitting or the grid search.
    """
    np.random.seed(1)
    algo = 'Decision Tree'
    stem = filename[:-4]

    # Work on a private copy: the graph section below overrides 'ccp_alpha',
    # which previously leaked into the caller's dict and into the shared
    # default argument (making later default calls skip the grid search).
    params = dict(pDTree) if pDTree else {}

    if not params:
        grid_search = GridSearchCV(DecisionTreeClassifier(),
                                   _dtree_param_grid(full_param),
                                   cv=numFolds,
                                   scoring='roc_auc_ovr_weighted',
                                   return_train_score=True,
                                   n_jobs=njobs,
                                   verbose=debug)
        grid_search.fit(X_train, y_train)
        util.save_gridsearch_to_csv(grid_search.cv_results_, algo, stem,
                                    scalar)
        # Keep the winning parameters so the graphs below are built around
        # the same model that is fitted and scored.
        params = dict(grid_search.best_params_)

    tree_classifier = DecisionTreeClassifier()
    tree_classifier.set_params(**params)

    start = time.time()
    tree_classifier.fit(X_train, y_train)
    print('DTree Fit Time: ', time.time() - start)

    start = time.time()
    train_score = _weighted_ovr_auc(tree_classifier, X_train, y_train)
    print('DTree Train Score Time: ', time.time() - start)

    start = time.time()
    test_score = _weighted_ovr_auc(tree_classifier, X_test, y_test)
    print('DTree Test Score Time: ', time.time() - start)

    if make_graphs:
        # Unfitted classifier with the same parameters, handed to
        # util.compute_vc alongside the fitted one.
        test_class = DecisionTreeClassifier()
        test_class.set_params(**params)

        def validation_curve(param_name, values, **options):
            # Only the keyword options differ between the curves; some calls
            # deliberately omit 'debug' and rely on compute_vc's default.
            util.compute_vc(algo,
                            param_name,
                            values,
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            tree_classifier,
                            stem,
                            test_class,
                            params,
                            njobs=njobs,
                            **options)

        depth_values = list(range(1, 26)) + [50, 75, 100]

        validation_curve('criterion', ['gini', 'entropy'],
                         log=False,
                         debug=debug,
                         smalllegend=True)

        util.plot_learning_curve(tree_classifier,
                                 algo,
                                 stem,
                                 X_train,
                                 y_train,
                                 ylim=(0.0, 1.05),
                                 cv=10,
                                 n_jobs=njobs,
                                 debug=debug)

        validation_curve('max_depth', list(depth_values),
                         log=True,
                         debug=debug)

        validation_curve('ccp_alpha', [
            0.000001, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006,
            0.00007, 0.00008, 0.00009, 0.0001, 0.00011, 0.00012, 0.00013,
            0.00014, 0.00015, 0.00016, 0.00017, 0.00018, 0.00019, 0.0002,
            0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001,
            0.01, 0.1, 1
        ],
                         log=True,
                         debug=debug)

        validation_curve('min_samples_split', [2, 3, 5, 7, 10],
                         log=False,
                         debug=debug)

        validation_curve('min_samples_leaf', [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 50, 75, 100, 250, 500, 750,
            1000
        ],
                         log=True)

        validation_curve('max_leaf_nodes', [
            2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60, 100, 250,
            500, 750, 1000, 2500, 5000, 7500, 10000
        ],
                         log=True)

        validation_curve('max_features', [
            0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65,
            0.8, 0.85, 0.9, 0.95, 0.99999, 1.0
        ],
                         log=False)

        validation_curve('splitter', ["best", "random"], log=False)

        # Repeat the depth curve with cost-complexity pruning switched off.
        tree_classifier.set_params(ccp_alpha=0)
        test_class.set_params(ccp_alpha=0)
        params['ccp_alpha'] = 0
        validation_curve('max_depth', list(depth_values),
                         log=True,
                         debug=debug,
                         extraText='noprune')

    return time.time() - start, round(train_score, 4), round(test_score, 4)