예제 #1
0
def eval_model():
    """Cross-validate each model and their averaged ensemble, printing scores.

    Data comes from ``load_extended_data()``. Four models (base, elastic-net,
    stacked, NLTK) are fitted on each of 5 shuffled splits that hold out 20%
    of the samples. For every split the ``auc_score`` of each model is
    printed, followed by the ``auc_score`` of the averaged ``predict_proba``
    outputs. Finally the mean and standard deviation of the combined scores
    over all splits are printed.

    Returns:
        None. All results are reported through ``print``.
    """
    comments, labels = load_extended_data()

    # Trim this list for quick experiments; the averaging below adapts to
    # however many models it contains.
    models = [
        build_base_model(),
        build_elasticnet_model(),
        build_stacked_model(),
        build_nltk_model(),
    ]
    cv = ShuffleSplit(len(comments),
                      n_iterations=5,
                      test_size=0.2,
                      indices=True)
    scores = []
    for train, test in cv:
        # The split is the same for every model, so slice once per fold.
        # NOTE(review): assumes comments/labels support index-array
        # selection (e.g. numpy arrays) -- confirm against the loaders.
        X_train, y_train = comments[train], labels[train]
        X_test, y_test = comments[test], labels[test]

        # Running sum of the two-column predict_proba outputs.
        probs_common = np.zeros((len(test), 2))
        for clf in models:
            clf.fit(X_train, y_train)
            probs = clf.predict_proba(X_test)
            print("score: %f" % auc_score(y_test, probs[:, 1]))
            probs_common += probs
        # Average over the actual number of models, not a hard-coded 4.
        probs_common /= len(models)
        scores.append(auc_score(y_test, probs_common[:, 1]))
        print("combined score: %f" % scores[-1])

    print(np.mean(scores), np.std(scores))
예제 #2
0
def grid_search():
    """Grid-search ``logr__C`` for the NLTK model, plot results, and predict.

    Steps, in order:
      1. Load training data with ``load_data()``.
      2. Search ``logr__C`` over ``np.arange(1, 20, 5)`` (1, 6, 11, 16) using
         10 shuffled splits with a 20% test share, scored by ``auc_score``.
      3. Print the best score and the best parameters.
      4. Call ``tracer()``.
      5. For every searched parameter, save an error-bar plot to
         ``grid_plot_<param>.png``.
      6. Load the test set, predict with the best estimator, and hand
         column 1 of the probabilities to ``write_test``.

    Returns:
        None.
    """
    texts, targets = load_data()
    search_space = {"logr__C": np.arange(1, 20, 5)}
    estimator = build_nltk_model()

    splitter = ShuffleSplit(len(texts), n_iterations=10, test_size=0.2)
    search = GridSearchCV(
        estimator,
        cv=splitter,
        param_grid=search_space,
        verbose=4,
        n_jobs=12,
        score_func=auc_score,
    )
    search.fit(texts, targets)
    print(search.best_score_)
    print(search.best_params_)

    # NOTE(review): tracer is defined outside this block -- presumably a
    # debugger breakpoint; confirm before running unattended.
    tracer()

    results = search.scores_
    for name in results.params:
        avg, spread = results.accumulated(name, "max")
        grid_values = results.values[name]
        plt.errorbar(grid_values, avg, yerr=spread)
        plt.xlabel(name)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % name)
        plt.close()

    # load_test() is unpacked into two values here; the second is unused.
    test_texts, _test_dates = load_test()
    test_probs = search.best_estimator_.predict_proba(test_texts)
    write_test(test_probs[:, 1])
예제 #3
0
def eval_model():
    """Evaluate four models and their probability-averaged ensemble by CV.

    Uses the data returned by ``load_extended_data()`` and 5 shuffled splits
    with 20% of the samples held out. On each split every model is fitted and
    its ``auc_score`` printed; the ``predict_proba`` outputs are then averaged
    and the ensemble's ``auc_score`` printed. The mean and standard deviation
    of the ensemble scores across splits are printed at the end.

    Returns:
        None. All results are reported through ``print``.
    """
    comments, labels = load_extended_data()

    # The ensemble average below is taken over len(models), so models can be
    # added to or removed from this list without touching the rest.
    models = [
        build_base_model(),
        build_elasticnet_model(),
        build_stacked_model(),
        build_nltk_model(),
    ]
    cv = ShuffleSplit(len(comments), n_iterations=5, test_size=0.2,
            indices=True)
    scores = []
    for train, test in cv:
        # Slice once per fold: the split does not depend on the model.
        # NOTE(review): assumes comments/labels accept index arrays
        # (e.g. numpy arrays) -- confirm against load_extended_data.
        X_train, y_train = comments[train], labels[train]
        X_test, y_test = comments[test], labels[test]

        # Accumulates the two-column probability outputs of all models.
        probs_common = np.zeros((len(test), 2))
        for clf in models:
            clf.fit(X_train, y_train)
            probs = clf.predict_proba(X_test)
            print("score: %f" % auc_score(y_test, probs[:, 1]))
            probs_common += probs
        # Divide by the real model count instead of a hard-coded 4.
        probs_common /= len(models)
        scores.append(auc_score(y_test, probs_common[:, 1]))
        print("combined score: %f" % scores[-1])

    print(np.mean(scores), np.std(scores))
예제 #4
0
def apply_models():
    """Fit every model on the full training data and write test predictions.

    Training data comes from ``load_extended_data()``; the test comments come
    from ``load_test("impermium_verification_set_.csv")``. Each of the four
    models is fitted and column 1 of its ``predict_proba`` output is passed to
    ``write_test`` under ``test_prediction_model_<i>.csv``. The average of all
    models' probabilities is passed to ``write_test`` under
    ``test_prediction_combined.csv``.

    No scores are computed here: this function loads no test labels.

    Returns:
        None.
    """
    dataset = "impermium_verification_set_.csv"
    comments, labels = load_extended_data()
    comments_test = load_test(dataset)

    models = [
        build_base_model(),
        build_elasticnet_model(),
        build_stacked_model(),
        build_nltk_model(),
    ]
    # Running sum of the two-column predict_proba outputs.
    probs_common = np.zeros((len(comments_test), 2))
    for i, clf in enumerate(models):
        clf.fit(comments, labels)
        probs = clf.predict_proba(comments_test)
        probs_common += probs
        write_test(probs[:, 1], "test_prediction_model_%d.csv" % i,
                ds=dataset)
    # Average over the actual number of models rather than a hard-coded 4,
    # so the written probabilities stay correct if the list changes.
    probs_common /= len(models)
    write_test(probs_common[:, 1], "test_prediction_combined.csv",
            ds=dataset)
예제 #5
0
def apply_models():
    """Train all models on the full data and emit per-model and mean outputs.

    Reads training data via ``load_extended_data()`` and test comments via
    ``load_test("impermium_verification_set_.csv")``. For each of the four
    models, column 1 of ``predict_proba`` on the test comments is handed to
    ``write_test`` as ``test_prediction_model_<i>.csv``. The mean of the
    models' probability outputs is handed to ``write_test`` as
    ``test_prediction_combined.csv``.

    Predictions are not scored, because no test labels are loaded here.

    Returns:
        None.
    """
    dataset = "impermium_verification_set_.csv"
    comments, labels = load_extended_data()
    comments_test = load_test(dataset)

    models = [
        build_base_model(),
        build_elasticnet_model(),
        build_stacked_model(),
        build_nltk_model(),
    ]
    # Sum of the two-column probability outputs across models.
    probs_common = np.zeros((len(comments_test), 2))
    for i, clf in enumerate(models):
        clf.fit(comments, labels)
        probs = clf.predict_proba(comments_test)
        probs_common += probs
        write_test(probs[:, 1],
                   "test_prediction_model_%d.csv" % i,
                   ds=dataset)
    # Use len(models) so the mean stays correct when models are added or
    # removed; the original divided by a literal 4.
    probs_common /= len(models)
    write_test(probs_common[:, 1],
               "test_prediction_combined.csv",
               ds=dataset)
예제 #6
0
def grid_search():
    """Search ``logr__C`` on the NLTK model, save plots, and write predictions.

    The search covers ``np.arange(1, 20, 5)`` (1, 6, 11, 16) with 10 shuffled
    splits (20% test share) and ``auc_score`` as the scoring function. The
    best score and best parameters are printed, ``tracer()`` is called, one
    error-bar plot per searched parameter is saved as
    ``grid_plot_<param>.png``, and column 1 of the best estimator's
    ``predict_proba`` output on the test set is passed to ``write_test``.

    Returns:
        None.
    """
    train_comments, train_labels = load_data()
    grid_spec = {"logr__C": np.arange(1, 20, 5)}
    model = build_nltk_model()

    shuffle_cv = ShuffleSplit(len(train_comments), n_iterations=10,
                              test_size=0.2)
    searcher = GridSearchCV(model, cv=shuffle_cv, param_grid=grid_spec,
                            verbose=4, n_jobs=12, score_func=auc_score)
    searcher.fit(train_comments, train_labels)
    print(searcher.best_score_)
    print(searcher.best_params_)

    # NOTE(review): tracer comes from outside this block -- presumably drops
    # into a debugger; confirm this is intended outside interactive use.
    tracer()

    score_table = searcher.scores_
    for param_name in score_table.params:
        mean_vals, err_vals = score_table.accumulated(param_name, "max")
        x_vals = score_table.values[param_name]
        plt.errorbar(x_vals, mean_vals, yerr=err_vals)
        plt.xlabel(param_name)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % param_name)
        plt.close()

    # The second value returned by load_test() is not used here.
    held_out_comments, _held_out_dates = load_test()
    held_out_probs = searcher.best_estimator_.predict_proba(held_out_comments)
    write_test(held_out_probs[:, 1])