Пример #1
0
def grid_search():
    """Grid-search ``logr__C`` for the NLTK model, plot the results, write predictions.

    Steps, in order:

    1. Load the training data and build the model via ``build_nltk_model()``.
    2. Run ``GridSearchCV`` over ``logr__C`` in ``np.arange(1, 20, 5)``
       (1, 6, 11, 16) using a 10-iteration ``ShuffleSplit`` with a 20 % test
       share, scored by ``auc_score`` on 12 parallel jobs.
    3. Print the best score and the best parameter setting.
    4. For every searched parameter, save an error-bar plot to
       ``grid_plot_<param>.png``.
    5. Predict probabilities on the data returned by ``load_test()`` with the
       best estimator and hand column 1 to ``write_test``.

    Takes no arguments and returns ``None``; all results are printed or
    written out by the helpers.
    """
    texts, targets = load_data()
    model = build_nltk_model()
    c_grid = {"logr__C": np.arange(1, 20, 5)}

    splitter = ShuffleSplit(len(texts), n_iterations=10, test_size=0.2)
    grid = GridSearchCV(
        model,
        param_grid=c_grid,
        cv=splitter,
        score_func=auc_score,
        n_jobs=12,
        verbose=4,
    )
    grid.fit(texts, targets)
    print(grid.best_score_)
    print(grid.best_params_)

    # NOTE(review): presumably an interactive debugger hook left in on
    # purpose -- confirm before running unattended.
    tracer()

    score_table = grid.scores_
    for name in score_table.params:
        # NOTE(review): 'max' presumably selects how the other grid
        # dimensions are collapsed -- verify against the scores_ API in use.
        avg, spread = score_table.accumulated(name, 'max')
        plt.errorbar(score_table.values[name], avg, yerr=spread)
        plt.xlabel(name)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % name)
        plt.close()

    # The second value returned by load_test() is not used here.
    test_texts, _test_dates = load_test()
    test_probs = grid.best_estimator_.predict_proba(test_texts)
    write_test(test_probs[:, 1])
Пример #2
0
def apply_models():
    """Fit each model on the extended data and write single and averaged predictions.

    Every model is fitted on the output of ``load_extended_data()`` and then
    asked for class probabilities on ``impermium_verification_set_.csv``.
    Column 1 of each model's probabilities is written to
    ``test_prediction_model_<i>.csv`` (``i`` is the model's index in the
    list). Column 1 of the unweighted mean over all models is written to
    ``test_prediction_combined.csv``.

    Takes no arguments and returns ``None``.
    """
    dataset = "impermium_verification_set_.csv"
    comments, labels = load_extended_data()
    # NOTE(review): this uses the return value of load_test() directly as the
    # comment collection -- confirm it does not return a tuple for this file.
    comments_test = load_test(dataset)

    models = [
        build_base_model(),
        build_elasticnet_model(),
        build_stacked_model(),
        build_nltk_model(),
    ]

    # Running sum of the per-model (n_test, 2) probability arrays.
    probs_common = np.zeros((len(comments_test), 2))
    for i, clf in enumerate(models):
        clf.fit(comments, labels)
        probs = clf.predict_proba(comments_test)
        probs_common += probs
        write_test(probs[:, 1], "test_prediction_model_%d.csv" % i,
                   ds=dataset)

    # Divide by the actual ensemble size so the mean stays correct when the
    # model list is edited (previously a hard-coded 4).
    probs_common /= len(models)

    # No score is computed here: this function loads no labels for the
    # verification set.
    write_test(probs_common[:, 1], "test_prediction_combined.csv",
               ds=dataset)
Пример #3
0
def apply_models():
    """Train the model ensemble and write per-model and combined predictions.

    The models come from ``build_base_model``, ``build_elasticnet_model``,
    ``build_stacked_model`` and ``build_nltk_model``. Each is fitted on the
    data from ``load_extended_data()`` and predicts probabilities for
    ``impermium_verification_set_.csv``.

    Outputs, all passed to ``write_test``:

    * ``test_prediction_model_<i>.csv`` -- column 1 of model ``i``'s
      probabilities.
    * ``test_prediction_combined.csv`` -- column 1 of the plain average of
      all models' probabilities.

    Takes no arguments and returns ``None``.
    """
    verification_set = "impermium_verification_set_.csv"
    comments, labels = load_extended_data()
    # NOTE(review): assumes load_test() returns just the comments for this
    # dataset -- verify against its definition.
    comments_test = load_test(verification_set)

    models = [
        build_base_model(),
        build_elasticnet_model(),
        build_stacked_model(),
        build_nltk_model(),
    ]

    # Accumulates the sum of every model's (n_test, 2) probability array.
    probs_common = np.zeros((len(comments_test), 2))
    for i, clf in enumerate(models):
        clf.fit(comments, labels)
        probs = clf.predict_proba(comments_test)
        probs_common += probs
        write_test(probs[:, 1],
                   "test_prediction_model_%d.csv" % i,
                   ds=verification_set)

    # Average over however many models were fitted instead of a literal 4,
    # so adding or removing a model cannot skew the result.
    probs_common /= len(models)

    # Scoring is not possible here: no labels for the verification set are
    # loaded in this function.
    write_test(probs_common[:, 1],
               "test_prediction_combined.csv",
               ds=verification_set)
Пример #4
0
def grid_search():
    """Search over ``logr__C``, plot per-parameter scores and write test predictions.

    A ``GridSearchCV`` is fitted on the data from ``load_data()`` using the
    estimator from ``build_nltk_model()``. It tries the ``logr__C`` values
    produced by ``np.arange(1, 20, 5)`` with a ``ShuffleSplit`` of 10
    iterations and ``test_size=0.2``, scored with ``auc_score``. The best
    score and best parameters are printed, and one plot per searched
    parameter is saved as ``grid_plot_<param>.png``. Finally the best
    estimator predicts probabilities for the data from ``load_test()`` and
    column 1 is passed to ``write_test``.

    Takes no arguments and returns ``None``.
    """
    comments, labels = load_data()
    estimator = build_nltk_model()

    search_options = dict(
        cv=ShuffleSplit(len(comments), n_iterations=10, test_size=0.2),
        param_grid=dict(logr__C=np.arange(1, 20, 5)),
        verbose=4,
        n_jobs=12,
        score_func=auc_score,
    )
    grid = GridSearchCV(estimator, **search_options)
    grid.fit(comments, labels)
    print(grid.best_score_)
    print(grid.best_params_)

    # NOTE(review): looks like a debugging breakpoint -- confirm it is meant
    # to stay in this code path.
    tracer()

    results = grid.scores_
    for param_name in results.params:
        # NOTE(review): the meaning of 'max' depends on the scores_ object's
        # API, which is not visible here -- TODO confirm.
        means, errors = results.accumulated(param_name, 'max')
        x_values = results.values[param_name]
        plt.errorbar(x_values, means, yerr=errors)
        plt.xlabel(param_name)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % param_name)
        plt.close()

    # load_test() returns a pair; only the first element is used here.
    held_out = load_test()
    comments_test = held_out[0]
    best_model = grid.best_estimator_
    write_test(best_model.predict_proba(comments_test)[:, 1])