Exemplo n.º 1
0
def roc_model_score_all():
    """Draw the ROC curves of four default classifiers on a single chart.

    Every classifier is wrapped in an ``AntispamModel`` and passed to
    ``roc_model_score`` together with the parsed TRAIN_ALL / TEST_ALL mails
    and their labels.  Each result is unpacked as ``(name, tpr, fpr)`` and
    plotted in a fixed colour; the chart is saved to
    ``doc/charts/ROC_ALL.png`` and then displayed.
    """
    estimators = (
        LogisticRegression(),
        SVC(probability=True, verbose=2),
        MultinomialNB(),
        RandomForestClassifier(),
    )
    # One colour per estimator, matched by position.
    palette = ('blue', 'red', 'green', 'black')

    fit_mails = parse_mails(TRAIN_ALL['filename'])
    fit_labels = TRAIN_ALL['label']
    eval_mails = parse_mails(TEST_ALL['filename'])
    eval_labels = TEST_ALL['label']

    # Score every estimator first; plotting only starts once all are done.
    curves = [
        roc_model_score(AntispamModel(estimator), fit_mails, fit_labels,
                        eval_mails, eval_labels)
        for estimator in estimators
    ]

    for curve, shade in zip(curves, palette):
        name, tpr, fpr = curve
        plt.plot(fpr, tpr, lw=2, color=shade, label=name)
        plt.fill_between(fpr, tpr, alpha=0.1)

    plt.legend(loc='lower right')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.savefig('doc/charts/ROC_ALL.png')
    plt.show()
Exemplo n.º 2
0
def logistic_regression():
    """Train a default LogisticRegression spam model and plot its ROC curve.

    The model is trained on the TRAIN_ALL mails/labels and the curve is
    plotted for the TEST_ALL mails/labels via ``model.plot_roc_curve``.
    """
    fit_mails = parse_mails(TRAIN_ALL['filename'])
    model = AntispamModel(LogisticRegression())
    model.train(fit_mails, TRAIN_ALL['label'])

    # The test set is parsed only after training has finished.
    eval_mails = parse_mails(TEST_ALL['filename'])
    model.plot_roc_curve(eval_mails, TEST_ALL['label'])
Exemplo n.º 3
0
def prepare_model():
    """Build an AntispamModel around LogisticRegression(C=3.5) and train it.

    The model is trained on the TRAIN_ALL mails and labels; a progress
    message is printed before and after.

    Returns:
        The trained ``AntispamModel`` instance.
    """
    # A parenthesised single argument prints the same text under the
    # Python 2 print statement used by this file.
    print("Preparing model...")
    model = AntispamModel(LogisticRegression(C=3.5))
    mails = parse_mails(TRAIN_ALL['filename'])
    labels = TRAIN_ALL['label']
    model.train(mails, labels)
    print("Model ready!")
    return model
Exemplo n.º 4
0
def grid_test(clf, params, n_folds=5, filepath=None,
              extra_params=None):
    """Grid-search ``params`` for ``clf`` and chart the collected ROC curves.

    The classifier is wrapped in an ``AntispamModel`` and its
    ``spam_filter`` is searched with ``GridSearchCV`` over the COMPLETE_ALL
    mails, scored by a ``ROCScorer``.  The score table is printed, each
    result is stored through ``save_grid_test_result`` and drawn on a
    two-panel figure: the full ROC plane on top and the
    FPR 0-0.2 / TPR 0.8-1 corner below.

    Args:
        clf: classifier handed to ``AntispamModel``.
        params: parameter grid for ``GridSearchCV``; its keys are also
            passed to ``ROCScorer``.
        n_folds: second argument of ``StratifiedKFold``.
        filepath: when truthy, the figure is saved to this path.
        extra_params: optional keyword settings applied to
            ``model.spam_filter`` via ``set_params`` before the search.
            ``None`` (the default) applies nothing.

    Returns:
        The fitted ``GridSearchCV`` object (created with ``refit=False``).
    """
    # None instead of a shared mutable ``{}`` default.
    if extra_params is None:
        extra_params = {}

    train_mails = parse_mails(COMPLETE_ALL['filename'])
    train_labels = COMPLETE_ALL['label']
    model = AntispamModel(clf)
    model.spam_filter.set_params(**extra_params)
    scorer = ROCScorer(params.keys())
    cv = StratifiedKFold(train_labels, n_folds)
    grid_search = GridSearchCV(
        model.spam_filter, params, scoring=scorer,
        n_jobs=1, cv=cv, refit=False, verbose=2
    )
    grid_search.fit(train_mails, train_labels)

    print_scores_table(grid_search.grid_scores_)

    plt.figure(figsize=(8, 12))
    # sorted() does not depend on items() returning a list; results are
    # ordered by ascending AUC.
    interp_scores = sorted(scorer.interp_scores.items(),
                           key=lambda item: item[1].auc)
    scores_count = len(interp_scores)

    # ``combo`` rather than ``params`` so the function argument is not
    # shadowed by the loop variable.
    for (combo, score), (ls, lc) in zip(interp_scores, linestyles_gen()):
        save_grid_test_result(clf, combo, score)

        label = "%s (AUC: %.5f)" % (combo, score.auc)
        # Same curve on both panels; the fill gets fainter the more
        # curves share the chart.
        for panel in (1, 2):
            plt.subplot(2, 1, panel)
            score.plot(label=label, lc=lc, ls=ls,
                       fill_alpha=0.5 / scores_count)

    # Top panel: whole ROC plane, with a dashed box around the region that
    # the bottom panel zooms into.
    plt.subplot(2, 1, 1)
    plt.grid(True)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc='lower right', fontsize='medium')
    plt.gca().add_patch(
        plt.Rectangle((0, 0.8), 0.2, 0.2, ls='dashed', fc='none')
    )
    plt.xlim(-0.05, 1)
    plt.ylim(0, 1.05)

    # Bottom panel: the low-FPR / high-TPR corner.
    plt.subplot(2, 1, 2)
    plt.grid(True)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.xlim(0, 0.2)
    plt.ylim(0.8, 1)

    if filepath:
        plt.savefig(filepath)
    plt.close()
    return grid_search
Exemplo n.º 5
0
def classifiers_comparison():
    """Cross-validate five pre-tuned classifiers and chart their ROC curves.

    Each candidate is wrapped in an ``AntispamModel``, configured with its
    parameter set and scored with 5-fold ``cross_val_score`` on the
    COMPLETE_ALL mails using a ``ROCScorer``.  The first score collected by
    the scorer is drawn on a two-panel figure (full ROC plane on top, the
    FPR 0-0.2 / TPR 0.8-1 corner below), which is saved to
    ``doc/charts/ROC_ALL.png`` and then displayed.
    """
    # (legend label, classifier, parameters for model.spam_filter)
    candidates = [
        ("Regresja logistyczna",
         LogisticRegression(),
         {'classifier__C': 5.0}),

        ("Naiwny klas. bayesowski",
         MultinomialNB(),
         {'classifier__alpha': 0.1}),

        ("SVM (liniowy)",
         SVC(kernel='linear', probability=True),
         {'classifier__C': 3.5,
          'features__text_words': 500, 'features__subject_words': 50}),

        ("SVM (RBF)",
         SVC(kernel='rbf', probability=True),
         {'classifier__C': 0.5, 'classifier__gamma': 0.1,
          'features__text_words': 500, 'features__subject_words': 50}),

        ("Las drzew losowych",
         RandomForestClassifier(),
         {'classifier__n_estimators': 100}),
    ]
    total = len(candidates)

    mails = parse_mails(COMPLETE_ALL['filename'])
    labels = COMPLETE_ALL['label']
    plt.figure(figsize=(8, 12))

    for (title, estimator, tuned), (ls, lc) in zip(candidates,
                                                   linestyles_gen()):
        model = AntispamModel(estimator)
        model.spam_filter.set_params(**tuned)
        folds = StratifiedKFold(labels, 5)
        scorer = ROCScorer(tuned.keys())
        cross_val_score(model.spam_filter, mails, labels,
                        cv=folds, scoring=scorer, verbose=2)
        # Only the first score held by the scorer is plotted.
        curve = list(scorer.interp_scores.values())[0]
        for panel in (1, 2):
            plt.subplot(2, 1, panel)
            curve.plot(label=title, lc=lc, ls=ls, fill_alpha=0.5 / total)

    # Top panel: whole ROC plane plus a dashed box around the zoomed area.
    plt.subplot(2, 1, 1)
    plt.grid(True)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc='lower right', fontsize='medium')
    top_axes = plt.gca()
    top_axes.add_patch(
        plt.Rectangle((0, 0.8), 0.2, 0.2, ls='dashed', fc='none')
    )
    plt.xlim(-0.05, 1)
    plt.ylim(0, 1.05)

    # Bottom panel: the low-FPR / high-TPR corner.
    plt.subplot(2, 1, 2)
    plt.grid(True)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.xlim(0, 0.2)
    plt.ylim(0.8, 1)

    plt.savefig('doc/charts/ROC_ALL.png')
    plt.show()
Exemplo n.º 6
0
        elif isinstance(X, DataWrapper):
            X = X.df

        if len(cols) == 1:
            t = X[cols[0]]
        else:
            t = X.as_matrix(cols)

        if np.all(X.dtypes[cols] == 'object'):
            t = np.array(t, dtype='|U')  # '|U' instead of '|S'

        return t


class StubExtractor(BaseEstimator, TransformerMixin):
    """Pass-through transformer: ``transform`` returns its input unchanged."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is accepted and ignored so the transformer can be fitted with
        or without labels.  Returning ``self`` (rather than ``None``) lets
        callers chain ``fit(...).transform(...)``, which is what
        ``TransformerMixin.fit_transform`` does.
        """
        return self

    def transform(self, X):
        """Return ``X`` as-is."""
        return X


if __name__ == '__main__':
    # Manual run: parse the training mails and instantiate the extractor.
    training_files = TRAIN_ALL['filename']
    mails_data = parse_mails(training_files)
    fext = FeaturesExtractor()
    # The extraction calls below were left disabled in the original script.
    # print len(fext.extract())
    # fext.extract()