예제 #1
0
def test_density():
    """Check that ``density`` agrees between a dense array and its sparse forms."""
    random_state = np.random.RandomState(0)
    dense = random_state.randint(10, size=(10, 5))
    # Plant two explicit zeros in the dense matrix.
    dense[1, 2] = 0
    dense[5, 3] = 0

    sparse_variants = [
        to_sparse(dense)
        for to_sparse in (sparse.csr_matrix, sparse.csc_matrix,
                          sparse.coo_matrix, sparse.lil_matrix)
    ]

    for variant in sparse_variants:
        assert_equal(density(variant), density(dense))
예제 #2
0
def test_density():
    """Compare ``density`` of a dense integer matrix with its CSR/CSC/COO/LIL copies."""
    generator = np.random.RandomState(0)
    matrix = generator.randint(10, size=(10, 5))
    # Zero out two fixed cells of the dense matrix.
    for row, col in ((1, 2), (5, 3)):
        matrix[row, col] = 0

    as_csr = sparse.csr_matrix(matrix)
    as_csc = sparse.csc_matrix(matrix)
    as_coo = sparse.coo_matrix(matrix)
    as_lil = sparse.lil_matrix(matrix)

    for sparse_matrix in [as_csr, as_csc, as_coo, as_lil]:
        assert_equal(density(sparse_matrix), density(matrix))
예제 #3
0
def gridsearch_svm(Xtrain, Ytrain, Xval, Yval):
    """Grid-search an SVC on scaled, factor-analysed data and report on Xval.

    The training data is scaled with ``scale_data`` and reduced with a
    ``FactorAnalysis`` model; the validation data goes through the same
    scaler and the same fitted factor model before prediction.

    Args:
        Xtrain: training features, passed to ``scale_data``.
        Ytrain: training labels passed to ``GridSearchCV.fit``.
        Xval: validation features.
        Yval: validation labels used for the printed metrics.

    Returns:
        Tuple ``(classifier, scaler)``: the fitted ``GridSearchCV`` object and
        the scaler returned by ``scale_data(Xtrain)``.
    """
    # ---------------------------------- Scaling
    X1, scaler = scale_data(Xtrain)
    X2 = scale_data(Xval, scaler)
    # ---------------------------------- Factor analysis
    fa = FactorAnalysis()
    X1 = fa.fit_transform(X1)
    # Project the validation data with the model fitted on the training data.
    # (Calling ``fit`` here would refit on validation data and return the
    # estimator itself instead of a feature matrix.)
    X2 = fa.transform(X2)
    # ---------------------------------- Cross validation and grid search
    cv = ShuffleSplit(len(Xtrain),
                      n_iter=1,
                      train_size=0.25,
                      test_size=.03,
                      random_state=0)
    params = {'C': [1, 10], 'kernel': ['rbf', 'linear']}
    svr = svm.SVC(verbose=True, shrinking=False)
    classifier = grid_search.GridSearchCV(svr, params, verbose=3, cv=cv)
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    # ---------------------------------- Prediction on validation set:
    t0 = time()
    pred = list(classifier.predict(X2))
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    if hasattr(classifier, 'coef_'):
        print("dimensionality: %d" % classifier.coef_.shape[1])
        print("density: %f" % density(classifier.coef_))
    # Single-argument print call: same output on Python 2 and Python 3.
    print("F1-score :  %s" % f1_score(Yval, pred, average='binary'))
    print("classification report:")
    print(classification_report(Yval, pred, target_names=['0', '1'], digits=4))
    print("confusion matrix:")
    print(confusion_matrix(Yval, pred))
    return classifier, scaler
예제 #4
0
def train_svm(Xtrain, Ytrain, Xval=None, Yval=None):
    """Train an RBF SVC on scaled data and optionally report on a validation set.

    Args:
        Xtrain: training features, scaled via ``scale_data``.
        Ytrain: training labels.
        Xval: optional validation features; when given they are scaled with
            the scaler obtained from the training data.
        Yval: validation labels, used only when ``Xval`` is given.

    Returns:
        Tuple ``(classifier, scaler)``: the fitted SVC and the scaler returned
        by ``scale_data(Xtrain)``.
    """
    X1, scaler = scale_data(Xtrain)
    if Xval is not None:
        X2 = scale_data(Xval, scaler)
    # initialize basic SVM
    classifier = svm.SVC(verbose=True, shrinking=False, C=10, kernel='rbf')
    # train
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    if Xval is not None:
        # prediction on validation set:
        t0 = time()
        pred = list(classifier.predict(X2))
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)
        if hasattr(classifier, 'coef_'):
            print("dimensionality: %d" % classifier.coef_.shape[1])
            print("density: %f" % density(classifier.coef_))
        # Single-argument print call: same output on Python 2 and Python 3
        # (the former print statement is a SyntaxError on Python 3).
        print("F1-score :  %s" % f1_score(Yval, pred, average='binary'))
        print("classification report:")
        print(
            classification_report(Yval,
                                  pred,
                                  target_names=['0', '1'],
                                  digits=4))
        print("confusion matrix:")
        print(confusion_matrix(Yval, pred))
    return classifier, scaler
예제 #5
0
def benchmark(clf):
    """Fit ``clf`` on X_train/y_train, predict X_test and print timings and F1.

    The data sets and ``feature_names`` are read from the enclosing scope.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)`` where
        ``clf_descr`` is the text of ``str(clf)`` before the first ``(``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if feature_names is not None:
            print("top 10 keywords per class:")
        print()

    # The confusion matrix is computed but only its heading is printed.
    print("confusion matrix:")
    cm = metrics.confusion_matrix(y_test, pred)

    print()
    return str(clf).partition('(')[0], score, train_time, test_time
예제 #6
0
def benchmark(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf``, predict the test set and print timings, accuracy and confusion matrix.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)``; ``score`` is the
        accuracy on the test set and ``clf_descr`` the text of ``str(clf)``
        before the first ``(``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    started = time()
    clf.fit(X_train, y_train)
    train_time = time() - started
    print("train time: %0.3fs" % train_time)

    started = time()
    predictions = clf.predict(X_test)
    test_time = time() - started
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, predictions)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print()

    print("confusion matrix:")
    confusion = metrics.confusion_matrix(y_test, predictions)
    print(confusion)

    print()
    return str(clf).partition('(')[0], score, train_time, test_time
예제 #7
0
def benchmark(clf):
    """Fit ``clf`` on X/Y, predict X_dev and print timings and accuracy.

    The data sets and ``opts`` are read from the enclosing scope. The
    confusion matrix is printed only when ``opts.print_cm`` is truthy.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)``.
    """
    print("_" * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(X, Y)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    dev_pred = clf.predict(X_dev)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(Y_dev, dev_pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, "coef_"):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(Y_dev, dev_pred))

    print()
    return str(clf).partition("(")[0], score, train_time, test_time
예제 #8
0
def report_accuracy(model, categories, test_target, predicted):
    """Print F1, top keywords per category, classification report and confusion matrix.

    Args:
        model: object exposing ``named_steps`` with a ``'clf'`` step and, when
            that classifier has ``coef_``, a ``'vect'`` step providing
            ``get_feature_names()``.
        categories: category names; also used as ``target_names`` in the
            classification report.
        test_target: true labels.
        predicted: predicted labels.
    """
    # All output goes through single-argument print calls so the function
    # runs unchanged on Python 2 and Python 3.
    score = metrics.f1_score(test_target, predicted)
    print("f1-score: {:.3f}".format(score))

    clf = model.named_steps['clf']
    if hasattr(clf, 'coef_'):
        coef = clf.coef_
        print("dimensionality: {}".format(coef.shape[1]))
        print("density: {}".format(density(coef)))

        print("top 15 keywords per class:")
        feature_names = np.asarray(
            model.named_steps['vect'].get_feature_names())
        for i, category in enumerate(categories):
            # argsort is ascending, so the last 15 indices are the largest
            # coefficients for this category.
            topkw = np.argsort(coef[i])[-15:]
            keywords = '\n\t'.join(
                textwrap.wrap(", ".join(feature_names[topkw])))
            print("{}: {}".format(category, keywords))
        print("")

    print("classification report:")
    print(metrics.classification_report(test_target,
                                        predicted,
                                        target_names=categories))

    print("confusion matrix:")
    print(metrics.confusion_matrix(test_target, predicted))
    print("")
예제 #9
0
def train_rf(Xtrain, Ytrain, Xval=None, Yval=None):
    """Train a 100-tree random forest on scaled data and optionally validate it.

    Args:
        Xtrain: training features, scaled via ``scale_data``.
        Ytrain: training labels; flattened with ``np.ravel`` before fitting.
        Xval: optional validation features, scaled with the training scaler.
        Yval: validation labels, used only when ``Xval`` is given.

    Returns:
        Tuple ``(classifier, scaler)``: the fitted forest and the scaler
        returned by ``scale_data(Xtrain)``.
    """
    X1, scaler = scale_data(Xtrain)
    classifier = RandomForestClassifier(n_estimators=100, verbose=True)
    t0 = time()
    classifier.fit(X1, np.ravel(Ytrain))
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    if Xval is not None:
        X2 = scale_data(Xval, scaler)
        t0 = time()
        # NOTE(review): ``as_matrix`` suggests X2 is a pandas DataFrame; that
        # method was removed in pandas 1.0 -- confirm the pandas version in use.
        pred = classifier.predict(X2.as_matrix())
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        if hasattr(classifier, 'coef_'):
            print("dimensionality: %d" % classifier.coef_.shape[1])
            print("density: %f" % density(classifier.coef_))
        # Single-argument print call: same output on Python 2 and Python 3
        # (the former print statement is a SyntaxError on Python 3).
        print("F1-score :  %s" % f1_score(Yval, pred, average='binary'))
        print("classification report:")
        print(
            classification_report(Yval,
                                  pred,
                                  target_names=['0', '1'],
                                  digits=4))
        print("confusion matrix:")
        print(confusion_matrix(Yval, pred))
    return classifier, scaler
예제 #10
0
def report_accuracy(model, categories, test_target, predicted):
    """Print the F1 score, top keywords, classification report and confusion matrix.

    Args:
        model: object exposing ``named_steps`` with a ``'clf'`` step and, when
            that classifier has ``coef_``, a ``'vect'`` step providing
            ``get_feature_names()``.
        categories: category names; also used as ``target_names`` in the
            classification report.
        test_target: true labels.
        predicted: predicted labels.
    """
    # Single-argument print calls behave identically on Python 2 and 3; the
    # original print statements do not parse on Python 3.
    score = metrics.f1_score(test_target, predicted)
    print("f1-score: {:.3f}".format(score))

    clf = model.named_steps['clf']
    if hasattr(clf, 'coef_'):
        coef = clf.coef_
        print("dimensionality: {}".format(coef.shape[1]))
        print("density: {}".format(density(coef)))

        print("top 15 keywords per class:")
        feature_names = np.asarray(model.named_steps['vect'].get_feature_names())
        for i, category in enumerate(categories):
            # Ascending argsort: the final 15 indices carry the largest weights.
            topkw = np.argsort(coef[i])[-15:]
            keywords = '\n\t'.join(textwrap.wrap(
                ", ".join(feature_names[topkw])
            ))
            print("{}: {}".format(category, keywords))
        print("")

    print("classification report:")
    print(metrics.classification_report(test_target, predicted,
                                        target_names=categories))

    print("confusion matrix:")
    print(metrics.confusion_matrix(test_target, predicted))
    print("")
예제 #11
0
def benchmark(clf):
    """Fit ``clf`` on X_train/y_train, evaluate on X_test and log the results.

    The data sets and ``logger`` are read from the enclosing scope.

    Returns:
        Tuple ``(precision, recall, f1, train_time, test_time)`` computed on
        the test set; the two times are in seconds.
    """
    print('en benchmark')
    logger.info('_' * 80)
    logger.info("Entrenamiento: ")
    logger.info(clf)
    t0 = time.time()

    clf.fit(X_train, y_train)

    train_time = time.time() - t0
    logger.info("tiempo de entrenamiento: %0.3fs", train_time)

    t0 = time.time()
    pred = clf.predict(X_test)
    test_time = time.time() - t0
    logger.info("tiempo de test:  %0.3fs", test_time)

    precision = metrics.precision_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)
    f1 = metrics.f1_score(y_test, pred)
    # Log the F1 value under the F1 label (the recall used to be logged here).
    logger.info("f1-score:   %0.3f", f1)

    if hasattr(clf, 'coef_'):
        logger.info("dimensionalidad: %d", clf.coef_.shape[1])
        logger.info("densidad: %f", density(clf.coef_))

    logger.info("Informe de clasificacion:")
    logger.info(metrics.classification_report(y_test, pred))

    logger.info("Matriz de confusion:")
    logger.info(metrics.confusion_matrix(y_test, pred))

    return precision, recall, f1, train_time, test_time
예제 #12
0
def benchmarkDoc2Vec(clf):
    """Fit ``clf`` on x_train1/y_train and report its performance on x_test1.

    The data sets and ``class_names`` are read from the enclosing scope.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)`` where ``score`` is
        the accuracy on the test set.
    """
    print('_' * 80)
    print("Training Doc2Vec: ")
    print(clf)

    # Fit on the training features and targets.
    fit_start = time()
    clf.fit(x_train1, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    # Predict the held-out features.
    predict_start = time()
    pred1 = clf.predict(x_test1)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    # Accuracy of the predictions against the test targets.
    score = metrics.accuracy_score(y_test, pred1)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    report = metrics.classification_report(y_test, pred1, target_names=class_names)
    print(report)

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred1))

    print()
    return str(clf).partition('(')[0], score, train_time, test_time
예제 #13
0
def benchmarkWithCV(clf, X, y, n_folds):
    """Benchmark ``clf`` with stratified k-fold cross-validation.

    Args:
        clf: estimator with ``fit`` and ``predict``.
        X: feature matrix, indexable by the fold index arrays.
        y: labels, indexable by the fold index arrays.
        n_folds: number of folds passed to ``StratifiedKFold``.

    Returns:
        Tuple ``(clf_descr, mean_score, train_time, test_time)``:
        ``mean_score`` is the mean accuracy over all folds, and the two times
        are those of the last fold, in seconds.
    """
    print('-' * 80)
    print("Training: ")
    print(clf)
    cv = StratifiedKFold(y, n_folds)
    cv_scores = []
    for train, test in cv:
        # Restart the clock for every fold; a single timer started before the
        # loop would make later folds include the previous fold's prediction
        # and scoring time in their training time.
        t0 = time()
        clf.fit(X[train], y[train])
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X[test])
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)
        score = metrics.accuracy_score(y[test], pred)
        cv_scores.append(score)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

    clf_descr = str(clf).split('(')[0]
    mean_score = np.mean(cv_scores)
    return clf_descr, mean_score, train_time, test_time
예제 #14
0
def benchmark(clf):
    """Fit ``clf`` on X_train/y_train, predict X_test and print timings and error.

    The data sets and the scoring callable ``metric`` are read from the
    enclosing scope; the score is printed under the label "MAE".

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)``.
    """
    # Single-argument print calls produce the same output on Python 2 and 3;
    # the original print statements do not parse on Python 3.
    print(80 * '_')
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metric(y_test, pred)
    print("MAE:  %0.3f" % score)

    if hasattr(clf, 'alpha_'):
        print("Alpha %s" % (clf.alpha_,))

    try:
        if hasattr(clf, 'coef_'):
            print("density: %f" % density(clf.coef_))
            print("dimensionality: %d" % clf.coef_.shape[0])

            print("")
    except Exception as ex:
        # Diagnostics are best-effort: report the problem and carry on.
        print(ex)

    print("")
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
예제 #15
0
def _classify(clf, cluster_data, X_train, y_train, X_test, feature_names,
             categories, c_params):
    """Fit ``clf`` and store its predictions for ``X_test`` on ``cluster_data``.

    The predictions are written to ``cluster_data.cluster_of_posts``. When the
    classifier exposes ``coef_``, its dimensionality and density are printed,
    plus the ten highest-weighted features per category if ``feature_names``
    is given.

    Returns:
        Tuple ``(clf_descr, train_time, test_time)``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    cluster_data.cluster_of_posts = clf.predict(X_test)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if feature_names is not None:
            print("top 10 keywords per class:")
            for row, category in enumerate(categories):
                # Ascending argsort: the last ten indices are the largest weights.
                best = np.argsort(clf.coef_[row])[-10:]
                line = "%s: %s" % (category, " ".join(feature_names[best]))
                print(trim(line))
        print()

    if c_params.is_report_printed:
        print("classification report:")

    print()
    return str(clf).partition('(')[0], train_time, test_time
def benchmark_features_selection(clf,name):
    """Run RFECV feature selection around ``clf``, plot the CV scores and store predictions.

    The data sets, ``cursor`` and ``testing_identifiant_produit_list`` are read
    from the enclosing scope. Predictions for ``X_test`` are handed to
    ``save_results_data``.

    Returns:
        Tuple ``(clf_descr, train_time, test_time)``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    folds = StratifiedKFold(y_train, 2)
    selector = RFECV(estimator=clf, step=1, cv=folds, scoring='accuracy')
    selector.fit(X_train, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    print(name+"Optimal number of features : %d" % selector.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    feature_counts = range(1, len(selector.grid_scores_) + 1)
    plt.plot(feature_counts, selector.grid_scores_)
    plt.show()

    predict_start = time()
    pred = selector.predict(X_test)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    # NOTE(review): this inspects the estimator passed in, not the selector's
    # fitted copy -- confirm that is intended.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("Saving data to database:")
    save_results_data(cursor, name, testing_identifiant_produit_list, pred)
    print()
    return str(clf).partition('(')[0], train_time, test_time
    def benchmark(clf):
        """Fit ``clf`` on X_train/y_train and print a full report for X_test.

        The data sets, ``categories`` and ``feature_names`` are read from the
        enclosing scope.

        Returns:
            Tuple ``(clf_descr, score, train_time, test_time)`` where ``score``
            is the accuracy on the test set.
        """
        print('_' * 80)
        print("Training: ")
        print(clf)

        fit_start = time()
        clf.fit(X_train, y_train)
        train_time = time() - fit_start
        print("train time: %0.3fs" % train_time)

        predict_start = time()
        pred = clf.predict(X_test)
        test_time = time() - predict_start
        print("test time:  %0.3fs" % test_time)

        score = metrics.accuracy_score(y_test, pred)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
            print("top 10 keywords per class:")
            for row, category in enumerate(categories):
                # Ascending argsort: the last ten indices are the largest weights.
                best = np.argsort(clf.coef_[row])[-10:]
                keywords = " ".join(feature_names[best]).encode("utf-8")
                print(trim("%s: %s" % (category, keywords)))
            print()

        print("classification report:")
        report = metrics.classification_report(y_test, pred,target_names=categories)
        print(report)

        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

        print()
        return str(clf).partition('(')[0], score, train_time, test_time
def benchmark(clf, clf_name):
    """Fit ``clf`` on the standardised training data, report on the test data and save the confusion matrix.

    ``x_train_std``, ``x_test_std``, ``y_train`` and ``y_test`` are read from
    the enclosing scope. The confusion matrix, the predictions and
    ``clf_name`` are passed to ``save_confusion_matrix``.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)`` where ``score`` is
        the accuracy on the test set.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(x_train_std, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    pred = clf.predict(x_test_std)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    labels = ["not helpful", "helpful"]
    print(metrics.classification_report(y_test, pred, target_names=labels))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).partition('(')[0]
    matrix_to_save = confusion_matrix(y_test, pred)
    save_confusion_matrix(matrix_to_save, pred, clf_name)
    return clf_descr, score, train_time, test_time
    def getReport(self, save=1, get_top_words=0):  # returns report
        """Build a text report for the stored test predictions.

        The report contains accuracy, a classification report and a confusion
        matrix for ``self.y_test`` against ``self.y_pred``. When
        ``get_top_words`` is 1 and ``self.mlModel`` has ``coef_``, it starts
        with the model's dimensionality, density and the features at both ends
        of the ranking of ``coef_[0]``.

        Args:
            save: when 1, the report is written to ``report.txt`` under
                ``self.model_path``.
            get_top_words: when 1, include the keyword section.

        Returns:
            The report as a single string.
        """
        pieces = []
        if get_top_words == 1 and hasattr(self.mlModel, 'coef_'):
            pieces.append(
                "Dimensionality: " + str(self.mlModel.coef_.shape[1]))
            pieces.append("\nDensity: " + str(density(self.mlModel.coef_)))

            # Ascending order: the tail holds the largest coefficients, the
            # head the smallest; 20 entries are taken from each end.
            order = np.argsort(self.mlModel.coef_[0])
            strongest = order[-20:]
            weakest = order[:20]
            pieces.append("\n\nTop 10 keywords: ")
            pieces.append(
                "\nPositive: " + " ".join(self.feature_names[strongest]))
            pieces.append(
                "\nNegative: " + " ".join(self.feature_names[weakest]))

        score = metrics.accuracy_score(self.y_test, self.y_pred)
        pieces.append("\n\nAccuracy: " + str(score))
        pieces.append("\nClassification report: ")
        class_report = metrics.classification_report(
            self.y_test, self.y_pred, target_names=["Negative", "Positive"])
        pieces.append("\n\n" + str(class_report))
        pieces.append("\nConfusion matrix: ")
        confusion = metrics.confusion_matrix(self.y_test, self.y_pred)
        pieces.append("\n\n" + str(confusion) + "\n\n")

        report = "".join(pieces)

        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)

        return report
예제 #20
0
    def benchmark(self, clf):
        """Fit ``clf`` on the instance's training data and report on its test data.

        Prints timings, accuracy and the confusion matrix. When the classifier
        has ``coef_``, also prints its dimensionality, density and the ten
        highest-weighted features for each entry of ``self.target_names``.

        Returns:
            Tuple ``(clf_descr, score, train_time, test_time)``.
        """
        print('_' * 80)
        print("Training: ")
        print(clf)

        fit_start = time.time()
        clf.fit(self.X_train, self.y_train)
        train_time = time.time() - fit_start
        print("train time: %0.3fs" % train_time)

        predict_start = time.time()
        pred = clf.predict(self.X_test)
        test_time = time.time() - predict_start
        print("test time:  %0.3fs" % test_time)

        score = metrics.accuracy_score(self.y_test, pred)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            print("top 10 keywords per class:")
            for row, label in enumerate(self.target_names):
                # Ascending argsort: the last ten indices are the largest weights.
                best = np.argsort(clf.coef_[row])[-10:]
                line = "%s: %s" % (label, " ".join(self.feature_names[best]))
                print(self.trim(line))
        print()

        print("confusion matrix:")
        print(metrics.confusion_matrix(self.y_test, pred))

        print()
        return str(clf).partition('(')[0], score, train_time, test_time
예제 #21
0
def benchmark(clf):
    """Fit ``clf`` on X_train/y_train and print an F1-based report for X_test.

    The data sets and ``categories`` are read from the enclosing scope.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)`` where ``score`` is
        the F1 score on the test set.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    report = metrics.classification_report(y_test, pred,
                                           target_names=categories)
    print(report)

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    return str(clf).partition('(')[0], score, train_time, test_time
예제 #22
0
def benchmark(clf):
    """Train ``clf``, predict the test set and print timings, F1 and reports.

    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``categories`` are
    read from the enclosing scope.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)`` where ``score`` is
        the F1 score on the test set.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    started = time()
    clf.fit(X_train, y_train)
    train_time = time() - started
    print("train time: %0.3fs" % train_time)

    started = time()
    pred = clf.predict(X_test)
    test_time = time() - started
    print("test time:  %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        coef_shape = clf.coef_.shape
        print("dimensionality: %d" % coef_shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))

    print("confusion matrix:")
    confusion = metrics.confusion_matrix(y_test, pred)
    print(confusion)

    print()
    return str(clf).partition('(')[0], score, train_time, test_time
def benchmark(clf):
    """Fit ``clf`` on X_train/Y_train and print a report for X_test.

    The data sets and ``target_names`` are read from the enclosing scope.
    The returned score is the accuracy on the *training* data.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)``.
    """
    ## Benchmark classifiers
    ## Modified after: http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html#sphx-glr-auto-examples-text-document-classification-20newsgroups-py
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(X_train, Y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    Y_pred = clf.predict(X_test)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    score = clf.score(X_train, Y_train)
    print("Training accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    report = metrics.classification_report(Y_test,
                                           Y_pred,
                                           target_names=target_names)
    print(report)

    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, Y_pred))

    print()
    return str(clf).partition('(')[0], score, train_time, test_time
예제 #24
0
def test(classifier):
    """Fit ``classifier`` on the training matrix and report on the test matrix.

    ``train_matrix``, ``train_y``, ``test_matrix``, ``test_y``, ``stars`` and
    ``feature_list`` are read from the enclosing scope. The keyword list,
    classification report and confusion matrix are printed only when the
    classifier exposes ``coef_``.

    Returns:
        Tuple ``(classifier_name, score, train_time, test_time)`` where
        ``score`` is the accuracy on the test set.
    """
    print('\n\n')
    print("Training: ")
    print(classifier)

    fit_start = time()
    classifier.fit(train_matrix, train_y)
    train_time = time() - fit_start
    print("train time: %0.4fs" % train_time)

    predict_start = time()
    pred = classifier.predict(test_matrix)
    test_time = time() - predict_start
    print("test time:  %0.4fs" % test_time)

    score = metrics.accuracy_score(test_y, pred)
    print("accuracy:   %0.4f" % score)

    if hasattr(classifier, 'coef_'):
        print("dimensionality: %d" % classifier.coef_.shape[1])
        print("density: %f" % density(classifier.coef_))

        print("top 50 keywords per rating:")
        for rating in stars:
            # Row ``rating - 1`` of coef_ is used for each value in ``stars``.
            best = np.argsort(classifier.coef_[rating - 1])[-50:]
            line = "%d: %s" % (rating, " ".join(feature_list[best]))
            print(trim(line))
        print()

        print("Classification report:")
        print(metrics.classification_report(test_y, pred))

        print("Confusion matrix:")
        print(metrics.confusion_matrix(test_y, pred))

    return str(classifier).partition('(')[0], score, train_time, test_time
예제 #25
0
    def benchmark(clf):
        """Fit ``clf`` on the training data, report on the test data and plot confusion matrices.

        ``X_train_flip``, ``X_test_flip``, ``y_train``, ``y_test`` and
        ``settings`` are read from the enclosing scope; ``settings[4]`` is used
        as the class names. ``clf.steps[1][0]`` is printed as a heading.

        Returns:
            Tuple ``(clf, clf_descr, score, train_time, test_time)`` where
            ``score`` is the accuracy on the test set.
        """
        rule = '_' * 80
        print(rule)
        print(clf.steps[1][0])
        print(rule)
        print("Training: ")

        fit_start = time()
        clf.fit(X_train_flip, y_train)
        train_time = time() - fit_start
        print("train time: %0.3fs" % train_time)

        predict_start = time()
        pred = clf.predict(X_test_flip)
        test_time = time() - predict_start
        print("test time:  %0.3fs" % test_time)

        score = metrics.accuracy_score(y_test, pred)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            print()

        print("classification report:")
        report = metrics.classification_report(y_test, pred,
                                               target_names=settings[4])
        print(report)

        cm = metrics.confusion_matrix(y_test, pred)
        np.set_printoptions(precision=2)

        # Plot non-normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cm, classes=settings[4],
                              title='Confusion matrix')

        # Plot normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cm, classes=settings[4], normalize=True,
                              title='Normalized confusion matrix')

        plt.show()

        print()
        return clf, str(clf).partition('(')[0], score, train_time, test_time
예제 #26
0
def benchmark(clf):
    """Fit ``clf`` on X_train/y_train, predict X_test and print a report.

    The data sets, ``opts``, ``feature_names`` and ``target_names`` are read
    from the enclosing scope. The optional sections are controlled by
    ``opts.print_top10``, ``opts.print_report`` and ``opts.print_cm``.

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)`` where ``score`` is
        the accuracy on the test set and the times are in seconds.
    """
    print('_'*80)
    print('training')
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print('train time%0.3f' % train_time)

    # Restart the clock so the test time covers prediction only; the timer
    # used to be stored under a misspelled name and training time leaked in.
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print('test time %0.3f' % test_time)
    score = metrics.accuracy_score(y_test, pred)
    print('accuracy:%0.3f' % score)

    if hasattr(clf, 'coef_'):
        print('dimensionality:%d' % clf.coef_.shape[1])
        print('density:%f' % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print('top10 keywords per class:')
            for i, label in enumerate(target_names):
                # Ascending argsort: the last ten indices are the largest weights.
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim('%s:%s' % (label, ' '.join(feature_names[top10]))))
        print()
    if opts.print_report:
        print('classfication report:')
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))

    if opts.print_cm:
        print('confusion matrix')
        print(metrics.confusion_matrix(y_test, pred))
    print()
    # Keep the description in a local instead of attaching it to the estimator.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
예제 #27
0
def benchmark(clf):
    """Train ``clf``, predict the test set and print timings and the error score.

    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and the scoring callable
    ``metric`` are read from the enclosing scope; the score is printed under
    the label "MAE".

    Returns:
        Tuple ``(clf_descr, score, train_time, test_time)``.
    """
    # Print statements are replaced by single-argument print calls, which give
    # the same output on Python 2 and also parse on Python 3.
    print(80 * '_')
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metric(y_test, pred)
    print("MAE:  %0.3f" % score)

    if hasattr(clf, 'alpha_'):
        print("Alpha %s" % (clf.alpha_,))

    try:
        if hasattr(clf, 'coef_'):
            print("density: %f" % density(clf.coef_))
            print("dimensionality: %d" % clf.coef_.shape[0])

            print("")
    except Exception as ex:
        # Diagnostics are best-effort: report the problem and carry on.
        print(ex)

    print("")
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
예제 #28
0
def train_logit(Xtrain, Ytrain, Xval=None, Yval=None):
    """Train an L1-penalised logistic regression on scaled data and optionally validate it.

    Args:
        Xtrain: training features, scaled via ``scale_data``.
        Ytrain: training labels.
        Xval: optional validation features, scaled with the training scaler.
        Yval: validation labels, used only when ``Xval`` is given.

    Returns:
        Tuple ``(classifier, scaler)``: the fitted model and the scaler
        returned by ``scale_data(Xtrain)``.
    """
    X1, scaler = scale_data(Xtrain)
    classifier = LogisticRegression(C=1, penalty='l1', tol=1e-4, verbose=True)
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    if Xval is not None:
        X2 = scale_data(Xval, scaler)
        t0 = time()
        pred = classifier.predict(X2)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        if hasattr(classifier, 'coef_'):
            print("dimensionality: %d" % classifier.coef_.shape[1])
            print("density: %f" % density(classifier.coef_))
        # Single-argument print call: same output on Python 2 and Python 3
        # (the former print statement is a SyntaxError on Python 3).
        print("F1-score :  %s" % f1_score(Yval, pred, average='binary'))
        print("classification report:")
        print(
            classification_report(Yval,
                                  pred,
                                  target_names=['0', '1'],
                                  digits=4))
        print("confusion matrix:")
        print(confusion_matrix(Yval, pred))
    return classifier, scaler
    def getReport(self,save = 1, get_top_words = 0):       # returns report
        """Assemble a text report comparing ``self.y_pred`` with ``self.y_test``.

        The report lists accuracy, a classification report and a confusion
        matrix. With ``get_top_words == 1`` and a model exposing ``coef_``, it
        is prefixed with the model's dimensionality, density and the features
        at both ends of the ranking of ``coef_[0]``.

        Args:
            save: when 1, write the report to ``report.txt`` under
                ``self.model_path``.
            get_top_words: when 1, include the keyword section.

        Returns:
            The report string.
        """
        sections = []
        if get_top_words == 1 and hasattr(self.mlModel, 'coef_'):
            sections.append("Dimensionality: " + str(self.mlModel.coef_.shape[1]))
            sections.append("\nDensity: " + str(density(self.mlModel.coef_)))

            # Ascending ranking of the first coefficient row; 20 entries are
            # taken from each end.
            ranking = np.argsort(self.mlModel.coef_[0])
            highest = ranking[-20:]
            lowest = ranking[:20]
            sections.append("\n\nTop 10 keywords: ")
            sections.append("\nPositive: " + " ".join(self.feature_names[highest]))
            sections.append("\nNegative: " + " ".join(self.feature_names[lowest]))

        score = metrics.accuracy_score(self.y_test, self.y_pred)
        sections.append("\n\nAccuracy: " + str(score))
        sections.append("\nClassification report: ")
        class_report = metrics.classification_report(
            self.y_test, self.y_pred, target_names=["Negative","Positive"])
        sections.append("\n\n" + str(class_report))
        sections.append("\nConfusion matrix: ")
        confusion = metrics.confusion_matrix(self.y_test, self.y_pred)
        sections.append("\n\n" + str(confusion) + "\n\n")

        report = "".join(sections)

        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)

        return report
예제 #30
0
def benchmark(clf):
    """Fit ``clf``, predict train and test data and print F1-based reports.

    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and, for models whose
    description contains 'tensorflow', ``epochs``, ``callbacks`` and
    ``batch_size`` are read from the enclosing scope. For those models the
    predictions are thresholded in place at 0.3 into 0/1 labels.

    Returns:
        Tuple ``(clf_descr, f1_score_train, f1_score, train_time)`` where
        ``train_time`` is the wall-clock fit duration in seconds.
    """
    print('_' * 80)
    print("Training: ")
    print(len(X_train))

    clf_descr = str(clf).split('(')[0]
    print("model name:" + clf_descr)

    is_tensorflow = 'tensorflow' in clf_descr

    started = datetime.now()

    if is_tensorflow:
        clf.fit(X_train,
                y_train,
                epochs=epochs,
                callbacks=callbacks,
                validation_data=(X_test, y_test),
                verbose=2,
                batch_size=batch_size)
    else:
        clf.fit(X_train, y_train)

    finished = datetime.now()
    # End minus start, as total seconds: ``start - end`` is negative and
    # ``.microseconds`` is only the sub-second component of a timedelta.
    train_time = (finished - started).total_seconds()
    print("train time: %0.3fs" % train_time)

    pred = clf.predict(X_test)

    pred_train = clf.predict(X_train)

    if is_tensorflow:
        # Turn the raw outputs into 0/1 labels with a 0.3 threshold.
        for outputs in (pred, pred_train):
            for i in range(len(outputs)):
                outputs[i] = 1 if outputs[i] >= 0.3 else 0

    f1_score = metrics.f1_score(y_test, pred)
    print("f1_score:   %0.3f" % f1_score)

    f1_score_train = metrics.f1_score(y_train, pred_train)
    print("f1_score_train:   %0.3f" % f1_score_train)

    print("classification report:")
    print(classification_report(y_test, pred))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    return clf_descr, f1_score_train, f1_score, train_time
def benchmark(clf):
    """Fit ``clf`` once per category as a "this class vs. 0" problem.

    For every entry of the module-level ``categories`` the labels in
    ``y_train`` / ``y_test`` that differ from that entry are replaced by 0,
    the classifier is refitted on ``X_train`` and its accuracy on ``X_test``
    is measured.

    Returns:
        ``(clf_descr, mean_accuracy, total_train_time, total_test_time)``.
        ``mean_accuracy`` is 0.0 when ``categories`` is empty.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    # Bound up front so it exists even when ``categories`` is empty.
    clf_descr = str(clf).split('(')[0]

    cumScore = 0
    cumTrainTime = 0
    cumTestTime = 0
    for classC in categories:
        print("*** One class model for : ", classC, " ***")
        # The timer deliberately starts before relabelling, as in the
        # per-class timings this function has always reported.
        t0 = time()
        new_y_train = np.array(
            [x if x == classC else 0 for x in y_train.tolist()])
        new_y_test = np.array(
            [x if x == classC else 0 for x in y_test.tolist()])
        clf.fit(X_train, new_y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)
        cumTrainTime += train_time

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)
        cumTestTime += test_time

        score = metrics.accuracy_score(new_y_test, pred)
        cumScore += score
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
            print()

        print()

    n_categories = len(categories)
    mean_score = cumScore / n_categories if n_categories else 0.0
    return clf_descr, mean_score, cumTrainTime, cumTestTime
예제 #32
0
def benchmark(clf,
              X_train,
              y_train,
              X_test,
              y_test,
              target_names,
              print_report=True,
              feature_names=None,
              print_top10=False,
              print_cm=True):
    """Fit ``clf``, predict on the test split and print accuracy diagnostics.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training data and labels.
        X_test, y_test: Evaluation data and labels.
        target_names: Class display names, used for the classification
            report and for labelling the per-class top-10 features.
        print_report: Print the classification report when true.
        feature_names: Indexable feature names; needed for the top-10 list.
        print_top10: Print the ten highest-weighted features per class when
            the estimator has ``coef_``.
        print_cm: Print the confusion matrix when true.

    Returns:
        ``(clf_descr, accuracy, train_time, test_time)``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            # Label rows with the caller's class names; the historical
            # hard-coded list is kept only as a fallback for callers that
            # pass ``target_names=None``.
            labels = target_names
            if labels is None:
                labels = [
                    "Make Update", "Setup Printer", "Shutdown Computer",
                    "Software Recommendation", "None"
                ]
            for i, label in enumerate(labels):
                top10 = np.argsort(clf.coef_[i])[-10:]
                top_features = " ".join([feature_names[j] for j in top10])
                print(trim("%s: %s" % (label, top_features)))
        print()

    if print_report:
        print("classification report:")
        print(
            metrics.classification_report(y_test,
                                          pred,
                                          target_names=target_names))

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
예제 #33
0
    def benchmark(clf):
        """Fit ``clf``, print test metrics and pick unlabeled samples to query.

        Uses ``X_train``/``y_train``/``X_test``/``y_test``, ``categories``,
        ``X_unlabeled`` and ``NUM_QUESTIONS`` from the enclosing scope.

        Returns:
            ``(clf_descr, f1, train_time, test_time, question_samples)`` where
            ``question_samples`` lists the indices of the ``NUM_QUESTIONS``
            lowest-confidence unlabeled samples followed by the
            ``NUM_QUESTIONS`` highest-confidence ones.
        """
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = metrics.f1_score(y_test, pred)
        accscore = metrics.accuracy_score(y_test, pred)
        print("pred count is %d" % len(pred))
        print('accuracy score:     %0.3f' % accscore)
        print("f1-score:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

        print("confidence for unlabeled data:")
        # Absolute decision value of every unlabeled sample (per class).
        confidences = np.abs(clf.decision_function(X_unlabeled))
        # With more than two classes, average |confidence| over the classes
        # so that each unlabeled sample ends up with a single value.
        if len(categories) > 2:
            confidences = np.average(confidences, axis=1)

        print(confidences)
        sorted_confidences = np.argsort(confidences)
        # The k least confident samples come first, the k most confident last.
        low_confidence_samples = sorted_confidences[0:NUM_QUESTIONS]
        high_confidence_samples = sorted_confidences[-NUM_QUESTIONS:]

        question_samples = (low_confidence_samples.tolist() +
                            high_confidence_samples.tolist())

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time, question_samples
    def benchmark(clf, section):
        """Fit ``clf``, print test metrics and log its strongest features.

        Uses ``X_train``/``y_train``/``X_test``/``y_test``, ``target_names``
        and ``feature_names`` from the enclosing scope.  When the fitted
        estimator has ``coef_``, the 50 lowest- and 50 highest-weighted
        features of ``coef_[0]`` are printed and appended to
        ``svm_top_features.tsv`` together with ``section``, the model name
        and selected tokens of the classification report.

        Args:
            clf: Estimator exposing ``fit`` and ``predict``.
            section: Value written to the ``section`` column of the TSV.

        Returns:
            ``(clf_descr, accuracy, train_time, test_time)``.
        """
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = metrics.accuracy_score(y_test, pred)
        print("accuracy:   %0.4f" % score)

        # Build the report once; it is both printed and tokenised below.
        report = metrics.classification_report(y_test,
                                               pred,
                                               target_names=target_names)
        class_matrix = report.split()

        print("classification report:")
        print(report)

        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

        print()

        clf_descr = str(clf).split('(')[0]

        if hasattr(clf, 'coef_'):
            # The ``with`` block closes the file; no explicit close() needed.
            with open("svm_top_features.tsv", "a", encoding="utf-8") as tsv2:

                print("dimensionality: %d" % clf.coef_.shape[1])
                print("density: %f" % density(clf.coef_))

                coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
                # Pair the i-th most negative weight with the i-th most
                # positive one.
                top = zip(coefs_with_fns[:50], coefs_with_fns[:-(50 + 1):-1])
                for (coef_1, fn_1), (coef_2, fn_2) in top:
                    print("\t%.4f\t%-15s\t\t%.4f\t%-15s" %
                          (coef_1, fn_1, coef_2, fn_2))
                    # writes coef_d feat_d coef_r feat_r section model acc_d acc_r acc_all recall f1 support
                    # NOTE(review): the class_matrix indices are fixed token
                    # positions in the whitespace-split report and presumably
                    # assume a specific report layout -- confirm.
                    tsv2.write(
                        "\n%.5f\t%s\t%.5f\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
                        %
                        (coef_1, fn_1, coef_2, fn_2, section,
                         clf_descr, class_matrix[5],
                         class_matrix[10], class_matrix[17], class_matrix[18],
                         class_matrix[19], class_matrix[20]))

            print()

        return clf_descr, score, train_time, test_time
예제 #35
0
def benchmark(clf):
    """Fit ``clf``, predict on the test split and report macro F1.

    ``GensimFastText`` / ``FastText`` models are trained and evaluated on
    the module-level ``train_text`` / ``test_text``; any prediction of
    ``None`` is replaced by ``most_freq``.  Every other model uses
    ``X_train`` / ``X_test`` and has ``train_duration`` / ``test_duration``
    added to its measured times.

    Returns:
        ``(clf_descr, macro_f1, train_time, test_time)``.
    """
    global train_duration, test_duration
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_started = time()
    uses_raw_text = isinstance(clf, (GensimFastText, FastText))
    if uses_raw_text:
        clf.fit(train_text, y_train)
        train_time = time() - fit_started
    else:
        clf.fit(X_train, y_train)
        fit_elapsed = time() - fit_started
        train_time = train_duration + fit_elapsed
    print("train time: %0.3fs" % train_time)

    predict_started = time()
    if uses_raw_text:
        raw_pred = clf.predict(test_text)
        test_time = time() - predict_started
        # Substitute the fallback label wherever the model gave no answer.
        pred = [most_freq if label is None else label for label in raw_pred]
    else:
        pred = clf.predict(X_test)
        predict_elapsed = time() - predict_started
        test_time = test_duration + predict_elapsed
    print("test time:  %0.3fs" % test_time)

    macro_f1 = metrics.f1_score(y_test, pred, average='macro')
    print("macro F1:   %0.3f" % macro_f1)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for row, category in enumerate(categories):
                strongest = np.argsort(clf.coef_[row])[-10:]
                keywords = " ".join(feature_names[strongest])
                print(trim("%s: %s" % (category, keywords)))
        print()

    if opts.print_report:
        print("classification report:")
        report = metrics.classification_report(y_test,
                                               pred,
                                               target_names=categories)
        print(report)

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    model_name = str(clf).split('(')[0]
    return model_name, macro_f1, train_time, test_time
예제 #36
0
    def benchmark(clf):
        """Fit ``clf``, print test metrics and pick unlabeled samples to query.

        Uses ``X_train``/``y_train``/``X_test``/``y_test``, ``categories``,
        ``X_unlabeled`` and ``NUM_QUESTIONS`` from the enclosing scope.

        Returns:
            ``(clf_descr, f1, train_time, test_time, question_samples)`` where
            ``question_samples`` lists the indices of the ``NUM_QUESTIONS``
            lowest-confidence unlabeled samples followed by the
            ``NUM_QUESTIONS`` highest-confidence ones.
        """
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = metrics.f1_score(y_test, pred)
        accscore = metrics.accuracy_score(y_test, pred)
        print("pred count is %d" % len(pred))
        print('accuracy score:     %0.3f' % accscore)
        print("f1-score:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

        print("classification report:")
        print(
            metrics.classification_report(y_test,
                                          pred,
                                          target_names=categories))

        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

        print("confidence for unlabeled data:")
        # Absolute decision value of every unlabeled sample (per class).
        confidences = np.abs(clf.decision_function(X_unlabeled))
        # With more than two classes, average |confidence| over the classes
        # so that each unlabeled sample ends up with a single value.
        if len(categories) > 2:
            confidences = np.average(confidences, axis=1)

        print(confidences)
        sorted_confidences = np.argsort(confidences)
        # The k least confident samples come first, the k most confident last.
        low_confidence_samples = sorted_confidences[0:NUM_QUESTIONS]
        high_confidence_samples = sorted_confidences[-NUM_QUESTIONS:]

        question_samples = (low_confidence_samples.tolist() +
                            high_confidence_samples.tolist())

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time, question_samples
예제 #37
0
def benchmark(clf, name):
    """Train (or restore) ``clf``, then evaluate it on the test split.

    With ``opts.restore`` set, the model is loaded from
    ``models/<name>/model.pkl`` and nothing is trained or written, so the
    stored model is never overwritten by the unfitted ``clf`` passed in.
    Otherwise ``clf`` is fitted on ``X_train``/``y_train`` and, when
    ``opts.save`` is set, dumped to that same path.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        name: Sub-directory of ``models/`` used for this model's pickle.

    Returns:
        ``(clf_descr, accuracy, train_time, test_time)``; ``train_time`` is 0
        for a restored model.
    """
    print('_' * 80)

    clf_path = "models/%s/" % name
    if opts.restore:
        clf = joblib.load(clf_path + "model.pkl")
        train_time = 0
    else:
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        if opts.save:
            if not os.path.exists(clf_path):
                os.makedirs(clf_path)
            joblib.dump(clf, clf_path + "model.pkl")

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(
            metrics.classification_report(y_test,
                                          pred,
                                          target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
    def benchmark(clf):
        """Fit ``clf``, predict on the test split and print all diagnostics.

        Uses ``X_train``/``y_train``/``X_test``/``y_test``, ``target_names``
        and ``feature_names`` from the enclosing scope.  The top-10 listing,
        the classification report and the confusion matrix are always on.

        Returns:
            ``(clf_descr, accuracy, train_time, test_time)``.
        """
        print('_' * 80)
        print("Training: ")
        print(clf)

        fit_started = time()
        clf.fit(X_train, y_train)
        train_time = time() - fit_started
        print("train time: %0.3fs" % train_time)

        predict_started = time()
        pred = clf.predict(X_test)
        test_time = time() - predict_started
        print("test time:  %0.3fs" % test_time)

        accuracy = metrics.accuracy_score(y_test, pred)
        print("accuracy:   %0.3f" % accuracy)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            print_top10 = True
            if print_top10 and feature_names is not None:
                print("top 10 keywords per class:")
                for row, label in enumerate(target_names):
                    if row < len(clf.coef_):
                        strongest = np.argsort(clf.coef_[row])[-10:]
                        try:
                            quoted = '" "'.join(feature_names[strongest])
                            print(trim("%s: \"%r\"" % (label, quoted)))
                        except UnicodeEncodeError as e:
                            # Report the encoding problem and keep going.
                            print(e)
                    else:
                        # More labels than coefficient rows.
                        print("%s: Missing data???" % label)
                print()

        print_report = True
        if print_report:
            print("classification report:")
            print(target_names)
            report = metrics.classification_report(y_test,
                                                   pred,
                                                   target_names=target_names)
            print(report)

        print_cm = True
        if print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

        print()
        model_name = str(clf).split('(')[0]
        return model_name, accuracy, train_time, test_time
예제 #39
0
def benchmark(clf):
    """Fit ``clf``, predict on the test split and print accuracy and F1.

    Uses the module-level ``X_train``/``y_train``/``X_test``/``y_test``,
    ``target_names``, ``feature_names`` and ``opts``.

    Returns:
        ``(clf_descr, accuracy, train_time, test_time)``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    # Show the raw predictions; a second predict() call is not needed for
    # that and would only inflate the measured test time.
    print(pred)
    print("test time:  %0.3fs" % test_time)

    score_acc = metrics.accuracy_score(y_test, pred)
    score_macro_f1 = metrics.f1_score(y_test, pred, average='macro')
    score_micro_f1 = metrics.f1_score(y_test, pred, average='micro')
    # This value comes from accuracy_score, so label it as accuracy.
    print("Accuracy:   %0.3f" % score_acc)
    print("Macro F1:   %0.3f" % score_macro_f1)
    print("Micro F1:   %0.3f" % score_micro_f1)
    # ``target_names`` must be passed by keyword: the third positional
    # parameter of classification_report is ``labels``.
    print(
        metrics.classification_report(y_test,
                                      pred,
                                      target_names=target_names))

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(
            metrics.classification_report(y_test,
                                          pred,
                                          target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score_acc, train_time, test_time
예제 #40
0
def benchmark(clf):
    """Fit ``clf`` and report accuracy plus a top-5 based custom accuracy.

    Uses the module-level ``X_train``/``y_train``/``X_test``/``y_test``,
    ``bunch``, ``categories``, ``feature_names`` and ``opts``.  For every
    test sample the five classes with the highest ``predict_proba`` value
    are mapped to ``bunch.target_names``; ``final_accuracy`` of those lists
    is appended to the module-level ``our_accuracies`` and printed.

    Returns:
        ``(clf_descr, accuracy, train_time, test_time)``.  ``test_time``
        covers ``predict``, ``predict_proba`` and the top-5 evaluation.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    predictions = clf.predict_proba(X_test)
    fin_predict = []
    # Iterate the probability rows directly (no xrange/index bookkeeping).
    for probabilities in predictions:
        # Column positions of the five largest probabilities (unordered).
        top5_columns = np.argpartition(probabilities, -5)[-5:]
        top5_classes = clf.classes_[top5_columns]
        fin_predict.append([bunch.target_names[e] for e in top5_classes])

    our_accuracies.append(final_accuracy(fin_predict))
    print(our_accuracies[-1])
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s"
                      % (category, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    """Fit ``clf``, predict on the test split and print F1 diagnostics.

    Uses the module-level ``X_train``/``y_train``/``X_test``/``y_test``,
    ``categories``, ``feature_names`` and ``opts``.  Every ``print`` call
    takes exactly one argument so the output is the same whether or not
    ``print`` is a function.

    Returns:
        ``(clf_descr, f1, train_time, test_time)``.
    """
    print(80 * '_')
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % max(clf.coef_.shape))
        print("density: %f" % density(clf.coef_))

        if opts.print_top10:
            print("top 10 keywords per class:")
            names = np.array(feature_names)
            coef = clf.coef_
            single_vector = len(coef.shape) == 1
            n_rows = 1 if single_vector else coef.shape[0]
            for i, category in enumerate(categories):
                if i >= n_rows:
                    # Fewer weight rows than categories -- presumably a
                    # two-class model exposing one row; nothing left to rank.
                    break
                # Rank a whole weight vector.  Indexing down to a single
                # scalar, as this used to do, always yielded feature 0.
                weights = coef if single_vector else coef[i]
                top10 = np.argsort(weights)[-10:]
                print(trim("%s: %s" % (category, " ".join(names[top10]))))
        print("")

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(
            y_test, pred, target_names=[str(c) for c in categories]))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print("")
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    """Fit ``clf``, predict on the test split and print accuracy diagnostics.

    Uses the module-level ``x_train``/``y_train``/``x_test``/``y_test``,
    ``categories``, ``feature_names`` and ``opts``.

    Returns:
        ``(clf_descr, accuracy, train_time, test_time)``.
    """
    print('_' * 80)
    print("training: ")
    print(clf)

    fit_started = time()
    clf.fit(x_train, y_train)
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    predict_started = time()
    pred = clf.predict(x_test)
    test_time = time() - predict_started
    print("test time: %0.3f" % test_time)

    accuracy = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % accuracy)

    if hasattr(clf, 'coef_'):
        # `coef_` : array, shape (n_features, ) or (n_targets, n_features)
        #     Estimated coefficients for the linear regression problem.
        #     If multiple targets are passed during the fit (y 2D), this
        #     is a 2D array of shape (n_targets, n_features), while if only
        #     one target is passed, this is a 1D array of length n_features
        n_features = clf.coef_.shape[1]
        print("dimensionality: %d" % n_features)
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class: ")
            for row, category in enumerate(categories):
                strongest = np.argsort(clf.coef_[row])[-10:]
                keywords = " ".join(feature_names[strongest])
                print(trim("%s: %s" % (category, keywords)))
        print()

    if opts.print_report:
        print("classification report: ")
        report = metrics.classification_report(y_test,
                                               pred,
                                               target_names=categories)
        print(report)

    if opts.print_cm:
        print("confusion matrix: ")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    model_name = str(clf).split('(')[0]

    return model_name, accuracy, train_time, test_time
def benchmark(clf):
    """Fit ``clf`` and print accuracy and weighted precision/recall/F1.

    Uses the module-level ``X_train``/``y_train``/``X_test``/``y_test``,
    ``feature_names`` and ``opts``.  The "top 10 keywords" and
    "classification report" sections only print their headings.

    Returns:
        ``(clf_descr, weighted_f1, train_time, test_time)`` -- the score in
        the tuple is the weighted F1, the last metric computed.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    fit_started = time()
    # FIXME: use X_train.toarray() instead; if that does not work, try
    # y_train.toarray() as well.
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    print("train time: %0.3fs" % train_time)

    predict_started = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_started
    print("test time:  %0.3fs" % test_time)

    accuracy = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % accuracy)
    precision = metrics.precision_score(y_test, pred, average='weighted', pos_label=None)
    print("precision:   %0.3f" % precision)
    recall = metrics.recall_score(y_test, pred, average='weighted', pos_label=None)
    print("recall:   %0.3f" % recall)
    weighted_f1 = metrics.f1_score(y_test, pred, average='weighted', pos_label=None)
    print("f1:   %0.3f" % weighted_f1)

    if hasattr(clf, 'coef_'):
        n_features = clf.coef_.shape[1]
        print("dimensionality: %d" % n_features)
        print("density: %f" % density(clf.coef_))

        # Heading only: the per-class keyword listing is disabled here.
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
        print()

    # Heading only: the report body is disabled here.
    if opts.print_report:
        print("classification report:")

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    model_name = str(clf).split('(')[0]
    return model_name, weighted_f1, train_time, test_time
예제 #44
0
    def benchmark(self, clf):
        """Fit ``clf`` on the instance's data and print F1 diagnostics.

        Reads ``print_topX``, ``print_report``, ``print_cm``, ``X_train``,
        ``y_train``, ``X_test``, ``y_test`` and ``feature_names`` from
        ``self``.  The top-feature listing uses the single category ``"1"``.

        Returns:
            ``(clf_descr, f1, train_time, test_time, clf, pred)``.
        """
        top_n, show_report, show_cm = (self.print_topX, self.print_report,
                                       self.print_cm)
        train_X, train_y = self.X_train, self.y_train
        test_X, test_y = self.X_test, self.y_test
        vocabulary = self.feature_names
        category_labels = ["1"]

        print("_" * 80)
        print("Training: ")
        print(clf)

        fit_started = time()
        clf.fit(train_X, train_y)
        train_time = time() - fit_started
        print("train time: %0.3fs" % train_time)

        predict_started = time()
        pred = clf.predict(test_X)
        test_time = time() - predict_started
        print("test time:  %0.3fs" % test_time)

        f1 = metrics.f1_score(test_y, pred)
        print("f1-score:   %0.3f" % f1)

        if hasattr(clf, "coef_"):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            if top_n:
                print("top 10 keywords per class:")
                for row, label in enumerate(category_labels):
                    # Indices of the ``top_n`` largest weights in this row.
                    strongest = np.argsort(clf.coef_[row])[-top_n:]
                    keywords = " ".join(vocabulary[strongest])
                    print(trim("%s: %s" % (label, keywords)))
            print()

        if show_report:
            print("classification report:")
            print(classification_report(test_y, pred))

        if show_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(test_y, pred))

        print()
        model_name = str(clf).split("(")[0]
        return model_name, f1, train_time, test_time, clf, pred
예제 #45
0
def benchmark(clf):
    """Fit ``clf``, predict on the test split and print F1 diagnostics.

    Uses the module-level ``X_train``/``y_train``/``X_test``/``y_test``,
    ``categories``, ``feature_names`` and ``opts``.  Every ``print`` call
    takes exactly one argument so the output is the same whether or not
    ``print`` is a function.

    Returns:
        ``(clf_descr, f1, train_time, test_time)``.
    """
    print(80 * '_')
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % max(clf.coef_.shape))
        print("density: %f" % density(clf.coef_))

        if opts.print_top10:
            print("top 10 keywords per class:")
            names = np.array(feature_names)
            coef = clf.coef_
            single_vector = len(coef.shape) == 1
            n_rows = 1 if single_vector else coef.shape[0]
            for i, category in enumerate(categories):
                if i >= n_rows:
                    # Fewer weight rows than categories -- presumably a
                    # two-class model exposing one row; nothing left to rank.
                    break
                # Rank a whole weight vector.  Indexing down to a single
                # scalar, as this used to do, always yielded feature 0.
                weights = coef if single_vector else coef[i]
                top10 = np.argsort(weights)[-10:]
                print(trim("%s: %s" % (category, " ".join(names[top10]))))
        print("")

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(
            y_test, pred, target_names=[str(c) for c in categories]))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print("")
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
예제 #46
0
def benchmark(clf):
    """Fit ``clf``, predict on the test split and print F1 diagnostics.

    Uses the module-level ``X_train``/``y_train``/``X_test``/``y_test``,
    ``categories``, ``feature_names``, ``print_topX``, ``print_report`` and
    ``print_cm``.  ``RandomForestClassifier``, ``AdaBoostClassifier`` and
    ``Pipeline`` instances (including subclasses) are fed
    ``X_train.toarray()`` / ``X_test.toarray()`` instead of the matrices
    themselves; the conversion is included in the measured times.

    Returns:
        ``(clf_descr, f1, train_time, test_time, clf, pred)``.
    """
    # isinstance() also covers subclasses, unlike ``type(clf) in [...]``.
    needs_dense = isinstance(
        clf, (RandomForestClassifier, AdaBoostClassifier, Pipeline))
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    # toarray() gives a plain ndarray; todense() would give np.matrix.
    clf.fit(X_train.toarray() if needs_dense else X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test.toarray() if needs_dense else X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if print_topX:
            print("top feature per class:")
            for i, category in enumerate(categories):
                # Strongest feature first.
                topX = np.argsort(clf.coef_[i])[-print_topX:][::-1]
                print(trim("%s: %s"
                           % (category, " | ".join(feature_names[topX]))))
        print()

    if print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred))

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time, clf, pred
def benchmark(clf):
    """Fit ``clf``, predict on the test split and print F1 diagnostics.

    Uses the module-level ``X_train``/``y_train``/``X_test``/``y_test``,
    ``categories``, ``feature_names`` and ``opts``.  If fitting or
    predicting on the matrices as given raises, the call is retried once
    with ``.toarray()``; the retry is included in the measured times.

    Returns:
        ``(clf_descr, f1, train_time, test_time)``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    try:
        clf.fit(X_train, y_train)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort the run instead of triggering a dense refit.
        clf.fit(X_train.toarray(), y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    try:
        pred = clf.predict(X_test)
    except Exception:
        pred = clf.predict(X_test.toarray())
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s"
                      % (category, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
예제 #48
0
def benchmark(clf, clf_descr, X_train, X_test, y_train, y_test, feature_names, categories, silent, print_top10):
    """
    Fit ``clf``, predict on ``X_test`` and score the predictions.

    The score is the mean of ``pred == y_test``.  Progress lines are
    printed unless ``silent`` is true; the top-10 feature listing is
    governed by ``print_top10`` alone and needs both ``clf.coef_`` and
    ``feature_names``.  With more than two ``categories`` one line is
    printed per category, otherwise a single line for ``coef_[0]``.

    Returns ``(clf_descr, score, train_time, test_time, pred)`` with
    ``clf_descr`` passed through unchanged.
    """
    verbose = not silent
    if verbose:
        print('_' * 80)
        print("Training: ")
    if verbose or print_top10:
        print(clf)

    fit_started = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_started
    if verbose:
        print("train time: %0.3fs" % train_time)

    predict_started = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_started
    if verbose:
        print("test time:  %0.3fs" % test_time)

    score = np.mean(pred == y_test)
    if verbose:
        print("accuracy:   %0.3f" % score)

    outcome = (clf_descr, score, train_time, test_time, pred)
    if not hasattr(clf, 'coef_'):
        return outcome

    if verbose:
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    if print_top10 and feature_names is not None:
        print("top 10 keywords per class:")
        if len(categories) > 2:
            # Multi-class: one coefficient row per category.
            for row, category in enumerate(categories):
                strongest = np.argsort(clf.coef_[row])[-10:]
                keywords = " ".join(feature_names[strongest])
                print("%s: %s" % (category, keywords))
        else:
            # Binary: only the first coefficient row is listed.
            strongest = np.argsort(clf.coef_[0])[-10:]
            print("%s" % (" ".join(feature_names[strongest])))
        print()

    return outcome
def benchmark(clf, name):
    """Train ``clf``, evaluate it on the test split and store the predictions.

    ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``opts``,
    ``feature_names``, ``cursor`` and ``testing_identifiant_produit_list`` are
    not parameters; they are resolved from the enclosing scope.

    Parameters:
        clf: estimator offering ``fit`` and ``predict``.
        name: identifier forwarded to ``save_my_data`` together with the
            predictions.

    Returns:
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``(`` and ``score`` is the
        accuracy on the test split.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            # Only the header is emitted; this variant does not list the
            # per-category keywords.
            print("top 10 keywords per class:")
        print()

    if opts.print_report:
        print("classification report:")
        # Reported without target_names, so classes appear under their raw labels.
        print(metrics.classification_report(y_test, pred))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    # The save always runs, so its announcement must not depend on
    # opts.print_cm (it used to be nested under that branch).
    print("Saving data to database:")
    save_my_data(cursor, name, testing_identifiant_produit_list, y_test, pred)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    """Train ``clf``, evaluate it on the test split and print a full report.

    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``categories`` are
    not parameters; they are resolved from the enclosing scope.  Random-forest
    estimators receive ``X_train.todense()`` / ``X_test.todense()`` instead of
    the matrices as-is; the conversion is performed inside the timed regions.

    Parameters:
        clf: estimator offering ``fit`` and ``predict``.

    Returns:
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``(`` and ``score`` is the
        F1 score on the test split.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    # isinstance (rather than an exact type comparison) also routes subclasses
    # of RandomForestClassifier through the dense path; decided once so the
    # fit and predict branches cannot drift apart.
    needs_dense = isinstance(clf, RandomForestClassifier)

    t0 = time()
    if needs_dense:
        clf.fit(X_train.todense(), y_train)
    else:
        clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    if needs_dense:
        pred = clf.predict(X_test.todense())
    else:
        pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    accscore = metrics.accuracy_score(y_test, pred)
    print("pred count is %d" % len(pred))
    print('accuracy score:     %0.3f' % accscore)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=categories))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, name):
    """Fit ``clf``, score it on the test split and print every report section.

    The data (``X_train``, ``y_train``, ``X_test``, ``y_test``) as well as
    ``categories`` and ``feature_names`` are resolved from the enclosing
    scope.  ``name`` is accepted but not used by this function.

    Returns ``(clf_descr, score, train_time, test_time)``; ``clf_descr`` is
    the text of ``str(clf)`` before the first ``(`` and ``score`` is the
    accuracy on the test split.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    started = time()
    clf.fit(X_train, y_train)
    train_time = time() - started
    print("train time: %0.3fs" % train_time)

    started = time()
    pred = clf.predict(X_test)
    test_time = time() - started
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        # The keyword listing is unconditional for estimators with coef_.
        print("top 10 keywords per class:")
        for row, label in enumerate(categories):
            best = np.argsort(clf.coef_[row])[-10:]
            keywords = " ".join(feature_names[best])
            print(trim("%s: %s" % (label, keywords)))

    # The report and the confusion matrix are always printed.
    print("classification report:")
    report = metrics.classification_report(y_test, pred,
                                           target_names=categories)
    print(report)

    print("confusion matrix:")
    matrix = metrics.confusion_matrix(y_test, pred)
    print(matrix)

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
예제 #52
0
def benchmark(clf):
    """Fit ``clf``, print its F1 score and the optional report sections.

    The data (``X_train``, ``y_train``, ``X_test``, ``y_test``) together with
    ``opts``, ``categories`` and ``feature_names`` are resolved from the
    enclosing scope.  When ``opts.print_cm`` is set the confusion matrix is
    drawn with matplotlib (``plt.show()`` is called) rather than printed.

    Returns ``(clf_descr, score)``; ``clf_descr`` is the text of ``str(clf)``
    before the first ``(``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        show_keywords = opts.print_top10 and feature_names is not None
        if show_keywords:
            print("top 10 keywords per class:")
            for row, label in enumerate(categories):
                best = np.argsort(clf.coef_[row])[-10:]
                keywords = " ".join(feature_names[best])
                print(trim("%s: %s" % (label, keywords)))
        print()

    if opts.print_report:
        print("classification report:")
        report = metrics.classification_report(y_test, pred,
                                               target_names=categories)
        print(report)

    if opts.print_cm:
        print("confusion matrix:")
        # Rendered as a colour-mapped figure instead of text output.
        cm = confusion_matrix(y_test, pred)
        plt.matshow(cm)
        plt.title('Confusion matrix')
        plt.colorbar()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score
 def benchmark(self, opts, classifier, dataTrain, labelTrain, dataTest, labelTest):
     print '_' * 80
     print 'Training: '
     print classifier
     
     # start training and measure the time frame
     t0 = time()
     classifier.fit(dataTrain, labelTrain)
     trainTime = time() - t0
     print 'train time: %0.3fs' % trainTime
     
     # start prediction and measure the time frame
     t0 = time()
     predictor = classifier.predict(dataTest)
     testTime = time() - t0
     print 'test time: %0.3fs' % testTime
     
     # accuracy
     score = metrics.f1_score(labelTest, predictor)
     print 'f1 score: %0.3f' % score
     
     if hasattr(classifier, 'coef_'):
         print 'dimensionality: %d' % classifier.coef_.shape[1]
         print 'density: %f' % density(classifier.coef_)
         
         if opts.print_top10 and feature_name is not None:
             print 'top 10 keywords per class:'
             for i, category in enumerate(categories):
                 top10 = np.argsort(classifier.coef_[i])[-10 : ]
                 print trim('%s: %s' % (category, ' '.join(featureNames[top10])))
         print 
     
     if opts.print_report:
         print 'classification report:'
         print metrics.classification_report(labelTest, predictor, target_names = categories)
     
     if opts.print_cm:
         print 'confusion matrix:'
         print metrics.confusion_matrix(labelTest, predictor)
     
     print
     classifierDescription = str(classifier).split('(')[0]
     return classifierDescription, score, trainTime, testTime
def benchmark(clf):
    print 80 * '_'
    print "Training: "
    print clf
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print "train time: %0.3fs" % train_time

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print "test time:  %0.3fs" % test_time

    score = metrics.f1_score(y_test, pred)
    print "f1-score:   %0.3f" % score

    if hasattr(clf, 'coef_'):
        print "dimensionality: %d" % clf.coef_.shape[1]
        print "density: %f" % density(clf.coef_)

        if opts.print_top10:
            print "top 10 keywords per class:"
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print trim("%s: %s" % (
                    category, " ".join(np.array(feature_names)[top10])))
                print clf.coef_[i][top10]
        print

    if opts.print_report:
        print "classification report:"
        print metrics.classification_report(y_test, pred,
                                            target_names=categories)

    if opts.print_cm:
        print "confusion matrix:"
        print metrics.confusion_matrix(y_test, pred)

    print
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, name):
    """Fit ``clf``, score it on the test split and print a full report.

    The data (``X_train``, ``y_train``, ``X_test``, ``y_test``) together with
    ``categories`` and ``feature_names`` are resolved from the enclosing
    scope.  ``name`` is accepted but not used by this function.

    For estimators exposing ``coef_`` the first coefficient row is ranked:
    the ten highest-weighted features are printed under ``C`` and the ten
    lowest-weighted under ``L``.

    Returns ``(clf_descr, score, train_time, test_time)``; ``clf_descr`` is
    the text of ``str(clf)`` before the first ``(`` and ``score`` is the
    accuracy on the test split.
    """
    print('_' * 80)  # visual separator between runs
    print("Training: ")
    print(clf)

    fit_start = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_start
    print("train time: %0.3fs" % train_time)

    predict_start = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_start
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        print("top 10 keywords per class:")
        # argsort is ascending: the tail holds the largest coefficients, the
        # head the smallest.
        # NOTE(review): the original comments tagged the tail as
        # "Conservative -1" and the head as "Liberal +1" -- confirm this
        # against the label encoding used for training.
        order = np.argsort(clf.coef_[0])
        top10C = order[-10:]
        top10L = order[:10]
        words_c = " ".join([feature_names[idx] for idx in top10C])
        words_l = " ".join([feature_names[idx] for idx in top10L])
        print(trim("C: %s" % (words_c)))
        print(trim("L: %s" % (words_l)))

    print("classification report:")
    report = metrics.classification_report(y_test, pred,
                                           target_names=categories)
    print(report)

    print("confusion matrix:")
    matrix = metrics.confusion_matrix(y_test, pred)
    print(matrix)

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
예제 #56
0
파일: train.py 프로젝트: vojnovski/mktweets
def benchmark(clf, X_train, X_test, y_train, y_test, feature_names):
    """Train ``clf``, evaluate it on the test split and print a report.

    Parameters:
        clf: estimator offering ``fit`` and ``predict``.
        X_train, y_train: training samples and their labels.
        X_test, y_test: test samples and their labels.
        feature_names: collection indexable by an integer index array, or
            None to skip the keyword listing.

    Returns:
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``(`` and ``score`` is the
        F1 score on the test split.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time.time()
    clf.fit(X_train, y_train)
    train_time = time.time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time.time()
    pred = clf.predict(X_test)
    test_time = time.time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("density: %f" % density(clf.coef_))
        if feature_names is not None:
            print("top 10 keywords per class:")
            # Rank once; a coef_ with a single row is unwrapped first,
            # anything else is ranked as-is.
            if clf.coef_.shape[0] == 1:
                ranking = np.argsort(clf.coef_[0])
            else:
                ranking = np.argsort(clf.coef_)
            # Ascending order: tail = largest weights, head = smallest.
            top10female = ranking[-10:]
            top10male = ranking[:10]
            print("%s: %s" % ("Female", ", ".join(feature_names[top10female])))
            print("%s: %s" % ("Male", ", ".join(feature_names[top10male])))

        # print("") emits the same blank line under Python 2 and Python 3;
        # the former `print ""` statement is a SyntaxError on Python 3.
        print("")

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=['Female', 'Male']))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print("")
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    """Fit ``clf``, score it on the test split and print the selected reports.

    The data (``X_train``, ``y_train``, ``X_test``, ``y_test``) together with
    ``opts``, ``categories`` and ``feature_names`` are resolved from the
    enclosing scope.

    Returns ``(clf_descr, score, train_time, test_time)``; ``clf_descr`` is
    the text of ``str(clf)`` before the first ``(`` and ``score`` is the F1
    score on the test split.
    """
    print('_' * 80)
    print('Training: ')
    print(clf)

    fit_start = time()
    clf.fit(X_train, y_train)
    train_time = time() - fit_start
    print('train time: %0.3fs' % train_time)

    predict_start = time()
    pred = clf.predict(X_test)
    test_time = time() - predict_start
    print('test time:  %0.3fs' % test_time)

    score = metrics.f1_score(y_test, pred)
    print('f1-score:   %0.3f' % score)

    if hasattr(clf, 'coef_'):
        print('dimensionality: %d' % clf.coef_.shape[1])
        print('density: %f' % density(clf.coef_))

        show_keywords = opts.print_top10 and feature_names is not None
        if show_keywords:
            print('top 10 keywords per class:')
            for row, label in enumerate(categories):
                best = np.argsort(clf.coef_[row])[-10:]
                keywords = ' '.join(feature_names[best])
                print(trim('%s: %s' % (label, keywords)))
        print()

    if opts.print_report:
        print('classification report:')
        report = metrics.classification_report(y_test, pred,
                                               target_names=categories)
        print(report)

    if opts.print_cm:
        print('confusion matrix:')
        matrix = metrics.confusion_matrix(y_test, pred)
        print(matrix)

    print()
    clf_descr = str(clf).split('(')[0]
    return (clf_descr, score, train_time, test_time)
    def benchmark(clf):
        """Train ``clf``, evaluate it on the test split and print a full report.

        ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``categories`` and
        ``feature_names`` are not parameters; they are resolved from the
        enclosing scope.  For estimators exposing ``coef_`` the first
        coefficient row is ranked: the ten highest-weighted features are
        printed as "Funny" and the ten lowest-weighted as "Not Funny".

        Parameters:
            clf: estimator offering ``fit`` and ``predict``.

        Returns:
            ``(clf_descr, score, train_time, test_time)`` where ``clf_descr``
            is the text of ``str(clf)`` before the first ``(`` and ``score``
            is the accuracy on the test split.
        """
        print('_' * 80)
        print("Training: ")
        print(clf)
        # Plain instances do not define ``__name__`` (classes and functions
        # do), so fall back to the class name instead of raising
        # AttributeError.  Printed before the timer starts so it does not
        # count towards the training time.
        print(getattr(clf, '__name__', type(clf).__name__))

        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)

        score = metrics.accuracy_score(y_test, pred)
        print("accuracy:   %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
            print("top 10 keywords per class:")

            # Ascending rank: tail = largest weights, head = smallest.
            rank = np.argsort(clf.coef_[0])
            top10 = rank[-10:]
            bottom10 = rank[:10]
            # NOTE(review): under Python 3 the encoded bytes are formatted
            # with a b'...' wrapper -- confirm the target interpreter.
            print(trim("%s: %s" % ("Funny: ", " ".join(feature_names[top10]).encode("utf-8"))))
            print(trim("%s: %s" % ("Not Funny: ", " ".join(feature_names[bottom10]).encode("utf-8"))))

            print()

        print("classification report:")
        print(metrics.classification_report(y_test, pred, target_names=categories))

        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time
예제 #59
0
def benchmark(clf):
    """Cross-validate ``clf`` with 5 folds and print the scores.

    ``X``, ``y``, ``opts``, ``categories`` and ``feature_names`` are not
    parameters; they are resolved from the enclosing scope.  If scoring on
    ``X`` raises, the run is retried once on ``X.toarray()``.

    Parameters:
        clf: estimator accepted by ``cross_validation.cross_val_score``.

    Returns:
        ``(clf_descr, mean_score)`` where ``clf_descr`` is the text of
        ``str(clf)`` before the first ``(`` and ``mean_score`` is the mean of
        the per-fold scores.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    try:
        score = cross_validation.cross_val_score(clf, X, y, cv=5)
    except Exception:
        # Retry on a dense copy.  ``except Exception`` (instead of a bare
        # ``except``) lets KeyboardInterrupt/SystemExit propagate rather than
        # silently starting a second full cross-validation run.
        score = cross_validation.cross_val_score(clf, X.toarray(), y, cv=5)
    test_time = time() - t0
    print("CV time:  %0.3fs" % test_time)

    print("CV-score:   %s" % str(score))
    print("Mean CV-score:   %f" % np.mean(score))

    # NOTE(review): clf.fit is never called directly in this function, so
    # this branch presumably only runs if clf was fitted elsewhere -- confirm.
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s"
                      % (category, " ".join(feature_names[top10]))))
        print()

    # NOTE(review): ``y_test`` and ``pred`` are never assigned in this
    # function; the two sections below rely on names from the enclosing
    # scope -- verify they exist before enabling these options.
    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, np.mean(score)