Example #1
import logging
import time

from sklearn.svm import SVC

log = logging.getLogger(__name__)  # module-level logger; the original snippet assumes one exists


def classify(features, results, test_features, test_results, C, gamma):
    cli = "%s@%s" % (C, gamma)
    st = time.time()
    log.info("Classifier begins")
    classifier = SVC(C=C, gamma=gamma, kernel="rbf")
    classifier.fit(features, results)
    st2 = time.time()
    prediction = classifier.predict(test_features)
    log.info("id: %s Training time: %s, Prediction time: %s" % (cli, st2 - st, time.time() - st2))
    # count misclassified test samples and return the error rate in percent
    error = 0
    for index, value in enumerate(prediction):
        if test_results[index] != value:
            error += 1
    return (error / float(len(test_results))) * 100
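A possible way to call this helper, sketched with synthetic data; the make_classification dataset, the train/test split, and the C/gamma values below are illustrative assumptions, not part of the original example:

from sklearn.cross_validation import train_test_split   # sklearn.model_selection in newer releases
from sklearn.datasets import make_classification

# small synthetic two-class problem, just to exercise classify()
X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

# error rate in percent for one (C, gamma) pair
print classify(X_tr, y_tr, X_te, y_te, C=10, gamma=0.01)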
Example #2
import sklearn.metrics
from sklearn.datasets import load_svmlight_file
# SparseSVC presumably comes from the old sklearn.svm.sparse module; recent
# scikit-learn versions handle sparse input directly in sklearn.svm.SVC.
from sklearn.svm.sparse import SVC as SparseSVC


def classify(train_file, test_file):
    """
    Train a model and test it.

    train_file: file the model is trained on
    test_file: file used to test the model
    """
    X_train, y_train = load_svmlight_file(train_file)
    X_test, y_test = load_svmlight_file(test_file, X_train.shape[1])
    # X_train = X_train.todense()
    # X_test = X_test.todense()
    clf = SparseSVC(kernel="linear", C=0.2)
    # clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    print sklearn.metrics.classification_report(y_test, y_predict)
    print sklearn.metrics.confusion_matrix(y_test, y_predict)
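A minimal call, assuming the training and test sets are already stored in svmlight/libsvm format; the file names below are placeholders, not paths from the original:

if __name__ == "__main__":
    # train.svm / test.svm are assumed example files in svmlight format
    classify("train.svm", "test.svm")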
# Test for 10 rounds using the splits from 10-fold cross-validation
for i, (train_index, test_index) in enumerate(kf):

    print "run %d" % (i+1)

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_den_train, X_den_test = X_den[train_index], X_den[test_index]

    # feed models
    clf_mNB.fit(X_train, y_train)
    clf_ridge.fit(X_train, y_train)
    clf_SGD.fit(X_train, y_train)
    clf_lSVC.fit(X_train, y_train)
    clf_SVC.fit(X_train, y_train)

    # get prediction for this fold run
    prob_mNB    = clf_mNB.predict_proba(X_test)
    prob_ridge  = clf_ridge.decision_function(X_test)
    prob_SGD    = clf_SGD.decision_function(X_test)
    prob_lSVC   = clf_lSVC.decision_function(X_test)
    prob_SVC    = clf_SVC.predict_proba(X_test)

    # add prob functions into the z 2d-array
    z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC)
    z = np.append(z, z_temp, axis=0)


# remove the first row of z, which was the all-zeros placeholder used when z was created
z = np.delete(z, 0, 0)
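The loop above appends into a z array that must already exist before the first np.append; a minimal, self-contained sketch of that accumulation pattern (the zero-row initialisation matches the comment about deleting the first row, while the random stand-in scores and the argmax vote are illustrative assumptions, not from the original):

import numpy as np

n_classes = 3                                        # assumed number of target classes
z = np.zeros((1, n_classes))                         # all-zeros placeholder row so np.append can grow z

for fold_scores in (np.random.rand(4, n_classes),    # stand-ins for one fold's summed classifier scores
                    np.random.rand(4, n_classes)):
    z = np.append(z, fold_scores, axis=0)

z = np.delete(z, 0, 0)                               # drop the placeholder row, as in the code above
ensemble_pred = np.argmax(z, axis=1)                 # per-sample class with the largest summed score
print ensemble_pred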
    for i, (train_index, test_index) in enumerate(kf):

        # print "run %d" % (i+1)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_den_train, X_den_test = X_den[train_index], X_den[test_index]

        # feed models
        clf_mNB.fit(X_train, y_train)
        # clf_bNB.fit(X_train, y_train)
        clf_ridge.fit(X_train, y_train)
        # clf_kNN.fit(X_train, y_train)
        clf_lSVC.fit(X_train, y_train)
        # clf_SVC.fit(X_train, y_train)
        clf_SVC.fit(X_den_train, y_train)

        # get prediction for this fold run
        prob_mNB    = clf_mNB.predict_proba(X_test)
        # prob_bNB    = clf_bNB.predict_proba(X_test)
        prob_ridge  = clf_ridge.decision_function(X_test)
        # prob_kNN    = clf_kNN.decision_function(X_test)
        prob_lSVC   = clf_lSVC.decision_function(X_test)
        # prob_SVC    = clf_SVC.predict_proba(X_test)
        prob_SVC    = clf_SVC.predict_proba(X_den_test)

        # add prob functions into the z 2d-array
        # z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC)
        # z_temp = (prob_mNB + prob_ridge + prob_bNB + prob_lSVC + prob_SVC)
        # z_temp = (prob_mNB + 2*prob_ridge + 2*prob_lSVC + prob_SVC)
        z_temp = (prob_mNB + prob_ridge + prob_lSVC + prob_SVC)
        # Initialize variables for counting the average
        f1_all = []
        f5_all = []
        acc_all = []
        pre_all = []
        rec_all = []

        # Test for 10 rounds using the splits from 10-fold cross-validation
        for train_index, test_index in kf:

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # fit and predict
            clf.fit(X_train, y_train)
            pred = clf.predict(X_test)

            # print y_test
            # print pred
            # print type(pred)

            # output tree into graph
            # out = StringIO()
            # out = export_graphviz(clf, out_file=out)

            # metrics

            # # Original
            f1_score = metrics.f1_score(y_test, pred)
            f5_score = metrics.fbeta_score(y_test, pred, beta=0.5)
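            # hedged sketch (not in the original): collect each fold's scores;
            # the accuracy/precision/recall calls are assumed to mirror the f1/f0.5 lines above
            f1_all.append(f1_score)
            f5_all.append(f5_score)
            acc_all.append(metrics.accuracy_score(y_test, pred))
            pre_all.append(metrics.precision_score(y_test, pred))
            rec_all.append(metrics.recall_score(y_test, pred))

        # after the 10 folds, report the averages (also part of the sketch)
        print "avg f1: %f  avg f0.5: %f  avg acc: %f" % (
            sum(f1_all) / len(f1_all), sum(f5_all) / len(f5_all), sum(acc_all) / len(acc_all))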
Example #6
from sklearn.linear_model import RidgeClassifier, SGDClassifier

clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier, which hides methods such
# as decision_function; using it as a candidate for multilabel classification
# would take extra implementation effort.
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))
# kNN does not have a decision function, due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)
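# A hedged alternative (not part of the original): OneVsRestClassifier also
# exposes predict_proba when the wrapped estimator supports it, so logistic
# regression could still contribute probability scores, e.g.:
# from sklearn.multiclass import OneVsRestClassifier
# from sklearn.linear_model import LogisticRegression
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000, penalty='l1'))
# clf_lgr.fit(X, y)
# prob_lgr = clf_lgr.predict_proba(X_new)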

# train
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
clf_rdg.fit(X, y)
clf_svc.fit(X, y)
clf_sgd.fit(X, y)

print "Train time: %0.3fs" % (time() - t0)
print


# # predict by simply applying the classifier
# # this will not use the multi-label threshold
# predicted = clf_rdg.predict(X_new)
# for doc, category in zip(docs_new, predicted):
#     print '%r => %s' % (doc, data_train.target_names[int(category)])
#     print
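# # A hedged sketch (not from the original): one way to apply a multi-label
# # threshold is to cut the decision scores directly; the 0.0 cut-off and the
# # X_new name below are assumptions.
# scores = clf_rdg.decision_function(X_new)
# predicted_labels = scores > 0.0   # indicator matrix: one column per label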


####################################