def classify(features, results, test_features, test_results, C, gamma):
    """Train an RBF-kernel SVC and return its error rate on a test set.

    features, results: training samples and their labels, handed to ``fit``.
    test_features, test_results: samples to predict and the labels the
        predictions are compared against.
    C, gamma: hyper-parameters forwarded to ``SVC``; they also form the
        ``C@gamma`` identifier that appears in the log line.

    Returns the percentage of test samples whose predicted label differs
    from the expected one. The training and prediction wall-clock times are
    reported through ``log.info``.
    """
    run_id = "%s@%s" % (C, gamma)

    # The training clock starts before the "begins" message, so the reported
    # training time covers that log call as well as the fit itself.
    train_started = time.time()
    log.info("Classifier begins")
    model = SVC(C=C, gamma=gamma, kernel="rbf")
    model.fit(features, results)

    predict_started = time.time()
    predicted = model.predict(test_features)
    train_elapsed = predict_started - train_started
    predict_elapsed = time.time() - predict_started
    log.info("id: %s Training time: %s, Prediction time: %s"
             % (run_id, train_elapsed, predict_elapsed))

    # Count the positions where the prediction disagrees with the label.
    mismatches = sum(
        1 for position, label in enumerate(predicted)
        if test_results[position] != label
    )
    return (mismatches / float(len(test_results))) * 100
def classify(train_file, test_file):
    """Fit a linear SVM on one svmlight file and evaluate it on another.

    train_file: file that the model is trained on
    test_file: file that is used to test the model

    Prints the classification report followed by the confusion matrix of
    the predictions on ``test_file``; nothing is returned.
    """
    training_set = load_svmlight_file(train_file)
    X_train, y_train = training_set

    # The training matrix's column count is handed to the second load,
    # presumably so both matrices end up with the same number of features.
    train_width = X_train.shape[1]
    X_test, y_test = load_svmlight_file(test_file, train_width)

    # Alternatives left disabled by the original author:
    # X_train = X_train.todense()
    # X_test = X_test.todense()
    # clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    model = SparseSVC(kernel="linear", C=0.2)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    report = sklearn.metrics.classification_report(y_test, predicted)
    print(report)
    confusion = sklearn.metrics.confusion_matrix(y_test, predicted)
    print(confusion)
# Test for 10 rounds, one per fold of the 10-fold cross validation, and
# stack each fold's summed model scores onto z.
for i, (train_index, test_index) in enumerate(kf):
    print("run %d" % (i + 1))

    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    X_den_train = X_den[train_index]
    X_den_test = X_den[test_index]

    # feed models (every model is trained on the same fold data)
    for _estimator in (clf_mNB, clf_ridge, clf_SGD, clf_lSVC, clf_SVC):
        _estimator.fit(X_train, y_train)

    # get prediction for this fold run
    prob_mNB = clf_mNB.predict_proba(X_test)
    prob_ridge = clf_ridge.decision_function(X_test)
    prob_SGD = clf_SGD.decision_function(X_test)
    prob_lSVC = clf_lSVC.decision_function(X_test)
    prob_SVC = clf_SVC.predict_proba(X_test)

    # add the element-wise sum of the score arrays to the z 2d-array
    z_temp = prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC
    z = np.concatenate((z, z_temp), axis=0)

# z was created with a leading row of 0s; drop that placeholder row
z = np.delete(z, 0, axis=0)
for i, (train_index, test_index) in enumerate(kf):
    # print "run %d" % (i+1)
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    X_den_train = X_den[train_index]
    X_den_test = X_den[test_index]

    # feed models
    # Disabled candidates: clf_bNB.fit(X_train, y_train),
    # clf_kNN.fit(X_train, y_train), clf_SVC.fit(X_train, y_train)
    for _estimator in (clf_mNB, clf_ridge, clf_lSVC):
        _estimator.fit(X_train, y_train)
    # The SVC is the only model fed the X_den rows of the fold.
    clf_SVC.fit(X_den_train, y_train)

    # get prediction for this fold run
    # Disabled candidates: clf_bNB.predict_proba(X_test),
    # clf_kNN.decision_function(X_test), clf_SVC.predict_proba(X_test)
    prob_mNB = clf_mNB.predict_proba(X_test)
    prob_ridge = clf_ridge.decision_function(X_test)
    prob_lSVC = clf_lSVC.decision_function(X_test)
    prob_SVC = clf_SVC.predict_proba(X_den_test)

    # combine the score arrays for the z 2d-array
    # Earlier combinations, kept for reference:
    # z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC)
    # z_temp = (prob_mNB + prob_ridge + prob_bNB + prob_lSVC + prob_SVC)
    # z_temp = (prob_mNB + 2*prob_ridge + 2*prob_lSVC + prob_SVC)
    z_temp = prob_mNB + prob_ridge + prob_lSVC + prob_SVC
# Initialize the lists used for counting the averages
f1_all, f5_all, acc_all, pre_all, rec_all = [], [], [], [], []

# Test for 10 rounds using the results from 10 fold cross validations
for train_index, test_index in kf:
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    # fit on the training rows, then predict the held-out rows
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Debugging aids left disabled:
    # print y_test
    # print pred
    # print type(pred)

    # output tree into graph (disabled)
    # out = StringIO()
    # out = export_graphviz(clf, out_file=out)

    # metrics
    # # Original
    f1_score = metrics.f1_score(y_test, pred)
    f5_score = metrics.fbeta_score(y_test, pred, beta=0.5)
clf_rdg = RidgeClassifier(tol=0.1)
clf_sgd = SGDClassifier(penalty="l2", alpha=1e-4, n_iter=50)

# Logistic regression requires OneVsRestClassifier, which hides its
# methods such as decision_function. Using it as a candidate for
# multilabel classification would need extra implementation effort.
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000,penalty='l1'))

# kNN does not have a decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train every classifier on the same X and y
for _estimator in (clf_nb, clf_lsvc, clf_rdg, clf_svc, clf_sgd):
    _estimator.fit(X, y)

# t0 is set outside this section; report the time elapsed since then
print("Train time: %0.3fs" % (time() - t0))
print("")

# # predict by simply applying the classifier
# # this will not use the multi-label threshold
# predicted = clf_rdg.predict(X_new)
# for doc, category in zip(docs_new, predicted):
#     print '%r => %s' % (doc, data_train.target_names[int(category)])
#     print

####################################