def test(self, test_data):
    """
    test_data should be an n x (m+1) numpy matrix, where n is the number of
    examples and m is the number of features (recall that the first element
    of each row is the label). Prints the accuracy, precision, and recall
    on the test data.
    """
    if self.classifier_type == 'decision_tree':
        import decision_tree
        decision_tree.test(self.params, test_data)
    elif self.classifier_type == 'naive_bayes':
        import naive_bayes
        naive_bayes.test(self.params, test_data)
    elif self.classifier_type == 'neural_net':
        import neural_nets
        neural_nets.test(self.params, test_data)
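# For context, a hypothetical usage sketch of the dispatching test() method
# above. The Classifier wrapper, its constructor, and train() are assumptions
# not shown in the snippet; only test() appears above.
import numpy as np
from classifier import Classifier  # hypothetical module and class

# n x (m+1) matrix: first column holds the label, the remaining m columns
# hold the features, matching the docstring above
data = np.loadtxt('examples.csv', delimiter=',')
train_data, test_data = data[:800], data[800:]

clf = Classifier('naive_bayes')  # or 'decision_tree' / 'neural_net'
clf.train(train_data)            # assumed to populate clf.params
clf.test(test_data)              # prints accuracy, precision, and recall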
from sys import argv

import logistic_regression
import naive_bayes


def main():
    training = read_data(argv[1])
    test = read_data(argv[2])

    classifiers_unfiltered = naive_bayes.train(training, False)
    accuracy_unfiltered_nb = naive_bayes.test(test, False, classifiers_unfiltered)
    print("Naive Bayes is", "{0:.6f}".format(accuracy_unfiltered_nb),
          "accurate with stop words unfiltered")

    classifiers_filtered = naive_bayes.train(training, True)
    accuracy_filtered_nb = naive_bayes.test(test, True, classifiers_filtered)
    print("Naive Bayes is", "{0:.6f}".format(accuracy_filtered_nb),
          "accurate with stop words filtered")

    # each remaining command-line argument is a regularization constant to try
    for i in range(3, len(argv)):
        print()
        lambda_constant = float(argv[i])

        weights_unfiltered = logistic_regression.train(training, False, 25, lambda_constant)
        accuracy_unfiltered_lr = logistic_regression.test(test, False, weights_unfiltered)
        print("Logistic Regression is", "{0:.6f}".format(accuracy_unfiltered_lr),
              "accurate with stop words unfiltered and lambda constant equal to",
              lambda_constant)

        weights_filtered = logistic_regression.train(training, True, 25, lambda_constant)
        accuracy_filtered_lr = logistic_regression.test(test, True, weights_filtered)
        print("Logistic Regression is", "{0:.6f}".format(accuracy_filtered_lr),
              "accurate with stop words filtered and lambda constant equal to",
              lambda_constant)
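# read_data() is referenced above but not defined in this snippet. A minimal
# sketch under the assumption that each input line is one example with the
# label first, followed by whitespace-separated word tokens; the project's
# actual loader may differ.
def read_data(path):
    examples = []
    with open(path) as f:
        for line in f:
            tokens = line.split()
            if tokens:
                # (label, list of word tokens)
                examples.append((tokens[0], tokens[1:]))
    return examples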
def evaluate_on_each_doc(clf_name, clf, features_doc, labels_doc, phrase_idx_doc,
                         phrase_list, true_keys_doc, N=10):
    precisions = []
    recalls = []
    # go through each document
    docid = 0
    for features, labels, phrase_indices, true_keys in zip(
            features_doc, labels_doc, phrase_idx_doc, true_keys_doc):
        ###
        print "*docid", docid
        ###
        docid += 1
        if clf_name == 'NB':
            pred_idx = NB.test(clf, N, features)
            pred_keys = []
            # collect all phrases that have pred label 1
            for idx in pred_idx:
                pred_keys.append(phrase_list[phrase_indices[idx]])
            ###
            print "--pred_keys:"
            print pred_keys
            print "--true keys:"
            print true_keys
            ###
            precisions.append(get_precision(true_keys, pred_keys))
            recalls.append(get_recall(true_keys, pred_keys))
        if clf_name == 'svm':
            pred_labels = clf.predict(features)
            confidence_scores = clf.decision_function(features)
            pred_keys = []
            ###
            print '--pred keys:', str(sum(pred_labels))
            ###
            # sort predictions by confidence so keys are collected in ranked
            # order, then keep all phrases that have pred label 1
            predictions = zip(pred_labels, phrase_indices, confidence_scores)
            predictions.sort(key=lambda x: x[2], reverse=True)
            for label, idx, score in predictions:
                if label == 1:
                    pred_keys.append(phrase_list[idx])
            precisions.append(get_precision(true_keys, pred_keys))
            recalls.append(get_recall(true_keys, pred_keys))
    precision_avg = sum(precisions) / len(precisions)
    recall_avg = sum(recalls) / len(recalls)
    return precision_avg, recall_avg
def evaluate_one_doc(clf_name, clf, phrases, features, true_keys, N=10):
    pred_idx = []
    if clf_name == 'NB':
        pred_idx = NB.test(clf, N, features)
    if clf_name == 'svm':
        pred_idx = svm.test(clf, N, features)
    pred_keys = []
    # get top N pred keys
    for idx in pred_idx:
        pred_keys.append(phrases[idx])
    print "# pred_keys", len(pred_keys)
    ###
    print "--pred_keys:"
    print pred_keys
    print "--true keys:"
    print true_keys
    ###
    precision = get_precision(true_keys, pred_keys)
    recall = get_recall(true_keys, pred_keys)
    return precision, recall
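# Both evaluation functions above rely on get_precision() and get_recall(),
# which are not shown. A minimal sketch using the standard set-overlap
# definitions -- an assumption about how the real helpers are implemented.
def get_precision(true_keys, pred_keys):
    # fraction of predicted keyphrases that are actually correct
    if not pred_keys:
        return 0.0
    matched = len(set(pred_keys) & set(true_keys))
    return float(matched) / len(pred_keys)


def get_recall(true_keys, pred_keys):
    # fraction of true keyphrases recovered by the prediction
    if not true_keys:
        return 0.0
    matched = len(set(pred_keys) & set(true_keys))
    return float(matched) / len(true_keys)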
# sort on importance

# Print the feature ranking
# print("Feature ranking:")

# Plot the feature importances of the forest
# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(nb_grams[features].shape[1]), importances[indices],
#         color="r", yerr=std[indices], align="center")
# plt.xticks(range(nb_grams[features].shape[1]), indices)
# plt.xlim([-1, nb_grams[features].shape[1]])
# plt.show()

nb_correct, nb_incorrect, nb_tp, nb_fp, nb_tn, nb_fn, nb_predictions = naive_bayes.test(
    X_test_bal, y_test_bal, nb_classifier, nb_grams)

# if emotion == 'joy':
#     for i in range(len(nb_predictions)):
#         if nb_predictions[i]:
#             print i
#     pdb.set_trace()

for tweet in X_test:
    emotionFound = False

# get overall counts
all_correct = 0
all_wrong = 0
svm_correct = 0
lr_correct = 0
""" Example usage: $ python predict_custom_input.py "new jerry seinfeld show releases" new jerry seinfeld show releases ['news'] ['e'] """ import naive_bayes import sys if __name__ == '__main__': mysql_obj_identifier = naive_bayes.mysql_connection('news_identifier') mysql_obj_classifier = naive_bayes.mysql_connection('news_classifier') input_text = sys.argv[1] input_text = naive_bayes.normalize_text(input_text) print input_text identifier_output = naive_bayes.test([input_text], mysql_obj_identifier) print identifier_output if identifier_output[0] == 'news': print naive_bayes.test([input_text], mysql_obj_classifier)
import datetime
import os
import pickle
import sys

import naive_bayes as nb
from models.company import *

print datetime.datetime.now()

text = "Anyone can create an account and start explaining rap. Highlight any line to explain it yourself, suggest changes to existing explanations, and put up your favorite new songs."
text_2 = "Coupons.com is a provider of digital coupons, including online printable, coupon codes, save to loyalty card and mobile promotions. The company's products include Coupons.com as well as Grocery iQ and Coupons.com mobile applications."

vcs = db.session.query(VC).filter("id >= 0 AND id <= 50").all()
print 'done query'

vc_names = []
vc_urls = []
percent = []
for vc in vcs:
    vc_name = vc.name
    vc_url = vc.url
    # unpickle the stored naive Bayes model for this VC and score the text
    vc_model = pickle.loads(vc.nb_model)
    result = nb.test(text=text_2, model=vc_model)
    # keep only VCs whose model scores the text above the 0.55 threshold
    if result >= .55:
        percent.append(result)
        vc_names.append(vc_name)
        vc_urls.append(vc_url)

print percent
print vc_names
print datetime.datetime.now()
# print results
import scipy.io

import naive_bayes

# 1. TRAIN

# 1.1. Load training data
print 'Load training data ...'
training_data = scipy.io.loadmat('spamTrain.mat')
# print training_data
X = training_data['X']
y = training_data['y']
print 'X.shape =', X.shape
print 'y.shape =', y.shape

# 1.2. Train Naive Bayes classifier
print 'Train Naive Bayes classifier ...'
phi, phi0, phi1 = naive_bayes.train(X, y)
print 'phi =', phi
print 'phi0[0:10] =', phi0[0:10]
print 'phi1[0:10] =', phi1[0:10]

# 2. TEST

# 2.1. Load test data
print 'Load test data ...'
test_data = scipy.io.loadmat('spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest']
print 'X_test.shape =', X_test.shape
print 'y_test.shape =', y_test.shape

# 2.2. Test Naive Bayes classifier
print 'Test Naive Bayes classifier ...'
acc = naive_bayes.test(phi, phi0, phi1, X_test, y_test)
print 'Accuracy =', acc
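# The (phi, phi0, phi1) parameters above suggest a Bernoulli naive Bayes
# model: phi as the class prior P(y=1), and phi0/phi1 as the per-feature
# conditionals P(x_j=1 | y=0) and P(x_j=1 | y=1). A minimal sketch of train()
# and test() under that assumption; the actual naive_bayes module may differ.
import numpy as np

def train(X, y):
    # Laplace-smoothed Bernoulli naive Bayes on a binary n x m matrix X
    y = y.ravel()
    phi = np.mean(y == 1)                                          # P(y = 1)
    phi0 = (X[y == 0].sum(axis=0) + 1.0) / (np.sum(y == 0) + 2.0)  # P(x_j = 1 | y = 0)
    phi1 = (X[y == 1].sum(axis=0) + 1.0) / (np.sum(y == 1) + 2.0)  # P(x_j = 1 | y = 1)
    return phi, phi0, phi1

def test(phi, phi0, phi1, X_test, y_test):
    # classify each row by comparing class log-posteriors, return accuracy
    y_test = y_test.ravel()
    log_p1 = np.log(phi) + X_test.dot(np.log(phi1)) + (1 - X_test).dot(np.log(1 - phi1))
    log_p0 = np.log(1 - phi) + X_test.dot(np.log(phi0)) + (1 - X_test).dot(np.log(1 - phi0))
    preds = (log_p1 > log_p0).astype(int)
    return np.mean(preds == y_test)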