def cross_validate(dataset, scramBool):
    """Run a 10-fold cross-validation of the module-level `nb` classifier.

    The dataset is copied, optionally passed through `ten_percent_scrambler`,
    and split with `splitter`. Each of the first ten folds is held out once:
    the remaining folds are flattened and used to train `nb`, the held-out
    fold is classified, and `analyze` is called on the pair. Finally the
    statistics over all folds together are printed with `array_printer_2d`.

    Args:
        dataset: The full dataset; its exact layout is defined by `splitter`
            and is not visible from this function.
        scramBool: When truthy, `ten_percent_scrambler` is applied first.

    Returns:
        None. The pooled statistics are printed, not returned.

    `num_classes` is read from module scope.
    """
    folds = copy.copy(dataset)

    if scramBool:
        # NOTE(review): the scrambler's return value is never read again;
        # only in-place effects on the rows shared with the shallow copy
        # above can reach `folds`. Confirm this is intended.
        dataset = ten_percent_scrambler(dataset)

    folds = splitter(folds)

    classified_folds = []
    per_fold_stats = []
    for fold_index in range(10):
        # Start every fold from an empty frequency table.
        nb.freqTable = []

        # Work on a shallow copy so that popping the held-out fold does not
        # shrink `folds` itself.
        training_folds = copy.copy(folds)
        held_out = make_test_set(training_folds.pop(fold_index))

        nb.train(flatten_list(training_folds))
        predictions = nb.classify(held_out)

        classified_folds.append(predictions)
        per_fold_stats.append(
            analyze(folds[fold_index], predictions, num_classes))

    # Compare every classified row with the original data in one pass.
    pooled_stats = analyze(
        flatten_list(folds), flatten_list(classified_folds), num_classes)
    array_printer_2d(pooled_stats)
def naive_bayes(train_docs, train_keys, test_docs, test_keys, model_file, N):
    """Train a Naive Bayes model, pickle it, and evaluate it document by document.

    Args:
        train_docs: Training documents, passed to `extract_features`.
        train_keys: Keys for the training documents, passed alongside them.
        test_docs: Documents to evaluate on.
        test_keys: True keys for each test document (zipped with `test_docs`).
        model_file: Path of the pickled classifier. Three sibling files are
            written next to it with the suffixes `.phrase_list`, `.idf_vec`
            and `.training_size`.
        N: Forwarded unchanged to `evaluate_one_doc`.

    Returns:
        A dict with the keys `'accuracy'`, `'recall'` and `'precision'`.
        `'accuracy'` is always 0 because no accuracy is computed here;
        the other two are the means over all test documents (0.0 when there
        are no test documents).
    """
    X_train, y_train, phrase_list_train, idf_vec = extract_features(
        train_docs, train_keys)
    print("--Feature matrices calculated, NB now training...")
    clf = NB.train(X_train, y_train)

    print("--Saving model...")
    training_size = len(train_docs)
    artifacts = (
        ('', clf),
        ('.phrase_list', phrase_list_train),
        ('.idf_vec', idf_vec),
        ('.training_size', training_size),
    )
    for suffix, payload in artifacts:
        # Pickle data must go to a binary stream: text mode fails on
        # Python 3 and can corrupt the stream through newline translation.
        with open(model_file + suffix, 'wb') as f:
            pickle.dump(payload, f)

    print("--NB trained, NB now testing...")
    accuracy = 0  # placeholder: accuracy is not computed in this function
    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(
            doc, phrase_list_train, idf_vec, training_size)
        precision, recall = evaluate_one_doc(
            'NB', clf, candidates, features, true_keys, N)
        precisions.append(precision)
        recalls.append(recall)

    # float() avoids Python 2 integer truncation; the guard avoids a
    # ZeroDivisionError when no test documents were supplied.
    avg_precision = float(sum(precisions)) / len(precisions) if precisions else 0.0
    avg_recall = float(sum(recalls)) / len(recalls) if recalls else 0.0

    return {
        'accuracy': accuracy,
        'recall': avg_recall,
        'precision': avg_precision,
    }
def rebuild_models(self, for_eval=False):
    '''
    Retrain every model from the currently labeled datasets.

    When `for_eval` is truthy and `self.undersample_before_eval` is set,
    the datasets returned by `self.undersample_function()` are used instead
    of `self.labeled_datasets`. One `NB_Model` is built per training matrix
    and the resulting list replaces `self.models`.
    '''
    datasets = self.labeled_datasets
    undersample_first = self.undersample_before_eval and for_eval
    if undersample_first:
        print("undersampling before building models..")
        datasets = self.undersample_function()

    all_train_sets, labels = self._datasets_to_matrices(datasets)

    # Build the full list first so self.models is replaced only once every
    # model has been trained.
    rebuilt = []
    for training_set in all_train_sets:
        trained = naive_bayes.train(training_set, labels)
        rebuilt.append(NB_Model(trained))
    self.models = rebuilt
def main():
    """Train and evaluate Naive Bayes and logistic regression from the CLI.

    `argv[1]` and `argv[2]` are the training and test inputs handed to
    `read_data`. Every further argument is parsed as a float lambda constant
    for one logistic-regression run. Each model is evaluated twice: with the
    stop-word flag off, then on. Accuracies are printed with six decimals.
    """
    training = read_data(argv[1])
    test = read_data(argv[2])

    # (stop-word flag, message tail) in the order the results are reported.
    nb_runs = (
        (False, "accurate with stop words unfiltered"),
        (True, "accurate with stop words filtered"),
    )
    for filter_stop_words, message_tail in nb_runs:
        classifiers = naive_bayes.train(training, filter_stop_words)
        nb_accuracy = naive_bayes.test(test, filter_stop_words, classifiers)
        print("Naive Bayes is", "{0:.6f}".format(nb_accuracy), message_tail)

    lr_runs = (
        (False,
         "accurate with stop words unfiltered and lambda constant equal to"),
        (True,
         "accurate with stop words filtered and lambda constant equal to"),
    )
    for raw_lambda in argv[3:]:
        print()
        lambda_constant = float(raw_lambda)
        for filter_stop_words, message_tail in lr_runs:
            weights = logistic_regression.train(
                training, filter_stop_words, 25, lambda_constant)
            lr_accuracy = logistic_regression.test(
                test, filter_stop_words, weights)
            print(
                "Logistic Regression is",
                "{0:.6f}".format(lr_accuracy),
                message_tail,
                lambda_constant)
def main():
    """Train the selected classifier on every CSV in `data/` and write a report.

    For each `data/*.csv` file the rows are split 80/20 into train and test
    samples, a model is trained (`args.option == 1` uses `nb.train`,
    `args.option == 2` uses `knn.train`), and the predictions for the
    `'conteudo'` column are compared with the `'saida'` column. The results
    go to `results/<csv name without extension>_classification.txt`.
    Files are skipped when no model was selected.
    """
    # Kept as a function-local import, but hoisted out of the per-file loop.
    from sklearn.metrics import (
        accuracy_score, classification_report, confusion_matrix)

    args_parser = build_args_parser()
    args = args_parser.parse_args()

    results_dir_path = 'results'
    raw_data_dir_path = 'data'
    os.makedirs(results_dir_path, exist_ok=True)

    for file_path in glob.glob(raw_data_dir_path + '/*.csv'):
        # .stem drops only the final extension; str.replace() would also
        # remove any earlier occurrence of ".csv" inside the name.
        file_name = pathlib.Path(file_path).stem

        df = pd.read_csv(file_path)
        train_sample, test_sample = train_test_split(df, test_size=0.2)

        model = None
        if args.option == 1:
            model = nb.train(train_sample)
        elif args.option == 2:
            model = knn.train(train_sample)
        if model is None:
            continue

        expected = test_sample['saida']
        predicted = model.predict(test_sample['conteudo'])
        # Fraction of exact matches between predictions and expected labels.
        precision = np.mean(predicted == expected)

        report_path = results_dir_path + "/" + file_name + "_classification.txt"
        # The context manager closes the report even if a write fails.
        with open(report_path, "w") as report:
            report.write("Quantidade de entradas para treino: "
                         + str(len(train_sample.index)) + "\n")
            report.write("Quantidade de entradas para teste: "
                         + str(len(test_sample.index)) + "\n")
            report.write("Precisão: " + str(precision) + "\n")
            report.write(str(confusion_matrix(expected, predicted)))
            report.write(str(classification_report(expected, predicted)))
            report.write(str(accuracy_score(expected, predicted)))
def naive_bayes(train_docs, train_keys, test_docs, test_keys, model_file, N):
    """Train a Naive Bayes model, persist it, and score it on each test document.

    Args:
        train_docs: Training documents, handed to `extract_features`.
        train_keys: Keys belonging to the training documents.
        test_docs: Documents used for evaluation.
        test_keys: True keys per test document (paired with `test_docs`).
        model_file: Destination path for the pickled classifier; the phrase
            list, IDF vector and training-set size are pickled to
            `model_file` + `.phrase_list` / `.idf_vec` / `.training_size`.
        N: Passed through to `evaluate_one_doc`.

    Returns:
        A dict with `'accuracy'` (always 0, it is not computed here),
        `'recall'` and `'precision'` (means over the test documents, or 0.0
        when `test_docs` is empty).
    """
    X_train, y_train, phrase_list_train, idf_vec = extract_features(
        train_docs, train_keys)
    print("--Feature matrices calculated, NB now training...")
    clf = NB.train(X_train, y_train)

    print("--Saving model...")
    n_train = len(train_docs)
    to_save = [
        (model_file, clf),
        (model_file + '.phrase_list', phrase_list_train),
        (model_file + '.idf_vec', idf_vec),
        (model_file + '.training_size', n_train),
    ]
    for path, obj in to_save:
        # Binary mode is required for pickle streams (text mode raises on
        # Python 3 and may translate newlines on Windows).
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    print("--NB trained, NB now testing...")
    accuracy = 0  # no accuracy is computed; kept so the result keys stay stable
    precisions = []
    recalls = []
    for doc, true_keys in zip(test_docs, test_keys):
        candidates, features = extract_candidates_doc(
            doc, phrase_list_train, idf_vec, n_train)
        precision, recall = evaluate_one_doc(
            'NB', clf, candidates, features, true_keys, N)
        precisions.append(precision)
        recalls.append(recall)

    # Guard against an empty test set and force true division on Python 2.
    if precisions:
        avg_precision = float(sum(precisions)) / len(precisions)
        avg_recall = float(sum(recalls)) / len(recalls)
    else:
        avg_precision = 0.0
        avg_recall = 0.0

    return {
        'accuracy': accuracy,
        'recall': avg_recall,
        'precision': avg_precision,
    }
#!/usr/bin/env python
# Trains the naive_bayes model on INPUT_FILE and reports per-class accuracy
# on the tab-separated TEST_FILE (column 0: label, column 1: message text).

INPUT_FILE = 'SMSSpamCollection'
TEST_FILE = 'TestCollection'

import csv
import naive_bayes

dataset = naive_bayes.load_dataset(INPUT_FILE)
model = naive_bayes.train(dataset)

total = {'spam': 0, 'ham': 0}
correct = {'spam': 0, 'ham': 0}

with open(TEST_FILE) as f:
    # NOTE(review): the default csv quoting treats '"' as a quote character;
    # confirm that matches how load_dataset parses the training file.
    reader = csv.reader(f, delimiter='\t')
    for line in reader:
        # Blank or malformed rows have no label/text pair; skip them instead
        # of failing with an IndexError.
        if len(line) < 2:
            continue
        result = line[0]
        prediction = naive_bayes.predict(line[1], model)
        # Every label other than 'ham' is counted under 'spam'.
        bucket = 'ham' if result == 'ham' else 'spam'
        if prediction == result:
            correct[bucket] += 1
        total[bucket] += 1

for bucket in ('ham', 'spam'):
    if total[bucket]:
        print('{} accuracy: {}%'.format(
            bucket, float(correct[bucket]) * 100 / total[bucket]))
    else:
        # Avoid a ZeroDivisionError when the test file has no such rows.
        print('{} accuracy: n/a (no {} rows in test file)'.format(
            bucket, bucket))
# NOTE(review): truncated fragment, left byte-identical below. It ends on a
# `for gram in nb_grams:` header whose body is not in view.
# Visible flow: get_twenty_twenty() produces X_train_bal / y_train_bal and new
# start indices; naive_bayes.train() returns (classifier, grams, features);
# then `feature_importances_` and `estimators_` are read from that classifier
# and `indices` orders the importances from largest to smallest.
# NOTE(review): `feature_importances_` / `estimators_` look like attributes of
# scikit-learn ensemble models (the RandomForestClassifier lines are commented
# out) - confirm that the object returned by naive_bayes.train() provides them.
# naive_bayes_predictions # pdb.set_trace() X_train_bal, y_train_bal, start_index_rich, start_index_non = get_twenty_twenty( tweets, binary_labels, start_index_rich, start_index_non) # array = np.array([]) # for line in tweets: # array = np.append(array, re.sub(r'[^\w\'] ', " ", line).split() ) # vectorizer = CountVectorizer(tokenizer=tokenize, analyzer='word', ) # pdb.set_trace() nb_classifier, nb_grams, nb_features = naive_bayes.train( X_train_bal, y_train_bal) # randForest_classifier = RandomForestClassifier() # randForest_classifier.fit(X_train_bal, y_train_bal) # show_most_informative_features(vectorizer, nb_classifier, 20) importances = nb_classifier.feature_importances_ std = np.std( [tree.feature_importances_ for tree in nb_classifier.estimators_], axis=0) indices = np.argsort(importances)[::-1] feature_importance = [] i = 0 for gram in nb_grams:
from naive_bayes import extract_features, train, predict

# NOTE(review): `load_dataset`, `checkpoint_path` and `pickle` are not defined
# in this fragment; presumably they are provided earlier in the file.

# Load the raw dataset.
print("\t-------Loading Dataset-------")
X, y = load_dataset()
print("Length of Dataset:", len(X))

# extract_features() returns the train/test split of features and labels.
print("\t-------Extracting Features and Splitting Dataset-------")
train_x, test_x, train_y, test_y = extract_features(X, y)
print("Length of Training set:", len(train_x))
print("Length of Test set:", len(test_x))

print("\t-------Start Training------")
classifier = train(train_x, train_y)
# The context manager closes the checkpoint even if pickling fails.
with open(checkpoint_path, 'wb') as f:
    pickle.dump(classifier, f)
print("Model saved:", checkpoint_path)
print("\t-------End Training-------")

print("\t-------Start Testing------")
# Reload from disk so the evaluation runs on the saved checkpoint.
with open(checkpoint_path, 'rb') as f:
    classifier = pickle.load(f)
accuracy, confusion_matrix = predict(classifier, test_x, test_y)
print("Accuracy :", accuracy * 100)
print("\nConfusion Matrix:")
print(confusion_matrix)
#!/usr/bin/env python
# Trains the naive_bayes model on INPUT_FILE, then prints ham and spam accuracy
# measured on TEST_FILE (tab-separated: label in column 0, text in column 1).

INPUT_FILE = 'SMSSpamCollection'
TEST_FILE = 'TestCollection'

import csv
import naive_bayes

dataset = naive_bayes.load_dataset(INPUT_FILE)
model = naive_bayes.train(dataset)

total = {'spam': 0, 'ham': 0}
correct = {'spam': 0, 'ham': 0}

with open(TEST_FILE) as f:
    # NOTE(review): csv's default quoting treats '"' specially; verify this is
    # consistent with how load_dataset reads the training data.
    reader = csv.reader(f, delimiter='\t')
    for line in reader:
        # Rows without both a label and a message (e.g. blank lines) used to
        # raise IndexError; they are skipped now.
        if len(line) < 2:
            continue
        result = line[0]
        prediction = naive_bayes.predict(line[1], model)
        # Any label that is not 'ham' is tallied as 'spam'.
        label = 'ham' if result == 'ham' else 'spam'
        if prediction == result:
            correct[label] += 1
        total[label] += 1

for label in ('ham', 'spam'):
    if total[label]:
        print('{} accuracy: {}%'.format(
            label, float(correct[label]) * 100 / total[label]))
    else:
        # No rows of this class: report that instead of dividing by zero.
        print('{} accuracy: n/a (no {} rows in test file)'.format(
            label, label))
import naive_bayes
import scipy.io

# Each print below passes one pre-formatted string, so the output is the same
# whether print is a statement or a function.

# 1. TRAIN
# 1.1. Load training data
print('Load training data ...')
training_data = scipy.io.loadmat('spamTrain.mat')
X = training_data['X']
y = training_data['y']
print('X.shape = %s' % (X.shape,))
print('y.shape = %s' % (y.shape,))

# 1.2. Train Naive Bayes classifier
print('Train Naive Bayes classifier ...')
phi, phi0, phi1 = naive_bayes.train(X, y)
print('phi = %s' % (phi,))
# Only the first ten entries of each vector are shown.
print('phi0[0:10] = %s' % (phi0[0:10],))
print('phi1[0:10] = %s' % (phi1[0:10],))

# 2. TEST
# 2.1. Load test data
print('Load test data ...')
test_data = scipy.io.loadmat('spamTest.mat')
X_test = test_data['Xtest']
y_test = test_data['ytest']
print('X_test.shape = %s' % (X_test.shape,))
print('y_test.shape = %s' % (y_test.shape,))

# 2.2. Test Naive Bayes classifier
print('Test Naive Bayes classifier ...')