def test_load_dataset_from_csv():
    classifier = NaiveBayesClassifier()
    csv_filename = 'datasets/iris.csv'
    data_0 = ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
    data_2 = ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa']
    data_39 = ['5.1', '3.4', '1.5', '0.2', 'Iris-setosa']
    data_60 = ['5.0', '2.0', '3.5', '1.0', 'Iris-versicolor']
    data_81 = ['5.5', '2.4', '3.7', '1.0', 'Iris-versicolor']
    data_89 = ['5.5', '2.5', '4.0', '1.3', 'Iris-versicolor']
    data_104 = ['6.5', '3.0', '5.8', '2.2', 'Iris-virginica']
    data_110 = ['6.5', '3.2', '5.1', '2.0', 'Iris-virginica']
    data_125 = ['7.2', '3.2', '6.0', '1.8', 'Iris-virginica']
    data_143 = ['6.8', '3.2', '5.9', '2.3', 'Iris-virginica']

    readed_dataset = classifier.load_dataset_from_csv(csv_filename)

    assert readed_dataset[0] == data_0
    assert readed_dataset[2] == data_2
    assert readed_dataset[39] == data_39
    assert readed_dataset[60] == data_60
    assert readed_dataset[81] == data_81
    assert readed_dataset[89] == data_89
    assert readed_dataset[104] == data_104
    assert readed_dataset[110] == data_110
    assert readed_dataset[125] == data_125
    assert readed_dataset[143] == data_143

    csv_filename_2 = 'tests/unit_tests/resources/load_test.csv'
    readed_dataset_2 = classifier.load_dataset_from_csv(csv_filename_2)
    assert len(readed_dataset_2) == 3

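# A minimal loader sketch consistent with the test above -- an assumption, not
# the project's actual implementation: csv.reader yields rows of strings, and
# blank lines are skipped so indices line up with the dataset file.
import csv

def load_dataset_from_csv(filename):
    with open(filename, 'r') as f:
        return [row for row in csv.reader(f) if row]
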
def test_std_deviation():
    classifier = NaiveBayesClassifier()
    numbers = [0.5, 1, 4.56, 3]
    assert np.around(classifier.std_deviation(numbers), 13) == 1.8728498783049
    assert classifier.std_deviation(numbers) == np.std(numbers, ddof=1)

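# A sketch consistent with the assertions above (an assumption, not the tested
# implementation): the sample standard deviation with one delta degree of
# freedom, which is exactly what np.std(numbers, ddof=1) computes.
import math

def std_deviation(numbers):
    mean = sum(numbers) / len(numbers)
    variance = sum((x - mean) ** 2 for x in numbers) / (len(numbers) - 1)
    return math.sqrt(variance)
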
def test_gaussian_probability():
    classifier = NaiveBayesClassifier()
    numbers = [[1.0, 1.0, 1.0], [2.0, 1.0, 1.0], [0.0, 1.0, 1.0]]
    results = [0.3989422804014327, 0.24197072451914337, 0.24197072451914337]
    for i in range(len(numbers)):
        assert classifier.gaussian_probability(
            numbers[i][0], numbers[i][1], numbers[i][2]) == results[i]

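# The expected values above match the standard Gaussian PDF evaluated at
# (x, mean, stdev); 0.3989... is 1/sqrt(2*pi). A sketch of that formula,
# assumed to be what the tested method evaluates:
import math

def gaussian_probability(x, mean, stdev):
    exponent = math.exp(-((x - mean) ** 2) / (2 * stdev ** 2))
    return exponent / (math.sqrt(2 * math.pi) * stdev)
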
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='the path to training data')
    args = parser.parse_args()
    training_data_path = args.path
    _, labels, sentences = load_training_set(path=training_data_path)
    nb = NB(sentences=sentences, labels=labels)
    nb.learn()
    nb.save_model('./nbmodel.txt')

def __init__(self):
    NaiveBayesClassifier.__init__(
        self, 5, classes=["ORGANIZATION", "LOCATION", "PERSON"])
    # Collect male, female, and family names from the gazetteer files;
    # context managers ensure the file handles are closed.
    NAMES = []
    for path in ('./gazeteers/names.male',
                 './gazeteers/names.female',
                 './gazeteers/names.family'):
        with open(path, 'r') as f:
            NAMES.extend(f.read().splitlines())
    self.names = NAMES

def test_evaluate_algorithm():
    classifier = NaiveBayesClassifier()
    dataset = [[3.393533211, 2.331273381, 0],
               [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0],
               [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0],
               [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1],
               [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1],
               [7.939820817, 0.791637231, 1]]
    n_folds = 5
    results_data = classifier.evaluate_algorithm(dataset, n_folds)
    assert len(results_data) == n_folds
    # all(), not a filtering list comprehension: the original assertion passed
    # whenever at least one score was in range, rather than all of them.
    assert all(0 <= data <= 100 for data in results_data)

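# A sketch of the evaluation loop the test above exercises, assuming the
# helper sketches that accompany the other tests in this section: split the
# rows into folds, train on k-1 of them, score accuracy (%) on the held-out
# fold.
def evaluate_algorithm(dataset, n_folds):
    folds = k_fold_cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        train = [row for other in folds if other is not fold for row in other]
        model = calculate_class_parameters(train)
        correct = sum(predict(model, row) == row[-1] for row in fold)
        scores.append(100.0 * correct / len(fold))
    return scores
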
def test_gather_data_params():
    classifier = NaiveBayesClassifier()
    dataset = [[3.393533211, 2.331273381, 0],
               [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0],
               [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0],
               [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1],
               [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1],
               [7.939820817, 0.791637231, 1]]
    results_dataset = [(5.178333386499999, 2.7665845055177263, 10),
                       (2.9984683241, 1.218556343617447, 10)]
    test_results = classifier.gather_data_params(dataset)
    assert test_results == results_dataset

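# A sketch consistent with the expected tuples above (an assumption): one
# (mean, sample stdev, count) triple per attribute column, with the class
# column dropped. Uses the std_deviation sketch shown earlier.
def gather_data_params(dataset):
    params = [(sum(col) / len(col), std_deviation(col), len(col))
              for col in zip(*dataset)]
    del params[-1]  # the last column holds the class label
    return params
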
def test_k_fold_cross_validation_split():
    classifier = NaiveBayesClassifier()
    dataset = [[3.393533211, 2.331273381, 0],
               [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0],
               [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0],
               [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1],
               [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1],
               [7.939820817, 0.791637231, 1]]
    folds_num = 5
    results_dataset = classifier.k_fold_cross_validation_split(
        dataset, folds_num)
    assert len(results_dataset) == folds_num

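# A plain k-fold split sketch consistent with the test above (which only
# checks the fold count): rows are drawn at random without replacement into
# n folds of equal size.
from random import randrange

def k_fold_cross_validation_split(dataset, n_folds):
    pool = list(dataset)
    fold_size = len(dataset) // n_folds
    return [[pool.pop(randrange(len(pool))) for _ in range(fold_size)]
            for _ in range(n_folds)]
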
def test_predict():
    classifier = NaiveBayesClassifier()
    dataset = {
        1: [(2.7420144012, 0.9265683289298018, 5),
            (3.0054686692, 1.1073295894898725, 5)],
        0: [(7.6146523718, 1.2344321550313704, 5),
            (2.9914679790000003, 1.4541931384601618, 5)]
    }
    row = [3.7, 2.9, 0]
    results_predict = classifier.predict(dataset, row)
    assert results_predict == 1

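# A sketch of prediction under the usual Gaussian naive Bayes rule, assumed
# from the model shape above ({label: [(mean, stdev, count), ...]}): score
# each class by prior * product of per-attribute Gaussian likelihoods, and
# return the best-scoring label. Uses the gaussian_probability sketch above.
def predict(model, row):
    total_rows = sum(params[0][2] for params in model.values())
    best_label, best_score = None, -1.0
    for label, params in model.items():
        score = params[0][2] / total_rows  # class prior
        for i, (mean, stdev, _) in enumerate(params):
            score *= gaussian_probability(row[i], mean, stdev)
        if score > best_score:
            best_label, best_score = label, score
    return best_label
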
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help='the path to test data')
    args = parser.parse_args()
    test_data_path = args.path
    ids, sentences = load_dev_set(test_data_path)
    nb = NB()
    nb.load_model('./nbmodel.txt')
    results = list()
    for s in sentences:
        results.append(nb.classify(s))
    save_results('./nboutput.txt', results, ids)

def test_calculate_class_parameters():
    classifier = NaiveBayesClassifier()
    dataset = [[3.393533211, 2.331273381, 0],
               [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0],
               [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0],
               [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1],
               [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1],
               [7.939820817, 0.791637231, 1]]
    results_dataset = {
        0: [(2.7420144012, 0.9265683289298018, 5),
            (3.0054686692, 1.1073295894898725, 5)],
        1: [(7.6146523718, 1.2344321550313704, 5),
            (2.9914679790000003, 1.4541931384601618, 5)]
    }
    assert classifier.calculate_class_parameters(dataset) == results_dataset

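# A sketch tying the helpers together, consistent with the expected dict
# above: summarise each class's rows separately. Relies on the
# gather_data_params sketch earlier and the divide_data_by_class sketch below.
def calculate_class_parameters(dataset):
    return {label: gather_data_params(rows)
            for label, rows in divide_data_by_class(dataset).items()}
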
def main():
    '''
    Main function.

    :return: None
    '''
    # Load Data
    x_train, y_train, x_test, y_test, label_dict = load_mnist(
        which_type='fashion', threshold=0.5)

    # Get the Model
    nbc = NaiveBayesClassifier()

    # Train
    nbc.fit(x_train, y_train)

    # Test
    predictions = nbc.predict(x_test)

    # Evaluate accuracy
    accuracy = np.sum(np.uint8(predictions == y_test)) / len(y_test)
    print("Accuracy: ", accuracy)

    # Show Confusion Matrix
    plot_confusion_matrix(targets=y_test,
                          predictions=predictions,
                          classes=[label_dict[l] for l in label_dict])

    # Plot predictions
    plt.figure()
    while True:
        idx = np.random.randint(0, x_test.shape[0])
        x = x_test[idx]
        p = predictions[idx]
        y = y_test[idx]
        plt.imshow(x, cmap='gray')
        plt.title('Target: {}, Prediction: {}'.format(
            label_dict[int(y)], label_dict[int(p)]))
        plt.waitforbuttonpress()

def test_divide_data_by_class():
    classifier = NaiveBayesClassifier()
    dataset = [[3.393533211, 2.331273381, 0],
               [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0],
               [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0],
               [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1],
               [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1],
               [7.939820817, 0.791637231, 1]]
    results_dataset = {
        0: [[3.393533211, 2.331273381, 0],
            [3.110073483, 1.781539638, 0],
            [1.343808831, 3.368360954, 0],
            [3.582294042, 4.67917911, 0],
            [2.280362439, 2.866990263, 0]],
        1: [[7.423436942, 4.696522875, 1],
            [5.745051997, 3.533989803, 1],
            [9.172168622, 2.511101045, 1],
            [7.792783481, 3.424088941, 1],
            [7.939820817, 0.791637231, 1]]
    }
    assert classifier.divide_data_by_class(dataset) == results_dataset

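# A one-dict sketch consistent with the expected grouping above: rows are
# keyed by the class label stored in the last column.
def divide_data_by_class(dataset):
    divided = {}
    for row in dataset:
        divided.setdefault(row[-1], []).append(row)
    return divided
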
def test_map_class_names_to_ints():
    classifier = NaiveBayesClassifier()
    dataset = [['3.393533211', '2.331273381', '0'],
               ['3.110073483', '1.781539638', '0'],
               ['1.343808831', '3.368360954', '0'],
               ['3.582294042', '4.67917911', '0'],
               ['2.280362439', '2.866990263', '0'],
               ['7.423436942', '4.696522875', '1'],
               ['5.745051997', '3.533989803', '1'],
               ['9.172168622', '2.511101045', '1'],
               ['7.792783481', '3.424088941', '1'],
               ['7.939820817', '0.791637231', '1']]

    classifier.map_class_names_to_ints(dataset, len(dataset[0]) - 1, True)
    for i in range(len(dataset)):
        tested_row = random.randint(0, len(dataset) - 1)
        assert isinstance(dataset[tested_row][len(dataset[0]) - 1], int)

    classifier.map_class_names_to_ints(dataset, len(dataset[0]) - 1, False)
    for i in range(len(dataset)):
        tested_row = random.randint(0, len(dataset) - 1)
        assert isinstance(dataset[tested_row][len(dataset[0]) - 1], int)

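# A sketch of the mapping step, assumed from how the test and the dataset
# classes below use it: label strings become small ints in place, and the
# name-to-int map is returned (with numbers_already the labels are simply
# cast to int).
def map_class_names_to_ints(dataset, column, numbers_already=False):
    if numbers_already:
        for row in dataset:
            row[column] = int(row[column])
        return {}
    class_map = {name: i for i, name in
                 enumerate(sorted({row[column] for row in dataset}))}
    for row in dataset:
        row[column] = class_map[row[column]]
    return class_map
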
def main(): """ Main function """ # load data x_train, y_train, x_test, y_test, label_dict = load_mnist( which_type='digits', threshold=0.5) # get the model nbc = NaiveBayesClassifier() # train nbc.fit(x_train, y_train) # test predictions = nbc.predict(x_test) # evaluate performances accuracy = np.sum(np.uint8(predictions == y_test)) / len(y_test) print('Accuracy: {}'.format(accuracy)) # show confusion matrix plot_confusion_matrix(targets=y_test, predictions=predictions, classes=[label_dict[l] for l in label_dict]) # plot predictions plt.figure() while True: idx = np.random.randint(0, x_test.shape[0]) x = x_test[idx] p = predictions[idx] y = y_test[idx] plt.imshow(x, cmap='gray') plt.title('Target: {}, Prediction: {}'.format(label_dict[int(y)], label_dict[int(p)])) plt.waitforbuttonpress()
# NOTE: the excerpt begins mid-statement; the sys.path.append() call is
# assumed from the dangling closing parenthesis and the local imports below.
import sys

sys.path.append(
    '/Users/rileylittlefield/Desktop/notes/readingnotes/python-ml/data-science-from-scratch/12-exercises'
)
from data_split_for_model_training import split_data
from naive_bayes import NaiveBayesClassifier
from data_harvester import data
import random
import pdb
from collections import defaultdict

random.seed(0)
train_data, test_data = split_data(data, 0.75)
print("train_data_length = %s" % len(train_data))
print("test_data_length = %s" % len(test_data))

classifier = NaiveBayesClassifier()
# pdb.set_trace()
classifier.train(train_data)

classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

true_positives = []
true_negatives = []
false_positives = []
false_negatives = []

subject, classification, predicted_prob = 0, 1, 2
for my_tuple in classified:
    is_spam = my_tuple[classification]
    predict_is_spam = (my_tuple[predicted_prob] > 0.5)
    # if predict_is_spam:
    #     print('hey ho!')

X2 = titanic_data.iloc[:, 1:]
y2 = titanic_data.iloc[:, 0]

########################## Classification ##################################

X_train, X_test, y_train, y_test = train_test_split(X2, y2,
                                                    test_size=0.2,
                                                    random_state=42)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

clf = NaiveBayesClassifier(type='Gaussian')

####### Convert X_train and X_test into an np array for Logistic Regression ######
# clf = LogisticRegression(num_steps=5000, regularisation='L2')

# clf1 = DecisionTree(max_depth=5, split_val_metric='mean',
#                     split_node_criterion='gini')
# clf = RandomForest(n_trees=10, sample_size=0.8, max_features=6,
#                    max_depth=5, split_val_metric='mean',
#                    split_node_criterion='gini')

##### Using two decision trees and a single naive Bayes here, while logistic
##### regression is by default the meta-learner:
# clf = Stacking([(clf, 1), (clf1, 2)])

# clf1 = BoostingDecisionTree(max_depth=5, split_val_metric='mean',
#                             split_node_criterion='gini')
# clf = AdaBoostClassifier(n_trees=100, learning_rate=1)

#### For Logistic Regression

# get in/out file information
train_labels_file = input('Enter training file: ')
test_file = input('Enter test file: ')
out_file = input('Enter out file: ')

# corpus files are relative to these files
path_to_train_labels_file = '/'.join(train_labels_file.split('/')[0:-1])
path_to_test_file = '/'.join(test_file.split('/')[0:-1])

# helper to get doc tokens
get_doc_tokens = lambda handle: \
    list(map(stemmer.stem, nltk.word_tokenize(''.join(handle.readlines()))))

# training stage
print('Beginning training stage...')
classifier = NaiveBayesClassifier()
with open(train_labels_file) as train_handle:
    for line in train_handle:
        doc_file, cls = line.rstrip('\n').split(' ')
        with open(f'{path_to_train_labels_file}/{doc_file}', 'r') as doc_handle:
            classifier.train(get_doc_tokens(doc_handle), cls)
classifier.compile()

# validation stage
print('Beginning validation stage...')
with open(test_file, 'r') as test_handle:
    doc_filenames = test_handle.read().splitlines()

from naive_bayes import NaiveBayesClassifier
import pandas as pd

train_data = pd.read_excel("data/PreferenciasBritanicos.xlsx")

nb = NaiveBayesClassifier()

test_data = [[1, 0, 1, 1, 0], [0, 1, 1, 0, 1]]
test_data = pd.DataFrame(
    test_data, columns=["scones", "cerveza", "wiskey", "avena", "futbol"])

print("*** Train data ***\n\n", train_data, "\n")
print("*** Test data ***\n\n", test_data, "\n")

nb.train(train_data, "Nacionalidad")
prediction = nb.predict(test_data)
print("*** Prediction ***\n\n", prediction)

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from naive_bayes import NaiveBayesClassifier


def compute_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)


if __name__ == '__main__':
    X, y = make_classification(n_samples=1000, n_features=10, n_classes=2)
    X_train, X_test, Y_train, Y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        shuffle=True)

    clf = NaiveBayesClassifier()
    clf.fit(X_train, Y_train)
    predictions = clf.predict(X_test)

    accuracy = compute_accuracy(Y_test, predictions)
    print("The accuracy of the model is: {}".format(accuracy))

def test_arithmetic_mean():
    classifier = NaiveBayesClassifier()
    assert classifier.arithmetic_mean(numbers=[1, 2, 3, 4, 5, 6, 7]) == 4

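# The tested helper is presumably a plain average; a minimal sketch:
def arithmetic_mean(numbers):
    return sum(numbers) / len(numbers)
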
print()
print(classification_report(y_test, predictions))

'''
### K-NEAREST NEIGHBORS ###
'''
from sklearn.datasets import load_iris
from knn import KNN

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

model = KNN()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

'''
### NAIVE BAYES CLASSIFIER ###
'''
from sklearn.datasets import load_wine
from naive_bayes import NaiveBayesClassifier

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

model = NaiveBayesClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

def train(nb, file):
    correct = 0
    count = 0
    with codecs.open(file, 'r', encoding=ModelReader.encoding) as f:
        for line in f:
            count += 1
            lang, sent = line.strip().split("\t")
            pred_lang = nb.predict(sent)
            print("Predicted {0}, actual {1}".format(pred_lang, lang))
            correct += (pred_lang == lang)
    print("Accuracy: {0}".format(correct / count))


def test(nb, file):
    with codecs.open(file, 'r', encoding=ModelReader.encoding) as f:
        for line in f:
            _, sent = line.strip().split("\t")
            pred_lang = nb.predict(sent)
            print("Predicted '{0}' for '{1}'".format(pred_lang, sent))


if __name__ == "__main__":
    test_pass = True
    dir = "/opt/dropbox/17-18/473/project5/language-models"
    files = [os.path.abspath(os.path.join(dir, file))
             for file in os.listdir(dir)]
    lang_file_pairs = {
        file.split(".")[0][-3:]: ModelReader(file).get()
        for file in files
    }
    nb = NaiveBayesClassifier(lang_file_pairs, verbose=True)
    if test_pass:
        test(nb, "/opt/dropbox/17-18/473/project5/test.txt")
    else:
        train(nb, "/opt/dropbox/17-18/473/project5/train.txt")

class PimaIndiansDiabetes:
    """
    Works on the pima-indians-diabetes.csv dataset and interactively performs
    the following actions:

    1. Classify new data entered by the user.
    2. Calculate the algorithm implementation accuracy.
    3. Show the dataset description (pima-indians-diabetes.names file).
    4. Show the dataset rows.
    """

    def __init__(self):
        self.dataset_filename = 'datasets/pima-indians-diabetes.csv'
        self.description_filename = 'datasets/pima-indians-diabetes.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)

    def data_preprocessing(self):
        """
        Converts class names (strings) to ints and class values to floats.

        Args:
            None.

        Returns:
            Nothing.
        """
        for i in range(len(self.dataset[0]) - 1):
            self.nbc.convert_class_values_to_floats(self.dataset, i)
        self.nbc.map_class_names_to_ints(self.dataset,
                                         len(self.dataset[0]) - 1,
                                         numbers_already=True)

    def classify_data(self):
        """
        Creates a new row with values inputted by the user, then classifies it
        to the proper class using the Naive Bayes Classifier algorithm.

        Args:
            None.

        Returns:
            Nothing.
        """
        print('\nEnter the data to be classified.\n')
        attributes = {
            'Number of times pregnant: ': 0.0,
            'Plasma glucose concentration a 2 hours in an oral glucose tolerance test: ': 0.0,
            'Diastolic blood pressure (mm Hg): ': 0.0,
            'Triceps skin fold thickness (mm): ': 0.0,
            '2-Hour serum insulin (mu U/ml): ': 0.0,
            'Body mass index (weight in kg/(height in m)^2): ': 0.0,
            'Diabetes pedigree function: ': 0.0,
            'Age (years): ': 0.0
        }
        for attr in attributes:
            correct_input = False
            while not correct_input:
                try:
                    attr_value = float(input(attr))
                    correct_input = True
                except ValueError:
                    print('Incorrect value! Please enter an integer '
                          'or a floating point number.')
            attributes[attr] = attr_value

        print('\nEntered attributes:\n')
        for attr in attributes:
            print(f'{attr}{attributes[attr]}')
        print()

        confirm_sign = ''
        while confirm_sign not in ['y', 'Y', 'n', 'N']:
            confirm_sign = input('Confirm (y/n): ')
        if confirm_sign in ['n', 'N']:
            return

        model = self.nbc.calculate_class_parameters(self.dataset)
        label = self.nbc.predict(model, list(attributes.values()))
        # The original dataset represents class names as numbers,
        # so the labels have to be printed explicitly.
        if label == 0:
            print('\nThe entered entity was classified as: Negative')
        elif label == 1:
            print('\nThe entered entity was classified as: Positive')
        else:
            raise ValueError(f'Unexpected class label: {label}')

    def calculate_accuracy(self, n_folds=5):
        """
        Calculates algorithm accuracy by using the evaluate_algorithm()
        function.

        Args:
            n_folds (int)
                Number of folds used in the k-fold cross validation split
                algorithm.

        Returns:
            accuracy
                Calculated classifier accuracy in percent.
        """
        scores = self.nbc.evaluate_algorithm(self.dataset, n_folds)
        print('\n\nCalculating the accuracy of the classifier using the '
              'pima-indians-diabetes.csv dataset...')
        print('\nResampling: k-fold cross validation split')
        accuracy = sum(scores) / float(len(scores))
        print(f'\nAccuracy ({n_folds} folds): {round(accuracy, 3)} %\n')
        return accuracy

    def show_dataset_description(self):
        """
        Prints the 'pima-indians-diabetes.names' file to the console output.

        Args:
            None.

        Returns:
            Nothing.
        """
        with open(self.description_filename, 'r') as f:
            csv_reader = csv.reader(f, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            for row in csv_reader:
                for word in row:
                    print(word, end='')
                print()

    def show_dataset_rows(self):
        """
        Prints the 'pima-indians-diabetes.csv' file to the console output.

        Args:
            None.

        Returns:
            Nothing.
        """
        with open(self.dataset_filename, 'r') as f:
            csv_reader = csv.reader(f, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            for row in csv_reader:
                print(','.join(row))

    def run(self):
        """
        Creates the interactive menu from which the user can execute the
        actions handled by the other methods in this class.

        Args:
            None.

        Returns:
            Nothing.
        """
        seed(1)
        print('\n=================================')
        print('  Pima Indians Diabetes dataset')
        print('=================================')
        self.data_preprocessing()
        actions = {
            '1': self.classify_data,
            '2': self.calculate_accuracy,
            '3': self.show_dataset_description,
            '4': self.show_dataset_rows,
        }
        returned_from_function = True
        while True:
            if returned_from_function:
                print('\nChoose the action:')
                print('\n1. Classify new data.')
                print('2. Calculate algorithm accuracy.')
                print('3. Show dataset description.')
                print('4. Show dataset rows.')
                print('5. Go back to the main menu.\n')
                returned_from_function = False
            choice = input('Choice: ')
            if choice == '5':
                break
            if choice not in actions:
                print('Wrong choice! Please choose option 1-5.')
                continue
            try:
                actions[choice]()
            except KeyboardInterrupt:
                pass
            returned_from_function = True

from naive_bayes import NaiveBayesClassifier
import csv

classifier = NaiveBayesClassifier(range(5))

with open("train 2.tsv", 'rb') as phrasedata:
    tsvin = csv.reader(phrasedata, delimiter='\t')
    headers = tsvin.next()
    curriter = 1
    for (_, _, phrase, sentiment) in tsvin:
        classifier.add_example(phrase, int(sentiment))
        if curriter % 1000 == 0:
            print "nother 1000"
        curriter += 1

classifier.sanitize_features()
print "done training"

with open("test.tsv", 'rb') as phrasedata:
    tsvin = csv.reader(phrasedata, delimiter='\t')
    headers = tsvin.next()
    curritter = 1
    print "PhraseId,Sentiment"
    for (phraseid, _, phrase) in tsvin:
        label = classifier.predict(phrase)
        print "{},{}".format(phraseid, label)

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from naive_bayes import NaiveBayesClassifier

# Load the dataset
spams = pd.read_csv("spam.csv", engine="python")

# Clean the DataFrame
spams = spams.dropna(axis=1)
spams.columns = ["spam", "body"]
spams = spams[["body", "spam"]]

# Encode the label
spams["spam"] = LabelEncoder().fit_transform(spams["spam"])

emails = spams["body"]
labels = spams["spam"]
X_train, X_test, y_train, y_test = train_test_split(emails, labels,
                                                    test_size=0.3)
train_data = pd.concat([X_train, y_train], axis=1)

# Train and classify
nc = NaiveBayesClassifier()
nc.train(train_data)
print(nc)
print(nc.classify("sign up today and win a prize"))
print(nc.classify("At what time would you like to meet"))

# Note: this type of model works better on small datasets.

class Iris:
    """
    Works on the iris.csv dataset and interactively performs the following
    actions:

    1. Classify new data entered by the user.
    2. Calculate the algorithm implementation accuracy.
    3. Show the dataset description (iris.names file).
    4. Show the dataset rows.
    """

    def __init__(self):
        self.dataset_filename = 'datasets/iris.csv'
        self.description_filename = 'datasets/iris.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)
        self.class_map = dict()

    def data_preprocessing(self):
        """
        Converts class names (strings) to ints and class values to floats.

        Args:
            None.

        Returns:
            Nothing.
        """
        seed(1)
        for i in range(len(self.dataset[0]) - 1):
            self.nbc.convert_class_values_to_floats(self.dataset, i)
        self.class_map = self.nbc.map_class_names_to_ints(
            self.dataset, len(self.dataset[0]) - 1)

    def classify_data(self):
        """
        Creates a new row with values inputted by the user, then classifies it
        to the proper class using the Naive Bayes Classifier algorithm.

        Args:
            None.

        Returns:
            Nothing.
        """
        print('\nEnter the data to be classified.\n')
        attributes = {
            'Sepal length [cm]: ': 0.0,
            'Sepal width [cm]: ': 0.0,
            'Petal length [cm]: ': 0.0,
            'Petal width [cm]: ': 0.0
        }
        for attr in attributes:
            correct_input = False
            while not correct_input:
                try:
                    attr_value = float(input(attr))
                    correct_input = True
                except ValueError:
                    print('Incorrect value! Please enter an integer '
                          'or a floating point number.')
            attributes[attr] = attr_value

        print('\nEntered attributes:\n')
        for attr in attributes:
            print(f'{attr}{attributes[attr]}')
        print()

        confirm_sign = ''
        while confirm_sign not in ['y', 'Y', 'n', 'N']:
            confirm_sign = input('Confirm (y/n): ')
        if confirm_sign in ['n', 'N']:
            return

        model = self.nbc.calculate_class_parameters(self.dataset)
        label = self.nbc.predict(model, list(attributes.values()))
        for key, value in self.class_map.items():
            if value == label:
                print(f'\nThe entered entity was classified as: {key}')
                break

    def calculate_accuracy(self, n_folds=5):
        """
        Calculates algorithm accuracy by using the evaluate_algorithm()
        function.

        Args:
            n_folds (int)
                Number of folds used in the k-fold cross validation split
                algorithm.

        Returns:
            accuracy
                Calculated classifier accuracy in percent.
        """
        scores = self.nbc.evaluate_algorithm(self.dataset, n_folds)
        print('\n\nCalculating the accuracy of the classifier using the '
              'iris.csv dataset...')
        print('\nResampling: k-fold cross validation split')
        accuracy = sum(scores) / float(len(scores))
        print(f'\nAccuracy ({n_folds} folds): {round(accuracy, 3)} %\n')
        return accuracy

    def show_dataset_description(self):
        """
        Prints the 'iris.names' file to the console output.

        Args:
            None.

        Returns:
            Nothing.
        """
        with open(self.description_filename, 'r') as f:
            csv_reader = csv.reader(f, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            for row in csv_reader:
                for word in row:
                    print(word, end='')
                print()

    def show_dataset_rows(self):
        """
        Prints the 'iris.csv' file to the console output.

        Args:
            None.

        Returns:
            Nothing.
        """
        with open(self.dataset_filename, 'r') as f:
            csv_reader = csv.reader(f, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            for row in csv_reader:
                print(','.join(row))

    def run(self):
        """
        Creates the interactive menu from which the user can execute the
        actions handled by the other methods in this class.

        Args:
            None.

        Returns:
            Nothing.
        """
        print('\n=================================')
        print('          Iris dataset')
        print('=================================\n')
        self.data_preprocessing()
        actions = {
            '1': self.classify_data,
            '2': self.calculate_accuracy,
            '3': self.show_dataset_description,
            '4': self.show_dataset_rows,
        }
        returned_from_function = True
        while True:
            if returned_from_function:
                print('\nChoose the action:')
                print('\n1. Classify new data.')
                print('2. Calculate algorithm accuracy.')
                print('3. Show dataset description.')
                print('4. Show dataset rows.')
                print('5. Go back to the main menu.\n')
                returned_from_function = False
            choice = input('Choice: ')
            if choice == '5':
                break
            if choice not in actions:
                print('Wrong choice! Please choose option 1-5.')
                continue
            try:
                actions[choice]()
            except KeyboardInterrupt:
                pass
            returned_from_function = True

word_features = sorted(word_features)
word_vector = vectorize(word_features, data['tweet'], data['sentiment'])

vector = []
labels = []
for example in word_vector:
    vector.append(example[0])
    labels.append(example[1])

print "Stage 1: Word Polarity"
print "training bayesian network"
words = get_words("features.txt")
bayes_vector = naive_bayes_vector(words, data['tweet'], data['sentiment'])
# print bayes_vector
NaiveBayesClassifier.train(bayes_vector)

# gnb = BernoulliNB()
# gnb.fit(vector, labels)
# with open('classifier_bayes.pkl', 'wb') as fid:
#     cPickle.dump(gnb, fid)

print "training svm"
svc = SVM()
svm_vector = []
for v in vector:
    for i in range(0, len(v)):
        if v[i] == True:
            v[i] = 1.0
        else:
            v[i] = 0.0  # assumed: the excerpt ended at this `else:`

# NOTE: the excerpt begins inside this helper; its opening lines are assumed
# from the loop body, which flags whether each frequent word appears in a
# document.
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features


featuresets = [(find_features(rev), category)
               for (rev, category) in documents]

# Making sure there is no bias
random.shuffle(featuresets)
print(len(featuresets))

training_set = featuresets[:3000]
testing_set = featuresets[3000:4000]

classifier = NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

save_classifier = open("LIMITED_PICKLES/originalnaivebayes5k.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

# MNB_classifier = SklearnClassifier(MultinomialNB())
# MNB_classifier.train(training_set)
# print("MNB_classifier accuracy percent:",
#       (nltk.classify.accuracy(MNB_classifier, testing_set))*100)
# ###############
# save_classifier = open("LIMITED_PICKLES/MNB_classifier5k.pickle", "wb")
# pickle.dump(MNB_classifier, save_classifier)

def main():
    os.system("clear")
    print "Sentiment Analysis by Luca Giacomel. Disclaimer: this very simple algorithm probably won't work, but it might be worth a try."

    def update_progress(progress, current_operation_message, p):
        df = 2  # dimension factor, len of the graph = 100/df
        sys.stdout.write('\r[{0}{1}] {2}% (Page: {4}) Current operation: {3}\r\r'.format(
            '#' * (progress / df),
            ' ' * (100 / df - (progress / df)),
            progress, current_operation_message, p))
        sys.stdout.flush()

    load_from_hd = "n"
    if os.path.exists("/tmp/db.bin") and os.path.exists("/tmp/neg.tweets") and os.path.exists("/tmp/pos.tweets"):
        proceed = raw_input("I found some tweets already stored, do you want me to use them [y=Yes | n=No | a=Append]? [y/N/a] ").lower()
        while proceed not in ["", "y", "n", "a"]:
            proceed = raw_input("I found some tweets already stored, do you want me to use them? [y/N] ").lower()
        load_from_hd = proceed.lower()

    if load_from_hd == "y" or load_from_hd == "":
        test_tweets = []
        nb = NaiveBayesClassifier(db_path="/tmp/db.bin", categories=['positive', 'negative'])
        print "Done. Read a db of %s words" % len(nb.db)
        search_value = raw_input("What keyword do you want to use to perform the analysis? (you can use @ # :) :( as special operators) ")
        print "Downloading 30 tweets for keywords %s.." % search_value
        z = json.loads(urllib.urlopen("http://search.twitter.com/search.json?q=%s&rpp=30&lang=en" % (urllib.quote(search_value))).read())
        print "Done."
        for m in z['results']:
            test_tweets.append(m['text'])
    elif load_from_hd == "n" or load_from_hd == "a":
        pages_to_load = raw_input("How many pages should I load? [default=20] ")
        while 1:
            try:
                if pages_to_load == "":
                    pages_to_load = 20
                    break
                pages_to_load = int(pages_to_load)
                break
            except:
                pages_to_load = raw_input("How many pages should I load? [default=20] ")
        if load_from_hd == "a":
            pos_tweets = json.load(open("/tmp/pos.tweets"))
            neg_tweets = json.load(open("/tmp/neg.tweets"))
        else:
            pos_tweets, neg_tweets = [], []
        for p in range(1, pages_to_load + 1):
            perc = int(float(p * 100) / pages_to_load)
            isleep = 0
            cycle = True
            while 1:
                try:
                    # Alternate which query is fetched first so neither the
                    # positive nor the negative set falls behind.
                    if cycle:
                        raw_pos_tweets = json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p, urllib.quote(":)"))).read())
                        raw_neg_tweets = json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p, urllib.quote(":("))).read())
                        if len(neg_tweets) < len(pos_tweets):
                            cycle = False
                    else:
                        raw_neg_tweets = json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p, urllib.quote(":("))).read())
                        raw_pos_tweets = json.loads(urllib.urlopen("http://search.twitter.com/search.json?page=%s&q=%s&rpp=100&lang=en" % (p, urllib.quote(":)"))).read())
                        if len(neg_tweets) > len(pos_tweets):
                            cycle = True
                    time.sleep(1)
                    for i in raw_pos_tweets['results']:
                        if pos_tweets.count((i['text'], 'positive')) == 0:
                            pos_tweets.append((i['text'], 'positive'))
                    for i in raw_neg_tweets['results']:
                        if neg_tweets.count((i['text'], 'negative')) == 0:
                            neg_tweets.append((i['text'], 'negative'))
                    update_progress(perc, "Elements: %s positive, %s negative." % (len(pos_tweets), len(neg_tweets)), p)
                    break
                except:
                    update_progress(perc, "Failed to fetch the json, trying again in %s seconds" % 2 ** isleep, p)
                    time.sleep(2 ** isleep)
                    isleep += 1
                    if 2 ** isleep > 64:
                        update_progress(perc, "Load time >64sec. Skipping page.. " + str(p), p)
                        break
            update_progress(perc, "\n", p)

        open("/tmp/pos.tweets", "w").write(json.dumps(pos_tweets))
        open("/tmp/neg.tweets", "w").write(json.dumps(neg_tweets))

        training_start = time.time()
        index = min(len(pos_tweets), len(neg_tweets))
        test_tweets = []
        search_value = raw_input("What keyword do you want to use to perform the analysis? (you can use @ # :) :( as special operators) ")
        print "Downloading 30 tweets for keywords %s.." % search_value
        z = json.loads(urllib.urlopen("http://search.twitter.com/search.json?q=%s&rpp=30&lang=en" % (urllib.quote(search_value))).read())
        print "Done."
        for m in z['results']:
            test_tweets.append(m['text'])

        print "Training the classifier. This might take a while, grab a coffee while I work."
        nb = NaiveBayesClassifier(db={}, categories=['negative', 'positive'])
        nb.train(pos_tweets[:index] + neg_tweets[:index])
        print "Done. Training based on a set of %s elements took %s seconds." % (index * 2, time.time() - training_start)

    for tx in test_tweets:
        print "Tweet: " + OKBLUE + tx + ENDC
        r = nb.classify(tx.lower())
        if r == "positive":
            print "Result: " + OKGREEN + r + ENDC
        elif r == "negative":
            print "Result: " + FAIL + r + ENDC
        # else:
        #     print "Result: " + WARNING + "neutral (was %s with accuracy %s)" % (r[0], r[1]) + ENDC

    nb.save_to_hard_disk()
    nb.show_most_informative()