def getAccuracyRate(self):
    if self.accuracy == 0:
        naiveBayes = NaiveBayesClassifier()
        naiveBayes.word_features = self.chromosome
        classifier = naiveBayes.NaiveBayes(self.chromosomeManager.sentences)
        self.accuracy = naiveBayes.getAccuracy(classifier, self.chromosomeManager.sentences)
    return self.accuracy
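# A minimal usage sketch (an assumption, not original code): with
# getAccuracyRate as the fitness function, a genetic algorithm can select
# the chromosome (feature subset) whose classifier scores best.
def select_fittest(population):
    # population: iterable of chromosome objects exposing getAccuracyRate().
    return max(population, key=lambda c: c.getAccuracyRate())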
def main(): print("Number of processors: ", mp.cpu_count()) pool = mp.Pool(mp.cpu_count()) # for parallel processing df = pd.DataFrame( columns=['comment', 'otherMetadata', 'likeDislikeRatio', 'sentiment']) i = 0 for x in range(751): with open("../output/" + str(x) + ".json", encoding='utf-8') as f: data = json.load(f) # global commentsText commentsText = "" # tags = data[0]["tags"] sentiment = data[0]['sentiment'] if 'tags' in data[0].keys(): tags = str(' '.join(data[0]["tags"])) # print("tags" + tags) otherMetaData = pr.process(data[0]["title"] + " " + data[0]["description"] + " " + tags) else: otherMetaData = pr.process(data[0]["title"] + " " + data[0]["description"]) likes = int(data[0]["likeCount"]) dislikes = int(data[0]["dislikeCount"]) likeDislikeRatio = str(float(likes / dislikes)) results = pool.map(preprocess, [item for item in data[0]["comments"]]) for result in results: if result is not None: commentsText += result[0] df.loc[i] = [commentsText] + [otherMetaData] + [likeDislikeRatio ] + [sentiment] # print(str(i)+" : "+df['posToNegCommentRatio'].loc[i]) # print(df['otherMetadata'].iloc[0]) print(i) i += 1 df['sentiment_one_hot'] = df['sentiment'].apply(lambda x: 0 if x == 'N' else 1) df['data'] = df['comment'] + ' ' + df['otherMetadata'] traindf, testdf = train_test_split(df, test_size=0.2) x_train, x_test, y_train, y_test = train_test_split( df['data'], df['sentiment_one_hot'], test_size=0.2) NBModel = NaiveBayesClassifier() NBModel.train(x_train, y_train, alpha=1) print(y_test) # hateVideoComments = df.loc[18]['comment'] # print(hateVideoComments) levelOfHate = NBModel.getHateLevel(x_test) print(levelOfHate)
def main(use_stop_words):
    bc = NaiveBayesClassifier(use_stop_words)
    spam_total_count, ham_total_count = bc.train()
    spam_success_ratio, ham_success_ratio, total_success_ratio = bc.classify(
        spam_total_count, ham_total_count)
    print('Success Ratio For Spam Emails: %.4f%%' % (spam_success_ratio * 100))
    print('Success Ratio For Ham Emails: %.4f%%' % (ham_success_ratio * 100))
    print('Success Ratio For All Emails: %.4f%%' % (total_success_ratio * 100))
def main():
    x_train, y_train = load_data('data/train.txt')
    x_validation, y_validation = load_data('data/validation.txt')
    number_of_words = 5000
    classifier = NaiveBayesClassifier(number_of_words)
    print("Training classifier...")
    classifier.train(x_train, y_train)
    print("Testing classifier...")
    print('Accuracy:', classifier.get_accuracy(x_validation, y_validation) * 100)
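# A minimal sketch of the train/get_accuracy contract assumed above: a
# multinomial Naive Bayes restricted to the `number_of_words` most frequent
# tokens, with Laplace smoothing. This is an illustrative stand-in (it
# assumes texts arrive as whitespace-separated strings), not the
# repository's actual NaiveBayesClassifier implementation.
import math
from collections import Counter, defaultdict

class SketchNB:
    def __init__(self, number_of_words):
        self.number_of_words = number_of_words

    def train(self, texts, labels):
        # Keep only the most frequent words as the vocabulary.
        counts = Counter(w for t in texts for w in t.split())
        self.vocab = {w for w, _ in counts.most_common(self.number_of_words)}
        self.priors = Counter(labels)
        self.word_counts = defaultdict(Counter)
        for text, label in zip(texts, labels):
            self.word_counts[label].update(w for w in text.split() if w in self.vocab)

    def predict(self, text):
        def log_prob(label):
            # Laplace-smoothed log-likelihood plus (unnormalized) log-prior.
            total = sum(self.word_counts[label].values()) + len(self.vocab)
            return math.log(self.priors[label]) + sum(
                math.log((self.word_counts[label][w] + 1) / total)
                for w in text.split() if w in self.vocab)
        return max(self.priors, key=log_prob)

    def get_accuracy(self, texts, labels):
        hits = sum(self.predict(t) == l for t, l in zip(texts, labels))
        return hits / len(labels)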
def __init__(self):
    self.NBC = NaiveBayesClassifier()
    self.LRC = LogisticRegression()
    nbp = open(PATH + "/" + 'non_breaking_prefixes_tr.txt',
               mode='r', encoding='utf-8').readlines()
    abbreviations = []
    for line in nbp:
        if line.strip() != '' and line.strip()[0] != "#":
            abbreviations.append(line.strip())
    self.abbreviations = abbreviations[1:]
    self.sentence_ending_punctuations = ['.', '!', '?']
    self.quotation_space_starters = ['"', "("]
    self.quotation_space_enders = ['"', ")"]
def testIris(self):
    from NaiveBayesClassifier import NaiveBayesClassifier
    iris = load_iris()
    X_train = iris.data
    y_train = iris.target
    nb = NaiveBayesClassifier(smoothing=True)
    accuracy = np.mean(cross_validation(nb, X_train, y_train))
    print("NaiveBayesClassifier with Laplacian correction: accuracy:", accuracy)
def __init__(self, vocabulary, n, delta):
    self.vocabulary = vocabulary
    self.n = n
    self.delta = delta
    self.model_ca = NaiveBayesClassifier(
        vocabulary, n, delta, "./training_files/ca_training-tweets.txt", "ca", 18318)
    self.model_gl = NaiveBayesClassifier(
        vocabulary, n, delta, "./training_files/gl_training-tweets.txt", "gl", 18318)
    self.model_en = NaiveBayesClassifier(
        vocabulary, n, delta, "./training_files/en_training-tweets.txt", "en", 18318)
    self.model_es = NaiveBayesClassifier(
        vocabulary, n, delta, "./training_files/es_training-tweets.txt", "es", 18318)
    self.model_pt = NaiveBayesClassifier(
        vocabulary, n, delta, "./training_files/pt_training-tweets.txt", "pt", 18318)
    self.model_eu = NaiveBayesClassifier(
        vocabulary, n, delta, "./training_files/eu_training-tweets.txt", "eu", 18318)
    self.arrayModel = [
        self.model_ca, self.model_gl, self.model_en,
        self.model_es, self.model_pt, self.model_eu
    ]
    self.probability = 0
    self.totalTweetCount = 18318
def train(self):
    self._neg_train_data = self.get_training_data(self.data_root + 'train\\neg\\')
    self._pos_train_data = self.get_training_data(self.data_root + 'train\\pos\\')
    print("Preparing Train Data... ")
    documents = [(list(self._neg_train_data.words(fileid)), "neg")
                 for fileid in self._neg_train_data.fileids()]
    documents += [(list(self._pos_train_data.words(fileid)), "pos")
                  for fileid in self._pos_train_data.fileids()]
    print("Preparing Train FeatureSets... ")
    featuresets = [(self.extract_feature(d), c) for d, c in documents]
    print("Training... ")
    self.classifier = NaiveBayesClassifier.train(featuresets)
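# A hedged follow-up sketch for evaluating the NLTK classifier trained
# above. nltk.classify.accuracy and show_most_informative_features are
# real NLTK APIs; the test featuresets are assumed to be built the same
# way as the training ones.
import nltk

def evaluate(self, test_featuresets):
    print("Accuracy: ", nltk.classify.accuracy(self.classifier, test_featuresets))
    self.classifier.show_most_informative_features(10)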
def __init__(self):
    self.NaiveBayesClassifier = NaiveBayesClassifier()
    # Sentence Splitters
    self.RuleBasedSentenceSplitter = RuleBasedSentenceSplitter()
    self.MLBasedSentenceSplitter = MLBasedSentenceSplitter()
    # Tokenizers
    self.RuleBasedTokenizer = RuleBasedTokenizer()
    self.MLBasedTokenizer = MLBasedTokenizer()
    # Normalizer
    self.Normalizer = Normalizer()
    # Stemmer
    self.Stemmer = Stemmer()
    # Stopword Eliminators
    self.StaticStopWordEliminator = StaticStopwordRemover()
    self.DynamicStopWordEliminator = DynamicStopWordEliminator()
def __init__(self):
    self.model_ca = NaiveBayesClassifier(
        0, 3, 0.3, "./training_files/ca_training-tweets.txt", "ca", 18318)
    self.model_gl = NaiveBayesClassifier(
        0, 3, 0.3, "./training_files/gl_training-tweets.txt", "gl", 18318)
    self.model_en = NaiveBayesClassifier(
        0, 3, 0.3, "./training_files/en_training-tweets.txt", "en", 18318)
    self.model_es = NaiveBayesClassifier(
        0, 3, 0.3, "./training_files/es_training-tweets.txt", "es", 18318)
    self.model_pt = NaiveBayesClassifier(
        0, 3, 0.3, "./training_files/pt_training-tweets.txt", "pt", 18318)
    self.model_eu = NaiveBayesClassifier(
        0, 3, 0.3, "./training_files/eu_training-tweets.txt", "eu", 18318)
    self.arrayModel = [
        self.model_ca, self.model_gl, self.model_en,
        self.model_es, self.model_pt, self.model_eu
    ]
    self.probability = 0
    self.totalTweetCount = 18318
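# The two constructors above build six per-language models that differ only
# in the language code (and, in the first variant, the shared
# hyperparameters). An equivalent loop-based construction, as a sketch to
# replace the body of __init__ (behavior unchanged):
def __init__(self, vocabulary=0, n=3, delta=0.3):
    LANGUAGES = ["ca", "gl", "en", "es", "pt", "eu"]
    self.arrayModel = [
        NaiveBayesClassifier(vocabulary, n, delta,
                             "./training_files/%s_training-tweets.txt" % lang,
                             lang, 18318)
        for lang in LANGUAGES
    ]
    self.probability = 0
    self.totalTweetCount = 18318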
def main():
    np.set_printoptions(threshold=sys.maxsize)
    # separate data into training and testing
    training = np.zeros((1, 2), dtype=str)
    testing = np.zeros((1, 2), dtype=str)
    genres = set()
    os.chdir('../Corpus')
    with open('movie_titles_metadata.txt', encoding='utf-8', errors='ignore') as file:
        for line in file:
            line = line.strip().split(" +++$+++ ")
            for x in ast.literal_eval(line[5]):
                genres.add(x)
            arr = [[line[0], line[5]]]
            r = random.random()
            if r < 0.80:
                training = np.concatenate((training, arr), axis=0)
            else:
                testing = np.concatenate((testing, arr), axis=0)
    training = np.delete(training, 0, axis=0)
    testing = np.delete(testing, 0, axis=0)
    # train and test model
    start_time = time.time()
    features = extract_features('movie_lines.txt')
    model = NaiveBayesClassifier()
    print("TRAINING:")
    model.fit(training, features, genres)
    pred_labels, correct_labels, genres = model.predict(training, features)
    accuracy(pred_labels, correct_labels, genres)
    fscore(pred_labels, correct_labels, genres)
    print("TESTING:")
    pred_labels, correct_labels, genres = model.predict(testing, features)
    accuracy(pred_labels, correct_labels, genres)
    fscore(pred_labels, correct_labels, genres)
    print("Time for training and test: %.2f seconds" % (time.time() - start_time))
# loading data
dataset = pd.read_csv('iris.data.txt', names=names).values
# dataset = pd.read_csv('column_3C.dat.txt', names=names).values
dataframe = pd.read_csv('breast-cancer-wisconsin.data.csv', names=names)
dataframe = dataframe.drop(['id', 'bare_nucleoli'], axis=1)
# dataframe = pd.read_csv('dermatology.csv')
# dataframe = dataframe.drop(['age'], axis=1)
# dataset = dataframe.values

# artificial dataset
# features_1 = np.array([[random.uniform(0.4, 0.6), random.uniform(0.4, 0.6), 0] for _ in range(50)])
# features_2 = np.array([[random.uniform(-0.1, 0.1), random.uniform(0.9, 1.1), 1] for _ in range(50)])
# features_3 = np.array([[random.uniform(0.9, 1.1), random.uniform(0.9, 1.1), 2] for _ in range(50)])
# dataset = np.concatenate([features_1, features_2, features_3], axis=0)

bayesClassifier = NaiveBayesClassifier()
# bayesClassifier = BayesLDAClassifier()
# bayesClassifier = BayesQDAClassifier()
dataset = bayesClassifier.normalize_dataset(dataset)
accuracies = []
for j in range(0, 1):
    print("realization %d" % j)
    train_X, train_y, test_X, test_y = bayesClassifier.train_test_split(dataset)
    bayesClassifier.fit(train_X, train_y)
    predictions = bayesClassifier.predict(test_X)
    accuracies.append(bayesClassifier.evaluate(test_y, predictions))
    print(bayesClassifier.confusion_matrix(test_y, predictions))
    bayesClassifier.plot_decision_boundaries(train_X, train_y, test_X, test_y, j)
test_score = []
print("LSA created.")
###########################
# LSA
human_keywords = l.manage_keywords(f.keywords)
lsa_results = l.train_phrases(human_keywords)
print("LSA Results computed.")
for j in range(50):
    sets = Set(lsa_results, f.y, f.x)
    for i in range(len(sets.x_train)):
        ###########################
        # NAIVE BAYES
        naive = NaiveBayesClassifier(alpha=0.01)
        naive.train(numpy.array(sets.x_train[i]), sets.y_train[i])
        test_score.append(
            naive.test_score(numpy.array(sets.x_test[i]),
                             numpy.array(sets.y_test[i])))
    avg = numpy.round(numpy.average(numpy.array(test_score)), 2)
    y.append(avg)
    min_ = numpy.round(numpy.array(test_score).min(), 2)
    yerrormin.append(numpy.round(avg - min_, 2))
    max_ = numpy.round(numpy.array(test_score).max(), 2)
    yerrormax.append(numpy.round(max_ - avg, 2))
    print("Avg test performance: ", avg)
    print(min_)
    print(max_)
    print('\n' * 3)
from NaiveBayesClassifier import NaiveBayesClassifier
import GeneticAlgorithm
import time

naiveBayes = NaiveBayesClassifier()

# read training data
df = naiveBayes.loadCSV('V1.4_Training_new.csv')
contractions = naiveBayes.loadCSV('Contractions.csv', 'contraction', 'text1').to_dict('split')
df['label'] = df.sentence.str.extract(r'((\b\w+)[\.?!\s]*$)')[0]
df['label'] = df.label.map(lambda x: 'suggestion' if x == '1' else 'nonsuggestion')

# preprocessing training data
df = naiveBayes.Preprocessing(df, contractions)

# First element is an array containing the words; the second is the sentiment label.
# We drop words shorter than 3 characters and lowercase everything.
allsentences = [(row['sentence'], row['label']) for index, row in df.iterrows()]
trainingsentences = [([e.lower() for e in words.split() if len(e) >= 3], sentiment)
                     for (words, sentiment) in allsentences]

GA = GeneticAlgorithm
chromosomeManager = GA.ChromosomeManager()
chromosomeManager.sentences = trainingsentences
word_features = naiveBayes.getWordFeatures(trainingsentences)
# Spencer Barton
# 10-601
# Naive Bayes Classifier

from NaiveBayesClassifier import NaiveBayesClassifier
import sys

#===============================================
# Script
#===============================================

TRAIN_FILE_NAME = sys.argv[1]

NB = NaiveBayesClassifier()
NB.train(TRAIN_FILE_NAME)
topWords = NB.getSortedWords()

N = 20
for i in range(N):
    pair = topWords['lib'][i]
    print(pair[0], round(pair[1], 4))
print()
for i in range(N):
    pair = topWords['con'][i]
    print(pair[0], round(pair[1], 4))
# Spencer Barton
# 10-601
# Naive Bayes Classifier

from NaiveBayesClassifier import NaiveBayesClassifier
import sys

#===============================================
# Script
#===============================================

TRAIN_FILE_NAME = sys.argv[1]
TEST_FILE_NAME = sys.argv[2]

NB = NaiveBayesClassifier()
NB.train(TRAIN_FILE_NAME)
NB.test(TEST_FILE_NAME)
l = LSA(MAX_GRAM, MIN_FREQ, P_EIG, x)
print("Parameters: Min_freq =", l.min_freq, "NGram_max =", l.ngram_max,
      "P_eig =", l.p_eig * 100)
print("LSA created.")
###########################
# LSA
human_keywords = l.manage_keywords(f.keywords)
lsa_results = l.train_phrases(human_keywords)
print("LSA Results computed.")
sets = Set(lsa_results, numpy.array(y), numpy.array(x))
for i in range(len(sets.x_train)):
    ###########################
    # NAIVE BAYES
    naive = NaiveBayesClassifier(alpha=ALPHA)
    naive.train(numpy.array(sets.x_train[i]), sets.y_train[i])
    test_score.append(naive.test_score(numpy.array(sets.x_test[i]),
                                       numpy.array(sets.y_test[i])))
if not test_score:
    break
elements.append(n_elements)
avg = numpy.round(numpy.average(numpy.array(test_score)), 2)
classification.append(avg)
min_ = numpy.round(numpy.array(test_score).min(), 2)
classificationerrormin.append(numpy.round(avg - min_, 2))
max_ = numpy.round(numpy.array(test_score).max(), 2)
classificationerrormax.append(numpy.round(max_ - avg, 2))
print("Avg test performance: ", avg)
print(min_)
print(max_)
print('\n' * 3)
# Spencer Barton
# 10-601
# Naive Bayes Classifier

from NaiveBayesClassifier import NaiveBayesClassifier
import sys

#===============================================
# Script
#===============================================

TRAIN_FILE_NAME = sys.argv[1]
TEST_FILE_NAME = sys.argv[2]
N_STOP_WORDS = int(sys.argv[3])

NB = NaiveBayesClassifier(nStopWords=N_STOP_WORDS)
NB.train(TRAIN_FILE_NAME)
NB.test(TEST_FILE_NAME)
print("LSA created.") ########################### # LSA human_keywords = lsa.manage_keywords(f.keywords) print("Start", datetime.datetime.now()) aux1 = datetime.datetime.now() ex1 = lsa.process_examples(human_keywords, train_set[i]) ex1.shutdown(wait=True) print("LSA Results computed.") ########################### ########################### # NAIVE BAYES naive = NaiveBayesClassifier(alpha=0.01) ex2 = lsa.process_examples(human_keywords, test_set[i]) naive.train(numpy.array(train_set[i].get_lsa_results()), train_set[i].get_class_labels()) ex2.shutdown(wait=True) test_score.append( naive.test_score(numpy.array(test_set[i].get_lsa_results()), numpy.array(test_set[i].get_class_labels()), "test")) naive.test_score(numpy.array(train_set[i].get_lsa_results()), numpy.array(train_set[i].get_class_labels()), "train") print("End", datetime.datetime.now()) aux2 = datetime.datetime.now() time_score.append(aux2 - aux1) print("Difference", aux2 - aux1) print("Avg test performance: ", numpy.average(numpy.array(test_score)))
import requests
from NaiveBayesClassifier import NaiveBayesClassifier
from Ploter import Ploter
from Tokenizer import Tokenizer

if __name__ == '__main__':
    nbc = NaiveBayesClassifier()
    urls = open('Testset/test.txt', 'r').readlines()
    url = urls[4].split(" ")
    print("Class of the newspaper article is: " + str(url[0]))
    print("URL: " + str(url[1]))
    # url = input("Enter URL:")
    response = requests.get(url[1])
    if response.status_code == 200:
        t = Tokenizer(response.content)
        clearText = t.clear
        print("Parsed html: " + str(clearText))
        prediction = nbc.predict(clearText)
        percentages = [round(x * 100, 2) for x in prediction[0]]
        result = list(zip(nbc.bbc_train.target_names, percentages))
        print("Prediction probability in percentage: " + str(result))
        labels = nbc.bbc_train.target_names
        pl = Ploter(labels, percentages)
        pl.drawPie()
def NaiveBayesDriver(self):
    naivebayes = NaiveBayesClassifier(self.train_x, self.train_y,
                                      self.test_x, self.test_y)

    # -----= naiveBayes
    naivebayes.train_1()
    naivebayes_labels = naivebayes.predic()
    naivebayes_acc = naivebayes.getAccuracy()
    naivebayes.printResult()
    self.acc['naivebayes-GaussianNB'] = {
        'accuracy': naivebayes_acc,
        'train-time': naivebayes.trainTime(),
        'test-time': naivebayes.testTime(),
    }

    naivebayes.train_2()
    naivebayes_labels_2 = naivebayes.predic()
    naivebayes_acc = naivebayes.getAccuracy()
    naivebayes.printResult()
    self.acc['naivebayes-MultinomialNB'] = {
        'accuracy': naivebayes_acc,
        'train-time': naivebayes.trainTime(),
        'test-time': naivebayes.testTime(),
    }

    naivebayes.train_3()
    naivebayes_labels_3 = naivebayes.predic()
    naivebayes_acc = naivebayes.getAccuracy()
    naivebayes.printResult()
    self.acc['naivebayes-ComplementNB'] = {
        'accuracy': naivebayes_acc,
        'train-time': naivebayes.trainTime(),
        'test-time': naivebayes.testTime(),
    }
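# The three train_*/predic/getAccuracy blocks above differ only in the
# training variant and the result key. The same driver as a loop (a
# sketch; names are taken from the snippet and behavior is unchanged):
for train, key in [(naivebayes.train_1, 'naivebayes-GaussianNB'),
                   (naivebayes.train_2, 'naivebayes-MultinomialNB'),
                   (naivebayes.train_3, 'naivebayes-ComplementNB')]:
    train()
    labels = naivebayes.predic()
    acc = naivebayes.getAccuracy()
    naivebayes.printResult()
    self.acc[key] = {
        'accuracy': acc,
        'train-time': naivebayes.trainTime(),
        'test-time': naivebayes.testTime(),
    }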
contents = list(content)
contents = map(lambda x: 'space' if x == ' ' else x, contents)
wekaContent = ' '.join(contents)
fname = 'weka/%c/%d.txt' % (label, i)
writeFile(fname, wekaContent)
print('Done')

labels = ['e', 'j', 's']
features = list(string.ascii_lowercase) + [' ']
df = pd.DataFrame(columns=features + ['label'])
for label in labels:
    for i in range(0, 10):
        fname = 'languageID/%c%d.txt' % (label, i)
        dfRow = getAttributes(fname)
        dfRow['label'] = label
        df = df.append(dfRow, ignore_index=True)
dfX = df.loc[:, features]
dfY = df.loc[:, 'label']
X = dfX.to_numpy()
y = dfY.to_numpy()
classifier = NaiveBayesClassifier(labels)
classifier.fit(X, y)

q1_3()
q4_6()
q7_8()
weka()
print('Done')
class CuisineClassification:
    def __init__(self):
        self.bag = BagOfWords()
        self.nbc = NaiveBayesClassifier()

    def classify_check(self, train_json, train_ratio, learn_ratio, randomize=False):
        with Timer('Naive Bayes Classifier, Classify Check', indent=0) as total_t:
            with Timer('Loading Recipes for Training') as t:
                with open(train_json) as train_file:
                    recipes = json.load(train_file)
                if randomize:
                    random.shuffle(recipes)
                train_size = int(len(recipes) * train_ratio)
                test_size = int(len(recipes) * learn_ratio)
                train_recipes = recipes[:train_size]
                test_recipes = recipes[train_size:(train_size + test_size)]
                t.update('Loaded {}(training) and {}(testing) recipes'.format(
                    len(train_recipes), len(test_recipes)))
            with Timer('Building Bag of Words') as t:
                self.bag.build_vocabulary(train_recipes)
                t.update('Built bag with {} sized vocabulary'.format(self.bag.count))
            with Timer('Building Training Vectors') as t:
                train_vectors = self.bag.build_vectors(train_recipes)
                t.update('Built {} vectors'.format(len(train_vectors)))
            with Timer('Building Testing Vectors') as t:
                test_vectors = self.bag.build_vectors(test_recipes)
                t.update('Built {} vectors'.format(len(test_vectors)))
            with Timer('Training Bayes Classifier') as t:
                self.nbc.train(train_vectors)
                t.update('Trained with {} vectors'.format(self.nbc.cuisine_total))
            with Timer('Making Predictions') as t:
                total = 0
                correct = 0
                for n in range(len(test_vectors)):
                    vector = test_vectors[n]
                    recipe = test_recipes[n]
                    prediction = self.nbc.classify(vector)
                    total += 1
                    if prediction == recipe['cuisine']:
                        correct += 1
                t.update('Finished {} predictions with accuracy of {:.1%}'.format(
                    total, correct / float(total)))
            total_t.update('Total')

    def classify_test(self, train_json, test_json, prediction_csv, scale):
        with Timer('Naive Bayes Classifier, Classify Test', indent=0) as total_t:
            with Timer('Loading Recipes for Training') as t:
                with open(train_json) as train_file:
                    train_recipes = json.load(train_file)
                train_recipes = train_recipes[:int(len(train_recipes) * scale)]
                t.update('Loaded {} training recipes'.format(len(train_recipes)))
            with Timer('Loading Recipes for Testing') as t:
                with open(test_json) as test_file:
                    test_recipes = json.load(test_file)
                test_recipes = test_recipes[:int(len(test_recipes) * scale)]
                t.update('Loaded {} testing recipes'.format(len(test_recipes)))
            with Timer('Building Bag of Words') as t:
                self.bag.build_vocabulary(train_recipes)
                t.update('Built bag with {} sized vocabulary'.format(self.bag.count))
            with Timer('Building Training Vectors') as t:
                train_vectors = self.bag.build_vectors(train_recipes)
                t.update('Built {} vectors'.format(len(train_vectors)))
            with Timer('Building Testing Vectors') as t:
                test_vectors = self.bag.build_vectors(test_recipes)
                t.update('Built {} vectors'.format(len(test_vectors)))
            with Timer('Training Bayes Classifier') as t:
                self.nbc.train(train_vectors)
                t.update('Trained with {} vectors'.format(self.nbc.cuisine_total))
            with Timer('Writing Predictions') as t:
                predictions = 0
                with open(prediction_csv, "wt") as prediction_file:
                    writer = csv.writer(prediction_file)
                    writer.writerow(['id', 'cuisine'])
                    for n in range(len(test_vectors)):
                        predictions += 1
                        vector = test_vectors[n]
                        recipe_id = test_recipes[n]['id']
                        prediction = self.nbc.classify(vector)
                        writer.writerow([recipe_id, prediction])
                t.update('Wrote out {} predictions'.format(predictions))
            total_t.update('Total')
# let coding = begin
from NaiveBayesClassifier import NaiveBayesClassifier

# three numeric features plus a gender label per row
dataSet = [[182, 81.6, 30, 'masculin'],
           [180, 86.2, 28, 'masculin'],
           [170, 77.1, 30, 'masculin'],
           [180, 74.8, 25, 'masculin'],
           [152, 45.4, 15, 'feminin'],
           [168, 68.0, 20, 'feminin'],
           [165, 59.0, 18, 'feminin'],
           [165, 59.0, 23, 'feminin']]
test = [[183, 59, 20]]

nbc = NaiveBayesClassifier(dataSet)
nbc.pridection(test)  # sic: method name as defined in this project's classifier
import re

import xlrd
import numpy as np
from NaiveBayesClassifier import NaiveBayesClassifier

v = re.compile(r'^[-+]?[0-9]+(\.[0-9]+)?$')  # float or int regex

# load data
data = xlrd.open_workbook('../WTMLDataSet_3.0.xlsx')
table = data.sheet_by_name('WTML')
dataset = []
for i in range(table.nrows):
    line = table.row_values(i)
    dataset.append(line)
dataset = np.array(dataset)
xs = dataset[1:, 1:-1]
ys = (dataset[1:, -1] == '否').astype(np.int32)  # '否' = "no" (not a good melon)
isdiscs = np.array([not bool(v.match(val)) for val in xs[0]])
labels = ['好瓜', '坏瓜']  # "good melon", "bad melon"

# build naive Bayes classifier
classifier = NaiveBayesClassifier(xs, isdiscs, ys)

# input
test_x = ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460]
print("Input: \t%ls" % test_x)

# output
test_y = classifier.classify(np.array(test_x))
print("Output: %s" % labels[test_y])
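# A hedged sketch of what the isdiscs flag above typically selects per
# feature in a mixed-attribute Naive Bayes: Laplace-smoothed frequency
# estimates for discrete attributes, a fitted Gaussian density for
# continuous ones. The real NaiveBayesClassifier internals are not shown
# in this snippet, so this is an assumption.
import math

def likelihood(value, class_values, is_discrete, n_distinct=1):
    if is_discrete:
        # Laplace-smoothed relative frequency of this attribute value.
        count = sum(1 for v in class_values if v == value)
        return (count + 1) / (len(class_values) + n_distinct)
    # Gaussian density fitted to the class's continuous values.
    mean = sum(class_values) / len(class_values)
    var = sum((v - mean) ** 2 for v in class_values) / (len(class_values) - 1)
    x = float(value)
    return math.exp(-(x - mean) ** 2 / (2 * var)) / math.sqrt(2 * math.pi * var)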
train_corpus[k] = lda_models[k][tmp_train]
test_corpus[k] = lda_models[k][tmp_test]
train_label[k] = labels[k][:int(len(labels[k]) * 0.8)]
test_label[k] = labels[k][int(len(labels[k]) * 0.8):]
train_dat[k] = [(format_vector_as_dict(d), l)
                for (d, l) in zip(train_corpus[k].corpus, train_label[k])]
test_dat[k] = [(format_vector_as_dict(d), l)
               for (d, l) in zip(test_corpus[k].corpus, test_label[k])]

# training phase
print('Start training Naive Bayes Classifier')
for k in train_dat.keys():
    classifier = NaiveBayesClassifier.train(train_dat[k])
    # test the accuracy
    print('Testing')
    results = classifier.batch_classify([fs for (fs, l) in test_dat[k]])
    correct = [l == r for ((fs, l), r) in zip(test_dat[k], results)]
    if correct:
        acc = float(sum(correct)) / len(correct)
    else:
        acc = 0
    print(k, acc)

# translate feature
# dataset = [(format_vector_as_dict(d), l) for (d, l) in zip(corpus, labels)]
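# Compatibility note (a fact about NLTK, not this snippet): NLTK 3 renamed
# batch_classify to classify_many, so on a current NLTK the classification
# line above becomes:
results = classifier.classify_many([fs for (fs, l) in test_dat[k]])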
class MLBasedTokenizer:
    def __init__(self):
        self.NBC = NaiveBayesClassifier()
        self.LRC = LogisticRegression()
        abbreviations = []
        self.abbreviations = abbreviations[1:]
        self.punctuations1 = ['!', '?', ',', '"', '(', ')']  # Always token
        self.punctuations2 = ['.', ':']  # Ambiguous
        self.split_characters = [' ']

    def create_features(self, string_of_sentences):
        # String of sentences = all tokens made into a string.
        combined_sentences = string_of_sentences
        features = []
        length = len(combined_sentences)
        # Think of every character as an input; build one feature row per
        # character (except the first and last one).
        for index in range(0, length - 1):
            # Features
            # - Is next character a splitter?
            # - Is next character a punctuation?
            # - Is character a punctuation?
            # - Is previous character a quotation mark or closing parenthesis?
            # - Is next character a quotation mark or opening parenthesis?
            is_next_char_splitter = combined_sentences[index + 1] in self.split_characters
            is_prev_char_punc = combined_sentences[index - 1] in self.punctuations1
            is_next_char_punc = combined_sentences[index + 1] in self.punctuations1
            is_prev_char_punc2 = combined_sentences[index - 1] in self.punctuations2
            is_next_char_punc2 = combined_sentences[index + 1] in self.punctuations2
            is_char_punc2 = combined_sentences[index] in self.punctuations2
            is_prev_char_numeric = combined_sentences[index - 1].isnumeric()
            is_char_numeric = combined_sentences[index].isnumeric()
            is_next_char_numeric = combined_sentences[index + 1].isnumeric()
            if index + 2 < length:  # guard against reading past the end
                is_next_next_char_numeric = combined_sentences[index + 2].isnumeric()
            else:
                is_next_next_char_numeric = is_char_numeric
            features.append([
                is_next_char_splitter, is_next_char_punc, is_prev_char_punc,
                is_next_char_punc2, is_prev_char_punc2, is_char_punc2,
                is_prev_char_numeric, is_next_char_numeric
            ])
        # print(np.array(features) * 1)
        return np.array(features) * 1

    def create_labels(self, string_of_sentences):
        # Create labels for all inputs (characters),
        # except the first and last character.
        length = len(string_of_sentences)
        input_length = length - 1
        y = np.zeros(input_length)
        # Create labels based on whether the character is a split or not.
        split_positions = []
        for index in range(0, length - 1):
            if string_of_sentences[index] == ' ':
                split_positions.append(index - 1)
            if string_of_sentences[index] == ',':
                split_positions.append(index - 1)
            if string_of_sentences[index] == ')':
                split_positions.append(index - 1)
            if string_of_sentences[index] == '(':
                split_positions.append(index + 1)
            if string_of_sentences[index] == '.':
                if not string_of_sentences[index - 1].isnumeric():
                    if not string_of_sentences[index + 1].isnumeric():
                        split_positions.append(index - 1)
        # Insert labels into y:
        for position in split_positions:
            if position < input_length:
                y[position] = 1
        # Labeller is OK: it currently says split for "merhaba." but
        # not for a number like "22.12".
        return y

    def fit(self, string_of_sentences, model='NBC'):
        X = self.create_features(string_of_sentences)
        y = self.create_labels(string_of_sentences)
        if model == 'LogisticRegression':
            self.LRC.fit(X, y)
        else:
            self.NBC.fit(X, y)
        return X, y

    def predict(self, string_of_sentences, model='NBC'):
        X = self.create_features(string_of_sentences)
        if model == 'LogisticRegression':
            preds = self.LRC.predict(X)
        else:
            preds = self.NBC.predict(X)
        return [int(i) for i in preds]

    def split_to_tokens(self, string_of_sentences, model='NBC'):
        X = self.create_features(string_of_sentences)
        splitted_sentences = string_of_sentences
        if model == 'LogisticRegression':
            preds = self.LRC.predict(X)
        else:
            preds = self.NBC.predict(X)
        # converting boolean predictions into actual splitted tokens
        length = len(preds)  # equals the input length
        split_locations = [0]
        tokens = []
        for index in range(length):
            if preds[index] == 1:
                split_locations.append(index + 1)
                splitted_sentences = (splitted_sentences[:index + 1] + '*' +
                                      splitted_sentences[index + 2:])
            if index == length - 1 and preds[index] == 0:
                split_locations.append(index + 2)
        length = len(split_locations)
        for index in range(length - 1):
            token = string_of_sentences[split_locations[index]:split_locations[index + 1]]
            if token == ' ':
                token = string_of_sentences[split_locations[index] + 1:
                                            split_locations[index + 1] + 1]
                split_locations[index + 1] = split_locations[index + 1] + 1
            tokens.append(token)
        # Check for the last character
        if preds[len(preds) - 1] == 1:
            tokens.append(string_of_sentences[-1])
        return tokens
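# A minimal usage sketch for the MLBasedTokenizer defined above; the
# sample text is illustrative only.
if __name__ == '__main__':
    tokenizer = MLBasedTokenizer()
    sample = 'Merhaba dunya. Bu bir deneme, degil mi?'
    tokenizer.fit(sample, model='NBC')             # learn per-character split labels
    print(tokenizer.predict(sample, model='NBC'))  # 0/1 split decisions per character
    print(tokenizer.split_to_tokens(sample))       # tokens recovered from predictions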
###########################
# LSA
l = LSA(MAX_GRAM, MIN_FREQ, P_EIG, f.x)
print("Parameters: Min_freq =", l.min_freq, "NGram_max =", l.ngram_max,
      "P_eig =", l.p_eig * 100)
human_keywords = l.manage_keywords(f.keywords)
lsa_results = l.train_phrases(human_keywords)
# n_labels = [len(list(group)) for key, group in groupby(f.y)]
print("LSA Results computed.")
sets = Set(lsa_results, f.y, f.x)
for i in range(len(sets.x_train)):
    # error_per_class = numpy.zeros(22)
    # errors = 0
    ###########################
    # NAIVE BAYES
    naive = NaiveBayesClassifier(alpha=ALPHA)
    naive.train(numpy.array(sets.x_train[i]), sets.y_train[i])
    test_score.append(naive.test_score(numpy.array(sets.x_test[i]),
                                       numpy.array(sets.y_test[i])))
    for j in range(len(sets.x_test[i])):
        predicted_class = f.search_for_phrase(naive, sets.x_test[i][j])
        r_class = sets.y_test[i][j]
        if r_class != predicted_class:
            if r_class in (4, 5, 6, 7, 9, 10, 11, 12, 14):
                print(numpy.round(naive.all_classes_result(sets.x_test[i][j]), 2))
                for o in range(len(sets.x_test[i][j])):
                    if sets.x_test[i][j][o] > 0.1:
                        print("Prob of term " + l.features_utterance[o] + " of real class:",
                              math.exp(naive.classifier.feature_log_prob_[r_class][o]))
                        print("Prob of term " + l.features_utterance[o] + " of predicted class:",
                              math.exp(naive.classifier.feature_log_prob_[predicted_class][o]))
                        print(sets.x_test[i][j][o])
                        print('\n')
                        # print(l.features_utterance[o])
def cleanText(read_data):
    # Remove special characters contained in the text
    text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '',
                  read_data).replace('\n', '').replace('\t', '')
    return text


def get_test_file():
    path_dir = '/Users/sinsuung/Workspace/Python/unstructured_data_final_project/corpus/test/'
    file_list = os.listdir(path_dir)  # get the list of files in the path
    file_list.sort()  # sort by file name
    for i in file_list:
        f = open(path_dir + i)
        test_list.append(f.read())


if __name__ == "__main__":
    model = NaiveBayesClassifier()
    df = pd.read_csv(
        '/Users/sinsuung/Workspace/Python/unstructured_data_final_project/corpus/dev/out/result.csv',
        delimiter=',', header=None, names=['LABLE', 'CONTENT'], encoding='utf-8')
    model.train(df)
    test_list = []
    get_test_file()
    for i in test_list:
        model.category_probability(i)