def category_by_movie(): from nltk.corpus import movie_reviews as mr from nltk import FreqDist from nltk import NaiveBayesClassifier from nltk import classify from nltk.corpus import names from nltk.classify import apply_features import random documents = [(list(mr.words(f)), c) for c in mr.categories() for f in mr.fileids(c)] random.shuffle(documents) all_words = FreqDist(w.lower() for w in mr.words()) word_features = all_words.keys()[:2000] def document_features(document): document_words = set(document) features = {} for word in word_features: features['contains(%s)' % word] = (word in document_words) return features #print document_features(mr.words('pos/cv957_8737.txt')) #print documents[0] features = [(document_features(d), c) for (d, c) in documents] train_set, test_set = features[100:], features[:100] classifier = NaiveBayesClassifier.train(train_set) print classify.accuracy(classifier, train_set)
def cross_validate(classifier, training_set, test_set): chosen_classif = classifier best_accuracy = 0.0 best_train_accuracy = None best_classifier = None k_fold = cross_validation.KFold(len(training_set), n_folds=10) for train_indices, test_indices in k_fold: train = itemgetter(*train_indices)(training_set) test = itemgetter(*test_indices)(training_set) classifier = chosen_classif.train(train) print '--------------------------------' train_accuracy = classify.accuracy(classifier, train) print 'Training set accuracy:' + str(train_accuracy) if len(test_indices) == 1: test = (test,) accuracy = classify.accuracy(classifier, test) if accuracy > best_accuracy: best_classifier = classifier best_accuracy = accuracy best_train_accuracy = train_accuracy print 'Cross validation set accuracy: ' + str(accuracy) get_fscore(classifier, test) print 'Best classifier CV accuracy: ' + str(best_accuracy) test_accuracy = classify.accuracy(best_classifier, test_set) print 'Best classifier accuracy: ' + str(test_accuracy) print 'Best classifier precision recall fscore: ' print get_fscore(best_classifier, test_set) return [test_accuracy, best_train_accuracy, best_classifier]
def evaluate(train_set, test_set, classifier, name):
    """Score ``classifier`` on both splits and report per-label miss rates.

    Parameters:
        train_set, test_set: sequences of ``(features, label)`` pairs.
        classifier: model exposing ``classify(features)``.
        name: accepted but not used by this function.

    Returns:
        ``(trainacc, testacc, spam_false, ham_false)``; the accuracies are
        scaled by 100 and each ``*_false`` value is 100 minus the recall
        (times 100) of the 'spam' / 'ham' label on ``test_set``.
    """
    reference = collections.defaultdict(set)
    predicted = collections.defaultdict(set)
    for index, (features, label) in enumerate(test_set):
        reference[label].add(index)
        predicted[classifier.classify(features)].add(index)

    # Accuracy on both splits, as percentages.
    trainacc = 100 * classify.accuracy(classifier, train_set)
    testacc = 100 * classify.accuracy(classifier, test_set)

    spam_recall = nltk.recall(reference['spam'], predicted['spam'])
    ham_recall = nltk.recall(reference['ham'], predicted['ham'])
    spam_false = 100 - spam_recall*100
    ham_false = 100 - ham_recall*100
    return trainacc, testacc, spam_false, ham_false
def main():
    """Train a Naive Bayes spam filter and pickle it to stdout.

    ``sys.argv[1]`` and ``sys.argv[2]`` are passed to ``load_dataset`` for
    the 'spam' and 'ham' sets.  The trained classifier is dumped to stdout
    and the per-class accuracies are written to stderr.
    """
    spam = load_dataset('spam', sys.argv[1], True)
    ham = load_dataset('ham', sys.argv[2], True)

    # NOTE(review): the training slices ([:11500]) and the test slices
    # ([1000:]) overlap, so the reported accuracies include examples the
    # model was trained on -- confirm the intended split boundary.
    training_examples = ham[:11500] + spam[:11500]
    held_out_spam = spam[1000:]
    held_out_ham = ham[1000:]

    nbc = NaiveBayesClassifier.train(training_examples)
    cPickle.dump(nbc, sys.stdout)

    spam_report = 'Spam accuracy: %f\n' % accuracy(nbc, held_out_spam)
    ham_report = 'Ham accuracy: %f\n' % accuracy(nbc, held_out_ham)
    sys.stderr.writelines([spam_report, ham_report])
def classifier(lambda_):
    """Train and evaluate one classifier for the given ``lambda_``.

    The model is obtained from ``get_classifier`` using the module-level
    ``__train_fs`` feature set and scored on ``__dev_fs`` and
    ``__train_fs``.

    Returns:
        ``(lambda_, dev_acc, train_acc, cf_text, clf)`` where ``cf_text`` is
        the formatted confusion matrix of the predictions on ``__dev_fs``.
    """
    clf = get_classifier('%f' % lambda_, __train_fs, lambda_)
    logging.debug("Finished training the classifier lambda=%f ..." % lambda_)

    dev_acc = accuracy(clf, __dev_fs)
    logging.debug("classifier lambda=%f accuracy on DEV is: %3.5f",
                  lambda_, dev_acc)

    train_acc = accuracy(clf, __train_fs)
    logging.debug("classifier lambda=%f accuracy on TRAIN is: %3.5f",
                  lambda_, train_acc)

    # Collect predictions and gold labels for the confusion matrix.
    predictions = []
    gold_labels = []
    for fs, label in __dev_fs:
        predictions.append(clf.classify(fs))
        gold_labels.append(label)

    matrix = nltk.ConfusionMatrix(gold_labels, predictions)
    cf_text = matrix.pp(sort_by_count=True, show_percents=True, truncate=20)
    return (lambda_, dev_acc, train_acc, cf_text, clf)
def update_category_by_pos(): from nltk.corpus import brown from nltk import NaiveBayesClassifier from nltk import classify from nltk.tag import untag from nltk import DecisionTreeClassifier def pos_features(sentence, i): features = {'suffix(1)':sentence[i][-1:], 'suffix(2)':sentence[i][-2:], 'suffix(3)':sentence[i][-3:] } features['prev-word'] = '<start>' if i==0 else sentence[i-1] return features print pos_features(brown.sents()[0], 8) tagged_sents = brown.tagged_sents(categories='news') featuresets = [] for tagged_sent in tagged_sents: untagged_sent = untag(tagged_sent) for i, (word, tag) in enumerate(tagged_sent): featuresets.append((pos_features(untagged_sent, i), tag)) size = int(len(featuresets) * 0.1) train_set, test_set = featuresets[size:], featuresets[:size] # classifier = NaiveBayesClassifier.train(train_set) classifier = DecisionTreeClassifier.train(train_set) print 'NaiveBay %f' % classify.accuracy(classifier, test_set)
def main_function(): conn = MySQLdb.connect( host=DATABASES["date_cutoff"]["HOST"], user=DATABASES["date_cutoff"]["USER"], passwd=DATABASES["date_cutoff"]["PASSWORD"], db=DATABASES["date_cutoff"]["NAME"], ) training_tweets = get_training_tweets(conn) training_feature_set = classify.process_tweets(training_tweets) classifier = NaiveBayesClassifier.train(training_feature_set) error_dict = {"+": 0, "-": 0, "I": 0, "O": 0} count_dict = {"+": 0, "-": 0, "I": 0, "O": 0} guess_dict = {"+": 0, "-": 0, "I": 0, "O": 0} full_matrix = { "+": {"+": 0, "-": 0, "I": 0, "O": 0}, "-": {"+": 0, "-": 0, "I": 0, "O": 0}, "I": {"+": 0, "-": 0, "I": 0, "O": 0}, "O": {"+": 0, "-": 0, "I": 0, "O": 0}, } count_table = {"+": 0, "-": 0, "I": 0, "O": 0} test_tweets = get_test_tweets(conn) test_feature_set = classify.process_tweets(test_tweets) classifier_accuracy = accuracy(classifier, test_feature_set) print count_table print "classifier accuracy: " + repr(classifier_accuracy)
def main_function(): conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_analysis") hq_conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter") training_tweets = get_test_tweets(conn) training_feature_set = process_tweets(training_tweets) classifier = DecisionTreeClassifier.train(training_feature_set) test_tweets = get_training_tweets(conn) test_feature_set = process_tweets(test_tweets) classifier_accuracy = accuracy(classifier, test_feature_set) alt_full_matrix = {'+':{'+':0, '-':0, 'E':0}, '-':{'+':0, '-':0, 'E':0}, 'E':{'+':0, '-':0, 'E':0}} #for f in test_tweets: #f = test_tweets[0] #print f #guess = classifier.classify(process_tweet(f[1])) #print guess # update_tweet_polarity(f[0], guess, conn) ## pl = classifier.prob_classify(process_tweet(f[1])) # idx = f[2] # if idx == 'I' or idx == 'O': # idx = 'E' # alt_full_matrix[idx][guess] += 1 #print alt_full_matrix print "classifier accuracy: " + repr(classifier_accuracy)
def benchmarking(self, classifier, _test_set, all_f_measure=None, all_precision=None, all_recall=None):
    """Print accuracy, precision, recall and F-measure for ``classifier``.

    Precision, recall and F-measure are computed for the label ``'class'``
    only and appended to the three accumulator lists.

    Parameters:
        classifier: model exposing ``classify`` and
            ``show_most_informative_features``.
        _test_set: sequence of ``(features, label)`` pairs.
        all_f_measure, all_precision, all_recall: optional lists that
            receive this run's scores.  They default to fresh lists per
            call; the previous ``[]`` defaults were shared between calls.
    """
    import collections
    from nltk import classify
    from nltk.metrics import precision
    from nltk.metrics import recall
    from nltk.metrics import f_measure

    if all_f_measure is None:
        all_f_measure = []
    if all_precision is None:
        all_precision = []
    if all_recall is None:
        all_recall = []

    accuracy = classify.accuracy(classifier, _test_set)
    print("accuracy:",accuracy)

    # Index sets of gold labels and predicted labels, keyed by label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(_test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    prec = precision(refsets['class'], testsets['class'])
    rec = recall(refsets['class'], testsets['class'])
    f1 = f_measure(refsets['class'], testsets['class'])
    print('precision:', prec)
    print('recall:', rec)
    print('F-measure:', f1)

    all_f_measure.append(f1)
    all_precision.append(prec)
    all_recall.append(rec)

    print('========Show top 10 most informative features========')
    classifier.show_most_informative_features(10)
def demo(): def gender_features(word): return {"last_letter": word[-1], "penultimate_letter": word[-2]} from nltk.classify import accuracy from nltk.corpus import names import random names = [(name, "male") for name in names.words("male.txt")] + [ (name, "female") for name in names.words("female.txt") ] import random random.seed(60221023) random.shuffle(names) featuresets = [(gender_features(n), g) for (n, g) in names] train_set, test_set = featuresets[500:], featuresets[:500] print "--- nltk.classify.svm demo ---" print "Number of training examples:", len(train_set) classifier = SvmClassifier.train(train_set) print "Total SVM dimensions:", len(classifier._svmfeatureindex) print "Label mapping:", classifier._labelmapping print "--- Processing an example instance ---" print "Reference instance:", names[0] print "NLTK-format features:\n " + str(test_set[0]) print "SVMlight-format features:\n " + str( map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex) ) distr = classifier.prob_classify(test_set[0][0]) print "Instance classification and confidence:", distr.max(), distr.prob(distr.max()) print "--- Measuring classifier performance ---" print "Overall accuracy:", accuracy(classifier, test_set)
def weka(train_set, test_set, algorithm="svm"): from nltk.classify import weka print "--- nltk.classify.weka %s ---" % algorithm temp_dir = tempfile.mkdtemp() classifier = nltk.classify.WekaClassifier.train(temp_dir + "/cred.model", train_set, algorithm) print "Overall accuracy:", accuracy(classifier, test_set)
def demo(): def gender_features(word): return {'last_letter': word[-1], 'penultimate_letter': word[-2]} from nltk.classify import accuracy from nltk.corpus import names import random names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]) import random random.seed(60221023) random.shuffle(names) featuresets = [(gender_features(n), g) for (n,g) in names] train_set, test_set = featuresets[500:], featuresets[:500] print '--- nltk.classify.svm demo ---' print 'Number of training examples:', len(train_set) classifier = SvmClassifier.train(train_set) print 'Total SVM dimensions:', len(classifier._svmfeatureindex) print 'Label mapping:', classifier._labelmapping print '--- Processing an example instance ---' print 'Reference instance:', names[0] print 'NLTK-format features:\n ' + str(test_set[0]) print 'SVMlight-format features:\n ' + str(map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex)) distr = classifier.prob_classify(test_set[0][0]) print 'Instance classification and confidence:', distr.max(), distr.prob(distr.max()) print '--- Measuring classifier performance ---' print 'Overall accuracy:', accuracy(classifier, test_set)
def test_raw_mail(org_email):
    """Return a ``{token: True}`` feature dict for one e-mail.

    The text is tokenised, each token is lower-cased and lemmatised, and
    tokens found in ``stpwords`` are dropped.
    """
    lemmas = [word_limit.lemmatize(token.lower())
              for token in word_tokenize(org_email)]
    return {lemma: True for lemma in lemmas if lemma not in stpwords}


# Build one (features, label) pair per e-mail.
# NOTE(review): this calls raw_mail(), which is not defined in this excerpt
# -- confirm it is not meant to be test_raw_mail().
feature_sets = [(raw_mail(n), g) for (n, g) in mail_shuffle]

# Hold out the first 10% of the feature sets for testing.
size_feature = int(len(feature_sets) * 0.10)
test_set = feature_sets[:size_feature]
train_set = feature_sets[size_feature:]

classifier = NaiveBayesClassifier.train(train_set)

# Report the accuracy as a percentage.
print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100)

# Show the 50 most informative features.
classifier.show_most_informative_features(50)

# Show the labels known to the classifier.
print ('labels:',classifier.labels())

# Classify text entered by the user, forever.
while True:
    featset = raw_mail(input("Enter text to classify: "))
    print (classifier.classify(featset))
def main_function(): conn = MySQLdb.connect(host=DATABASES['date_cutoff']['HOST'], user=DATABASES['date_cutoff']['USER'], passwd=DATABASES['date_cutoff']['PASSWORD'], db=DATABASES['date_cutoff']['NAME']) training_tweets = classify.get_training_tweets(conn_analysis) training_feature_set = process_tweets(training_tweets) config_megam('/opt/packages') classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0) error_dict = {'+':0, '-':0, 'I':0, 'O':0} count_dict = {'+':0, '-':0, 'I':0, 'O':0} guess_dict = {'+':0, '-':0, 'I':0, 'O':0} full_matrix = {'+':{'+':0, '-':0, 'I':0, 'O':0}, '-':{'+':0, '-':0, 'I':0, 'O':0}, 'I':{'+':0, '-':0, 'I':0, 'O':0}, 'O':{'+':0, '-':0, 'I':0, 'O':0}} test_tweets = classify.get_test_tweets(conn_analysis) test_feature_set = process_tweets(test_tweets) classifier.show_most_informative_features(10) classifier_accuracy = accuracy(classifier, test_feature_set) print "classifier accuracy: " + repr(classifier_accuracy)
def cross_validation(data_set, n_folds=8):
    """Pick the best TF-IDF + LinearSVC model over ``n_folds`` folds.

    Parameters:
        data_set: list of labelled feature sets; it is sliced and
            concatenated, so it must support both operations.
        n_folds: number of folds passed to ``KFold``.

    Returns:
        ``(best_classifier, training_accuracy, best_accuracy)`` for the
        fold with the highest validation accuracy.  ``best_classifier`` is
        ``None`` if ``KFold`` yields no folds.
    """
    kf = KFold(len(data_set), n_folds=n_folds)
    best_classifier = None
    best_accuracy = -1
    training_accuracy = 0
    for _train, cv in kf:
        first, last = cv[0], cv[-1]
        classifier = SklearnClassifier(
            Pipeline([('tfidf', TfidfTransformer()),
                      ('nb', LinearSVC(C=1, tol=0.000001))]))
        # Train on everything outside [first, last].  The tail must start
        # at last + 1; starting at last put the final validation example
        # into the training data as well.
        training_data = data_set[:first] + data_set[last + 1:]
        cv_data = data_set[first:last + 1]
        classifier.train(training_data)
        accuracy = classify.accuracy(classifier, cv_data)
        if accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            training_accuracy = classify.accuracy(classifier, training_data)
    return best_classifier, training_accuracy, best_accuracy
def wsd_classifier(trainer, word, features, stopwords_list = STOPWORDS, number=300, log=False, distance=3, confusion_matrix=False): print "Reading data..." global _inst_cache if word not in _inst_cache: _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)] events = _inst_cache[word][:] senses = list(set(l for (i, l) in events)) instances = [i for (i, l) in events] vocab = extract_vocab(instances, stopwords=stopwords_list, n=number) print ' Senses: ' + ' '.join(senses) # Split the instances into a training and test set, #if n > len(events): n = len(events) n = len(events) random.seed(5444522) random.shuffle(events) training_data = events[:int(0.8 * n)] test_data = events[int(0.8 * n):n] # Train classifier print 'Training classifier...' classifier = trainer([(features(i, vocab, distance), label) for (i, label) in training_data]) # Test classifier print 'Testing classifier...' acc = accuracy(classifier, [(features(i, vocab, distance), label) for (i, label) in test_data] ) print 'Accuracy: %6.4f' % acc if log==True: #write error file print 'Writing errors to errors.txt' output_error_file = open('errors.txt', 'w') errors = [] for (i, label) in test_data: guess = classifier.classify(features(i, vocab, distance)) if guess != label: con = i.context position = i.position item_number = str(test_data.index((i, label))) word_list = [] for (word, tag) in con: word_list.append(word) hard_highlighted = word_list[position].upper() word_list_highlighted = word_list[0:position] + [hard_highlighted] + word_list[position+1:] sentence = ' '.join(word_list_highlighted) errors.append([item_number, sentence, guess,label]) error_number = len(errors) output_error_file.write('There are ' + str(error_number) + ' errors!' 
+ '\n' + '----------------------------' + '\n' + '\n') for error in errors: output_error_file.write(str(errors.index(error)+1) +') ' + 'example number: ' + error[0] + '\n' + ' sentence: ' + error[1] + '\n' + ' guess: ' + error[2] + '; label: ' + error[3] + '\n' + '\n') output_error_file.close() if confusion_matrix==True: gold = [label for (i, label) in test_data] derived = [classifier.classify(features(i,vocab)) for (i,label) in test_data] cm = nltk.ConfusionMatrix(gold,derived) print cm return cm
def testall_accuracy(self, testset=[]): # default test set is class field (all test files) if not testset: testset = self.test_feature_set_custom print 'Measuring classifier performance...' acc = accuracy(self.classifier, self.test_feature_set_custom) print 'Overall accuracy:', acc return acc
def buildClassifier(hamDir, spamDir): spamEmails = [] hamEmails = [] allEmails = [] features = [] # Using glob instead of os.listdir to ignore hidden files for email in glob.glob(spamDir + "/*"): f = open(email) spamEmails.append(f.read()) f.close() for email in glob.glob(hamDir + "/*"): f = open(email) hamEmails.append(f.read()) f.close() for email in spamEmails: allEmails.append((email, 'spam')) for email in hamEmails: allEmails.append((email, 'ham')) # Shuffle to get the accuracy of the 70:30 ratio. Otherwise, if no check were to be done, would not need to shuffle. random.shuffle(allEmails) # Make a list of feature per email for (email, label) in allEmails: features.append((emailFeatures(email), label)) # 70:30 ratio for training:testing print "Using a 70:30 ratio for training:testing, the accuracy is as follows: " totalSize = int(len(features) * 0.7) trainingEmails, testingEmails = features[:totalSize], features[totalSize:] print "training size: %d; testing size: %d" %(len(trainingEmails), len(testingEmails)) classifier = NaiveBayesClassifier.train(trainingEmails) print classify.accuracy(classifier, testingEmails) print "Now creating and saving a full size classifier made up of %d emails..." %len(features) classifier = NaiveBayesClassifier.train(features) saveClassifier(classifier, "full-classifier.pickle")
def naives_classifier(self, training_set, dev_set, log=0):
    """Train a Naive Bayes model and print its accuracy on ``dev_set``.

    Parameters:
        training_set: labelled feature sets used for training.
        dev_set: labelled feature sets used for the printed accuracy.
        log: when equal to 1, the 20 most informative features are shown.

    Returns:
        The trained classifier.
    """
    model = NaiveBayesClassifier.train(training_set)
    dev_accuracy = classify.accuracy(model, dev_set)
    print('Naive Bayes accuracy dev percent: ', (dev_accuracy * 100))
    if log == 1:
        model.show_most_informative_features(20)
    return model
def maximum_entropy(train_set, test_set): print "--- nltk.classify.maximum_entropy ---" from nltk.classify import megam megam.config_megam() classifier = nltk.classify.MaxentClassifier.train(train_set, "megam") print "Overall accuracy:", accuracy(classifier, test_set) classifier.show_most_informative_features(10)
def cross_validate(classifier, training_set, test_set):
    """Perform 10-fold cross validation and score the best fold's model.

    Parameters:
        classifier: object exposing ``train(examples)``.
        training_set: indexable sequence of examples, split into folds.
        test_set: held-out examples used to score the selected model.

    Returns:
        A dict with keys ``'classifier'`` (the best fold's model),
        ``'fscore'`` (``get_fscore`` of that model on its validation fold)
        and ``'accuracy'`` (a dict with ``'test_accuracy'`` and
        ``'best_train_accuracy'``).
    """
    chosen_classif = classifier
    best_accuracy = 0.0
    best_train_accuracy = None
    best_classifier = None
    fscore = None

    # Indices of the examples used for training / validation in each fold.
    k_fold = cross_validation.KFold(len(training_set), n_folds=10)

    for train_indices, test_indices in k_fold:
        train = itemgetter(*train_indices)(training_set)
        test = itemgetter(*test_indices)(training_set)

        classifier = chosen_classif.train(train)
        train_accuracy = classify.accuracy(classifier, train)

        # itemgetter called with one index returns the bare item, so wrap
        # it to keep ``test`` a sequence of examples.
        if len(test_indices) == 1:
            test = (test,)

        accuracy = classify.accuracy(classifier, test)

        # Always accept the first fold so that best_classifier and fscore
        # are set even when no fold scores above 0.0.
        if best_classifier is None or accuracy > best_accuracy:
            best_classifier = classifier
            best_accuracy = accuracy
            best_train_accuracy = train_accuracy
            fscore = get_fscore(classifier, test)

    test_accuracy = classify.accuracy(best_classifier, test_set)

    accuracy = {'test_accuracy': test_accuracy,
                'best_train_accuracy': best_train_accuracy}
    to_return = {'classifier': best_classifier,
                 'fscore': fscore,
                 'accuracy': accuracy}
    return to_return
def cross_validate(content_set, times, words, amount_of_words):
    """Print accuracy, precision, recall and F-measure for ``times`` folds.

    Parameters:
        content_set: sliceable sequence of labelled documents.
        times: number of folds.
        words, amount_of_words: forwarded to ``Doc_extract`` to build the
            feature extractor.

    Each fold holds out one contiguous slice of ``len(content_set) // times``
    documents and trains on the rest.
    """
    fold_size = len(content_set) // times
    extractor = Doc_extract(words, amount_of_words)
    total = len(content_set)

    for fold in xrange(times):
        lower = fold * fold_size
        upper = (fold + 1) * fold_size

        remaining = content_set[:lower] + content_set[upper:]
        held_out = content_set[lower: min(upper, total)]
        train_set = apply_features(extractor, remaining)
        test_set = apply_features(extractor, held_out)

        model = get_trained_classifier(train_set)
        acc = accuracy(model, test_set)

        print('\n{0} classifier\n\tAccuracy: {1:.6}'.format(fold + 1, acc))
        print('\tPrecision: {0:.6}\n\tRecall: {1:.6}\n\tF_measure: {2:.6}'.format(*get_f_measure(model, test_set)))
def _cross_train(self, fold_sz):
    """Train one classifier per shard and log its accuracy.

    Parameters:
        fold_sz: forwarded to ``ST.random_shardlize`` and used as the
            divisor for the logged average.
    """
    rid2shard = ST.random_shardlize(fold_sz, len(self._train_xs), load=True)
    precision = 0
    for fid, sd in rid2shard.items():
        # NOTE(review): each model is trained and scored on the same shard
        # ``sd`` -- confirm this is intended rather than training on the
        # remaining shards.
        tmp_train_xs = [self._train_xs[i] for i in sd]
        tmp_train_ys = [self._train_ys[i] for i in sd]
        test_set = [(self._feature_encoding(self._train_xs[i]), self._train_ys[i])
                    for i in sd]
        classifier = self._train(tmp_train_xs, tmp_train_ys)
        p = classify.accuracy(classifier, test_set)
        linfo('maxent classifier precision: %.4f' % p)
        precision += p
    # Parenthesise the division: ``%`` and ``/`` share precedence, so the
    # old expression formatted the string first and then divided it by
    # fold_sz, raising TypeError.
    linfo('average maxent classifier precision: %.4f' % (float(precision) / fold_sz))
def category_by_name(): from nltk import NaiveBayesClassifier from nltk import classify from nltk.corpus import names from nltk.classify import apply_features import random names = ([(name, 'male') for name in names.words('male.txt')] +[(name, 'female') for name in names.words('female.txt')]) random.shuffle(names) def gender_features(word): return {'last_letter':word[-1]} train_set = apply_features(gender_features, names[500:]) test_set = apply_features(gender_features, names[:500]) classifier = NaiveBayesClassifier.train(train_set) print classifier.classify(gender_features('Neo')) print classify.accuracy(classifier, train_set)
def model_dev(func_name): from nltk.corpus import names names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]) random.shuffle(names) print "Length of dataset %d"%len(names) random.shuffle(names) random.shuffle(names) print "How the data set looks" print names[0:10] print "Testing the output of feature extraction" print "For name Gary -- %s"%func_name('Gary') featuresets = [(func_name(n), g) for (n, g) in names] print "length of featureset data %d"%len(featuresets) print featuresets[0:10] train_set, test_set = featuresets[500:], featuresets[:500] print "Length of train data %d"%len(train_set) print "length of test data %d"%len(test_set) time.sleep(10) os.system('clear') print "\n\nNaive Bayes Classification\n\n" nb_classifier = NaiveBayesClassifier.train(train_set) check_list=['Gary', 'Shivam', 'Grace', 'Sarah', 'Shaym', 'Richa', 'Abhisheyk'] for name in check_list: print "Naive gender classification of ---%s --is-- %s---"%(name,nb_classifier.classify(func_name(name))) print "The accuracy of the naive classifier is" print classify.accuracy(nb_classifier, test_set) print "The most informative features are:" print nb_classifier.show_most_informative_features(5) time.sleep(10) os.system('clear') print "\n\nMaxent Classification\n\n" mod=MaxentClassifier.train(train_set) for name in check_list: print "Maxent gender classification of ---%s --is-- %s---"%(name,mod.classify(func_name(name))) print "The accuracy of maxent is" print classify.accuracy(mod, test_set) print "The most informative features are:" print mod.show_most_informative_features(5)
def test(): naive_test_data = datapreparation.create_naive_test_data() # print naive_data[-1] # d1 = doc_cls[1:100] #gc.collect() # naive_data2 = naive_train_data # random.shuffle(naive_data2) # edge = (len(naive_data)/3) * 2 # print '##############################################################' print 'accuracy: ', classify.accuracy(classifier,naive_test_data) print classifier.most_informative_features() print classifier.show_most_informative_features()
def train_and_test(self, num_folds=10): shuffle(self.comment_list) fold_size = int(ceil(len(self.comment_list) / float(num_folds))) folds = [] for i in range(num_folds): start = i * fold_size end = (i+1) * fold_size if end > len(self.comment_list): end = len(self.comment_list) train_comments = self.comment_list[:start] + self.comment_list[end:] test_comments = self.comment_list[start:end] train_set = data_set(train_comments) test_set = data_set(test_comments) classifier = NaiveBayesClassifier.train(train_set) print accuracy(classifier, test_set)
def main(): global best_words tweets = get_tweets_from_db() tweet_list = tweets[1000:1599000] test_list = tweets[:1000]+ tweets[1599000:] word_scores = create_word_scores() best_words = find_best_words(word_scores, 500000) f = open('bestwords.pickle', 'wb') pickle.dump(best_words, f) f.close() training_set = classify.apply_features(best_word_features, tweet_list) print "extracted features" # train the classifier with the training set classifier = NaiveBayesClassifier.train(training_set) print "trained classifier" # create the pickle file f = open('NBclassifier_new.pickle', 'wb') pickle.dump(classifier, f) f.close() print "created pickle" # test for precision and recall refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) test_set = classify.apply_features(best_word_features, test_list) for i, (feats, label) in enumerate(test_set): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) print 'neg precision:', metrics.precision(refsets['0'], testsets['0']) print 'neg recall:', metrics.recall(refsets['0'], testsets['0']) print 'pos precision:', metrics.precision(refsets['4'], testsets['4']) print 'pos recall:', metrics.recall(refsets['4'], testsets['4']) # test_set = classify.apply_features(extract_features, test_list) # print "extracted features" print classify.accuracy(classifier, test_set) print classifier.show_most_informative_features(30)
def sentim(self, data):
    """Classify ``data`` as "Positive" or "Negative".

    On every call a Naive Bayes classifier is trained from two CSV files:
    descriptions in ``tf_idf_adopted_csv.csv`` are labelled "Positive" and
    those in ``tf_idf_adoptable_csv.csv`` "Negative".  The classifier is
    pickled to ``naivebayes_pet.pickle`` and its accuracy on the held-out
    30% of the shuffled data is printed.

    Parameters:
        data: the text to classify; it is converted with ``str()``.

    Returns:
        A one-element list containing the predicted label.
    """
    stop_words = ['the', 'an', 'the', 'i', 'a', 'and', 'to']
    remove_digits = str.maketrans('', '', digits)

    def load_cleaned_tokens(path_csv):
        """Return one cleaned token list per row of the ``description`` column."""
        descriptions = read_df_csv(path_csv)["description"]
        cleaned_tokens_list = []
        for record in descriptions:
            text = record.lower().replace(", '...'", "").replace("...", '')
            text = text.translate(remove_digits)
            # Removing digits can join dots into a new "..." run, hence
            # the second pass.
            text = text.replace(", '...'", "").replace("...", '')
            cleaned_tokens_list.append(
                remove_noise(word_tokenize(text), stop_words))
        return cleaned_tokens_list

    negative_cleaned_tokens_list = load_cleaned_tokens(
        '../data/csv/tf_idf_adoptable_csv.csv')
    freq_dist_neg = FreqDist(get_all_words(negative_cleaned_tokens_list))
    print("most common ADOPTABLE words: ", freq_dist_neg.most_common(10))

    positive_cleaned_tokens_list = load_cleaned_tokens(
        '../data/csv/tf_idf_adopted_csv.csv')
    freq_dist_pos = FreqDist(get_all_words(positive_cleaned_tokens_list))
    print("most common ADOPTED words: ", freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
    positive_dataset = [(description_dict, "Positive")
                        for description_dict in positive_tokens_for_model]
    negative_dataset = [(description_dict, "Negative")
                        for description_dict in negative_tokens_for_model]
    dataset = positive_dataset + negative_dataset

    random.shuffle(dataset)  # mix the two classes before splitting
    # Split once at 70%.  The test slice previously started at the 30%
    # mark, so it overlapped the training data.
    split_at = int(len(dataset) * .7)
    train_data = dataset[:split_at]
    test_data = dataset[split_at:]

    classifier = NaiveBayesClassifier.train(train_data)
    # ``with`` closes the file even if pickling fails.
    with open("naivebayes_pet.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)

    print("%%%%%%%%%%%%%%%%%%%Accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features() prints its own table; wrapping it in
    # print() also emitted a stray "None".
    classifier.show_most_informative_features(10)

    # Strip punctuation from the input and split it into words.
    punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
    words = str(data).translate(str.maketrans('', '', punc)).split()

    parts_of_speech = [nltk.pos_tag(words)]
    print("parts of speech tagging: ", parts_of_speech)

    # Tokenise the words as text.  The earlier version tokenised the repr
    # of the word list, so brackets and quotes ended up in the tokens.
    custom_tokens = remove_noise(word_tokenize(' '.join(words)))
    label = classifier.classify({token: True for token in custom_tokens})
    print(str(words), label)

    sentiment_result = [label]
    print("sentiment_result: ", type(sentiment_result), sentiment_result)
    return sentiment_result
# Build the labelled feature sets and hold out the first 500 for testing.
featuresets = [(gender_features(n), g) for (n, g) in names]
print(len(featuresets))
print(featuresets[0:10])
train_set, test_set = featuresets[500:], featuresets[:500]
print(len(train_set))
print(len(test_set))

# Naive Bayes classifier.
nb_classifier = NaiveBayesClassifier.train(train_set)
print(nb_classifier.classify(gender_features('Gary')))
print(nb_classifier.classify(gender_features('Grace')))
print(classify.accuracy(nb_classifier, test_set))
# show_most_informative_features() prints its own table; wrapping it in
# print() also emitted a stray "None".
nb_classifier.show_most_informative_features(5)

# Maximum-entropy classifier.
me_classifier = MaxentClassifier.train(train_set)
print(me_classifier.classify(gender_features('Gary')))
print(me_classifier.classify(gender_features('Grace')))
# Print the accuracy; it was previously computed and discarded.
print(classify.accuracy(me_classifier, test_set))
me_classifier.show_most_informative_features(5)


def gender_features2(name):
    # NOTE(review): no return statement is visible for this function; the
    # definition may be incomplete.
    features = {}
    features["firstletter"] = name[0].lower()
def create_features(article):
    """Return the bag-of-words feature dict for one article.

    The article is lower-cased, stripped of everything except letters and
    whitespace, and split into a set of words. The result maps
    ``'contains(<word>)'`` to a bool for every entry of the module-level
    ``word_features``.
    """
    # Keep whitespace when stripping: the previous pattern "[^a-z]" removed
    # the spaces too, fusing the whole article into a single token so that
    # no feature could ever match a multi-word article.
    article_features = set(re.sub(r"[^a-z\s]", "", article.lower()).split())
    return {
        'contains({})'.format(word): (word in article_features)
        for word in word_features
    }


print("[=] Generating features...")
# Only the first 500 samples are used; report progress against that subset
# rather than against len(data).
samples = data[:500]
feature_set = []
i = 0
for (article, score) in samples:
    feature_set.append([create_features(article), score])
    i = i + 1
    print("Generated feature " + str(i) + "/" + str(len(samples)))
print("[+] Generated features...")

# 75% / 25% train/test split.
split_at = int(len(feature_set) * 3 / 4)
training_set, testing_set = feature_set[:split_at], feature_set[split_at:]
print("[+] Generated training set of length " + str(len(training_set)) +
      ", and testing set of length " + str(len(testing_set)))

print("[*] Training...")
classifier = NaiveBayesClassifier.train(training_set)
print("[+] Finished training.")

# The context manager closes the file even if pickling fails.
with open('models/checkpoints/headline_classifier.pickle', 'wb') as f_dos:
    pickle.dump(classifier, f_dos)
print("[+] Saved classifier.")

print("[=] Accuracy: " + str(classify.accuracy(classifier, testing_set)))
print("")
print("[=] Analyzing features...")
classifier.show_most_informative_features(5)
# Label every token dict with its class.
pro_dataset = []
for tweet_dict in pro_tokens:
    pro_dataset.append((tweet_dict, "Professional"))
unpro_dataset = []
for tweet_dict in unpro_tokens:
    unpro_dataset.append((tweet_dict, "Unprofessional"))

#print(pro_dataset[0])
dataset = pro_dataset + unpro_dataset
random.shuffle(dataset)

# 70% of the shuffled data trains the model, the remaining 30% tests it.
seventy = int(len(dataset) * 0.70)
train_data, test_data = dataset[:seventy], dataset[seventy:]

#-----model
classifier = NaiveBayesClassifier.train(train_data)
accuracy = classify.accuracy(classifier, test_data)
#print("\nAccuracy is:", classify.accuracy(classifier, test_data))
#print(classifier.show_most_informative_features(10))

#-----testing on custom tweet
# Every command-line argument after the script name becomes part of the
# tweet, each one preceded by a single space.
custom = sys.argv
custom_tweet = "".join(" " + argument for argument in custom[1:])
custom_tokens = remove_noise(word_tokenize(custom_tweet))
# Hold out the first 200 positive and 200 negative reviews for testing and
# train on everything after them. Training used to start at index 100, so
# reviews 100-199 of each class were in both sets and inflated the accuracy.
test_set = pos_reviews_set[:200] + neg_reviews_set[:200]
train_set = pos_reviews_set[200:] + neg_reviews_set[200:]

print(len(test_set), len(train_set))


# #### This part of the code trains the classifier and then prints the accuracy gained (which can be different every time)
#

# In[25]:

from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)

accuracy = classify.accuracy(classifier, test_set)
print(accuracy)


# #### Voice input
# Here we first take input in the form of a live audio stream and then convert it into text.

# In[26]:

import wave
import pyaudio
from os import path
from pydub import AudioSegment
import nltk
import nltk.corpus
import os
import re
def test(self, test_set):
    """Return the accuracy of ``self.classifier`` measured on ``test_set``.

    ``test_set`` is handed unchanged to ``classify.accuracy``.
    """
    score = classify.accuracy(self.classifier, test_set)
    return score
def train_test_model(self):
    '''
    Train and evaluate a Naive Bayes model for text sentiment analysis.

    The positive and negative tweets from NLTK's ``twitter_samples`` corpus
    are cleaned with ``DataTransform.clean_text``, turned into
    ``{token: True}`` feature dicts, labelled 'Positive' / 'Negative',
    shuffled, and split into the first 7000 examples for training and the
    rest for testing. The test accuracy is printed.

    The original author reported 75.467% accuracy; the figure varies from run
    to run because the data is shuffled without a fixed seed. Importing more
    classified tweets could be used to improve the model.

    The trained classifier is stored in ``self.trained_model`` for the
    DataTransform class; nothing is returned.
    '''
    print('Preprocessing classified tweets for model.')
    from nltk.corpus import twitter_samples
    import random

    def labelled_features(file_id, label):
        # Clean one corpus file and pair each tweet's token dict with `label`.
        frame = pd.DataFrame(twitter_samples.strings(file_id)).rename(columns={0: 'text'})
        transform = DataTransform()
        transform.set_df(frame)
        transform.clean_text('text', 'token_text')
        # Iterate the column's values directly: indexing it with
        # range(len(index)) is label-based and fails if the index is not the
        # default 0..n-1 range.
        return [
            ({token: True for token in tokens}, label)
            for tokens in transform.output_df['token_text']
        ]

    dataset = (labelled_features('positive_tweets.json', 'Positive')
               + labelled_features('negative_tweets.json', 'Negative'))
    random.shuffle(dataset)

    train_data = dataset[:7000]
    test_data = dataset[7000:]
    self.trained_model = NaiveBayesClassifier.train(train_data)
    print("Accuracy is:", classify.accuracy(self.trained_model, test_data))
    return
# Pull every comment body together with the label of its submission, keeping
# only the "Climate" and "Housing" submissions.
db = Query('canada_subreddit.db')
db.connect()
cur = db.cursor()
cur.execute('''
    SELECT c.body, s.label
    FROM submissions as s, comments as c
    WHERE s.submission_id = c.submission_id
    AND (s.label = "Climate" OR s.label = "Housing");
''')
data = cur.fetchall()
db.close()

feature_set = FeaturePipeline().create_set(data)

# split() yields a negative index one fifth of the way from the end, so the
# last 20% of the feature set becomes the test set.
split = lambda x: -int(len(x) / 5)
k = split(feature_set)
training_set, testing_set = feature_set[:k], feature_set[k:]

print('Now training...')
Naive_classifier = NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:",
      (classify.accuracy(Naive_classifier, testing_set)))
Naive_classifier.show_most_informative_features(30)
#
# with open(pickle_file,"wb") as save_classifier:
#     pickle.dump(Naive_classifier, save_classifier)
#     save_classifier.close()
random.shuffle(labeled_names)

# featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
# # entries are ({'last_letter': 'g'}, 'male')
# train_set, test_set = featuresets[500:], featuresets[:500]
#
# classifier = nltk.NaiveBayesClassifier.train(train_set)
#
# ans1 = classifier.classify(gender_features('Mark'))
# ans2 = classifier.classify(gender_features('Precilla'))
#
# print("Mark is:", ans1)
# print("Precilla is:", ans2)
# print(accuracy(classifier, test_set))
# classifier.show_most_informative_features(5)
# print(nltk.classify.accuracy(classifier, test_set))

# Train one classifier per entry 0..11 of gender_features(name) and record
# how well each entry predicts gender on its own.
acc = []
for i in range(12):
    featuresets = []
    for (n, gender) in labeled_names:
        featuresets.append((gender_features(n)[i], gender))
    # The first 500 shuffled names are held out for testing.
    test_set = featuresets[:500]
    train_set = featuresets[500:]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc.append(accuracy(classifier, test_set))

print(acc)

# Plot accuracy against feature index and save the figure before showing it.
plt.plot(range(12), acc)
plt.xlabel('Features')
plt.ylabel('Accuracy')
plt.xticks(range(12))
plt.savefig('bc.png', dpi=100, bbox_inches='tight')
plt.show()
#from pprint import pprint ##pprint.pprint(training_featureset) ##pprint.pprint(results) #pprint(results) import sys #sys.exit() print ''' Classifier accuracy (Bayes): %s B Precision (Bayes): %s B Recall (Bayes): %s I Precision (Bayes): %s I Recall (Bayes): %s O Precision (Bayes): %s O Recall (Bayes): %s ''' % (accuracy(bayes_classifier, test_featureset), precision(results[0]['B-SNP'], results[1]['B-SNP']), recall(results[0]['B-SNP'], results[1]['B-SNP']), precision(results[0]['I-SNP'], results[1]['I-SNP']), recall(results[0]['I-SNP'], results[1]['I-SNP']), precision(results[0]['O'], results[1]['O']), recall(results[0]['O'], results[1]['O'])) #bayes_classifier.show_most_informative_features(10) sys.exit() maxent_classifier = nltk.classify.MaxentClassifier.train(training_featureset) maxent_results = get_results(maxent_classifier) print '''
def wst_classifier(trainer, word, features, stopwords_list=STOPWORDS,
                   number=300, log=False, distance=3, confusion_matrix=False):
    """
    Train and evaluate a word-sense classifier for one senseval2 word.

    This function takes as arguments:

        a trainer (e.g., NaiveBayesClassifier.train);

        a target word from senseval2 (you can find these out with
        senseval.fileids(), and they are 'hard.pos', 'interest.pos',
        'line.pos' and 'serve.pos');

        a feature set (this can be wsd_context_features or
        wsd_word_features);

        a number (defaults to 300), which determines for wsd_word_features
        the number of most frequent words within the context of a given
        sense that you use to classify examples;

        a distance (defaults to 3) which determines the size of the window
        for wsd_context_features (if distance=3, then wsd_context_features
        gives 3 words and tags to the left and 3 words and tags to the right
        of the target word);

        log (defaults to False), which if true writes the errors into a file
        errors.txt;

        confusion_matrix (defaults to False), which if set to True prints
        and returns a confusion matrix.

    Calling this function splits the senseval data for the word into a
    training set (80%) and a test set (20%). The split is the same for each
    call because random.seed is given a fixed argument. It then trains the
    trainer on the training set, tests the resulting classifier on the test
    set, and prints its accuracy.

    If log is true, the errors over the test set are written to errors.txt.
    For each error four things are recorded: (i) the index of the example
    within test_data; (ii) the sentence the target word appeared in, with the
    target word upper-cased; (iii) the (incorrect) derived label; and (iv)
    the gold label.

    If confusion_matrix==True, a confusion matrix is printed and returned,
    where each cell [i,j] indicates how often label j was predicted when the
    correct label was i. Otherwise the function returns None.
    """
    # Single-argument print(...) calls behave the same under Python 2 and 3.
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    # Work on a copy so that shuffling does not reorder the cached list.
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print(' Senses: ' + ' '.join(senses))

    # Split the instances into a training and test set
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    training_data = events[:int(0.8 * n)]
    test_data = events[int(0.8 * n):n]

    # Train classifier
    print('Training classifier...')
    classifier = trainer([(features(i, vocab, distance), label)
                          for (i, label) in training_data])

    # Test classifier
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(i, vocab, distance), label)
                                for (i, label) in test_data])
    print('Accuracy: %6.4f' % acc)

    if log:
        # write error file
        print('Writing errors to errors.txt')
        errors = []
        # enumerate() gives each example's index directly instead of a
        # linear test_data.index() search per error.
        for item_number, (i, label) in enumerate(test_data):
            guess = classifier.classify(features(i, vocab, distance))
            if guess != label:
                position = i.position
                # Context entries are normally (word, tag) pairs; tolerate
                # bare strings rather than failing on the tuple unpack.
                word_list = [cv[0] if isinstance(cv, tuple) else cv
                             for cv in i.context]
                hard_highlighted = word_list[position].upper()
                word_list_highlighted = (word_list[0:position]
                                         + [hard_highlighted]
                                         + word_list[position + 1:])
                sentence = ' '.join(word_list_highlighted)
                errors.append([str(item_number), sentence, guess, label])
        # The context manager closes the file even if a write fails.
        with open('errors.txt', 'w') as output_error_file:
            output_error_file.write('There are ' + str(len(errors)) + ' errors!'
                                    + '\n' + '----------------------------'
                                    + '\n' + '\n')
            for error_number, error in enumerate(errors, 1):
                output_error_file.write(
                    str(error_number) + ') ' + 'example number: ' + error[0]
                    + '\n' + ' sentence: ' + error[1] + '\n' + ' guess: '
                    + error[2] + '; label: ' + error[3] + '\n' + '\n')

    if confusion_matrix == True:
        gold = [label for (i, label) in test_data]
        # Pass `distance` here too, so the matrix is built from the same
        # features the classifier was trained and scored on.
        derived = [
            classifier.classify(features(i, vocab, distance))
            for (i, label) in test_data
        ]
        cm = nltk.ConfusionMatrix(gold, derived)
        print(cm)
        return cm
def test(classifier, test_set):
    """Announce the test run, then return ``classifier``'s accuracy on ``test_set``."""
    print('Testing classifier...')
    score = classify.accuracy(classifier, test_set)
    return score
def sentim_twitter(self, data):
    '''Train a Naive Bayes model on NLTK's twitter_samples and classify ``data``.

    heavily borrowed from
    https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
    to show functioning model

    The positive and negative sample tweets are cleaned with ``remove_noise``,
    labelled, shuffled and split (first 700 examples for training, the rest
    for testing). Accuracy and the most informative features are printed.
    ``str(data)`` is then tokenized, cleaned and classified.

    Returns the label predicted for ``data`` ("Positive" or "Negative").
    '''
    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    all_pos_words = get_all_words(positive_cleaned_tokens_list)
    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    # NOTE(review): only 700 examples train the model and everything else is
    # test data; confirm this split is intentional.
    train_data = dataset[:700]
    test_data = dataset[700:]

    classifier = NaiveBayesClassifier.train(train_data)
    print("twitter data **********************************")
    print("%%%%%%%%%%%%%%%%%%% Twitter Accuracy is:", classify.accuracy(classifier, test_data))
    print("twitter data **********************************")
    # show_most_informative_features() prints its own table and returns
    # None, so wrapping it in print() only added a stray "None" line.
    classifier.show_most_informative_features(10)

    print("twitter data **********************************")
    print("twitter data **********************************")
    print("is this reading data correctly???: ", type(str(data)))
    custom_tweet = str(data)

    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    print("twitter data **********************************")
    # Classify once and reuse the label for both the log line and the result.
    twitter = classifier.classify({token: True for token in custom_tokens})
    print(custom_tweet, twitter)
    return twitter
def train_test_evaluation():
    """Train, evaluate and pickle a Naive Bayes tweet sentiment classifier.

    Loads the positive and negative tweets from NLTK's ``twitter_samples``,
    cleans them with ``remove_noise``, labels and shuffles them, trains on
    the first 9000 examples and tests on the rest. Corpus statistics, a
    sample of each split, the test accuracy and the most informative features
    are printed along the way. The classifier is pickled to
    'tweeter_trained_cls.pickle'.

    Returns the trained classifier.
    """
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    print('Total number of positive_tweets are : ', len(positive_tweets))
    print('Total number of negative_tweets are : ', len(negative_tweets))
    print('-------------------------')
    print('one smaple of positive_tweets : ', positive_tweets[0])
    print('one smaple of negative_tweets : ', negative_tweets[0])
    print('-------------------------\n\n')

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
    print('Total number of positive_tweet_tokens are : ', len(positive_tweet_tokens))
    print('Total number of negative_tweet_tokens are : ', len(negative_tweet_tokens))
    print('-------------------------')
    print('one smaple of positive_tweet_tokens : ', positive_tweet_tokens[0])
    print('one smaple of negative_tweet_tokens : ', negative_tweet_tokens[0])
    print('-------------------------\n\n')

    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                        for tweet_dict in positive_tokens_for_model]
    negative_dataset = [(tweet_dict, "Negative")
                        for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset
    random.shuffle(dataset)

    train_data = dataset[:9000]
    test_data = dataset[9000:]

    print('Length of Train Data is : ', len(train_data))
    print(' A sample of Traing Data : ', train_data[0])
    print('-------------------------')
    # Report the size of the test split; this line used to print
    # len(train_data) a second time.
    print('Length of Test Data is : ', len(test_data))
    print(' A sample of Test Data : ', test_data[0])
    print('-------------------------')

    classifier = NaiveBayesClassifier.train(train_data)

    print("\n\n Accuracy is:", classify.accuracy(classifier, test_data))
    # show_most_informative_features() prints its own table and returns
    # None, so wrapping it in print() only added a stray "None" line.
    classifier.show_most_informative_features(10)

    # The context manager closes the file even if pickling fails.
    with open('tweeter_trained_cls.pickle', 'wb') as f:
        pickle.dump(classifier, f)

    return classifier
neutral_dataset = [(tweet_dict, "Neutral")
                   for tweet_dict in neutral_tokens_for_model]

# NOTE(review): neutral_dataset is built but not added to the training data,
# so the classifier can only answer "Positive" or "Negative" - confirm that
# this is intended.
dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print('Accuracy is:', classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its own table and returns None, so
# wrapping it in print() only added a stray "None" line.
classifier.show_most_informative_features(10)

from nltk.tokenize import word_tokenize

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(classifier.classify({token: True for token in custom_tokens}))

custom_tweets = 'Congrats #SportStar on your 7th best goal from last season winning goal of the year :) #Baller #Topbin #oneofmanyworldies'
tet = 'I am a very good boy, do you know that'
# Tokens of the second sample sentence, left in custom_tokens for whatever
# follows this section.
custom_tokens = remove_noise(word_tokenize(tet))
test_data = pd.read_csv(test_url)

# Column '0' is parsed back from its Python-literal string form (presumably
# the feature dict of each headline - verify against the CSV); column '1' is
# used as the label.
train_data['0'] = train_data['0'].apply(ast.literal_eval)
test_data['0'] = test_data['0'].apply(ast.literal_eval)

train_data_tuple = list(zip(train_data['0'], train_data['1']))
test_data_tuple = list(zip(test_data['0'], test_data['1']))
# # st.write(type(train_data['0'][0]))
# clf_nb = NaiveBayesClassifier.train(train_data_tuple)
# with open('nb_nltk.pkl', 'wb') as f:
#     pickle.dump(clf_nb, f)

nb_pickle_url = 'https://github.com/boblandsky/onion_ml/raw/master/nb_nltk.pkl'
# SECURITY NOTE(review): unpickling runs arbitrary code from the downloaded
# file. Only keep this if the URL is fully trusted; otherwise ship the model
# with the app or retrain it locally.
clf_nb = pd.read_pickle(nb_pickle_url)

# Scale to a percentage before rounding: round(x, 4) * 100 reintroduces
# float noise such as 85.61999999999999.
nb_acc = round(classify.accuracy(clf_nb, test_data_tuple) * 100, 2)
st.write(f'Accuracy on test set: {nb_acc}%')

test_nb_headline = st.text_input("Give me a headline to predict. A sample one is provided.", "MLS Commissioner Relieved That Nobody Knows Him by Name")

if st.button('Onion or not? NLTK Edition'):
    test_nb_tokens = remove_noise(word_tokenize(test_nb_headline))
    results_nb = clf_nb.classify({token: True for token in test_nb_tokens})
    # Label 1 is reported as "from the Onion"; any other label is not.
    if results_nb == 1:
        st.write("It's from the Onion!")
    else:
        st.write("It's not from the Onion!")
# nb_worked = st.radio('Did the Naive Bayes model make an accurate prediction?',
# ('Yes', 'No')
def predict():
    # Request handler that trains a Naive Bayes tweet-sentiment model on
    # NLTK's twitter_samples, classifies the text posted in the request form
    # and renders 'results.html' with the predicted label.
    #
    # NOTE(review): the nesting below is a reconstruction. In particular it
    # is assumed that everything after the helper definitions sits under
    # `if __name__ == "__main__":`, as in the tutorial this code derives
    # from - confirm against the real file. Under that reading the function
    # returns None whenever the module is imported rather than run directly.
    import nltk
    # NOTE(review): the download calls and the training below run on every
    # call of predict().
    nltk.download('twitter_samples')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('punkt')
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import twitter_samples, stopwords
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize
    from nltk import FreqDist, classify, NaiveBayesClassifier
    import re, string, random
    import pickle

    def remove_noise(tweet_tokens, stop_words=()):
        # Return the lower-cased, lemmatized tokens of one tweet with URLs,
        # @mentions, punctuation and stop words removed.
        cleaned_tokens = []

        for token, tag in pos_tag(tweet_tokens):
            # Blank out URLs, then @mentions.
            token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                           '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
            token = re.sub("(@[A-Za-z0-9_]+)", "", token)

            # Map the POS tag onto the lemmatizer's categories: noun, verb,
            # or adjective for everything else.
            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'

            lemmatizer = WordNetLemmatizer()
            token = lemmatizer.lemmatize(token, pos)

            if len(token
                   ) > 0 and token not in string.punctuation and token.lower(
                   ) not in stop_words:
                cleaned_tokens.append(token.lower())
        return cleaned_tokens

    def get_all_words(cleaned_tokens_list):
        # Yield every token of every tweet as one flat stream.
        for tokens in cleaned_tokens_list:
            for token in tokens:
                yield token

    def get_tweets_for_model(cleaned_tokens_list):
        # Yield one {token: True} feature dict per tweet.
        for tweet_tokens in cleaned_tokens_list:
            yield dict([token, True] for token in tweet_tokens)

    if __name__ == "__main__":

        positive_tweets = twitter_samples.strings('positive_tweets.json')
        negative_tweets = twitter_samples.strings('negative_tweets.json')
        text = twitter_samples.strings('tweets.20150430-223406.json')
        tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

        stop_words = stopwords.words('english')

        positive_tweet_tokens = twitter_samples.tokenized(
            'positive_tweets.json')
        negative_tweet_tokens = twitter_samples.tokenized(
            'negative_tweets.json')

        positive_cleaned_tokens_list = []
        negative_cleaned_tokens_list = []

        for tokens in positive_tweet_tokens:
            positive_cleaned_tokens_list.append(
                remove_noise(tokens, stop_words))

        for tokens in negative_tweet_tokens:
            negative_cleaned_tokens_list.append(
                remove_noise(tokens, stop_words))

        all_pos_words = get_all_words(positive_cleaned_tokens_list)

        freq_dist_pos = FreqDist(all_pos_words)
        print(freq_dist_pos.most_common(10))

        positive_tokens_for_model = get_tweets_for_model(
            positive_cleaned_tokens_list)
        negative_tokens_for_model = get_tweets_for_model(
            negative_cleaned_tokens_list)

        positive_dataset = [(tweet_dict, "Positive")
                            for tweet_dict in positive_tokens_for_model]

        negative_dataset = [(tweet_dict, "Negative")
                            for tweet_dict in negative_tokens_for_model]

        dataset = positive_dataset + negative_dataset

        random.shuffle(dataset)

        # First 7000 shuffled examples train the model; the rest test it.
        train_data = dataset[:7000]
        test_data = dataset[7000:]

        classifier = NaiveBayesClassifier.train(train_data)

        print("Accuracy is:", classify.accuracy(classifier, test_data))

        print(classifier.show_most_informative_features(10))

        # An empty string is classified when the request is not a POST.
        custom_tweet = ""
        if request.method == 'POST':
            custom_tweet = request.form['text']

        custom_tokens = remove_noise(word_tokenize(custom_tweet))
        NB_Cls = classifier.classify(
            dict([token, True] for token in custom_tokens))
        print(custom_tweet, NB_Cls)
        # NOTE(review): this pickles the predicted label, not the trained
        # classifier, and the file handle is never closed explicitly.
        pickle.dump(NB_Cls, open('sentimental_101.pkl', 'wb'))
        return render_template('results.html', result=NB_Cls)
def run(self):
    """Run 10-fold cross-validation of a Naive Bayes 'arg'/'non-arg' classifier.

    A feature set is built for every ``(key, value)`` pair in ``self.items``
    from the sentence returned by ``self.corpora.get_sentence_by_id(key)``.
    For each fold a classifier is trained on the training indices and scored
    on the test indices. Accuracy plus precision, recall and F-measure for
    both labels are collected per fold, and the per-fold lists are printed.

    Returns None.
    """
    kf = KFold(n_splits=10, shuffle=False, random_state=None)
    # Alternative feature extractors available on the sentence object:
    # opinion_finder_features(), arguing_features(), verb_features(),
    # strong_subjectivity_feature().
    features_set = [
        (self.corpora.get_sentence_by_id(key).get_all_features(), value)
        for (key, value) in self.items
    ]

    accuracy_list = []
    arg_precision_list = []
    arg_recall_list = []
    arg_f_measure_list = []
    non_arg_precision_list = []
    non_arg_recall_list = []
    non_arg_f_measure_list = []

    for train_index, test_index in kf.split(features_set):
        # Select examples by the fold's index arrays. Slicing from the first
        # to the last index is wrong: the training indices are not
        # contiguous, so that slice contained the test fold, and the
        # exclusive end bound dropped the last example.
        train_fold = [features_set[i] for i in train_index]
        test_fold = [features_set[i] for i in test_index]

        # An SVM alternative would be
        # SklearnClassifier(SVC(kernel='linear'), sparse=False).train(train_fold)
        classifier = nltk.NaiveBayesClassifier.train(train_fold)

        # Group test example positions by gold label and by predicted label.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(test_fold):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        accuracy_list.append(accuracy(classifier, test_fold))
        arg_precision_list.append(precision(refsets['arg'], testsets['arg']))
        arg_recall_list.append(recall(refsets['arg'], testsets['arg']))
        arg_f_measure_list.append(f_measure(refsets['arg'], testsets['arg']))
        non_arg_precision_list.append(
            precision(refsets['non-arg'], testsets['non-arg']))
        non_arg_recall_list.append(
            recall(refsets['non-arg'], testsets['non-arg']))
        non_arg_f_measure_list.append(
            f_measure(refsets['non-arg'], testsets['non-arg']))

    # NOTE(review): the labels say "median" but the per-fold lists are
    # printed as they are; no median is computed.
    print('median accuracy: ', accuracy_list)
    print('median arg_precision: ', arg_precision_list)
    print('median arg_recall: ', arg_recall_list)
    print('median arg_f_measure: ', arg_f_measure_list)
    print('median non_arg_precision: ', non_arg_precision_list)
    print('median non_arg_recall: ', non_arg_recall_list)
    print('median non_arg_f_measure: ', non_arg_f_measure_list)
@profile
def doc_features(doc):
    """Return ``{'count (<word>)': n}`` for every word in ``word_features``.

    ``n`` is how often the word occurs in ``doc`` once the words rejected by
    ``isStopWord`` are removed; words that do not occur get 0.
    """
    doc_words = FreqDist(w for w in doc if not isStopWord(w))
    return {
        'count (%s)' % word: doc_words.get(word, 0)
        for word in word_features
    }


@profile
def make_features(docs):
    """Turn ``(document, category)`` pairs into ``(features, category)`` pairs."""
    return [(doc_features(d), c) for (d, c) in docs]


@profile
def split_data(sets):
    """Return ``(train, test)``: the first 200 items are the test set."""
    return sets[200:], sets[:200]


if __name__ == "__main__":
    labeled_docs = label_docs()

    sw = set(stopwords.words('english'))
    filtered = filter_corpus()
    # word_features is read as a module-level name by doc_features().
    word_features = select_word_features(filtered)
    featuresets = make_features(labeled_docs)
    train_set, test_set = split_data(featuresets)
    classifier = NaiveBayesClassifier.train(train_set)

    print("Accuracy", accuracy(classifier, test_set))
    # show_most_informative_features() prints its own table and returns
    # None, so wrapping it in print() only added a stray "None" line.
    classifier.show_most_informative_features()
negative_tokens_for_model = get_tweets_for_model(
    negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

# First 7000 shuffled examples train the model; the rest test it.
train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its own table and returns None, so
# wrapping it in print() only added a stray "None" line.
classifier.show_most_informative_features(10)

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print("Testing with custom tweet")
print(custom_tweet,
      classifier.classify({token: True for token in custom_tokens}))

# added to store the model
pickle_model(classifier)
open( os.path.join(os.path.dirname(__file__), 'models/MaxEnt/maxent.pkl'), 'wb')) if not TRAIN: classifier = pickle.load( open( os.path.join(os.path.dirname(__file__), 'models/MaxEnt/maxent_85.pkl'), 'rb')) pred = [] actual = [x[1] for x in test_set] for t, l in test_set: pred.append(classifier.classify(t)) print(classify.accuracy(classifier, test_set)) ''' questions = open(os.path.join(os.path.dirname(__file__), 'datasets/ADA_v2_Exercise_Questions.txt')).read().split('\n') X_ada = [] X_ada_orig = [] for q in questions: X_ada.append(clean(q, return_as_list=False, stem=False)) X_ada_orig.append(q) X_ada = get_filtered_questions(X_ada, threshold=0.75, what_type='ada') X_ada_features = [features(k.split()) for k in X_ada] preds = [] for t in X_ada_features: preds.append(classifier.classify(t))
# Collect every distinct word that occurs in the training reviews.
vocabulary = set()
for fileid in train_fileids:
    for word in movie_reviews.words(fileid):
        vocabulary.add(word)

# Try a feature set of 500 random words
vocabulary = list(vocabulary)
random.shuffle(vocabulary)
random_featureset = vocabulary[:500]
train_set = format_dataset(train_fileids, random_featureset)
test_set = format_dataset(test_fileids, random_featureset)
bayes = NaiveBayesClassifier.train(train_set)

print("Random words: ", random_featureset)
print("Naive Bayes accuracy:", accuracy(bayes, test_set))

# Try a feature set of the 500 words that appear most often in the training examples
common_words = dict()
for fileid in train_fileids:
    for word in movie_reviews.words(fileid):
        # Count the occurrence. The old else-branch did `word = word + 1`,
        # which raises TypeError (str + int) and never updated the count.
        common_words[word] = common_words.get(word, 0) + 1

# Sorted ascending by count, so the last 500 entries are the most frequent
# words. The slice was previously positioned with fileid_count, which is
# unrelated to the size of the vocabulary.
sorted_common = sorted(common_words.items(),
                       key=operator.itemgetter(1))[-500:]
# Pass plain words, like random_featureset above, not (word, count) pairs.
common_featureset = [word for (word, count) in sorted_common]
train_set = format_dataset(train_fileids, common_featureset)
test_set = format_dataset(test_fileids, common_featureset)
print(len(dataset))

# NOTE(review): only the first 5 examples are used for training and all the
# rest for testing - this looks like a leftover debugging split; confirm.
train_data = dataset[:5]
test_data = dataset[5:]

# Display names of the classifiers compared in this script.
names = [
    "MultinomialNBclassifier", "BernoulliNB", "LogisticRegression_classifier",
    "SGDClassifier_classifier ", "SVC_classifier", "LinearSVC_classifier",
    "NaiveBayesClassifier"
]

# Each scikit-learn estimator is wrapped in SklearnClassifier, trained on
# train_data and scored on test_data; accuracies are printed as percentages.
MultinomialNBclassifier = SklearnClassifier(MultinomialNB())
MultinomialNBclassifier.train(train_data)
print("\nMultinomialNB Accuracy is:",
      (classify.accuracy(MultinomialNBclassifier, test_data)) * 100)

# GaussianNBclassifier = SklearnClassifier(GaussianNB())
# GaussianNBclassifier.train(train_data)
# print("\nGaussianNB Accuracy is:", classify.accuracy(GaussianNBclassifier, test_data))

# NOTE(review): this rebinds the name BernoulliNB from the estimator class to
# the wrapped classifier instance, so a later BernoulliNB() call would fail.
BernoulliNB = SklearnClassifier(BernoulliNB())
BernoulliNB.train(train_data)
print("BernoulliNB Algo Accuracy: ",
      (nltk.classify.accuracy(BernoulliNB, test_data)) * 100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_data)
print("LogisticRegression Algo Accuracy: ",
      (nltk.classify.accuracy(LogisticRegression_classifier, test_data)) * 100)
pos_features.append((bag_of_words(words), 'pos')) # negative reviews feature set neg_features = [] for words in neg_reviews: neg_features.append((bag_of_words(words), 'neg')) shuffle(pos_features) shuffle(neg_features) test_feature_set = pos_features[:200] + neg_features[:200] train_feature_set = pos_features[200:] + neg_features[200:] classifier = NBC.train(train_feature_set) accuracy = classify.accuracy(classifier, test_feature_set) print(accuracy) #f = open('unigram_classifier.pickle', 'wb') #pickle.dump(classifier, f) #f.close() while (1): custom_review = input( "Enter a custom movie review (Press ENTER key to exit):\n") if (len(custom_review) < 1): break custom_review_tokens = word_tokenize(custom_review) custom_feature_set = bag_of_words(custom_review_tokens) print(classifier.classify(custom_feature_set)) prob_result = classifier.prob_classify(custom_feature_set) print("confidence: " + (str)(prob_result.prob(prob_result.max())))
def accuracy_test(self):
    """Print the accuracy of ``self.classifier`` on ``self.test_data``.

    Afterwards the result of
    ``self.classifier.show_most_informative_features(25)`` is printed as
    well. Returns None.
    """
    model = self.classifier
    print('Performing Accuracy Test')
    print('Accuracy is:')
    score = classify.accuracy(model, self.test_data)
    print(score)
    print('---')
    # NOTE(review): if the classifier is an NLTK one, this call prints its
    # own table and returns None, so the outer print adds a "None" line.
    informative = model.show_most_informative_features(25)
    print(informative)
from nltk import classify

# Pull every comment body together with the label of its submission, keeping
# only the "Climate" and "Housing" submissions.
db = Query('canada_subreddit.db')
db.connect()
cur = db.cursor()
cur.execute('''
    SELECT c.body, s.label
    FROM submissions as s, comments as c
    WHERE s.submission_id = c.submission_id
    AND (s.label = "Climate" OR s.label = "Housing");
''')
data = cur.fetchall()
# All rows are in memory now; release the database handle instead of leaving
# it open for the rest of the run.
db.close()

feature_set = FeaturePipeline().create_set(data)

# split() yields a negative index one fifth of the way from the end, so the
# last 20% of the feature set becomes the test set.
split = lambda x: - int(len(x) / 5)
k = split(feature_set)

training_set = feature_set[:k]
testing_set = feature_set[k:]

print('Now training...')
Naive_classifier = NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:",
      (classify.accuracy(Naive_classifier, testing_set)))
Naive_classifier.show_most_informative_features(30)
#
# with open(pickle_file,"wb") as save_classifier:
#     pickle.dump(Naive_classifier, save_classifier)
#     save_classifier.close()
def WSDClasifier(trainer, word, features, stopwords=STOPWORDS, number=300, distance=3, log=False, confusion_matrix=False):
    """
    Build a classifier instance for the senseval2 senses of a word and apply it.

    :param trainer: called with a list of (featureset, label) pairs; must
        return a classifier object with a ``classify`` method
    :param word: from senseval2 (we have 'hard.pos', 'interest.pos',
        'line.pos' and 'serve.pos')
    :type word: str
    :param features: feature-set constructor, called as
        ``features(instance, vocab, distance)``
    :param stopwords: not used by this function; kept so existing callers
        that pass it keep working
    :param number: passed to extract_vocab when constructing the second
        argument to the feature-set constructor
    :type number: int
    :param distance: passed to the feature-set constructor as 3rd argument
    :type distance: int
    :param log: if set to True outputs any errors into a file errors.txt
    :type log: bool
    :param confusion_matrix: if set to True prints a confusion matrix
    :type confusion_matrix: bool

    Calling this function splits the senseval data for the word into a
    training set (first 80%) and a test set (the way it does this is the same
    for each call of this function, because the argument to random.seed is
    specified, but removing this argument would make the training and testing
    sets different each time you build a classifier).

    It then trains the trainer on the training set to create a classifier
    that performs WSD on the word, using features (with number or distance
    where relevant).

    It then tests the classifier on the test set, and prints its accuracy on
    that set.

    If log==True, then the errors of the classifier over the test set are
    written to errors.txt. For each error four things are recorded: (i) the
    example number within the test data (this is simply the index of the
    example within the list test_data); (ii) the sentence that the target
    word appeared in, (iii) the (incorrect) derived label, and (iv) the gold
    label.

    If confusion_matrix==True, then calling this function prints out a
    confusion matrix, where each cell [i,j] indicates how often label j was
    predicted when the correct label was i (so the diagonal entries indicate
    labels that were correctly predicted).
    """
    global inst_cache

    # Cache the (instance, first sense) pairs per word; work on a copy so the
    # shuffle below does not reorder the cached list.
    if word not in inst_cache:
        inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, number)
    print(' Senses: ' + ' '.join(senses))

    # Split the instances into a training and test set (fixed seed => same
    # split on every call).
    N = len(events)
    random.seed(123456789)
    random.shuffle(events)
    cutoff = int(0.8 * N)
    train_data = events[:cutoff]
    test_data = events[cutoff:]

    # Train classifier
    print('Training classifier...')
    classifier = trainer([(features(i, vocab, distance), label) for (i, label) in train_data])

    # Test classifier. The test feature sets are built once and reused by the
    # error log and the confusion matrix, so all three use the same `distance`
    # (the confusion matrix previously called features() without it).
    print('Testing classifier...')
    test_featuresets = [(features(i, vocab, distance), label) for (i, label) in test_data]
    acc = accuracy(classifier, test_featuresets)
    print('Accuracy: {:6.4f}'.format(acc))

    guesses = []
    if log or confusion_matrix:
        guesses = [classifier.classify(fs) for (fs, label) in test_featuresets]

    if log:
        # write error file
        print('Writing errors to errors.txt')
        errors = []
        # enumerate() gives each example's real index in test_data; list.index()
        # was quadratic and returned the first match for duplicate entries.
        for item_number, ((inst, label), guess) in enumerate(zip(test_data, guesses)):
            if guess == label:
                continue
            position = inst.position
            # Context items may be (word, tag) tuples or bare strings.
            word_list = [cv[0] if isinstance(cv, tuple) else cv for cv in inst.context]
            # Upper-case the target word so it stands out in the sentence.
            hard_highlighted = word_list[position].upper()
            word_list_highlighted = word_list[0:position] + [hard_highlighted] + word_list[position+1:]
            sentence = ' '.join(word_list_highlighted)
            errors.append((str(item_number), sentence, guess, label))

        with open('errors.txt', 'w') as error_file:
            error_file.write('There are {} errors'.format(len(errors)))
            error_file.write('----------------------------\n')
            for idx, (num, snt, guess, label) in enumerate(errors, 1):
                error_file.write('{}) example #: {} \n sentence: {}\n guess: {}\n label: {}\n'.format(idx, num, snt, guess, label))

    if confusion_matrix:
        gold = [label for (i, label) in test_data]
        cm = nltk.ConfusionMatrix(gold, guesses)
        print(cm)
print("Dictionary with Negative class : ", negativeReviewDataset[7]) #print("tagged neg :",negative_dataset[0]) dataset = positiveReviewDataset + negativeReviewDataset print("Dataset[0] :", dataset[0]) print("Dataset length", len(dataset)) random.shuffle(dataset) trainData = dataset[:7000] testData = dataset[7000:] trainedModel = NaiveBayesClassifier.train(trainData) print("Accuracy of the model : ", classify.accuracy(trainedModel, testData)) review = "This is a bad product." reviewTokens = noiseRemoval(word_tokenize(review)) # Test print print(review, " : ", trainedModel.classify(dict([token, True] for token in reviewTokens))) #Text = "j@nittha" #Text = re.sub("@", "a", Text) #print(Text) # Flask API to be used in backend @app.route("/NLP")
def evaluate(train_set, test_set, classifier):
    """Print the classifier's accuracy on both data sets and its top features.

    The training-set accuracy is printed first, then the test-set accuracy,
    then the classifier's 20 most informative features. Returns nothing.
    """
    reports = (
        ('Accuracy (training set) = ', train_set),
        ('Accuracy (test set) = ', test_set),
    )
    for prefix, data in reports:
        print(prefix + str(classify.accuracy(classifier, data)))
    classifier.show_most_informative_features(20)
# Script fragment: train a decision-tree part-of-speech classifier on the
# Brown "news" category using the features produced by pos_features().
tagged_words = brown.tagged_words(categories='news')
print(tagged_words)
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]
# NOTE(review): the value of this bare expression (and of the two bare calls
# at the end) is discarded when run as a script -- presumably notebook leftovers.
featuresets[0]

from nltk import DecisionTreeClassifier
from nltk.classify import accuracy

# The first 10% of the feature sets are held out for testing; the rest train.
cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]

# NLTK is a teaching toolkit which is not really optimized for speed.
# Therefore, this may take forever. For speed, use scikit-learn for the
# classifiers.
classifier = DecisionTreeClassifier.train(train_set)
accuracy(classifier, test_set)
classifier.classify(pos_features('cats'))

''' To accompany the video, here is the sample code for NLTK part of speech tagging with lots of comments and info as well: POS tag list: CC coordinating conjunction CD cardinal digit DT determiner EX existential there (like: "there is" ... think of it like "there exists") FW foreign word IN preposition/subordinating conjunction
def category_by_pos():
    """Train a Naive Bayes part-of-speech classifier on word endings.

    Counts the 1-, 2- and 3-character endings of every lower-cased word in
    the Brown corpus, keeps the first 100 keys of that frequency distribution
    as features, builds "endswith" feature sets for the tagged words of the
    'news' category, trains on all but the first 10% of them and prints the
    accuracy on that first 10%.
    """
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_counts = FreqDist()
    for token in brown.words():
        lowered = token.lower()
        for length in (1, 2, 3):
            suffix_counts.inc(lowered[-length:])

    # NOTE(review): presumably relies on FreqDist.keys() being a list ordered
    # by decreasing frequency in the NLTK version this file targets -- confirm.
    common_suffixes = suffix_counts.keys()[:100]
    # print common_suffixes

    def pos_features(word):
        """Map each common suffix to whether the lower-cased word ends with it."""
        lowered = word.lower()
        return {
            'endswith(%s)' % suffix: lowered.endswith(suffix)
            for suffix in common_suffixes
        }

    labeled = [
        (pos_features(token), tag)
        for (token, tag) in brown.tagged_words(categories='news')
    ]
    held_out = int(len(labeled) * 0.1)
    test_set = labeled[:held_out]
    train_set = labeled[held_out:]

    # classifier = DecisionTreeClassifier.train(train_set)
    # print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))
def validate(self, validation_set):
    """Score the classifier on labelled tweets; print and return the metrics.

    :param validation_set: iterable of (tweet, label) pairs; each tweet is
        passed to ``self.classify``
    :return: tuple ``(acc, posp, posr, posf, negp, negr, negf)`` -- the
        accuracy followed by precision, recall and f-measure for the
        'positive' label and then for the 'negative' label
    :raises Exception: if ``self.classifier`` is None
    """
    if self.classifier is None:
        raise Exception("self.classifier is None")

    # Per label, the indices of the examples that carry it (reference) and
    # the indices that were predicted as it (observed).
    reference = defaultdict(set)
    observed = defaultdict(set)
    observed['neutral'] = set()

    correct = 0
    total = 0
    for i, (tweet, label) in enumerate(validation_set):
        reference[label].add(i)
        observation = self.classify(tweet)
        observed[observation].add(i)
        total += 1
        if observation == label:
            correct += 1

    # Accuracy is computed from the predictions made above. The previous
    # classify.accuracy(self.classifier, observed) call passed the
    # label -> index-set dict where a list of (featureset, label) pairs is
    # expected. An empty validation set scores 0.
    acc = float(correct) / total if total else 0

    posp = precision(reference['positive'], observed['positive'])
    posr = recall(reference['positive'], observed['positive'])
    posf = f_measure(reference['positive'], observed['positive'])
    negp = precision(reference['negative'], observed['negative'])
    negr = recall(reference['negative'], observed['negative'])
    negf = f_measure(reference['negative'], observed['negative'])

    # Single-argument parenthesised prints give the same output on Python 2
    # and Python 3.
    print("accuracy: %s" % acc)
    print("pos precision: %s" % posp)
    print("pos recall: %s" % posr)
    print("pos f-measure: %s" % posf)
    print("neg precision: %s" % negp)
    print("neg recall: %s" % negr)
    print("neg f-measure: %s" % negf)
    return (acc, posp, posr, posf, negp, negr, negf)