def train(test=False):
    """Train a Naive Bayes sentiment classifier on the NLTK movie reviews.

    :param test: when false (default) the classifier is trained on every
        review.  When true, the first three quarters of each polarity are
        used for training and the rest for evaluation; the split sizes, the
        accuracy and the most informative features are printed.
    :return: the trained classifier (in both modes).
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    if not test:
        return NaiveBayesClassifier.train(negfeats + posfeats)

    # Floor division keeps the cutoffs integral even under true division,
    # where ``len(x) * 3 / 4`` would be a float and break the slices below.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print('train on %d instances, test on %d instances'
          % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    classifier.show_most_informative_features()
    # Previously this branch fell off the end and returned None.
    return classifier
def train_and_show_results(pos, neg, pos_bigrams, neg_bigrams,
                           pos_control, neg_control,
                           pos_control_bigrams, neg_control_bigrams):
    """Train and report Naive Bayes and Maximum Entropy classifiers.

    Each classifier type is trained twice: once on bag-of-words features and
    once on bigram features.  The four trained models are persisted through
    ``save_dataset`` and their accuracy, most informative features and
    precision/recall are printed.

    :param pos, neg: labelled bag-of-words feature lists per polarity.
    :param pos_bigrams, neg_bigrams: labelled bigram feature lists.
    :param pos_control, neg_control, pos_control_bigrams,
        neg_control_bigrams: optional held-out test sets.  If any one of
        them is ``None`` the control sets are ignored and the last quarter
        of each training list is held out for testing instead.
    """
    controls = (pos_control, neg_control,
                pos_control_bigrams, neg_control_bigrams)
    # Identity test: ``== None`` misbehaves for objects overriding __eq__.
    if any(control is None for control in controls):
        neg_train, neg_test = _three_quarter_split(neg)
        pos_train, pos_test = _three_quarter_split(pos)
        neg_bi_train, neg_bi_test = _three_quarter_split(neg_bigrams)
        pos_bi_train, pos_bi_test = _three_quarter_split(pos_bigrams)
        test_bag_of_words = neg_test + pos_test
        test_bigrams = neg_bi_test + pos_bi_test
        train_corpora_bag_of_words = neg_train + pos_train
        train_corpora_bigrams = neg_bi_train + pos_bi_train
    else:
        test_bag_of_words = neg_control + pos_control
        test_bigrams = neg_control_bigrams + pos_control_bigrams
        train_corpora_bag_of_words = neg + pos
        train_corpora_bigrams = neg_bigrams + pos_bigrams

    print("negative corpus:  %d" % len(neg))
    print("positive corpus:  %d" % len(pos))
    if neg_control is not None:
        print("negative test corpus:  %d" % len(neg_control))
        # Guarded separately: pos_control may be None on its own, which
        # used to raise TypeError from len(None).
        if pos_control is not None:
            print("positive test corpus:  %d" % len(pos_control))

    print('bag of words and bigrams - Naive Bayes')
    naive_bayes = NaiveBayesClassifier.train(train_corpora_bag_of_words)
    naive_bayes_bigrams = NaiveBayesClassifier.train(train_corpora_bigrams)
    save_dataset('naive_bayes.dat', naive_bayes)
    save_dataset('naive_bayes_bigrams.dat', naive_bayes_bigrams)

    print('bag of words and bigrams - Maximum Entropy')
    maximum_entropy = nltk.MaxentClassifier.train(
        train_corpora_bag_of_words, max_iter=2)
    maximum_entropy_bigrams = nltk.MaxentClassifier.train(
        train_corpora_bigrams, max_iter=2)
    save_dataset('maximum_entropy.dat', maximum_entropy)
    save_dataset('maximum_entropy_bigrams.dat', maximum_entropy_bigrams)

    print('Naive Bayesian results')
    print('bag of words')
    _print_classifier_results(naive_bayes, test_bag_of_words)
    print('\nbigrams')
    _print_classifier_results(naive_bayes_bigrams, test_bigrams)

    print('Maximum Entropy results')
    print('bag of words')
    _print_classifier_results(maximum_entropy, test_bag_of_words)
    print('\nbigrams')
    _print_classifier_results(maximum_entropy_bigrams, test_bigrams)


def _three_quarter_split(samples):
    """Return ``(train, test)``: the first 3/4 of *samples* and the rest."""
    cutoff = len(samples) * 3 // 4
    return samples[:cutoff], samples[cutoff:]


def _print_classifier_results(classifier, test_set):
    """Print accuracy, informative features and precision/recall."""
    print('Accuracy: %s' % nltk.classify.util.accuracy(classifier, test_set))
    classifier.show_most_informative_features()
    print_precision_recall(classifier, test_set)
def cross_validation(self):
    """Cross-validate the Naive Bayes and linear-SVM classifiers.

    ``self.training_feats`` is cut into ``self.k_fold`` contiguous folds of
    equal size (any remainder at the tail is never held out).  Each fold in
    turn is used for testing while both classifiers are trained on the rest.
    The mean accuracy and the mean of the values returned by
    ``self.compute_measures`` (logged as "F measure") are written to
    ``self.logging``.  The classifiers of the last fold stay in
    ``self.nb_classifier`` and ``self.svm_classifier``.
    """
    samples = self.training_feats
    per_fold = int(int(len(samples)) / self.k_fold)

    nb_acc_scores, svm_acc_scores = [], []
    nb_f_scores, svm_f_scores = [], []

    for fold in range(self.k_fold):
        lo = fold * per_fold
        hi = lo + per_fold
        remainder = samples[:lo] + samples[hi:]
        held_out = samples[lo:hi]

        self.nb_classifier = NaiveBayesClassifier.train(remainder)
        nb_acc_scores.append(
            nltk.classify.util.accuracy(self.nb_classifier, held_out))

        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(remainder)
        svm_acc_scores.append(
            nltk.classify.util.accuracy(self.svm_classifier, held_out))

        nb_f_scores.append(
            self.compute_measures(held_out, self.nb_classifier))
        svm_f_scores.append(
            self.compute_measures(held_out, self.svm_classifier))

    report = (
        ('Average accuracy of Naive Bayes Classifier %s\n', nb_acc_scores),
        ('Average accuracy of SVM Classifier %s\n', svm_acc_scores),
        ('Average F measure of Naive Bayes Classifier %s\n', nb_f_scores),
        ('Average F measure of SVM Classifier %s\n', svm_f_scores),
    )
    for template, scores in report:
        self.logging.info(template % (float(sum(scores) / len(scores))))
def classification(self):
    """Fit both classifiers on the complete training feature set.

    Stores a trained Naive Bayes model in ``self.nb_classifier`` and a
    linear-SVM model (wrapped for NLTK) in ``self.svm_classifier``.
    """
    feats = self.training_feats

    # Naive Bayes
    self.nb_classifier = NaiveBayesClassifier.train(feats)

    # Linear SVM: the wrapper is stored first, then fitted in place.
    self.svm_classifier = SklearnClassifier(LinearSVC())
    self.svm_classifier.train(feats)
def evaluate_classifier(featx):
    """Train and evaluate a Sports-vs-Politics tweet classifier.

    The first three quarters of ``Sports_Tweet`` and ``Politics_Tweet`` are
    used for training and the rest for testing.  Accuracy, per-class
    precision/recall and the most informative features are printed.

    :param featx: feature extractor applied to ``tweet[0]``; ``tweet[1]`` is
        used as the label (the metrics below assume the labels are
        ``'Sports'`` and ``'Politics'``).
    :return: the trained classifier.
    """
    sportsfeats = [(featx(tweet[0]), tweet[1]) for tweet in Sports_Tweet]
    politicsfeats = [(featx(tweet[0]), tweet[1]) for tweet in Politics_Tweet]

    # Floor division: a float cutoff cannot be used as a slice index.
    sportscutoff = len(sportsfeats) * 3 // 4
    politicscutoff = len(politicsfeats) * 3 // 4
    trainfeats = sportsfeats[:sportscutoff] + politicsfeats[:politicscutoff]
    testfeats = sportsfeats[sportscutoff:] + politicsfeats[politicscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: gold label -> test indices; testsets: predicted -> indices.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        testsets[classifier.classify(feats)].add(i)

    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    # The original printed Sports precision twice and Politics recall twice
    # under "pos"/"neg" labels; report both metrics for each class instead.
    for category in ('Sports', 'Politics'):
        print('%s precision: %s' % (
            category,
            nltk.metrics.precision(refsets[category], testsets[category])))
        print('%s recall: %s' % (
            category,
            nltk.metrics.recall(refsets[category], testsets[category])))
    classifier.show_most_informative_features()
    return classifier
def main():
    """Train and evaluate a Naive Bayes classifier on the court transcripts.

    Features come from ``get_training_features`` applied to the hard-coded
    conversations file.  The first three quarters of the feature sets are
    used for training and the rest for testing; sizes, accuracy and the most
    informative features are printed.
    """
    text_file = '/Users/nasrallah/Desktop/Insight/courtcast/data/supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt'

    feature_sets = get_training_features(text_file)

    # NOTE: the feature sets are split in their original order; shuffling
    # before the split is intentionally left disabled here.
    split_at = int(len(feature_sets) * 3 / 4)
    train_feature_sets = feature_sets[:split_at]
    test_feature_sets = feature_sets[split_at:]
    sizes = (len(train_feature_sets), len(test_feature_sets))
    print('train on %d instances, test on %d instances' % sizes)

    classifier = NaiveBayesClassifier.train(train_feature_sets)
    accuracy = nltk.classify.util.accuracy(classifier, test_feature_sets)
    print('accuracy:', accuracy)
    classifier.show_most_informative_features()
def evaluate_classifier(featx):
    """Train a pros/cons sentiment classifier and print its metrics.

    Features are built from ``traincons`` (label ``'neg'``) and
    ``trainpros`` (label ``'pos'``) after tokenising and removing the words
    in ``stpwrds``.

    NOTE(review): the classifier is trained on *all* samples while the last
    quarter of each class is reused as the test set, so the reported scores
    are measured on data seen during training.  This is kept as is because
    the returned classifier is trained on the full data.

    :param featx: feature extractor taking a list of tokens.
    :return: the trained classifier.
    """
    negfeats = [
        (featx([wrd for wrd in nltk.word_tokenize(con)
                if wrd not in stpwrds]), 'neg')
        for con in traincons
    ]
    posfeats = [
        (featx([wrd for wrd in nltk.word_tokenize(pro)
                if wrd not in stpwrds]), 'pos')
        for pro in trainpros
    ]

    # Floor division: a float cutoff cannot be used as a slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats + posfeats
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: gold label -> test indices; testsets: predicted -> indices.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        testsets[classifier.classify(feats)].add(i)

    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    for polarity in ('pos', 'neg'):
        print('%s precision: %s' % (
            polarity,
            nltk.metrics.precision(refsets[polarity], testsets[polarity])))
        print('%s recall: %s' % (
            polarity,
            nltk.metrics.recall(refsets[polarity], testsets[polarity])))
    classifier.show_most_informative_features()
    return classifier
def __init__(self):
    """Train the phrase classifier from the two phrase files.

    Every line of ``neg_phrases.txt`` becomes a ``'suicidal'`` sample and
    every line of ``pos_phrases.txt`` an ``'alright'`` sample.  80% of each
    class is used for training, the rest for testing; the split sizes,
    accuracy and most informative features are printed and the classifier
    is stored in ``self.classifier``.
    """
    # Context managers close the files; they used to be left open.
    with open("neg_phrases.txt", "r") as neg_file:
        neg_phrases = neg_file.readlines()
    with open("pos_phrases.txt", "r") as pos_file:
        pos_phrases = pos_file.readlines()

    neg_phrases_tagged = [(word_feats(phrase.split()), 'suicidal')
                          for phrase in neg_phrases]
    pos_phrases_tagged = [(word_feats(phrase.split()), 'alright')
                          for phrase in pos_phrases]

    negcutoff = int(len(neg_phrases_tagged) * .8)
    poscutoff = int(len(pos_phrases_tagged) * .8)
    trainfeats = neg_phrases_tagged[:negcutoff] + pos_phrases_tagged[:poscutoff]
    testfeats = neg_phrases_tagged[negcutoff:] + pos_phrases_tagged[poscutoff:]
    print('train on %d instances, test on %d instances'
          % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s'
          % nltk.classify.util.accuracy(self.classifier, testfeats))
    self.classifier.show_most_informative_features()
def naiveBayes(features_train, features_test):
    """Train a Naive Bayes classifier and print its evaluation.

    Prints the split sizes, the accuracy on ``features_test``, the most
    informative features and the values returned by ``precision_recall``.

    :param features_train: labelled feature sets used for training.
    :param features_test: labelled feature sets used for evaluation.
    """
    print('train on %d instances, test on %d instances'
          % (len(features_train), len(features_test)))
    classifier = NaiveBayesClassifier.train(features_train)
    print('accuracy: %s'
          % nltk.classify.util.accuracy(classifier, features_test))
    classifier.show_most_informative_features()

    precisions, recalls = precision_recall(classifier, features_test)
    # These values were previously printed as "accuracy" / "fitness",
    # duplicating and contradicting the accuracy line above.
    print('precision: %s recall: %s' % (precisions, recalls))
def classify(self):
    """Train and evaluate a sentiment classifier on this entity's articles.

    Articles with a negative ``score`` are labelled ``"neg"``, those with a
    positive score ``"pos"`` (a score of exactly 0 matches neither filter).
    Three quarters of each class are used for training and the rest for
    testing; split sizes, accuracy and the most informative features are
    printed.  Nothing is returned or stored.
    """
    articles = Article.objects.filter(entity=self.entity)

    def word_feats(body):
        # Bag-of-words presence features, split on single spaces.
        return {word: True for word in body.split(" ")}

    negids = articles.filter(score__lt=0)
    posids = articles.filter(score__gt=0)
    negfeats = [(word_feats(a.body), "neg") for a in negids]
    posfeats = [(word_feats(a.body), "pos") for a in posids]

    # Floor division: a float cutoff cannot be used as a slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print("train on %d instances, test on %d instances"
          % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print("accuracy: %s" % nltk.classify.util.accuracy(classifier, testfeats))
    classifier.show_most_informative_features()
def __init_naive_bayes(self):
    """Create and train the Naive Bayes tweet classifier.

    Reads the positive, neutral and negative word files from the
    ``sentiment_word_files`` directory next to the script
    (``sys.path[0]``), turns every word accepted by ``self.__check_word``
    into a single-word feature set labelled ``'pos'``, ``'neu'`` or
    ``'neg'`` and trains ``self.classifier`` on all of them.

    :raises Exception: if reading the files or training fails; the message
        includes the underlying error.
    """
    def load_words(filename):
        # Lower-case the whole file and split it on whitespace.
        path = pjoin(sys.path[0], "sentiment_word_files", filename)
        # codecs.open always opens the file in binary mode when an encoding
        # is given, so the former "U" flag had no effect.  The context
        # manager closes the file; the old ``f.close`` (no parentheses)
        # never did.
        with codecs.open(path, mode="r", encoding='utf-8') as handle:
            text = "".join(line.lower().replace("\n", " ") for line in handle)
        return text.split()

    def labelled(words, label):
        return [({word.lower(): True}, label)
                for word in words if self.__check_word(word)]

    try:
        posfeats = labelled(load_words("tweets_positive.txt"), 'pos')
        neufeats = labelled(load_words("tweets_neutral.txt"), 'neu')
        negfeats = labelled(load_words("tweets_negative.txt"), 'neg')
        self.classifier = NaiveBayesClassifier.train(
            posfeats + neufeats + negfeats)
    except Exception as err:
        # ``except Exception`` lets KeyboardInterrupt/SystemExit through,
        # unlike the former bare ``except``; the cause is kept in the text.
        raise Exception(
            "Unknown error in SentimentAnalyzer::__init_naive_bayes: %r"
            % (err,))
def generate_sentiment_classifier(corpus, word_feats):
    """Train and evaluate a pos/neg Naive Bayes classifier on *corpus*.

    :param corpus: categorized corpus reader exposing ``fileids(category)``
        and ``words(fileids=...)`` with categories ``'neg'`` and ``'pos'``.
    :param word_feats: feature extractor applied to each file's words.
    :return: the classifier trained on the first three quarters of each
        category.  Split sizes, accuracy, precision/recall and the most
        informative features are printed.
    """
    negfeats = [(word_feats(corpus.words(fileids=[f])), 'neg')
                for f in corpus.fileids('neg')]
    posfeats = [(word_feats(corpus.words(fileids=[f])), 'pos')
                for f in corpus.fileids('pos')]

    # Floor division: a float cutoff cannot be used as a slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print('train on %d instances, test on %d instances'
          % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: gold label -> test indices; testsets: predicted -> indices.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        testsets[classifier.classify(feats)].add(i)

    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    for polarity in ('pos', 'neg'):
        print('%s precision: %s' % (
            polarity,
            nltk.metrics.precision(refsets[polarity], testsets[polarity])))
        print('%s recall: %s' % (
            polarity,
            nltk.metrics.recall(refsets[polarity], testsets[polarity])))
    classifier.show_most_informative_features()
    return classifier
def main():
    """Flag users whose names look like organisation names.

    A Naive Bayes classifier is trained on organisation names (label
    ``'org'``) and on the names of users not yet marked ``likely_org`` whose
    name is not exactly an organisation name (label ``'user'``).  Every such
    user is then classified; when the prediction is ``'org'`` and the log2
    probability ratio reaches ``MIN_RATIO`` (and an admin has not classified
    the account as ``'user'``) the user is saved with ``likely_org`` set and
    the ratio stored in ``org_probability``.
    """
    org_names = Org.objects.values_list('name', flat=True)
    users = User.objects.filter(likely_org=False)

    # Drop users whose name exactly matches a known organisation.
    non_org_user_names = set([user.get_name for user in users]) - set(org_names)

    org_features = [(word_features(name), 'org') for name in org_names]
    user_features = [(word_features(name), 'user')
                     for name in non_org_user_names]
    classifier = NaiveBayesClassifier.train(user_features + org_features)

    counter = 0
    for user in users:
        prediction = classifier.prob_classify(word_features(user.get_name))
        if prediction.max() != 'org':
            continue

        # Log2 probability ratio, e.g. P(org) = 0.8 and P(user) = 0.1 give
        # log2(8.0) = 3.0 (before NORMALIZING_CONST is added to both).
        p_org = float(prediction.prob('org')) + NORMALIZING_CONST
        p_user = float(prediction.prob('user')) + NORMALIZING_CONST
        ratio = math.log((p_org / p_user), 2)

        if ratio >= MIN_RATIO and user.likely_org == False and user.admin_classification != 'user':
            log.info('User ID %d with name "%s" is probably an org. Saving.' % (user.id, user.get_name))
            user.likely_org = True
            user.org_probability = ratio
            user.save()
            counter += 1

    log.info("Processed %d users with org-like names" % counter)
def create_train_classifier():
    """Rebuild the Naive Bayes classifier and pickle it with its features.

    Reads the categorized plain-text corpus at ``TRAIN_DATASET_LOC``
    (``.txt`` files whose category ``pos``/``neg`` is taken from the file
    id), trains on all of it and writes three pickles into ``pickles/``:
    the positive features, the negative features and the classifier.
    """
    print("Recreating training classifier")
    corpus_dir = nltk.data.find(TRAIN_DATASET_LOC)
    # Raw string: '\.' in a normal string literal is an invalid escape.
    train_data = nltk.corpus.CategorizedPlaintextCorpusReader(
        corpus_dir, fileids=r'.*\.txt', cat_pattern="(pos|neg)")

    negfeats = [(__word_feats_neg(train_data.words(fileids=[f])), 'neg')
                for f in train_data.fileids('neg')]
    posfeats = [(__word_feats_pos(train_data.words(fileids=[f])), 'pos')
                for f in train_data.fileids('pos')]

    classifier = NaiveBayesClassifier.train(negfeats + posfeats)

    pos_file_name = os.path.join('pickles', 'positive_train.pickle')
    neg_file_name = os.path.join('pickles', 'negative_train.pickle')
    class_file_name = os.path.join('pickles', 'nbClassifier.pickle')
    __write_file(pos_file_name, cPickle.dumps(posfeats))
    __write_file(neg_file_name, cPickle.dumps(negfeats))
    __write_file(class_file_name, cPickle.dumps(classifier))
    print("Done!")
def naivebayes(trainfeats, testfeats):
    """Train a Naive Bayes classifier and print its evaluation.

    :param trainfeats: labelled feature sets used for training.
    :param testfeats: labelled feature sets used to compute the accuracy.
    """
    classifier = NaiveBayesClassifier.train(trainfeats)
    print("NaiveBayes output")
    print('train on %d instances, test on %d instances'
          % (len(trainfeats), len(testfeats)))
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    # show_most_informative_features() prints by itself and returns None;
    # wrapping it in a print emitted a stray "None" line.
    classifier.show_most_informative_features()
def evaluateFeatures(featureSelect):
    """Train on the full polarity data and classify the last quarter of it.

    Each line of ``RT_POLARITY_POS_FILE`` / ``RT_POLARITY_NEG_FILE`` is
    split into words and punctuation marks, passed through
    ``featureSelect`` and labelled ``'pos'`` / ``'neg'``.  The classifier is
    trained on every sample; the last quarter of each class is then
    classified and the gold/predicted index sets are collected.  The first
    test sample is printed; nothing is returned.

    :param featureSelect: feature extractor taking a list of tokens.
    """
    def labelled(path, label):
        with open(path, 'r') as sentences:
            return [
                [featureSelect(re.findall(r"[\w']+|[.,!?;]", line.rstrip())),
                 label]
                for line in sentences
            ]

    posFeatures = labelled(RT_POLARITY_POS_FILE, 'pos')
    negFeatures = labelled(RT_POLARITY_NEG_FILE, 'neg')

    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))

    # The test samples are a subset of the training samples here.
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    trainFeatures = posFeatures + negFeatures
    print(testFeatures[0])

    classifier = NaiveBayesClassifier.train(trainFeatures)

    # referenceSets: gold label -> indices; testSets: predicted -> indices.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for index, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(index)
        testSets[classifier.classify(features)].add(index)
def evaluate_classifier_Naive(featx):
    """Train Naive Bayes on ``train`` and score it on ``test``.

    Both corpora are read per category (``'neg'`` and ``'pos'``); the 50
    most informative features are printed.

    :param featx: feature extractor applied to each file's words.
    :return: ``['NaiveBayes', accuracy, pos_precision, pos_recall,
        neg_precision, neg_recall]``.
    """
    def labelled(corpus, fileids, label):
        return [(featx(corpus.words(fileids=[f])), label) for f in fileids]

    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')

    train_negfeats = labelled(train, train_negids, 'neg')
    train_posfeats = labelled(train, train_posids, 'pos')
    test_negfeats = labelled(test, test_negids, 'neg')
    test_posfeats = labelled(test, test_posids, 'pos')

    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats

    model = NaiveBayesClassifier.train(trainfeats)

    # gold: label -> test indices; predicted: model output -> test indices.
    gold = collections.defaultdict(set)
    predicted = collections.defaultdict(set)
    for index, (feats, label) in enumerate(testfeats):
        gold[label].add(index)
        predicted[model.classify(feats)].add(index)

    results = ['NaiveBayes', nltk.classify.util.accuracy(model, testfeats)]
    for polarity in ('pos', 'neg'):
        results.append(
            nltk.metrics.precision(gold[polarity], predicted[polarity]))
        results.append(
            nltk.metrics.recall(gold[polarity], predicted[polarity]))

    model.show_most_informative_features(50)
    return results
def classify():
    """Train and evaluate Naive Bayes on the Cornell sentence-polarity data.

    For every polarity returned by ``get_word_features`` a shuffled copy of
    its feature list is split: the first 1000 samples go to training and
    the rest to testing.  Split sizes, accuracy and the most informative
    features are printed.
    """
    corpus = 'Cornell_sentence_polarity'
    features = get_word_features(load_corpus(corpus))

    per_class_train = 1000  # fixed number of training samples per polarity
    train_feats, test_feats = [], []
    for polarity, feats in features.items():
        print('%s number of train: %s' % (polarity, per_class_train))
        shuffled = feats[:]  # copy so the source list keeps its order
        random.shuffle(shuffled)
        train_feats += shuffled[:per_class_train]
        test_feats += shuffled[per_class_train:]

    sizes = (len(train_feats), len(test_feats))
    print('train on %d instances, test on %d instances' % sizes)

    classifier = NaiveBayesClassifier.train(train_feats)
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, test_feats))
    classifier.show_most_informative_features()
def classification(self):
    """Train Naive Bayes on ``self.train_reviews`` and print its evaluation.

    The training reviews are wrapped in a ``FeatStruct`` before training;
    accuracy is measured on ``self.test_reviews`` and the most informative
    features are printed.
    """
    training = FeatStruct(self.train_reviews)
    model = NaiveBayesClassifier.train(training)
    score = nltk.classify.util.accuracy(model, self.test_reviews)
    print('accuracy: %s' % score)
    model.show_most_informative_features()
def main_function():
    """Classify pending tweets with a freshly trained Naive Bayes model.

    Connects to the ``ensemble`` database, trains on the tweets returned by
    ``classify.get_training_tweets``, stores a predicted polarity for every
    tweet from ``classify.get_tweets_to_classify`` and finally runs the
    ``UPDATE_MANUAL_CLASSIFIED`` statement.  The number of tweets per
    predicted label is printed.
    """
    settings = DATABASES['ensemble']
    conn = MySQLdb.connect(host=settings['HOST'],
                           user=settings['USER'],
                           passwd=settings['PASSWORD'],
                           db=settings['NAME'])
    try:
        training_tweets = classify.get_training_tweets(conn)
        training_feature_set = classify.process_tweets(training_tweets)
        classifier = NaiveBayesClassifier.train(training_feature_set)

        # Number of tweets assigned to each label.
        count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0}

        for tweet in classify.get_tweets_to_classify(conn):
            text = classify.get_tweet_text(conn, tweet[0])[0][0]
            guess = classifier.classify(classify.process_tweet(text))
            classify.update_tweet_polarity(tweet[0], guess, conn)
            count_table[guess] += 1

        classify.run_sql(conn, classify.Statements.UPDATE_MANUAL_CLASSIFIED)
        print(count_table)
        # NOTE(review): full_matrix is not defined in this function; it
        # must be a module-level name or this line raises NameError.
        print(full_matrix)
    finally:
        # The connection used to be left open.
        conn.close()
def train_with_movie_db(self):
    """Train ``self.classifier`` on the NLTK movie-review corpus.

    Training possible with movie reviews - this does not yield particularly
    good results.  Three quarters of each polarity are used for training
    and the rest for testing; split sizes and accuracy are sent to ``DLOG``
    and the most informative features are printed.  Sets
    ``self.use_movie_reviews`` to True.
    """
    self.use_movie_reviews = True

    extract = self.feature_extraction_movie_reviews
    negfeats = [(extract(movie_reviews.words(fileids=[f])), "negative")
                for f in movie_reviews.fileids('neg')]
    posfeats = [(extract(movie_reviews.words(fileids=[f])), "positive")
                for f in movie_reviews.fileids('pos')]

    # Floor division: a float cutoff cannot be used as a slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    DLOG("train on %d instances, test on %d instances"
         % (len(trainfeats), len(testfeats)))
    self.classifier = NaiveBayesClassifier.train(trainfeats)
    DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
    # show_most_informative_features() prints directly and returns None,
    # so passing its result to DLOG only logged "None".
    self.classifier.show_most_informative_features()
def main():
    """Train a category classifier on the article corpus and pickle it.

    The category of each file is taken from its directory name.  For every
    category the last ``hold_back(count)`` files are kept for testing and
    the rest used for training.  Per-category counts, split sizes, accuracy
    and the most informative features are printed and the classifier is
    written to ``../data/classifier.pickle`` (load it again with
    ``pickle.load`` on that file opened in binary mode).
    """
    articles = CategorizedPlaintextCorpusReader(
        corpusdir, '.*', cat_pattern=r'(.*)[/]')

    train, test = [], []
    for cat in articles.categories():
        fileids = articles.fileids(cat)
        count = len(fileids)
        print('for category %s : %s' % (cat, count))

        labelled = [(word_feats(articles.words(fileids=[f])), cat)
                    for f in fileids]
        cutoff = count - hold_back(count)
        train.extend(labelled[:cutoff])
        test.extend(labelled[cutoff:])

    print('train on %d instances, test on %d instances'
          % (len(train), len(test)))
    classifier = NaiveBayesClassifier.train(train)
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, test))
    # The feature listing is hard to read with more than two categories.
    classifier.show_most_informative_features()

    with open('../data/classifier.pickle', 'wb') as f:
        pickle.dump(classifier, f)
def main():
    """Annotate the reviews in ``output.json`` with Naive Bayes sentiment.

    A classifier is trained on the first three quarters of each polarity of
    the NLTK movie reviews.  Every review text in the input is split on
    single spaces and classified; the probabilities and the label are
    stored under the review's ``"sentiment"`` key and the result is written
    to ``out_with_sentiment.json``.
    """
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in movie_reviews.fileids('neg')]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in movie_reviews.fileids('pos')]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    classifier = NaiveBayesClassifier.train(
        negfeats[:negcutoff] + posfeats[:poscutoff])

    with open("output.json") as fin:
        # Constructed as in the original even though it is not used below.
        sid = SentimentIntensityAnalyzer()
        data = json.load(fin)

    for key in data:
        reviews = data[key]["reviews"]
        for review in reviews:
            tokens = review["review"].split(" ")
            prob = classifier.prob_classify(word_feats(tokens))
            label = classifier.classify(word_feats(tokens))
            # Key order is kept so the JSON output is unchanged.
            review["sentiment"] = {
                'positive_probability': prob.prob('pos'),
                'label': label,
                'negative_probability': prob.prob('neg'),
            }
        data[key]["reviews"] = reviews

    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
def main(argv):
    """Emit one sentiment count record per matching topic for stdin lines.

    A Naive Bayes classifier is trained on all NLTK movie reviews (labels
    ``'positive'`` / ``'negative'``).  Each input line is tokenised and
    classified; for every topic for which ``subj(line, topic)`` does not
    return ``"No match"`` a record
    ``LongValueSum:<text before first ':'>,<subject>,<label>\\t1``
    is printed.  Lines that fail to process are skipped.

    :param argv: command-line arguments (unused).
    """
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative')
                for f in movie_reviews.fileids('neg')]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive')
                for f in movie_reviews.fileids('pos')]
    classifier = NaiveBayesClassifier.train(posfeats + negfeats)

    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]
    for line in sys.stdin:
        try:
            feats = word_feats(word_tokenize(line.rstrip()))
            for topic in topicList:
                subjectFull = subj(line, topic)
                if subjectFull != "No match":
                    print("LongValueSum:" + str(line.split(":")[0]) + ","
                          + subjectFull + "," + classifier.classify(feats)
                          + "\t" + "1")
        except Exception:
            # Best effort: skip malformed lines, but no longer swallow
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            continue
def init():
    """Train and return the traffic-vs-useless tweet classifier.

    Every word extracted from ``traffic-corpus.txt`` and
    ``useless-corpus.txt`` becomes one training sample labelled with its
    corpus' sense.  The 20 most informative features are printed.

    :return: the trained classifier.
    """
    # sense label -> corpus file
    texts = {
        'traffic': 'traffic-corpus.txt',
        'useless': 'useless-corpus.txt',
    }

    train_set = []
    # ``items()`` exists on Python 2 and 3 alike, unlike ``iteritems()``.
    for sense, path in texts.items():
        print("training %s " % sense)
        # Context manager closes the file; it used to be left open.
        with open(path, 'r') as corpus_file:
            text = corpus_file.read()
        train_set.extend(
            (get_feature(word), sense) for word in extract_words(text))

    classifier = NaiveBayesClassifier.train(train_set)

    # Shows the most informative words the classifier will use.
    classifier.show_most_informative_features(20)

    # To check accuracy against hand-curated tweets, call
    # run_classifier_tests(classifier) here.
    return classifier
def train():
    """Train a sentiment classifier on documents with a known sentiment.

    First ``get_impact`` is called for every document whose sentiment is
    still null.  The documents that then have a sentiment are split by id:
    the lowest two thirds are used for training, the highest third for
    testing.

    :return: ``(classifier, accuracy)``, or ``(None, 0)`` when no document
        has a sentiment.
    """
    # Compute the impact for documents that do not have it yet.
    for document in Document.objects.filter(sentiment__isnull=True):
        get_impact(document, settings.TIME)

    known_data = Document.objects.filter(sentiment__isnull=False)
    known_data_count = known_data.count()
    if known_data_count == 0:
        print('known_data_count == 0')
        return None, 0

    def labelled(documents):
        return [(word_feats(get_nltktext(doc.text)), doc.sentiment)
                for doc in documents]

    # 2/3 training data, taken from the lowest ids.
    num_training_data = int(round(2 * known_data_count / 3))
    training_feats = labelled(known_data.order_by('id')[:num_training_data])
    classifier = NaiveBayesClassifier.train(training_feats)

    # 1/3 test data, taken from the highest ids.
    num_testing_data = int(round(known_data_count / 3))
    testing_feats = labelled(known_data.order_by('-id')[:num_testing_data])

    print('train on %d instances, test on %d instances' %
          (len(training_feats), len(testing_feats)))
    accuracy = nltk.classify.util.accuracy(classifier, testing_feats)
    return classifier, accuracy
def train(self, graphs):
    """
    Trains a ``NaiveBayesClassifier`` on every (head, child) node pair of
    the given graphs: pairs whose child index appears in the head's
    ``deps`` are positive examples (label "T"), all others negative
    (label "F").  The feature vector holds head-word, head-tag,
    child-word and child-tag.  The result is stored in
    ``self.classifier``.

    :type graphs: list(DependencyGraph)
    :param graphs: A list of dependency graphs to train the scorer.
    """
    from nltk.classify import NaiveBayesClassifier

    def examples():
        # Yields one labeled example per ordered pair of nodes of a graph.
        for graph in graphs:
            for head in graph.nodes.values():
                for child_index, child in graph.nodes.items():
                    label = "T" if child_index in head['deps'] else "F"
                    features = dict(
                        a=head['word'],
                        b=head['tag'],
                        c=child['word'],
                        d=child['tag'],
                    )
                    yield (features, label)

    labeled_examples = list(examples())
    self.classifier = NaiveBayesClassifier.train(labeled_examples)
def evaluate_classifier(featx):
    """Train and evaluate Naive Bayes on the NLTK movie-review corpus.

    Three quarters of each polarity are used for training and the rest for
    testing.  Accuracy, per-class precision/recall and the most informative
    features are printed; nothing is returned.

    :param featx: feature extractor applied to each review's words.
    """
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg')
                for f in movie_reviews.fileids('neg')]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos')
                for f in movie_reviews.fileids('pos')]

    # Floor division: a float cutoff cannot be used as a slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: gold label -> test indices; testsets: predicted -> indices.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        testsets[classifier.classify(feats)].add(i)

    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    for polarity in ('pos', 'neg'):
        print('%s precision: %s' % (
            polarity,
            nltk.metrics.precision(refsets[polarity], testsets[polarity])))
        print('%s recall: %s' % (
            polarity,
            nltk.metrics.recall(refsets[polarity], testsets[polarity])))
    classifier.show_most_informative_features()
def evaluate_features(feature_select):
    """Score the sentences of the negative input file with Naive Bayes.

    The classifier is trained on every sentence of the two polarity files
    (labels ``'pos'`` / ``'neg'``).  Each line of ``RT_INPUT_NEG_FILE`` is
    then classified; the positive input file is not read.  Sentences are
    split into words and single punctuation marks before being handed to
    ``feature_select``.

    :param feature_select: feature extractor taking a list of tokens.
    :return: dict with parallel lists ``'key'`` (sentence index), ``'pos'``
        and ``'neg'`` (class probabilities).
    """
    def read_labelled(path, label):
        with open(path, 'r') as sentences:
            return [
                [feature_select(
                    re.findall(r"[\w']+|[.,!?;]", line.rstrip())), label]
                for line in sentences
            ]

    posFeatures = read_labelled(RT_POLARITY_POS_FILE, 'pos')
    negFeatures = read_labelled(RT_POLARITY_NEG_FILE, 'neg')
    testFeatures = read_labelled(RT_INPUT_NEG_FILE, 'neg')

    classifier = NaiveBayesClassifier.train(posFeatures + negFeatures)

    # The reference/test index sets of the original were never read and the
    # test set was keyed by the probability distribution object rather than
    # by a label, so that bookkeeping is dropped.
    fileOutput = {'key': [], 'pos': [], 'neg': []}
    for i, (features, _label) in enumerate(testFeatures):
        predicted = classifier.prob_classify(features)
        print("\n")
        fileOutput['key'].append(i)
        fileOutput['pos'].append(predicted.prob("pos"))
        fileOutput['neg'].append(predicted.prob("neg"))
    return fileOutput
def classify_and_evaluate(reviews, feature_extractor=word_feats):
    """Train and evaluate Naive Bayes on labelled reviews.

    :param reviews: list of dicts with a ``'text'`` string and a
        ``'class'`` of ``'POSITIVE'`` or ``'NEGATIVE'``; other classes are
        ignored.  The list itself is not modified.
    :param feature_extractor: callable turning a list of non-empty,
        space-separated tokens into a feature dict.

    Three quarters of each class (after shuffling) are used for training
    and the rest for testing; split sizes, accuracy and the most
    informative features are printed.
    """
    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(reviews)
    random.shuffle(shuffled)

    def featurize(wanted_class, label):
        # List comprehension instead of filter(): the result must support
        # len() and slicing on every Python version.
        return [
            (feature_extractor(
                [tok for tok in review['text'].split(' ') if tok]), label)
            for review in shuffled
            if review['class'] == wanted_class
        ]

    pos_features = featurize('POSITIVE', 'pos')
    neg_features = featurize('NEGATIVE', 'neg')

    # divide groups
    pos_offset = len(pos_features) * 3 // 4
    neg_offset = len(neg_features) * 3 // 4
    training = pos_features[:pos_offset] + neg_features[:neg_offset]
    testing = pos_features[pos_offset:] + neg_features[neg_offset:]

    # train classifier
    classifier = NaiveBayesClassifier.train(training)
    print('treinada em %d reviews, testada em %d reviews'
          % (len(training), len(testing)))
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testing))
    classifier.show_most_informative_features()
def train_classifier(self, dataset, feature_fn_name='word', train_ratio=0.8,
                     verbose=False, token_column='text',
                     target_column='category', best_ratio=0.8,
                     pos_target_val=1, neg_target_val=-1):
    """Train a Naive Bayes classifier on a labelled DataFrame and evaluate it.

    dataset         -- DataFrame; it is shuffled (``sample(frac=1)``) first.
    feature_fn_name -- 'word' (all words), 'best_word', 'best_bigram' or
                       'best_trigram'; any other value uses all words.
    train_ratio     -- fraction of each class used for training.
    verbose         -- print the configuration, sample counts and metrics.
    token_column    -- column holding whitespace-separated tokens.
    target_column   -- column holding the class value.
    best_ratio      -- fraction of the chi-square-ranked vocabulary kept by
                       the 'best_*' feature functions.
    pos_target_val / neg_target_val -- values of ``target_column`` that mark
                       positive / negative rows.

    Returns ``(classifier, metrics)`` where ``metrics`` maps 'Accuracy',
    'Pos precision', 'Pos recall', 'Neg precision' and 'Neg recall' to the
    scores measured on the held-out part.
    """

    def word_feats(words):
        return {word: True for word in words}

    def best_word_feats(words):
        # ``bestwords`` is bound below, before any feature function runs.
        return {word: True for word in words if word in bestwords}

    def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq,
                               n=200):
        bigram_finder = BigramCollocationFinder.from_words(words)
        d = {bigram: True for bigram in bigram_finder.nbest(score_fn, n)}
        d.update(best_word_feats(words))
        return d

    def best_trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq,
                                n=200):
        tcf = TrigramCollocationFinder.from_words(words)
        d = {trigram: True for trigram in tcf.nbest(score_fn, n)}
        d.update(best_bigram_word_feats(words))
        d.update(best_word_feats(words))
        return d

    if verbose:
        print(
            '\nSelected feature function: {}, token column: {}, train ratio: {}'
            .format(feature_fn_name, token_column, train_ratio))

    df = dataset.sample(frac=1).reset_index(drop=True)
    negids = df[df[target_column] == neg_target_val].index
    posids = df[df[target_column] == pos_target_val].index
    feats = df[token_column]

    if feature_fn_name in ['best_word', 'best_bigram', 'best_trigram']:
        # Count every word overall and per label.
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()
        for target_val, label in ((pos_target_val, self._positive_label),
                                  (neg_target_val, self._negative_label)):
            for tokens in df[df[target_column] == target_val][token_column]:
                for word in tokens.split():
                    word_fd[word] += 1
                    label_word_fd[label][word] += 1

        pos_word_count = label_word_fd[self._positive_label].N()
        neg_word_count = label_word_fd[self._negative_label].N()
        total_word_count = pos_word_count + neg_word_count

        # A word's score is the sum of its chi-square association with
        # each of the two labels.
        word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(
                label_word_fd[self._positive_label][word],
                (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(
                label_word_fd[self._negative_label][word],
                (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        best_cnt = int(len(word_scores) * best_ratio)
        best = sorted(word_scores.items(), key=lambda item: item[1],
                      reverse=True)[:best_cnt]
        bestwords = {w for w, _score in best}

        # Fixed: this used to test for 'best_trigram_word_feats', a value the
        # enclosing guard never lets through, so 'best_trigram' silently fell
        # back to best_word_feats.
        if feature_fn_name == 'best_trigram':
            feat_fn = best_trigram_word_feats
        elif feature_fn_name == 'best_bigram':
            feat_fn = best_bigram_word_feats
        else:
            feat_fn = best_word_feats
    else:
        feat_fn = word_feats

    negfeats = [(feat_fn(feats[i].split()), self._negative_label)
                for i in negids]
    posfeats = [(feat_fn(feats[i].split()), self._positive_label)
                for i in posids]
    if verbose:
        print('No. of samples: {}, Pos: {}, Neg: {}'.format(
            len(feats), len(posfeats), len(negfeats)))

    negcutoff = int(len(negfeats) * train_ratio)
    poscutoff = int(len(posfeats) * train_ratio)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets holds the gold indices per label, testsets the predicted ones.
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    for i, (sample_feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        testsets[classifier.classify(sample_feats)].add(i)

    metrics = {
        'Accuracy': nltk.classify.util.accuracy(classifier, testfeats),
        'Pos precision': precision(refsets[self._positive_label],
                                   testsets[self._positive_label]),
        'Pos recall': recall(refsets[self._positive_label],
                             testsets[self._positive_label]),
        'Neg precision': precision(refsets[self._negative_label],
                                   testsets[self._negative_label]),
        'Neg recall': recall(refsets[self._negative_label],
                             testsets[self._negative_label])
    }
    if verbose:
        print(metrics)
    return classifier, metrics
# Everything except the last five examples of each class is used for
# training; those last five of each class form the test set.
training_pre = pos[:len(pos)-5] + neg[:len(neg)-5]
test_pre = pos[len(pos)-5:] + neg[len(neg)-5:]

# Turn each (sentence, label) row into a [features, label] pair.
training = [[format_sentence(line[0]), line[1]] for line in training_pre]
test = [[format_sentence(line[0]), line[1]] for line in test_pre]

# Build classifier
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training)
classifier.show_most_informative_features()

# Neg examples: classify each one and print the predicted label.
for example1 in (
        "XBox Live still down ",
        "wow... all i wanted to do is see taylor swift but of course shes sold out! ",
):
    result = classifier.classify(format_sentence(example1))
    print(result)

example1 = "very sad after cavs loss "
# linecache.getline() numbers lines from 1 and returns '' for any other
# index.  The loops used to start at 0, which fed one empty "sentence" per
# class into training and never evaluated the last line of each file.
# NOTE(review): assumes Sum_Line is the number of lines in each corpus file
# -- confirm where it is defined.
split_at = int(Sum_Line * Devide_Part)

# Lines 1..split_at of each file are the training portion.
for i in range(1, split_at + 1):
    training_data.append([
        preprocess1(
            line_clean(linecache.getline('data\\rt-polarity.pos', i))), 'pos'
    ])
for i in range(1, split_at + 1):
    training_data.append([
        preprocess1(
            line_clean(linecache.getline('data\\rt-polarity.neg', i))), 'neg'
    ])

random.shuffle(training_data)
model = NaiveBayesClassifier.train(training_data)

# Lines split_at+1..Sum_Line are the evaluation portion; count correct
# predictions per class (True adds 1.0, False adds 0.0).
Cor_Num_Pos = 0.0
for i in range(split_at + 1, Sum_Line + 1):
    Cor_Num_Pos += (model.classify(
        preprocess2(
            line_clean(linecache.getline('data\\rt-polarity.pos', i)))) == 'pos')
Cor_Num_Neg = 0.0
for i in range(split_at + 1, Sum_Line + 1):
    Cor_Num_Neg += (model.classify(
        preprocess2(
            line_clean(linecache.getline('data\\rt-polarity.neg', i)))) == 'neg')
features_pos = [(extract_features(movie_reviews.words(fileids=[f])), 'Positive') for f in fileids_pos] features_neg = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative') for f in fileids_neg] threshold = 0.8 num_pos = int(threshold * len(features_pos)) num_neg = int(threshold * len(features_neg)) features_train = features_pos[:num_pos] + features_neg[:num_neg] features_test = features_pos[num_pos:] + features_neg[num_neg:] print('\nNumber of training datapoints:', len(features_train)) print('Number of test datapoints:', len(features_test)) classifier = NaiveBayesClassifier.train(features_train) print('\nAccuracy of the classifier:', nltk_accuracy(classifier, features_test)) N = 15 print('\nTop ' + str(N) + ' most informative words:') for i, item in enumerate(classifier.most_informative_features()): print(str(i + 1) + '. ' + item[0]) if i == N - 1: break input_reviews = [ "Everything about this movie is outstanding -- the performances, the way the true events are handled, the cinematography. ", "In this day of digital news, this movie makes us stand back and realize what we may lose in the way of investigative journalism as we slowly kill off print media.", "The lengths the directors go to to achieve a sense of authenticity is remarkable. ", "We are there in Boston in 2001-2002. We get to know enough about each character to make him or her real, but not enough to create side dramas. ",
def main_function(): conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'], user=DATABASES['ensemble']['USER'], passwd=DATABASES['ensemble']['PASSWORD'], db=DATABASES['ensemble']['NAME']) training_tweets = classify.get_training_tweets(conn) training_feature_set = classify.process_tweets(training_tweets) bayes_classifier = NaiveBayesClassifier.train(training_feature_set) count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0} test_tweets = classify.get_test_tweets(conn) for tweet in test_tweets: text = classify.get_tweet_text(conn, tweet[0])[0][0] guess = bayes_classifier.classify(classify.process_tweet(text)) classify.update_tweet_polarity(tweet[0], guess, conn) count_table[guess] += 1 print "Naive Bayes" print count_table count_table = {'+': 0, '-': 0, 'I': 0, 'O': 0} config_megam('/opt/packages') max_ent_classifier = MaxentClassifier.train(training_feature_set, algorithm="megam", trace=0) for tweet in test_tweets: text = classify.get_tweet_text(conn, tweet[0])[0][0] guess = max_ent_classifier.classify(classify.process_tweet(text)) update_tweet_polarity_ensemble(tweet[0], guess, conn) count_table[guess] += 1 print "Maximum Entropy" print count_table #generate the accuracy matrix full_matrix = { '+': { '+': 0, '-': 0, 'I': 0, 'O': 0 }, '-': { '+': 0, '-': 0, 'I': 0, 'O': 0 }, 'I': { '+': 0, '-': 0, 'I': 0, 'O': 0 }, 'O': { '+': 0, '-': 0, 'I': 0, 'O': 0 } } for tweet in test_tweets: result = classify.run_sql( conn, classify.Statements.CHECK_CONSENSUS % tweet[0]) guess = result[0][0] actual_result = classify.run_sql( conn, classify.Statements.CHECK_MAJORITY % tweet[0]) actual = actual_result[0][0] if guess is not None: if actual is not None: full_matrix[actual][guess] += 1 print full_matrix
def code():
    """Train a review classifier, then rate a list of reviews interactively.

    Trains on every review in ``reviews`` (labels 'Good' / 'Bad'), prints the
    accuracy and the top informative words, lets the user append reviews to a
    default list, prints a prediction for each review and finally prints an
    overall rating derived from the signed sum of the rounded probabilities.
    """
    possitive_fileid = reviews.fileids('pos')  # pos/cv000 to cv999
    negative_fileid = reviews.fileids('neg')  # neg/cv000 to cv999
    print("Total number of reviews in the dataset: " +
          str(len(reviews.fileids())))
    print("Total number of possitive Reviews: " + str(len(possitive_fileid)))
    print("Total number of negative Reviews: " + str(len(negative_fileid)))

    possitive_features = [(extract(reviews.words(fileids=[fd])), 'Good')
                          for fd in possitive_fileid]
    negative_features = [(extract(reviews.words(fileids=[fd])), 'Bad')
                         for fd in negative_fileid]

    # NOTE(review): training and test sets are the same full corpus, so the
    # accuracy printed below is measured on the training data.
    train_features = possitive_features[:] + negative_features[:]
    test_features = possitive_features[:] + negative_features[:]
    print('\nTotal number of trained datapoints:', len(train_features))
    print('Total number of tested datapoints:', len(test_features))

    # Train a Naive Bayes classifier
    classifier = classify.train(train_features)
    acc = classifier_accuracy(classifier, test_features) * 100
    print('\nAccuracy of the system: ' + str(acc) + ' %')

    n = 20
    print('\nTop ' + str(n) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(str(i + 1) + '. ' + item[0])
        if i == n - 1:
            break

    default_reviews = [
        'The costumes in this movie were great',
        'I think the story was terrible and the characters were very weak',
        'People say that the director of the movie is amazing',
        'This is such an idiotic movie, i will not recommend it to anyone',
        'This is not the movie i recommend'
    ]
    print("\nCurrent Reviews:\n")
    for i in default_reviews:
        print(i + '.\n\n')

    while True:
        answer = input(
            "Do you want to add more reviews, if yes press 1 else press 0\n")
        try:
            a = int(answer)
        except ValueError:
            # Non-numeric input used to crash the program; ask again, exactly
            # as already happens for any number other than 0 or 1.
            continue
        if a == 0:
            break
        elif a == 1:
            b = input("Please enter the review\n")
            default_reviews.append(b)

    total_rating = 0
    print("\nMovie Review Predictions:")
    for review in default_reviews:
        print("\nReview:", review)
        probabilities = classifier.prob_classify(extract(review.split()))
        predicted_sentiment = probabilities.max()
        print("Predicted sentiment:", predicted_sentiment)
        print("Probability of correct sentiment:",
              format(round(probabilities.prob(predicted_sentiment), 2), '.2f'))
        # 'Good' adds the rounded confidence, 'Bad' subtracts it.
        if predicted_sentiment == 'Good':
            total_rating += round(probabilities.prob(predicted_sentiment), 2)
        elif predicted_sentiment == 'Bad':
            total_rating -= round(probabilities.prob(predicted_sentiment), 2)

    if -0.25 <= total_rating <= 0.25:
        print("\n\nOverall Rating: Average")
    elif total_rating < -0.25:
        print("\n\nOverall Rating: Very Bad")
    else:
        print("\n\nOverall Rating: Very good")
def trainAndPrintAccuracy(trainingSet, testSet):
    """Fit a Naive Bayes classifier on trainingSet and print its accuracy on testSet.

    Returns the trained classifier.
    """
    print("Training data...")
    model = NaiveBayesClassifier.train(trainingSet)
    print(nltk.classify.util.accuracy(model, testSet))
    return model
def cross_validate(iterations, haiku_labeled, short_labeled, corpus_length, journal): accuracy_scores = [] most_informative = [] mis_classified = [] h_precision_scores = [] h_recall_scores = [] nh_precision_scores = [] nh_recall_scores = [] response = raw_input("Do you want to reduce dimensionality (y/n)?") if response == "y": min_df = raw_input("Include words occurring more than how many times?") #do normal cross-validation for i in range(iterations): haiku_random = [] haiku_random = random.sample(haiku_labeled, corpus_length) #pick poems at random -- choose number based on smaller corpus size #Create 4 folds for validation testing cut_point = int((len(haiku_random))/4) hfold1 = haiku_random[0:cut_point] hfold2 = haiku_random[cut_point:(cut_point*2)] hfold3 = haiku_random[(cut_point*2):(cut_point*3)] hfold4 = haiku_random[(cut_point*3):] poetry_random = [] poetry_random = random.sample(short_labeled, corpus_length) #draws this number of samples randomly from the feature-set; will have to adjust number according to corpus size cut_point2 = int((len(poetry_random))/4) pfold1 = poetry_random[0:cut_point2] pfold2 = poetry_random[cut_point2:(cut_point2*2)] pfold3 = poetry_random[(cut_point2*2):(cut_point2*3)] pfold4 = poetry_random[(cut_point2*3):] #build training and test-sets train_set = hfold1 + hfold2 + pfold1 + pfold2 + hfold3 + pfold3 test_set = hfold4 + pfold4 #+ hfold3 + pfold3 #dimensionality reduction; shouldn't you be doing this for both train and test sets? 
if response =="y": doc_terms = documents_per_word(train_set + test_set) #count how many documents each word appears in train_set = reduce_word_features((train_set + test_set), doc_terms, int(min_df)) #exclude terms that appear in min_df documents #train the classifier nb_classifier = NaiveBayesClassifier.train([e[1] for e in train_set]) nb_classifier.labels() #check accuracy of classifier and store accuracy measure accuracy_scores.append(accuracy(nb_classifier, [el[1] for el in test_set])) #obtain the 30 most informative features for each iteration most_informative.append(nb_classifier.show_most_informative_features(n=30)) #get haiku precision and recall measures from the test and store in list h_precision, h_recall, nh_precision, nh_recall = get_precision_recall(test_set, nb_classifier) h_precision_scores.append(h_precision) h_recall_scores.append (h_recall) nh_precision_scores.append(nh_precision) nh_recall_scores.append (nh_recall) #store list of mis-classified files from the journal corpus (i.e., files misclassified as haiku) for el in test_set: guess = nb_classifier.classify(el[1][0]) if guess != el[1][1] and (re.findall(r'[a-z]', el[0][0]) or len(el[0]) > 8): #this will exclude the haiku mis-classified as not-haiku mis_classified.append(el[0]) #write the mis_classified texts to a file and print out the most-commonly mis-classified texts counter = collections.Counter(mis_classified) #prepare to print mis-classified files to .csv import csv filename = 'c:\Users\Public\Documents\MyData\HaikuArticle\errors.csv' mis_classified_texts = open(filename, 'a') wr = csv.writer(mis_classified_texts, quoting=csv.QUOTE_ALL) wr.writerow(counter.values()) #write the frequency wr.writerow(counter.keys()) #write the filenames #print the files to a document along with metadata print_misclassified_haiku(counter, journal) print("\nThe most commonly mis-classified files in this test were the following:") print counter.most_common() return accuracy_scores, most_informative, 
h_precision_scores, h_recall_scores, nh_precision_scores, nh_recall_scores
def evaluate_features(feature_select):
    """Train and evaluate a Naive Bayes polarity classifier with stop-word removal.

    Reads 'rt-polarity-neg.txt' and 'rt-polarity-pos.txt' from
    ``__location__`` and 'stopwords.txt' from the working directory,
    tokenises each sentence, drops stop words, trains on the first 3/4 of
    each class and prints accuracy, precision and recall for the rest.

    feature_select -- callable turning a list of tokens into a feature dict.
    """
    token_pattern = re.compile(r"[\w']+|[.,!?;]")

    def read_sentences(name):
        # ``with`` closes the handle; the original left both corpus files open.
        path = os.path.join(__location__, name)
        with open(path, 'r', encoding='utf8') as handle:
            return handle.read().split('\n')

    # reading pre-labeled input and splitting into lines
    negSentences = read_sentences('rt-polarity-neg.txt')
    posSentences = read_sentences('rt-polarity-pos.txt')

    # A set gives constant-time membership tests for the per-word filter.
    with open('stopwords.txt', 'r') as f:
        stopwords = {line.rstrip() for line in f}

    def labeled_features(sentences, label):
        # One [features, label] instance per sentence, stop words removed.
        return [
            [feature_select([word for word in token_pattern.findall(sentence)
                             if word not in stopwords]), label]
            for sentence in sentences
        ]

    posFeatures = labeled_features(posSentences, 'pos')
    negFeatures = labeled_features(negSentences, 'neg')

    # First three quarters of each class train, the remainder tests.
    posCutoff = len(posFeatures) * 3 // 4
    negCutoff = len(negFeatures) * 3 // 4
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    classifier = NaiveBayesClassifier.train(trainFeatures)

    # referenceSets: recorded polarity per test index;
    # testSets: the classifier's proposed polarity per test index.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        testSets[classifier.classify(features)].add(i)

    # Outputs
    print('train on %s instances, test on %s instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:',
          scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:',
          scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
# Features and labels are split out only so StratifiedKFold can stratify;
# the classifier itself is trained on the original (features, label) rows.
X = [row[0] for row in dataset]
Y = [row[1] for row in dataset]
kfold = StratifiedKFold(n_splits=int(args.z), shuffle=True, random_state=seed)
cvscores = []
for train, test in kfold.split(X, Y):
    # print(dataset[train[0]])
    # ``train`` and ``test`` hold row indices into ``dataset``.
    train_data = [dataset[idx] for idx in train]
    test_data = [dataset[idx] for idx in test]
    model = NaiveBayesClassifier.train(train_data)
    scores = nltk.classify.util.accuracy(model, test_data)
    print("{}%".format(scores * 100))
    cvscores.append(scores * 100)
    # plot_model(model, to_file='model.png')
    model.show_most_informative_features()
# Mean accuracy and standard deviation over all folds, in percent.
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

### create training and test sets
## set the cutoffs
# negcutoff = math.floor(len(neg_list)*3/4)
# poscutoff = math.floor(len(pos_list)*3/4)
#
# top10list = []
# avgAccuracy = 0
def train_classifier(training, test):
    """Fit a Naive Bayes classifier on ``training`` and print its accuracy on ``test``."""
    model = NaiveBayesClassifier.train(training)
    score = accuracy(model, test)
    print('Classifier Accuracy => ', score)
filtered_words = [ word for word in words if word not in stopwords.words('english') ] # 表示该词在文本中,为了使用nltk中的分类器 return {word: True for word in filtered_words} # 构造样本 train_data = [[pro_text(text1), 1], [pro_text(text2), 1], [pro_text(text3), 1], [pro_text(text4), 0], [pro_text(text5), 0]] print('train_data', train_data) # 训练模型 nb_model = NaiveBayesClassifier.train(train_data) # 测试模型 text6 = 'that is a bad one' print('测试结果:', nb_model.classify(pro_text(text6))) print('\n===================== 2. 文本相似度 =====================') # 2. 文本相似度 import nltk from nltk import FreqDist text1 = 'I like the movie so much ' text2 = 'That is a good movie ' text3 = 'This is a great one ' text4 = 'That is a really bad movie ' text5 = 'This is a terrible movie'
def _train_named_classifier(cl, train_feats):
    """Return (display name, trained classifier) for the code 'nb', 'maxent' or 'svm'."""
    if cl == 'maxent':
        return 'Maximum Entropy', MaxentClassifier.train(
            train_feats, 'GIS', trace=0, encoding=None, labels=None,
            gaussian_prior_sigma=0, max_iter=1)
    if cl == 'svm':
        classifier = SklearnClassifier(LinearSVC(), sparse=False)
        classifier.train(train_feats)
        return 'SVM', classifier
    return 'Naive Bayes', NaiveBayesClassifier.train(train_feats)


def _score_classifier(classifier, test_feats):
    """Return accuracy and per-label precision/recall/F-measure on test_feats.

    The result is a dict with the keys 'accuracy', 'pos_precision',
    'pos_recall', 'pos_fmeasure', 'neg_precision', 'neg_recall' and
    'neg_fmeasure'.  The NLTK metric functions return None when a score is
    undefined (for example when a label is never predicted); that is
    reported as 0.0 so the averages below cannot fail on None.
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for idx, (feats, label) in enumerate(test_feats):
        refsets[label].add(idx)
        testsets[classifier.classify(feats)].add(idx)

    result = {'accuracy': nltk.classify.util.accuracy(classifier, test_feats)}
    for label in ('pos', 'neg'):
        for metric_name, metric in (('precision', precision),
                                    ('recall', recall),
                                    ('fmeasure', f_measure)):
            value = metric(refsets[label], testsets[label])
            result[label + '_' + metric_name] = 0.0 if value is None else value
    return result


def evaluate_classifier(featx):
    """Compare Naive Bayes, MaxEnt and SVM on a 3/4 split and with 5-fold CV.

    featx -- callable turning one tokenised document into a feature dict.

    Features are built from the module-level ``negdata`` and ``posdata``.
    For every classifier the accuracy and the pos/neg-averaged precision,
    recall and F-measure are printed, first for a single 3/4 - 1/4 split and
    then averaged over five cross-validation folds.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # using 3 classifiers
    classifier_list = ['nb', 'maxent', 'svm']

    for cl in classifier_list:
        classifierName, classifier = _train_named_classifier(cl, trainfeats)
        single = _score_classifier(classifier, testfeats)

        print('')
        print('---------------------------------------')
        print('SINGLE FOLD RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', single['accuracy'])
        print('precision',
              (single['pos_precision'] + single['neg_precision']) / 2)
        print('recall', (single['pos_recall'] + single['neg_recall']) / 2)
        print('f-measure',
              (single['pos_fmeasure'] + single['neg_fmeasure']) / 2)
        print('')

    ## CROSS VALIDATION
    trainfeats = negfeats + posfeats

    # Shuffle first: the list is ordered neg-then-pos, so an unshuffled test
    # chunk might contain only negative or only positive data.
    random.shuffle(trainfeats)
    n = 5  # 5-fold cross-validation
    subset_size = int(len(trainfeats) / n)

    for cl in classifier_list:
        rounds = []
        for fold in range(n):
            testing_this_round = trainfeats[fold * subset_size:][:subset_size]
            training_this_round = (trainfeats[:fold * subset_size] +
                                   trainfeats[(fold + 1) * subset_size:])
            classifierName, classifier = _train_named_classifier(
                cl, training_this_round)
            rounds.append(_score_classifier(classifier, testing_this_round))

        def fold_mean(key):
            return sum(scores_[key] for scores_ in rounds) / n

        print('---------------------------------------')
        print('N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy:', fold_mean('accuracy'))
        print('precision',
              (fold_mean('pos_precision') + fold_mean('neg_precision')) / 2)
        print('recall',
              (fold_mean('pos_recall') + fold_mean('neg_recall')) / 2)
        print('f-measure',
              (fold_mean('pos_fmeasure') + fold_mean('neg_fmeasure')) / 2)
        print('')
news_words = sents2words(news_sents) news_feats = [(word_feats(wordlist), 'news') for wordlist in news_words] news_cutoff = len(news_feats) * 3 / 4 other_cutoff = len(other_feats) * 3 / 4 others_cutoff = len(others_feats) * 3 / 4 train_feats_other = news_feats[:news_cutoff] + other_feats[:other_cutoff] train_feats_others = news_feats[:news_cutoff] + others_feats[:others_cutoff] test_feats_other = news_feats[news_cutoff:] + other_feats[other_cutoff:] test_feats_others = news_feats[news_cutoff:] + others_feats[others_cutoff:] test_sents_other = news_sents[news_cutoff:] + other_sents[other_cutoff:] print 'train on %d instances, test on %d instances' % (len(train_feats_other), len(test_feats_other)) classifier_other = NaiveBayesClassifier.train(train_feats_other) classifier_others = NaiveBayesClassifier.train(train_feats_others) print 'accuracy:', nltk.classify.util.accuracy(classifier_other, test_feats_other) print 'accuracy:', nltk.classify.util.accuracy(classifier_others, test_feats_others) classifier_other.show_most_informative_features(n=100) filename = filebase + "/fnielsen/data/Hansen2010Diffusion_news.txt" tweet_news = [ re.findall('^(-?\d) (.+)$', line, re.UNICODE)[0] for line in open(filename).readlines()[:1000] ] tweet_words = sents2words(map(lambda (v, s): s, tweet_news)) tweet_feats = [] for n in range(len(tweet_news)):
def evaluate_features(feature_select):
    """Train a "tweet mentions a location" classifier and evaluate it.

    'pos' means the tweet contains a location, 'neg' that it does not.
    Training and test tweets are read from POS_FILE_TRAIN / NEG_FILE_TRAIN
    and POS_FILE_TEST / NEG_FILE_TEST.  Every tweet's text is recorded in the
    module-level Train_twit_Dic / Test_twit_Dic under the frozenset of its
    feature items.  For each test tweet predicted 'pos', a location label is
    looked up with ``ner_tagger``, falling back to the most common label
    among the tweet's neighbours in Close_Twt_Dic.  Accuracy, precision and
    recall are printed at the end.

    feature_select -- callable turning a list of tokens into a feature dict.
    """
    # Tokeniser from
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    token_pattern = re.compile(r"[\w']+|[.,!?;@#]")

    def load_labeled(path, label, text_by_features):
        # Read, shuffle and featurise one file; remember each tweet's text.
        with open(path, "rb") as f:
            raw_lines = f.readlines()
        random.shuffle(raw_lines)
        labeled = []
        for raw in raw_lines:
            # Decode once.  The original tokenised str(raw), which for bytes
            # is the "b'...\\n'" repr, so the features contained repr debris
            # ("b'", "n'") instead of the tweet's own words.
            text = raw.decode("utf-8")
            feats = feature_select(token_pattern.findall(text.rstrip()))
            labeled.append([feats, label])
            text_by_features[frozenset(feats.items())] = text.strip()
        return labeled

    def location_label(text):
        # Concatenate every token the NER tagger marks as LOCATION.
        tagged_words = ner_tagger.tag(nltk.word_tokenize(text))
        return "".join(tag_w[0] + " " for tag_w in tagged_words
                       if tag_w[1] == "LOCATION")

    # Same file order as before, so shuffles consume the RNG identically.
    posFeatures_train = load_labeled(POS_FILE_TRAIN, 'pos', Train_twit_Dic)
    posFeatures_test = load_labeled(POS_FILE_TEST, 'pos', Test_twit_Dic)
    negFeatures_train = load_labeled(NEG_FILE_TRAIN, 'neg', Train_twit_Dic)
    negFeatures_test = load_labeled(NEG_FILE_TEST, 'neg', Test_twit_Dic)

    trainFeatures = posFeatures_train + negFeatures_train
    testFeatures = posFeatures_test + negFeatures_test

    # trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # referenceSets holds the gold labels, testSets the predicted ones
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

        if predicted == "pos":
            # the tweet, according to the classifier, contains a location
            twiit = Test_twit_Dic[frozenset(features.items())]
            list_close_twits = Close_Twt_Dic[twiit]
            final_lbl = location_label(twiit)
            if final_lbl == "":
                # No location in the tweet itself: use the most common
                # location among its physically close tweets.
                lbl_list = [lbl for lbl in map(location_label, list_close_twits)
                            if lbl != ""]
                # Fixed: ``c[0][0]`` indexed the Counter by key 0 and raised
                # TypeError; the most common label is most_common(1)[0][0].
                # The label stays "" when no neighbour has a location.
                if lbl_list:
                    final_lbl = Counter(lbl_list).most_common(1)[0][0]
            # NOTE(review): final_lbl is computed but not used further here.

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
def do_validation(self): # each fold is a list of body ids. folds, hold_out = kfold_split(self.dataset, n_folds=10) # fold_stances is a dict. keys are fold number (e.g. 0-9). hold_out_stances is list fold_stances, hold_out_stances = get_stances_for_folds( self.dataset, folds, hold_out) # https://cs.fit.edu/~mmahoney/compression/textdata.html sentences = word2vec.Text8Corpus('text8') model = word2vec.Word2Vec(sentences, size=200) labeled_feat_dict = {} print "Generating features for each fold" for fold_id in fold_stances: print "Generating features for fold ", fold_id bodies = folds[fold_id] stances = fold_stances[fold_id] fold_avg_sims, fold_max_sims = JaccardGenerator().gen_jaccard_sims( self.dataset, bodies, stances) common_ngrams = NgramsGenerator().gen_common_ngrams( self.dataset, bodies, stances, self._ngram_len) wordvectors = WordVector().gen_wordvectors(self.dataset, bodies, stances, model) labeled_feature_set = [] for i in range(len(stances)): features = { 'avg_sims': fold_avg_sims[i], 'max_sims': fold_max_sims[i], 'common_ngrams': common_ngrams[i], 'word_vectors': wordvectors[i] } label = self._process_stance(stances[i]['Stance']) labeled_feature = (features, label) labeled_feature_set.append(labeled_feature) labeled_feat_dict[fold_id] = labeled_feature_set print "Generating features for hold out fold" holdout_avg_sims, holdout_max_sims = JaccardGenerator( ).gen_jaccard_sims(self.dataset, hold_out, hold_out_stances) holdout_common_ngrams = NgramsGenerator().gen_common_ngrams( self.dataset, hold_out, hold_out_stances, self._ngram_len) holdout_wordvectors = WordVector().gen_wordvectors( self.dataset, hold_out, hold_out_stances, model) h_unlabeled_features = [] h_labels = [] for i in range(len(hold_out_stances)): unlabeled_feature = { 'avg_sims': holdout_avg_sims[i], 'max_sims': holdout_max_sims[i], 'common_ngrams': holdout_common_ngrams[i], 'word_vectors': holdout_wordvectors[i] } label = self._process_stance(hold_out_stances[i]['Stance']) 
h_unlabeled_features.append(unlabeled_feature) h_labels.append(label) fold_accuracy = {} best_fold_accuracy = 0.0 classifiers = [] print "Validating using each fold as testing set" for fold_id in fold_stances: fold_ids = list(range(len(folds))) del fold_ids[fold_id] # deleted fold is test set for this run training_set = [ feat for fid in fold_ids for feat in labeled_feat_dict[fid] ] testing_set = [] testing_labels = [] for feat, label in labeled_feat_dict[fold_id]: testing_set.append(feat) testing_labels.append(label) classifier = NaiveBayesClassifier.train(training_set) classifiers.append(classifier) pred = classifier.classify_many(testing_set) accuracy = self._score(pred, testing_labels) print "Fold ", fold_id, "accuracy: ", accuracy if accuracy > best_fold_accuracy: best_fold_accuracy = accuracy best_fold_cls = classifier h_res = best_fold_cls.classify_many(h_unlabeled_features) print 'holdout score:', self._score(h_res, h_labels)
def word_feats(words): return dict([(word, True) for word in words]) negids = movie_reviews.fileids('neg') posids = movie_reviews.fileids('pos') negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids] posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids] trainfeats = negfeats + posfeats classifier = NaiveBayesClassifier.train(trainfeats) print('train on %d instances' % (len(trainfeats))) classifier.show_most_informative_features() def process(): print('input some text') while True: line = input() if line.strip() is not '': words = word_tokenize(line) feats = [word_feats(words)] print(feats) print(classifier.classify_many(feats)) for pdist in classifier.prob_classify_many(feats): print('pos: %.4f neg: %.4f' %
'movie', 'sound', 'was', 'words', 'the', 'actors', 'is', 'did', 'know', 'not' ] pos_counti_features1 = [(word_feats(pos_count), 'pos_count') for pos_count in words_pos_counti] neg_counta_features2 = [(word_feats(neg_count), 'neg_count') for neg_count in words_neg_counta] neutral_features3 = [(word_feats(neu), 'neu') for neu in words_neutral] print("pos_counti_features1: ", pos_counti_features1) print("neg_counta_features2: ", neg_counta_features2) print("neutral_features3: ", neutral_features3) train_set_va = neg_counta_features2 + pos_counti_features1 + neutral_features3 classifier_va = NaiveBayesClassifier.train(train_set_va) print("train_set_va: ", train_set_va) print("classifier_va: ", classifier_va) # Predict neg_count = 0 pos_count = 0 sentence = "Awesome movie, I liked it" sentence = sentence.lower() key_words = sentence.split(' ') for word in key_words: classResult = classifier_va.classify(word_feats(word)) if classResult == 'neg_count': neg_count = neg_count + 1 if classResult == 'pos_count':
# NOTE(review): this was declared as ``class abc(username):``.  The body uses
# ``username`` as a plain value (``target = username``) and its nested
# functions read enclosing names such as ``auth_api``, ``mylist`` and
# ``classifier``; those only resolve from a function scope, never from a class
# body.  It is therefore defined as a function; ``abc(name)`` is called the
# same way as before.  Confirm against the caller.
def abc(username):
    """Classify a Twitter user's tweets as depressed / not depressed and report.

    Trains a Naive Bayes classifier on two local plaintext corpora, downloads
    the timeline of ``username``, cleans and classifies every tweet, prints a
    verdict based on the share of tweets classified "Depressed", writes the
    cleaned tweets to ``<username>.txt`` and shows a word cloud of them.

    The counters ``dcount``, ``ccount``, ``tcount``, ``Money``, ``Academic``
    and ``Relationship`` are module globals and are reset on every call.
    The process exits via ``exit()`` when the account cannot be read or has
    too little data.
    """
    global dcount, ccount, tcount, Money, Academic, Relationship

    # NOTE(security): live API credentials are hard-coded here.  They should
    # be revoked and loaded from the environment or a secrets store instead;
    # left unchanged so existing behaviour is preserved.
    consumer_key = "D9idPsR9iCUbzQzlUIoOUlOjc"
    consumer_secret = "VGLhthlGLxbJpPyu3WtTTz4oKLYYkJ5VHJIn94Azf0bDoeE7vm"
    access_token = "798166878-qgZxk593TZpoSpaZyPBMI1wnOjkKH80AFGR0ZqAg"
    access_token_secret = "DvQDsAEGm23yG6aRVoPhrVIeZINR6Y47jgjdXgdF7OaVH"
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    auth_api = API(auth)

    target = username

    # Keyword lists used to guess a likely cause.
    rel = ['hate', 'left', 'bf','gf','girlfriend','boyfriend','heartbreak','alone', 'love','abuse']
    edu = ['exam','test','assignment','school','studying']
    mon = ['broke','money','cash','economy', 'finance','crisis']

    cleantweet = []
    mylist = []

    dcount = 0
    Money = 0
    ccount = 0
    Academic = 0
    tcount = 0
    Relationship = 0

    long_stop_list = ['a','are','be','an','and','at','by','the','is','this','that','to','for','it','in','on']

    def create_word_features(clist):
        """Map every word of ``clist`` that is not a stop word to True."""
        return {word: True for word in clist if word not in long_stop_list}

    def load_labelled(corpus_root, label):
        """Return one (features, label) pair per file under ``corpus_root``."""
        reader = PlaintextCorpusReader(corpus_root, '.*')
        return [(create_word_features(reader.words(fileid)), label)
                for fileid in reader.fileids()]

    neg_re = load_labelled('C:/Users/Bindu/Desktop/samp1/neg', "Depressed")
    pos_re = load_labelled('C:/Users/Bindu/Desktop/samp1/pos', "Not depressed")

    # The first 45 files of each class are the training data.  The original
    # also scored the remaining files but never used the result, so that
    # evaluation is dropped.
    train_set = neg_re[:45] + pos_re[:45]
    random.shuffle(train_set)
    classifier = NaiveBayesClassifier.train(train_set)

    class Helpline(App):
        """Kivy app that shows helpline contact details."""

        def build(self):
            return Label(text="Need to talk to someone? \n\nNational Suicide Helpline \n\nVisit: http://www.aasra.info/ \nCall: +912227546669 \n\nWant to chat with a counsellor? \nVisit:https://yourdost.com/")

    hp = Helpline()

    def get_info(target):
        """Print profile statistics; exit if the user is unknown or has < 50 tweets."""
        try:
            item = auth_api.get_user(target)
            print("Name: " + item.name)
            print("Twitter name: " + item.screen_name)
            print("Total number of times tweeted: " + str(item.statuses_count))
            if item.statuses_count < 50:
                print ("Insufficient Data to analyse")
                exit()
            print("Following: " + str(item.friends_count))
            print("Followers: " + str(item.followers_count))
            tweets = item.statuses_count
            account_created_date = item.created_at
            delta = datetime.utcnow() - account_created_date
            account_age_days = delta.days
            print("Account age (in days): " + str(account_age_days))
            if account_age_days > 0:
                print("Average tweets per day: " + "%.2f"%(float(tweets)/float(account_age_days)))
        except Exception:
            # ``Exception`` rather than a bare ``except`` so the SystemExit
            # raised by ``exit()`` above is not reported as a bad username.
            print ("Invalid Username")
            exit()

    def get_tweets(target):
        """Append the JSON-encoded text of every timeline status to ``mylist``."""
        print ("Collecting user %s's tweets" % target)
        for status in Cursor(auth_api.user_timeline, screen_name = '@%s' % target).items():
            mylist.append(json.dumps(status._json['text']))

    def get_cause(clist):
        """Count words of ``clist`` that hit the relationship/academic/money lists."""
        global Relationship, Academic, Money
        for w in clist:
            if w in rel:
                Relationship += 1
            elif w in edu:
                Academic += 1
            elif w in mon:
                Money += 1

    def get_classify(tweet):
        """Classify one cleaned tweet and update the depressed/other counters."""
        global dcount, ccount
        words = word_tokenize(tweet)
        get_cause(words)
        if classifier.classify(create_word_features(words)) == "Depressed":
            dcount += 1
        else:
            ccount += 1

    get_info(target)
    try:
        get_tweets(target)
    except Exception:
        print("%s's account is set to private" % target)
        exit()

    for tweet in mylist:
        tcount += 1
        tweet = tweet.replace('"','')
        tweet = tweet.lower()
        tweet = re.sub('^rt','',tweet)  # Remove RT if it appears at the beginning of a tweet
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))','',tweet)  # Remove www.* or https?://*
        tweet = re.sub(r'@[^\s]+','',tweet)  # Remove @username
        tweet = " ".join(word.strip() for word in re.split('#|_', tweet))  # Convert #word into word
        tweet = re.sub('[0-9]+','',tweet)  # Remove numbers
        tweet = re.sub(r'[^\w\s]','',tweet)  # Remove punctuation
        tweet = re.sub(r'[\s]+',' ', tweet)  # Remove additional white spaces
        cleantweet.append(tweet)
        get_classify(tweet)

    def biggest(a, y, z):
        """Return the largest of the three values."""
        # The original ended with ``if y > z: Max = y``, which overwrote a
        # correct maximum (e.g. biggest(10, 5, 3) returned 5).
        return max(a, y, z)

    def report_cause():
        """Print the cause category with the highest keyword count."""
        cause = biggest(Relationship, Money, Academic)
        if cause == Relationship:
            print("Cause for depression is likely to be relationship troubles")
        elif cause == Money:
            print("Cause for depression is likely to be money troubles")
        elif cause == Academic:
            print ("Cause for depression is likely to be academic troubles")

    tot = dcount + ccount
    if not tot:
        # No tweet was classified; the ratio below would divide by zero.
        print ("Insufficient Data to analyse")
        exit()
    rat = dcount / tot
    print (rat)
    if rat < 0.25:
        print (" %s is unlikely to be depressed" % target)
    elif rat < 0.4:
        # Includes rat == 0.25, which previously fell through to "severely".
        print (" %s is likely to be moderately depressed" % target)
        report_cause()
    else:
        hp.run()
        print ("%s is likely to be severely depressed" % target)
        report_cause()

    tweets_path = "%s.txt" % target
    with open(tweets_path, 'w', encoding = 'utf-8') as f:
        f.write(str(cleantweet))

    def random_color_func(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
        """Colour callback for WordCloud: fixed hue/saturation, random lightness."""
        h = int(360.0 * 45.0 / 255.0)
        s = int(100.0 * 255.0 / 255.0)
        l = int(100.0 * float(random_state.randint(60, 120)) / 255.0)
        return "hsl({}, {}%, {}%)".format(h, s, l)

    # Read back with the same encoding the file was written with, and close
    # the handle deterministically.
    with open(tweets_path, encoding = 'utf-8') as f:
        file_content = f.read()

    wordcloud = WordCloud(font_path = r'C:\Windows\WinSxS\amd64_microsoft-windows-font-truetype-verdana_31bf3856ad364e35_10.0.16299.15_none_e1654f127052576a\verdana.ttf',
                          stopwords = STOPWORDS,
                          background_color = 'white',
                          width = 1200,
                          height = 1000,
                          color_func = random_color_func,
                          collocations = False
                          ).generate(file_content)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()


class LoginApp(App):
    """Kivy application with a login screen and a connected screen."""

    username = StringProperty(None)

    def build(self):
        """Create the screen manager holding the 'login' and 'connected' screens."""
        manager = ScreenManager()
        manager.add_widget(Login(name='login'))
        manager.add_widget(Connected(name='connected'))
        return manager

    def get_application_config(self):
        """Return a per-user config path once a username is set, else the default."""
        if not self.username:
            return super(LoginApp, self).get_application_config()

        conf_directory = self.user_data_dir + '/' + self.username
        if not os.path.exists(conf_directory):
            os.makedirs(conf_directory)

        return super(LoginApp, self).get_application_config(
            '%s/config.cfg' % (conf_directory)
        )


if __name__ == '__main__':
    LoginApp().run()
for i in posSentences: posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) posWords = [make_full_dict(posWords), 'pos'] posFeatures.append(posWords) # Laben the negative words for i in negSentences: negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) negWords = [make_full_dict(negWords), 'neg'] negFeatures.append(negWords) # Create the training set trainFeatures = posFeatures + negFeatures # Train the Classifier classifier = NaiveBayesClassifier.train(trainFeatures) '''Save Pickled Data''' # Save the trained Classifier for future use save_trained_data = open( 'C:/Users/animi/Documents/Python Codes/NLP/PythonProgrammingDotNet/Pickle_Files/Trained_NBC.pickle', 'wb') pickle.dump(classifier, save_trained_data) save_trained_data.close() '''Load Pickled Data''' saved_trained_data = open( 'C:/Users/animi/Documents/Pickle_Files/Trained_NBC.pickle', 'rb') classifier = pickle.load(saved_trained_data) saved_trained_data.close() # Test the classifier with custom input while True:
for label, feats in lfeats.items(): cutoff = int(len(feats) * split) train_feats.extend([(feat, label) for feat in feats[:cutoff]]) test_feats.extend([(feat, label) for feat in feats[cutoff:]]) return train_feats, test_feats train_feats, test_feats = split_label_feats(lfeats) train_feats[0] print(len(train_feats)) print(len(test_feats)) #%% # ______________ Bayesian _______________- nb = NaiveBayesClassifier.train(train_feats) nb.labels() acc = accuracy(nb, test_feats) print("Bayesian Accuracy: ", acc) #%% # ______________ Naive_Bayes _______________- sk = SklearnClassifier(MultinomialNB()) sk.train(train_feats) acc_Naive_Bayes = accuracy(sk, test_feats) print("Naive Bayes Accuracy: ", acc_Naive_Bayes) # ______________ K-Neighbors _______________- sk_knn = SklearnClassifier(KNeighborsClassifier())
def _load_pickled_classifier(path):
    """Unpickle and return the classifier stored at ``path``."""
    # NOTE(security): unpickling runs arbitrary code from the file; only load
    # pickles produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _rounded_percent(score):
    """Turn a 0-1 score into a percentage rounded to one decimal place.

    ``None`` is reported as 0.0.  NOTE(review): presumably ``precision`` and
    ``recall`` are NLTK's, which return None when the relevant set is empty;
    previously that crashed with ``None * 100`` -- confirm the import.
    """
    if score is None:
        return 0.0
    return round(score * 100, 1)


def test(trainfeats, testfeats, source, type):
    """Score a fresh Naive Bayes model and the pickled sklearn models.

    A Naive Bayes classifier is trained on ``trainfeats`` and evaluated on
    ``testfeats`` (accuracy plus precision/recall for the 'pos' and 'neg'
    labels).  Five previously pickled classifiers are then loaded from
    ``<this file's dir>/<source>/pickled/<type>/`` and scored on the same
    test data, individually and combined through ``VoteClassifier``.  The
    unrounded accuracy of each loaded model is printed as it is computed.

    Args:
        trainfeats: labelled feature sets used to train the Naive Bayes model.
        testfeats: labelled feature sets used for every evaluation.
        source: first path component under this file's directory.
        type: sub-directory of ``pickled`` holding the classifier pickles.

    Returns:
        A tuple ``(nltk_output, sklearn_output)`` of comma-separated,
        newline-terminated strings prefixed with "nlt" and "skl".
    """
    my_classifier = NaiveBayesClassifier.train(trainfeats)

    # Index sets per gold label and per predicted label, for precision/recall.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        testsets[my_classifier.classify(feats)].add(i)

    accuracy = _rounded_percent(
        nltk.classify.util.accuracy(my_classifier, testfeats))
    pos_prec = _rounded_percent(precision(refsets['pos'], testsets['pos']))
    pos_rec = _rounded_percent(recall(refsets['pos'], testsets['pos']))
    neg_prec = _rounded_percent(precision(refsets['neg'], testsets['neg']))
    neg_rec = _rounded_percent(recall(refsets['neg'], testsets['neg']))

    my_classifier.show_most_informative_features(50)

    dir_path = os.path.dirname(__file__)
    loaded = {}
    sklearn_scores = []
    # Load order fixes the column order of ``sklearn_output``.
    for name in ('MNB', 'BernoulliNB', 'LogisticRegression', 'LinearSVC',
                 'NuSVC'):
        file_path = os.path.join(
            dir_path,
            source + '/pickled/' + type + '/' + name + '_classifier.pickle')
        loaded[name] = _load_pickled_classifier(file_path)
        percent = nltk.classify.accuracy(loaded[name], testfeats) * 100
        print(percent)
        sklearn_scores.append(round(percent, 1))

    voted_classifier = VoteClassifier(loaded['NuSVC'],
                                      loaded['LinearSVC'],
                                      loaded['MNB'],
                                      loaded['BernoulliNB'],
                                      loaded['LogisticRegression'])
    voted = nltk.classify.accuracy(voted_classifier, testfeats) * 100
    print(voted)
    sklearn_scores.append(round(voted, 1))

    nltk_output = "nlt, " + ", ".join(
        str(value)
        for value in (accuracy, pos_prec, neg_prec, pos_rec, neg_rec)) + "\n"
    sklearn_output = "skl, " + ", ".join(
        str(value) for value in sklearn_scores) + "\n"
    return (nltk_output, sklearn_output)
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# Corpus of .txt files; the category is taken from the leading "neg/" or
# "pos/" path component of each file id.
mydir = 'Documents/Plab/Project4/subset/test/neg'
mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')

stop = stopwords.words('english')
# Set copy so the per-token stop-word test is a hash lookup, not a list scan.
_stop_set = frozenset(stop)


def _content_tokens(fileid):
    """Return the tokens of ``fileid`` that are neither stop words nor punctuation."""
    kept = []
    for w in mr.words(fileid):
        lowered = w.lower()
        if lowered not in _stop_set and lowered not in string.punctuation:
            kept.append(w)
    return kept


# One (tokens, tag) pair per file; the tag is the first path component.
documents = [(_content_tokens(i), i.split('/')[0]) for i in mr.fileids()]

# The 100 most frequent tokens are the feature vocabulary.  ``most_common``
# replaces ``keys()[:100]``: on Python 3 ``keys()`` cannot be sliced, and on
# NLTK 3 it is not ordered by frequency.
_token_freq = FreqDist(chain.from_iterable(tokens for tokens, _ in documents))
word_features = [word for word, _ in _token_freq.most_common(100)]

numtrain = int(len(documents) * 90 / 100)


def _featurize(labelled_docs):
    """Return ({word: present?}, tag) pairs over the feature vocabulary."""
    rows = []
    for tokens, tag in labelled_docs:
        present = set(tokens)
        rows.append(({w: (w in present) for w in word_features}, tag))
    return rows


train_set = _featurize(documents[:numtrain])
test_set = _featurize(documents[numtrain:])

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
def feature_classification(feature_select):
    """Return the mean share of positively classified segments per test sentence.

    A Naive Bayes classifier is trained on the lines of the module-level
    ``positive_file`` (label 'pos') and ``negative_file`` (label 'neg'),
    with words in ``stoplist`` removed.  Every entry of ``testSentences`` is
    split on '.', each non-empty segment is classified, and the fraction of
    segments labelled 'pos' is that entry's score.

    Args:
        feature_select: callable turning a list of tokens into a feature dict.

    Returns:
        The average of the per-entry scores, or 0.0 when there is no entry
        with at least one non-empty segment.
    """
    tokenize = re.compile(r"[\w']+|[.,!?;]").findall

    def training_features(path, label):
        """Return one (features, label) pair per line of ``path``."""
        pairs = []
        with open(path, 'r') as lines:
            for line in lines:
                words = [w for w in tokenize(line.rstrip())
                         if w not in stoplist]
                pairs.append((feature_select(words), label))
        return pairs

    trainFeatures = (training_features(positive_file, 'pos') +
                     training_features(negative_file, 'neg'))

    # Group features per test entry.  Empty segments (e.g. after a trailing
    # '.') are dropped: they used to be excluded from the segment count but
    # still produced a feature row, which shifted every later entry.
    #
    # NOTE(review): the original looped ``k = k.lower()`` over the tokens,
    # which had no effect, so test tokens are not lower-cased (matching the
    # training tokens).  Confirm whether case folding was intended on both.
    per_sentence_features = []
    for sentence in testSentences:
        segments = [segment for segment in sentence.split('.') if segment]
        if segments:
            per_sentence_features.append(
                [feature_select(tokenize(segment)) for segment in segments])

    classifier = NaiveBayesClassifier.train(trainFeatures)

    probability = []
    for segment_features in per_sentence_features:
        positives = sum(1 for features in segment_features
                        if classifier.classify(features) == 'pos')
        probability.append(float(positives) / len(segment_features))

    if not probability:
        return 0.0
    return sum(probability) / len(probability)
def _pickle_classifier(classifier, path):
    """Pickle ``classifier`` to ``path``, closing the file even if dumping fails."""
    with open(path, "wb") as handle:
        pickle.dump(classifier, handle)


def train_classifiers(posFeatures, negFeatures):
    """Train, evaluate and pickle a set of sentiment classifiers.

    The first 3/4 of each feature list is used for training and the rest for
    testing.  An NLTK Naive Bayes classifier and eight scikit-learn models
    (wrapped in ``SklearnClassifier``) are trained; each one's accuracy is
    printed and the model is pickled under ``pickled_algos/``, which must
    already exist.  Finally a ``VoteClassifier`` over seven of the models is
    scored (it is not pickled).

    Args:
        posFeatures: labelled feature sets for the positive class.
        negFeatures: labelled feature sets for the negative class.

    Returns:
        ``(trainFeatures, testFeatures)``, the split that was used.
    """
    # 3/4 for training, 1/4 for testing (floor division).
    posCutoff = len(posFeatures) * 3 // 4
    negCutoff = len(negFeatures) * 3 // 4
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    print("----------------Naive Bayes Classifier-----------")
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # Index sets per gold label and per predicted label, for precision/recall.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        testSets[classifier.classify(features)].add(i)

    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('Original Naive Bayes Accuracy:',
          (nltk.classify.util.accuracy(classifier, testFeatures)) * 100)
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
    _pickle_classifier(classifier, "pickled_algos/originalnaivebayes.pickle")

    # (key, estimator class, accuracy message, pickle path), in training order.
    # A gradient-boosting model was previously listed here but disabled.
    sklearn_specs = [
        ("MNB", MultinomialNB,
         "MNB_classifier accuracy percent:",
         "pickled_algos/MNB_classifier.pickle"),
        ("BernoulliNB", BernoulliNB,
         "BernoulliNB_classifier accuracy percent:",
         "pickled_algos/BernoulliNB_classifier.pickle"),
        ("LogisticRegression", LogisticRegression,
         "LogisticRegression_classifier accuracy percent:",
         "pickled_algos/LogisticRegression_classifier.pickle"),
        ("LinearSVC", LinearSVC,
         "LinearSVC_classifier accuracy percent:",
         "pickled_algos/LinearSVC_classifier.pickle"),
        ("SGDC", SGDClassifier,
         "SGDClassifier accuracy percent:",
         "pickled_algos/SGDC_classifier.pickle"),
        ("Dec_Tree", DecisionTreeClassifier,
         "DecisionTreeClassifier Accuracy:",
         "pickled_algos/decision_tree.pickle"),
        ("Random_Forest", RandomForestClassifier,
         "Random Forest Classifier Accuracy:",
         "pickled_algos/random_forest.pickle"),
        ("Ada_Boost", AdaBoostClassifier,
         "Ada Boost Classifier Accuracy:",
         "pickled_algos/Ada_Boost.pickle"),
    ]

    trained = {}
    for key, estimator_cls, message, pickle_path in sklearn_specs:
        model = SklearnClassifier(estimator_cls())
        model.train(trainFeatures)
        print(message, (nltk.classify.accuracy(model, testFeatures)) * 100)
        _pickle_classifier(model, pickle_path)
        trained[key] = model

    # SGDC and the decision tree are trained and pickled but do not vote.
    voted_classifier = VoteClassifier(classifier,
                                      trained["LinearSVC"],
                                      trained["MNB"],
                                      trained["BernoulliNB"],
                                      trained["LogisticRegression"],
                                      trained["Random_Forest"],
                                      trained["Ada_Boost"])
    print("Voted classifier accuracy percent:",
          (nltk.classify.accuracy(voted_classifier, testFeatures)) * 100)
    # The voted classifier could not be pickled. Check this later!

    return trainFeatures, testFeatures
'Negative') for f in minus_filenum] threshold_fact = 0.8 threshold_pluspts = int(threshold_fact * len(feature_pluspts)) threshold_minuspts = int(threshold_fact * len(feature_minuspts)) feature_training = feature_pluspts[: threshold_pluspts] + feature_minuspts[: threshold_minuspts] feature_testing = feature_pluspts[threshold_pluspts:] + feature_minuspts[ threshold_minuspts:] print "\nNumber of training datapoints:", len(feature_training) print "Number of test datapoints:", len(feature_testing) # Train a Naive Bayes classifiers classifiers = NaiveBayesClassifier.train(feature_training) print "\nAccuracy of the classifiers:", nltk.classify.util.accuracy( classifiers, feature_testing) print "\nTop 10 most informative words:" for item in classifiers.most_informative_features()[:10]: print item[0] # Sample input reviews in_reviews = [ "The Movie was amazing", "the movie was dull. I would never recommend it to anyone.", "The cinematography is pretty great in the movie", "The direction was horrible and the story was all over the place" ]
'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)' ] negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':('] neutral_vocab = [ 'movie', 'the', 'sound', 'was', 'is', 'actors', 'did', 'know', 'words', 'not' ] positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab] negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab] neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab] train_set = negative_features + positive_features + neutral_features classifier = NaiveBayesClassifier.train(train_set) # Predict neg = 0 pos = 0 sentence = "Movie is not good, i hate this movie" sentence = sentence.lower() words = sentence.split(' ') for word in words: classResult = classifier.classify(word_feats(word)) print(classResult) if classResult == 'neg': neg = neg + 1 if classResult == 'pos': pos = pos + 1
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews


def word_features(words):
    """Return a bag-of-words feature dict mapping every word in ``words`` to True."""
    return dict.fromkeys(words, True)


neg_ids = movie_reviews.fileids('neg')
pos_ids = movie_reviews.fileids('pos')

# One (features, label) pair per review file.
neg_features = [(word_features(movie_reviews.words(fileids=[f])), 'neg')
                for f in neg_ids]
pos_features = [(word_features(movie_reviews.words(fileids=[f])), 'pos')
                for f in pos_ids]

# First 3/4 of each class for training, the rest for testing.  Floor division
# keeps the cutoffs integers on Python 3 as well; "/" would give a float
# there, which cannot be used as a slice index.
neg_cutoff = len(neg_features) * 3 // 4
pos_cutoff = len(pos_features) * 3 // 4

training_features = neg_features[:neg_cutoff] + pos_features[:pos_cutoff]
test_features = neg_features[neg_cutoff:] + pos_features[pos_cutoff:]

# Single-argument print calls produce the same text as the former Python 2
# print statements and also run on Python 3.
print('Training on %d instances, testing on %d instances' %
      (len(training_features), len(test_features)))

classifier = NaiveBayesClassifier.train(training_features)
print('Accuracy:  %s' %
      (nltk.classify.util.accuracy(classifier, test_features),))
classifier.show_most_informative_features()
def _read_lines(path):
    """Return the text of ``path`` split on newlines, closing the file afterwards."""
    with open(path, 'r') as handle:
        return re.split(r'\n', handle.read())


def evaluate_features(feature_select):
    """Train a Naive Bayes sentiment classifier and print its test metrics.

    Training data comes from ``positive-words.txt`` and ``negative-words.txt``
    (one leading word per line, labels '+' / '-') and from the pre-labelled,
    POS-tagged lines of ``All_Classified.txt``, whose first character is the
    label ('-', '+', '*' mixed, '=' neutral).  Test data comes from
    ``test_dummy.txt``.  Only 1/20 of the neutral examples are used for
    training, and mixed examples are not used at all.

    Every training entry is also appended to the module-level lists
    ``POS_Words``, ``NEG_Words``, ``MIX_Words`` or ``NEUTRAL_Words``.

    Args:
        feature_select: callable turning a list of tokens into a feature dict.

    Prints accuracy plus precision / recall / F-measure for '+' and '-';
    returns nothing.
    """
    stopWords = set(stopwords.words("english"))

    pos_Feautures = []
    neg_Feautures = []
    neutral_Feautures = []
    test_Feautures = []

    # label -> (module-level list, local training list or None,
    #           whether the raw tagged tokens are added as a second example)
    training_targets = {
        '-': (NEG_Words, neg_Feautures, True),
        '+': (POS_Words, pos_Feautures, True),
        '*': (MIX_Words, None, False),
        '=': (NEUTRAL_Words, neutral_Feautures, False),
    }

    # Sentiment lexicons: the leading word of each line is one example.
    for lexicon_path, label in (('positive-words.txt', '+'),
                                ('negative-words.txt', '-')):
        word_store, feature_store, _ = training_targets[label]
        for line in _read_lines(lexicon_path):
            words = re.findall(r"^[\w']+", line)
            if words:
                entry = [feature_select(words), label]
                word_store.append(entry)
                feature_store.append(entry)

    # Pre-labelled, tagged sentences.
    # NOTE(review): ``[^(NN|NNS|NNP|PRP)]`` is a character class (it excludes
    # the characters "(", "N", "|", "S", "P", "R", ")"), not an alternation
    # of tag names -- confirm this is what was intended.
    for line in _read_lines('All_Classified.txt'):
        tagged = re.findall(
            r"^[-=+\*]|[\w']+[/]?[\w']+[/]+[^(NN|NNS|NNP|PRP)]+ [.,!?;]*",
            line)
        untagged = re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*|', '', ' '.join(tagged))
        if not (untagged and tagged):
            continue
        label = untagged[0]
        if label not in training_targets:
            continue
        word_store, feature_store, include_tagged = training_targets[label]

        untagged_Words = re.findall(r"[\w']+|[.,!?;]", untagged)
        filtered_Words = [w for w in untagged_Words
                          if w.lower() not in stopWords]
        entry = [feature_select(filtered_Words), label]
        word_store.append(entry)
        if feature_store is not None:
            feature_store.append(entry)

        if include_tagged:
            # The raw tagged tokens (label symbol included) form a second
            # training example for '+' and '-'.
            tagged_entry = [feature_select(tagged), label]
            word_store.append(tagged_entry)
            feature_store.append(tagged_entry)

    # Test sentences: only '-', '+' and '=' lines are evaluated.
    for line in _read_lines('test_dummy.txt'):
        tagged = re.findall(
            r"^[-=+\*]|[\w']+[/]?[\w']+[/]+[\w']+ [.,!?;]*", line)
        untagged = re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*', '', line)
        if untagged and tagged and untagged[0] in ('-', '+', '='):
            untagged_Words = re.findall(r"[\w']+|[.,!?;]", untagged)
            filtered_Words = [w for w in untagged_Words
                              if w.lower() not in stopWords]
            test_Feautures.append(
                [feature_select(filtered_Words), untagged[0]])

    neutralCutoff = len(neutral_Feautures) // 20
    trainFeatures = (pos_Feautures + neg_Feautures +
                     neutral_Feautures[:neutralCutoff])

    classifier = NaiveBayesClassifier.train(trainFeatures)

    # Index sets per gold label and per predicted label.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(test_Feautures):
        referenceSets[label].add(i)
        testSets[classifier.classify(features)].add(i)

    # The training count is the size of the set actually passed to the
    # classifier; it used to report the number of lines read from
    # All_Classified.txt.  Single-argument print calls give the same text on
    # Python 2 and 3.
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(test_Feautures)))
    print('accuracy: %s' %
          (nltk.classify.util.accuracy(classifier, test_Feautures),))
    for name, label in (('pos', '+'), ('neg', '-')):
        reference = referenceSets[label]
        predicted = testSets[label]
        print('%s precision: %s' %
              (name, nltk.metrics.precision(reference, predicted)))
        print('%s recall: %s' %
              (name, nltk.metrics.recall(reference, predicted)))
        print('%s f-measure: %s' %
              (name, nltk.metrics.f_measure(reference, predicted)))
def NaiveBayesClassification(self, train_features, test_features): # Training and finding accuracy of NaiveBayes Classifier #Training self.nb_classifier = NaiveBayesClassifier.train(train_features)