def train(test=False): negids = movie_reviews.fileids('neg') posids = movie_reviews.fileids('pos') negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids] posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids] if(test): negcutoff = len(negfeats)*3/4 poscutoff = len(posfeats)*3/4 trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)) classifier = NaiveBayesClassifier.train(trainfeats) print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats) classifier.show_most_informative_features() else: return NaiveBayesClassifier.train(negfeats+posfeats)
def train_and_show_results(pos, neg, pos_bigrams, neg_bigrams, pos_control, neg_control, pos_control_bigrams, neg_control_bigrams): if pos_control == None or neg_control == None or pos_control_bigrams == None or neg_control_bigrams == None: negcutoff = len(neg)*3/4 poscutoff = len(pos)*3/4 neg_bigrams_cutoff = len(neg_bigrams)*3/4 pos_bigrams_cutoff = len(pos_bigrams)*3/4 test_bag_of_words = neg[negcutoff:] + pos[poscutoff:] test_bigrams = neg_bigrams[neg_bigrams_cutoff:] + pos_bigrams[pos_bigrams_cutoff:] train_corpora_bag_of_words = neg[:negcutoff] + pos[:poscutoff] train_corpora_bigrams = neg_bigrams[:neg_bigrams_cutoff] + pos_bigrams[:pos_bigrams_cutoff] else: test_bag_of_words = neg_control + pos_control test_bigrams = neg_control_bigrams + pos_control_bigrams train_corpora_bag_of_words = neg+pos train_corpora_bigrams = neg_bigrams + pos_bigrams print "negative corpus: ", len(neg) print "positive corpus: ", len(pos) if neg_control != None: print "negative test corpus: ", len(neg_control) print "positive test corpus: ", len(pos_control) print 'bag of words and bigrams - Naive Bayes' naive_bayes = NaiveBayesClassifier.train(train_corpora_bag_of_words) naive_bayes_bigrams = NaiveBayesClassifier.train(train_corpora_bigrams) save_dataset('naive_bayes.dat', naive_bayes) save_dataset('naive_bayes_bigrams.dat', naive_bayes_bigrams) print 'bag of words and bigrams - Maximum Entropy' maximum_entropy = nltk.MaxentClassifier.train(train_corpora_bag_of_words, max_iter=2) maximum_entropy_bigrams = nltk.MaxentClassifier.train(train_corpora_bigrams, max_iter=2) save_dataset('maximum_entropy.dat', maximum_entropy) save_dataset('maximum_entropy_bigrams.dat', maximum_entropy_bigrams) print 'Naive Bayesian results' print 'bag of words' print 'Accuracy:', nltk.classify.util.accuracy(naive_bayes, test_bag_of_words) naive_bayes.show_most_informative_features() print_precision_recall(naive_bayes, test_bag_of_words) print '\nbigrams' print 'Accuracy:', 
nltk.classify.util.accuracy(naive_bayes_bigrams, test_bigrams) naive_bayes_bigrams.show_most_informative_features() print_precision_recall(naive_bayes_bigrams, test_bigrams) print 'Maximum Entropy results' print 'bag of words' print 'Accuracy:', nltk.classify.util.accuracy(maximum_entropy, test_bag_of_words) maximum_entropy.show_most_informative_features() print_precision_recall(maximum_entropy, test_bag_of_words) print '\nbigrams' print 'Accuracy:', nltk.classify.util.accuracy(maximum_entropy_bigrams, test_bigrams) maximum_entropy_bigrams.show_most_informative_features() print_precision_recall(maximum_entropy_bigrams, test_bigrams)
def main(argv): negids = movie_reviews.fileids('neg') posids = movie_reviews.fileids('pos') #print negids negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids] posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids] trainfeats = posfeats+negfeats #print trainfeats # break classifier = NaiveBayesClassifier.train(trainfeats) #classifier = pickle.load(open("classifier.p", "rb")) topicList = ["media", "sports", "news", "fashion", "finance", "politics"] for line in sys.stdin: try: tolk_posset = word_tokenize(line.rstrip()) d = word_feats(tolk_posset) for topic in topicList: subjectFull = subj(line, topic) if not subjectFull == "No match": #print d print "LongValueSum:" + "" + str(line.split(":")[0])+","+subjectFull + "," + classifier.classify(d) + "\t" + "1" except: #print "Error" continue
def init(): # create our dict of training data texts = {} texts['traffic'] = 'traffic-corpus.txt' texts['useless'] = 'useless-corpus.txt' #holds a dict of features for training our classifier train_set = [] # loop through each item, grab the text, tokenize it and create a training feature with it for sense, file in texts.iteritems(): print "training %s " % sense text = open(file, 'r').read() features = extract_words(text) train_set = train_set + [(get_feature(word), sense) for word in features] classifier = NaiveBayesClassifier.train(train_set) # uncomment out this line to see the most informative words the classifier will use classifier.show_most_informative_features(20) # uncomment out this line to see how well our accuracy is using some hand curated tweets # run_classifier_tests(classifier) return classifier
def evaluate_features(feature_select): posFeatures = [] negFeatures = [] inposFeatures = [] innegFeatures = [] #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list with open(RT_POLARITY_POS_FILE, 'r') as posSentences: for i in posSentences: posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) posWords = [feature_select(posWords), 'pos'] posFeatures.append(posWords) with open(RT_POLARITY_NEG_FILE, 'r') as negSentences: for i in negSentences: negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) negWords = [feature_select(negWords), 'neg'] negFeatures.append(negWords) """ with open(RT_INPUT_POS_FILE, 'r') as posSentences: for i in posSentences: inposWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) inposWords = [feature_select(inposWords), 'pos'] inposFeatures.append(inposWords) """ with open(RT_INPUT_NEG_FILE, 'r') as negSentences: for i in negSentences: innegWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) innegWords = [feature_select(innegWords), 'neg'] innegFeatures.append(innegWords) #selects 3/4 of the features to be used for training and 1/4 to be used for testing #posCutoff = int(math.floor(len(posFeatures)*3/4)) #negCutoff = int(math.floor(len(negFeatures)*3/4)) trainFeatures = posFeatures + negFeatures testFeatures = innegFeatures #+ inposFeatures #trains a Naive Bayes Classifier classifier = NaiveBayesClassifier.train(trainFeatures) #initiates referenceSets and testSets referenceSets = collections.defaultdict(set) testSets = collections.defaultdict(set) fileOutput ={'key':[],'pos':[],'neg':[]} #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets for i, (features, label) in enumerate(testFeatures): #print features , label referenceSets[label].add(i) predicted = classifier.prob_classify(features) print "\n" fileOutput['key'].append(i) 
fileOutput['pos'].append(predicted.prob("pos")) fileOutput['neg'].append(predicted.prob("neg")) #posValues = predicted.prob("pos") #negValues = predicted.prob("neg") fileOutput.values() testSets[predicted].add(i) #print i #print testSets[predicted] return fileOutput
def train():
    """Train a Naive Bayes classifier on documents that have a sentiment.

    Documents whose sentiment is still null are first passed to
    ``get_impact``. The documents with a sentiment are then split by id: the
    lowest-id 2/3 are used for training, the highest-id 1/3 for testing.

    :return: ``(classifier, accuracy)``, or ``(None, 0)`` when no document has
        a sentiment.
    """
    # Compute the impact for documents that do not have one yet.
    for pending in Document.objects.filter(sentiment__isnull=True):
        get_impact(pending, settings.TIME)

    known_data = Document.objects.filter(sentiment__isnull=False)
    total = known_data.count()
    if total == 0:
        print('known_data_count == 0')
        return None, 0

    def featurize(documents):
        # One (features, sentiment) pair per document.
        return [(word_feats(get_nltktext(doc.text)), doc.sentiment) for doc in documents]

    # 2/3 of the data, taken from the low-id end, for training.
    train_size = int(round(2 * total / 3))
    training_feats = featurize(known_data.order_by('id')[:train_size])
    classifier = NaiveBayesClassifier.train(training_feats)

    # 1/3 of the data, taken from the high-id end, for testing.
    test_size = int(round(total / 3))
    testing_feats = featurize(known_data.order_by('-id')[:test_size])

    print('train on %d instances, test on %d instances' % (len(training_feats), len(testing_feats)))
    accuracy = nltk.classify.util.accuracy(classifier, testing_feats)
    return classifier, accuracy
def classify_and_evaluate(reviews, feature_extractor=word_feats): random.shuffle(reviews) pos_reviews = filter(lambda x: x['class'] == 'POSITIVE', reviews) neg_reviews = filter(lambda x: x['class'] == 'NEGATIVE', reviews) # get unique features pos_features = [] neg_features = [] for review in pos_reviews: split_reviews = review['text'].split(' ') split_reviews = [x for x in split_reviews if x] pos_features.append((feature_extractor(split_reviews), 'pos')) for review in neg_reviews: split_reviews = review['text'].split(' ') split_reviews = [x for x in split_reviews if x] neg_features.append((feature_extractor(split_reviews), 'neg')) # divide groups pos_offset = int(math.floor(len(pos_reviews) * 3 / 4)) neg_offset = int(math.floor(len(neg_reviews) * 3 / 4)) training = pos_features[:pos_offset] + neg_features[:neg_offset] testing = pos_features[pos_offset:] + neg_features[neg_offset:] # train classifier classifier = NaiveBayesClassifier.train(training) print 'treinada em %d reviews, testada em %d reviews' % (len(training), len(testing)) print 'accuracy:', nltk.classify.util.accuracy(classifier, testing) classifier.show_most_informative_features()
def main():
    """Annotate every review in output.json with a Naive Bayes sentiment.

    A classifier is trained on the first 3/4 of each category of the NLTK
    ``movie_reviews`` corpus. 'output.json' is expected to map keys to objects
    holding a "reviews" list whose items have a "review" text. Each item
    receives a "sentiment" dict with 'positive_probability',
    'negative_probability' and 'label'. The result is written to
    'out_with_sentiment.json'.
    """
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in movie_reviews.fileids('neg')]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in movie_reviews.fileids('pos')]
    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    # Only the first 3/4 of each class is used; the remainder stays unused here.
    classifier = NaiveBayesClassifier.train(negfeats[:negcutoff] + posfeats[:poscutoff])

    with open("output.json") as fin:
        data = json.load(fin)

    for key in data:
        for review in data[key]["reviews"]:
            # Extract the features once and reuse them for both calls.
            feats = word_feats(review["review"].split(" "))
            prob = classifier.prob_classify(feats)
            review["sentiment"] = {
                'positive_probability': prob.prob('pos'),
                'label': classifier.classify(feats),
                'negative_probability': prob.prob('neg'),
            }

    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
def evaluate_classifier(featx): negids = movie_reviews.fileids('neg') posids = movie_reviews.fileids('pos') negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids] posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids] negcutoff = len(negfeats)*3/4 poscutoff = len(posfeats)*3/4 trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] classifier = NaiveBayesClassifier.train(trainfeats) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats) print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']) print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']) print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']) print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']) classifier.show_most_informative_features()
def train_with_movie_db(self):
    """Train ``self.classifier`` on the NLTK ``movie_reviews`` corpus.

    Training with movie reviews is possible, but it does not yield
    particularly good results.

    Sets ``self.use_movie_reviews`` to True, trains on the first 3/4 of each
    category (labels "negative" / "positive") and logs the split sizes and the
    accuracy on the remaining 1/4 through ``DLOG``.
    """
    self.use_movie_reviews = True
    extract = self.feature_extraction_movie_reviews

    negfeats = [(extract(movie_reviews.words(fileids=[f])), "negative")
                for f in movie_reviews.fileids('neg')]
    posfeats = [(extract(movie_reviews.words(fileids=[f])), "positive")
                for f in movie_reviews.fileids('pos')]

    # Floor division: a plain '/' yields a float under true division, which
    # cannot be used as a slice index.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)
    DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
    # show_most_informative_features() prints its table itself and returns
    # None, so wrapping it in DLOG() only handed None to the logger.
    self.classifier.show_most_informative_features()
def main():
    """Train and evaluate a Naive Bayes classifier on the conversations corpus.

    Features come from ``get_training_features`` applied to the hard-coded
    conversations file. The first 3/4 of the feature sets (in the order
    returned, no shuffling) are used for training and the rest for testing.
    Split sizes, accuracy and the most informative features are printed.
    """
    text_file = '/Users/nasrallah/Desktop/Insight/courtcast/data/supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt'

    # Extract the feature sets.
    feature_sets = get_training_features(text_file)

    # Separate into train and test sets at the 3/4 mark.
    cutoff = int(len(feature_sets) * 3 / 4)
    train_feature_sets, test_feature_sets = feature_sets[:cutoff], feature_sets[cutoff:]
    print('train on %d instances, test on %d instances' % (len(train_feature_sets), len(test_feature_sets)))

    classifier = NaiveBayesClassifier.train(train_feature_sets)
    print('accuracy:', nltk.classify.util.accuracy(classifier, test_feature_sets))
    classifier.show_most_informative_features()
def classification(self): fstruct = FeatStruct(self.train_reviews) classifier = NaiveBayesClassifier.train(fstruct) print 'accuracy:', nltk.classify.util.accuracy(classifier, self.test_reviews) classifier.show_most_informative_features()
def evaluate_classifier(featx): #negids = movie_reviews.fileids('neg') #posids = movie_reviews.fileids('pos') ##For Movie Review train: #negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids] #posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids] ##For product reviews train: negfeats = [(featx([wrd for wrd in nltk.word_tokenize(con) if wrd not in stpwrds]), 'neg') for con in traincons] posfeats = [(featx([wrd for wrd in nltk.word_tokenize(pro) if wrd not in stpwrds]), 'pos') for pro in trainpros] negcutoff = len(negfeats)*3/4 poscutoff = len(posfeats)*3/4 trainfeats = negfeats[:] + posfeats[:] #trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] classifier = NaiveBayesClassifier.train(trainfeats) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats) print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']) print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']) print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']) print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']) classifier.show_most_informative_features() return classifier
def evaluate_classifier_Naive(featx):
    """Train Naive Bayes on the ``train`` corpus and score it on ``test``.

    Both corpora are read through their 'neg' and 'pos' categories. The 50
    most informative features are printed.

    :param featx: callable turning a document's word sequence into features.
    :return: list ``['NaiveBayes', accuracy, pos precision, pos recall,
        neg precision, neg recall]``.
    """
    def labelled(corpus, label):
        # (features, label) for every file of the given category.
        return [(featx(corpus.words(fileids=[fid])), label) for fid in corpus.fileids(label)]

    trainfeats = labelled(train, 'neg') + labelled(train, 'pos')
    testfeats = labelled(test, 'neg') + labelled(test, 'pos')

    Naive_classifier = NaiveBayesClassifier.train(trainfeats)

    refsets = collections.defaultdict(set)
    testsets_Naive = collections.defaultdict(set)
    for index, (feats, gold) in enumerate(testfeats):
        refsets[gold].add(index)
        testsets_Naive[Naive_classifier.classify(feats)].add(index)

    accuracy1 = nltk.classify.util.accuracy(Naive_classifier, testfeats)
    scores = []
    for label in ('pos', 'neg'):
        scores.append(nltk.metrics.precision(refsets[label], testsets_Naive[label]))
        scores.append(nltk.metrics.recall(refsets[label], testsets_Naive[label]))

    Naive_classifier.show_most_informative_features(50)
    return ['NaiveBayes', accuracy1] + scores
def __init__(self): # neg_phrases = filter_negative_phrases(load_csv_sentences('thoughtsandfeelings.csv')) # pos_phrases = filter_positive_phrases(load_csv_sentences('spiritualforums.csv')) neg_file = open("neg_phrases.txt", "r") pos_file = open("pos_phrases.txt", "r") neg_phrases = neg_file.readlines() pos_phrases = pos_file.readlines() neg_phrases_tagged = [] pos_phrases_tagged = [] for phrase in neg_phrases: neg_phrases_tagged.append((word_feats(phrase.split()), 'suicidal')) for phrase in pos_phrases: pos_phrases_tagged.append((word_feats(phrase.split()), 'alright')) negcutoff = int(len(neg_phrases_tagged) * .8) poscutoff = int(len(pos_phrases_tagged) * .8) trainfeats = neg_phrases_tagged[:negcutoff] + pos_phrases_tagged[:poscutoff] testfeats = neg_phrases_tagged[negcutoff:] + pos_phrases_tagged[poscutoff:] print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)) self.classifier = NaiveBayesClassifier.train(trainfeats) print 'accuracy:', nltk.classify.util.accuracy(self.classifier, testfeats) self.classifier.show_most_informative_features()
def cross_validation(self):
    """Run ``self.k_fold``-fold cross validation for Naive Bayes and a linear SVM.

    ``self.training_feats`` is cut into ``self.k_fold`` consecutive folds of
    equal size. For each fold both classifiers are trained on the other folds
    and scored on the held-out one (accuracy, plus the value returned by
    ``self.compute_measures``). The averages are written with
    ``self.logging.info``. ``self.nb_classifier`` and ``self.svm_classifier``
    are left holding the classifiers of the last fold.
    """
    feats = self.training_feats
    fold_size = int(int(len(feats)) / self.k_fold)

    nb_accuracies, svm_accuracies = [], []
    nb_f_values, svm_f_values = [], []

    for fold in range(self.k_fold):
        lower = fold * fold_size
        upper = lower + fold_size
        train_features = feats[:lower] + feats[upper:]
        test_features = feats[lower:upper]

        self.nb_classifier = NaiveBayesClassifier.train(train_features)
        nb_accuracies.append(nltk.classify.util.accuracy(self.nb_classifier, test_features))

        self.svm_classifier = SklearnClassifier(LinearSVC())
        self.svm_classifier.train(train_features)
        svm_accuracies.append(nltk.classify.util.accuracy(self.svm_classifier, test_features))

        # F-measure of each classifier on the held-out fold.
        nb_f_values.append(self.compute_measures(test_features, self.nb_classifier))
        svm_f_values.append(self.compute_measures(test_features, self.svm_classifier))

    def average(values):
        return float(sum(values) / len(values))

    self.logging.info('Average accuracy of Naive Bayes Classifier %s\n' % average(nb_accuracies))
    self.logging.info('Average accuracy of SVM Classifier %s\n' % average(svm_accuracies))
    self.logging.info('Average F measure of Naive Bayes Classifier %s\n' % average(nb_f_values))
    self.logging.info('Average F measure of SVM Classifier %s\n' % average(svm_f_values))
def naivebayes(trainfeats, testfeats): classifier = NaiveBayesClassifier.train(trainfeats) print "NaiveBayes output" print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)) print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats) print classifier.show_most_informative_features()
def evaluate_classifier(featx): sportsfeats = [(featx(tweet[0]), tweet[1]) for tweet in Sports_Tweet] politicsfeats = [(featx(tweet[0]), tweet[1]) for tweet in Politics_Tweet] sportscutoff = len(sportsfeats)*3/4 politicscutoff = len(politicsfeats)*3/4 trainfeats = sportsfeats[:sportscutoff] + politicsfeats[:politicscutoff] testfeats = sportsfeats[sportscutoff:] + politicsfeats[politicscutoff:] classifier = NaiveBayesClassifier.train(trainfeats) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats) print 'pos precision:', nltk.metrics.precision(refsets['Sports'], testsets['Sports']) print 'pos recall:', nltk.metrics.recall(refsets['Politics'], testsets['Politics']) print 'neg precision:', nltk.metrics.precision(refsets['Sports'], testsets['Sports']) print 'neg recall:', nltk.metrics.recall(refsets['Politics'], testsets['Politics']) classifier.show_most_informative_features() return classifier
def classification(self):
    """Train both classifiers on ``self.training_feats``.

    Stores a Naive Bayes classifier in ``self.nb_classifier`` and a linear SVM
    (wrapped in ``SklearnClassifier``) in ``self.svm_classifier``.
    """
    feats = self.training_feats

    # Naive Bayes
    self.nb_classifier = NaiveBayesClassifier.train(feats)

    # Linear SVM
    self.svm_classifier = SklearnClassifier(LinearSVC())
    self.svm_classifier.train(feats)
def main_function(): conn = MySQLdb.connect(host=DATABASES['ensemble']['HOST'], user=DATABASES['ensemble']['USER'], passwd=DATABASES['ensemble']['PASSWORD'], db=DATABASES['ensemble']['NAME']) training_tweets = classify.get_training_tweets(conn) training_feature_set = classify.process_tweets(training_tweets) classifier = NaiveBayesClassifier.train(training_feature_set) error_dict = {'+':0, '-':0, 'I':0, 'O':0} count_dict = {'+':0, '-':0, 'I':0, 'O':0} guess_dict = {'+':0, '-':0, 'I':0, 'O':0} count_table = {'+':0, '-':0, 'I':0, 'O':0} tweets = classify.get_tweets_to_classify(conn); for tweet in tweets: text = classify.get_tweet_text(conn, tweet[0])[0][0] guess = classifier.classify(classify.process_tweet(text)) classify.update_tweet_polarity(tweet[0], guess, conn) count_table[guess] += 1 #fix_manual_tweets(conn_analysis) classify.run_sql(conn, classify.Statements.UPDATE_MANUAL_CLASSIFIED) print count_table print full_matrix
def main(): articles = CategorizedPlaintextCorpusReader(corpusdir, '.*', cat_pattern = r'(.*)[/]') feats = {} trainfeats = [] testfeats = [] for cat in articles.categories(): wow = len([f for f in articles.fileids(cat)]) # such variable name print "for category", cat, ":", wow feats[cat] = [(word_feats(articles.words(fileids = [f])), cat) for f in articles.fileids(cat)] cutoff = wow - hold_back(wow) trainfeats.append(feats[cat][:cutoff]) testfeats.append(feats[cat][cutoff:]) train = [item for sublist in trainfeats for item in sublist] test = [item for sublist in testfeats for item in sublist] print 'train on %d instances, test on %d instances' % (len(train), len(test)) classifier = NaiveBayesClassifier.train(train) print 'accuracy:', nltk.classify.util.accuracy(classifier, test) classifier.show_most_informative_features() # I don't understand the output for more than 2 categories :( # load with: # import pickle # f = open('my_classifier.pickle') # classifier = pickle.load(f) # f.close() with open('../data/classifier.pickle', 'wb') as f: pickle.dump(classifier, f)
def evaluateFeatures(featureSelect): posFeatures = [] negFeatures = [] with open(RT_POLARITY_POS_FILE, 'r') as posSentences: for i in posSentences: posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) posWords = [featureSelect(posWords), 'pos'] posFeatures.append(posWords) with open(RT_POLARITY_NEG_FILE, 'r') as negSentences: for i in negSentences: negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip()) negWords = [featureSelect(negWords), 'neg'] negFeatures.append(negWords) posCutoff = int(math.floor(len(posFeatures)*3/4)) negCutoff = int(math.floor(len(negFeatures)*3/4)) #trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff] testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:] trainFeatures = posFeatures + negFeatures print testFeatures[0] classifier = NaiveBayesClassifier.train(trainFeatures) referenceSets = collections.defaultdict(set) testSets = collections.defaultdict(set) for i, (features, label) in enumerate(testFeatures): referenceSets[label].add(i) predicted = classifier.classify(features) #print features #print predicted testSets[predicted].add(i)
def classify(self): # Classify articles = Article.objects.filter(entity=self.entity) def word_feats(body): words = body.split(" ") return dict([(word, True) for word in words]) negids = articles.filter(score__lt=0) posids = articles.filter(score__gt=0) negfeats = [(word_feats(a.body), "neg") for a in negids] posfeats = [(word_feats(a.body), "pos") for a in posids] negcutoff = len(negfeats) * 3 / 4 poscutoff = len(posfeats) * 3 / 4 trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] print "train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)) classifier = NaiveBayesClassifier.train(trainfeats) print "accuracy:", nltk.classify.util.accuracy(classifier, testfeats) classifier.show_most_informative_features()
def generate_sentiment_classifier(corpus, word_feats): negids = corpus.fileids('neg') posids = corpus.fileids('pos') negfeats = [(word_feats(corpus.words(fileids=[f])), 'neg') for f in negids] posfeats = [(word_feats(corpus.words(fileids=[f])), 'pos') for f in posids] negcutoff = len(negfeats)*3/4 poscutoff = len(posfeats)*3/4 trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) classifier = NaiveBayesClassifier.train(trainfeats) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats) print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']) print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']) print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']) print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']) classifier.show_most_informative_features() return classifier
def __init_naive_bayes(self):
    """
    __init_naive_bayes(self):
    Gets the data from the positive, negative and neutral text files.
    Creates and trains the Naive Bayes classifier, using the data, so
    that it can learn what constitutes a positive, negative or neutral tweet.

    The files are read from the "sentiment_word_files" directory next to
    ``sys.path[0]``. Every whitespace-separated, lower-cased word accepted by
    ``self.__check_word`` becomes one training example labelled 'pos', 'neu'
    or 'neg'. The result is stored in ``self.classifier``.

    Raises ``Exception`` if reading the files or training fails.
    """
    def load_words(filename):
        # Lower-case the file, turn newlines into spaces and split it into
        # words. The ``with`` block closes the file; the previous code wrote
        # ``f.close`` without parentheses, so nothing was ever closed.
        path = pjoin(sys.path[0], "sentiment_word_files", filename)
        with codecs.open(path, mode="rU", encoding='utf-8') as f:
            return "".join(line.lower().replace("\n", " ") for line in f).split()

    labelled_files = (
        ("tweets_positive.txt", 'pos'),
        ("tweets_neutral.txt", 'neu'),
        ("tweets_negative.txt", 'neg'),
    )
    try:
        train_set = []
        for filename, label in labelled_files:
            train_set.extend(
                ({word.lower(): True}, label)
                for word in load_words(filename)
                if self.__check_word(word)
            )
        self.classifier = NaiveBayesClassifier.train(train_set)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer turned into a generic error.
        raise Exception ("Unknown error in SentimentAnalyzer::__init_naive_bayes")
def create_train_classifier(): print "Recreating training classifier" corpus_dir = nltk.data.find(TRAIN_DATASET_LOC) train_data = nltk.corpus.CategorizedPlaintextCorpusReader(corpus_dir, fileids='.*\.txt',cat_pattern="(pos|neg)") negids_train = train_data.fileids('neg') posids_train = train_data.fileids('pos') # negids_movies = movie_reviews.fileids('neg') # posids_movies = movie_reviews.fileids('pos') negfeats = [(__word_feats_neg(train_data.words(fileids=[f])), 'neg') for f in negids_train] posfeats = [(__word_feats_pos(train_data.words(fileids=[f])), 'pos') for f in posids_train] # negfeats.extend([(__word_feats_neg(movie_reviews.words(fileids=[f])), 'neg') for f in negids_movies]) # posfeats.extend([(__word_feats_pos(movie_reviews.words(fileids=[f])), 'pos') for f in posids_movies]) trainfeats = negfeats + posfeats classifier = NaiveBayesClassifier.train(trainfeats) pos_file_name = 'pickles'+os.sep+'positive_train.pickle' neg_file_name = 'pickles'+os.sep+'negative_train.pickle' class_file_name = 'pickles'+os.sep+'nbClassifier.pickle' __write_file(pos_file_name,cPickle.dumps(posfeats)) __write_file(neg_file_name,cPickle.dumps(negfeats)) __write_file(class_file_name,cPickle.dumps(classifier)) print "Done!"
def main():
    """Flag users whose names look like organisation names.

    A Naive Bayes classifier is trained on organisation names (label 'org')
    and on the names of users not yet marked as likely orgs (label 'user'),
    leaving out user names that exactly match an organisation name. Each of
    those users is then classified; when the prediction is 'org' and the
    log2 probability ratio reaches ``MIN_RATIO``, the user is saved with
    ``likely_org = True`` and ``org_probability`` set to the ratio, unless an
    admin classified the user as 'user'.
    """
    org_names = Org.objects.values_list('name', flat=True)
    users = User.objects.filter(likely_org=False)

    # Exclude the users we know are orgs (exact same name). This mostly
    # matters on the first run and for new users with org names.
    non_org_user_names = set(user.get_name for user in users) - set(org_names)

    org_features = [(word_features(name), 'org') for name in org_names]
    user_features = [(word_features(name), 'user') for name in non_org_user_names]
    classifier = NaiveBayesClassifier.train(user_features + org_features)

    counter = 0
    for user in users:
        prediction = classifier.prob_classify(word_features(user.get_name))
        if prediction.max() != 'org':
            continue

        # log2 of P(org) / P(user), each smoothed by NORMALIZING_CONST;
        # e.g. a quotient of 8.0 gives a ratio of 3.0.
        smoothed_org = float(prediction.prob('org')) + NORMALIZING_CONST
        smoothed_user = float(prediction.prob('user')) + NORMALIZING_CONST
        ratio = math.log(smoothed_org / smoothed_user, 2)

        if ratio >= MIN_RATIO and not user.likely_org and user.admin_classification != 'user':
            log.info('User ID %d with name "%s" is probably an org. Saving.' % (user.id, user.get_name))
            user.likely_org = True
            user.org_probability = ratio
            user.save()
        counter += 1

    log.info("Processed %d users with org-like names" % counter)
def classify(): #corpus = 'Cornell_text_polarity' #corpus = 'BingLiu_selected_sentences' corpus = 'Cornell_sentence_polarity' cases = load_corpus(corpus) features = get_word_features(cases) train_feats = [] test_feats = [] for polarity, feats in features.items(): #cutoff = len(feats) * 1 / 4 cutoff = 1000 print polarity, 'number of train:', cutoff #train_feats += feats[:cutoff] #test_feats += feats[cutoff:] temp_feats = feats[:] random.shuffle(temp_feats) train_feats += temp_feats[:cutoff] test_feats += temp_feats[cutoff:] print 'train on %d instances, test on %d instances' % (len(train_feats), len(test_feats)) classifier = NaiveBayesClassifier.train(train_feats) print 'accuracy:', nltk.classify.util.accuracy(classifier, test_feats) classifier.show_most_informative_features()
def naiveBayes(features_train, features_test): print 'train on %d instances, test on %d instances' % (len(features_train), len(features_test)) classifier = NaiveBayesClassifier.train(features_train) print 'accuracy:', nltk.classify.util.accuracy(classifier, features_test) classifier.show_most_informative_features() precisions, recalls = precision_recall(classifier, features_test) print "accuracy: ", precisions, "fitness: ", recalls
def train(self, graphs):
    """
    Fit a ``NaiveBayesClassifier`` on every (head, child) node pair of the
    given graphs and store it in ``self.classifier``.

    A pair is labeled ``"T"`` when the child's index is found in the head
    node's ``'deps'`` entry and ``"F"`` otherwise.  The feature vector holds
    the head word, head tag, child word and child tag under the keys
    ``a``, ``b``, ``c`` and ``d``.

    :type graphs: list(DependencyGraph)
    :param graphs: A list of dependency graphs to train the scorer.
    """
    from nltk.classify import NaiveBayesClassifier

    def make_example(head, child_index, child):
        # Decide the label first, then build the feature dictionary.
        label = "T" if child_index in head['deps'] else "F"
        features = {
            'a': head['word'],
            'b': head['tag'],
            'c': child['word'],
            'd': child['tag'],
        }
        return features, label

    # One example per ordered pair of nodes within each graph.
    labeled_examples = [
        make_example(head_node, child_index, child_node)
        for graph in graphs
        for head_node in graph.nodes.values()
        for child_index, child_node in graph.nodes.items()
    ]

    self.classifier = NaiveBayesClassifier.train(labeled_examples)
neg_words.append(({neg_word.rstrip(): True}, 'negative')) print "First 5 positive words %s " % pos_words[:5] print "First 5 negative words %s" % neg_words[:5] print "Number of positive words %d" % len(pos_words) print "Number of negative words %d" % len(neg_words) all_words_with_sentiment = pos_words + neg_words print "Total number of words %d" % len(all_words_with_sentiment) from nltk.classify import NaiveBayesClassifier classifier = NaiveBayesClassifier.train(all_words_with_sentiment) def to_dictionary(words): return dict([(word, True) for word in words]) test_data = [] def predict_sentiment(text, expected_sentiment=None): text_to_classify = to_dictionary(text.split()) result = classifier.classify(text_to_classify) test_data.append([text_to_classify, expected_sentiment]) return result
testFeats = None
# Collect per-category feature sets for both the training and test corpora.
for category in categories:
    instancesOfEntityTrain = getInstancesOfEntity(
        category, completeTaggedSentencesTrain)
    instancesOfEntityTest = getInstancesOfEntity(
        category, completeTaggedSentencesTest)
    entityFeatsTrain = train_feats(category, instancesOfEntityTrain)
    # Fix: the test features were previously built from the training
    # instances, so evaluation ran on training data.
    entityFeatsTest = train_feats(category, instancesOfEntityTest)
    if trainFeats is None:
        trainFeats = entityFeatsTrain
        testFeats = entityFeatsTest
    else:
        trainFeats += entityFeatsTrain
        testFeats += entityFeatsTest

features = prev_next_pos_iob

# Naive Bayes
naiveBayers = NaiveBayesClassifier.train(trainFeats)
# NOTE(review): an already-trained classifier is passed as
# ``classifier_builder``; confirm this tagger does not expect a callable.
naiveBayersTagger = ClassifierBasedTagger(
    train=completeTaggedSentencesTrain,
    feature_detector=features,
    classifier_builder=naiveBayers)
nerChunkerNaiveBayers = ClassifierChunker(completeTaggedSentencesTrain,
                                          naiveBayersTagger)
evalNaiveBayers = nerChunkerNaiveBayers.evaluate2(testFeats)
print(evalNaiveBayers)
def build_classifier(self):
    """Train a Naive Bayes classifier on ``self.training`` and return it."""
    return NaiveBayesClassifier.train(self.training)
# Share of each class that goes to training; the rest is held out.
threshold_factor = 0.8
threshold_positive, threshold_negative = (
    int(threshold_factor * len(class_features))
    for class_features in (features_positive, features_negative)
)

# Leading slice of each class trains the model, trailing slice tests it.
features_train = (features_positive[:threshold_positive]
                  + features_negative[:threshold_negative])
features_test = (features_positive[threshold_positive:]
                 + features_negative[threshold_negative:])
print("Number of training datapoints: ", len(features_train))
print("Number of test datapoints: ", len(features_test))

# Fit the Naive Bayes model and report accuracy on the held-out data.
classifier = NaiveBayesClassifier.train(features_train)
print("\nAccuracy of NBC: ",
      nltk.classify.util.accuracy(classifier, features_test))

# Print only the feature name of the first ten informative features.
print("\nTop 10 most informative words: ")
top_features = classifier.most_informative_features()[:10]
for item in top_features:
    print(item[0])

# Sample reviews to classify.
input_reviews = [
    "It is an amazing movie",
    "This is a dull movie. I would never recommend it to anyone.",
    "The cinematography is pretty great in this movie",
    "The direction was terrible and the story was all over the place",
]
) #Using only the contents in HTML <body> tag, avoides Javascript from being treated as text. words = html_data.findAll( text=True ) #setting text to True to extract only the text in the <body> word_list = [] #Stores the list of words for word in words[ 30:]: #Removing redundant content from Instapaper Mobilizer headers for w in word.split(" "): #splitting on spcae for multiword strings wd = (pattern.sub('', w.lower()) ) #substituing non alphanumeric characters with '' if len(wd) > 1 and not wd.isdigit(): word_list.append( wd) #exclude strings of less than 2 characters filtered_words = [ w for w in word_list if not w in nltk.corpus.stopwords.words('english') ] return filtered_words positive_examples = [ 'http://www.engadget.com/2012/11/16/htc-droid-dna-review/', 'http://www.engadget.com/2012/10/08/samsung-galaxy-note-ii-review/', 'http://www.engadget.com/2012/11/16/htc-desire-x-review/', 'http://www.engadget.com/2012/11/16/htc-desire-x-review/' ] train_set = [(list(get_list_of_words_in_url), True) for link in positive_examples] classifier = NaiveBayesClassifier.train(train_set) print get_list_of_words_in_url( 'http://www.theverge.com/2012/11/28/3699112/the-verge-year-one-our-big-stories-august-2012-through-november-2012' )
def evaluate_features(feature_select):
    """Train a Naive Bayes sentiment classifier and print its metrics.

    ``feature_select`` is called with a list of words (or tagged tokens) and
    its result is used as the feature set of a training or test example.

    Training examples come from ``positive-words.txt``,
    ``negative-words.txt`` and ``All_Classified.txt``; test examples come
    from ``cs583_test_data.txt``.  Labels are the strings ``'1'``, ``'-1'``,
    ``'2'`` and ``'0'``.  Examples are also appended to the module-level
    lists ``POS_Words``, ``NEG_Words``, ``MIX_Words`` and ``NEUTRAL_Words``.

    NOTE(review): the nesting of a few statements was reconstructed from a
    whitespace-collapsed source; see the inline notes.
    """
    # All variables
    tagged_Sentences = []
    untagged_Sentences = []
    neg_sentence = []
    pos_sentence = []
    mixed_sentence = []
    neutral_sentence = []
    neg_Feautures = []
    pos_Feautures = []
    mixed_Feautures = []
    neutral_Feautures = []
    test_sentence = []
    test_Feautures = []
    allwords = []
    tempPos = []
    stopWords = stopwords.words("english")

    # Reading positive words from txt file: the leading word of every line
    # becomes one training example labeled '1'.
    fileInput = open('positive-words.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()
    for i in sentences:
        posWords = re.findall(r"^[\w']+", i)
        if posWords:
            posWords = [feature_select(posWords), '1']
            POS_Words.append(posWords)
            pos_Feautures.append(posWords)

    # Reading negative words from txt file: same as above with label '-1'.
    fileInput = open('negative-words.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()
    for i in sentences:
        negWords = re.findall(r"^[\w']+", i)
        if negWords:
            negWords = [feature_select(negWords), '-1']
            NEG_Words.append(negWords)
            neg_Feautures.append(negWords)

    # Reading pre-labeled input and splitting into lines
    fileInput = open('All_Classified.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()
    for i in sentences:
        # First alternative: one leading character from the class; second
        # alternative: slash-separated tokens followed by a space.
        tagged = re.findall(r"^[012\(-1)]|[\w']+[/]?[\w']+[/]+[\w']+ [.,!?;]*", i)
        # Strip "/..." suffixes, digits and punctuation from the raw line.
        untagged = re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*|', '', i)
        untagged_Words = re.findall(r"[\w']+|[.,!?;]", untagged)
        filtered_Words = [
            w for w in untagged_Words if not w.lower() in stopWords
        ]
        #allwords.append(', '.join(filtered_Words))
        if untagged and tagged:
            # NOTE(review): the leading character class matches a single
            # character, so tagged[0] can be '-' but never '-1'; this branch
            # looks unreachable -- confirm against the data format.
            if tagged[0] == '-1':
                neg_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '-1']
                NEG_Words.append(filtered_Words)
                neg_Feautures.append(filtered_Words)
                tagged_Words = [feature_select(tagged), '-1']
                NEG_Words.append(tagged_Words)
                neg_Feautures.append(tagged_Words)
                allword = ['-1', ', '.join(untagged_Words)]
                allwords.append(', '.join(allword))
            if tagged[0] == '1':
                pos_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '1']
                POS_Words.append(filtered_Words)
                pos_Feautures.append(filtered_Words)
                tagged_Words = [feature_select(tagged), '1']
                POS_Words.append(tagged_Words)
                pos_Feautures.append(tagged_Words)
                allword = ['1', ', '.join(untagged_Words)]
                allwords.append(', '.join(allword))
            if tagged[0] == '2':
                mixed_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '2']
                MIX_Words.append(filtered_Words)
                mixed_Feautures.append(filtered_Words)
                allword = ['2', ', '.join(untagged_Words)]
                allwords.append(', '.join(allword))
            if tagged[0] == '0':
                neutral_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), '0']
                NEUTRAL_Words.append(filtered_Words)
                neutral_Feautures.append(filtered_Words)
                allword = ['0', ', '.join(untagged_Words)]
                allwords.append(', '.join(allword))
            # NOTE(review): assumed to run for every line that yielded both
            # tagged and untagged text -- confirm the intended nesting.
            tagged_Sentences.append(tagged)
            untagged_Sentences.append(untagged)

    # Read a test file and create test features
    # reading pre-labeled input and splitting into lines
    fileInput = open('cs583_test_data.txt', 'r')
    sentences = re.split(r'\n', fileInput.read())
    fileInput.close()
    temp = 0
    for i in sentences:
        tagged = re.findall(
            r"^[\"012\(-1)]|[\w']+[/]?[\w']+[/]+[\w']+[.,!?;]*", i)
        #tagged = re.findall(r"^[-=+\*]|[\w']+[/]?[\w']+[/]+[^(NN|NNS|NNP|PRP)]+ [.,!?;]*", i)
        # Here the untagged text is rebuilt from the matched tokens rather
        # than from the raw line.
        untagged = re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*|', '', ' '.join(tagged))
        #untagged =re.sub(r'/[^\s]+|[0-9]+|[.,!?;]*|','',i)
        untagged_Words = re.findall(r"[\w']+|[.,!?;]", untagged)
        filtered_Words = [
            w for w in untagged_Words if not w.lower() in stopWords
        ]
        if untagged and tagged and i:
            # The label is read from the end of the line: a '-' in the
            # second-to-last position means '-1', otherwise the last
            # character is the label.
            if i[-2] == '-':
                c = '-1'
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), c]
                #NEUTRAL_Words.append(filtered_Words)
                test_Feautures.append(filtered_Words)
            if i[-1] == '1' and i[-2] != '-':
                c = '1'
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), c]
                #NEUTRAL_Words.append(filtered_Words)
                test_Feautures.append(filtered_Words)
            if i[-1] == '2':
                c = '2'
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), c]
                #NEUTRAL_Words.append(filtered_Words)
                test_Feautures.append(filtered_Words)
            if i[-1] == '0':
                c = '0'
                test_sentence.append(untagged)
                filtered_Words = [feature_select(filtered_Words), c]
                #NEUTRAL_Words.append(filtered_Words)
                test_Feautures.append(filtered_Words)

    # Disabled alternative split, kept as a no-op string literal.
    """
    posCutoff = int(math.floor(len(pos_Feautures)*3/4))
    negCutoff = int(math.floor(len(neg_Feautures)*3/4))
    """
    # Only the first 1/20 of the neutral examples is used for training;
    # mixed examples are not used at all.
    neutralCutoff = int(math.floor(len(neutral_Feautures) * 1 / 20))
    trainFeatures = pos_Feautures + neg_Feautures + neutral_Feautures[:
                                                                      neutralCutoff]
    #test_Feautures= pos_Feautures[posCutoff:] + neg_Feautures[negCutoff:] + neutral_Feautures[neutralCutoff: 2*neutralCutoff]

    # Trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # Initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    # Puts the index of each test example under its true label in
    # referenceSets and under its predicted label in testSets.
    for i, (features, label) in enumerate(test_Feautures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # Prints metrics to show how well the feature selection did.
    # NOTE(review): the "train on" figure is len(tagged_Sentences), not
    # len(trainFeatures) -- confirm which count is intended.
    print 'train on %d instances, test on %d instances' % (
        len(tagged_Sentences), len(test_sentence))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, test_Feautures)
    print 'pos precision:', nltk.metrics.precision(referenceSets['1'],
                                                   testSets['1'])
    print 'pos recall:', nltk.metrics.recall(referenceSets['1'], testSets['1'])
    print 'pos f-measure:', nltk.metrics.f_measure(referenceSets['1'],
                                                   testSets['1'])
    print 'neg precision:', nltk.metrics.precision(referenceSets['-1'],
                                                   testSets['-1'])
    print 'neg recall:', nltk.metrics.recall(referenceSets['-1'],
                                             testSets['-1'])
    print 'neg f-measure:', nltk.metrics.f_measure(referenceSets['-1'],
                                                   testSets['-1'])
print("Generiere Cutoff") ingCutoff = int(len(ingFeats) * 0.9) neutCutoff = int(len(neutFeats) * 0.9) print( f'Sätze Ingvar-Korpus {len(ingFeats)}, Sätze neutraler Korpus {len(neutFeats)}' ) print("Trainiere Classifier mit Kontrollmenge") trainfeats = ingFeats[:ingCutoff] + neutFeats[:neutCutoff] testfeats = ingFeats[ingCutoff:] + neutFeats[neutCutoff:] print('Trainiere mit %d Features, Teste mit %d Features' % (len(trainfeats), len(testfeats))) classifierTrain = NaiveBayesClassifier.train(trainfeats) print('Genauigkeit:', nltk.classify.util.accuracy(classifierTrain, testfeats)) classifierTrain.show_most_informative_features() print("Trainiere Classifier zum Weiterverwenden") mainFeats = ingFeats + neutFeats classifier = NaiveBayesClassifier.train(mainFeats) with open('SentimentAnalysisClassifier.pickle', 'wb') as f: pickle.dump(classifier, f, protocol=2) f.close() """ test_sentence = '' while test_sentence.lower() is not 'stop':
positive_ids = movie_reviews.fileids('pos')

# Separate positive features from negative
negative_features = [(extract(movie_reviews.words(fileids=[f])), 'neg')
                     for f in negative_ids]
positive_features = [(extract(movie_reviews.words(fileids=[f])), 'pos')
                     for f in positive_ids]

# Train on 3/4 of the database and test on the remaining 1/4
negative_cutoff = int(len(negative_features) * 3 / 4)
positive_cutoff = int(len(positive_features) * 3 / 4)
train_features = (negative_features[:negative_cutoff]
                  + positive_features[:positive_cutoff])
test_features = (negative_features[negative_cutoff:]
                 + positive_features[positive_cutoff:])
print('Training on %d instances, testing on %d instances' %
      (len(train_features), len(test_features)))

classifier = NaiveBayesClassifier.train(train_features)
print('Training complete')
print('accuracy:', nltk.classify.util.accuracy(classifier, test_features))
classifier.show_most_informative_features()

# Save classifier; the context manager closes the file even if pickling
# raises.
with open('classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)
# Collect the lower-cased vocabulary of both polarity word lists.
all_words.extend(w.lower() for w in short_pos_words)
all_words.extend(w.lower() for w in short_neg_words)
print('3' * 80)

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())
print('4' * 80)


def find_features(document):
    """Return a presence map of every vocabulary word for ``document``.

    The document is tokenized with ``word_tokenize``; the result maps each
    entry of ``word_features`` to ``True`` when it occurs among the tokens
    and ``False`` otherwise.
    """
    # A set makes each membership test O(1) instead of scanning the token
    # list once per vocabulary word.
    words = set(word_tokenize(document))
    return {w: (w in words) for w in word_features}


featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets
print('5' * 80)

model = NaiveBayesClassifier.train(training_set)
print('6' * 80)

# Persist the trained model.
Pkl_Filename = "Pickle_RL_Model.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(model, file)
def classifyReviews():
    '''
    Perform sentiment classification on movie reviews.

    Reads ``data/movieReviews.csv``, keeps the reviews rated 4 (positive)
    and 0 (negative), trains a Naive Bayes classifier on 80% of each group
    and prints: the accuracy on the held-out 20%, the most informative
    features, the per-class accuracy over all positive and all negative
    reviews, and the misclassified reviews of each class.
    '''
    # Read the data from the file
    data = pd.read_csv("data/movieReviews.csv")

    # Lists of review strings; for now only very positive and very negative
    # reviews are used.
    positive = getReviews(data, 4)
    negative = getReviews(data, 0)

    # Split each data set into training and testing sets.
    (posTrainText, posTestText) = splitTrainTest(positive, 0.8)
    (negTrainText, negTestText) = splitTrainTest(negative, 0.8)

    # Format the data for the classifier and join both classes.
    posTrain = formatForClassifier(posTrainText, 'pos')
    negTrain = formatForClassifier(negTrainText, 'neg')
    training = posTrain + negTrain

    posTest = formatForClassifier(posTestText, 'pos')
    negTest = formatForClassifier(negTestText, 'neg')
    test = posTest + negTest

    # Train a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(training)

    print("Accuracy of the classifier is: " + str(accuracy(classifier, test)))
    classifier.show_most_informative_features()

    def tally(reviews, expected, opposite):
        """Classify each review once; return (hit count, misclassified)."""
        hits = 0
        misclassified = []
        for review in reviews:
            predicted = classifier.classify(format_sentence(review))
            if predicted == expected:
                hits += 1
            elif predicted == opposite:
                misclassified.append(review)
        return hits, misclassified

    # Each review is classified a single time; the same pass feeds both the
    # per-class accuracy and the misclassification lists.
    numPos, wrongPosList = tally(positive, "pos", "neg")
    numNeg, wrongNegList = tally(negative, "neg", "pos")

    print("Accuracy of Positive: " + str(numPos / len(positive)))
    print("Accuracy of Negative: " + str(numNeg / len(negative)))

    # Prints two lists with all of the misclassified positive reviews and
    # misclassified negative reviews.
    print("Misclassified Positive Reviews: " + str(wrongPosList))
    print("Misclassified Negative Reviews: " + str(wrongNegList))
def train(trainfeats, testfeats, dataset, nlt=True, skl=True, most=0):
    """Train NLTK and/or scikit-learn classifiers and summarise the results.

    :param trainfeats: labeled feature sets used for training.
    :param testfeats: labeled feature sets used for evaluation.
    :param dataset: forwarded unchanged to ``get_precision``.
    :param nlt: when true, train and evaluate the NLTK Naive Bayes model.
    :param skl: when true, train the scikit-learn models and evaluate their
        vote ensemble through ``get_precision``.
    :param most: number passed to ``show_most_informative_features``.
    :return: ``(nltk_output, sklearn_output)``.  ``nltk_output`` is a CSV
        line "nlt, accuracy, pos precision, neg precision, pos recall,
        neg recall" in percent rounded to one decimal, or ``"none"`` when
        ``nlt`` is false.  ``sklearn_output`` is ``""`` when ``skl`` is true
        and ``"none"`` otherwise.
    """
    nltk_output = "none"
    sklearn_output = "none"

    if nlt:
        my_classifier = NaiveBayesClassifier.train(trainfeats)

        # Index sets per label: gold labels versus predicted labels.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for index, (feats, label) in enumerate(testfeats):
            refsets[label].add(index)
            testsets[my_classifier.classify(feats)].add(index)

        def as_percent(value):
            # Scale to percent, then keep one decimal.
            return round(value * 100, 1)

        accuracy = as_percent(
            nltk.classify.util.accuracy(my_classifier, testfeats))
        pos_prec = as_percent(precision(refsets['pos'], testsets['pos']))
        pos_rec = as_percent(recall(refsets['pos'], testsets['pos']))
        neg_prec = as_percent(precision(refsets['neg'], testsets['neg']))
        neg_rec = as_percent(recall(refsets['neg'], testsets['neg']))

        my_classifier.show_most_informative_features(most)

        columns = (accuracy, pos_prec, neg_prec, pos_rec, neg_rec)
        nltk_output = "nlt, " + ", ".join(str(col) for col in columns) + "\n"

    if skl:
        def fit(estimator_class):
            # Wrap the estimator, disable vectorizer sorting, then train.
            wrapped = SklearnClassifier(estimator_class())
            wrapped._vectorizer.sort = False
            wrapped.train(trainfeats)
            return wrapped

        MNB_classifier = fit(MultinomialNB)
        BernoulliNB_classifier = fit(BernoulliNB)
        LogisticRegression_classifier = fit(LogisticRegression)
        LinearSVC_classifier = fit(LinearSVC)
        NuSVC_classifier = fit(NuSVC)

        voted_classifier = VoteClassifier(NuSVC_classifier,
                                          LinearSVC_classifier,
                                          MNB_classifier,
                                          BernoulliNB_classifier,
                                          LogisticRegression_classifier)
        get_precision(trainfeats, testfeats, voted_classifier, dataset,
                      "voted")
        sklearn_output = ""

    return (nltk_output, sklearn_output)
pos.append([format_sentence(i), 'pos']) neg = [] with open("rt.neg", encoding="utf8") as f: for i in f: neg.append([format_sentence(i), 'neg']) # next, split labeled data into the training and test data training = pos[:int((.8) * len(pos))] + neg[:int((.8) * len(neg))] test = pos[int((.8) * len(pos)):] + neg[int((.8) * len(neg)):] # a second test data set based on tweets test_2 = [] with open("pos_tweets.txt", encoding="utf8") as f: for i in f: test_2.append([format_sentence(i), 'pos']) with open("neg_tweets.txt", encoding="utf8") as f: for i in f: test_2.append([format_sentence(i), 'neg']) classifier = NaiveBayesClassifier.train(training) classifier.show_most_informative_features() print("accuracy = " + str(accuracy(classifier, test))) print("accuracy 2 = " + str(accuracy(classifier, test_2))) # training a model every time we run the program is inefiicient # we use pickle to store and save the model for future use classifier_file = open("classifier.pickle", "wb") pickle.dump(classifier, classifier_file) classifier_file.close()
def main(twtInfo: object):
    """Classify the sentiment of tweets supplied as JSON records.

    :param twtInfo: JSON input accepted by ``pd.read_json`` with
        ``orient="records"``; the ``"text"`` and ``"id"`` columns are read.
    :return: a JSON string (``orient="records"``) with one object per tweet
        holding its ``text``, ``id`` and predicted ``sentiment``
        (``"pos"`` or ``"neg"``).

    The classifier is a Naive Bayes model trained on the NLTK
    ``twitter_samples`` positive and negative tweets after cleaning,
    tokenizing and stemming.
    """
    data_tcs_tweets = pd.read_json(twtInfo, orient="records")
    tweets = data_tcs_tweets["text"]
    data_id = data_tcs_tweets["id"]

    nltk.download("twitter_samples")
    pos_tweets = twitter_samples.strings("positive_tweets.json")
    neg_tweets = twitter_samples.strings("negative_tweets.json")

    clean_pos_tweets = [cleanTweet(tweet) for tweet in pos_tweets]
    clean_neg_tweets = [cleanTweet(tweet) for tweet in neg_tweets]

    # Corpus of stopwords (i.e. "the", "did", "?").
    # TODO: skip the download when nltk.stopwords is already present.
    nltk.download("stopwords")

    # Word tokenizer trained on English.
    # TODO: skip the download when nltk.punkt is already present.
    nltk.download("punkt")

    # tc_tweets = tokenized & cleaned tweets
    pos_tc_tweets = tokenizeTweets(clean_pos_tweets)
    neg_tc_tweets = tokenizeTweets(clean_neg_tweets)

    # Stemming normalizes text, e.g. "waited", "waits", "waiting" -> "wait".
    pos_tcs_tweets = stemTweets(pos_tc_tweets)
    neg_tcs_tweets = stemTweets(neg_tc_tweets)

    # TODO: possible bias location -- only positive/negative sentiment is
    # modelled, there is no neutral class.

    # Bag-of-words features paired with each tweet's sentiment label.
    pos_bow = [(buildBowFeatures(tweet), "pos") for tweet in pos_tcs_tweets]
    neg_bow = [(buildBowFeatures(tweet), "neg") for tweet in neg_tcs_tweets]
    data_bow = [buildBowFeatures(text) for text in tweets]

    # TODO: a potential new tool would involve a different ML classifier.
    train_bow = pos_bow + neg_bow
    shuffle(train_bow)
    sentiment_classifier = NaiveBayesClassifier.train(train_bow)

    preds = [
        sentiment_classifier.classify(comment_dict)
        for comment_dict in data_bow
    ]

    # TODO: figure out what to return
    ret = [
        {"text": text, "id": tweet_id, "sentiment": sentiment}
        for text, tweet_id, sentiment in zip(tweets, data_id, preds)
    ]
    return pd.Series(ret).to_json(orient="records")
# count term frequency terms_all = [term for term in words if term not in stop] word_tokens = word_feats(terms_all) bi_tokens = bigram_word_feats(terms_all) best_tokens200 = best_feats(terms_all,bestwords) count_all.update(terms_all) sentence = " ".join(str(term) for term in terms_all) #print ('label the features') with open('train-labels.txt','r') as f2: for line2 in f2: line2 = line2.strip('\n') if id == line2.split('\t')[0]: train_data.append([bi_tokens, line2.split('\t')[1]]) print ('train the model') classifier = NaiveBayesClassifier.train(train_data) # classifier.show_most_informative_features(20) with open("dev-tweets.txt",'r')as dev: print ('preprocess the dev ') test_data = [] for line in dev: line = line.strip('\n') id2 = line.split('\t')[0] text = line.split('\t')[1] text = preprocess.processAll(text) words = [word if (word[0:2] == '__') else word.lower() for word in text.split() if len(word) >= 3] words = [word for word in words if word[0:2] != '__'] words = [stemmer.stem(w) for w in words] words = [lemmatizer.lemmatize(w) for w in words]
_REVIEW_CATEGORIES = ('actor', 'plot', 'theme', 'other')


def _load_vocab(path):
    """Read ``path``, replace commas with spaces and tokenize the text."""
    with open(path, 'r') as handle:
        text = handle.read().replace(',', ' ')
    return nltk.word_tokenize(text)


def _category_shares(classifier, words):
    """Return the share of ``words`` the classifier assigns to each category."""
    counts = dict.fromkeys(_REVIEW_CATEGORIES, 0)
    for word in words:
        label = classifier.classify(word_feats(word))
        # Labels outside the four known categories are ignored.
        if label in counts:
            counts[label] += 1
    return {label: float(count) / len(words)
            for label, count in counts.items()}


def returnLastValue():
    """Classify a sample sentence with five classifiers and print the results.

    Vocabulary files ``Actor.txt``, ``Plot.txt``, ``Theme.txt`` and
    ``Other.txt`` provide one training example per token, labeled
    ``'actor'``, ``'plot'``, ``'theme'`` or ``'other'``.  Naive Bayes,
    logistic regression, linear SVC, random forest and decision tree
    classifiers are trained on them.

    Each classifier labels every word of the sample sentence and its most
    frequent category becomes that classifier's result.  The "normal"
    result is the mode of the five results; when ``mode`` raises, it falls
    back to the maximum entry collected through ``addToVotes``.  The
    "hybrid" result comes from ``VoteClassifier.classifyAll``.  Everything
    is printed; nothing is returned.
    """
    # Load every vocabulary before building any features.
    vocabularies = [
        (_load_vocab('Actor.txt'), 'actor'),
        (_load_vocab('Plot.txt'), 'plot'),
        (_load_vocab('Theme.txt'), 'theme'),
        (_load_vocab('Other.txt'), 'other'),
    ]
    train_set = [(word_feats(token), label)
                 for vocab, label in vocabularies
                 for token in vocab]

    NBclassifier = NaiveBayesClassifier.train(train_set)
    LR_classifier = SklearnClassifier(LogisticRegression())
    LR_classifier.train(train_set)
    LSVS_classifier = SklearnClassifier(LinearSVC())
    LSVS_classifier.train(train_set)
    Random_classifier = SklearnClassifier(RandomForestClassifier())
    Random_classifier.train(train_set)
    decision_classifier = SklearnClassifier(DecisionTreeClassifier())
    decision_classifier.train(train_set)
    classifiers = (NBclassifier, LR_classifier, LSVS_classifier,
                   Random_classifier, decision_classifier)

    classifierReswithVotes = {}
    classifierResult = []

    line = "good actor"
    sentence = "".join(line)
    sentence = sentence.lower()
    words = sentence.split(' ')

    for classifier in classifiers:
        shares = _category_shares(classifier, words)
        # Ties go to the first category in actor/plot/theme/other order.
        winner = max(shares.items(), key=operator.itemgetter(1))[0]
        classifierResult.append(winner)
        addToVotes(winner, shares.pop(winner), classifierReswithVotes)

    print(str(classifierResult))
    try:
        normalRes = mode(classifierResult)
    except Exception:
        # No usable mode: fall back to the largest accumulated vote.
        normalRes = max(classifierReswithVotes.items(),
                        key=operator.itemgetter(1))[0]

    hybrid_classifier = VoteClassifier(NBclassifier, LR_classifier,
                                       LSVS_classifier, Random_classifier,
                                       decision_classifier)
    print("sentence :" + sentence)
    hybridRes = hybrid_classifier.classifyAll(sentence)
    print("Normal Result", normalRes)
    print("Hybrid Result", hybridRes)
    print(
        "------------------------------------------------------------------------------"
    )
pos = [] with open("basic_positive.csv", "r") as reader: for line in reader: pos.append(line) neg = [] with open("basic_negative.csv", "r") as reader: for line in reader: neg.append(line) positive_feature = [(format_sentence(pos_term), "pos") for pos_term in pos] negative_feature = [(format_sentence(neg_term), "neg") for neg_term in neg] train_test = positive_feature + negative_feature classifier = NaiveBayesClassifier.train(train_test) pos = 0 neg = 0 pos_line = 0 neg_line = 0 counter = 0 count_line = 0 #try to save the pos and neg comments #this can be helpful to perform a further machine learning test pos_file = open("positive_comments.csv", "w") neg_file = open("negative_comments.csv", "w") with open("dataset.csv", "r") as reader: for line in reader: count_line += 1 #this is the single comment
df.iloc[idx_test:,:] test_df, training_df=\ pd_train_test_split(data_df, test_size=0.2, randomstate=123) train_Xy=[(wfeatures,sentiment) for wfeatures,sentiment in \ zip(training_df['dict_features'].tolist(),training_df['sentiment'].tolist())] test_Xy=[(wfeatures,sentiment) for wfeatures,sentiment in \ zip(test_df['dict_features'].tolist(),test_df['sentiment'].tolist())] # training print(" ** training ") NB_nltk_clf = NaiveBayesClassifier.train(train_Xy) MaxEnt_nltk_clf=classifier = MaxentClassifier.train(train_Xy, max_iter = 10) NB_nltk_clf.show_most_informative_features(10) MaxEnt_nltk_clf.show_most_informative_features(10) # test and report # support functions def classifier_predict(clf, testXy): test_predictions, test_labels=[],[] for sampleid in range(len(testXy)): test_predictions.append(clf.classify(testXy[sampleid][0])) test_labels.append(testXy[sampleid][1]) return np.array(test_predictions), np.array(test_labels)
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
import pickle


def formatar_sentenca(sentenca):
    """Return a feature dict that maps every token of ``sentenca`` to True."""
    return {palavra: True for palavra in word_tokenize(sentenca)}


# The corpora are read as bytes and decoded line by line below, so the line
# splitting follows bytes.splitlines() exactly as before. The context
# managers guarantee the handles are closed even if reading fails.
with open('corpus_positivo.txt', 'rb') as f_pos:
    positivos = f_pos.read().splitlines()

with open('corpus_negativo.txt', 'rb') as f_neg:
    negativos = f_neg.read().splitlines()

# One (features, label) pair per corpus line; text is lower-cased first.
dados_treinamento = [
    (formatar_sentenca(positivo.decode("utf8").lower()), "positivo")
    for positivo in positivos
]
dados_treinamento += [
    (formatar_sentenca(negativo.decode("utf8").lower()), "negativo")
    for negativo in negativos
]

modelo = NaiveBayesClassifier.train(dados_treinamento)

# pickle.dump() returns None, so its result is not kept.
with open('modelo.obj', 'wb') as f:
    pickle.dump(modelo, f)

print('Modelo classificador treinado e armazenado em modelo.obj')
def do_validation(self): # each fold is a list of body ids. folds, hold_out = kfold_split(self.dataset, n_folds=10) # fold_stances is a dict. keys are fold number (e.g. 0-9). hold_out_stances is list fold_stances, hold_out_stances = get_stances_for_folds( self.dataset, folds, hold_out) labeled_feat_dict = {} print "Generating features for each fold" for fold_id in fold_stances: print "Generating features for fold ", fold_id bodies = folds[fold_id] stances = fold_stances[fold_id] fold_avg_sims, fold_max_sims = JaccardGenerator().gen_jaccard_sims( self.dataset, bodies, stances) labeled_feature_set = [] for i in range(len(stances)): labeled_feature = ({ 'avg_sims': fold_avg_sims[i], 'max_sims': fold_max_sims[i] }, self._process_stance(stances[i]['Stance'])) labeled_feature_set.append(labeled_feature) labeled_feat_dict[fold_id] = labeled_feature_set print "Generating features for hold out fold" holdout_avg_sims, holdout_max_sims = JaccardGenerator( ).gen_jaccard_sims(self.dataset, hold_out, hold_out_stances) h_unlabeled_features = [] h_labels = [] for i in range(len(hold_out_stances)): unlabeled_feature = { 'avg_sims': holdout_avg_sims[i], 'max_sims': holdout_max_sims[i] } label = self._process_stance(hold_out_stances[i]['Stance']) h_unlabeled_features.append(unlabeled_feature) h_labels.append(label) fold_accuracy = {} best_fold_accuracy = 0.0 classifiers = [] print "Validating using each fold as testing set" for fold_id in fold_stances: fold_ids = list(range(len(folds))) del fold_ids[fold_id] # deleted fold is test set for this run # training set is every fold except for the testing fold (fold_id) training_set = [ feat for fid in fold_ids for feat in labeled_feat_dict[fid] ] testing_set = [] testing_labels = [] # testing set is just the testing fold (fold_id) for feat, label in labeled_feat_dict[fold_id]: testing_set.append(feat) testing_labels.append(label) classifier = NaiveBayesClassifier.train(training_set) classifiers.append(classifier) pred = 
classifier.classify_many(testing_set) accuracy = self._score(pred, testing_labels) print "Fold ", fold_id, "accuracy: ", accuracy if accuracy > best_fold_accuracy: best_fold_accuracy = accuracy best_fold_cls = classifier h_res = best_fold_cls.classify_many(h_unlabeled_features) print 'holdout score:', self._score(h_res, h_labels)
def getClassifier(tweetfile, cfg): degreesToUse = cfg['NLPnGrams'] print "DEBOOOOO", degreesToUse, type(degreesToUse) classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ') shortClass = classMode.replace(' ', '').lower() loadNeeded = True if 'NLPTEST' not in cfg.keys(): degreeString = '-'.join([str(degree) for degree in degreesToUse]) pickleFile = 'nlpTrainers/' + tweetfile.replace( '.csv', '.' + shortClass + degreeString + '.pickle') if isfile(pickleFile): print "Loading pickled", shortClass, "classifier" fileIn = open(pickleFile) classifier = cPickle.load(fileIn) fileIn.close() loadNeeded = False if loadNeeded: if 'NLPTEST' in cfg.keys(): content = prepText(tweetfile) categorized = prepClassifications(content) NGrammized = collectNGrams(categorized, degreesToUse, cfg) else: print "Loading content & preparing text" content = prepText(loadFile(tweetfile)) print "Categorizing contents" categorized = prepClassifications(content) print "Deriving NGrams of length(s)", degreesToUse NGrammized = collectNGrams(categorized, degreesToUse, cfg) print "Compiling Results" readyToSend = [] allCats = [str(key) for key in NGrammized.keys()] for category in allCats: readyToSend += NGrammized[category] print "Attempting Classification by mode", classMode, degreesToUse if classMode == 'naive bayes': from nltk.classify import NaiveBayesClassifier classifier = { 'class': NaiveBayesClassifier.train(readyToSend), 'mode': 'nb' } elif classMode == 'positive naive bayes': from nltk.classify import PositiveNaiveBayesClassifier classifier = { 'class': PositiveNaiveBayesClassifier.train(readyToSend), 'mode': 'pnb' } elif classMode == 'max ent': #import nltk.classify #from sklearn.linear_model import LogisticRegression #from nltk.classify import SklearnClassifier #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'} from nltk.classify import MaxentClassifier classifier = { 'class': MaxentClassifier.train(readyToSend, algorithm='iis'), 'mode': 'me' } elif classMode 
== 'decision tree': from nltk.classify import DecisionTreeClassifier classifier = { 'class': DecisionTreeClassifier.train(readyToSend), 'mode': 'dt' } elif classMode == 'svm': if "SVMOrder" in cfg.keys(): priority = cfg['SVMOrder'] else: priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210" if type(priority) is str: priority = list(priority) priority = [entry for entry in priority if entry in allCats] preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg) classifier = { 'class': preppedSVM, 'mode': 'svm', 'priority': priority } else: from nltk.classify import NaiveBayesClassifier classifier = { 'class': NaiveBayesClassifier.train(readyToSend), 'mode': 'nb' } if 'NLPTEST' not in cfg.keys(): print "Pickling Classifier" fileOut = open(pickleFile, 'wb') cPickle.dump(classifier, fileOut) fileOut.close() if 'NLPTEST' not in cfg.keys(): if classMode != 'svm': classifier['class'].show_most_informative_features(n=150) """else: for key in classifier['class'].keys(): print classifier print classifier.keys() classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))""" return classifier
fileName = sys.argv[1] # Loads training data from input file load_data(fileName) # Then build feature vectors for both negative and positive tweets negfeats = get_feature_vec(negtweets, 'neg') posfeats = get_feature_vec(postweets, 'pos') all_feats = negfeats + posfeats random.shuffle(all_feats) print 'TextBlog accuracy on training data:', textblob_acc( postweets, negtweets) result = cross_validation(all_feats) print 'Naiave Bayes cross validation accuracy:', result[0] print 'MaxEnt cross validation accuracy:', result[1] pos1feats = get_feature_vec(pos1, 'pos') neg1feats = get_feature_vec(neg1, 'neg') all1_feats = neg1feats + pos1feats print 'TextBlog accuracy on manual dataset:', textblob_acc(pos1, neg1) bayes = NaiveBayesClassifier.train(all_feats) print 'Naiave Bayes accuracy on manual dataset:', nltk.classify.util.accuracy( bayes, all1_feats) maxent = nltk.MaxentClassifier.train( all_feats, nltk.classify.MaxentClassifier.ALGORITHMS[0], max_iter=1) print 'MaxEnt accuracy on manual dataset:', nltk.classify.util.accuracy( maxent, all1_feats)
def _load_or_build_pickle(path, build):
    """Return the object pickled at ``path``; if the file does not exist,
    call ``build()``, pickle its result to ``path`` and return it."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    value = build()
    with open(path, 'wb') as f:
        pickle.dump(value, f, protocol=pickle.HIGHEST_PROTOCOL)
    return value


def _read_latin1(path):
    """Return the whole content of the latin-1 encoded text file ``path``."""
    with open(path, 'r', encoding='latin-1') as f:
        return f.read()


def training():
    """Build (or load from cache) the sentiment classifier and print its accuracy.

    Three artefacts are cached as pickles under CommentSentimentData/: the
    positive/negative sentences taken from the training CSV, the 2000 best
    words, and the trained NaiveBayesClassifier. Each one is only computed
    when its pickle file is missing. The classifier is finally evaluated on
    the sentences of positive.txt and negative.txt and the accuracy (as a
    percentage) is printed. Nothing is returned.
    """
    def build_dataset():
        rows = fileload('CommentSentimentData/training.1600000.csv')
        # Column 0 holds the polarity ('4' positive, '0' negative),
        # column 5 the sentence.
        return {
            'pos': [sen[5] for sen in rows if sen[0] == '4'],
            'neg': [sen[5] for sen in rows if sen[0] == '0'],
        }

    dataset = _load_or_build_pickle('CommentSentimentData/dataset.pkl',
                                    build_dataset)
    pos_sen = dataset['pos']
    neg_sen = dataset['neg']

    best_words = _load_or_build_pickle(
        'CommentSentimentData/bestwords.pkl',
        lambda: find_best_words(pos_sen, neg_sen, 2000))

    def train_classifier():
        # Feature extraction over the whole training corpus is only needed
        # when there is no cached classifier, so it lives in the builder.
        prev = [(features(words, best_words), 'positive')
                for words in pos_sen]
        nrev = [(features(words, best_words), 'negative')
                for words in neg_sen]
        return NaiveBayesClassifier.train(prev + nrev)

    real_classifier = _load_or_build_pickle(
        'CommentSentimentData/classifier.pkl', train_classifier)

    # NOTE: an earlier comment here recorded an accuracy of about 78.17 for
    # a 3/4 train, 1/4 test split of the training corpus.

    # Evaluate on the separate positive/negative sentence files.
    pos_text = _read_latin1("CommentSentimentData/positive.txt")
    neg_text = _read_latin1("CommentSentimentData/negative.txt")
    prev = [(features(words, best_words), 'positive')
            for words in pos_text.split('\n')]
    nrev = [(features(words, best_words), 'negative')
            for words in neg_text.split('\n')]
    test_set = nrev + prev
    print("Accuracy is : ", util.accuracy(real_classifier, test_set) * 100)
def author_beng_nbc(): #1st Set bankc = open("/python27/Bankim500_1.txt", "r").read() bankw = bankc.split() bankz = reduce(concat, [['bankim', x] for x in bankw[1:]], bankw[0:1]) #print a3 it = iter(bankz) bankt = zip(it, it) #print a4 #2nd Set bibhuc = open("/python27/Bibhuti500_1.txt", "r").read() bibhuw = bibhuc.split() bibhuz = reduce(concat, [['bibhuti', x] for x in bibhuw[1:]], bibhuw[0:1]) #print b3 it1 = iter(bibhuz) bibhut = zip(it1, it1) #print b4 #3rd Set rabindrac = open("/python27/Rabindra500_1.txt", "r").read() rabindraw = rabindrac.split() rabindraz = reduce(concat, [['rabindra', x] for x in rabindraw[1:]], rabindraw[0:1]) #print a3 it2 = iter(rabindraz) rabindrat = zip(it2, it2) #4th Set saratc = open("/python27/Sarat500_1.txt", "r").read() saratw = saratc.split() saratz = reduce(concat, [['sarat', x] for x in saratw[1:]], saratw[0:1]) #print a3 it3 = iter(saratz) saratt = zip(it3, it3) add1 = bankt + bibhut + rabindrat + saratt #print c1 training_data = add1 vocabulary = set( chain(*[word_tokenize(i[0].lower()) for i in training_data])) feature_set = [ ({i: (i in word_tokenize(sentence.lower())) for i in vocabulary}, tag) for sentence, tag in training_data ] #print "###",feature_set from nltk.classify import NaiveBayesClassifier as nbc train_set, test_set = feature_set[:300], feature_set[300:] print len(train_set) print len(test_set) classifier = nbc.train(train_set) test_sentence = "আলীপুরের উকিল বিশেষ কিছু হয় বলিয়া মনে হয় না বালিগঞ্জের ওদিকে কোথায় একটা টিউশনি আছে" featurized_test_sentence = { i: (i in word_tokenize(test_sentence.lower())) for i in vocabulary } print "test_sent:", test_sentence print "tag:", classifier.classify(featurized_test_sentence) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(test_set): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) print 'bankim precision:', nltk.precision(refsets['bankim'], testsets['bankim']) 
print 'bankim recall:', nltk.recall(refsets['bankim'], testsets['bankim']) print 'bankim F-measure:', nltk.f_measure(refsets['bankim'], testsets['bankim']) print 'bibhuti precision:', nltk.precision(refsets['bibhuti'], testsets['bibhuti']) print 'bibhuti recall:', nltk.recall(refsets['bibhuti'], testsets['bibhuti']) print 'bibhuti F-measure:', nltk.f_measure(refsets['bibhuti'], testsets['bibhuti']) print 'bankim precision:', nltk.precision(refsets['rabindra'], testsets['rabindra']) print 'bankim recall:', nltk.recall(refsets['rabindra'], testsets['rabindra']) print 'bankim F-measure:', nltk.f_measure(refsets['rabindra'], testsets['rabindra']) print 'bibhuti precision:', nltk.precision(refsets['sarat'], testsets['sarat']) print 'bibhuti recall:', nltk.recall(refsets['sarat'], testsets['sarat']) print 'bibhuti F-measure:', nltk.f_measure(refsets['sarat'], testsets['sarat'])
def unigramAnalysis(self, word_extract_feature): #Dataset on Anger and Trust is extremely poor #it ruined my existing dataset as well , so I will avoid them as of now datafiles = [ { 'emo': "Sad", 'name': "/negative.csv" }, { 'emo': "Happy", 'name': "/positive.csv" } # ,{'emo': 'Happy', 'name': "/trust.csv"}, {'emo': 'Sad', 'name': "/anger.csv"} ] trainfeats = [] testfeats = [] dataset = [] for value in datafiles: emo = value['emo'] name = value['name'] read = self.readFile(name) read['emo'] = emo features = [(word_extract_feature(statement.split()), emo) for statement in read['tweets']] dataset.append(features) for data in dataset: cutoff = len(data) * 3 / 4 trainfeats = trainfeats + data[:cutoff] testfeats = testfeats + data[cutoff:] try: classifier = NaiveBayesClassifier.train(trainfeats) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) #K-Fold classification test #average the result of number of tests shuffle(trainfeats) X_folds = np.array_split(trainfeats, K_FOLDS) scores = list() for k in range(K_FOLDS): X_train = list(X_folds) X_test = X_train.pop(k) X_train = np.concatenate(X_train) classifier = NaiveBayesClassifier.train(X_train) scores.append(nltk.classify.util.accuracy(classifier, X_test)) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = classifier.classify(feats) testsets[observed].add(i) print 'Average accuracy K-Fold ', sum(scores) / float(len(scores)) print 'accuracy:', nltk.classify.util.accuracy( classifier, testfeats) print 'Happy precision:', nltk.metrics.precision( refsets['Happy'], testsets['Happy']) print 'Happy recall:', nltk.metrics.recall(refsets['Happy'], testsets['Happy']) print 'Sad precision:', nltk.metrics.precision( refsets['Sad'], testsets['Sad']) print 'Sad recall:', nltk.metrics.recall(refsets['Sad'], testsets['Sad']) # print 'Output:',nltk.classify.util.accuracy(classifier, ['He is Our To be Hanged']) # print 'Trust precision:', nltk.metrics.precision(refsets['Trust'], 
testsets['Trust']) # print 'Trust recall:', nltk.metrics.recall(refsets['Trust'], testsets['Trust']) # print 'Sad precision:', nltk.metrics.precision(refsets['Angry'], testsets['Angry']) # print 'Sad recall:', nltk.metrics.recall(refsets['Angry'], testsets['Angry']) classifier.show_most_informative_features() except AttributeError, err: print Exception, err
def word_feats(words): return dict([(word, True) for word in words]) voc_p = ['great', 'fun', 'epic', 'good', 'happy', 'safe', 'normal','amazing' ] voc_n = ['bad', 'terrible', 'help', 'danger', 'trouble'] ft_pos = [(word_feats(pos), 'pos') for pos in voc_p] ft_ng = [(word_feats(neg), 'neg') for neg in voc_n] tr_set = ft_ng + ft_pos class_fy = NaiveBayesClassifier.train(tr_set) # Predict def predictNegPos(sentence): ng = 0 ps = 0 sentence = sentence.lower() words = sentence.split(' ') for word in words: classResult = class_fy.classify(word_feats(word)) if classResult == 'neg': ng+=1 if classResult == 'pos': ps+=1 if ng > ps: result = firebase.put(
filtered_from_stopWords='' counter = 0 for j in range(len(illegal_chars)) : if counter == 0: counter+=1 filtered = i[0].replace(illegal_chars[j], '') else : filtered=filtered.replace(illegal_chars[j],'') counter=0 filteredArr = filtered.split(' ') for x in filteredArr : if x not in stopWords : filtered_from_stopWords+=x+' ' bb=[] filtered_from_stopWords_ARRAY=filtered_from_stopWords.split(' ') features = {w.lower(): (w in most_cm_1) for w in filtered_from_stopWords_ARRAY} bb.append(features) bb.append(i[1]) sentences.append(bb) remarks.append(i[1]) count =0 print(remarks) print(sentences) classifier = NaiveBayesClassifier.train(sentences) inputs = input('Enter a comment ') words_entered=inputs.split(' ') entry = {w: ( True) for w in words_entered} print(classifier.classify(entry))
def train_model(train_features):
    """Return the classifier produced by NaiveBayesClassifier.train() for
    ``train_features``."""
    return NaiveBayesClassifier.train(train_features)
testfeats = negtest_feats[:test_neg] + postest_feats[:test_pos] while (flag == 1 ): #this flag is set so that user gets option to pass another query i = 0 #i is set for indexing the 10 extracted videos result = [ ] #double dimensional array to store 100 comments of each video pos_neg_list = [ ] #this list stores the positive and negative counts of all 10 videos as a tuple query = raw_input('enter a query word:') (vid_ids, vid_titles, vid_likes, vid_dislikes, comment_count) = youtube_search( query) #youtube_search function returns five parameters print("no. of videos", len(vid_ids)) nb_classifier = NaiveBayesClassifier.train(trainfeats) #nb_precisions, nb_recalls= precision_recall(nb_classifier, testfeats) refsets = collections.defaultdict(set) testsets = collections.defaultdict(set) for i, (feats, label) in enumerate(testfeats): refsets[label].add(i) observed = nb_classifier.classify(feats) testsets[observed].add(i) print("Accuracy:", nltk.classify.accuracy(nb_classifier, testfeats)) print("Positive Precision:", precision(refsets['pos'], testsets['pos'])) print('Positive Recall:', recall(refsets['pos'], testsets['pos'])) print('Positive F-measure:', f_measure(refsets['pos'], testsets['pos']))
def _load_labeled_sentences(path, label, feature_select):
    """Return one ``(features, label)`` pair per line of the file ``path``.

    Each line is broken into words and punctuation marks, which are handed
    to ``feature_select`` to obtain the features.
    """
    # http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    with open(path, 'r') as sentences:
        return [(feature_select(re.findall(r"[\w']+|[.,!?;]",
                                           line.rstrip())), label)
                for line in sentences]


def evaluate_features(feature_select):
    """Evaluate a feature selection mechanism with a Naive Bayes classifier.

    The 'pos', 'neg' and 'con' sentence files are turned into labelled
    features with ``feature_select``. The first 3/4 of each class is used
    for training and the remaining 1/4 for testing. Accuracy plus
    precision and recall per class are printed, followed by the ten most
    informative features. Finally every line of the news file is
    tokenised and printed together with the label the classifier
    predicts for it.

    The previous version built the test set from the unlabelled news file
    with list-valued dict keys and therefore always failed before
    training; the held-out split restores a labelled test set.
    """
    posFeatures = _load_labeled_sentences(RT_POLARITY_POS_FILE, 'pos',
                                          feature_select)
    negFeatures = _load_labeled_sentences(RT_POLARITY_NEG_FILE, 'neg',
                                          feature_select)
    conFeatures = _load_labeled_sentences(RT_POLARITY_CON_FILE, 'con',
                                          feature_select)

    # 3/4 of every class for training, 1/4 for testing.
    posCutoff = len(posFeatures) * 3 // 4
    negCutoff = len(negFeatures) * 3 // 4
    conCutoff = len(conFeatures) * 3 // 4
    trainFeatures = (posFeatures[:posCutoff] + negFeatures[:negCutoff] +
                     conFeatures[:conCutoff])
    testFeatures = (posFeatures[posCutoff:] + negFeatures[negCutoff:] +
                    conFeatures[conCutoff:])

    # trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # referenceSets holds the actual labels of the test items, testSets the
    # predicted ones; both map label -> set of test item indices.
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    # prints metrics to show how well the feature selection did
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    print('con precision:', precision(referenceSets['con'], testSets['con']))
    print('con recall:', recall(referenceSets['con'], testSets['con']))
    classifier.show_most_informative_features(10)

    # The news sentences carry no labels, so they are only classified.
    with open(RT_POLARITY_NEWS_FILE, 'r') as newsSentences:
        for test_sentence in newsSentences:
            # Tokenize the line.
            doc = nltk.word_tokenize(test_sentence.lower())
            print(doc, classifier.classify(feature_select(doc)))
def main():
    """Train a Naive Bayes sentiment classifier and label the dev files.

    ``sys.argv[1]`` is the training directory and ``sys.argv[2]`` the dev
    directory. The vocabulary is collected from every ``.txt`` file below
    the training directory. Files whose directory path contains
    "positive", "negative" or "neutral" are used as training examples.
    Every ``.txt`` dev file in such a directory is classified and written
    to ``nboutput.txt`` as ``<label> <path>``. The accuracy on the
    training examples is printed at the end.
    """
    sentiments = ("positive", "negative", "neutral")

    def in_sentiment_dir(root):
        return any(tag in root for tag in sentiments)

    def read_tokens(path):
        with open(path, "r", encoding="latin1") as handle:
            return handle.read().split()

    #---read all files to extract all word lists---#
    all_words = []       # vocabulary in first-seen order
    seen_words = set()   # same words, for O(1) membership tests
    training_docs = []   # (directory, tokens) of every training file
    for root, _directories, filenames in os.walk(sys.argv[1]):
        for each_filename in filenames:
            if not each_filename.endswith(".txt"):
                continue
            tokens = read_tokens(os.path.join(root, each_filename))
            # Keep the tokens together with their directory so that each
            # file is later paired with its own label; indexing a flat
            # list by the position inside a directory mixed files up.
            training_docs.append((root, tokens))
            for each_token in tokens:
                if each_token not in seen_words:
                    seen_words.add(each_token)
                    all_words.append(each_token)

    #--- extract the features for each of the training file ----#
    X_list = [
        word2features(tokens, all_words, root)
        for root, tokens in training_docs if in_sentiment_dir(root)
    ]

    #-- applying naive bayes NLTK classification ---#
    classifier = NaiveBayesClassifier.train(X_list)

    #--- classify each of the test file to respective category---#
    with open('nboutput.txt', 'w') as f:
        for root, _directories, filenames in os.walk(sys.argv[2]):
            print("root is ", root)
            if not in_sentiment_dir(root):
                continue
            for each_filename in filenames:
                if not each_filename.endswith(".txt"):
                    continue
                test_tokens = read_tokens(os.path.join(root, each_filename))
                output = classifier.classify(
                    word2features_test(test_tokens, all_words))
                f.write(output + " " + root + '/' + each_filename + "\n")

    print('accuracy:', nltk.classify.util.accuracy(classifier, X_list))