def train_with_movie_db(self):
    """
    Training is possible with the movie reviews corpus; it does not yield
    particularly good results.
    """
    self.use_movie_reviews = True

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                 "negative") for f in negids]
    posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                 "positive") for f in posids]

    # Use floor division so the cutoffs are valid (integer) slice indices.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)

    DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
    # show_most_informative_features() prints its table and returns None,
    # so call it directly rather than logging its return value.
    self.classifier.show_most_informative_features()
def main(argv):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids]
    trainfeats = posfeats + negfeats

    classifier = NaiveBayesClassifier.train(trainfeats)
    #classifier = pickle.load(open("classifier.p", "rb"))

    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]

    for line in sys.stdin:
        try:
            tolk_posset = word_tokenize(line.rstrip())
            d = word_feats(tolk_posset)
            for topic in topicList:
                subjectFull = subj(line, topic)
                if subjectFull != "No match":
                    print "LongValueSum:" + str(line.split(":")[0]) + "," + subjectFull + "," + classifier.classify(d) + "\t" + "1"
        except Exception:
            # Skip lines that fail to tokenize or classify.
            continue
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Evaluate on the held-out test set, not the training set.
    print classify.accuracy(classifier, test_set)
def prep_reviews_data(self):
    # Messy code to test the classifier with movie reviews.
    if not self.movie_review_data:
        print 'Preparing movie reviews...\n'
        from nltk.corpus import movie_reviews

        docs = [movie_reviews.raw(fileid)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]

        process = lambda x: 1 if x == 'pos' else -1
        labels = [process(category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]

        docs, labels = double_shuffle(docs, labels)
        training, testing = divide_list_by_ratio(docs)
        self.train_labs, self.test_labs = divide_list_by_ratio(labels)

        train_vecs = self.vectorizer.fit_transform(training)
        test_vecs = self.vectorizer.transform(testing)

        if isinstance(self.model, naive_bayes.GaussianNB):
            train_vecs = train_vecs.toarray()
            test_vecs = test_vecs.toarray()

        self.train_vecs = train_vecs
        self.test_vecs = test_vecs
        self.movie_review_data = True
        self.news_market_data = False
def load_data():
    global posfeats, negfeats

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    return
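# NOTE (sketch): many snippets in this collection call a word_feats helper
# without defining it. The usual bag-of-words form, assumed here, is:
def word_feats(words):
    # Mark each token as present; NLTK's NaiveBayesClassifier treats
    # missing keys as absent features.
    return dict((word, True) for word in words)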
def maketrainset(movie_reviews, tokenizer, stemmer):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(tokenizer(movie_reviews.words(fileids=[f]), stemmer), 'neg') for f in negids]
    posfeats = [(tokenizer(movie_reviews.words(fileids=[f]), stemmer), 'pos') for f in posids]
    trainfeats = negfeats + posfeats
    return trainfeats
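# NOTE (sketch): maketrainset expects `tokenizer` to map a word list plus a
# stemmer to a feature dict. A hypothetical pairing with NLTK's PorterStemmer:
from nltk.stem import PorterStemmer
from nltk.corpus import movie_reviews

def stemmed_feats(words, stemmer):
    # Hypothetical tokenizer: stem each word and mark it present.
    return dict((stemmer.stem(w.lower()), True) for w in words)

trainfeats = maketrainset(movie_reviews, stemmed_feats, PorterStemmer())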
def main():
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    classifier = NaiveBayesClassifier.train(trainfeats)

    with open("output.json") as fin:
        sid = SentimentIntensityAnalyzer()
        data = json.load(fin)
        for key in data:
            reviews = data[key]["reviews"]
            for i in range(len(reviews)):
                text = reviews[i]["review"]
                sentiment_dict = {'positive_probability': 0, 'label': '', 'negative_probability': 0}
                prob = classifier.prob_classify(word_feats(text.split(" ")))
                classification = classifier.classify(word_feats(text.split(" ")))
                sentiment_dict['positive_probability'] = prob.prob('pos')
                sentiment_dict['negative_probability'] = prob.prob('neg')
                sentiment_dict['label'] = classification
                reviews[i]["sentiment"] = sentiment_dict
            data[key]["reviews"] = reviews

    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
def train(test=False):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    if test:
        negcutoff = len(negfeats) * 3 / 4
        poscutoff = len(posfeats) * 3 / 4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
        print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

        classifier = NaiveBayesClassifier.train(trainfeats)
        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
        classifier.show_most_informative_features()
    else:
        return NaiveBayesClassifier.train(negfeats + posfeats)
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = len(negfeats) * 3 / 4
    poscutoff = len(posfeats) * 3 / 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
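# NOTE (sketch): featx is any extractor mapping a word list to a feature
# dict, so the same evaluation can be rerun with different features.
# Assuming the bag-of-words word_feats helper defined earlier:
evaluate_classifier(word_feats)

# A stopword-filtered variant for comparison (hypothetical helper):
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))

def stopword_filtered_word_feats(words):
    return dict((w, True) for w in words if w not in stopset)

evaluate_classifier(stopword_filtered_word_feats)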
def median_approach(llimit, ulimit, isphrase, pathname):
    posmedlist = []
    negmedlist = []
    medians = []
    lpcount = 0
    totalcount = ulimit - llimit
    cnt_var = 0

    print '\nNo of +ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        testmed = proximity_tagger.medianlist(movie_reviews.abspath(fid), isphrase, cnt_var, 0, pathname)
        posmedlist.append(testmed)
        lpcount = lpcount + 1
        cnt_var += 1
        print 'Training +ve review ', lpcount, '.' * 10, (float(lpcount) * 100 / float(totalcount)), '%'

    lpcount = 0
    cnt_var = 0
    print '\nNo of -ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        testmed = proximity_tagger.medianlist(movie_reviews.abspath(fid), isphrase, cnt_var, 1, pathname)
        negmedlist.append(testmed)
        lpcount = lpcount + 1
        cnt_var += 1
        print 'Training -ve review ', lpcount, '.' * 10, (float(lpcount) * 100 / float(totalcount)), '%'

    medians.append([numpy.median(x) for x in itertools.izip(*posmedlist)])
    medians.append([numpy.median(x) for x in itertools.izip(*negmedlist)])

    # Escape the backslash in the Windows-style path explicitly.
    f = open('train_result\\proximity_median_train_result_' + str(isphrase), 'w')
    json.dump(medians, f)
    f.close()
def evaluate_features(self, feature_extractor, N):
    self.negative = movie_reviews.fileids('neg')  # all document names under the neg folder
    self.positive = movie_reviews.fileids('pos')  # all document names under the pos folder
    self.maintrain, self.maintest = self.stratifiedSplit(self.negative, self.positive, N)

    lst = []
    trainvocabulary = []
    for doc, lbl in self.maintrain:
        x = (feature_extractor(movie_reviews.words(fileids=[doc])), lbl)
        lst.append(x)
        trainvocabulary = trainvocabulary + x[0].keys()

    trainvocabulary = set(trainvocabulary)
    if q2_1.W == 0:
        q2_1.W = len(trainvocabulary)
    print "no. of features in train:", self.W

    nb = classifier.train(lst)
    self.testClassify = self.classifyTest(self.maintest, nb, feature_extractor)

    print "accuracy = ", accuracy(self.maintest, self.testClassify)
    print "Negative:"
    print " precision = ", self.calcPrec('neg', self.maintest, self.testClassify)
    print " recall = ", self.calcRecall('neg', self.maintest, self.testClassify)
    print " f measure = ", self.calcFMeasur('neg', self.maintest, self.testClassify)
    print "Positive:"
    print " precision = ", self.calcPrec('pos', self.maintest, self.testClassify)
    print " recall = ", self.calcRecall('pos', self.maintest, self.testClassify)
    print " f measure = ", self.calcFMeasur('pos', self.maintest, self.testClassify)

    nb.show_most_informative_features()
    return nb
def main():
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    to_review1 = "A man with a magnanimous spirit helps a mute girl from Pakistan return home."
    to_review2 = "Forced out of his own company by former Darren Cross, Dr. Hank Pym (Michael Douglas) recruits the talents of Scott Lang (Paul Rudd), a master thief just released from prison. Lang becomes Ant-Man, trained by Pym and armed with a suit that allows him to shrink in size, possess superhuman strength and control an army of ants. The miniature hero must use his new skills to prevent Cross, also known as Yellowjacket, from perfecting the same technology and using it as a weapon for evil."
    to_review3 = '''Parents need to know that kids may clamor to see this fast-paced, action-packed comic book-based adventure. But it's definitely more age-appropriate for teens than younger children. Although much of the violence is clearly meant to be based in the realm of sci-fi and fantasy -- and/or is shown at a distance -- there's plenty of it, from massive explosions to children held at gunpoint to super-powered fistfights. Some of the violence is war themed, and some characters get hurt and/or die. While much is made of lead character Tony Stark's devil-may-care lifestyle of fun and frolic, viewers also see him turn away from the more irresponsible aspects of playboyhood. Language is minimal, and sexual content is more suggested than shown overall -- though there are a few eyebrow-raising moments.'''

    reviews = [to_review1, to_review2, to_review3]

    for to_review in reviews:
        to_review_words = to_review.split(" ")
        print "Reviewing", to_review, "\n\n\n"

        print ''' Normal classification ''', "\n\n"
        negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        calculateScore(classification(negfeats, posfeats, 1, 1), to_review_words)
        calculateScore(classification(negfeats, posfeats, 1, 0.95), to_review_words)
        calculateScore(classification(negfeats, posfeats, 0.95, 1), to_review_words)
        calculateScore(classification(negfeats, posfeats, 0.9, 1), to_review_words)
        calculateScore(classification(negfeats, posfeats, 1, 0.9), to_review_words)

        print ''' Without Punctuations ''', "\n\n"
        # Use the punctuation-stripped features for this variant (the original
        # computed them but then passed the plain features to classification()).
        negfeats_punct = [(word_feats_punctuations(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats_punct = [(word_feats_punctuations(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 1), to_review_words)
        calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 0.95), to_review_words)
        calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 0.95, 1), to_review_words)
        calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 0.9, 1), to_review_words)
        calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 0.9), to_review_words)

        print ''' Without Stop Words ''', "\n\n"
        negfeats_stopwords = [(word_feats_stopwords(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats_stopwords = [(word_feats_stopwords(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        wordstoreview = []
        for each in to_review_words:
            if each not in stopwords.words('english'):
                wordstoreview.append(each)
        calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 1), wordstoreview)
        calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 0.95), wordstoreview)
        calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 0.95, 1), wordstoreview)
        calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 0.9, 1), wordstoreview)
        calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 0.9), wordstoreview)

        print ''' With Lemmatizer ''', "\n\n"
        negfeats_lemma = [(word_feats_lemmatize(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats_lemma = [(word_feats_lemmatize(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 1), to_review_words)
        calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 0.95), to_review_words)
        calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 0.95, 1), to_review_words)
        calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 0.9, 1), to_review_words)
        calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 0.9), to_review_words)
def __init__(self, train1=True, train2=True, train3=True, train4=True):
    self.trainfeats = []

    if train1:
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        neg_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        pos_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        self.trainfeats = neg_movies + pos_movies

    if train2:
        f = open("out.txt", "r")
        negfeats = []
        posfeats = []
        for line in f:
            status = line[0]
            texto = line[2:]
            if status == '0':
                negfeats.append((self.word_feats(texto.split(" ")), 'neg'))
            elif status == '1':
                posfeats.append((self.word_feats(texto.split(" ")), 'pos'))
        self.trainfeats += negfeats + posfeats

    if train3:
        f = open("E:\\Workspace\\WS_TG\\analisador1\\AFINN\\AFINN-111.txt", 'r')
        for l in f:
            data = l.strip().split('\t')
            # Split the AFINN term into words; passing the raw string would
            # featurize individual characters.
            self.trainfeats.append((self.word_feats(data[0].split()),
                                    'neg' if int(data[1]) < 0 else 'pos'))

    if train4:
        f = open("E:\\Workspace\\WS_TG\\api\\trainning set.txt", 'r')
        pos = []
        neutral = []
        neg = []
        for line in f:
            if line.startswith("pos"):
                pos.append(line)
            elif line.startswith("neutral"):
                neutral.append(line)
            elif line.startswith("neg"):
                neg.append(line)
        print len(pos), len(neutral), len(neg)

        total = pos + neutral[:200] + neg
        for line in total:
            data = line.split(' .:. ')
            self.trainfeats.append((self.word_feats(data[1].split()), data[0]))

    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    # show_most_informative_features() prints directly and returns None.
    self.classifier.show_most_informative_features(20)
def setup_demo(lower):
    print 'running movie reviews demo. data dir: ', nltk_movie_reviews_data_root

    negative_reviews = map(lambda x: nltk_movie_reviews_data_root + x, movie_reviews.fileids('neg'))
    positive_reviews = map(lambda x: nltk_movie_reviews_data_root + x, movie_reviews.fileids('pos'))

    # Keep the polarity of each corpus consistent with its label (the
    # original assigned the negative reviews to `pos` and vice versa).
    pos = create_corpus_from_file_list(positive_reviews, "positive", None, None, lower)
    neg = create_corpus_from_file_list(negative_reviews, "negative", None, None, lower)
    pos_bigrams = create_corpus_from_file_list(positive_reviews, "positive", None, None, lower, wordlist_to_bigrams_dict)
    neg_bigrams = create_corpus_from_file_list(negative_reviews, "negative", None, None, lower, wordlist_to_bigrams_dict)

    return (pos, neg, pos_bigrams, neg_bigrams)
def __init__(self, load = False, loadFile = ""): if(load): self.loadClassifier(loadFile) else: negids = movie_reviews.fileids('neg') posids = movie_reviews.fileids('pos') negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in movie_reviews.fileids('neg')] posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in movie_reviews.fileids('pos')] trainfeats = negfeats + posfeats self.classifier = NaiveBayesClassifier.train(trainfeats)
def bins_svm_approach(llimit, ulimit, isphrase, pathname):
    posbinlist = []
    negbinlist = []
    trainingdata = []
    trainingclass = []
    bin_train_set = []
    totalcount = ulimit - llimit
    lpcount = 0
    cnt_var = 0

    print '\nNo of +ve reviews scanned for training : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        testbin = proximity_tagger.bin_list(movie_reviews.abspath(fid), isphrase, cnt_var, 0, pathname)
        posbinlist.append(testbin)
        lpcount += 1
        cnt_var += 1
        print 'Scanning +ve review ', lpcount, '.' * 10, (float(lpcount) * 100 / float(totalcount)), '%'

    lpcount = 0
    cnt_var = 0
    print '\nNo of -ve reviews scanned for training : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        testbin = proximity_tagger.bin_list(movie_reviews.abspath(fid), isphrase, cnt_var, 1, pathname)
        negbinlist.append(testbin)
        lpcount += 1
        cnt_var += 1
        print 'Scanning -ve review ', lpcount, '.' * 10, (float(lpcount) * 100 / float(totalcount)), '%'

    lpcount = 0
    totalcount = len(posbinlist)
    print '\nNo of +ve reviews trained : '
    trainingdata.extend(posbinlist)
    for i in range(totalcount):
        trainingclass.append(1)
        lpcount += 1
        print 'Training +ve review ', lpcount, '.' * 10, (float(lpcount) * 100 / float(totalcount)), '%'

    lpcount = 0
    totalcount = len(negbinlist)
    print '\nNo of -ve reviews trained : '
    trainingdata.extend(negbinlist)
    for i in range(totalcount):
        trainingclass.append(0)
        lpcount += 1
        print 'Training -ve review ', lpcount, '.' * 10, (float(lpcount) * 100 / float(totalcount)), '%'

    bin_train_set.append(trainingdata)
    bin_train_set.append(trainingclass)

    # Escape the backslash in the Windows-style path explicitly.
    f = open('train_result\\proximity_bin_train_result_' + str(isphrase), 'w')
    json.dump(bin_train_set, f)
    f.close()
def sort_files():
    """
    Interleave the negative and positive samples so the two classes
    alternate when the files are read in order.
    :return: files_list
    """
    neg_file_list = movie_reviews.fileids('neg')
    pos_file_list = movie_reviews.fileids('pos')
    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
    return files_list
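# NOTE (sketch): the returned list alternates classes, which a quick check
# makes visible (example fileids assume the standard corpus ordering):
files = sort_files()
print(files[:4])
# e.g. ['neg/cv000_29416.txt', 'pos/cv000_29590.txt',
#       'neg/cv001_19502.txt', 'pos/cv001_18431.txt']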
def train_classifiers(self):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    trainfeats = negfeats + posfeats

    # train naive bayes
    self.classifier = NaiveBayesClassifier.train(trainfeats)
def train(feature):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeatures = [(feature(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeatures = [(feature(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    trainfeatures = negfeatures + posfeatures
    classifier = NaiveBayesClassifier.train(trainfeatures)
    return classifier
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer
    and WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos')
                for pos_id in movie_reviews.fileids('pos')[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg')
                for neg_id in movie_reviews.fileids('neg')[:n_instances]]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs + train_neg_docs
    testing_docs = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')

    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
                        Instances=n_instances)
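# NOTE (sketch): `trainer` is the (unbound) train method of an NLTK
# classifier; a minimal invocation might look like this, with n_instances
# split evenly between the classes and results appended to a markdown file.
from nltk.classify import NaiveBayesClassifier
demo_movie_reviews(NaiveBayesClassifier.train, n_instances=200, output='results.md')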
def train(self, feats):
    print "Starting to train the data"
    start = datetime.datetime.now()

    print "setting the ids", datetime.datetime.now()
    self.negids = movie_reviews.fileids('neg')
    self.posids = movie_reviews.fileids('pos')
    #random.shuffle(self.negids)
    #random.shuffle(self.posids)

    ##self.reviews = ([(movie_reviews.words(fileids=[f]), 'neg') for f in self.negids] +
    ##                [(movie_reviews.words(fileids=[f]), 'pos') for f in self.posids])
    ##random.shuffle(self.reviews)
    ##self.train_set = apply_features(feats, self.reviews[len(self.reviews)*1/4:])
    ##self.test_set = apply_features(feats, self.reviews[:len(self.reviews)*1/4])

    print "setting the feats", datetime.datetime.now()
    self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg') for f in self.negids]
    self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos') for f in self.posids]

    self.negcutoff = len(self.negfeats) * 3 / 4
    self.poscutoff = len(self.posfeats) * 3 / 4

    print "setting the train/test", datetime.datetime.now()
    self.trainfeats = self.negfeats[:self.negcutoff] + self.posfeats[:self.poscutoff]
    self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[self.poscutoff:]

    print "training", datetime.datetime.now()
    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    ##self.classifier = NaiveBayesClassifier.train(self.train_set)

    self.refsets = defaultdict(set)
    self.testsets = defaultdict(set)

    print "accuracy stuff", datetime.datetime.now()
    for i, (feats, label) in enumerate(self.testfeats):
    ##for i, (feats, label) in enumerate(self.test_set):
        self.refsets[label].add(i)
        observed = self.classifier.classify(feats)
        self.testsets[observed].add(i)

    end = datetime.datetime.now()
    print "Training lasted for ", end - start

    print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.testfeats)
    ##print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.test_set)
    print 'pos precision:', nltk.metrics.precision(self.refsets['pos'], self.testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(self.refsets['pos'], self.testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(self.refsets['neg'], self.testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(self.refsets['neg'], self.testsets['neg'])

    self.classifier.show_most_informative_features()
    self.trained = True
def movieReviews(self, category, count):
    ret = []
    if category not in ('positive', 'negative'):
        return ret

    if category == 'positive':
        fileids = movie_reviews.fileids('pos')
    else:
        fileids = movie_reviews.fileids('neg')

    sampleFileIds = sample(fileids, count)
    for sampleFileId in sampleFileIds:
        ret.append(movie_reviews.raw(sampleFileId))

    return ret
def train():
    global classifier

    # Train our classifier
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(feature_extractor(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(feature_extractor(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    classifier = NaiveBayesClassifier.train(negfeats + posfeats)
def train_classifier(self):
    """This code is heavily inspired by:
    http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    trainfeats = negfeats + posfeats
    self.classifier = NaiveBayesClassifier.train(trainfeats)
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division keeps the cutoffs usable as slice indices.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)
    return classifier
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    count = 1500000
    lines = []
    english_stops = set(stopwords.words('english'))

    print ctime(), "Reading files..."
    f = open('Sentiment Analysis Dataset.csv', "rU")
    line = f.readline()  # skip the header row
    line = f.readline()
    negfeats = []
    posfeats = []
    for i in range(count):
        lines.append(line)
        line = f.readline()
    f.close()
    random.shuffle(lines)
    # Note: the shuffled CSV lines are collected but never used below; the
    # evaluation runs on the movie_reviews corpus only.

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = len(negfeats) * 3 / 4
    poscutoff = len(posfeats) * 3 / 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
def train_classifier(self):
    # Training
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(self.best_word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(self.best_word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division keeps the cutoffs usable as slice indices.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    self.sentiment_classifier = NaiveBayesClassifier.train(trainfeats)
def trainMovies():
    negids = movie_reviews.fileids('neg')
    print type(negids), negids
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    train = negfeats + posfeats

    classifier = NaiveBayesClassifier.train(train)

    f = open('movie_semtiment_classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
def nb_movierev():
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    trainfeats = negfeats + posfeats

    # A single hand-built test instance; len() here counts its features,
    # not test documents.
    testfeats = word_feats(my_tok('very good indeed', 1))
    print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print classifier.classify(testfeats)
def train_clf(method):
    negidxs = movie_reviews.fileids('neg')
    posidxs = movie_reviews.fileids('pos')

    if method == 'stopword_filtered_words_features':
        negfeatures = [(stopword_filtered_words_features(movie_reviews.words(fileids=[fid])), 'neg') for fid in negidxs]
        posfeatures = [(stopword_filtered_words_features(movie_reviews.words(fileids=[fid])), 'pos') for fid in posidxs]
    elif method == 'best_words_features':
        negfeatures = [(best_words_features(movie_reviews.words(fileids=[fid])), 'neg') for fid in negidxs]
        posfeatures = [(best_words_features(movie_reviews.words(fileids=[fid])), 'pos') for fid in posidxs]
    elif method == 'best_bigrams_words_features':
        negfeatures = [(best_bigrams_words_features(movie_reviews.words(fileids=[fid])), 'neg') for fid in negidxs]
        posfeatures = [(best_bigrams_words_features(movie_reviews.words(fileids=[fid])), 'pos') for fid in posidxs]

    trainfeatures = negfeatures + posfeatures
    clf = NaiveBayesClassifier.train(trainfeatures)
    return clf
    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append("Classified as: {}".format(model.predict([text])))
        output.append("")

    # Create two columns with most negative and most positive features.
    for (cp, fnp), (cn, fnn) in topn:
        output.append("{:0.4f}{: >15} {:0.4f}{: >15}".format(cp, fnp, cn, fnn))

    return "\n".join(output)


if __name__ == "__main__":
    PATH = "model.pickle"

    if not os.path.exists(PATH):
        # Time to build the model
        from nltk.corpus import movie_reviews as reviews

        X = [reviews.raw(fileid) for fileid in reviews.fileids()]
        y = [reviews.categories(fileid)[0] for fileid in reviews.fileids()]

        model = build_and_evaluate(X, y, outpath=PATH)
    else:
        with open(PATH, 'rb') as f:
            model = pickle.load(f)

    print(show_most_informative_features(model))
from nltk.corpus import movie_reviews
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.util import pad_sequence
import nltk
from nltk import FreqDist
import math
from random import shuffle
from nltk.metrics import precision, recall, f_measure
from nltk.corpus import stopwords
import collections
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier

document = [(movie_reviews.words(file), category)
            for file in movie_reviews.fileids()
            for category in movie_reviews.categories(file)]

# Randomize the document files so that the data doesn't bias.
shuffle(document)

userInput = []
for input in open("classifyUserInput.txt"):
    userInput.append(word_tokenize(input))

wordFreq = FreqDist(movie_reviews.words())
frequent_words_non_filtered = list(wordFreq)
# Use this line instead of the prior to only select the top 5000 words.
# It's much faster than using all words:
# frequent_words_non_filtered = list(wordFreq)[:5000]
frequent_words_list = [word for word in frequent_words_non_filtered
                       if word not in stopwords.words('english')]

# Finds features from the list of words and places them in a dictionary.
# (Body completed from the standard document_features pattern; the original
# snippet was truncated after the for line.)
def find_freq_words(word_list):
    words_dict = {}
    for x in frequent_words_list:
        words_dict[x] = (x in word_list)
    return words_dict
def pprocess_induction(self):
    # Get the combined vocabulary for the pos and neg classes.
    all_reviews = (self.clean_movie_reviews(movie_reviews.words(fileid))
                   for fileid in movie_reviews.fileids())

    if self.kwargs.get('stopwords', False):
        print("***** Stopwords kept *****")
        c_vocab_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
    else:
        print("***** Stopwords removed *****")
        c_vocab_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')

    c_tfidf = c_vocab_vect.fit_transform([doc for doc in all_reviews])
    self.c_vocab = c_vocab_vect.get_feature_names()

    if not self.kwargs['train']:
        # Only get the combined vocabulary; no need to do the tf-idf logic.
        return
    else:
        print("***** Training *****")
        # Segment the movie reviews into pos and neg and call the clean function.
        pos_documents = (self.clean_movie_reviews(movie_reviews.words(fileid))
                         for fileid in movie_reviews.fileids('pos'))
        neg_documents = (self.clean_movie_reviews(movie_reviews.words(fileid))
                         for fileid in movie_reviews.fileids('neg'))

        # Fit and transform the documents to a tf-idf matrix using the combined vocabulary.
        if self.kwargs.get('stopwords', False):
            print("***** Stopwords kept *****")
            tfidf_pos_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                                             smooth_idf=True, vocabulary=self.c_vocab)
            tfidf_neg_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                                             smooth_idf=True, vocabulary=self.c_vocab)
        else:
            print("***** Stopwords removed *****")
            tfidf_pos_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english',
                                             smooth_idf=True, vocabulary=self.c_vocab)
            tfidf_neg_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english',
                                             smooth_idf=True, vocabulary=self.c_vocab)

        pos_tfidf = tfidf_pos_vect.fit_transform([doc for doc in pos_documents])
        neg_tfidf = tfidf_neg_vect.fit_transform([doc for doc in neg_documents])

        # with open('vocab', 'wb') as f:
        #     pickle.dump(c_vocab, f)

        # Combine the pos and neg arrays into one sparse matrix (0 = pos, 1 = neg).
        pos_array = self.create_array(pos_tfidf, 0)
        neg_array = self.create_array(neg_tfidf, 1)
        self.combined_array = np.concatenate((pos_array, neg_array), axis=0)
        # print(self.combined_array.shape)
        # print(combined_array[:1000, -1])  # should print all 0, pos reviews
        # print(combined_array[1000:, -1])  # should print all 1, neg reviews
        # print(f"pp_induct: {len(self.c_vocab)}")
        return self.combined_array
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

def extract_features(word_list):
    return dict([(word, True) for word in word_list])

if __name__ == '__main__':
    # Load positive and negative reviews
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')

    features_positive = [(extract_features(movie_reviews.words(fileids=[f])), 'Positive')
                         for f in positive_fileids]
    features_negative = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative')
                         for f in negative_fileids]

    # Split the data into train and test (80/20)
    threshold_factor = 0.8
    threshold_positive = int(threshold_factor * len(features_positive))
    threshold_negative = int(threshold_factor * len(features_negative))

    features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
    features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]
    print("\nNumber of training datapoints:", len(features_train))
    print("Number of test datapoints:", len(features_test))
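    # NOTE (sketch): the snippet stops after the split even though the
    # classifier modules are already imported; a plausible continuation:
    classifier = NaiveBayesClassifier.train(features_train)
    print("Accuracy of the classifier:",
          nltk.classify.util.accuracy(classifier, features_test))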
eng_sw = stopwords.words('english') + list(punctuation)

"""
$$ Here we use the movie_reviews dataset from the NLTK library, downloaded at
the beginning from the corpora section. Check where you downloaded the NLTK
data: there are 1000 negative and 1000 positive reviews, organised into two
folders named neg and pos.
"""

all_data_1 = []
all_data_1 = [(list(movie_reviews.words(file_name)), folder_name)
              for folder_name in movie_reviews.categories()
              for file_name in movie_reviews.fileids(folder_name)]

"""
$$ The one-liner above may not read naturally at first, so the same code is
broken up into a regular loop below; see the verification sketch after it.
"""

all_data = []
for folder_name in movie_reviews.categories():
    for file_name in movie_reviews.fileids(folder_name):
        all_data.append((list(movie_reviews.words(file_name)), folder_name))

"""
$$ The first for loop iterates through the subfolders (categories) of the
movie_reviews data under nltk.corpora...
"""
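# NOTE (sketch): quick check that the one-liner and the loop agree; the
# label is the folder name ('neg' or 'pos').
print(len(all_data), len(all_data_1))       # both should be 2000
print(all_data[0][1], all_data[0][0][:10])  # label and first ten tokens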
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

# This takes most of the algorithm's time.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
from nltk.corpus import movie_reviews
import keyword_extractor

for fileid in movie_reviews.fileids():
    words = movie_reviews.words(fileid)
    print words
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 15 14:50:27 2018

@author: sriniv11
"""
from nltk.corpus import movie_reviews

# Total reviews
print(len(movie_reviews.fileids()))  # Output: 2000

# Review categories
print(movie_reviews.categories())  # Output: [u'neg', u'pos']

# Total positive reviews
print(len(movie_reviews.fileids('pos')))  # Output: 1000

# Total negative reviews
print(len(movie_reviews.fileids('neg')))  # Output: 1000

positive_review_file = movie_reviews.fileids('pos')[0]
print(positive_review_file)  # Output: pos/cv000_29590.txt
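# NOTE (sketch): each fileid can also be read directly; raw() returns the
# review as a single string, words() returns it tokenized.
print(movie_reviews.raw(positive_review_file)[:100])   # first 100 characters
print(movie_reviews.words(positive_review_file)[:10])  # first 10 tokens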
def save_labelled_vectors(self):
    self.load_vocab()
    fileids = movie_reviews.fileids()
    random.shuffle(fileids)
    self.save_labelled_vectors_type(DataType.TRAIN, fileids)
    self.save_labelled_vectors_type(DataType.TEST, fileids)
## One-liner version of the loop below:
# documents = [(list(movie_reviews.words(fileid)), category)
#              for category in movie_reviews.categories()
#              for fileid in movie_reviews.fileids(category)]

# documents = []
# for category in movie_reviews.categories():
#     for fileid in movie_reviews.fileids(category):
#         documents.append((list(movie_reviews.words(fileid)), category))

# random.shuffle(documents)
# print(documents[1])

for i in mr.fileids():
    documents[i.split('/')[0]].append(i)

random.shuffle(documents['pos'])
random.shuffle(documents['neg'])

#print(documents['pos'][:10])  # first ten pos reviews
#print(documents['neg'][:10])  # first ten neg reviews

documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0])
             for i in mr.fileids()]

random.shuffle(documents)
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# Function to extract features
def extract_features(words):
    return dict([(word, True) for word in words])

if __name__ == '__main__':
    # Loading reviews from the corpus
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')

    # Extracting features from reviews
    features_pos = [(extract_features(movie_reviews.words(fileids=[f])), 'Positive')
                    for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative')
                    for f in fileids_neg]

    # Defining the train/test split:
    # 80% for training and 20% for testing
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # Creating training and testing datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]
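    # NOTE (sketch): the snippet ends at the split, but nltk_accuracy is
    # already imported; a plausible continuation trains and evaluates:
    classifier = NaiveBayesClassifier.train(features_train)
    print('Accuracy of the classifier:', nltk_accuracy(classifier, features_test))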
def __init__(self):
    self.documents = [(list(movie_reviews.words(fileid)), category)
                      for category in movie_reviews.categories()
                      for fileid in movie_reviews.fileids(category)]
from nltk.corpus import movie_reviews  # needed below; missing from the original imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import collections
from sklearn.svm import LinearSVC, SVC
import random

def create_word_features(words):
    my_dict = dict([(word, True) for word in words])
    return my_dict

print('---------------------------------------------------------------------------')
print('WELCOME TO SENTIMENTAL ANALYSIS OF ONLINE IMDB MOVIE REVIEWS ')
print('---------------------------------------------------------------------------')
print(' DATASET ')

neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append((create_word_features(words), "negative"))
#print(neg_reviews[0])
print('length of negative reviews')
print(len(neg_reviews))

pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append((create_word_features(words), "positive"))
#print(pos_reviews[0])
print('length of positive reviews')
print(len(pos_reviews))
from nltk.corpus import movie_reviews

# Total reviews
print(len(movie_reviews.fileids()))

# Review categories
print(movie_reviews.categories())

# Total positive reviews
print(len(movie_reviews.fileids('pos')))

# Total negative reviews
print(len(movie_reviews.fileids('neg')))

positive_review_file = movie_reviews.fileids('pos')[0]
print(positive_review_file)

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

print(len(documents))

# print first tuple
print(documents[0])

from random import shuffle
shuffle(documents)
def word_feats(words):
    return {word: True for word in words}

def get_combined_features():
    with open(os.path.join('training_data', 'combined.txt')) as f:
        sentences = []
        for line in f:
            sentiment, sentence = line.split('\t')
            tokens = word_tokenize(sentence)
            sentences.append((word_feats(tokens), sentiment[:3]))
        return sentences

print("Finding ids for positive and negative reviews")
negids = reviews.fileids('neg')
posids = reviews.fileids('pos')

print("Creating feature sets")
negfeats = [(word_feats(reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(reviews.words(fileids=[f])), 'pos') for f in posids]
mixfeats = get_combined_features()

print("Calculating cutoffs")
negcutoff = int(len(negfeats) * 9 / 10)
poscutoff = int(len(posfeats) * 9 / 10)
mixcutoff = int(len(mixfeats) * 9 / 10)

print("Creating training set")
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + mixfeats[:mixcutoff]

print("Creating test set")
import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string

labeled_docs = [(list(movie_reviews.words(fid)), cat)
                for cat in movie_reviews.categories()
                for fid in movie_reviews.fileids(cat)]
random.seed(42)
random.shuffle(labeled_docs)

review_words = movie_reviews.words()
print "# Review Words", len(review_words)

sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def isStopWord(word):
    return word in sw or word in punctuation

filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]
print "# After filter", len(filtered)

words = FreqDist(filtered)
N = int(.05 * len(words.keys()))
word_features = words.keys()[:N]
from nltk.corpus import movie_reviews

# Total reviews
print(len(movie_reviews.fileids()))  # Output: 2000

# Review categories
print(movie_reviews.categories())  # Output: [u'neg', u'pos']

# Total positive reviews
print(len(movie_reviews.fileids('pos')))  # Output: 1000

# Total negative reviews
print(len(movie_reviews.fileids('neg')))  # Output: 1000

positive_review_file = movie_reviews.fileids('pos')[0]
print(positive_review_file)  # Output: pos/cv000_29590.txt

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        #documents.append((list(movie_reviews.words(fileid)), category))
        documents.append((movie_reviews.words(fileid), category))

print(len(documents))  # Output: 2000

# x = [str(item) for item in documents[0][0]]
# print(x)

# print first tuple
print(documents[0])
import numpy as np  # moved up: np is used before the original mid-file import

model = lda.LDA(n_topics=10, n_iter=500)
model.fit(sentences_train)  # Fit the model

n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vect.get_feature_names())[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ', '.join(topic_words)))

'''
EXAMPLE: Automatically summarize a document
'''

# corpus of 2000 movie reviews
from nltk.corpus import movie_reviews
reviews = [movie_reviews.raw(filename) for filename in movie_reviews.fileids()]

# create document-term matrix
tfidf = TfidfVectorizer(stop_words='english')
dtm = tfidf.fit_transform(reviews)
features = tfidf.get_feature_names()

# find the most and least "interesting" sentences in a randomly selected review
def summarize():
    # choose a random movie review
    review_id = np.random.randint(0, len(reviews))
    review_text = reviews[review_id]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print((nltk.classify.accuracy(classifier, devtest_set)))

#%%
errors = []  # collection of misclassified examples
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))
for (tag, guess, name) in sorted(errors):  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

#%%
from nltk.corpus import movie_reviews
import random
import nltk

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]  # erratum: keys() needs list() in Python 3
print(word_features)

#%%
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

#%%
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# Function to extract features from the input list
def extract_features(words):
    return dict([(word, True) for word in words])

if __name__ == "__main__":
    # Load the reviews
    fileids_pos = movie_reviews.fileids("pos")
    fileids_neg = movie_reviews.fileids("neg")

    # Extract the features from the reviews
    features_pos = [(extract_features(movie_reviews.words(fileids=[f])), "Positive")
                    for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(fileids=[f])), "Negative")
                    for f in fileids_neg]

    # Define the train and test split
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # Create training and testing datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]
import nltk  # needed below; missing from the original imports
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk import word_tokenize

posts = nltk.corpus.nps_chat.xml_posts()
featuresets = [nltk.pos_tag(word_tokenize(post.text)) for post in posts]

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(featuresets, backoff=t0)
t2 = nltk.BigramTagger(featuresets, backoff=t1)

##text = word_tokenize("I am good")
##print(t2.tag(text))
##print(text)

from nltk.corpus import movie_reviews as movies

pos_docs = movies.fileids('pos')
neg_docs = movies.fileids('neg')

classifier_training = []

for doc in pos_docs:
    sents = movies.sents(doc)
    for sent in sents:
        tagged = t2.tag(sent)
        words = [w for w, k in tagged]
        tags = [k for w, k in tagged]
        feature = {}
        for i in range(len(words) - 1):
            feature[words[i] + ' ' + words[i + 1]] = tags[i] + ' ' + tags[i + 1]
        temp = (feature, 'pos')
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

def collect_features(word_list):
    return dict([(word, True) for word in word_list])

if __name__ == '__main__':
    plus_filenum = movie_reviews.fileids('pos')
    minus_filenum = movie_reviews.fileids('neg')

    feature_pluspts = [(collect_features(movie_reviews.words(fileids=[f])), 'Positive')
                       for f in plus_filenum]
    feature_minuspts = [(collect_features(movie_reviews.words(fileids=[f])), 'Negative')
                        for f in minus_filenum]

    threshold_fact = 0.8
    threshold_pluspts = int(threshold_fact * len(feature_pluspts))
    threshold_minuspts = int(threshold_fact * len(feature_minuspts))

    feature_training = feature_pluspts[:threshold_pluspts] + feature_minuspts[:threshold_minuspts]
    feature_testing = feature_pluspts[threshold_pluspts:] + feature_minuspts[threshold_minuspts:]
    print "\nNumber of training datapoints:", len(feature_training)
    print "Number of test datapoints:", len(feature_testing)

    # Train a Naive Bayes classifier
    classifiers = NaiveBayesClassifier.train(feature_training)
    print "\nAccuracy of the classifier:", nltk.classify.util.accuracy(classifiers, feature_testing)
def main():
    DOC_SIZE = 500
    TRAIN_SIZE = int(DOC_SIZE * 0.90)

    pos_files = mr.fileids(categories='pos')[:DOC_SIZE]
    neg_files = mr.fileids(categories='neg')[:DOC_SIZE]

    sources = {
        'train_pos': pos_files[:TRAIN_SIZE],
        'train_neg': neg_files[:TRAIN_SIZE],
        'test_pos': pos_files[TRAIN_SIZE:],
        'test_neg': neg_files[TRAIN_SIZE:]
    }

    # Tag documents in format:
    # TaggedDocument(words=['word_1', 'word_2', 'word_n'], tags=['pos'])
    corpus = label_docs(sources)

    doc2vecModel = Doc2Vec(documents=corpus, min_count=1, window=10,
                           vector_size=100, workers=7, sample=1e-4, negative=5)

    for epoch in range(10):
        random.shuffle(corpus)
        doc2vecModel.train(corpus, total_examples=doc2vecModel.corpus_count,
                           epochs=doc2vecModel.epochs)

    # Set up the training dataset. Indexing the model by tag relies on an
    # older gensim API; recent versions expose document vectors via
    # doc2vecModel.dv instead.
    x_train = numpy.zeros((TRAIN_SIZE * 2, 100))
    y_train = numpy.zeros(TRAIN_SIZE * 2)
    for i in range(TRAIN_SIZE):
        x_train[i] = doc2vecModel['train_pos_' + str(i)]
        x_train[TRAIN_SIZE + i] = doc2vecModel['train_neg_' + str(i)]
        y_train[i] = 1
        y_train[TRAIN_SIZE + i] = 0

    # Set up the testing dataset.
    TEST_SIZE = DOC_SIZE - TRAIN_SIZE
    x_test = numpy.zeros((TEST_SIZE * 2, 100))
    y_test = numpy.zeros(TEST_SIZE * 2,)
    for i in range(TEST_SIZE):
        x_test[i] = doc2vecModel['test_pos_' + str(i)]
        x_test[TEST_SIZE + i] = doc2vecModel['test_neg_' + str(i)]
        y_test[i] = 1
        y_test[TEST_SIZE + i] = 0

    # Classification
    classifier = MLPClassifier(hidden_layer_sizes=(5, 10))
    classifier.fit(x_train, y_train)

    print()
    print('Corpus Size: {}'.format(len(pos_files + neg_files)))
    print('Training Size:\n\tpositive: {}\tnegative: {}'.format(len(sources['train_pos']), len(sources['train_neg'])))
    print('Testing Size:\n\tpositive: {}\tnegative: {}'.format(len(sources['test_pos']), len(sources['test_neg'])))
    print()

    # Mean accuracy of the model using the test dataset
    print("Accuracy Score:", classifier.score(x_test, y_test))

    # Sample predictions
    print('Predicting movie reviews using Neural Network classifier:')
    print()
    # print(len(classifier.predict(x_test)))
    # random.shuffle(x_test)
    # test_datasets = x_test
    # test_labels = y_test
    pred_list = classifier.predict(x_test)
    cat = {0: 'NEG', 1: 'POS'}
    # print(len(x_test))
    for i in range(10):
        r = random.randint(0, len(x_test) - 1)
        print('Test doc: {}\t\tactual class: {}\t\tprediction: {}\t\t accurate?: {}'
              .format(r, cat[int(y_test[r])], cat[int(pred_list[r])], y_test[r] == pred_list[r]))
from scipy.sparse import find
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer  # needed below; missing from the original imports
import numpy as np
import nltk

# Download the nltk modules you need. You only need to run these lines once.
nltk.download('movie_reviews')
nltk.download('punkt')

from nltk.corpus import movie_reviews

# Description of data set
print(movie_reviews.readme())

# Prepare document set for stemming
movie_docs = [' '.join(movie_reviews.words(fileid)) for fileid in movie_reviews.fileids()]

# Define function for tokenizing documents
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        item_lower = item.lower()
        stems.append(nltk.PorterStemmer().stem(item_lower))
    return stems

# Build TF-IDF matrix
tfidf = TfidfVectorizer(tokenizer=tokenize)
movie_tfidf = tfidf.fit_transform(movie_docs)

# Examine non-zero entries in TF-IDF matrix
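# NOTE (sketch): scipy.sparse.find (imported above) returns the row indices,
# column indices and values of the non-zero entries, e.g. for document 0:
rows, cols, vals = find(movie_tfidf[0])
print(cols[:10], vals[:10])  # first few non-zero term indices and tf-idf weights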
Python 3.8.2 (tags/v3.8.2:7b3ab59, Feb 25 2020, 23:03:10) [MSC v.1916 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license()" for more information.
>>> from nltk.corpus import movie_reviews
>>> print(len(movie_reviews.fileids()))  # total reviews
2000
>>> print(movie_reviews.categories())  # review categories
['neg', 'pos']
>>> print(len(movie_reviews.fileids('pos')))  # pos reviews
1000
>>> print(len(movie_reviews.fileids('neg')))  # neg reviews
1000
>>> positive_review_file = movie_reviews.fileids('pos')[0]
>>> print(positive_review_file)
pos/cv000_29590.txt
>>> documents = []  # build a (words, category) pair for every review
>>> for category in movie_reviews.categories():
	for fileid in movie_reviews.fileids(category):
		documents.append((movie_reviews.words(fileid), category))

>>> print(len(documents))
2000
>>> x = [str(item) for item in documents[0][0]]
>>> print(x)
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', ...]
>>> print(documents[0])
(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')
>>> from random import shuffle
>>> shuffle(documents)  # shuffle so positive and negative reviews are interleaved
>>>
>>> # Feature Extraction
import random
from nltk.corpus import movie_reviews

def text_classification():
    # build a (token list, category) pair for every review, then shuffle
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    print(documents[1])
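All the snippets here assume the movie_reviews corpus is already installed locally; if it is not, it can be fetched once with nltk.download. A minimal usage sketch:

import nltk

nltk.download('movie_reviews')  # one-time corpus download
text_classification()           # prints one shuffled (token list, category) pair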
import random
import string
import nltk
from nltk.corpus import movie_reviews as rev
from nltk.corpus import stopwords

# Development-set and test-set accuracy from the previous question
# (buildclass, devclass and testclass are defined earlier in the assignment)
devacc = nltk.classify.accuracy(buildclass, devclass)    # 0.77
testacc = nltk.classify.accuracy(buildclass, testclass)  # 0.794

# Question 4
# Create movie review documents, dropping stopwords and punctuation
stop = stopwords.words('english')
documents = [([w for w in rev.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0])
             for i in rev.fileids()]
random.shuffle(documents)

# FreqDist of all words; keep the 2000 most frequent as the feature vocabulary
# (most_common guarantees frequency order; plain iteration order does not)
Total_words = nltk.FreqDist(w.lower() for w in rev.words())
wf = [w for w, _ in Total_words.most_common(2000)]

# Classification by feature extraction
def document_features(document):
    document_words = set(document)
    features = {}
    for word in wf:
        features['contains({})'.format(word)] = (word in document_words)
    return features
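The fragment stops after defining the feature extractor. A sketch of the training and evaluation step that would normally follow; the 100-document test split is an assumption carried over from the other snippets in this collection.

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)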
import string
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import word_tokenize
# ----------------------------------------------------------------------------------------------------------------------
punctuations = list(string.punctuation)

# Print the classification problem being evaluated
print('Maximum Entropy classifier accuracy by removing punctuations:')

def word_feats(words):
    return dict([(word, True) for word in words])

# Get the negative and positive review ids from the movie reviews corpus.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# Mark each review's words as positive or negative features.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Set a cutoff separating the training data from the testing data
# (integer division, so the result can be used as a slice index).
negcutoff = len(negfeats) * 9 // 10
poscutoff = len(posfeats) * 9 // 10

# Based on the cutoff, fill the training and testing sets from the
# respective positive and negative feature lists.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
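The fragment above declares punctuations but never applies it, and the Maxent training step the printout promises is missing. A hedged sketch of how both could continue, using NLTK's MaxentClassifier; the word_feats_nopunct helper and the max_iter value are illustrative, not part of the original.

import nltk
from nltk.classify import MaxentClassifier

# drop punctuation tokens before building features, as the printout promises
def word_feats_nopunct(words):
    return {word: True for word in words if word not in punctuations}

negfeats_np = [(word_feats_nopunct(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats_np = [(word_feats_nopunct(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
trainfeats_np = negfeats_np[:negcutoff] + posfeats_np[:poscutoff]
testfeats_np = negfeats_np[negcutoff:] + posfeats_np[poscutoff:]

# a small iteration cap keeps training time reasonable for a demo
classifier = MaxentClassifier.train(trainfeats_np, trace=0, max_iter=10)
print(nltk.classify.accuracy(classifier, testfeats_np))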
import nltk
import random
from nltk.corpus import movie_reviews

# build a (token list, category) pair for every review, then shuffle
docs = [(list(movie_reviews.words(fid)), cat)
        for cat in movie_reviews.categories()
        for fid in movie_reviews.fileids(cat)]
random.shuffle(docs)

# take the 2000 most frequent tokens as the feature vocabulary
# (most_common guarantees frequency order; plain dict order does not)
all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words())
token_features = [tok for tok, _ in all_tokens.most_common(2000)]
print(token_features[:100])

def doc_features(doc):
    # binary "contains(word)" features over the fixed vocabulary
    doc_words = set(doc)
    features = {}
    for word in token_features:
        features['contains(%s)' % word] = (word in doc_words)
    return features

print(doc_features(movie_reviews.words('pos/cv957_8737.txt')))

feature_sets = [(doc_features(d), c) for (d, c) in docs]
train_sets, test_sets = feature_sets[100:], feature_sets[:100]

classifiers = nltk.NaiveBayesClassifier.train(train_sets)
print(nltk.classify.accuracy(classifiers, test_sets))
classifiers.show_most_informative_features(5)
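Once trained, the classifier can label unseen text as well. A minimal sketch using word_tokenize and the doc_features extractor above; the review string is invented for illustration.

from nltk import word_tokenize

review = "a surprisingly sharp script and two wonderful lead performances"  # hypothetical input
tokens = word_tokenize(review.lower())
print(classifiers.classify(doc_features(tokens)))  # prints 'pos' or 'neg'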