Example #1
    def train_with_movie_db(self):
        """
        Training possible with movie reviews
        - this does not yield particularly good results
        """
        self.use_movie_reviews = True

        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "negative") for f in negids]
        posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                     "positive") for f in posids]

        negcutoff = int(len(negfeats) * 3 / 4)
        poscutoff = int(len(posfeats) * 3 / 4)

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

        self.classifier = NaiveBayesClassifier.train(trainfeats)

        DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
        DLOG(self.classifier.show_most_informative_features())
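The feature_extraction_movie_reviews helper is not shown in this snippet; a minimal sketch, assuming the usual bag-of-words extractor (the same shape as the word_feats helper most of the examples below rely on):

def feature_extraction_movie_reviews(words):
    # Hypothetical stand-in: map every token to True, the dict-of-features
    # format that nltk's NaiveBayesClassifier.train expects.
    return dict((word, True) for word in words)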
Example #2
def main(argv):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    #print negids
 
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids]

    trainfeats = posfeats + negfeats
    #print trainfeats
    #    break
    classifier = NaiveBayesClassifier.train(trainfeats)

    #classifier = pickle.load(open("classifier.p", "rb"))
    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]
    for line in sys.stdin:
        try:
            tolk_posset = word_tokenize(line.rstrip())
            d = word_feats(tolk_posset)
            for topic in topicList:
                subjectFull = subj(line, topic)
                if not subjectFull == "No match":
                    #print d
                    print "LongValueSum:" + "" + str(line.split(":")[0])+","+subjectFull + "," + classifier.classify(d) + "\t" + "1"                    
        except:
                #print "Error"
                continue
Example #3
def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, test_set)
Example #4
    def prep_reviews_data(self): # messy code to test classifier with movie reviews
        if not self.movie_review_data:
            print 'Preparing movie reviews...\n'
            from nltk.corpus import movie_reviews
            docs = [movie_reviews.raw(fileid) 
                    for category in movie_reviews.categories() 
                    for fileid in movie_reviews.fileids(category)]

            process = lambda x: 1 if x == 'pos' else -1
            labels = [process(category)
                    for category in movie_reviews.categories() 
                    for fileid in movie_reviews.fileids(category)]

            docs, labels = double_shuffle(docs, labels)
            training, testing = divide_list_by_ratio(docs)
            self.train_labs, self.test_labs = divide_list_by_ratio(labels)

            train_vecs = self.vectorizer.fit_transform(training)
            test_vecs = self.vectorizer.transform(testing)

            if isinstance(self.model, naive_bayes.GaussianNB):
                train_vecs = train_vecs.toarray()
                test_vecs = test_vecs.toarray()

            self.train_vecs = train_vecs
            self.test_vecs = test_vecs

            self.movie_review_data = True
            self.news_market_data = False
Example #5
def load_data():
    global posfeats, negfeats
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    return
Example #6
def maketrainset(movie_reviews, tokenizer, stemmer):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(tokenizer(movie_reviews.words(fileids=[f]), stemmer), 'neg') for f in negids]
    posfeats = [(tokenizer(movie_reviews.words(fileids=[f]), stemmer), 'pos') for f in posids]
    trainfeats = negfeats + posfeats
    return trainfeats
Example #7
def main():
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    classifier = NaiveBayesClassifier.train(trainfeats)

    with open("output.json") as fin:
        sid = SentimentIntensityAnalyzer()
        data = json.load(fin)
    for key in data:
        reviews = data[key]["reviews"]
        for i in range(len(reviews)):
            text = reviews[i]["review"]
            sentiment_dict = {'positive_probability':0, 'label':'', 'negative_probability':0}
            prob = classifier.prob_classify(word_feats(text.split(" ")))
            classification = classifier.classify(word_feats(text.split(" ")))
            sentiment_dict['positive_probability'] = prob.prob('pos')
            sentiment_dict['negative_probability'] = prob.prob('neg')
            sentiment_dict['label'] = classification
            reviews[i]["sentiment"] = sentiment_dict
        data[key]["reviews"] = reviews
    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
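For reference, the loop above assumes output.json has roughly this shape (shown as the equivalent Python structure; the keys are inferred from the accesses data[key]["reviews"][i]["review"]):

# Hypothetical shape of output.json, inferred from the code above.
data = {
    "some_movie": {
        "reviews": [
            {"review": "very good indeed"},
            {"review": "a complete waste of time"},
        ]
    }
}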
Example #8
def train(test=False):

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')


    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]


    if(test):
        negcutoff = len(negfeats)*3/4
        poscutoff = len(posfeats)*3/4

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

        classifier = NaiveBayesClassifier.train(trainfeats)
        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)

        classifier.show_most_informative_features()

    else:
        return NaiveBayesClassifier.train(negfeats+posfeats)
Example #9
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
 
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
Example #10
def median_approach(llimit,ulimit,isphrase,pathname):

    posmedlist=[]
    negmedlist=[]
    medians=[]

    lpcount=0
    totalcount=ulimit-llimit
    cnt_var=0
    print '\nNo of +ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        testmed=proximity_tagger.medianlist(movie_reviews.abspath(fid),isphrase,cnt_var,0,pathname)
        posmedlist.append(testmed)
        lpcount=lpcount+1
        cnt_var+=1
        print 'Training +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    lpcount=0
    cnt_var=0
    print '\nNo of -ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        testmed=proximity_tagger.medianlist(movie_reviews.abspath(fid),isphrase,cnt_var,1,pathname)
        negmedlist.append(testmed)
        lpcount=lpcount+1
        cnt_var+=1
        print 'Training -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    medians.append([numpy.median(x) for x in itertools.izip(*posmedlist)])
    medians.append([numpy.median(x) for x in itertools.izip(*negmedlist)])

    f = open('train_result\proximity_median_train_result_'+str(isphrase),'w')
    json.dump(medians,f)
    f.close()
Example #11
 def evaluate_features(self,feature_extractor, N):
     self.negative = movie_reviews.fileids('neg') #list of all names of the documents under neg folder
     self.positive = movie_reviews.fileids('pos') #list of all names of the documents under pos folder
     self.maintrain, self.maintest = self.stratifiedSplit(self.negative, self.positive, N)
     lst = []
     trainvocabulary = []
     for doc,lbl in self.maintrain:
         x = (feature_extractor(movie_reviews.words(fileids=[doc])),lbl)
         lst.append(x)
         trainvocabulary = trainvocabulary + x[0].keys()
     trainvocabulary = set(trainvocabulary)
     if q2_1.W == 0:
         q2_1.W = len(trainvocabulary)
     print "no. of features in train:", self.W
     nb = classifier.train(lst)
     self.testClassify = self.classifyTest(self.maintest, nb, feature_extractor)
     print "accuracy = ", accuracy(self.maintest, self.testClassify)
     print "Negative:"
     print "    precision = ", self.calcPrec('neg', self.maintest, self.testClassify)
     print "    recall = ", self.calcRecall('neg', self.maintest, self.testClassify)
     print "    f measure = ", self.calcFMeasur('neg', self.maintest, self.testClassify)
     print "Positive:"
     print "    precision = ", self.calcPrec('pos', self.maintest, self.testClassify)
     print "    recall = ", self.calcRecall('pos', self.maintest, self.testClassify)
     print "    f measure = ", self.calcFMeasur('pos', self.maintest, self.testClassify)
     nb.show_most_informative_features()
     return nb
Example #12
def main():
	negids = movie_reviews.fileids('neg')
	posids = movie_reviews.fileids('pos')

	to_review1 = "A man with a magnanimous spirit helps a mute girl from Pakistan return home."
	to_review2 = "Forced out of his own company by former Darren Cross, Dr. Hank Pym (Michael Douglas) recruits the talents of Scott Lang (Paul Rudd), a master thief just released from prison. Lang becomes Ant-Man, trained by Pym and armed with a suit that allows him to shrink in size, possess superhuman strength and control an army of ants. The miniature hero must use his new skills to prevent Cross, also known as Yellowjacket, from perfecting the same technology and using it as a weapon for evil."
	to_review3 = '''Parents need to know that kids may clamor to see this fast-paced, action-packed comic book-based adventure. But it's definitely more age-appropriate for teens than younger children. Although much of the violence is clearly meant to be based in the realm of sci-fi and fantasy -- and/or is shown at a distance -- there's plenty of it, from massive explosions to children held at gunpoint to super-powered fistfights. Some of the violence is war themed, and some characters get hurt and/or die. While much is made of lead character Tony Stark's devil-may-care lifestyle of fun and frolic, viewers also see him turn away from the more irresponsible aspects of playboyhood. Language is minimal, and sexual content is more suggested than shown overall -- though there are a few eyebrow-raising moments.'''
	reviews = []
	reviews.append(to_review1)
	reviews.append(to_review2)
	reviews.append(to_review3)

	for to_review in reviews:
		to_review_words = to_review.split(" ")
		print "Reviewing",to_review,"\n\n\n"


		print ''' Normal classification ''',"\n\n"
		negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
		posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
		calculateScore(classification(negfeats, posfeats, 1, 1), to_review_words)
		calculateScore(classification(negfeats, posfeats, 1, 0.95), to_review_words)
		calculateScore(classification(negfeats, posfeats, 0.95, 1), to_review_words)
		calculateScore(classification(negfeats, posfeats, 0.9, 1), to_review_words)
		calculateScore(classification(negfeats, posfeats, 1, 0.9), to_review_words)

		print ''' Without Punctuations ''',"\n\n"
		negfeats_punct = [(word_feats_punctuations(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
		posfeats_punct = [(word_feats_punctuations(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 1), to_review_words)
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 0.95), to_review_words)
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 0.95, 1), to_review_words)
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 0.9, 1), to_review_words)
		calculateScore_punctuations(classification(negfeats_punct, posfeats_punct, 1, 0.9), to_review_words)



		print ''' Without Stop Words ''',"\n\n"
		negfeats_stopwords = [(word_feats_stopwords(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
		posfeats_stopwords = [(word_feats_stopwords(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
		wordstoreview = []
		for each in to_review_words:
			if each not in stopwords.words('english'):
				wordstoreview.append(each)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 1), wordstoreview)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 0.95), wordstoreview)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 0.95, 1), wordstoreview)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 0.9, 1), wordstoreview)
		calculateScore_stopwords(classification(negfeats_stopwords, posfeats_stopwords, 1, 0.9), wordstoreview)


		print ''' With Lemmatizer ''',"\n\n"
		negfeats_lemma = [(word_feats_lemmatize(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
		posfeats_lemma = [(word_feats_lemmatize(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 1), to_review_words)
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 0.95), to_review_words)
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 0.95, 1), to_review_words)
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 0.9, 1), to_review_words)
		calculateScore_lemmatizer(classification(negfeats_lemma, posfeats_lemma, 1, 0.9), to_review_words)
Example #13
 def __init__(self, train1=True, train2=True, train3=True, train4=True):
     self.trainfeats = []        
     
     if train1:
         negids = movie_reviews.fileids('neg')
         posids = movie_reviews.fileids('pos')
          
         neg_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
         pos_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
         
         self.trainfeats = neg_movies + pos_movies
     
     if train2:
         f = open("out.txt", "r")
         
         negfeats = []
         posfeats = []
         for line in f:
             status = line[0]
             texto = line[2:]
 
             if status == '0':
                 negfeats.append((self.word_feats(texto.split(" ")), 'neg'))
             elif status == '1':
                 posfeats.append((self.word_feats(texto.split(" ")), 'pos'))               
     
         self.trainfeats += negfeats + posfeats
     
     if train3:    
         f = open("E:\\Workspace\\WS_TG\\analisador1\\AFINN\\AFINN-111.txt", 'r')
         for l in f:
             data = l.strip().split('\t')
             self.trainfeats.append( (self.word_feats(data[0]), 'neg' if int(data[1]) < 0 else 'pos'))
             
     if train4:
         f = open("E:\\Workspace\\WS_TG\\api\\trainning set.txt", 'r')
         pos = []
         neutral = []
         neg = []
         for line in f:
             if line.startswith("pos"):
                 pos.append(line)
             elif line.startswith("neutral"):
                 neutral.append(line)
             elif line.startswith("neg"):
                 neg.append(line)
                 
         print len(pos), len(neutral), len(neg)
         
         total = pos + neutral[:200] + neg
         
         for line in total:
             data = line.split(' .:. ')
             self.trainfeats.append( (self.word_feats(data[1].split()), data[0]) )
                    
     self.classifier = NaiveBayesClassifier.train(self.trainfeats)
     
     print self.classifier.show_most_informative_features(20)
Example #14
def setup_demo(lower):
    print 'running movie reviews demo. data dir: ', nltk_movie_reviews_data_root
    negative_reviews = map (lambda x: nltk_movie_reviews_data_root + x, movie_reviews.fileids('neg'))
    positive_reviews = map (lambda x: nltk_movie_reviews_data_root + x, movie_reviews.fileids('pos'))
    pos = create_corpus_from_file_list(positive_reviews, "positive", None, None, lower)
    neg = create_corpus_from_file_list(negative_reviews, "negative", None, None, lower)
    pos_bigrams = create_corpus_from_file_list(positive_reviews, "positive", None, None, lower, wordlist_to_bigrams_dict)
    neg_bigrams = create_corpus_from_file_list(negative_reviews, "negative", None, None, lower, wordlist_to_bigrams_dict)
    return (pos, neg, pos_bigrams, neg_bigrams)
Example #15
 def __init__(self, load = False, loadFile = ""):
     if(load):
         self.loadClassifier(loadFile)
     else:
         negids = movie_reviews.fileids('neg')
         posids = movie_reviews.fileids('pos')
         negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
         posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
         trainfeats = negfeats + posfeats
         self.classifier = NaiveBayesClassifier.train(trainfeats)
Example #16
def bins_svm_approach(llimit,ulimit,isphrase,pathname):

    posbinlist=[]
    negbinlist=[]
    trainingdata=[]
    trainingclass=[]
    bin_train_set=[]
    totalcount=ulimit-llimit

    lpcount=0
    cnt_var=0
    print '\nNo of +ve reviews scanned for training : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        testbin=proximity_tagger.bin_list(movie_reviews.abspath(fid),isphrase,cnt_var,0,pathname)
        posbinlist.append(testbin)
        lpcount+=1
        cnt_var+=1
        print 'Scanning +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'
        

    lpcount=0
    cnt_var=0
    print '\nNo of -ve reviews scanned for training : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        testbin=proximity_tagger.bin_list(movie_reviews.abspath(fid),isphrase,cnt_var,1,pathname)
        negbinlist.append(testbin)
        lpcount+=1
        cnt_var+=1
        print 'Scanning -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'


    lpcount=0
    totalcount=len(posbinlist)
    print '\nNo of +ve reviews trained : '
    trainingdata.extend(posbinlist)
    for i in range(totalcount):
        trainingclass.append(1)
        lpcount+=1
        print 'Training +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    lpcount=0
    totalcount=len(negbinlist)
    print '\nNo of -ve reviews trained : '
    trainingdata.extend(negbinlist)
    for i in range(totalcount):
        trainingclass.append(0)
        lpcount+=1
        print 'Training -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    bin_train_set.append(trainingdata)
    bin_train_set.append(trainingclass)

    f = open('train_result\proximity_bin_train_result_'+str(isphrase),'w')
    json.dump(bin_train_set,f)
    f.close()
Example #17
def sort_files():
    """
    Sorted the sample for cross reading the sample
    :return:
        files_list
    """
    files_list = list()
    neg_file_list = movie_reviews.fileids('neg')
    pos_file_list = movie_reviews.fileids('pos')
    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
    return files_list
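A quick usage sketch: after sorting, the file ids alternate between the neg and pos folders, which makes it easy to read a class-balanced stream.

files_list = sort_files()
print(files_list[:2])  # e.g. ['neg/cv000_29416.txt', 'pos/cv000_29590.txt']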
Example #18
    def train_classifiers(self):
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        negfeats = [(word_feats(
            movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(word_feats(
            movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        trainfeats = negfeats + posfeats

        # train naive bayes
        self.classifier = NaiveBayesClassifier.train(trainfeats)
Example #19
def train(feature):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeatures = [(feature(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeatures = [(feature(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    trainfeatures = negfeatures + posfeatures
    classifier = NaiveBayesClassifier.train(trainfeatures)

    return classifier
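A usage sketch, assuming the bag-of-words word_feats helper sketched after Example #1:

# Hypothetical usage: train with the bag-of-words extractor, then classify.
classifier = train(word_feats)
print(classifier.classify(word_feats('a thoroughly enjoyable film'.split())))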
Example #20
File: util.py  Project: DrDub/nltk
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances/2)

    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs+train_neg_docs
    testing_docs = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
                        Instances=n_instances)
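The docstring says trainer is a classifier's train method, so a typical invocation looks like:

# Example invocation with NLTK's Naive Bayes trainer.
from nltk.classify import NaiveBayesClassifier
demo_movie_reviews(NaiveBayesClassifier.train, n_instances=200)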
Example #21
  def train(self, feats):
    print "Starting to train the data"
    start = datetime.datetime.now()

    print "setting the ids", datetime.datetime.now()
    self.negids = movie_reviews.fileids('neg')
    self.posids = movie_reviews.fileids('pos')
    #random.shuffle(self.negids)
    #random.shuffle(self.posids)
    ##self.reviews = ([(movie_reviews.words(fileids=[f]), 'neg') for f in self.negids] +
        ##[(movie_reviews.words(fileids=[f]), 'pos') for f in self.posids])
    ##random.shuffle(self.reviews)

    ##self.train_set = apply_features(feats, self.reviews[len(self.reviews)*1/4:])
    ##self.test_set = apply_features(feats, self.reviews[:len(self.reviews)*1/4])

    print "setting the feats", datetime.datetime.now()
    self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg') for f in self.negids]
    self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos') for f in self.posids]

    self.negcutoff = len(self.negfeats)*3/4
    self.poscutoff = len(self.posfeats)*3/4

    print "setting the train/test", datetime.datetime.now()
    self.trainfeats = self.negfeats[:self.negcutoff] + self.posfeats[:self.poscutoff]
    self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[self.poscutoff:]

    print "training", datetime.datetime.now()
    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    ##self.classifier = NaiveBayesClassifier.train(self.train_set)
    self.refsets = defaultdict(set)
    self.testsets = defaultdict(set)

    print "accuracy stuff", datetime.datetime.now()
    for i, (feats, label) in enumerate(self.testfeats):
    ##for i, (feats, label) in enumerate(self.test_set):
      self.refsets[label].add(i)
      observed = self.classifier.classify(feats)
      self.testsets[observed].add(i)

    end = datetime.datetime.now()
    print "Training lasted for ", end-start


    print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.testfeats)
    ##print 'accuracy:', nltk.classify.util.accuracy(self.classifier, self.test_set)
    print 'pos precision:', nltk.metrics.precision(self.refsets['pos'], self.testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(self.refsets['pos'], self.testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(self.refsets['neg'], self.testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(self.refsets['neg'], self.testsets['neg'])
    self.classifier.show_most_informative_features()
    self.trained = True
Example #22
 def movieReviews(self, category, count):
   ret = []
   if category != 'positive' and category != 'negative':
     return ret
   fileids = []
   if category == 'positive':
     fileids = movie_reviews.fileids('pos')
   elif category == 'negative':
     fileids = movie_reviews.fileids('neg')
   sampleFileIds = sample(fileids, count)
   for sampleFileId in sampleFileIds:
     ret.append(movie_reviews.raw(sampleFileId))
   return ret
Example #23
def train():
    global classifier

    # Train our classifier
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(feature_extractor(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(feature_extractor(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]

    classifier = NaiveBayesClassifier.train(negfeats + posfeats)
Example #24
    def train_classifier(self):
        """This code is heavily inspired by:
        http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
        """
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

        trainfeats = negfeats + posfeats

        self.classifier = NaiveBayesClassifier.train(trainfeats)
Example #25
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = int(len(negfeats)*3/4)
    poscutoff = int(len(posfeats)*3/4)
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    classifier = NaiveBayesClassifier.train(trainfeats)
    return classifier
Example #26
def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    count = 1500000
    lines = []

    english_stops = set(stopwords.words('english'))
    print ctime(), "Reading files..."

    f = open('Sentiment Analysis Dataset.csv', "rU")

    line = f.readline()
    line = f.readline()

    negfeats = []
    posfeats = []

    for i in range(count):
        lines.append(line)
        line = f.readline()
    f.close()
    random.shuffle(lines)

    
    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4
 
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
 
    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
 
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
 
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()
Example #27
  def train_classifier(self):
    # Training
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(self.best_word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(self.best_word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    negcutoff = len(negfeats)*3/4
    poscutoff = len(posfeats)*3/4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    self.sentiment_classifier = NaiveBayesClassifier.train(trainfeats)
Example #28
def trainMovies():
    negids = movie_reviews.fileids('neg')
    print type(negids), negids
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    train = negfeats + posfeats

    classifier = NaiveBayesClassifier.train(train)

    f = open('movie_sentiment_classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
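A counterpart sketch (assumed, not in the original): loading the pickled classifier back for reuse.

def loadMovieClassifier():
    # Hypothetical helper: read the classifier saved by trainMovies().
    f = open('movie_sentiment_classifier.pickle', 'rb')
    classifier = pickle.load(f)
    f.close()
    return classifier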
Example #29
def nb_movierev():
	negids = movie_reviews.fileids('neg')
	posids = movie_reviews.fileids('pos')
	 
	negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
	posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
	 
	trainfeats = negfeats + posfeats
	testfeats = word_feats(my_tok('very good indeed',1))


	print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
	 
	classifier = NaiveBayesClassifier.train(trainfeats)
	print classifier.classify(testfeats)
Example #30
 def train_clf(method):
     negidxs = movie_reviews.fileids('neg')
     posidxs = movie_reviews.fileids('pos')
     if method=='stopword_filtered_words_features':
         negfeatures = [(stopword_filtered_words_features(movie_reviews.words(fileids=[file])), 'neg') for file in negidxs]
         posfeatures = [(stopword_filtered_words_features(movie_reviews.words(fileids=[file])), 'pos') for file in posidxs]
     elif method=='best_words_features':
         negfeatures = [(best_words_features(movie_reviews.words(fileids=[file])), 'neg') for file in negidxs]
         posfeatures = [(best_words_features(movie_reviews.words(fileids=[file])), 'pos') for file in posidxs]
     elif method=='best_bigrams_words_features':
         negfeatures = [(best_bigrams_words_features(movie_reviews.words(fileids=[file])), 'neg') for file in negidxs]
         posfeatures = [(best_bigrams_words_features(movie_reviews.words(fileids=[file])), 'pos') for file in posidxs]
         
     trainfeatures = negfeatures + posfeatures
     clf = NaiveBayesClassifier.train(trainfeatures)
     return clf
Example #31
    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append("Classified as: {}".format(model.predict([text])))
        output.append("")

    # Create two columns with most negative and most positive features.
    for (cp, fnp), (cn, fnn) in topn:
        output.append("{:0.4f}{: >15}    {:0.4f}{: >15}".format(
            cp, fnp, cn, fnn))

    return "\n".join(output)


if __name__ == "__main__":
    PATH = "model.pickle"

    if not os.path.exists(PATH):
        # Time to build the model
        from nltk.corpus import movie_reviews as reviews

        X = [reviews.raw(fileid) for fileid in reviews.fileids()]
        y = [reviews.categories(fileid)[0] for fileid in reviews.fileids()]

        model = build_and_evaluate(X, y, outpath=PATH)

    else:
        with open(PATH, 'rb') as f:
            model = pickle.load(f)

    print(show_most_informative_features(model))
Example #32
from nltk.corpus import movie_reviews
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.util import pad_sequence
import nltk
from nltk import FreqDist
import math
from random import shuffle
from nltk.metrics import precision, recall, f_measure
from nltk.corpus import stopwords
import collections
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier

document = [(movie_reviews.words(file),category) for file in movie_reviews.fileids() for category in movie_reviews.categories(file)]
# Randomize the document files so that the data isn't biased by ordering.
shuffle(document)
userInput = []
for line in open("classifyUserInput.txt"):
    userInput.append(word_tokenize(line))

wordFreq = FreqDist(movie_reviews.words())
frequent_words_non_filtered = list(wordFreq)
# Use this line instead of the prior to only select the top 5000 words. It's much faster than using all words
# frequent_words_non_filtered = list(wordFreq)[:5000]
frequent_words_list = [word for word in frequent_words_non_filtered if word not in stopwords.words('english')]

# Finds features from the list of words and places them in a dictionary.
def find_freq_words(word_list):
    words_dict = {}
    for x in frequent_words_list:
        words_dict[x] = x in word_list
    return words_dict
Example #33
    def pprocess_induction(self):
        # get combined vocabulary for pos and neg class
        all_reviews = (self.clean_movie_reviews(movie_reviews.words(fileid)) for fileid in movie_reviews.fileids())
        if self.kwargs.get('stopwords', False):
            print("***** Stopwords kept *****")
            c_vocab_vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2))
        else:
            print("***** Stopwords removed *****")
            c_vocab_vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words='english')
        c_tfidf = c_vocab_vect.fit_transform([doc for doc in all_reviews])
        self.c_vocab = c_vocab_vect.get_feature_names()

        if not self.kwargs['train']:  # only get the combined vocabulary, no need to do tfidf logic
            return
        else:
            print("***** Training *****")
            # segment the movie reviews -> pos, neg and call the clean function on each
            pos_documents = (self.clean_movie_reviews(movie_reviews.words(fileid)) for fileid in movie_reviews.fileids('pos'))
            neg_documents = (self.clean_movie_reviews(movie_reviews.words(fileid)) for fileid in movie_reviews.fileids('neg'))

            # fit and transform the the documents to a tfidf matrix using the combined vocabulary
            if self.kwargs.get('stopwords', False):
                print("***** Stopwords kept *****")
                tfidf_pos_vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2), smooth_idf=True, vocabulary=self.c_vocab)
                tfidf_neg_vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2), smooth_idf=True, vocabulary=self.c_vocab)
            else:
                print("***** Stopwords removed *****")
                tfidf_pos_vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words='english', smooth_idf=True, vocabulary=self.c_vocab)
                tfidf_neg_vect = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words='english', smooth_idf=True, vocabulary=self.c_vocab)
            pos_tfidf = tfidf_pos_vect.fit_transform([doc for doc in pos_documents])
            neg_tfidf = tfidf_neg_vect.fit_transform([doc for doc in neg_documents])
            # with open('vocab', 'wb') as f:
            #     pickle.dump(c_vocab, f)

            # combine both pos neg arrays into combined sparse matrix
            pos_array = self.create_array(pos_tfidf, 0)
            neg_array = self.create_array(neg_tfidf, 1)
            self.combined_array = np.concatenate((pos_array, neg_array), axis=0)
            # print(self.combined_array.shape)
            # print(combined_array[:1000,-1])  # should print all 0, pos reviews
            # print(combined_array[1000:, -1])  # should print all 1, neg reviews

            # print(f"pp_induct: {len(self.c_vocab)}")
            return self.combined_array
Example #34
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews


def extract_features(word_list):
    return dict([(word, True) for word in word_list])


if __name__ == '__main__':
    # Load positive and negative reviews
    positive_fileids = movie_reviews.fileids('pos')
    negative_fileids = movie_reviews.fileids('neg')

    features_positive = [(extract_features(movie_reviews.words(fileids=[f])),
                          'Positive') for f in positive_fileids]
    features_negative = [(extract_features(movie_reviews.words(fileids=[f])),
                          'Negative') for f in negative_fileids]

    # Split the data into train and test (80/20)
    threshold_factor = 0.8
    threshold_positive = int(threshold_factor * len(features_positive))
    threshold_negative = int(threshold_factor * len(features_negative))

    features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
    features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]
    print("\nNumber of training datapoints:", len(features_train))
    print("Number of test datapoints:", len(features_test))
Example #35
eng_sw = stopwords.words('english') + list(punctuation)
"""
$$  Here we are using the dataset from the nltk library which was 
    downloaded in the beggining, from the corpora inside the movie_reviws
    section. 
    
    Go and check it out ,,,where you've downloaded the nltk data.
    
    There will 1000 neg and pos revews, categorised into two folders namely
    neg and pos.

"""
all_data_1 = []
all_data_1 = [(list(movie_reviews.words(file_name)), folder_name)
              for folder_name in movie_reviews.categories()
              for file_name in movie_reviews.fileids(folder_name)]
"""
$$  For few people the above one line code may not strike as fast,
    so i have breakup the code in regular format, and will explain 
    about each line.

"""

all_data = []

for folder_name in movie_reviews.categories():
    for file_name in movie_reviews.fileids(folder_name):
        all_data.append((list(movie_reviews.words(file_name)), folder_name))
"""
$$  Here the fist for loop is to iterate through the sub folders in
    the movie_reviews data under nlt.corpora...
Example #36
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


# this takes the most of the algorithm time.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)


all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
Example #37
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    pos_docs = [
        (list(movie_reviews.words(pos_id)), 'pos')
        for pos_id in movie_reviews.fileids('pos')[:n_instances]
    ]
    neg_docs = [
        (list(movie_reviews.words(neg_id)), 'neg')
        for neg_id in movie_reviews.fileids('neg')[:n_instances]
    ]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs + train_neg_docs
    testing_docs = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            'Your classifier does not provide a show_most_informative_features() method.'
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset='Movie_reviews',
            Classifier=type(classifier).__name__,
            Tokenizer='WordPunctTokenizer',
            Feats=extr,
            Results=results,
            Instances=n_instances,
        )
Example #38
from nltk.corpus import movie_reviews
import keyword_extractor

for fileid in movie_reviews.fileids():
    words = movie_reviews.words(fileid)
    print words
Example #39
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 15 14:50:27 2018

@author: sriniv11
"""

from nltk.corpus import movie_reviews

# Total reviews
print(len(movie_reviews.fileids()))  # Output: 2000

# Review categories
print(movie_reviews.categories())  # Output: [u'neg', u'pos']

# Total positive reviews
print(len(movie_reviews.fileids('pos')))  # Output: 1000

# Total negative reviews
print(len(movie_reviews.fileids('neg')))  # Output: 1000

positive_review_file = movie_reviews.fileids('pos')[0]
print(positive_review_file)  # Output: pos/cv000_29590.txt
Example #40
 def save_labelled_vectors(self):
     self.load_vocab()
     fileids = movie_reviews.fileids()
     random.shuffle(fileids)
     self.save_labelled_vectors_type(DataType.TRAIN, fileids)
     self.save_labelled_vectors_type(DataType.TEST, fileids)
Example #41
##one liner of loop below
# documents = [(list(movie_reviews.words(fileid)), category)
# 			for category in movie_reviews.categories()
# 			for fileid in movie_reviews.fileids(category)]

# documents = []

# for category in movie_reviews.categories():
# 	for fileid in movie_reviews.fileids(category):
# 		documents.append((list(movie_reviews.words(fileid)), category))

# random.shuffle(documents)

# print(documents[1])

for i in mr.fileids():
    documents[i.split('/')[0]].append(i)

random.shuffle(documents['pos'])
random.shuffle(documents['neg'])

#print(documents['pos'][:10]) # first ten pos reviews.
#print
#print(documents['neg'][:10]) # first ten neg reviews.

documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]

random.shuffle(documents)
Example #42
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

#Function to extract features


def extract_features(words):
    return dict([(word, True) for word in words])


if __name__ == '__main__':
    #Loading reviews from corpus
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')

    #Extracting features from reviews
    features_pos = [(extract_features(movie_reviews.words(fileids=[f])),
                     'Positive') for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(fileids=[f])),
                     'Negative') for f in fileids_neg]

    #Defining test and train split
    #80% for training and 20% for testing
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    #Creating training and testing datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]
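
    # Plausible continuation (assumed, not in the original snippet): the
    # nltk_accuracy import above suggests training and scoring follow here.
    classifier = NaiveBayesClassifier.train(features_train)
    print('Accuracy of the classifier:', nltk_accuracy(classifier, features_test))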
Example #43
 def __init__(self):
     self.documents = [(list(movie_reviews.words(fileid)), category)
                       for category in movie_reviews.categories()
                       for fileid in movie_reviews.fileids(category)]
Example #44
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import collections
from sklearn.svm import LinearSVC, SVC
import random

def create_word_features(words):
    my_dict = dict([(word, True) for word in words])
    return my_dict
print('---------------------------------------------------------------------------')
print('WELCOME TO SENTIMENT ANALYSIS OF ONLINE IMDB MOVIE REVIEWS')
print('---------------------------------------------------------------------------')
print('                               DATASET                               ')

neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append((create_word_features(words), "negative"))

#print(pos_reviews[0])
print('length of negative reviews')
print(len(neg_reviews))
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append((create_word_features(words), "positive"))

#print(pos_reviews[0])
print('length of positive reviews')
print(len(pos_reviews))
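# Assumed continuation (not in the original fragment): the LinearSVC import
# above suggests the reviews are shuffled, split, and fed to an SVM through
# nltk's SklearnClassifier wrapper.
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify.util import accuracy

all_reviews = neg_reviews + pos_reviews
random.shuffle(all_reviews)
split = int(len(all_reviews) * 0.75)
train_set, test_set = all_reviews[:split], all_reviews[split:]

classifier = SklearnClassifier(LinearSVC())
classifier.train(train_set)
print('Accuracy:', accuracy(classifier, test_set))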
Example #45
from nltk.corpus import movie_reviews

# Total reviews
print(len(movie_reviews.fileids()))

# Review categories
print(movie_reviews.categories())

# Total positive reviews
print(len(movie_reviews.fileids('pos')))

# Total negative reviews
print(len(movie_reviews.fileids('neg')))

positive_review_file = movie_reviews.fileids('pos')[0]
print(positive_review_file)

documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):

        documents.append((movie_reviews.words(fileid), category))

print(len(documents))

# print first tuple
print(documents[0])
from random import shuffle
shuffle(documents)
Example #46
def word_feats(words):
    return {word: True for word in words}


def get_combined_features():
    with open(os.path.join('training_data', 'combined.txt')) as f:
        sentences = []
        for line in f:
            sentiment, sentence = line.split('\t')
            tokens = word_tokenize(sentence)
            sentences.append((word_feats(tokens), sentiment[:3]))
    return sentences


print("Finding ids for positive and negative reviews")
negids = reviews.fileids('neg')
posids = reviews.fileids('pos')

print("Creating feature sets")
negfeats = [(word_feats(reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(reviews.words(fileids=[f])), 'pos') for f in posids]
mixfeats = get_combined_features()

print("Calculating cutoffs")
negcutoff = int(len(negfeats) * 9 / 10)
poscutoff = int(len(posfeats) * 9 / 10)
mixcutoff = int(len(mixfeats) * 9 / 10)

print("Creating training set")
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + mixfeats[:mixcutoff]
print("Creating test set")
Example #47
import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import string

labeled_docs = [(list(movie_reviews.words(fid)), cat)
                for cat in movie_reviews.categories()
                for fid in movie_reviews.fileids(cat)]
random.seed(42)
random.shuffle(labeled_docs)

review_words = movie_reviews.words()
print "# Review Words", len(review_words)

sw = set(stopwords.words('english'))
punctuation = set(string.punctuation)


def isStopWord(word):
    return word in sw or word in punctuation


filtered = [w.lower() for w in review_words if not isStopWord(w.lower())]
print "# After filter", len(filtered)

words = FreqDist(filtered)
N = int(.05 * len(words.keys()))
word_features = words.keys()[:N]
Example #48
from nltk.corpus import movie_reviews

# Total reviews
print(len(movie_reviews.fileids()))  # Output: 2000

# Review categories
print(movie_reviews.categories())  # Output: [u'neg', u'pos']

# Total positive reviews
print(len(movie_reviews.fileids('pos')))  # Output: 1000

# Total negative reviews
print(len(movie_reviews.fileids('neg')))  # Output: 1000

positive_review_file = movie_reviews.fileids('pos')[0]
print(positive_review_file)  #Output: pos/cv000_29590.txt

documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        #documents.append((list(movie_reviews.words(fileid)), category))
        documents.append((movie_reviews.words(fileid), category))

print(len(documents))  # Output: 2000

# x = [str(item) for item in documents[0][0]]
# print (x)

# print first tuple
print(documents[0])
Example #49
model = lda.LDA(n_topics=10, n_iter=500)
model.fit(sentences_train) # Fit the model 
n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vect.get_feature_names())[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ', '.join(topic_words)))


'''
EXAMPLE: Automatically summarize a document
'''

# corpus of 2000 movie reviews
from nltk.corpus import movie_reviews
reviews = [movie_reviews.raw(filename) for filename in movie_reviews.fileids()]

# create document-term matrix
tfidf = TfidfVectorizer(stop_words='english')
dtm = tfidf.fit_transform(reviews)
features = tfidf.get_feature_names()

import numpy as np

# find the most and least "interesting" sentences in a randomly selected review
def summarize():
    
    # choose a random movie review    
    review_id = np.random.randint(0, len(reviews))
    review_text = reviews[review_id]
Example #50
classifier = nltk.NaiveBayesClassifier.train(train_set)
print((nltk.classify.accuracy(classifier, devtest_set)))
#%%
errors = []  # collection of prediction errors
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

for (tag, guess, name) in sorted(errors): # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
#%%
from nltk.corpus import movie_reviews
import random
import nltk
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words.keys())[:2000]  # erratum fix
print(word_features)
#%%
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
#%%
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
Example #51
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# Function to extract features from the input list


def extract_features(words):
    return dict([(word, True) for word in words])


if __name__ == "__main__":
    # Load the reviews
    fileids_pos = movie_reviews.fileids("pos")
    fileids_neg = movie_reviews.fileids("neg")

    # Extract the features from reviews
    features_pos = [(extract_features(movie_reviews.words(fileids=[f])),
                     "Positive") for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(fileids=[f])),
                     "Negative") for f in fileids_neg]

    # Define the train and test split
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    # Create training and testing datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]
Example #52
import nltk
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk import word_tokenize

posts = nltk.corpus.nps_chat.xml_posts()
featuresets = [nltk.pos_tag(word_tokenize(post.text)) for post in posts]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(featuresets, backoff=t0)
t2 = nltk.BigramTagger(featuresets, backoff=t1)

##text = word_tokenize("I am good");
##print(t2.tag(text));
##print(text);

from nltk.corpus import movie_reviews as movies
pos_docs = movies.fileids('pos')
neg_docs = movies.fileids('neg')
classifier_training = []

for doc in pos_docs:
    sents = movies.sents(doc)
    for sent in sents:
        tagged = t2.tag(sent)
        words = [w for w, k in tagged]
        tags = [k for w, k in tagged]
        feature = {}
        for i in range(len(words) - 1):
            feature[words[i] + ' ' +
                    words[i + 1]] = tags[i] + ' ' + tags[i + 1]

        temp = (feature, 'pos')
        classifier_training.append(temp)
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

def collect_features(word_list):
    # bag-of-words features: map every word to True
    return dict([(word, True) for word in word_list])


if __name__ == '__main__':
    plus_filenum = movie_reviews.fileids('pos')
    minus_filenum = movie_reviews.fileids('neg')

    feature_pluspts = [(collect_features(movie_reviews.words(fileids=[f])),
                        'Positive') for f in plus_filenum]
    feature_minuspts = [(collect_features(movie_reviews.words(fileids=[f])),
                         'Negative') for f in minus_filenum]

    threshold_fact = 0.8
    threshold_pluspts = int(threshold_fact * len(feature_pluspts))
    threshold_minuspts = int(threshold_fact * len(feature_minuspts))

    feature_training = feature_pluspts[:threshold_pluspts] + feature_minuspts[:threshold_minuspts]
    feature_testing = feature_pluspts[threshold_pluspts:] + feature_minuspts[threshold_minuspts:]
    print "\nNumber of training datapoints:", len(feature_training)
    print "Number of test datapoints:", len(feature_testing)

    # Train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(feature_training)
    print "\nAccuracy of the classifier:", nltk.classify.util.accuracy(classifier, feature_testing)
Example #54
0
# imports assumed by this snippet but truncated from the listing;
# `label_docs` is a helper defined elsewhere in the original script
import random
import numpy
from gensim.models.doc2vec import Doc2Vec
from sklearn.neural_network import MLPClassifier
from nltk.corpus import movie_reviews as mr


def main():
    DOC_SIZE = 500
    TRAIN_SIZE = int(DOC_SIZE * 0.90)

    pos_files = mr.fileids(categories='pos')[:DOC_SIZE]
    neg_files = mr.fileids(categories='neg')[:DOC_SIZE]


    sources = {
        'train_pos': pos_files[:TRAIN_SIZE],
        'train_neg': neg_files[:TRAIN_SIZE],
        'test_pos': pos_files[TRAIN_SIZE:],
        'test_neg': neg_files[TRAIN_SIZE:]
    }

    # Tag documents in format: 
    # TaggedDocument(words=['word_1', 'word_2', 'word_n'], tags=['pos'])
    corpus = label_docs(sources)

    doc2vecModel = Doc2Vec(documents=corpus, 
                           min_count=1, 
                           window=10, 
                           vector_size=100, 
                           workers=7, 
                           sample=1e-4, 
                           negative=5)

    # reshuffle and retrain; each train() call already runs
    # doc2vecModel.epochs passes, so this loop does 10x that many
    for epoch in range(10):
        random.shuffle(corpus)
        doc2vecModel.train(corpus, total_examples=doc2vecModel.corpus_count, epochs=doc2vecModel.epochs)
        

    # set training dataset
    x_train = numpy.zeros((TRAIN_SIZE * 2, 100))
    y_train = numpy.zeros(TRAIN_SIZE * 2)

    for i in range(TRAIN_SIZE):
        # document vectors live in the dv mapping (docvecs in gensim < 4);
        # indexing the model object itself looks up *word* vectors instead
        x_train[i] = doc2vecModel.dv['train_pos_' + str(i)]
        x_train[TRAIN_SIZE + i] = doc2vecModel.dv['train_neg_' + str(i)]
        y_train[i] = 1
        y_train[TRAIN_SIZE + i] = 0

    
    # set testing dataset
    TEST_SIZE = DOC_SIZE - TRAIN_SIZE
    x_test = numpy.zeros((TEST_SIZE * 2, 100))
    y_test = numpy.zeros(TEST_SIZE * 2,)

    for i in range(TEST_SIZE):
        x_test[i] = doc2vecModel.dv['test_pos_' + str(i)]
        x_test[TEST_SIZE + i] = doc2vecModel.dv['test_neg_' + str(i)]
        y_test[i] = 1
        y_test[TEST_SIZE + i] = 0

    
    # classification
    classifier = MLPClassifier(hidden_layer_sizes=(5,10))
    classifier.fit(x_train, y_train)

    print()
    print('Corpus Size: {}'.format(len(pos_files + neg_files)))
    print('Training Size:\n\tpositive: {}\tnegative: {}'.format(len(sources['train_pos']), len(sources['train_neg'])))
    print('Testing Size:\n\tpositive: {}\tnegative: {}'.format(len(sources['test_pos']), len(sources['test_neg'])))
    print()


    # mean accuracy of the model using the test dataset
    print("Accuracy Score:", classifier.score(x_test, y_test))

    # sample predictions
    print('Predicting movie reviews using Neural Network classifier:')
    print()

    # print(len(classifier.predict(x_test)))

    # random.shuffle(x_test)
    # test_datasets = x_test
    # test_labels = y_test
    pred_list = classifier.predict(x_test)
    cat = {0: 'NEG', 1: 'POS'}

    # print(len(x_test))

    for i in range(10):
        r = random.randint(0, len(x_test) - 1)
        print('Test doc: {}\t\tactual class: {}\t\tprediction: {}\t\taccurate?: {}'
              .format(r, cat[int(y_test[r])], cat[int(pred_list[r])], y_test[r] == pred_list[r]))
Example #55
0
from scipy.sparse import find
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer  # missing from the original listing
import numpy as np
import nltk

# Download the nltk modules you need. You only need to run these lines once.
nltk.download('movie_reviews')
nltk.download('punkt')

from nltk.corpus import movie_reviews

# Description of data set
print(movie_reviews.readme())

# Prepare document set for stemming
movie_docs = [' '.join(movie_reviews.words(fileid)) for fileid in movie_reviews.fileids()]

# Define function for tokenizing documents
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        item_lower = item.lower()
        stems.append(nltk.PorterStemmer().stem(item_lower))
    return stems

# Build TF-IDF matrix
tfidf = TfidfVectorizer(tokenizer=tokenize)
movie_tfidf = tfidf.fit_transform(movie_docs)

# Examine non-zero entries in TF-IDF matrix
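# (a sketch; the original listing stops at the comment above) pull out the
# non-zero entries of the first document's row with the `find` imported
# earlier; get_feature_names_out needs scikit-learn >= 1.0, older releases
# use get_feature_names instead
rows, cols, vals = find(movie_tfidf[0])
terms = np.array(tfidf.get_feature_names_out())
for term, weight in sorted(zip(terms[cols], vals), key=lambda t: -t[1])[:10]:
    print('%-20s %.4f' % (term, weight))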
Example #56
0
Python 3.8.2 (tags/v3.8.2:7b3ab59, Feb 25 2020, 23:03:10) [MSC v.1916 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license()" for more information.
>>> from nltk.corpus import movie_reviews
>>> print (len(movie_reviews.fileids())) #total reviews
2000
>>> print (movie_reviews.categories()) #review categories
['neg', 'pos']
>>> print (len(movie_reviews.fileids('pos'))) #pos reviews
1000
>>> print (len(movie_reviews.fileids('neg'))) #neg reviews
1000
>>> positive_review_file = movie_reviews.fileids('pos')[0]
>>> print (positive_review_file)
pos/cv000_29590.txt
>>> documents = [] #creating a movie review document
>>> for category in movie_reviews.categories():
	for fileid in movie_reviews.fileids(category):
		documents.append((movie_reviews.words(fileid), category))

		
>>> print (len(documents))
2000
>>> x = [str(item) for item in documents[0][0]]
>>> print(x)
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'f**k', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'bad', 'ways', 'of', 'making', 'all', 'types', 'of', 'films', ',', 'and', 'these', 'folks', 'just', 'didn', "'", 't', 'snag', 'this', 'one', 'correctly', '.', 'they', 'seem', 'to', 'have', 'taken', 'this', 'pretty', 'neat', 'concept', ',', 'but', 'executed', 'it', 'terribly', '.', 'so', 'what', 'are', 'the', 'problems', 'with', 'the', 'movie', '?', 'well', ',', 'its', 'main', 'problem', 'is', 'that', 'it', "'", 's', 'simply', 'too', 'jumbled', '.', 'it', 'starts', 'off', '"', 'normal', '"', 'but', 'then', 'downshifts', 'into', 'this', '"', 'fantasy', '"', 'world', 'in', 'which', 'you', ',', 'as', 'an', 'audience', 'member', ',', 'have', 'no', 'idea', 'what', "'", 's', 'going', 'on', '.', 'there', 'are', 'dreams', ',', 'there', 'are', 'characters', 'coming', 'back', 'from', 'the', 'dead', ',', 'there', 'are', 'others', 'who', 'look', 'like', 'the', 'dead', ',', 'there', 'are', 'strange', 'apparitions', ',', 'there', 'are', 'disappearances', ',', 'there', 'are', 'a', 'looooot', 'of', 'chase', 'scenes', ',', 'there', 'are', 'tons', 'of', 'weird', 'things', 'that', 'happen', ',', 'and', 'most', 'of', 'it', 'is', 'simply', 'not', 'explained', '.', 'now', 'i', 'personally', 'don', "'", 't', 'mind', 'trying', 'to', 'unravel', 'a', 'film', 'every', 'now', 'and', 'then', ',', 'but', 'when', 'all', 'it', 'does', 'is', 'give', 'me', 'the', 'same', 'clue', 'over', 'and', 'over', 'again', ',', 'i', 'get', 'kind', 'of', 'fed', 'up', 'after', 'a', 'while', ',', 'which', 'is', 'this', 'film', "'", 's', 'biggest', 'problem', '.', 'it', "'", 's', 'obviously', 'got', 'this', 'big', 'secret', 'to', 'hide', ',', 'but', 'it', 'seems', 'to', 'want', 'to', 'hide', 'it', 'completely', 'until', 'its', 'final', 'five', 'minutes', '.', 'and', 'do', 'they', 'make', 'things', 'entertaining', ',', 'thrilling', 'or', 'even', 'engaging', ',', 'in', 'the', 'meantime', '?', 'not', 'really', '.', 'the', 'sad', 'part', 'is', 'that', 'the', 'arrow', 'and', 'i', 'both', 'dig', 'on', 'flicks', 'like', 'this', ',', 'so', 'we', 'actually', 'figured', 'most', 'of', 'it', 'out', 'by', 'the', 'half', '-', 'way', 'point', ',', 'so', 'all', 'of', 'the', 'strangeness', 'after', 'that', 'did', 'start', 'to', 'make', 'a', 'little', 'bit', 'of', 'sense', ',', 'but', 'it', 'still', 'didn', "'", 't', 'the', 'make', 'the', 'film', 'all', 'that', 'more', 'entertaining', '.', 'i', 'guess', 'the', 'bottom', 'line', 'with', 'movies', 'like', 'this', 'is', 'that', 'you', 'should', 'always', 'make', 'sure', 'that', 'the', 'audience', 'is', '"', 'into', 'it', '"', 'even', 
'before', 'they', 'are', 'given', 'the', 'secret', 'password', 'to', 'enter', 'your', 'world', 'of', 'understanding', '.', 'i', 'mean', ',', 'showing', 'melissa', 'sagemiller', 'running', 'away', 'from', 'visions', 'for', 'about', '20', 'minutes', 'throughout', 'the', 'movie', 'is', 'just', 'plain', 'lazy', '!', '!', 'okay', ',', 'we', 'get', 'it', '.', '.', '.', 'there', 'are', 'people', 'chasing', 'her', 'and', 'we', 'don', "'", 't', 'know', 'who', 'they', 'are', '.', 'do', 'we', 'really', 'need', 'to', 'see', 'it', 'over', 'and', 'over', 'again', '?', 'how', 'about', 'giving', 'us', 'different', 'scenes', 'offering', 'further', 'insight', 'into', 'all', 'of', 'the', 'strangeness', 'going', 'down', 'in', 'the', 'movie', '?', 'apparently', ',', 'the', 'studio', 'took', 'this', 'film', 'away', 'from', 'its', 'director', 'and', 'chopped', 'it', 'up', 'themselves', ',', 'and', 'it', 'shows', '.', 'there', 'might', "'", 've', 'been', 'a', 'pretty', 'decent', 'teen', 'mind', '-', 'f**k', 'movie', 'in', 'here', 'somewhere', ',', 'but', 'i', 'guess', '"', 'the', 'suits', '"', 'decided', 'that', 'turning', 'it', 'into', 'a', 'music', 'video', 'with', 'little', 'edge', ',', 'would', 'make', 'more', 'sense', '.', 'the', 'actors', 'are', 'pretty', 'good', 'for', 'the', 'most', 'part', ',', 'although', 'wes', 'bentley', 'just', 'seemed', 'to', 'be', 'playing', 'the', 'exact', 'same', 'character', 'that', 'he', 'did', 'in', 'american', 'beauty', ',', 'only', 'in', 'a', 'new', 'neighborhood', '.', 'but', 'my', 'biggest', 'kudos', 'go', 'out', 'to', 'sagemiller', ',', 'who', 'holds', 'her', 'own', 'throughout', 'the', 'entire', 'film', ',', 'and', 'actually', 'has', 'you', 'feeling', 'her', 'character', "'", 's', 'unraveling', '.', 'overall', ',', 'the', 'film', 'doesn', "'", 't', 'stick', 'because', 'it', 'doesn', "'", 't', 'entertain', ',', 'it', "'", 's', 'confusing', ',', 'it', 'rarely', 'excites', 'and', 'it', 'feels', 'pretty', 'redundant', 'for', 'most', 'of', 'its', 'runtime', ',', 'despite', 'a', 'pretty', 'cool', 'ending', 'and', 'explanation', 'to', 'all', 'of', 'the', 'craziness', 'that', 'came', 'before', 'it', '.', 'oh', ',', 'and', 'by', 'the', 'way', ',', 'this', 'is', 'not', 'a', 'horror', 'or', 'teen', 'slasher', 'flick', '.', '.', '.', 'it', "'", 's', 'just', 'packaged', 'to', 'look', 'that', 'way', 'because', 'someone', 'is', 'apparently', 'assuming', 'that', 'the', 'genre', 'is', 'still', 'hot', 'with', 'the', 'kids', '.', 'it', 'also', 'wrapped', 'production', 'two', 'years', 'ago', 'and', 'has', 'been', 'sitting', 'on', 'the', 'shelves', 'ever', 'since', '.', 'whatever', '.', '.', '.', 'skip', 'it', '!', 'where', "'", 's', 'joblo', 'coming', 'from', '?', 'a', 'nightmare', 'of', 'elm', 'street', '3', '(', '7', '/', '10', ')', '-', 'blair', 'witch', '2', '(', '7', '/', '10', ')', '-', 'the', 'crow', '(', '9', '/', '10', ')', '-', 'the', 'crow', ':', 'salvation', '(', '4', '/', '10', ')', '-', 'lost', 'highway', '(', '10', '/', '10', ')', '-', 'memento', '(', '10', '/', '10', ')', '-', 'the', 'others', '(', '9', '/', '10', ')', '-', 'stir', 'of', 'echoes', '(', '8', '/', '10', ')']
>>> print (documents[0])
(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')
>>> from random import shuffle
>>> shuffle(documents) #shuffle the document list
>>> 
>>> #Feature Extraction
Example #57
0
import random
from nltk.corpus import movie_reviews


def text_classification():
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    print(documents[1])
Example #58
0
devacc = nltk.classify.accuracy(buildclass, devclass)  #0.77

#Test accuracy
testacc = nltk.classify.accuracy(buildclass, testclass)  #0.794

#Question 4

#Create movie review documents
import random
import string
import nltk
from nltk.corpus import movie_reviews as rev
from nltk.corpus import stopwords

stop = stopwords.words('english')
documents = [([
    w for w in rev.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in rev.fileids()]
random.shuffle(documents)

#FreqDist of all words
Total_words = nltk.FreqDist(w.lower() for w in rev.words())
wf = [w for (w, _) in Total_words.most_common(2000)]  # 2000 most frequent; list(FreqDist) is not frequency-ordered


#Classification by feature extraction
def document_features(document):
    document_words = set(document)
    features = {}
    for word in wf:
        features['contains({})'.format(word)] = (word in document_words)
    return features
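
# a likely next step, in the style of the NLTK book (a sketch, not
# necessarily the original author's code): build the feature sets,
# train a classifier, and check accuracy
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))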
Example #59
0
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews  # used below but missing from the listing
from nltk import word_tokenize
import string

# ----------------------------------------------------------------------------------------------------------------------
punctuations = list(string.punctuation)
# Print the classification of the problem
print 'Maximum Entropy classifier accuracy after removing punctuation:'


def word_feats(words):
    return dict([(word, True) for word in words])


# Get all the reviews with negative dataset and positive dataset from the movie reviews.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# Mark the word in the dataset as positive and negative.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
            for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
            for f in posids]

# Set a cut-off separating training data from testing data
# (Python 2 integer division; under Python 3 use // so the slices get ints).
negcutoff = len(negfeats) * 9 / 10
poscutoff = len(posfeats) * 9 / 10

# Based on the cut off, fill the training data and testing data with its respective positive and negative dataset.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
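
# the listing stops before the classifier promised in the print above;
# a minimal sketch of that Maximum Entropy step (not the original code):
from nltk.classify import MaxentClassifier
import nltk.classify.util

classifier = MaxentClassifier.train(trainfeats, algorithm='iis', max_iter=10)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)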
import nltk
import random
from nltk.corpus import movie_reviews
docs = [(list(movie_reviews.words(fid)), cat) for cat in movie_reviews.categories() for fid in movie_reviews.fileids(cat)]
random.shuffle(docs)
all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words())
token_features = [w for (w, _) in all_tokens.most_common(2000)]  # 2000 most frequent; keys() is not frequency-ordered
print(token_features[:100])
 
def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in token_features:
        features['contains(%s)' % word] = (word in doc_words)
    return features  # the original returned inside the loop, after only one word

print(doc_features(movie_reviews.words('pos/cv957_8737.txt')))
feature_sets = [(doc_features(d), c) for (d,c) in docs]
train_sets, test_sets = feature_sets[100:], feature_sets[:100]
classifiers = nltk.NaiveBayesClassifier.train(train_sets)
print(nltk.classify.accuracy(classifiers, test_sets))
classifiers.show_most_informative_features(5)