def test_calibration(self):
    '''Tests whether all features, like default adjectives, are generated when the class is instantiated.'''
    train = []
    bound = int(len(movie_reviews.sents(categories="pos")) * 0.8)
    for i, sent in enumerate(movie_reviews.sents(categories="pos")[:bound]):
        train.append(("positive", " ".join(sent)))
    bound = int(len(movie_reviews.sents(categories="neg")) * 0.8)
    for sent in movie_reviews.sents(categories="neg")[:bound]:
        train.append(("negative", " ".join(sent)))
    random.shuffle(train)
    x = opinionminer.OpinionMiner()
    x.trainClassifier(train)
    print 'Tags are generated: ', len(x.selective_pos.conditions()) > 0
    print 'Positive Adverbs exist:', (len(x.positive_adverbs) > 2)
    print 'Positive Adjectives exist:', (len(x.positive_adjectives) > 2)
def _setDefaultPositiveNegativeWords(self):
    buff1 = self._loadData('positive_adjectives.bin')
    buff2 = self._loadData('positive_adverbs.bin')
    buff3 = self._loadData('negative_adverbs.bin')
    buff4 = self._loadData('negative_adjectives.bin')
    if buff1 and buff2 and buff3 and buff4:
        self.positive_adjectives = buff1
        self.positive_adverbs = buff2
        self.negative_adverbs = buff3
        self.negative_adjectives = buff4
        return

    # First compile the list of positive adjectives & adverbs
    # by tagging all positive sentences with the POS tagger.
    tagger = speechtagger.SpeechTagger()
    processed_sents = []
    self.positive_adjectives = set()
    self.positive_adverbs = set()
    self.negative_adverbs = set()
    self.negative_adjectives = set()
    train_bound_pos = int(len(movie_reviews.sents(categories="pos")) * 0.8)
    train_bound_neg = int(len(movie_reviews.sents(categories="neg")) * 0.8)

    # *************** positive ****************** #
    for sentence in movie_reviews.sents(categories="pos")[:train_bound_pos]:
        concat_sent = (" ".join(sentence)).lower()
        processed_sents.append(concat_sent)
    tagged_sents = tagger.tag(processed_sents)
    # TODO: Save to file
    for sentence in tagged_sents:
        for (word, tag) in sentence:
            # Compare tag strings with ==, not identity ("is").
            if tag == 'ADJ' or word in self.selective_pos['ADJ']:
                self.positive_adjectives.add(word)
            elif tag == 'ADV' or word in self.selective_pos['ADV']:
                self.positive_adverbs.add(word)

    # ************** negative ***************** #
    processed_sents = []
    for sentence in movie_reviews.sents(categories="neg")[:train_bound_neg]:
        concat_sent = (" ".join(sentence)).lower()
        processed_sents.append(concat_sent)
    tagged_sents = tagger.tag(processed_sents)
    # TODO: Save to file
    for sentence in tagged_sents:
        for (word, tag) in sentence:
            if tag == 'ADJ' or word in self.selective_pos['ADJ']:
                self.negative_adjectives.add(word)
            elif tag == 'ADV' or word in self.selective_pos['ADV']:
                self.negative_adverbs.add(word)

    self._saveData('positive_adjectives.bin', self.positive_adjectives)
    self._saveData('positive_adverbs.bin', self.positive_adverbs)
    self._saveData('negative_adjectives.bin', self.negative_adjectives)
    self._saveData('negative_adverbs.bin', self.negative_adverbs)
def Default_Dataset():
    documents = movies.fileids()
    for doc in documents:
        sents = movies.sents(doc)
        doc_label = doc[:3]
        for sent in sents:
            default_sentence_set.append((doc_label, sent))
def New_Dataset():
    documents = movies.fileids()
    for doc in documents:
        sents = movies.sents(doc)
        doc_label = doc[:3]
        for sent in sents:
            new_sentence_set.append((doc_label, sent))
def opinion_features(fileid):
    """Starter feature engineering for movie reviews."""
    # many features are counts!
    #global rn
    #print("rev#",rn)
    #rn += 1
    positive_count = 0
    negative_count = 0
    for word in movie_reviews.words(fileid):
        if word in pos_set:
            positive_count += 1
        if word in neg_set:
            negative_count += 1

    for s in movie_reviews.sents(fileid):  # s == list of words in one sentence
        sentence = " ".join(s)
        tb = textblob.TextBlob(sentence)
        polarity = tb.polarity
        subjectivity = tb.subjectivity
    # Note: as written, polarity/subjectivity keep only the values of the
    # review's last sentence, since each loop iteration overwrites them.

    # here is the dictionary of features...
    features = {'positive': positive_count,
                'negative': negative_count,
                'polarity': polarity,
                'subjectivity': subjectivity}
    return features
def get_word2vec(
        train_fn="data/rap/input.txt",
        saved_model_fn="save/save/GoogleNews-vectors-negative300.bin"):
    try:
        print "loading word2vec model at {0}".format(saved_model_fn)
        model = Word2Vec.load_word2vec_format(saved_model_fn, binary=True)
        print "model loaded"
        return model
    except IOError:
        print "no word2vec model found at {0}".format(saved_model_fn)
        with open(train_fn) as f:
            data = f.read()
        clean = TextLoader.clean_str(data)
        lines = [line.split(" ") for line in clean.split('\n')]
        full_data = brown.sents() + movie_reviews.sents() + treebank.sents() + lines
        print "training word2vec model"
        model = Word2Vec(workers=8)
        model.build_vocab(full_data)
        for i in xrange(0, 5):
            print "epoch " + str(i + 1)
            # full_data = shuffle(full_data)
            pb = ProgressBar(maxval=len(full_data))
            chunk_size = len(full_data) / 100
            j = 0
            pb.start()
            while j + chunk_size < len(full_data):
                model.train(full_data[j:j + chunk_size])
                j += chunk_size
                pb.update(j)
        print "done training"
        model.save(saved_model_fn)
        return model
def evaluate_features(feature_extractor, N, only_acc=False):
    from nltk.corpus import movie_reviews
    from nltk.classify import NaiveBayesClassifier as naive
    from nltk.classify.util import accuracy
    from nltk.metrics import precision, recall, f_measure
    from sys import stdout

    negative = movie_reviews.fileids('neg')
    positive = movie_reviews.fileids('pos')
    negfeats = [(feature_extractor(movie_reviews.sents(fileids=[f])), 'neg')
                for f in negative]
    posfeats = [(feature_extractor(movie_reviews.sents(fileids=[f])), 'pos')
                for f in positive]
    negtrain, negtest = stratifiedSamples(negfeats, N)
    postrain, postest = stratifiedSamples(posfeats, N)

    trainfeats = negtrain + postrain
    testfeats = negtest + postest
    classifier = naive.train(trainfeats)
    if only_acc:
        return accuracy(classifier, testfeats)
    print 'accuracy: {}'.format(accuracy(classifier, testfeats))

    # Precision, Recall, F-measure
    from collections import defaultdict
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    print 'pos precision:', precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', recall(refsets['pos'], testsets['pos'])
    print 'pos F-measure:', f_measure(refsets['pos'], testsets['pos'])
    print 'neg precision:', precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', recall(refsets['neg'], testsets['neg'])
    print 'neg F-measure:', f_measure(refsets['neg'], testsets['neg'])
    stdout.flush()
    classifier.show_most_informative_features()
    return classifier
def Initialize_SentimentAnalyzer():
    pos_docs = movies.fileids('pos')
    neg_docs = movies.fileids('neg')
    classifier_training = []
    for doc in pos_docs:
        sents = movies.sents(doc)
        for sent in sents:
            tagged = t2.tag(sent)
            words = [w for w, k in tagged]
            tags = [k for w, k in tagged]
            feature = {}
            for i in range(len(words) - 1):
                feature[words[i] + ' ' + words[i + 1]] = tags[i] + ' ' + tags[i + 1]
            temp = (feature, 'pos')
            classifier_training.append(temp)
    for doc in neg_docs:
        sents = movies.sents(doc)
        for sent in sents:
            tagged = t2.tag(sent)
            words = [w for w, k in tagged]
            tags = [k for w, k in tagged]
            feature = {}
            for i in range(len(words) - 1):
                feature[words[i] + ' ' + words[i + 1]] = tags[i] + ' ' + tags[i + 1]
            temp = (feature, 'neg')
            classifier_training.append(temp)
    random.shuffle(classifier_training)
    train_set = classifier_training
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    return classifier
def _init_train(self):
    lemmas = [tup[0].split() for tup in self.db.loadProcessed("lemmatized")]
    model = FastText(min_count=5)
    model.build_vocab(brown.sents())
    model.train(
        brown.sents(),
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )
    model.build_vocab(treebank.sents(), update=True)
    model.train(
        treebank.sents(),
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )
    model.build_vocab(movie_reviews.sents(), update=True)
    model.train(
        movie_reviews.sents(),
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )
    model.build_vocab(lemmas, update=True)
    model.train(
        lemmas,
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )
    return model
def make_mech_turk_entry():
    fileid = movie_reviews.fileids(categories=category)[fileid_num]
    turk_entry_filename = "entry-%s" % fileid.replace("/", "-")
    with open(turk_entry_filename, "wb") as f:
        f.write(preamble)
        f.write("\n\n\n<h1>Review</h1>\n\n<p>")
        f.write(movie_reviews.raw(fileid).replace("\n", "<br/>\n"))
        f.write("</p>\n")
        f.write("\n\n\n<h1>Select Summary Sentence</h1>\n\n<p>")
        for i, sent in enumerate(movie_reviews.sents(fileid)):
            opentag = '<input type="radio" name="%s" value="sent%i">' % (fileid, i)
            taginner = string.join(sent, " ")
            closetag = "</input><br/>\n"
            f.write(opentag + taginner + closetag)
        f.write("</p>\n")
def Initialize_SentimentAnalyzer():
    documents = movies.fileids()
    classifier_training = []
    for doc in documents:
        sents = movies.sents(doc)
        doc_label = doc[:3]
        for sent in sents:
            tagged = t2.tag(sent)
            pairs = [(w, k) for w, k in tagged]
            feature = {}
            for i in range(len(pairs) - 1):
                feature[pairs[i][0] + ' ' + pairs[i + 1][0]] = pairs[i][1] + ' ' + pairs[i + 1][1]
            temp = (feature, doc_label)
            classifier_training.append(temp)
    random.shuffle(classifier_training)
    train_set = classifier_training
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    return classifier
import pandas as pd

#print(movie_reviews.fileids())

def func(list0):
    sents = []
    for blist in list0:
        fsent = ""
        for slist in blist:
            fsent += slist + " "
        sents.append(fsent)
    return sents

sents0 = movie_reviews.sents("neg/cv000_29416.txt")
sents1 = movie_reviews.sents("pos/cv041_21113.txt")
texts0 = func(sents0)
texts1 = func(sents1)
texts2 = texts0 + texts1

vec = CountVectorizer()
vec.fit(texts2)
print([w for w in sorted(vec.vocabulary_.keys())])
print(pd.DataFrame(vec.transform(texts2).toarray(),
                   columns=sorted(vec.vocabulary_.keys())))
def trainAllClassifiers():
    #Get all positive and negative sentences.
    #Note: The "encode/decode" statement is used to convert the unicode representation of the text to an
    #ASCII representation. The "apply_features()" method throws an error if this isn't done, most
    #likely because Python 3 uses unicode strings while Python 2 doesn't.
    print("Splitting positive and negative documents...")
    positive_docs = [([string.encode('ascii', 'ignore').decode('ascii') for string in sent], 'pos')
                     for sent in movie_reviews.sents(categories='pos')]
    negative_docs = [([string.encode('ascii', 'ignore').decode('ascii') for string in sent], 'neg')
                     for sent in movie_reviews.sents(categories='neg')]
    #obj_docs = [(sent.encode('ascii', 'ignore').decode('ascii'), 'obj') for sent in subjectivity.sents(categories='obj')]

    #Randomly split data sets into train and test sets.
    train_pos, test_pos = train_test_split(positive_docs, test_size=1000, train_size=4000)
    train_neg, test_neg = train_test_split(negative_docs, test_size=1000, train_size=4000)

    #Aggregate train and test data sets.
    train = train_pos + train_neg
    test = test_pos + test_neg

    #Create a sentiment analyzer to analyze the text documents. This analyzer
    #provides an abstraction for managing a classifier and a feature extractor.
    #It also provides convenience metrics on classifier performance.
    sentim_analyzer = SentimentAnalyzer()

    #Mark negations in the tokenized training text, and count all negative words.
    #all_words() returns all tokens from the document, which is used to create a set
    #of features with a feature extractor.
    print("Creating feature set...")
    all_words_with_neg_tags = sentim_analyzer.all_words([mark_negation(doc) for doc in train])

    #Create the unigram features, keeping only features that occur at least twice (min_freq=2).
    unigram_features = sentim_analyzer.unigram_word_feats(all_words_with_neg_tags, min_freq=2)

    #Save the unigram feature list to a file so it can be used later.
    #These features need to be applied to the email set.
    f = open("./bow_features.pkl", "w")
    pickle.dump(unigram_features, f)
    f.close()

    #Create a feature extractor based on the unigram word features created.
    #The unigram feature extractor is found in the sentiment utils package.
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_features)

    #Create feature-value representations of the data.
    train_set = sentim_analyzer.apply_features(train)
    test_set = sentim_analyzer.apply_features(test)

    #Collect some memory.
    positive_docs = None
    negative_docs = None
    gc.collect()

    #Note: training may take a long time.
    #Create a trainer and train the sentiment analyzer on the training set.
    print("Beginning the classifier training...")

    #Linear SVM
    startTime = time.time()
    print("Linear Support Vector Machine.")
    clf = SklearnClassifier(LinearSVC())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "lsvm")
    saveMetricsToFile("lsvm", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Naive Bayes
    startTime = time.time()
    print("Naive Bayes.")
    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "nb")
    saveMetricsToFile("nb", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Stochastic Gradient Descent.
    startTime = time.time()
    print("Stochastic Gradient Descent.")
    clf = SklearnClassifier(SGDClassifier())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "sgd")
    saveMetricsToFile("sgd", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #RBF SVM
    startTime = time.time()
    print("RBF Support Vector Machine.")
    clf = SklearnClassifier(svm.SVC(kernel='rbf'))
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "svm")
    saveMetricsToFile("svm", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Multinomial Naive Bayes.
    startTime = time.time()
    print("Multinomial Naive Bayes.")
    clf = SklearnClassifier(MultinomialNB())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "mnb")
    saveMetricsToFile("mnb", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Logistic Regression.
    startTime = time.time()
    print("Logistic Regression.")
    clf = SklearnClassifier(LogisticRegression())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "lr")
    saveMetricsToFile("lr", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Decision Tree.
    startTime = time.time()
    print("Decision Tree.")
    clf = SklearnClassifier(DecisionTreeClassifier())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "dt")
    saveMetricsToFile("dt", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #Random Forest.
    startTime = time.time()
    print("Random Forest.")
    clf = SklearnClassifier(RandomForestClassifier())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "rf")
    saveMetricsToFile("rf", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."

    #AdaBoost
    startTime = time.time()
    print("Ada Boost")
    clf = SklearnClassifier(AdaBoostClassifier())
    trainer = clf.train
    classifier = sentim_analyzer.train(trainer, train_set)
    endTime = time.time()
    timeDiff = endTime - startTime
    saveModel(classifier, "ab")
    saveMetricsToFile("ab", sentim_analyzer, test_set, timeDiff / 60.0)
    print "Total time to train: " + str(timeDiff / 60.0) + " minutes."
def classifyOn1000Examples(binary=False):
    print("Splitting positive and negative documents...")
    positive_docs = [([string.encode('ascii', 'ignore').decode('ascii') for string in sent], 'pos')
                     for sent in movie_reviews.sents(categories='pos')]
    negative_docs = [([string.encode('ascii', 'ignore').decode('ascii') for string in sent], 'neg')
                     for sent in movie_reviews.sents(categories='neg')]

    #Randomly split data sets into train and test sets.
    train_pos, test_pos = train_test_split(positive_docs, test_size=500, train_size=4000)
    train_neg, test_neg = train_test_split(negative_docs, test_size=500, train_size=4000)

    #Aggregate the test data sets.
    test = test_pos + test_neg

    #Create a sentiment analyzer to analyze the text documents. This analyzer
    #provides an abstraction for managing a classifier and a feature extractor.
    #It also provides convenience metrics on classifier performance.
    sentim_analyzer = SentimentAnalyzer()

    #Load the unigram features that were saved during training.
    print("Creating feature set...")
    f = open("./bow_features.pkl", "r")
    unigram_features = pickle.load(f)
    f.close()

    #Create a feature extractor based on the unigram word features created.
    #The unigram feature extractor is found in the sentiment utils package.
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_features)

    #Create feature-value representations of the data.
    test_set = sentim_analyzer.apply_features(test)

    #Make a dict to hold the true labels.
    testDict = {"test_labels": []}
    for sent in test_set:
        if binary == True:
            if sent[1] == "pos":
                testDict["test_labels"].append(1)
            else:
                testDict["test_labels"].append(-1)
        else:
            testDict["test_labels"].append(sent[1])

    print("Beginning classification...")
    classifierResultsDict = {key: [] for key in classifierNamesList}
    for classifierKey in classifierNamesList:
        print("Starting classifier: " + classifierKey)
        classifier = loadModel(classifierKey)
        for sent in test_set:
            label = classifier.classify(sent[0])
            if binary == True:
                if label == "pos":
                    classifierResultsDict[classifierKey].append(1)
                else:
                    classifierResultsDict[classifierKey].append(-1)
            else:
                classifierResultsDict[classifierKey].append(label)

    return pd.DataFrame(classifierResultsDict), pd.DataFrame(testDict)
from sklearn.model_selection import train_test_split

# For the classifier, to tell between pos and neg since the NN uses numbers.
posNegDict = {'pos': 0, 'neg': 1}
numToCatDict = {1: 'pos', 0: 'neg'}


# Collapse and average the word2vec vectors, removing words that aren't in the vocab.
def averageVectors(vec, words):
    words = [wd for wd in words if wd in vec.wv.index_to_key]
    if len(words) != 0:
        return np.average(vec.wv[words], axis=0)
    else:
        return None


# Gather the documents with their classifications in numeric form.
document = [(Word2Vec(movie_reviews.sents(file), min_count=1),
             movie_reviews.words(file),
             posNegDict[category])
            for file in movie_reviews.fileids()
            for category in movie_reviews.categories(file)]

# Randomize the document files so that the data doesn't bias.
shuffle(document)

# Gather user input.
userInput = []
userRaw = []
i = 0
for line in open("classifyUserInput.txt"):
    userRaw.append(word_tokenize(line))
    # Word2Vec expects a list of tokenized sentences, so wrap the token list;
    # min_count=1 keeps every word of the short input in the vocabulary.
    userInput.append(averageVectors(Word2Vec([userRaw[i]], min_count=1), userRaw[i]))
    i = i + 1

# Separate the vectors and the classification.
x = np.array([averageVectors(x[0], x[1]) for x in document])
y = np.array([y[2] for y in document])
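# The train_test_split import above is never used in this excerpt. A minimal,
# hypothetical continuation (the x_train/x_test names are ours, not from the
# original script) would split the averaged vectors before training a classifier:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)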
print("condll2007 to sents") inaugural_corp_sents = inaugural.sents() print("inaugural to sents") abc_corp_sents = abc.sents() print("ABC to sentences") genesis_corp_sents = genesis.sents() print("Genesis to sents") frame_net_corp_sents = fn.sents() print("Frame_net to sents") state_union_corp_sents = state_union.sents() print('state union to sents') subject_corp_sents = subjectivity.sents() print('Subjectvity to sents') brown_corp_sents = brown.sents() print("Brown corpus to sents") movie_reviews_corp_sents = movie_reviews.sents() print("Movie reviews to sents ") guttenberg_corp_sents = gutenberg.sents() print("Guttenberg to sents") treebank_corb_sents = treebank.sents() print("Freebank to sents") reuters_corp_sents = reuters.sents() print("Reuters to sents") webtext_corp_sents = webtext.sents() print("Webtext to sents") logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) print("Cleaning data ...")
import pandas
import pandasql
from nltk import sent_tokenize
from nltk import word_tokenize
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

movWords = []
movTarget = []
for sentence in movie_reviews.sents():
    for word in sentence:
        types = [b.lexname() for b in wn.synsets(word)]
        if ('adj.all' in types and word != 'i' and 'noun.quantity' not in types
                and all(['verb' not in type1 for type1 in types])
                and 'noun.food' not in types):
            movWords.append(word)
            movTarget.append('mov')
# load the movie reviews data, not verbs and not foods

filename = "filenames.csv"  # the recipe ids are stored in this file
recipeIds = pandas.read_csv(filename, encoding='ISO-8859-1')['filenames'].values  # load the file
dictInfo = {}  # create a dict for the word info
for recipeId in recipeIds:  # go through each recipe
    print recipeId
    filename = "reviews/reviews" + str(recipeId) + ".csv"  # this is our keywords file
    reviews = pandas.read_csv(filename, encoding='ISO-8859-1')  # load the file
    # get the reviews
    recRev = sent_tokenize(' '.join([review.lower().replace('c.', 'cup').replace('tspn.', 'teaspoon').replace(
__author__ = 'a_medelyan'

# Goal: Get movie reviews and read them
# See: http://www.nltk.org/book/ch02.html

from nltk.corpus import movie_reviews

# How many documents are in this corpus?
print len(movie_reviews.fileids())

# What are the categories?
print movie_reviews.categories()

# What are some file names?
print movie_reviews.fileids('neg')[:10]
print movie_reviews.fileids('pos')[:10]

# Print the words in a sample text
print movie_reviews.words('pos/cv000_29590.txt')

# Print the original text
print movie_reviews.raw('pos/cv000_29590.txt')

# Print the sentences of the text
print movie_reviews.sents('pos/cv000_29590.txt')

# Spare time? Calculate the average number of words and sentences in positive and negative reviews.
# Do people use a lot more words when giving positive vs. negative reviews?
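# A minimal sketch of the closing exercise above, in the same Python 2 style as the
# walkthrough; the loop and variable names are ours, not part of the original script.
for category in movie_reviews.categories():
    fileids = movie_reviews.fileids(category)
    avg_words = sum(len(movie_reviews.words(f)) for f in fileids) / float(len(fileids))
    avg_sents = sum(len(movie_reviews.sents(f)) for f in fileids) / float(len(fileids))
    print category, 'avg words per review:', avg_words, 'avg sentences per review:', avg_sents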
def find_summary_sentence(parser, fileid=None, localfile=None):
    """Finds the summary sentence for a body of text, specified by fileid or by localfile.
    fileid is accessed by NLTK.corpus.movie_reviews; localfile is a path to a non-NLTK text file"""
    # load feature/opinion keywords and their respective ranks
    opinion_ranks = load_opinion_keywords()
    feature_ranks = load_feature_keywords()
    proper_noun_rank = 2

    # convert feature/opinion words to sets, for quickly checking membership
    feature_words = set(feature_ranks.keys())
    opinion_words = set(opinion_ranks.keys())

    # load the movie review as a list of sentences (each sent is a list of words)
    if fileid and (not localfile):
        source = movie_reviews.sents(fileid)
    elif (not fileid) and localfile:
        source = open_file_as_sentences(localfile, feature_words, opinion_words)
    else:
        print "Please enter an nltk fileid, or the name of a local textfile"
        return

    # filter the review for sentences containing a feature and an opinion
    summary_sents = [[word.rstrip(string.punctuation) for word in sent
                      if word.rstrip(string.punctuation) != '']
                     for sent in source
                     if (set(sent) & opinion_words != set()) and
                        ((set(sent) & feature_words != set()) or len(find_proper_nouns(sent)) > 0)]

    summary_sents_with_feature_opinion_dist = []
    for sent in summary_sents:
        try:
            feature, feature_rank = None, 10000
            opinion, opinion_rank = None, 10000
            sent_str = string.join(sent, ' ')
            proper_nouns = set(find_proper_nouns(sent))  # unique to each sentence

            # find the best opinion/feature in the sentence
            for word in sent:
                if (word in opinion_words) and opinion_ranks[word] < opinion_rank:
                    opinion = word
                    opinion_rank = opinion_ranks[word]
                elif (word in feature_words) and feature_ranks[word] < feature_rank:
                    feature = word
                    feature_rank = feature_ranks[word]
                elif (word in proper_nouns) and proper_noun_rank < feature_rank:
                    feature = word
                    feature_rank = proper_noun_rank

            # keep track of the distance between feature and opinion for each sentence
            if feature and opinion:
                distance = dist_btwn_feature_and_opinion(feature, opinion, sent_str, parser)
                summary_sents_with_feature_opinion_dist.append((distance, sent_str))
        except JavaException:
            # print "Failure: sentence is too long (len = %i)" % len(sent)
            pass
        except AssertionError:
            # print "Failure: could not find root"
            pass

    # the best summary sentence is the one with the closest feature/opinion pair
    summary_sents_with_feature_opinion_dist.sort()
    if len(summary_sents_with_feature_opinion_dist) > 0:
        return summary_sents_with_feature_opinion_dist[0][1]
    else:
        return None
words = [w.lower() for w in brown.words()
         if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))


def singles(words):
    if len(words) < 1:
        return
    for w in words:
        if re.match("[a-zA-Z'-]+", w) and w.strip() != "''":
            yield w


def doubles(sentences):
    for s in sentences:
        s = s.split(' ')
# Module 5: Advanced Topics
# Gensim

from gensim.models import Word2Vec
from nltk.corpus import movie_reviews

embedding = Word2Vec(movie_reviews.sents(), min_count=1, size=10)
print(embedding.most_similar('man', topn=5))
print(embedding.most_similar('woman', topn=5))

# embedding.save('movie_model')
# embedding2 = Word2Vec.load('movie_model')
# print(embedding2.most_similar('man', topn=5))
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0])
             for i in mr.fileids()]
#test = lemma().lemmatize([i for i,j in documents])
random.shuffle(documents)

# DEFINE WORDS AS KEYS AND OCCURRENCES AS VALUES
#word_features = FreqDist(chain(*[i for i,j in documents]))
#from itertools import chain
word_features = FreqDist([x for y, z in documents for x in y])
word_features = list(word_features.keys())  #[:1000]

# TERM-DOC MATRIX, SAMPLING TRAIN AND TEST SETS AT 80-20
numtrain = int(len(documents) * 80 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

# RUN CLASSIFIER AND RETURN PERFORMANCE MEASURES
classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set) * 100)
classifier.show_most_informative_features(5)

""" MODELLING NLTK MOVIE REVIEWS - NB, WORD2VEC """
w2v = Word2Vec(mr.sents())
w2v.most_similar("damon", topn=5)
posts = nltk.corpus.nps_chat.xml_posts()
featuresets = [nltk.pos_tag(word_tokenize(post.text)) for post in posts]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(featuresets, backoff=t0)
t2 = nltk.BigramTagger(featuresets, backoff=t1)

##text = word_tokenize("I am good");
##print(t2.tag(text));
##print(text);

pos_docs = movies.fileids('pos')
neg_docs = movies.fileids('neg')
classifier_training = []
for doc in pos_docs:
    sents = movies.sents(doc)
    for sent in sents:
        tagged = t2.tag(sent)
        words = [w for w, k in tagged]
        tags = [k for w, k in tagged]
        feature = {}
        for i in range(len(words) - 1):
            feature[words[i] + ' ' + words[i + 1]] = tags[i] + ' ' + tags[i + 1]
        temp = (feature, 'pos')
        classifier_training.append(temp)
##print('pos data acquired !');

for doc in neg_docs:
    sents = movies.sents(doc)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 28 12:08:25 2018

@author: ritesh
"""

# importing libraries
import os
import gensim
from nltk.corpus import movie_reviews

# getting data
sentences = movie_reviews.sents()

# training word2vec model
model = gensim.models.Word2Vec(sentences, min_count=1)
# model.save('Mreview_model')

# load and test
# model = gensim.models.Word2Vec.load('Mreview_model')

# words most similar to mother
print("Most Similar:", model.most_similar('mother'))

# find the odd one out
print(model.doesnt_match("breakfast cereal dinner lunch".split()))
print(model.doesnt_match("cat dog table".split()))

# vector representation of the word human
print(model['human'])
print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding brown sentence structures ({0})...".format(len(brown.sents())) for sentence in brown.sents(): processed_count += 1 try: blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger()) tags = tuple([tag[1] for tag in blob.tags]) sentences.add(tags) except: print "\r", print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding movie_review sentence structures ({0})...".format(len(movie_reviews.sents())) for sentence in movie_reviews.sents(): processed_count += 1 try: blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger()) tags = tuple([tag[1] for tag in blob.tags]) sentences.add(tags) except: print "\r", print "Processed {0} sentences\r".format(processed_count), print "Processed {0} sentences".format(processed_count) print "Current Structure total: {0}".format(len(sentences)) print "Saving structures to text file" f = open('./sentence_structures.txt', 'w')
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(featuresets, backoff=t0)
t2 = nltk.BigramTagger(featuresets, backoff=t1)

##text = word_tokenize("I am good");
##print(t2.tag(text));
##print(text);

from nltk.corpus import movie_reviews as movies

pos_docs = movies.fileids('pos')
neg_docs = movies.fileids('neg')
classifier_training = []
for doc in pos_docs:
    sents = movies.sents(doc)
    for sent in sents:
        tagged = t2.tag(sent)
        words = [w for w, k in tagged]
        tags = [k for w, k in tagged]
        feature = {}
        for i in range(len(words) - 1):
            feature[words[i] + ' ' + words[i + 1]] = tags[i] + ' ' + tags[i + 1]
        temp = (feature, 'pos')
        classifier_training.append(temp)
print('pos data acquired !')

for doc in neg_docs:
"""Following the tutorial on http://www.nltk.org/howto/sentiment.html""" from nltk.classify import NaiveBayesClassifier from nltk.corpus import movie_reviews from nltk.sentiment import SentimentAnalyzer from nltk.sentiment.util import * n_instances = 1000 pos_sent = [(sent, 'pos') for sent in movie_reviews.sents(categories='pos')][:n_instances] neg_pos = [(sent, 'neg') for sent in movie_reviews.sents(categories='neg')][:n_instances] print(pos_sent[:30]) print(neg_pos[:30]) # Split subjective and objective instances to keep a balanced distribution in both train and test sets train_subj_docs = pos_sent[:80] test_subj_docs = pos_sent[80:1000] train_obj_docs = neg_pos[:80] test_obj_docs = neg_pos[80:1000] testing_docs = test_subj_docs + test_obj_docs training_docs = train_subj_docs + train_obj_docs # Handles negation sentim_analyzer = SentimentAnalyzer() all_words_neg = sentim_analyzer.all_words( [mark_negation(doc) for doc in training_docs])
from nltk.corpus import sentiwordnet as wdn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

pp = pprint.PrettyPrinter(indent=4)

neg, pos = movie_reviews.categories()

new_phrases = []
for ids in movie_reviews.fileids(neg):
    for phrase in movie_reviews.sents(ids)[1:]:
        if len(phrase) > 3:
            new_phrases.append({
                'type': 'neg',
                'phrase': ' '.join(phrase).lower(),
                'pos_score': 0.0,
                'neg_score': 0.0,
                'over_score': 0.0
            })

for ids in movie_reviews.fileids(pos):
    for phrase in movie_reviews.sents(ids):
        if len(phrase) > 3:
            new_phrases.append({
                'type': 'pos',
                'phrase': ' '.join(phrase).lower(),
                'pos_score': 0.0,
def preprocess():
    train_data, train_label = [], []
    model = Word2Vec(movie_reviews.sents())

    path = os.getcwd() + "/aclImdb/train/pos/"
    for filename in os.listdir(path):
        pos_file = open("aclImdb/train/pos/" + filename, 'r')
        for line in pos_file.readlines():
            review = []
            for word in line.split():
                if word in model.wv.vocab:
                    review.append(model.wv[word])
            train_data.append(review)
            train_label.append((0, 1))
        pos_file.close()

    path = os.getcwd() + "/aclImdb/train/neg/"
    for filename in os.listdir(path):
        neg_file = open("aclImdb/train/neg/" + filename, 'r')
        for line in neg_file.readlines():
            review = []
            for word in line.split():
                if word in model.wv.vocab:
                    review.append(model.wv[word])
            train_data.append(review)
            train_label.append((1, 0))
        neg_file.close()

    print len(train_data), len(train_label)
    x_train = np.array([np.array(xi) for xi in train_data])
    y_train = np.array(train_label)
    print x_train.shape, x_train[0].shape, x_train[1].shape
    print y_train.shape, y_train[0].shape, y_train[1].shape
    np.save('train_data.npy', x_train)
    np.save('train_label.npy', y_train)

    test_data, test_label = [], []
    path = os.getcwd() + "/aclImdb/test/pos/"
    for filename in os.listdir(path):
        pos_file = open("aclImdb/test/pos/" + filename, 'r')
        for line in pos_file.readlines():
            review = []
            for word in line.split():
                if word in model.wv.vocab:
                    review.append(model.wv[word])
            test_data.append(review)
            test_label.append((0, 1))
        pos_file.close()

    path = os.getcwd() + "/aclImdb/test/neg/"
    for filename in os.listdir(path):
        neg_file = open("aclImdb/test/neg/" + filename, 'r')
        for line in neg_file.readlines():
            review = []
            for word in line.split():
                if word in model.wv.vocab:
                    review.append(model.wv[word])
            test_data.append(review)
            # negative test reviews get the same one-hot label as negative training reviews
            test_label.append((1, 0))
        neg_file.close()

    print 'finish loading data'
    print len(test_data), len(test_label)
    x_test = np.array([np.array(xi) for xi in test_data])
    y_test = np.array(test_label)
    np.save('test_data.npy', x_test)
    np.save('test_label.npy', y_test)

    return [
        np.asarray(x_train),
        np.asarray(y_train),
        np.asarray(x_test),
        np.asarray(y_test)
    ]
def find_summary_sentence(parser, fileid=None, localfile=None):
    """Finds the summary sentence for a body of text, specified by fileid or by localfile.
    fileid is accessed by NLTK.corpus.movie_reviews; localfile is a path to a non-NLTK text file"""
    # load feature/opinion keywords and their respective ranks
    opinion_ranks = load_opinion_keywords()
    feature_ranks = load_feature_keywords()
    proper_noun_rank = 2

    # convert feature/opinion words to sets, for quickly checking membership
    feature_words = set(feature_ranks.keys())
    opinion_words = set(opinion_ranks.keys())

    # load the movie review as a list of sentences (each sent is a list of words)
    if fileid and (not localfile):
        source = movie_reviews.sents(fileid)
    elif (not fileid) and localfile:
        source = open_file_as_sentences(localfile, feature_words, opinion_words)
    else:
        print("Please enter an nltk fileid, or the name of a local textfile")
        return

    # filter the review for sentences containing a feature and an opinion
    summary_sents = [[word.rstrip(string.punctuation) for word in sent
                      if word.rstrip(string.punctuation) != '']
                     for sent in source
                     if (set(sent) & opinion_words != set()) and
                        ((set(sent) & feature_words != set()) or len(find_proper_nouns(sent)) > 0)]

    summary_sents_with_feature_opinion_dist = []
    for sent in summary_sents:
        try:
            feature, feature_rank = None, 10000
            opinion, opinion_rank = None, 10000
            sent_str = ' '.join(sent)  # str.join(sent, ' ') would raise a TypeError
            proper_nouns = set(find_proper_nouns(sent))  # unique to each sentence

            # find the best opinion/feature in the sentence
            for word in sent:
                if (word in opinion_words) and opinion_ranks[word] < opinion_rank:
                    opinion = word
                    opinion_rank = opinion_ranks[word]
                elif (word in feature_words) and feature_ranks[word] < feature_rank:
                    feature = word
                    feature_rank = feature_ranks[word]
                elif (word in proper_nouns) and proper_noun_rank < feature_rank:
                    feature = word
                    feature_rank = proper_noun_rank

            # keep track of the distance between feature and opinion for each sentence
            if feature and opinion:
                distance = dist_btwn_feature_and_opinion(feature, opinion, sent_str, parser)
                summary_sents_with_feature_opinion_dist.append((distance, sent_str))
        except JavaException:
            # print("Failure: sentence is too long (len = %i)" % len(sent))
            pass
        except AssertionError:
            # print("Failure: could not find root")
            pass

    # the best summary sentence is the one with the closest feature/opinion pair
    summary_sents_with_feature_opinion_dist.sort()
    if len(summary_sents_with_feature_opinion_dist) > 0:
        return summary_sents_with_feature_opinion_dist[0][1]
    else:
        return None
def word2vec_processing(self, corpora='treebank'):
    print("Start word2vec training...")
    self.t = Word2Vec(movie_reviews.sents())
    print("Word2vec training is finished")
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews
import os

data_folder = r"""C:\Users\K1774755\King's College London\Cognitive Impairment in Schizophrenia - Documents\Courses\CUSMUMH\week 7 - NLP_courses_and_tutorials with nltk & spacy"""
pycharm_folder = r'C:\Users\K1774755\PycharmProjects\toy-models\NLP'

# Let's generate word vectors over the Brown corpus text.
# We will have 20 dimensions, using a window of five for the context words in the skip-grams
# (e.g. c1, c2, w, c3, c4).
# This might be a little slow (maybe 1-2 minutes).

# for the Brown corpus
b = Word2Vec(brown.sents(), size=400, window=10, min_count=5)
# for the movie review corpus
mr = Word2Vec(movie_reviews.sents(), size=20, window=5, min_count=3)

# Now we have the vectors, we can see how good they are by measuring which words are similar to each other.
b.wv.most_similar('company', topn=5)
mr.wv.most_similar('love', topn=5)

# Try altering the window and the dimension size, to see if you get better results.

# We can also do some arithmetic with the words. Let's try that classical result, king - man + woman.
b.wv.most_similar(positive=['biggest', 'small'], negative=['big'], topn=5)

# We can then load these in using Gensim; they might take a minute to load.
from gensim.models.keyedvectors import KeyedVectors
glove = KeyedVectors.load_word2vec_format(os.path.join(pycharm_folder, 'glove.twitter.27B.25d.txt.bz2'), binary=False)
print("Done loading")

# Can you find any cool word combinations? What differences are there in the datasets?
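# One possible answer to the closing questions, as a rough sketch: it assumes the GloVe
# file above loaded successfully and that 'king', 'man' and 'woman' are in both
# vocabularies, which is likely for GloVe but not guaranteed for the small Brown model.
print(glove.most_similar(positive=['king', 'woman'], negative=['man'], topn=5))
print(b.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5))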
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank

if __name__ == '__main__':
    brown_sentences = Word2Vec(brown.sents())
    movie_sentences = Word2Vec(movie_reviews.sents())
    treebank_sentences = Word2Vec(treebank.sents())

    print brown_sentences.most_similar('money', topn=5)
    print movie_sentences.most_similar('money', topn=5)
    print treebank_sentences.most_similar('money', topn=5)
"나의": "너의", "당신은": "나는", "너의": "나의"} def hugot_bot(): print("안녕 이름이 뭐니?") chat = Chat(pairs, reflections) chat.converse() hugot_bot() import nltk from nltk.corpus import movie_reviews movie_reviews.sents() sentences = [list(s) for s in movie_reviews.sents()] sentences[0] sentences[1] #도수 #가까이 근접 #벡터 사이의 코사인 유사도를 이용한 가까운 단어를 추출. from gensim.models.word2vec import Word2Vec model = Word2Vec(sentences) model.init_sims(replace=True) model.similarity('actor', 'actress') model.similarity('he', 'she') model.similarity('actor', 'she') model.similarity('actress', 'she') model.most_similar('accident', topn=10) #default 값이 10개
    return correct_texts


if __name__ == '__main__':
    with open("nytimes_news_articles.txt", "r") as f:
        corpus = f.read().splitlines()

    nytimes_corpus = []
    for line in corpus:
        for sent in sent_tokenize(line):
            if len(sent) != 0:
                if word_tokenize(sent)[0] != "URL":
                    nytimes_corpus.append(word_tokenize(sent.lower()))
    corpus = nytimes_corpus

    for sent in movie_reviews.sents():
        corpus.append(sent)
    #cfd, cpd = bigram_language_model(movie_reviews.sents(), 3)
    print(len(corpus))

    trigram_model = ngram_language_model(corpus, 3)

    vocabulary_size = input('Please enter the vocabulary size.')
    vocabulary_size = int(vocabulary_size)
    if vocabulary_size > 9000:
        print('There are no words to learn.')
        pass
    else:
        thousands = vocabulary_size // 1000 + 1
import sys, os, nltk, random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.sents(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
all_bigrams = nltk.FreqDist((w1.lower(), w2.lower())
                            for sent in movie_reviews.sents()
                            for (w1, w2) in nltk.bigrams(sent))
word_features = all_words.keys()[:2000]
bigram_features = all_bigrams.keys()[:2000]


def document_features(document):
    document_words = set(word for sent in document for word in sent)
    document_bigrams = set(bg for sent in document
                           for bg in nltk.bigrams(sent))
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    for bigram in bigram_features:
        features['contains bigram(%s)' % str(bigram)] = (bigram in document_bigrams)
    return features


'''
Sentence classifier. Goal is to break text up into sentences.
Use punctuation marks to do so...
'''
def get_index_sequences(Ids, voc):
    sentences = []
    for fid in Ids:
        Sentences = movie_reviews.sents(fileids=fid)
        sentences.extend([np.array([voc.get(w.lower(), 0) for w in s]) for s in Sentences])
    return sentences
import nltk
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank

b = Word2Vec(brown.sents())
mr = Word2Vec(movie_reviews.sents())
t = Word2Vec(treebank.sents())

print(b.most_similar('money', topn=5))
print('aew')
# Module 3: Corpus
# Movie Review Corpus
# Author: Dr. Alfred

from nltk.corpus import movie_reviews

# print(movie_reviews.fileids())
# print(movie_reviews.categories())

fileid = 'pos/cv971_10874.txt'
text = movie_reviews.raw(fileid)
print(text)
print(" Num of chars :", len(movie_reviews.raw(fileid)))
print(" Num of words :", len(movie_reviews.words(fileid)))
print(" Num of sentences :", len(movie_reviews.sents(fileid)))
print(" Categories:", movie_reviews.categories(fileid))
def word2vec(document, size=100):
    """
    :param document: a list of tokenized sentences,
                     e.g. [['first', 'sentence'], ['second', 'sentence']]
    :param size: dimensionality of the word vectors
    :return: a trained gensim Word2Vec model
    """
    model = Word2Vec(sentences=document, min_count=1, size=size)
    return model


full_corpus = []
for i in negids:
    full_corpus.extend(movie_reviews.sents(i))
for i in posids:
    full_corpus.extend(movie_reviews.sents(i))

print len(full_corpus)
print full_corpus[0]
print full_corpus[1]
print full_corpus[0][0]

# The call below passes size=50, so the function signature above now accepts a size argument.
model = word2vec(full_corpus, size=50)
print model['bad']
print model['good']