def extract_features_posneg(review_blob, features):
    """
    Add opinion-lexicon word counts to the feature dictionary of a review.

    Arguments:
    1) review_blob, a textblob object created from the string of review
       text; only its ``words`` attribute is read
    2) features, a dictionary holding the features of the input review;
       it is updated in place

    Three entries are written, using the opinion lexicon from the nltk
    library:
    1) "positive": total number of positive words
    2) "negative": total number of negative words
    3) "more_pos": 1 if positive words outnumber negative words, 0 if the
       two counts are equal, -1 otherwise

    Returns the features dictionary with these three entries added to it.
    """
    review_words = list(review_blob.words)
    positive_lexicon = set(opinion_lexicon.words('positive-words.txt'))
    negative_lexicon = set(opinion_lexicon.words('negative-words.txt'))

    n_positive = sum(1 for token in review_words if token in positive_lexicon)
    n_negative = sum(1 for token in review_words if token in negative_lexicon)

    features["positive"] = n_positive
    features["negative"] = n_negative
    # Sign of the difference between the two counts: 1, 0 or -1.
    features["more_pos"] = (n_positive > n_negative) - (n_positive < n_negative)
    return features
def prepare_lexicon(process=True, dim=250, save=False):
    """
    Build, or reload, a list of frequently used sentiment words.

    With ``process`` true, the opinion-lexicon words are counted over the
    'vader' and 'sentiment140' texts returned by
    DatasetManager.prepare_datasets(). The 4000 most frequent entries are
    split into positive and negative words, and the first ``dim`` of each
    (positive first) are returned. When ``save`` is also true the list is
    written to 'senti.lexicon', one word per line.

    With ``process`` false, the list is read back from 'senti.lexicon'.
    """
    if not process:
        with open('senti.lexicon', 'r') as lexicon_file:
            return [row.strip() for row in lexicon_file]

    datasets = DatasetManager().prepare_datasets()
    negative_words = set(opinion_lexicon.negative())
    positive_words = set(opinion_lexicon.positive())

    # Every lexicon word starts at zero so unseen words are still ranked.
    frequency = dict.fromkeys(opinion_lexicon.words(), 0)
    for source in ('vader', 'sentiment140'):
        for tokens in datasets[source]['text']:
            for token in tokens:
                if token in frequency:
                    frequency[token] += 1

    ranked = [word for word, _ in Counter(frequency).most_common(4000)]
    # A word found in the negative list is never also counted as positive.
    top_negative = [word for word in ranked if word in negative_words]
    top_positive = [
        word for word in ranked
        if word not in negative_words and word in positive_words
    ]
    selected = top_positive[:dim] + top_negative[:dim]

    if save:
        with open('senti.lexicon', 'w') as lexicon_file:
            lexicon_file.writelines(word + '\n' for word in selected)
    return selected
def prepare_lexicon(corpus, embedding, num=250, extra=False):
    """
    Pick the most frequent sentiment words of a corpus and weight them.

    Opinion-lexicon words that also appear in ``embedding.vocab`` are counted
    over ``corpus``. Among the 5000 most frequent, the first ``num`` positive
    and the first ``num`` negative words are kept (positive first). Each kept
    word is weighted by its count divided by the summed count of the kept
    words of the same polarity.

    :param corpus: iterable of sentences, each an iterable of word tokens
    :param embedding: object whose ``vocab`` attribute iterates over the
        words it knows
    :param num: maximum number of words kept per polarity
    :param extra: when true, the entries of the module-level Extra_Lexicon
        that are in the vocabulary are appended, each with weight 1.0
    :return: tuple ``(words, weights)`` of two parallel lists
    """

    def _relative(counts):
        # Share of each count in the total. When the total is zero (empty
        # corpus, or none of the kept words occurs in it) every share is 0.0
        # instead of raising ZeroDivisionError.
        total = sum(counts)
        if not total:
            return [0.0 for _ in counts]
        return [x * 1.0 / total for x in counts]

    V = set(embedding.vocab)
    neg = set(opinion_lexicon.negative())
    pos = set(opinion_lexicon.positive())

    # Only lexicon words the embedding knows are counted; they all start at
    # zero so that words absent from the corpus are still ranked.
    lexicon_dic = {w: 0 for w in opinion_lexicon.words() if w in V}
    for sent in corpus:
        for w in sent:
            if w in lexicon_dic:
                lexicon_dic[w] += 1

    N, N_count = [], []
    P, P_count = [], []
    for word, count in Counter(lexicon_dic).most_common(5000):
        # A word listed under both polarities is treated as negative.
        if word in neg:
            N.append(word)
            N_count.append(count)
        elif word in pos:
            P.append(word)
            P_count.append(count)

    Senti_L = P[:num] + N[:num]
    Senti_W = _relative(P_count[:num]) + _relative(N_count[:num])

    if extra:
        Extra_L = [l for l in Extra_Lexicon if l in V]
        Extra_W = [1.0 for _ in Extra_L]
        return Senti_L + Extra_L, Senti_W + Extra_W
    return Senti_L, Senti_W
def prepLexicon():
    """
    Map every opinion-lexicon word to its polarity flag.

    Returns a dictionary whose keys are the lexicon words and whose values
    are 0 for negative words and 1 for positive words.

    The polarity comes from the corpus reader's negative() and positive()
    word lists rather than from watching for the word "zombie" while
    scanning the combined list. That approach only worked while "zombie"
    was the last negative word and no other word contained that substring.
    """
    sentiment_words = {w: 0 for w in opinion_lexicon.negative()}
    # Positive labels are written last, so a word present in both lists
    # ends up flagged 1, exactly as the sequential scan produced.
    sentiment_words.update((w, 1) for w in opinion_lexicon.positive())
    return sentiment_words
def opinion_lexicon(self, opinion=None):
    '''
    Download the opinion lexicon through nltk and return one of its word lists.

    :param opinion: 'positive' or 'negative' to select a single polarity;
        any other value (including the default None) selects all words
    :return: the result of the reader's positive(), negative() or words()
        call, depending on ``opinion``
    '''
    from nltk.corpus import opinion_lexicon as lexicon_reader

    nltk.download('opinion_lexicon', quiet=True)

    if opinion == 'positive':
        return lexicon_reader.positive()
    if opinion == 'negative':
        return lexicon_reader.negative()
    return lexicon_reader.words()
def advanced_classifier(training_file, test_file):
    """
    Train a lexicon-based Naive Bayes classifier and apply two override rules.

    The training file is parsed as tab-delimited and the test file as
    comma-delimited; both go through format_json and parse_text. Texts are
    turned into counts of opinion-lexicon words (English stop words removed)
    and fed to a MultinomialNB classifier. After prediction, a test text for
    which includes_hyperlink holds is relabelled 'neutral' and one for which
    includes_positive_hashtag holds is relabelled 'positive'. The hashtag
    rule is applied last, so it wins when both hold.

    Returns a pair: the mean of the 10-fold cross-validation scores on the
    training data (default scoring of cross_val_score) and the fraction of
    test predictions equal to their label.
    """
    # Load the labelled data, then normalise the raw texts.
    training_texts, training_labels = format_json(
        parse(training_file, delimiter='\t'))
    test_texts, test_labels = format_json(parse(test_file, delimiter=','))
    training_texts = parse_text(training_texts)
    test_texts = parse_text(test_texts)

    # Bag-of-words restricted to the opinion lexicon vocabulary.
    lexicon_vocabulary = list(set(opinion_lexicon.words()))
    count_vectorizer = CountVectorizer(analyzer="word",
                                       stop_words='english',
                                       vocabulary=lexicon_vocabulary)
    training_counts = count_vectorizer.transform(training_texts)
    classifier = MultinomialNB()

    # 10-fold cross-validation on the training data.
    folds = KFold(n=len(training_texts), n_folds=10)
    fold_scores = cross_validation.cross_val_score(
        classifier, training_counts, training_labels, cv=folds)
    mean_cv_score = sum(fold_scores) / len(fold_scores)

    # Fit on the full training set and predict the test set.
    classifier.fit(training_counts, training_labels)
    predictions = classifier.predict(count_vectorizer.transform(test_texts))

    # Rule-based overrides on top of the model output.
    for index, _ in enumerate(predictions):
        text = test_texts[index]
        if includes_hyperlink(text):
            predictions[index] = 'neutral'
        if includes_positive_hashtag(text):
            predictions[index] = 'positive'

    # Share of predictions that match the gold labels.
    hits = 0
    for index, predicted in enumerate(predictions):
        if predicted == test_labels[index]:
            hits += 1
    return mean_cv_score, hits / len(predictions)
def main():
    """
    Print opinion-lexicon samples and their overlaps with the WordNet list.

    Shows the lexicon file ids, the negative word list and the positive word
    set, then pretty-prints three intersections: positive/WordNet,
    negative/WordNet and positive/negative.
    """
    print(opinion_lexicon.fileids())
    print(opinion_lexicon.words('negative-words.txt'))

    wordnet_vocab = set(get_list_of_words_wordnet())
    positive_vocab = set(get_list_of_word_positive())
    negative_vocab = set(get_list_of_word_negative())
    print(positive_vocab)

    # All intersections are computed before anything is reported; the first
    # one holds the words shared by the positive list and WordNet.
    reports = [
        ('Positive & Wordnet: ',
         get_intersection(positive_vocab, wordnet_vocab)),
        ('Negative & Wordnet: ',
         get_intersection(negative_vocab, wordnet_vocab)),
        ('Positive & Negative: ',
         get_intersection(positive_vocab, negative_vocab)),
    ]
    for heading, shared_words in reports:
        print(heading)
        pprint(shared_words)
def run_NB(training_file, test_file):
    """
    Train and evaluate a Naive Bayes classifier on opinion-lexicon counts.

    The training file is parsed as tab-delimited and the test file as
    comma-delimited, both through format_json; the first element of every
    text entry is used as the document. Documents are vectorised as counts
    of opinion-lexicon words (English stop words removed) and classified
    with MultinomialNB.

    Returns a pair: the mean of the 10-fold cross-validation scores on the
    training data (default scoring of cross_val_score) and the fraction of
    test predictions equal to their label.
    """
    # Read both labelled data sets.
    raw_training = parse(training_file, delimiter='\t')
    training_entries, training_labels = format_json(raw_training)
    raw_test = parse(test_file, delimiter=',')
    test_entries, test_labels = format_json(raw_test)

    # Each entry carries its document in position 0.
    training_docs = [entry[0] for entry in training_entries]
    test_docs = [entry[0] for entry in test_entries]

    vectorizer = CountVectorizer(
        analyzer="word",
        stop_words='english',
        vocabulary=list(set(opinion_lexicon.words())),
    )
    training_matrix = vectorizer.transform(training_docs)
    model = MultinomialNB()

    # Average score over 10 cross-validation folds of the training data.
    splitter = KFold(n=len(training_docs), n_folds=10)
    cv_scores = cross_validation.cross_val_score(
        model, training_matrix, training_labels, cv=splitter)
    average_cv_score = sum(cv_scores) / len(cv_scores)

    # Fit on everything, then score the held-out test set.
    model.fit(training_matrix, training_labels)
    predictions = model.predict(vectorizer.transform(test_docs))
    matching = [
        position for position in range(len(predictions))
        if predictions[position] == test_labels[position]
    ]
    test_score = len(matching) / len(predictions)

    return average_cv_score, test_score
def get_list_of_word_positive():
    """Return the opinion-lexicon words read from 'positive-words.txt'."""
    positive_file = 'positive-words.txt'
    return opinion_lexicon.words(positive_file)
# Fetch the nltk resources named 'stopwords' and 'punkt'.
nltk.download('stopwords')
nltk.download('punkt')

#####################
#
# Problem 4: Movie Review Sentiment starter code...
#
#####################

# A boolean to turn on/off the movie-review-sentiment portion of the code.
RUN_MOVIEREVIEW_CLASSIFIER = True
if RUN_MOVIEREVIEW_CLASSIFIER == True:

    # Read all of the opinion words in from the nltk corpus, one list per
    # polarity file.
    pos = list(opinion_lexicon.words('positive-words.txt'))
    neg = list(opinion_lexicon.words('negative-words.txt'))

    # Store them as sets as well (it'll make our feature extractor faster).
    pos_set = set(pos)
    neg_set = set(neg)

    # Read all of the fileids in from the nltk corpus and pair each one with
    # its category label ("pos" or "neg"). No shuffling happens in the
    # lines shown here.
    pos_ids = [(fileid, "pos") for fileid in movie_reviews.fileids('pos')]
    neg_ids = [(fileid, "neg") for fileid in movie_reviews.fileids('neg')]
    labeled_fileids = pos_ids + neg_ids

    # Here, we "seed" the random number generator with 0 so that we'll all
    # get the same split, which will make it easier to compare results.
] for text in documents_tokenized] from nltk.sentiment.util import mark_negation documents_tokenized_lemmatized_negated = [ mark_negation(document) for document in documents_tokenized_lemmatized ] ready_corpus = documents_tokenized_lemmatized_negated download('opinion_lexicon') from nltk.corpus import opinion_lexicon # we consider only sentiment words, opinion_lexicon icludes already mispelled sentiment words, # so we did not use the enchant library this time. sentiment_words = opinion_lexicon.words() sentiment_words_negated = [word + '_NEG' for word in sentiment_words] sentiment_features = sentiment_words + sentiment_words_negated from gensim import corpora, models, matutils # build the dictionary dictionary = corpora.Dictionary(ready_corpus) print(dictionary) from nltk.sentiment import SentimentAnalyzer sentiment_analizer = SentimentAnalyzer() list_all_words = sentiment_analizer.all_words(ready_corpus) used_sentiment_words = list(
import nltk
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Loaded once: calling stopwords.words() for every candidate word re-reads
# the stop-word list each time and makes the membership test linear.
_ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))


def create_feature(words):
    """
    Build a feature dictionary from a sequence of words.

    Every word that is not an English stop word becomes a key mapped to
    True; stop words are dropped.
    """
    return {word: True for word in words if word not in _ENGLISH_STOPWORDS}


# One labelled feature set per lexicon word. Appending a single tuple built
# from the whole list left each list with exactly one element, so the
# train/test slices below selected the same item for both sets.

# create tuples of negative words
words = opinion_lexicon.words("negative-words.txt")
neg_words = [(create_feature([word]), "Negative") for word in words]

# create tuples of positive words
words = opinion_lexicon.words("positive-words.txt")
pos_words = [(create_feature([word]), "Positive") for word in words]

# First part of each list for training, last part for testing.
train_set = neg_words[:3587] + pos_words[:1504]
test_set = neg_words[-1196:] + pos_words[-501:]
#print(test_set)

algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(train_set, algorithm, max_iter=100)
#classifier.show_most_informative_features(10)
vocabSize = 0 #Count the number of documents in each category tweetCount = {} numTweets = 0.0 testProbs = {} for i in range(0, len(engstopwords)): mystopwords[engstopwords[i]] = 1 for i in range(0, len(engwords)): mywords[engwords[i]] = 1 flag = 0 #0 = negative, 1 = positive for w in opinion_lexicon.words(): if flag == 0: sentiment_words[w] = flag if re.search("zombie",w): flag = 1 else: sentiment_words[w] = flag """ lines = [line.rstrip('\n') for line in open('positive-words.txt')] for w in lines: sentiment_words[w] = 1 #print "postitive word: " + w
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import opinion_lexicon

# Print a few samples of the opinion lexicon: the first entries of the full
# and negative word lists, then the first ten words unsorted and sorted.
lexicon_words = opinion_lexicon.words()
print(lexicon_words[:4])
print(opinion_lexicon.negative()[:4])
print(lexicon_words[0:10])
print(sorted(lexicon_words)[0:10])
""" demo code in Project3 instruction """
import nltk
from nltk.corpus import opinion_lexicon
from nltk.corpus import sentence_polarity

# TODO: add these lines to project3 main file
# nltk.download('opinion_lexicon')
# nltk.download('sentence_polarity')

if __name__ == '__main__':
    # For the full, negative and positive word lists in turn: print a
    # heading, the first four entries and the number of entries.
    for heading, reader_name in (
            ('opinion lexicon:', 'words'),
            ('negative lexicon:', 'negative'),
            ('positive lexicon:', 'positive'),
    ):
        print(heading)
        print(getattr(opinion_lexicon, reader_name)()[:4])
        print(len(getattr(opinion_lexicon, reader_name)()))

    print()
    print('-------------------------------------------------------')

    # sentence polarity
    print('all sentences:')
#!/usr/bin/env python3 from nltk.corpus import opinion_lexicon from nltk import word_tokenize import matplotlib.pyplot as plt import random pos_word = opinion_lexicon.words("positive-words.txt") neg_word = opinion_lexicon.words("negative-words.txt") def analyse_with_opinion(): try: with open("tweets/twitter.txt", encoding="utf_16") as f: sentences = f.read().split("\n") except: print( "\nError occured while trying to read the twitter.txt. It is either missing or it uses different character set than UTF-16." ) input("Press ENTER to continue....") return countpos = 0 countneg = 0 countneu = 0 for sentence in sentences: pos_word_count = 0 neg_word_count = 0 for word in pos_word: if word in sentence: pos_word_count += 1
def get_list_of_word_negative():
    """Return the opinion-lexicon words read from 'negative-words.txt'."""
    negative_file = 'negative-words.txt'
    return opinion_lexicon.words(negative_file)
# Load the negative and the positive reviews.
train1 = pd.read_csv("fulldata_neg.txt", header=0, delimiter="\n")
train2 = pd.read_csv("fulldata_pos.txt", header=0, delimiter="\n")

# Attach the sentiment label as a 'feel' column: 0 for the negative file,
# 1 for the positive file (12500 labels each).
neg = [0] * 12500
pos = [1] * 12500
dat = pd.DataFrame({'feel': neg})
dat2 = pd.DataFrame({'feel': pos})
train1 = train1.join(dat)
train2 = train2.join(dat2)

# Stack both frames into one training set with a fresh 0..n-1 index.
frames = [train1, train2]
training_set = pd.concat(frames, ignore_index=True)
num_reviews = training_set["Review"].size

# Clean every review with review_to_words, passing it the opinion words.
opinions = set(opinion_lexicon.words())
clean_train_reviews = [
    review_to_words(training_set["Review"][i], opinions)
    for i in range(num_reviews)
]

# Bag-of-words features limited to 500 terms.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=500,
)
train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()

# Labels are taken from the second column of the training set.
y = training_set.iloc[:, 1].values
import nltk from nltk.tokenize import word_tokenize from nltk.corpus import opinion_lexicon from nltk.corpus import sentiwordnet as swn #create a list neg_word to store the negative words from corpora opinio_lexicon neg_word = [] for wor in opinion_lexicon.words( "negative-words.txt" ): #Fetch the values from negative text file and converted into simple words neg_word.append(wor) #print("Negative words :",neg_word)optional for testing pos_word = [] for wos in opinion_lexicon.words("positive-words.txt"): pos_word.append(wos) #print("Positive words :",pos_word)optional for testing text1 = '''My mood is so bad''' #take a input output_word = word_tokenize( text1 ) #tokenzine into words for further processing as we can not iterate directly to a string #Create a function for calculating positive and negative words from the input def calculator(value1): # Count positive words numPosWords = 0 for word in output_word: if word in pos_word: numPosWords += 1