def extract_features_posneg(review_blob, features):
    """ This function takes in two arguments:
            1) review_blob, a TextBlob object created from the string of review text
            2) features, a dictionary holding the features of the input review

        In this function, we want to extract three features from the input review:
            1) total number of positive words
            2) total number of negative words
            3) whether positive words outnumber negative words: if they do, the
               value is 1; if the counts are equal, 0; otherwise, -1.

        Note: we use the opinion lexicon from the nltk library

        This function returns the features (dictionary) with these three features
        added to it.
    """

    all_words = list(review_blob.words)

    pos_words = list(opinion_lexicon.words('positive-words.txt'))
    neg_words = list(opinion_lexicon.words('negative-words.txt'))

    pos_set = set(pos_words)
    neg_set = set(neg_words)

    pos_count = 0
    neg_count = 0

    for word in all_words:

        # the lexicon is all lowercase, so normalize case before the lookup
        if word.lower() in pos_set:
            pos_count += 1

        if word.lower() in neg_set:
            neg_count += 1

    features["positive"] = pos_count
    features["negative"] = neg_count

    value = None
    if pos_count > neg_count:
        value = 1
    elif pos_count == neg_count:
        value = 0
    else:
        value = -1

    features["more_pos"] = value

    return features
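
# A minimal usage sketch for the extractor above, assuming TextBlob is
# installed and opinion_lexicon is imported and downloaded as elsewhere here:
from textblob import TextBlob

review = TextBlob("The acting was great but the plot felt terrible.")
features = extract_features_posneg(review, {})
print(features)  # expected: {'positive': 1, 'negative': 1, 'more_pos': 0}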
Example #2
def prepare_lexicon(process=True, dim=250, save=False):
    if process:
        dm = DatasetManager()
        data = dm.prepare_datasets()
        # positive/negative word sets from the NLTK opinion lexicon
        nega = set(opinion_lexicon.negative())
        posi = set(opinion_lexicon.positive())
        lexicon = opinion_lexicon.words()
        lexicon_dic = {x: 0 for x in lexicon}
        for t in data['vader']['text']:
            for w in t:
                if w in lexicon_dic:
                    lexicon_dic[w] += 1
        for t in data['sentiment140']['text']:
            for w in t:
                if w in lexicon_dic:
                    lexicon_dic[w] += 1
        # rank lexicon words by corpus frequency and keep the 4000 most common
        L = Counter(lexicon_dic).most_common(4000)
        N = []
        P = []
        for w, _ in L:
            if w in nega:
                N.append(w)
            elif w in posi:
                P.append(w)
        # final lexicon: the `dim` most frequent words of each polarity
        l = P[:dim] + N[:dim]
        if save:
            with open('senti.lexicon', 'w') as f:
                for d in l:
                    f.write(d)
                    f.write('\n')
        return l
    else:
        with open('senti.lexicon', 'r') as f:
            data = [line.strip() for line in f]
        return data
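
# A hedged usage sketch, assuming the project's DatasetManager and its
# datasets are actually available in this environment:
lexicon = prepare_lexicon(process=True, dim=250, save=True)
print(len(lexicon))  # at most 500 words: up to 250 positive + 250 negative
print(lexicon[:5])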
Example #3
def prepare_lexicon(corpus, embedding, num=250, extra=False):
    V = set([w for w in embedding.vocab])
    neg = set(opinion_lexicon.negative())
    pos = set(opinion_lexicon.positive())
    senti_lexicon = opinion_lexicon.words()
    senti_lexicon = [w for w in senti_lexicon if w in V]
    lexicon_dic = {x: 0 for x in senti_lexicon}
    for sent in corpus:
        for w in sent:
            if w in lexicon_dic:
                lexicon_dic[w] += 1
    L = Counter(lexicon_dic).most_common(5000)
    N = []
    N_count = []
    P = []
    P_count = []
    for word, count in L:
        if word in neg:
            N.append(word)
            N_count.append(count)
        elif word in pos:
            P.append(word)
            P_count.append(count)
    Senti_L = P[:num] + N[:num]
    P_sum = sum(P_count[:num])
    P_score = [x * 1.0 / P_sum for x in P_count[:num]]
    N_sum = sum(N_count[:num])
    N_score = [x * 1.0 / N_sum for x in N_count[:num]]
    Senti_W = P_score + N_score
    if extra:
        # Extra_Lexicon is assumed to be defined at module level elsewhere
        Extra_L = [l for l in Extra_Lexicon if l in V]
        Extra_W = [1.0 for l in Extra_L]
        return Senti_L + Extra_L, Senti_W + Extra_W
    return Senti_L, Senti_W
Example #4
def prepLexicon():
    # opinion_lexicon.words() streams negative-words.txt first, then
    # positive-words.txt; the trick below relies on "zombie" being the final
    # entry of the negative file, so every word up to and including it is
    # tagged 0 (negative) and the rest are tagged 1 (positive)
    sentiment_words = {}
    flag = 0
    for w in opinion_lexicon.words():
        if flag == 0:
            sentiment_words[w] = flag
            if re.search("zombie", w):
                flag = 1
        else:
            sentiment_words[w] = flag
    return sentiment_words
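
# A quick sanity check of the 0/1 tagging, assuming the opinion_lexicon data
# is downloaded; "bad" sits in the negative block, "good" in the positive one:
sentiment_words = prepLexicon()
print(sentiment_words["bad"], sentiment_words["good"])  # expected: 0 1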
Example #5
 def opinion_lexicon(self, opinion=None):
     '''
     Download the opinion lexicon dictionaries from the NLTK library.
     :param opinion: 'positive' or 'negative'; anything else returns the full lexicon
     :return: a corpus view of the requested words
     '''
     import nltk
     from nltk.corpus import opinion_lexicon
     nltk.download('opinion_lexicon', quiet=True)
     if opinion == 'positive':
         return opinion_lexicon.positive()
     elif opinion == 'negative':
         return opinion_lexicon.negative()
     else:
         return opinion_lexicon.words()
Example #6
def advanced_classifier(training_file, test_file):
    # generate training and test data
    training_json_objects = parse(training_file, delimiter='\t')
    training_texts, training_labels = format_json(training_json_objects)
    test_json_objects = parse(test_file, delimiter=',')
    test_texts, test_labels = format_json(test_json_objects)

    training_texts = parse_text(training_texts)
    test_texts = parse_text(test_texts)

    count_vectorizer = CountVectorizer(analyzer="word",
                                       stop_words='english',
                                       vocabulary=list(
                                           set(opinion_lexicon.words())))
    counts = count_vectorizer.transform(training_texts)

    classifier = MultinomialNB()

    # mean 10-fold cross-validation score (accuracy by default; pass a
    # scoring argument such as scoring='f1' for a true F1 score)
    # NOTE: KFold(n=..., n_folds=...) and the cross_validation module belong
    # to the pre-0.18 scikit-learn API
    k_fold = KFold(n=len(training_texts), n_folds=10)
    scores = cross_validation.cross_val_score(classifier,
                                              counts,
                                              training_labels,
                                              cv=k_fold)  # scoring=f1_scorer
    f1_score = sum(scores) / len(scores)

    # fit on the full training set and predict on the test set
    classifier.fit(counts, training_labels)
    test_counts = count_vectorizer.transform(test_texts)
    predictions = classifier.predict(test_counts)

    # sideline features
    for i in range(len(predictions)):
        if includes_hyperlink(test_texts[i]):
            predictions[i] = 'neutral'
        if includes_positive_hashtag(test_texts[i]):
            predictions[i] = 'positive'

    # calculate the score on the test set
    correct_predictions = 0
    for i in range(len(predictions)):
        if predictions[i] == test_labels[i]:
            correct_predictions += 1
    test_score = correct_predictions / len(predictions)

    return f1_score, test_score
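
# For reference, a sketch of the same cross-validation step against the
# modern (0.18+) scikit-learn API, assuming counts and training_labels as
# built above; not part of the original snippet:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB

k_fold = KFold(n_splits=10)
scores = cross_val_score(MultinomialNB(), counts, training_labels, cv=k_fold)
mean_score = scores.mean()  # accuracy unless a scoring= argument is passed

Example #7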
def main():
    my_fileids = opinion_lexicon.fileids()
    print(my_fileids)
    my_words = opinion_lexicon.words('negative-words.txt')
    print(my_words)
    words_from_wordnet = set(get_list_of_words_wordnet())
    positive_words = set(get_list_of_word_positive())
    negative_words = set(get_list_of_word_negative())
    print(positive_words)
    #print(words_from_wordnet)
    #res = words_from_wordnet.intersection(positive_words)  # words common to the positive and WordNet lists
    res_positive = get_intersection(positive_words, words_from_wordnet)
    res_negative = get_intersection(negative_words, words_from_wordnet)
    res_test = get_intersection(positive_words, negative_words)
    print('Positive & Wordnet: ')
    pprint(res_positive)
    print('Negative & Wordnet: ')
    pprint(res_negative)
    print('Positive & Negative: ')
    pprint(res_test)
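
# get_intersection is not shown in this listing; presumably it is a thin
# wrapper over set intersection. A hypothetical stand-in:
def get_intersection(words_a, words_b):
    # hypothetical helper: common elements of two word collections
    return set(words_a) & set(words_b)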
Example #8
def run_NB(training_file, test_file):
    # generate training and test data
    training_json_objects = parse(training_file, delimiter='\t')
    training_texts, training_labels = format_json(training_json_objects)
    test_json_objects = parse(test_file, delimiter=',')
    test_texts, test_labels = format_json(test_json_objects)

    training_texts = [element[0] for element in training_texts]
    test_texts = [element[0] for element in test_texts]

    count_vectorizer = CountVectorizer(analyzer="word",
                                       stop_words='english',
                                       vocabulary=list(
                                           set(opinion_lexicon.words())))
    counts = count_vectorizer.transform(training_texts)

    classifier = MultinomialNB()

    # mean 10-fold cross-validation score (accuracy by default; same
    # pre-0.18 scikit-learn API as in advanced_classifier above)
    k_fold = KFold(n=len(training_texts), n_folds=10)
    scores = cross_validation.cross_val_score(classifier,
                                              counts,
                                              training_labels,
                                              cv=k_fold)  # scoring=f1_scorer
    f1_score = sum(scores) / len(scores)

    # calculate the score on the test set
    classifier.fit(counts, training_labels)
    test_counts = count_vectorizer.transform(test_texts)
    predictions = classifier.predict(test_counts)
    correct_predictions = 0
    for i in range(len(predictions)):
        if predictions[i] == test_labels[i]:
            correct_predictions += 1
    test_score = correct_predictions / len(predictions)

    return f1_score, test_score
Example #9
def get_list_of_word_positive():
    return opinion_lexicon.words('positive-words.txt')
Example #10
nltk.download('stopwords')
nltk.download('punkt')

#####################
#
## Problem 4: Movie Review Sentiment starter code...
#
#####################

# a boolean to turn on/off the movie-review-sentiment portion of the code...
RUN_MOVIEREVIEW_CLASSIFIER = True
if RUN_MOVIEREVIEW_CLASSIFIER:

    ## Read all of the opinion words in from the nltk corpus.
    #
    pos = list(opinion_lexicon.words('positive-words.txt'))
    neg = list(opinion_lexicon.words('negative-words.txt'))

    ## Store them as a set (it'll make our feature extractor faster).
    #
    pos_set = set(pos)
    neg_set = set(neg)

    ## Read all of the fileids in from the nltk corpus and shuffle them.
    #
    pos_ids = [(fileid, "pos") for fileid in movie_reviews.fileids('pos')]
    neg_ids = [(fileid, "neg") for fileid in movie_reviews.fileids('neg')]
    labeled_fileids = pos_ids + neg_ids

    ## Here, we "seed" the random number generator with 0 so that we'll all
    ## get the same split, which will make it easier to compare results.
    random.seed(0)
    random.shuffle(labeled_fileids)

Example #11

# reconstructed opening of this snippet (its first lines are cut in the
# source): lemmatize every token; WordNetLemmatizer is an assumption
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
documents_tokenized_lemmatized = [[lemmatizer.lemmatize(word) for word in text]
                                  for text in documents_tokenized]

from nltk.sentiment.util import mark_negation

documents_tokenized_lemmatized_negated = [
    mark_negation(document) for document in documents_tokenized_lemmatized
]

ready_corpus = documents_tokenized_lemmatized_negated
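
# mark_negation appends a _NEG suffix to every token inside a negation scope,
# which is why the lexicon is doubled with *_NEG variants below. A quick
# illustration:
print(mark_negation("I did not like this movie".split()))
# expected: ['I', 'did', 'not', 'like_NEG', 'this_NEG', 'movie_NEG']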

download('opinion_lexicon')
from nltk.corpus import opinion_lexicon

# we consider only sentiment words; opinion_lexicon already includes
# misspelled sentiment words, so we did not use the enchant library this time.
sentiment_words = opinion_lexicon.words()
sentiment_words_negated = [word + '_NEG' for word in sentiment_words]

sentiment_features = sentiment_words + sentiment_words_negated

from gensim import corpora, models, matutils
# build the dictionary
dictionary = corpora.Dictionary(ready_corpus)
print(dictionary)

from nltk.sentiment import SentimentAnalyzer

sentiment_analyzer = SentimentAnalyzer()
list_all_words = sentiment_analyzer.all_words(ready_corpus)

used_sentiment_words = list(
Example #12
import nltk
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def create_feature(words):
    useful_words = [
        word for word in words if word not in stopwords.words("english")
    ]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict


# create one (features, label) tuple per negative word; the per-word tuples
# are what the train/test slices below expect
neg_words = []
for word in opinion_lexicon.words("negative-words.txt"):
    neg_words.append((create_feature([word]), "Negative"))

# create one (features, label) tuple per positive word
pos_words = []
for word in opinion_lexicon.words("positive-words.txt"):
    pos_words.append((create_feature([word]), "Positive"))

train_set = neg_words[:3587] + pos_words[:1504]
test_set = neg_words[-1196:] + pos_words[-501:]

#print(test_set)

algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(train_set, algorithm, max_iter=100)
#classifier.show_most_informative_features(10)
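
# A hedged sketch of querying the trained model, reusing create_feature:
print(classifier.classify(create_feature(["horrible", "awful"])))  # expect "Negative"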
Example #13
vocabSize = 0
#Count the number of documents in each category
tweetCount = {}
numTweets = 0.0
testProbs = {}


for i in range(0, len(engstopwords)):
    mystopwords[engstopwords[i]] = 1

for i in range(0, len(engwords)):
    mywords[engwords[i]] = 1

flag = 0
# 0 = negative, 1 = positive; same "zombie" boundary trick as in prepLexicon
for w in opinion_lexicon.words():
    if flag == 0:
        sentiment_words[w] = flag
        if re.search("zombie",w):
            flag = 1

    else:
        sentiment_words[w] = flag


"""
lines = [line.rstrip('\n') for line in open('positive-words.txt')]
for w in lines:
    sentiment_words[w] = 1
    #print "postitive word: " + w
Example #14
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import opinion_lexicon
print(opinion_lexicon.words()[:4])
print(opinion_lexicon.negative()[:4])
print(opinion_lexicon.words()[0:10])
print(sorted(opinion_lexicon.words())[0:10])
Example #15
"""
demo code in Project3 instruction
"""

import nltk
from nltk.corpus import opinion_lexicon
from nltk.corpus import sentence_polarity

# TODO: add these lines to project3 main file
# nltk.download('opinion_lexicon')
# nltk.download('sentence_polarity')

if __name__ == '__main__':
    # opinion_lexicon
    print('opinion lexicon:')
    print(opinion_lexicon.words()[:4])
    print(len(opinion_lexicon.words()))
    ## negative lexicon
    print('negative lexicon:')
    print(opinion_lexicon.negative()[:4])
    print(len(opinion_lexicon.negative()))
    ## positive lexicon
    print('positive lexicon:')
    print(opinion_lexicon.positive()[:4])
    print(len(opinion_lexicon.positive()))
    print()

    print('-------------------------------------------------------')

    # sentence polarity
    print('all sentences:')
Example #16
#!/usr/bin/env python3

from nltk.corpus import opinion_lexicon
from nltk import word_tokenize
import matplotlib.pyplot as plt
import random

pos_word = opinion_lexicon.words("positive-words.txt")
neg_word = opinion_lexicon.words("negative-words.txt")


def analyse_with_opinion():
    try:
        with open("tweets/twitter.txt", encoding="utf_16") as f:
            sentences = f.read().split("\n")
    except (OSError, UnicodeError):
        print(
            "\nError occurred while trying to read twitter.txt. It is either "
            "missing or it uses a character set other than UTF-16."
        )
        input("Press ENTER to continue....")
        return

    countpos = 0
    countneg = 0
    countneu = 0
    for sentence in sentences:
        pos_word_count = 0
        neg_word_count = 0
        for word in pos_word:
            # NOTE: substring match, so "bad" also hits "badminton"; splitting
            # the sentence into tokens would be stricter
            if word in sentence:
                pos_word_count += 1
Example #17
def get_list_of_word_negative():
    return opinion_lexicon.words('negative-words.txt')
Example #18
train1 = pd.read_csv("fulldata_neg.txt", header=0, delimiter="\n")
train2 = pd.read_csv("fulldata_pos.txt", header=0, delimiter="\n")
neg = [0] * 12500
pos = [1] * 12500
dat = pd.DataFrame({'feel': neg})
dat2 = pd.DataFrame({'feel': pos})
train1 = train1.join(dat)
train2 = train2.join(dat2)
frames = [train1, train2]
training_set = pd.concat(frames, ignore_index=True)

num_reviews = training_set["Review"].size
clean_train_reviews = []

# a set makes the per-review lexicon lookups O(1)
opinions = set(opinion_lexicon.words())
for i in range(0, num_reviews):
    clean_train_reviews.append(review_to_words(training_set["Review"][i], opinions))

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=500)

train_data_features = vectorizer.fit_transform(clean_train_reviews)
train_data_features = train_data_features.toarray()

y = training_set.iloc[:, 1].values
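
# review_to_words is not defined in this snippet; a hypothetical stand-in
# consistent with how it is called (raw review text in, cleaned string out,
# keeping only opinion-lexicon words):
import re

def review_to_words(raw_review, opinions):
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review)  # keep letters only
    words = letters_only.lower().split()
    return " ".join(w for w in words if w in opinions)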
Example #19
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import opinion_lexicon
from nltk.corpus import sentiwordnet as swn

# build neg_word, a list of the negative words from the opinion_lexicon
# corpus (fetched from the negative-words.txt file)
neg_word = []
for wor in opinion_lexicon.words("negative-words.txt"):
    neg_word.append(wor)
# print("Negative words :", neg_word)  # optional, for testing

pos_word = []
for wos in opinion_lexicon.words("positive-words.txt"):
    pos_word.append(wos)
# print("Positive words :", pos_word)  # optional, for testing

text1 = '''My mood is so bad'''  # take an input
# tokenize into words for further processing; we cannot iterate over a raw
# string word by word
output_word = word_tokenize(text1)


# Create a function for counting positive and negative words in the input
def calculator(value1):
    # Count positive words
    numPosWords = 0
    for word in value1:  # iterate over the tokens passed in, not the global
        if word in pos_word:
            numPosWords += 1