Example #1
def compute_pos_neg_scores(clean_content):
    """""

    :param clean_content: pre-processed confessions
    :type all_comments: list
    :returns: None
    """
    assert isinstance(clean_content, list)

    # NLTK opinion lexicon (Hu & Liu); sets give O(1) membership tests
    pos_lex = set(opinion_lexicon.positive())
    neg_lex = set(opinion_lexicon.negative())
    pn_lex_score = defaultdict(int)

    for i, note in tqdm(enumerate(clean_content)):
        pn_lex_score[i] = np.array([0., 0.])
        note = re.sub(r"[^\w]", " ", note).split()
        for word in note:
            if word in pos_lex:
                pn_lex_score[i] += np.array([1., 0.])
            elif word in neg_lex:
                pn_lex_score[i] += np.array([0., 1.])

    with open('pn_lex_score.pkl', 'wb') as output:
        pickle.dump(pn_lex_score, output)
    return None
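A minimal usage sketch (hypothetical input; assumes nltk.download('opinion_lexicon') has been run and the imports the snippet relies on are in scope):

import pickle

compute_pos_neg_scores(["i love this so much", "this is terrible and sad"])
with open('pn_lex_score.pkl', 'rb') as f:
    scores = pickle.load(f)
print(scores[0])  # expected array([1., 0.]): one positive hit ("love"), no negative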
Example #2
def get_nltk_sentiment(sentence, method):
    # 'vader': return the VADER compound score; 'liu': count Hu & Liu
    # lexicon words, breaking ties toward 'Positive' for named entities.
    if method == 'vader':
        sa = sentiment.vader.SentimentIntensityAnalyzer()
        output = sa.polarity_scores(str(sentence))

        return output['compound']
    
    elif method == 'liu':
        
        wordType = ''
        
        if "PERSON" in str(ne_chunk(pos_tag(word_tokenize(sentence)))):
            wordType = 'tag'
        
        tokenizer = treebank.TreebankWordTokenizer()
        pos_words = 0
        neg_words = 0
        tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
        
        for word in tokenized_sent:
            if word in opinion_lexicon.positive():
                pos_words += 1
            elif word in opinion_lexicon.negative():
                neg_words += 1
                
        if pos_words > neg_words:
            return 'Positive'
        elif pos_words < neg_words:
            return 'Negative'
        elif pos_words == neg_words:
            if wordType == 'tag':
                return 'Positive'
            else:
                return 'Neutral'
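Hypothetical usage, assuming the imports this snippet relies on (nltk.sentiment, ne_chunk, pos_tag, word_tokenize, opinion_lexicon):

print(get_nltk_sentiment("The movie was great", method='vader'))  # compound score, e.g. ~0.62
print(get_nltk_sentiment("The movie was great", method='liu'))    # 'Positive'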
Example #3
def dlll_pos_neg_ratio(text):
    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(text)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    print(pos_words, neg_words)
    ratio = sum(y) / len(y)
    if pos_words > neg_words:
        return ("Positive", ratio)
    elif pos_words < neg_words:
        return ("Negative", ratio)
    elif pos_words == neg_words:
        return ("Neutral", ratio)
Example #4
def evaluate_sentence(sentence: str) -> bool:
    positive_count = 0
    negative_count = 0
    lemmatizer = WordNetLemmatizer()

    # rebuild the sentence from lemmatized words (note: the substring checks
    # below can over-count, e.g. "harm" also matches "harmless")
    new_sentence = ""
    for word in sentence.split(" "):
        new_sentence += lemmatizer.lemmatize(word) + " "
    sentence = new_sentence

    for positive_word in opinion_lexicon.positive():
        positive_word = lemmatizer.lemmatize(positive_word)

        if positive_word in sentence:
            positive_count += 1

    for negative_word in opinion_lexicon.negative():
        negative_word = lemmatizer.lemmatize(negative_word)

        if negative_word in sentence:
            negative_count += 1

    if positive_count >= negative_count:
        is_sentence_positive = True
    else:
        is_sentence_positive = False

    return is_sentence_positive
Example #5
def get_opinion_features(words):
    """
    This function creates the opinion lexicon features
    as described in the assignment3 handout.

    the negative and positive data has been read into the following lists:
    * neg_opinion
    * pos_opinion

    if you haven't downloaded the opinion lexicon, run the following commands:
    *  import nltk
    *  nltk.download('opinion_lexicon')

    :param words: tokens
    :return: feature_vectors: a dictionary with a 0/1 value for each opinion-lexicon word
    """
    neg_opinion = opinion_lexicon.negative()
    pos_opinion = opinion_lexicon.positive()
    feature_vectors = {}

    for word in neg_opinion:
        if word in words:
            feature_vectors[word] = 1
        else:
            feature_vectors[word] = 0
    for word in pos_opinion:
        if word in words:
            feature_vectors[word] = 1
        else:
            feature_vectors[word] = 0

    return feature_vectors
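Shape sketch (assumes the opinion lexicon is downloaded): the returned dictionary has one 0/1 entry for every word in the lexicon, roughly 6,800 keys in total.

features = get_opinion_features(['good', 'awful', 'table'])
print(len(features))                        # ~6800 lexicon-word keys
print(features['good'], features['awful'])  # 1 1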
Example #6
def demo_liu_hu_lexicon(sentence):

    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    # tokenization is disabled: `sentence` is expected to be an iterable of words
    # counts start at 0.1 so the ratio below is defined even with no lexicon hits
    pos_words = 0.1
    neg_words = 0.1

    for word in sentence:
        if word in opinion_lexicon.positive():
            pos_words += 1
        elif word in opinion_lexicon.negative():
            neg_words += 1
    p_n_rat = pos_words / neg_words
    return p_n_rat, pos_words, neg_words
Example #7
def get_lexicon():
    # This lexicon must be used for sentiment classification (opinion).
    # Opinion Lexicon (or Sentiment Lexicon) (Hu and Liu, KDD-2004).
    opinion_pos = opinion_lexicon.positive()
    opinion_neg = opinion_lexicon.negative()
    lexicon = {0: opinion_neg, 1: opinion_pos}
    return lexicon
Example #8
class Liu_Hu_Sentiment:
    positive = set(opinion_lexicon.positive())
    negative = set(opinion_lexicon.negative())
    sentiments = ('sentiment',)
    name = 'Liu Hu'

    def __init__(self):
        super().__init__()

    def transform(self, corpus, copy=True):
        scores = []
        tokenizer = WordPunctTokenizer()
        tokens = tokenizer(corpus.documents)

        for doc in tokens:
            pos_words = sum(word in self.positive for word in doc)
            neg_words = sum(word in self.negative for word in doc)
            scores.append([100*(pos_words - neg_words)/max(len(doc), 1)])
        X = np.array(scores).reshape((-1, len(self.sentiments)))

        # set compute values
        shared_cv = SharedTransform(self)
        cv = [VectorizationComputeValue(shared_cv, col)
              for col in self.sentiments]

        if copy:
            corpus = corpus.copy()
        corpus.extend_attributes(X, self.sentiments, compute_values=cv)
        return corpus
Example #9
    def __init__(self):
        self.RATIO = 1.2
        self.pos_lexicon = opinion_lexicon.positive()
        self.neg_lexicon = opinion_lexicon.negative()
        self.neg_synonyms_lexicon = []
        self.pos_synonyms_lexicon = []
        self.enricher = TW.TweetEnricher()
Example #10
    def _get_pos_neg_words_count(self, text):

        words = word_tokenize(text)
        pos_opinion_count = len(set(opinion_lexicon.positive()) & set(words))
        neg_opinion_count = len(set(opinion_lexicon.negative()) & set(words))

        return [pos_opinion_count, neg_opinion_count]
Example #11
    def predict(self, X, binary=None):
        if str(self.classes_.dtype)[:3] != 'int':
            return self._predict(X, binary)
        else:
            tokenised_reviews = [review.split(" ") for review in X]

            if self.binary is None:
                if binary is None:
                    self.binary = False
                else:
                    self.binary = binary

            # implement sentiment analyser using lexicon from Hu and Liu
            predicted_sentiment = []

            for review in tokenised_reviews:
                pos_words = sum(token in review
                                for token in opinion_lexicon.positive())
                neg_words = sum(token in review
                                for token in opinion_lexicon.negative())

                if self.binary:
                    if pos_words > neg_words:
                        predicted_sentiment.append(1)
                    else:
                        predicted_sentiment.append(0)
                else:
                    if pos_words > neg_words:
                        predicted_sentiment.append(2)
                    elif pos_words < neg_words:
                        predicted_sentiment.append(0)
                    else:
                        predicted_sentiment.append(1)

            return np.array(predicted_sentiment).astype(int)
Example #12
def get_senti_lexicon():
    # opinion_lexicon
    from nltk.corpus import opinion_lexicon
    opinion_pos = opinion_lexicon.positive()
    opinion_neg = opinion_lexicon.negative()
    
    # vader_lexicon 
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sentiment_analyzer = SentimentIntensityAnalyzer()
    vader_lexicon = sentiment_analyzer.lexicon
    vader_pos = set()
    vader_neg =  set()
    for d in vader_lexicon:
        if vader_lexicon[d] >= 0.5:  # threshold may need tuning?
            vader_pos.add(d)
        elif vader_lexicon[d] <= -0.5:
            vader_neg.add(d)
    
    lexicon_path = '/'.join(os.getcwd().split('/')[:-1])
    # finance lexicon
    finance_pos = get_lexicon(lexicon_path +'/lexicons/finance_pos.txt')
    finance_neg = get_lexicon(lexicon_path +'/lexicons/finance_neg.txt')
        
    # hu-liu lexicon
    hu_liu_pos = get_lexicon(lexicon_path +'/lexicons/hu_liu_pos.txt')
    hu_liu_neg = get_lexicon(lexicon_path +'/lexicons/hu_liu_neg.txt')
    
    # harvard lexicon
    harvard_neg = get_lexicon(lexicon_path +'/lexicons/harvard_neg.txt')
    
    pos_lexicon = set(opinion_pos) & hu_liu_pos
    neg_lexicon = set(opinion_neg) & hu_liu_neg
    senti_lexicon = pos_lexicon | neg_lexicon
    lexicon = {0: pos_lexicon, 1: neg_lexicon}
    return lexicon
Example #13
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    # if plot:
    #     _show_plot(x, y, x_labels=tokenized_sent,
    #                y_labels=['Negative', 'Neutral', 'Positive'])

    return pos_words, neg_words
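The loop above rescans opinion_lexicon.positive() (a ~2,000-word list) and opinion_lexicon.negative() (~4,800 words) for every token. A minimal sketch of the same counting with the lexicons hoisted into frozensets (liu_hu_counts_fast is a hypothetical name, not part of NLTK):

from nltk.corpus import opinion_lexicon
from nltk.tokenize import treebank

POS_WORDS = frozenset(opinion_lexicon.positive())  # built once, O(1) lookups
NEG_WORDS = frozenset(opinion_lexicon.negative())
_TOKENIZER = treebank.TreebankWordTokenizer()

def liu_hu_counts_fast(sentence):
    tokens = [w.lower() for w in _TOKENIZER.tokenize(sentence)]
    return (sum(w in POS_WORDS for w in tokens),
            sum(w in NEG_WORDS for w in tokens))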
Example #14
    def gen_word_cloud(self):
        raw_text = ' '.join(self.tdf.filt_text.to_list())
        wc = wordcloud.WordCloud().process_text(raw_text)
        # assign positive/negative connotations to words from nltk corpus
        sent_df = pd.DataFrame(columns=['Words', 'align'])
        print(len(wc.keys()))
        for h, i in enumerate(wc.keys()):
            if h % 10 == 0:
                print(h)
            sent_df.loc[h, 'Words'] = i
            if i in ol.positive():
                sent_df.loc[h, 'align'] = 'Positive'
            elif i in ol.negative():
                sent_df.loc[h, 'align'] = 'Negative'
            else:
                sent_df.loc[h, 'align'] = 'Neutral'
        pos_words = sent_df.loc[sent_df['align'] == 'Positive',
                                'Words'].tolist()
        neg_words = sent_df.loc[sent_df['align'] == 'Negative',
                                'Words'].tolist()

        pos_dict = {k: v for k, v in wc.items() if k in pos_words}
        neg_dict = {k: v for k, v in wc.items() if k in neg_words}

        wordcloud.WordCloud(
            width=800, height=400).generate_from_frequencies(pos_dict).recolor(
                colormap='Greens').to_file('output/pos_wordcloud.png')
        wordcloud.WordCloud(
            width=800, height=400).generate_from_frequencies(neg_dict).recolor(
                colormap='Reds').to_file('output/neg_wordcloud.png')
Example #15
def extract_features(corpus):
    feature_dict = {}
    analyser = SentimentIntensityAnalyzer()
    pos_opinion_words = set(opinion_lexicon.positive())
    neg_opinion_words = set(opinion_lexicon.negative())

    for dialog_index, dialog in enumerate(corpus):

        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform([diag[0] for diag in dialog])
        similarity_matrix = cosine_similarity(tfidf)

        starter_user_id = dialog[0][1]

        for utterance_index, utt_info in enumerate(dialog[:-1]):
            utterance = utt_info[0]
            key = str(dialog_index) + "_" + str(utterance_index)

            words = word_tokenize(utterance)

            content_features = extract_content_features(similarity_matrix, utterance_index, utterance)
            structural_features = extract_structural_features(utterance_index, dialog, utt_info, starter_user_id, words)
            sentimental_features = extract_sentimental_features(utterance, utt_info, words, analyser, pos_opinion_words,
                                                                neg_opinion_words)

            feature_dict[key] = content_features + structural_features + sentimental_features

    return feature_dict
Example #16
def prepare_lexicon(corpus, embedding, num=250, extra=False):
    V = set([w for w in embedding.vocab])
    neg = set(opinion_lexicon.negative())
    pos = set(opinion_lexicon.positive())
    senti_lexicon = opinion_lexicon.words()
    senti_lexicon = [w for w in senti_lexicon if w in V]
    lexicon_dic = {x: 0 for x in senti_lexicon}
    for sent in corpus:
        for w in sent:
            if w in lexicon_dic:
                lexicon_dic[w] += 1
    L = Counter(lexicon_dic).most_common(5000)
    N = []
    N_count = []
    P = []
    P_count = []
    for word, count in L:
        if word in neg:
            N.append(word)
            N_count.append(count)
        elif word in pos:
            P.append(word)
            P_count.append(count)
    Senti_L = P[:num] + N[:num]
    P_sum = sum(P_count[:num])
    P_score = [x * 1.0 / P_sum for x in P_count[:num]]
    N_sum = sum(N_count[:num])
    N_score = [x * 1.0 / N_sum for x in N_count[:num]]
    Senti_W = P_score + N_score
    if extra:
        # Extra_Lexicon is assumed to be defined at module level
        Extra_L = [l for l in Extra_Lexicon if l in V]
        Extra_W = [1.0 for l in Extra_L]
        return Senti_L + Extra_L, Senti_W + Extra_W
    return Senti_L, Senti_W
Example #17
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent))) # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1) # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1) # negative
        else:
            y.append(0) # neutral

    if (pos_words+neg_words) > 0:
        return (pos_words-neg_words)/float(pos_words+neg_words)
    else:
        return 0
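Usage sketch: this variant returns (pos_words - neg_words) / (pos_words + neg_words), a polarity in [-1, 1], or 0 when the sentence contains no lexicon words at all.

print(demo_liu_hu_lexicon("a great, fun film"))  # 1.0: two positive hits, none negative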
Example #18
    def classifier(self,sentence):

        tokenizer = treebank.TreebankWordTokenizer()
        pos_words = 0
        neg_words = 0
        tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
    
        x = list(range(len(tokenized_sent))) # x axis for the plot
        y = []
    
        for word in tokenized_sent:
            if word in opinion_lexicon.positive():
                pos_words += 1
                y.append(1) # positive
            elif word in opinion_lexicon.negative():
                neg_words += 1
                y.append(-1) # negative
            else:
                y.append(0) # neutral
    
        if pos_words > neg_words:
            return 'Positive'
        elif pos_words < neg_words:
            return 'Negative'
        elif pos_words == neg_words:
            return 'Neutral'
Example #19
def get_senti_lexicon():
    # opinion_lexicon
    from nltk.corpus import opinion_lexicon
    opinion_pos = opinion_lexicon.positive()
    opinion_neg = opinion_lexicon.negative()
    
    # vader_lexicon 
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sentiment_analyzer = SentimentIntensityAnalyzer()
    vader_lexicon = sentiment_analyzer.lexicon
    vader_pos = set()
    vader_neg = set()
    for d in vader_lexicon:
        if vader_lexicon[d] >= 0.5:  # threshold may need tuning?
            vader_pos.add(d)
        elif vader_lexicon[d] <= -0.5:
            vader_neg.add(d)
    
    # finance lexicon
    finance_pos = get_lexicon('../lexicons/finance_pos.txt')
    finance_neg = get_lexicon('../lexicons/finance_neg.txt')
        
    # hu-liu lexicon
    hu_liu_pos = get_lexicon('../lexicons/hu_liu_pos.txt')
    hu_liu_neg = get_lexicon('../lexicons/hu_liu_neg.txt')
    
    # harvard lexicon
    harvard_neg = get_lexicon('../lexicons/harvard_neg.txt')
    
    pos_lexicon = set(opinion_pos) | vader_pos | finance_pos | hu_liu_pos
    neg_lexicon = set(opinion_neg) | vader_neg | finance_neg | hu_liu_neg | harvard_neg
    senti_lexicon = pos_lexicon | neg_lexicon
    
    return pos_lexicon, neg_lexicon, senti_lexicon
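get_lexicon itself is not shown in these snippets; a plausible sketch, assuming each lexicon file holds one term per line:

def get_lexicon(path):
    # read one term per line into a lowercase set
    with open(path, encoding='utf-8', errors='ignore') as f:
        return {line.strip().lower() for line in f if line.strip()}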
Example #20
def demo_liu_hu_lexicon(sentence):
    """ THIS IS JUST BIT MODIFIED 
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon

    pos_words = 0
    neg_words = 0

    y = []

    for word in sentence:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        return 'Positive'
    elif pos_words < neg_words:
        return 'Negative'
    elif pos_words == neg_words:
        return 'Neutral'
Example #21
def prepare_lexicon(process=True, dim=250, save=False):
    if process:
        dm = DatasetManager()
        data = dm.prepare_datasets()
        nega = set(opinion_lexicon.negative())
        posi = set(opinion_lexicon.positive())
        lexicon = opinion_lexicon.words()
        lexicon_dic = {x: 0 for x in lexicon}
        for t in data['vader']['text']:
            for w in t:
                if w in lexicon_dic:
                    lexicon_dic[w] += 1
        for t in data['sentiment140']['text']:
            for w in t:
                if w in lexicon_dic:
                    lexicon_dic[w] += 1
        L = Counter(lexicon_dic).most_common(4000)
        N = []
        P = []
        for w, _ in L:
            if w in nega:
                N.append(w)
            elif w in posi:
                P.append(w)
        l = P[:dim] + N[:dim]
        if save:
            with open('senti.lexicon', 'w') as f:
                for d in l:
                    f.write(d)
                    f.write('\n')
        return l
    else:
        with open('senti.lexicon', 'r') as f:
            data = [line.strip() for line in f]
        return data
Example #22
    def demo_liu_hu_lexicon(sentence):
        tokenizer = treebank.TreebankWordTokenizer()
        pos_words = 0
        neg_words = 0
        tokenized_sent = [
            word.lower() for word in tokenizer.tokenize(sentence)
        ]

        x = list(range(len(tokenized_sent)))  # x axis for the plot
        y = []

        for word in tokenized_sent:
            if word in opinion_lexicon.positive():
                pos_words += 1
                y.append(1)  # positive
            elif word in opinion_lexicon.negative():
                neg_words += 1
                y.append(-1)  # negative
            else:
                y.append(0)  # neutral

        if pos_words > neg_words:
            return 'Positive'
        elif pos_words < neg_words:
            return 'Negative'
        elif pos_words == neg_words:
            return 'Neutral'
Example #23
def pos_neg_fraction_with_negation(text):
    """
    Compute the fraction of positive and negative words in a text, including negated words
    :param text: input text
    :return: the fractions of positive and negative words in the text
    """
    # Sets of already known positive and negative words
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())
    # Set of all positive words including negated negative words
    all_positive_words = positive_words.union(
        {tag + "_NEG"
         for tag in negative_words})
    # Set of all positive words including negated positive words
    all_negative_words = negative_words.union(
        {tag + "_NEG"
         for tag in positive_words})

    tokens = tokenize_with_negation(text)
    # count how many positive and negative words occur in the text
    count_pos, count_neg = 0, 0
    for token in tokens:
        if token in all_positive_words:
            count_pos += 1
        if token in all_negative_words:
            count_neg += 1
    count_all = len(tokens)
    if count_all != 0:
        return count_pos / count_all, count_neg / count_all
    else:  # avoid division by zero
        return 0., 0.
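tokenize_with_negation is not shown here; a minimal sketch built on NLTK's mark_negation, which appends "_NEG" to tokens appearing between a negation word and the next punctuation mark:

from nltk.sentiment.util import mark_negation
from nltk.tokenize import word_tokenize

def tokenize_with_negation(text):
    # lowercase tokens, then tag negated spans with the "_NEG" suffix
    return mark_negation([w.lower() for w in word_tokenize(text)])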
Example #24
File: main.py Project: jkatzy/IN4325_IR
def opinion_lex(tokenizer, utterance):
    pos = 0
    neg = 0
    for word in tokenizer.tokenize(utterance.utterance):
        pos += word in opinion_lexicon.positive()  # booleans sum as 0/1
        neg += word in opinion_lexicon.negative()

    return pos, neg
Example #25
def posopinion(sentence):
    tokenizer = treebank.TreebankWordTokenizer()
    pos1 = 0
    tokenized = [word.lower() for word in tokenizer.tokenize(sentence)]
    for word in tokenized:
        if word in opinion_lexicon.positive():
            pos1 += 1
    return pos1
Example #26
    def __init__(self):
        """
        constructor
        """
        self.positive_sentences = []
        self.negative_sentences = []

        response1 = input(
            'Would you want to test sentiment with a local text data? (Y/N) ')
        if response1.lower() == 'y' or response1.lower() == 'yes':
            positive_file = input(
                'Input the path for the positive sentiment data: ')
            negative_file = input(
                'Input the path for the negative sentiment data: ')
            if os.path.exists(positive_file):
                # read positive sentences
                with open(positive_file, "r") as reader:
                    self.positive_sentences = reader.readlines()
                self.positive_sentences = [
                    sent.rstrip() for sent in self.positive_sentences
                ]
            if os.path.exists(negative_file):
                # read negative sentences
                with open(negative_file, "r") as reader:
                    self.negative_sentences = reader.readlines()
                self.negative_sentences = [
                    sent.rstrip() for sent in self.negative_sentences
                ]
        else:
            # use 5331 positive sentences and 5331 negative sentences as testing data;
            # since this requires a huge amount of lexica, this part is not implemented
            response2 = input(
                'Would you want to test sentiment with data in sentence_polarity? (Y/N) '
            )
            if response2.lower() == 'y' or response2.lower() == 'yes':
                # negative words
                self.negative_lexica = opinion_lexicon.negative()
                self.negative_lexica_size = len(self.negative_lexica)
                # positive words
                self.positive_lexica = opinion_lexicon.positive()
                self.positive_lexica_size = len(self.positive_lexica)

                # sentence sentiment categories
                self.senti_categories = sentence_polarity.categories()
                # negative sentiment sentences
                self.negative_sentences = sentence_polarity.sents(
                    categories=['neg'])[:10]  # get the first 10 sentences
                self.negative_sentences = [
                    ' '.join(sent) for sent in self.negative_sentences
                ]
                self.negative_sentences_size = len(self.negative_sentences)
                # positive sentiment sentences
                self.positive_sentences = sentence_polarity.sents(
                    categories=['pos'])[:10]  # get the first 10 sentences
                self.positive_sentences = [
                    ' '.join(sent) for sent in self.positive_sentences
                ]
                self.positive_sentences_size = len(self.positive_sentences)
Example #27
    def __init__(self):
        self.tokenizer = TweetTokenizer()
        self.stemmer = PorterStemmer()
        self.stopset = set(stopwords.words('english'))
        self.negative_opinions = opinion_lexicon.negative()
        self.positive_opinions = opinion_lexicon.positive()
        self.brexit_keywords = [
            line.rstrip('\n') for line in open('../Data/Lists/BrexitKeywords')
        ]
        self.vulgar_words = [
            line.rstrip('\n').lower()
            for line in open('../Data/Lists/VulgarWordsList')
        ]
        self.twitter_jargons = [
            line.rstrip('\n')
            for line in open('../Data/Lists/TwitterSlangsAndAbbreviations')
        ]
        self.web_abbreviations = [
            line.rstrip('\n').lower()
            for line in open('../Data/Lists/WebAcronymns')
        ]
        self.emoticons_list = [
            line.rstrip('\n') for line in open('../Data/Lists/EmojiList')
        ]
        self.pos_emoticons_list = [
            line.rstrip('\n')
            for line in open('../Data/Lists/PositiveEmojiList')
        ]
        self.neg_emoticons_list = [
            line.rstrip('\n')
            for line in open('../Data/Lists/NegativeEmojiList')
        ]
        self.first_person_pronouns = [
            line.rstrip('\n')
            for line in open('../Data/Lists/FirstPersonPronouns')
        ]
        self.speech_act_verbs = [
            line.rstrip('\n')
            for line in open('../Data/Lists/StemmedSpeechActVerbs')
        ]
        self.trusted_domains = [
            line.rstrip('\n') for line in open('../Data/Lists/TrustedDomains')
        ]
        self.verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        self.n_gram_count_matrix = {}
        self.vectorizer = CountVectorizer(
            ngram_range=(1, 3),
            tokenizer=self.tokenizer.tokenize,
            stop_words=list(self.stopset) + self.web_abbreviations +
            list(string.punctuation) +
            ["…", "...", "..", ")", "(", "-->", "->", ">>", "#", "RT", "@"])
        self.vectorizer_unigram = CountVectorizer(
            ngram_range=(1, 1),
            tokenizer=self.tokenizer.tokenize,
            stop_words=list(self.stopset) + self.web_abbreviations +
            list(string.punctuation))
        self.positive_ops = [x.lower() for x in self.positive_opinions]
        self.negative_ops = [x.lower() for x in self.negative_opinions]
Example #28
    def __init__(self):
        super().__init__()
        self.sentiment_analyzer = SentimentIntensityAnalyzer()
        self.negative_lexicon = list(opinion_lexicon.negative())
        self.positive_lexicon = list(opinion_lexicon.positive())
        logging.basicConfig(filename="feature.log",
                            filemode="w+",
                            level=logging.INFO)
        self.logger = logging.getLogger("info")
Example #29
    def __init__(self):
        # negative words
        self.negative_lexica = opinion_lexicon.negative()
        self.negative_lexica_size = len(self.negative_lexica)
        # positive words
        self.positive_lexica = opinion_lexicon.positive()
        self.positive_lexica_size = len(self.positive_lexica)

        # sentence sentiment categories
        self.senti_categories = sentence_polarity.categories()
Example #30
    def count_words(self, sentence, positive=False):
        if positive:
            lex = set(opinion_lexicon.positive())
        else:
            lex = set(opinion_lexicon.negative())

        numb_acc = 0
        for word in word_tokenize(sentence):
            numb_acc += word in lex
        return numb_acc
Example #31
def sentiment_liu_hu_mod(text):
    pos_words = 0
    neg_words = 0
    # `tokenizer` is assumed to be defined at module level
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(text)]

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
        elif word in opinion_lexicon.negative():
            neg_words += 1
            
    if not tokenized_sent:  # avoid division by zero on empty input
        return 0.0
    return (pos_words - neg_words) / len(tokenized_sent)
Example #32
def compare_positive(review):
    positive_words = opinion_lexicon.positive()
    count_pos = 0
    list_pos = []

    for word in positive_words:
        for w in review:
            if word == w:
                list_pos.append(word)
                count_pos += 1
    # print(count_pos)
    return count_pos
Example #33
File: util.py Project: licheng5625/coder
def getPositiveWords(sentence):
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank
    
    tokenizer = treebank.TreebankWordTokenizer()
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]
    
    x = list() # x axis for the plot
    
    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            x.append(word)
    return x
Example #34
wordCount = 0
BOW = set()
# these are maps of word with corresponding counts
BOW1 = {}
negBOW = {}
posBOW = {}

positiveWords = {}
negativeWords = {}
XMap = {'A': 'B'}  # Map of X for all the documents
stemmer = PorterStemmer()
weights = []
positiveWords = {}
negativeWords = {}
negatives = opinion_lexicon.negative()
positives = opinion_lexicon.positive()
unit_step = lambda x: 0 if x < 0 else 1

class Perceptron:
  stopwords = nltk.corpus.stopwords.words('english')
  class TrainSplit:
    """Represents a set of training/testing data. self.train is a list of Examples, as is self.test. 
    """
    def __init__(self):
      self.train = []
      self.test = []

  class Example:
    """Represents a document with a label. klass is 'pos' or 'neg' by convention.
       words is a list of strings.
    """
Example #35
text = input('$ ')
words = text.split()
name = 'Prabhat Saini'

def getpersonaldetails() :
	name = 'Prabhat Saini'
	sex = 'Male'
	dob = '30th November 1991'
	personality = 'ENTP'
	return (name, sex, dob, personality)

def getdetails() :
	name, sex, dob, personality = getpersonaldetails()

## check for positive terms in the text
pos_words = [word for word in ol.positive()]
## print pos_words
neg_words = [word for word in ol.negative()]
## print neg_words

## create positive and negative word indices and use as dictionary
pos_sentiment = {}
neg_sentiment = {}
for word in words:
	if word in pos_words:
		pos_sentiment[word] = 'positive'
	if word in neg_words:
		neg_sentiment[word] = 'negative'

## print words
print(pos_sentiment)
print(neg_sentiment)
Example #36
def wordSentenceContainsOpinionatedWords(review_spacy):
    for word in review_spacy:
        # compare the token text (.orth_) in both membership checks
        if word.orth_ in opinion_lexicon.positive() or word.orth_ in opinion_lexicon.negative():
            return 1
    return 0
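Hypothetical usage with spaCy (assumes the en_core_web_sm model is installed):

import spacy
nlp = spacy.load("en_core_web_sm")
print(wordSentenceContainsOpinionatedWords(nlp("What a wonderful day")))  # 1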