Example #1
from nltk.corpus import sentiwordnet as swn

def voc_sentiword():
  """Construct the vocabulary from the words with the highest opinion scores
  in SentiWordNet.

  Returns
  -------
  None
    The vocabulary is built and handed to select_voc().
  """
  res = {}
  for s_word in swn.all_senti_synsets():
    # str(s_word) looks like "<happy.a.01: PosScore=0.875 NegScore=0.0>",
    # which splits on '.' into exactly five parts; lemmas that themselves
    # contain dots produce more parts and are skipped.
    ll = str(s_word).split('.')
    if len(ll) >= 6:
      continue
    w, pos_tag = ll[0][1:], ll[1]  # drop the leading '<'
    if not pos_bool:               # module-level flag: collapse POS tags
      pos_tag = 'x'
    key = w + '.' + pos_tag
    if key not in res:
      if s_word.pos_score() >= threshold or s_word.neg_score() >= threshold:
        res[key] = (s_word.pos_score(), s_word.neg_score())
  # rank entries by their strongest score, highest first
  sorted_res = sorted(res.items(), key=lambda t: max(t[1]), reverse=True)
  select_voc(sorted_res)
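The function relies on module-level pos_bool, threshold, and a select_voc helper that the snippet does not show. A minimal sketch of those assumed pieces, with hypothetical defaults, so the function can run stand-alone:

# Assumed module-level configuration (not part of the original snippet).
pos_bool = True    # keep the POS tag in the vocabulary key
threshold = 0.5    # minimum pos/neg score for a word to be kept

def select_voc(sorted_res, size=5000):
    # Hypothetical helper: keep the top `size` entries as the vocabulary.
    vocabulary = [key for key, scores in sorted_res[:size]]
    with open('vocabulary.txt', 'w') as f:
        f.write('\n'.join(vocabulary))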
Example #2
    def swn_lexicon(self):
        # Assumes module-level imports: from nltk.corpus import sentiwordnet
        # as swn, from statistics import mean, and import json.
        lexicon_data = {}
        temp_lexicon = {}
        #add data from the regular resource
        synsets = swn.all_senti_synsets()
        for synset in synsets:

            # str(synset) looks like "<happy.a.01: PosScore=0.875 NegScore=0.0>"
            synset_val = str(synset)
            synset_full = synset_val.strip('<').split(':')[0]
            synset_full = synset_full[:-3]  # drop the sense number -> 'happy.a'
            synset_word = synset_full[:-2]  # 'happy'
            synset_tag = synset_full[-1]    # 'a'

            if synset_tag == 's':  # satellite adjectives count as plain adjectives
                synset_tag = 'a'
                synset_full = synset_word + '.' + synset_tag
            if synset_full not in temp_lexicon:  # also collects 0/0-rated synsets; those are filtered out below
                temp_lexicon[synset_full] = {
                    'positive': [synset.pos_score()],
                    'negative': [synset.neg_score()]
                }
            else:
                temp_lexicon[synset_full]['positive'].append(
                    synset.pos_score())
                temp_lexicon[synset_full]['negative'].append(
                    synset.neg_score())

        for word in temp_lexicon:
            if mean(temp_lexicon[word]['positive']) > 0 or mean(
                    temp_lexicon[word]['negative']) > 0:
                lexicon_data[word] = {
                    'positive': mean(temp_lexicon[word]['positive']),
                    'negative': mean(temp_lexicon[word]['negative'])
                }

        with open(self.setup.file_swn, 'w') as f:
            lexicon = {}
            for word in sorted(lexicon_data):
                lexicon[word] = lexicon_data[word]

            json.dump(lexicon, f)
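A quick way to sanity-check the dumped lexicon; the file name standing in for self.setup.file_swn here is hypothetical:

import json

# Load the averaged lexicon back and look up a word+tag key such as 'happy.a'.
with open('swn_lexicon.json') as f:   # assumed value of self.setup.file_swn
    lexicon = json.load(f)
print(lexicon.get('happy.a'))         # -> {'positive': ..., 'negative': ...}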
Example #3
"""
Python file to understand properties of libraries.
"""
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import nltk
import enchant
import logging
from datetime import datetime

startTime = datetime.now()

BING_LIU_DATA_PATH = 'data/bingliu_lexicon'

#Does SentiWordNet contain multiword entries (lemmas joined with '_')?
swn_all_words = swn.all_senti_synsets()
swn_words = []
print "\nSWN"
for word in swn_all_words:
    word_name = word.synset.name().split('.')[0]
    if '_' in word_name:
        swn_words.append(word_name)

print(str(len(swn_words)) + str(swn_words[:10]))

#What about Bing Liu?
logging.info(__name__ + " - " + "\nBing Liu")
words = []
with open(BING_LIU_DATA_PATH + "/positive-words.txt", 'r') as bing_pos_file:
    for line in bing_pos_file:
        # Assumed completion (the snippet is truncated here): collect
        # multiword entries, skipping the lexicon's ';' comment header.
        w = line.strip()
        if w and not w.startswith(';') and ('-' in w or ' ' in w):
            words.append(w)

print(str(len(words)) + str(words[:10]))
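The shown portion imports enchant but never uses it. A purely assumed follow-up, checking which of the SWN multiwords collected above are made of dictionary words, using pyenchant's standard Dict API:

# Hypothetical use of the enchant import: test the parts of '_' lemmas
# against the en_US dictionary.
d = enchant.Dict("en_US")
unknown = [w for w in swn_words[:100]
           if not all(d.check(part) for part in w.split('_') if part)]
print(str(len(unknown)) + str(unknown[:10]))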
Example #4
import re
import math
import logging
import redis
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('sentiwordnet')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn
from rake_nltk import Metric, Rake
import tfidf



# touch the corpus once so NLTK loads SentiWordNet up front (fails fast if it is missing)
next(swn.all_senti_synsets())


logging.basicConfig(filename='process.log', filemode='a', level='INFO',
                    format='%(asctime)s - %(levelname)s - %(message)s')

redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)

def compute_term_frequency(word_dict, bow):
    # Term frequency: each word's raw count divided by the document length.
    tf_dict = {}
    bow_count = len(bow)
    for word, count in word_dict.items():
        tf_dict[word] = count / bow_count
    return tf_dict

def compute_inverse_data_frequency(doc_list):
    # Assumed completion (the snippet is truncated here): standard IDF over a
    # list of per-document word-count dicts that share one vocabulary.
    idf_dict = dict.fromkeys(doc_list[0].keys(), 0)
    for doc in doc_list:
        for word, count in doc.items():
            if count > 0:
                idf_dict[word] += 1
    for word, doc_count in idf_dict.items():
        idf_dict[word] = math.log(len(doc_list) / doc_count) if doc_count else 0.0
    return idf_dict
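A tiny usage sketch of the two helpers above, on toy documents; everything below is illustrative only:

# Toy documents -> per-document word counts over a shared vocabulary.
doc_a = "the movie was good".split()
doc_b = "the movie was bad".split()
vocab = set(doc_a) | set(doc_b)
counts_a = {w: doc_a.count(w) for w in vocab}
counts_b = {w: doc_b.count(w) for w in vocab}

tf_a = compute_term_frequency(counts_a, doc_a)
idf = compute_inverse_data_frequency([counts_a, counts_b])
print({w: tf_a[w] * idf[w] for w in vocab})  # TF-IDF weights for doc_a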
Example #5
import gensim

# generate the LDA model ('corpus' and 'dictionary' are assumed to have been
# built earlier with gensim.corpora)
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=4,
                                           id2word=dictionary,
                                           passes=20)
print("LDA............")
topics = ldamodel.print_topics(num_topics=3, num_words=5)
for topic in topics:
    print(type(topic))
    print(topic)

print("LSA.................")
#id2word = gensim.corpora.Dictionary.load_from_text("c:\lda_test.txt")
lsi = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary)

from nltk.corpus import sentiwordnet as swn

topics = lsi.print_topics(5)
for topic in topics:
    print(topic[1])
    print(list(swn.senti_synsets(topic[1])))  # topic[1] is a weighted-term string, so this is usually empty
    print("----------------------------------------")

#print(list(swn.senti_synsets('slow')))

# senti_synsets() returns an iterable of SentiSynset objects; take the first sense
happy = list(swn.senti_synsets('happy'))[0]

print(happy.neg_score())

all_synsets = swn.all_senti_synsets()  # avoid shadowing the builtin all()
#print(all_synsets)

Example #6
def extract_new_concepts():
	"""
		Extracts new concepts using SentiWordNet(SWN) and Bing Liu's Opinion lexicon.
		Also adding few manually picked up concepts.

	Arguments: None

	Returns:
		List of new concepts
	"""
	startTime = datetime.now()
	current_concepts = list(senticnet.keys())  # senticnet: concept dict, assumed imported elsewhere
	logging.info("Currently Available Concepts: (sample)")
	logging.info(str(current_concepts[:10]))
	bing_negative_words = []
	bing_positive_words = []

	swn_negative_words = []
	swn_positive_words = []
	new_neg_words, new_pos_words = [], []
	#Section 1: code to extract concepts from SWN.
	#Call preprocess for every word encountered.
	logging.info("Extracting from SWN")
	swn_all_words = swn.all_senti_synsets()
	i, j = 0, 0
	for word in swn_all_words:
		"""if i >=5 and j>=5:
			break"""
		word_name = word.synset.name().split('.')[0]
		if word.pos_score() > word.neg_score():
			w = preprocess(word_name)
			if w:  # skip entries that preprocess() rejects
				swn_positive_words.append(w)
				#i+=1
		else:
			w = preprocess(word_name)
			if w:
				swn_negative_words.append(w)
				#j+=1


	#include only if they are not available in knowledge base of senticnet
	logging.info("Checking SenticNet...")
	# Running time O(n^2). Better solution below.
	"""
	for x in xrange(len(swn_positive_words)):
		if swn_positive_words[x] not in current_concepts:
			new_pos_words.append(swn_positive_words[x])

	for x in xrange(len(swn_negative_words)):
		if swn_negative_words[x] not in current_concepts:
			new_neg_words.append(swn_negative_words[x])
	"""
	#Average running time O(n) with set operations
	logging.info("Positive Words")
	new_pos_words = list(set(swn_positive_words)-set(current_concepts))
	logging.info("Negative Words")
	new_neg_words = list(set(swn_negative_words)-set(current_concepts))
	
	print "Sample SWN: \tTotal Length: ", len(new_pos_words), len(new_neg_words)
	print new_pos_words[:10]
	print new_neg_words[:10]
	
	#Section 2: code to extract concepts from Bing Liu's Opinion lexicon.
	logging.info("Extracting from Bing Liu")
	with open(BING_LIU_DATA_PATH + "/positive-words.txt", 'r') as bing_pos_file:
		for line in bing_pos_file:
			w = preprocess(line)
			if w:
				bing_positive_words.append(w)

	with open(BING_LIU_DATA_PATH + "/negative-words.txt", 'r') as bing_neg_file:
		for line in bing_neg_file:
			w = preprocess(line)
			if w:
				bing_negative_words.append(w)
	
	#include only if they are not available in knowledge base of senticnet
	logging.info("Checking SenticNet...")
	# Running time O(n^2). Better solution below.
	"""
	for x in xrange(len(bing_positive_words)):
		if bing_positive_words[x] not in current_concepts:
			new_pos_words.append(bing_positive_words[x])

	for x in xrange(len(bing_negative_words)):
		if bing_negative_words[x] not in current_concepts:
			new_neg_words.append(bing_negative_words[x])
	"""
	#unique concepts
	#Average running time O(n) with set operations
	logging.info("Positive Words")
	bing_new_pos_words = list(set(bing_positive_words)-set(current_concepts))
	logging.info("Negative Words")
	bing_new_neg_words = list(set(bing_negative_words)-set(current_concepts))
	"""
	print "Sample Bing Liu: Length: ", len(bing_new_pos_words), len(bing_new_neg_words)
	print bing_new_pos_words
	print bing_new_neg_words
	"""
	new_pos_words+=bing_new_pos_words
	new_neg_words+=bing_new_neg_words

	#store them in file.
	with open(OUTPUT_BASE_PATH + '/new_positive_words.txt', 'w+') as out_posi_file:
		for word in new_pos_words:
			out_posi_file.write("%s\n" %word)

	with open(OUTPUT_BASE_PATH + '/new_negative_words.txt', 'w+') as out_neg_file:
		for word in new_neg_words:
			out_neg_file.write("%s\n" %word)
	#startTime = datetime.now()		
	logging.error("Time to execute extract_new_concepts.extract_new_concepts(): {0}".format(datetime.now() - startTime))
Example #7
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

f = open ("pos_dictionary_name.txt","a+")

pos_words = []
for ss in swn.all_senti_synsets():
    if ss.pos_score() > ss.neg_score ():
        pos_words.append (ss.synset)
        f.write(ss.synset.name()[:-5])
        f.write ("\n")

f.close()


f = open ("neg_dictionary_name.txt","a+")

neg_words = []
for ss in swn.all_senti_synsets():
    if ss.neg_score() > ss.pos_score ():
        neg_words.append (ss.synset)
        f.write(ss.synset.name()[:-5])
        f.write ("\n")

f.close()
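A small sketch of reading the two lists back for lookups (file names as above; note that "a+" appends, so rerunning the script duplicates lines):

# Load the generated word lists into sets for fast membership tests.
with open("pos_dictionary_name.txt") as f:
    positive = set(line.strip() for line in f)
with open("neg_dictionary_name.txt") as f:
    negative = set(line.strip() for line in f)
print("good" in positive, "bad" in negative)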
Example #8
#!usr/bin/env python3
# -*- coding: utf-8 -*-

'collect the words whose positive or negative score exceeds 0.5 in SentiWordNet'

from nltk.corpus import sentiwordnet as swn
import json

pos = {}
neg = {}
for synset in swn.all_senti_synsets():
    # take the lemma from the synset name rather than slicing repr(),
    # which depends on NLTK's exact repr format
    word = synset.synset.name().rsplit('.', 2)[0]
    if synset.pos_score() > 0.5:
        pos[word] = synset.pos_score()
    if synset.neg_score() > 0.5:
        neg[word] = synset.neg_score()
with open('basic_pos_words.txt', 'w', encoding='utf8') as f:
    f.write(json.dumps(pos))
with open('basic_neg_words.txt', 'w', encoding='utf8') as f:
    f.write(json.dumps(neg))
print('complete.')
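To reuse the two score dictionaries later, assuming the files written above:

import json

with open('basic_pos_words.txt', encoding='utf8') as f:
    pos_words = json.load(f)
with open('basic_neg_words.txt', encoding='utf8') as f:
    neg_words = json.load(f)
print(len(pos_words), len(neg_words))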