def findScoreWord(word, dType):
    """Average SentiWordNet objectivity score over the tokens of *word*.

    word  -- raw string; punctuation is replaced by spaces and the result
             is lower-cased before tokenising.
    dType -- POS marker compared against the module-level NN/ADJ/VB/ADV
             constants to choose the WordNet POS tag.
    Returns the mean obj_score of the first synset of each token, or 0
    when no tokens remain after cleaning.
    """
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)

    # Replace every punctuation character with a space, then normalise case.
    word = re.sub('[%s]' % re.escape(string.punctuation), ' ', word)
    word = word.lower()

    tokens = word.split()
    wS = 0
    for w in tokens:
        # BUG FIX: in the original, `test` was left unbound when dType
        # matched none of the four POS constants, and the resulting
        # NameError was silently swallowed by a bare except.
        test = None
        if dType == NN:
            test = swn.senti_synsets(w, 'n')
        elif dType == ADJ:
            test = swn.senti_synsets(w, 'a')
        elif dType == VB:
            test = swn.senti_synsets(w, 'v')
        elif dType == ADV:
            test = swn.senti_synsets(w, 'r')

        if not test:
            continue
        try:
            # Only the first (most frequent) synset contributes.
            wS += test[0].obj_score
        except (IndexError, AttributeError):
            continue

    if not tokens:
        return 0
    return wS / len(tokens)
示例#2
0
    def __init__(self,
                 traing_data_fileP1='mood_traing_p1.dat',
                 traing_data_fileP2='mood_traing.dat',
                 data_file='tweets_raw.dat'):
        """Set up the two-phase mood trainers, language detector, tweet
        file handle and SentiWordNet reader.  Relies on attributes
        `sentiwordnet` and `limit` and the `countRows` method from the
        enclosing class.  ("traing" keeps the callers' spelling.)"""
        if self.sentiwordnet:
            print "using sentiwordnet dictionary"
        else:
            print "not using sentiwordnet dictionary"

        # Phase-1 / phase-2 trainers persist to their own .dat files.
        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # countRows reads the pickle stream to EOF, so the file is
        # reopened afterwards for the actual classification pass.
        self.tweetsFile = open(
            os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
            'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(
            os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
            'rb')

        # Per-language caps on how many tweets are ingested.
        self.limit['en'] = 300000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
        self.swn = SentiWordNetCorpusReader(swn_filename)
示例#3
0
    def __init__(self):
        # Open the multilingual WordNet database and the SentiWordNet
        # lexicon that ship in the local "data" directory.
        mwnet_path = os.path.join("data", "mwnet.db")
        swn_path = os.path.join("data", "SentiWordNet_3.0.0.txt")
        self.mwnet = MWNet(mwnet_path)
        self.swn = SentiWordNetCorpusReader(swn_path)

        # Prefer the Tanl splitter when credentials are configured in the
        # environment; otherwise fall back to the simple splitter.
        self.splitter = TanlSplitter() if os.getenv('TANL_EMAIL') else SimpleSplitter()
    def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
        """Set up the two-phase mood trainers, language detector, tweet
        file handle and SentiWordNet reader.  Relies on attributes
        `sentiwordnet` and `limit` and the `countRows` method from the
        enclosing class."""
        if self.sentiwordnet:
            print "using sentiwordnet dictionary"
        else:
            print "not using sentiwordnet dictionary"

        # Phase-1 / phase-2 trainers persist to their own .dat files.
        self.clsP1 = MoodDetectTrainer(data_file = traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = traing_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # countRows reads the pickle stream to EOF, so the file is
        # reopened afterwards for the actual classification pass.
        self.tweetsFile = open(os.path.join(os.curdir, os.path.normpath('../data/' + data_file)) ,'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(os.path.join(os.curdir , os.path.normpath('../data/' + data_file)) ,'rb')

        # Per-language caps on how many tweets are ingested.
        self.limit['en'] = 300000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
        self.swn = SentiWordNetCorpusReader(swn_filename)
示例#5
0
class SentiWordNetLexicon():
    
    def __init__(self):
        SWN_FILENAME = "lexicon\SentiWordNet_3.0.0_20130122.txt"
        self.swn= SentiWordNetCorpusReader(SWN_FILENAME)

    def get_values(self, word, context=None, pos_tag=None):
        """
        Perform lookup in SentiWordNet
        """
#            entry = swn.senti_synset("breakdown.n.03")
        entries = None
        for w in word.split(' '):
            entries = self.swn.senti_synsets(w)
            if entries != None: break
        if entries is None or len(entries)==0: 
            return None
        if len(entries)==1 or pos_tag is None:
            return [entries[0].pos_score, entries[0].neg_score, entries[0].obj_score]
        elif len(entries)>1:
            #Find out which word to chose, if there are several classes
            print "Several entries ",entries
            for entry in entries:
                if entry.synset.pos()==TYPECRAFT_SENTIWORDNET[pos_tag]:
                    print "Found matching entry: ", entry
                    return [entry.pos_score, entry.neg_score, entry.obj_score]
            
            return [entries[0].pos_score, entries[0].neg_score, entries[0].obj_score]
        return None
示例#6
0
class Analyzer(object):
    """Sentence-level sentiment analyzer: maps words to English synsets
    through MultiWordNet, then scores them with SentiWordNet."""

    def __init__(self):
        # Local database and lexicon files under ./data.
        self.mwnet = MWNet(os.path.join("data", "mwnet.db"))
        self.swn = SentiWordNetCorpusReader(os.path.join("data", "SentiWordNet_3.0.0.txt"))

        # Use the remote Tanl splitter only when credentials are configured.
        if os.getenv('TANL_EMAIL'):
            self.splitter = TanlSplitter()
        else:
            self.splitter = SimpleSplitter()

    def analyze_sentence(self, sentence):
        """Return {word: {indices, lemma, features, synsets, scores}} for
        each word of *sentence* that has synset coverage.

        NOTE(review): `scores` accumulates over the whole sentence, so each
        word's averaged scores include contributions from earlier words —
        confirm whether per-word averaging was intended.
        """
        scores = []
        result = {}

        for word, lemma, tag, wn_type, indices in self.splitter.iter_words(sentence):
            # Here we can also impose the type to be an ADJ, NAME, or VERB
            synsets = self.mwnet.get_english_synsets(lemma, wn_type)

            if not synsets:
                continue

            synsets_dict = {}

            # NOTE(review): `found` is never assigned True anywhere, so the
            # `if found: break` below is dead code — every synset is
            # always processed.
            found = False
            for syn in synsets:
                for translation in self.mwnet.get_translation(syn):
                    senti_synsets = self.swn.senti_synsets(translation, wn_type)

                    if not senti_synsets:
                        continue

                    # Python 2 map() returns plain lists here.
                    synsets_dict[syn] = map(lambda x: [x.synset.name, x.pos_score, x.neg_score, x.obj_score], senti_synsets)
                    scores.extend(map(lambda x: (x.pos_score, x.neg_score, x.obj_score), senti_synsets))

                if found:
                    break

            positive  = map(lambda x: x[0], scores)
            negative  = map(lambda x: x[1], scores)
            objective = map(lambda x: x[2], scores)

            if len(positive) > 0:
                # The three lists always have equal length, so len(positive)
                # is the common divisor for all three averages.
                pscore = sum(positive) * 1.0 / len(positive)
                nscore = sum(negative) * 1.0 / len(positive)
                oscore = sum(objective) * 1.0 / len(positive)

                result[word] = {
                    'indices': indices,
                    'lemma': lemma,
                    'features': tag,
                    'synsets': synsets_dict,
                    'scores': {
                        'positive': pscore,
                        'negative': nscore,
                        'objective': oscore,
                    },
                }

        return result
 def __getSentiWords__(self,location=config.SENTI_WORDNET_FILE):
     """Build {lemma: score} from SentiWordNet, keeping each lemma's
     highest pos+neg score; entries scoring 0 are skipped.

     NOTE(review): the visible portion never returns `w` — this method
     looks truncated; confirm a `return w` follows in the full source.
     """
     from sentiwordnet import SentiWordNetCorpusReader, SentiSynset
     swn = SentiWordNetCorpusReader(location)
     w = {}
     for senti_synset in swn.all_senti_synsets():
         # Subjectivity weight: sum of positive and negative scores.
         score = senti_synset.pos_score + senti_synset.neg_score
         #if totalScore > 0 : 
         #    score = abs(senti_synset.pos_score-senti_synset.neg_score)/totalScore
         #else :
         #    continue;
         if score > 0 :
             # Lemma is the first dotted component of the synset name.
             word = senti_synset.synset.name.split('.')[0]
             try:
                 # Keep only the maximum score seen for this lemma.
                 if w[word]>=score:
                     continue;
             except KeyError, e:
                 pass
             w[word] = score
示例#8
0
def scores(preProData,emot,sentifile='SentiWordNet_3.0.0_20130122.txt'):
    """Score each preprocessed tweet with SentiWordNet.

    preProData -- iterable of tokenised tweets (lists of words).
    emot       -- per-tweet emoticon score pairs, zipped with the tweets.
    sentifile  -- path to the SentiWordNet dump.
    Returns a list of (tweetpos, tweetneg) tuples, one per tweet.
    """
    swn = SentiWordNetCorpusReader(sentifile)
    res = list()
    bar = 0.0  # progress counter, printed as a fraction of tweets done
    nm = NegMod()  # negation / modifier word detector
    for tweet,emo in zip(preProData,emot):
        print bar / float(len(preProData))
        tweetneg = 0.0
        tweetpos = 0.0
        c = 0  # index of the current word within the tweet
        for word in tweet:
            try:
                # First WordNet synset name for the word, e.g. 'good.a.01'.
                w = str(wn.synsets(word)[0].name())
                temp = swn.senti_synset(w)
                # NOTE(review): temp[1] / temp[2] are assumed to index the
                # synset's scores — confirm the ordering convention of this
                # SentiWordNet reader's senti_synset result.
                plop = 0.0
                plopp = 0.0
                # Negation et modifieurs
                if c != 0:
                    if nm.neg_it(tweet[c-1]):#negation
                        # Preceding negation: swap the two scores for the
                        # whole tweet and stop scanning further words.
                        tweetpos = temp[2]
                        tweetneg = temp[1]
                        break
                    if nm.mod_multiply(tweet[c-1]):#modifier
                        # Preceding intensifier: double both contributions.
                        plop = temp[1]*2
                        plopp = temp[2]*2
                    else:
                        plop = temp[1]
                        plopp = temp[2]   
                else:
                    plop = temp[1]
                    plopp = temp[2]
                tweetpos = tweetpos + plop
                tweetneg = tweetneg + plopp
            except:
                # Words with no synset (or any lookup error) are skipped.
                pass
            c = c + 1 
        # Add emot feeling        
        tweetpos = tweetpos + emo[0]
        tweetneg = tweetneg + emo[1]
        res.append((tweetpos,tweetneg))
        bar = bar + 1.0
    return res    
def findScoreWord(word, dType):
    """Average signed SentiWordNet sentiment over the tokens of *word*.

    A token whose first synset is clearly not positive (pos_score < 0.1)
    contributes -neg_score; a clearly-not-negative token contributes
    +pos_score; an ambiguous token resets the running total to its
    pos_score.  NOTE(review): the original used `=` rather than `+=` in
    that last branch — behavior is preserved here, but `+=` may have been
    intended.  Returns the total divided by the token count, or 0 when no
    tokens remain after cleaning.
    """
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)

    # Strip punctuation and normalise case before tokenising.
    word = re.sub('[%s]' % re.escape(string.punctuation), ' ', word)
    word = word.lower()

    tokens = word.split()
    wS = 0
    for w in tokens:
        # BUG FIX: `test` was previously left unbound when dType matched
        # none of the POS constants; the NameError was hidden by a bare
        # except clause.
        test = None
        if dType == NN:
            test = swn.senti_synsets(w, 'n')
        elif dType == ADJ:
            test = swn.senti_synsets(w, 'a')
        elif dType == VB:
            test = swn.senti_synsets(w, 'v')
        elif dType == ADV:
            test = swn.senti_synsets(w, 'r')

        if not test:
            continue
        try:
            if test[0].pos_score < 0.1:
                wS += -test[0].neg_score
            elif test[0].neg_score < 0.1:
                wS += test[0].pos_score
            else:
                wS = test[0].pos_score
        except (IndexError, AttributeError):
            continue

    if not tokens:
        return 0
    return wS / len(tokens)
    def __init__(self,
                 traing_data_fileP1='mood_traing_p1.dat',
                 traing_data_fileP2='mood_traing.dat',
                 data_file='tweets_raw.dat'):
        """Set up the two-phase mood trainers, language detector, tweet
        file handle and SentiWordNet reader.  Relies on attributes
        `dataDir` and `limit` and the `countRows` method from the
        enclosing class."""

        # Phase-1 / phase-2 trainers persist to their own .dat files.
        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # countRows reads the pickle stream to EOF, so the file is
        # reopened afterwards for the actual classification pass.
        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')

        # Per-language caps on how many tweets are ingested.
        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)
示例#11
0
class SentiWordnetTagger():
    """Tags a (word, pos) pair with the dominant SentiWordNet polarity of
    its most frequent synset."""

    def __init__(self):
        # Load the SentiWordNet dump shipped alongside the project.
        self.swn = SentiWordNetCorpusReader('../files/SentiWordNet_3.0.0_20120206.txt')

    def tag(self, word, pos):
        """Return (synset offset, 'p'/'n'/'z'), or (None, 'z') when no POS
        is given, or None when the word has no synset for that POS.
        NOTE(review): the None vs tuple returns are inconsistent but are
        preserved as-is for callers.
        """
        if pos is None:
            return (None, 'z')

        candidates = self.swn.senti_synsets(word, pos)
        if not candidates:
            return None

        # The list is assumed ranked, so the first entry is the most
        # frequent synset.
        best = candidates[0]
        if best.pos_score > best.neg_score:
            polarity = 'p'
        elif best.pos_score < best.neg_score:
            polarity = 'n'
        else:
            polarity = 'z'

        # The offset is the synset id.
        return (best.offset, polarity)
    def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
        """Set up the two-phase mood trainers, language detector, tweet
        file handle and SentiWordNet reader.  Relies on attributes
        `dataDir` and `limit` and the `countRows` method from the
        enclosing class."""
        # Phase-1 / phase-2 trainers persist to their own .dat files.
        self.clsP1 = MoodDetectTrainer(data_file = traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = traing_data_fileP2)
        
        self.langClassifier = LangDetect(supportedLangs)
        
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        
        # countRows reads the pickle stream to EOF, so the file is
        # reopened afterwards for the actual classification pass.
        self.tweetsFile = open(os.path.join(self.dataDir,data_file),'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(os.path.join(self.dataDir,data_file),'rb')

        # Per-language caps on how many tweets are ingested.
        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)
示例#13
0
 def __init__(self):
     """Open the bundled SentiWordNet 3.0 lexicon."""
     # NOTE(review): the backslash only acts as a path separator on
     # Windows; os.path.join would be portable.
     SWN_FILENAME = "lexicon\SentiWordNet_3.0.0_20130122.txt"
     self.swn= SentiWordNetCorpusReader(SWN_FILENAME)
示例#14
0
# Scratch script comparing sentiment-lexicon access through three stacks:
# the raw SentiWordNet reader, pattern.en, and NLTK POS tagging.
import sys
sys.path.append("/home/sgolbeck/nltk_data/corpora/sentiwordnet")
print sys.path
from sentiwordnet import SentiWordNetCorpusReader, SentiSynset
dir1="/home/sgolbeck/nltk_data/corpora/sentiwordnet/"
swn_filename = dir1+"SentiWordNet_3.0.0.txt"
#swn_filename = "SentiWordNet_3.0.0_20100705.txt"
swn = SentiWordNetCorpusReader(swn_filename)
# All senti-synsets for 'bad', irrespective of part of speech.
swn_bad=swn.senti_synsets('bad')

#######################################################
from pattern.en import wordnet
print wordnet.synsets("kill",pos="VB")[0].weight
from pattern.en import ADJECTIVE
pattern_bad=wordnet.synsets('bad', ADJECTIVE)[0]
#######################################################
from pattern.en import parse
# Parse a sample sentence and split it into tagged tokens.
pattern_bad_parse=parse('he is a bad man of crime that dances violently')
pattern_bad_parse=pattern_bad_parse.split()
print pattern_bad_parse
# Fourth field of the first token of the first sentence.
pattern_bad_parse_word=pattern_bad_parse[0][3]


#######################################################
import nltk
text=nltk.word_tokenize("And now for something completely different")
#requires that 'maxent_treebank_pos_tagger' has been downloaded
text_tagged=nltk.pos_tag(text)


#######################################################
class RawClassifier(object):
    """Two-phase tweet mood classifier fed from a cPickle stream of raw
    tweets.  Phase 1 trains on keyword-labelled tweets; phase 2 re-trains
    after pruning low-confidence n-gram features.

    NOTE(review): these are class-level mutable attributes, shared by all
    instances of RawClassifier."""
    statsData = {}  # per-language {'n': count, 'p': count} of ingested tweets
    dataDir = "/home/toni/git/financial-twitter-sentiment-analyzer/tracker/data"
    limit = {}      # per-language ingestion caps, filled in __init__
    skip = 0        # number of leading tweets to skip in classifiyRaw
    p2_f_limit = 0.75  # phase-2 feature-confidence threshold

    def __init__(self,
                 traing_data_fileP1='mood_traing_p1.dat',
                 traing_data_fileP2='mood_traing.dat',
                 data_file='tweets_raw.dat'):
        """Set up trainers, language detector, tweet file handle and the
        SentiWordNet reader."""
        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # countRows reads the pickle stream to EOF, so the file is
        # reopened afterwards for the actual classification pass.
        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')

        # Per-language caps on how many tweets are ingested.
        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)

    def classifyP1(self, stripSmiles=False):
        """Run the raw classification pass and train the phase-1 model."""
        self.classifiyRaw(self.tweetsFile, stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"

        print self.statsData

    def classifyP2(self):
        """
            remove noisy n-grams 

            Re-labels the phase-1 training data keeping only features the
            phase-1 classifier is confident about (prob > p2_f_limit),
            drops rows with fewer than 3 surviving features, then frees
            the phase-1 data and trains the phase-2 model.
        """
        _st = {'tf': 0, 'df': 0}  # total / deleted feature counters

        for feutures, label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()

            for f, v in feutures.items():
                # Confidence of the phase-1 classifier on this single feature.
                prob = self.clsP1.classifier.prob_classify({
                    f: v,
                    'x_lang': lang
                })

                _st['tf'] += 1

                if max(prob.prob('n'), prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df'] += 1

            if len(feuturesP2) >= 3:
                feuturesP2['x_lang'] = lang
                self.training_data_p2.append((feuturesP2, label))
            else:
                pass

        print 'p2_length:', len(self.training_data_p2), ' p1_lenght:', len(
            self.training_data_p1)
        print 'st:', _st

        # Free phase-1 structures before the phase-2 training pass.
        print "deleting p1 set"
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self, text):
        """Remove known emoticons from *text*."""
        emos = [
            ':)', ':-)', ';-)', ': )', ':d', '=)', ':p', ';)', '<3', ':(',
            ':-(', ': ('
        ]

        for item in emos:
            text = text.replace(item, "")
        return text

    def stats(self, lang, mood):
        """Count one (lang, mood) tweet; return 1 if accepted, 0 when the
        language's ingestion limit has been reached."""
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n': 0, 'p': 0}

        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']

        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood] += 1
            return 1

    def checkWithSentiwordnet(self, text):
        """Print the SentiWordNet synset of each token (debug helper;
        returns nothing)."""
        tokens = nltk.word_tokenize(text)
        for token in tokens:
            synsets = self.swn.senti_synsets(token)
            if len(synsets) > 0:
                synset = self.swn.senti_synset(str(synsets[0]))
                print synset

    def checkKeyWords(self, text):
        """Label *text* 'p'/'n'/'x' from the positive/negative keyword lists."""
        count = self.containsPositiveWord(text) + self.containsNegativeWord(
            text)
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self, text):
        """Return +1 per positive dictionary entry found as a substring."""
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1
                #print 'p:',item
        return count

    def containsNegativeWord(self, text):
        """Return -1 per negative dictionary entry found as a substring."""
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1
        return count

    def classifiyRaw(self, file, stripSmiles):
        """Stream pickled tweets, keyword-label them and add accepted rows
        to the phase-1 training data.

        NOTE(review): on a non-EOF unpickling error `tweet` keeps its
        previous value (or is unbound on the first iteration) — confirm
        whether a `continue` was intended after the generic except."""
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass

            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue

            if tweet:
                text = unicode(tweet.get('text'))

                # Drop retweets.
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue

                mood = self.checkKeyWords(text)
                if mood == 'x':
                    continue

                lang = self.langClassifier.detect(text)

                if stripSmiles:
                    text = self.stripSmiles(text)

                sres = self.stats(lang[0], mood)
                if sres == 0:
                    # limite de idioma alcanzado (language limit reached)
                    print 'limit reached for ', lang[0]
                    continue

                if sres == -1:
                    print "done for %s" % mood
                    break

                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1

                self.checkWithSentiwordnet(text)

                self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self, file):
        """Count pickled records in *file* (reads to EOF); unpicklable
        records are counted as breakes."""
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows += 1
            except EOFError:
                break
            except:
                breakes += 1
        print 'tweets:', rows, ' breakes:', breakes
示例#16
0
class RawClassifier(object):
    """Two-phase tweet mood classifier (SentiWordNet / financial-dictionary
    labelling variant), fed from a cPickle stream of raw tweets.

    NOTE(review): these are class-level mutable attributes, shared by all
    instances of RawClassifier."""
    statsData = {}  # per-language {'n': count, 'p': count} of ingested tweets
    limit = {}      # per-language ingestion caps, filled in __init__
    skip = 0        # number of leading tweets to skip in classifiyRaw
    p2_f_limit = 0.6  # phase-2 feature-confidence threshold
    sentiwordnet = conf.USE_SENTIWORDNET_DICT  # labelling strategy switch

    def __init__(self,
                 traing_data_fileP1='mood_traing_p1.dat',
                 traing_data_fileP2='mood_traing.dat',
                 data_file='tweets_raw.dat'):
        """Set up trainers, language detector, tweet file handle and the
        SentiWordNet reader."""
        if self.sentiwordnet:
            print "using sentiwordnet dictionary"
        else:
            print "not using sentiwordnet dictionary"

        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # countRows reads the pickle stream to EOF, so the file is
        # reopened afterwards for the actual classification pass.
        self.tweetsFile = open(
            os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
            'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(
            os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
            'rb')

        # Per-language caps on how many tweets are ingested.
        self.limit['en'] = 300000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
        self.swn = SentiWordNetCorpusReader(swn_filename)

    def classifyP1(self, stripSmiles=False):
        """Run the raw classification pass and train the phase-1 model."""
        self.classifiyRaw(self.tweetsFile, stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"

        print self.statsData

    def classifyP2(self):
        """
            remove noisy n-grams 

            Re-labels the phase-1 training data keeping only features the
            phase-1 classifier is confident about (prob > p2_f_limit),
            drops rows with fewer than 3 surviving features, then frees
            the phase-1 data and trains the phase-2 model.
        """
        _st = {'tf': 0, 'df': 0}  # total / deleted feature counters

        for feutures, label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()

            for f, v in feutures.items():
                # Confidence of the phase-1 classifier on this single feature.
                prob = self.clsP1.classifier.prob_classify({
                    f: v,
                    'x_lang': lang
                })

                _st['tf'] += 1

                if max(prob.prob('n'), prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df'] += 1

            if len(feuturesP2) >= 3:
                feuturesP2['x_lang'] = lang
                self.training_data_p2.append((feuturesP2, label))
            else:
                pass

        print 'p2_length:', len(self.training_data_p2), ' p1_lenght:', len(
            self.training_data_p1)
        print 'st:', _st

        # Free phase-1 structures before the phase-2 training pass.
        print "deleting p1 set"
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self, text):
        """Remove known emoticons from *text*."""
        emos = [
            ':)', ':-)', ';-)', ': )', ':d', '=)', ':p', ';)', '<3', ':(',
            ':-(', ': ('
        ]

        for item in emos:
            text = text.replace(item, "")
        return text

    def stats(self, lang, mood):
        """Count one (lang, mood) tweet; return 1 if accepted, 0 when the
        language's ingestion limit has been reached."""
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n': 0, 'p': 0}

        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']

        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood] += 1
            return 1

    # CHECK WITH SENTIWORDNET
    def checkWithSentiwordnet(self, text):
        """Label *text* 'p'/'n'/'x' by summing pos-neg scores of each
        token's first synset.

        NOTE(review): `count < 0.5` labels even mildly positive totals as
        'n' and 'x' is only returned for exactly 0.5 — confirm whether
        `count < -0.5` was intended."""
        count = 0
        tokens = nltk.word_tokenize(text)
        #TODO more languages
        #tokens = [w for w in tokens if not w in nltk.corpus.stopwords.words('english')]
        if len(tokens) > 0:
            for token in tokens:
                synsets = self.swn.senti_synsets(token)
                if len(synsets) > 0:
                    # TODO: this may not be the right lemma — the POS
                    # category should be checked (translated from Spanish).
                    lemma = synsets[0]
                    count = count + lemma.pos_score - lemma.neg_score
            #print count, " points for tokens :", tokens
            if count > 0.5:
                return 'p'
            if count < 0.5:
                return 'n'
        return 'x'

    # CHECK WITH FINANCIAL DICTIONARIES
    def checkWithFinancialDict(self, text):
        """Label *text* 'p'/'n'/'x' from the positive/negative keyword lists."""
        count = self.containsPositiveWord(text) + self.containsNegativeWord(
            text)
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self, text):
        """Return +1 per positive dictionary entry found as a substring."""
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1
                #print 'p:',item
        return count

    def containsNegativeWord(self, text):
        """Return -1 per negative dictionary entry found as a substring."""
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1
        return count

    def classifiyRaw(self, file, stripSmiles):
        """Stream pickled tweets, label English ones via SentiWordNet or
        the financial dictionaries, and add accepted rows to the phase-1
        training data.

        NOTE(review): on a non-EOF unpickling error `tweet` keeps its
        previous value (or is unbound on the first iteration) — confirm
        whether a `continue` was intended after the generic except."""
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass

            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue

            if tweet:
                text = unicode(tweet.text)

                # Drop retweets.
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue

                lang = self.langClassifier.detect(text)
                # TODO more languages
                if lang[0] != 'en':
                    continue

                if stripSmiles:
                    text = self.stripSmiles(text)

                if self.sentiwordnet:
                    mood = self.checkWithSentiwordnet(text)
                else:
                    mood = self.checkWithFinancialDict(text)

                if mood == 'x':
                    continue

                sres = self.stats(lang[0], mood)
                if sres == 0:
                    # limite de idioma alcanzado (language limit reached)
                    print 'limit reached for ', lang[0]
                    continue

                if sres == -1:
                    print "done for %s" % mood
                    break

                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1

                self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self, file):
        """Count pickled records in *file* (reads to EOF); unpicklable
        records are counted as breakes."""
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows += 1
            except EOFError:
                break
            except:
                breakes += 1
        print 'tweets:', rows, ' breakes:', breakes
示例#17
0
# Replace slang tokens with their expansions, POS-tag the cleaned tweets,
# count verb tags, then probe SentiWordNet directly.
# NOTE(review): `words`, `dico_slang`, `N`, `Counter` and
# `SentiWordNetCorpusReader` must come from earlier context not shown here.
words_noslang = []
for w in words:
    for k, v in dico_slang.items():
        if k in w:
            #print(k)
            # Drop every occurrence of the slang token, append its expansion.
            w = list(filter(lambda x: x != k, w))
            w.append(v)
            #print(w)
    words_noslang.append(w)
print(words_noslang)

tag = nltk.pos_tag
tag_list_tot = []
tag_list = [tag(words_noslang[i]) for i in range(N)]
print(tag_list)
for i in range(N):
    tag_list_tot.extend(tag(words_noslang[i]))

# POS tag of every token across all tweets, and its frequency table.
tot_pos = [tag_list_tot[i][1] for i in range(len(tag_list_tot))]
count_VB = Counter(tot_pos)
print(count_VB)
verb_tag = ['VB', 'VBZ', 'VBP', 'VBG', 'VBN', 'VBD']
sum_vb = 0
for k, v in count_VB.items():
    if k in verb_tag:
        sum_vb += v
print('Le nombre total de POS verbes sur l\'ensemble des tweets est: ', sum_vb)

swn_filename = '/home/audrey/Audrey/Cours/INF344/TP/TP_sentiment/SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
# Sanity check: look up a known synset id.
swn.senti_synset('breakdown.n.03')
# -*- coding: cp1252 -*-
import nltk, enchant,re,math
from nltk.corpus import wordnet as wn
from sentiwordnet import SentiWordNetCorpusReader, SentiSynset

swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)

# US-English spell checker used by the pipeline.
dictionary = enchant.Dict("en_US")
# Emoticon families; calculate_sentiment weighs plain matches 0.3 and
# strong ("very") matches 0.5.
emotipatternhappy = re.compile("(:-\))|(:\))|(:o\))|(:])|( :3 )|(:c\))|(:>)|(=])|(8\))|(=\))|(:})|(:\^\))|(:'-\))|(:'\))")
emotipatternveryhappy = re.compile("(:-D)|( :D )|(8-D)|( 8D )|(x-D)|( xD )|(X-D)|( XD )|(=-D)|( =D )|(=-3)|( =3 )|(B\^D)|(:-\)\))|(:\)\))")
emotipatternsad = re.compile("(>:\[)|(:-\()|(:\()|(:-c)|( :c )|(:-<)|(:<)|(:-\[)|(:\[)|(:{)|(:'\()|(:'-\()")
emotipatternverysad = re.compile("(D:<)|( D: )|( D8 )|(D;)|(D=)|( DX )|(v\.v)|(D-':)|(:-\|\|)|(>:\()")

def calculate_sentiment(tweet,followers):
	"""Score *tweet* starting from its emoticons.

	NOTE(review): this function looks truncated by extraction —
	`followers` is unused and no value is returned in the visible
	portion; the hash/SWN scoring announced below is cut off.
	"""
	#### Emoticon ####
	# Weighted counts: 0.3 per plain emoticon, 0.5 per strong one.
	emotipos = 0.3 * len(emotipatternhappy.findall(tweet))
	emotipos = emotipos + 0.5 * len(emotipatternveryhappy.findall(tweet))
	emotineg = 0.3 * len(emotipatternsad.findall(tweet))
	emotineg = emotineg + 0.5 * len(emotipatternverysad.findall(tweet))

	# Clamp the net emoticon score to [-1, 1].
	emotiscore = emotipos - emotineg
	if emotiscore > 1:
	    emotiscore = 1
	if emotiscore < -1:
	    emotiscore = -1

	#### Hash Score & SWN Score ####
	#remove punctuation as this will mess up pos tagging
	tweet = re.sub('[\$\£\+\'(..)(...)\?!\(\)\[\]":;-]\&','',tweet)
class RawClassifier(object):
    """Two-phase tweet mood classifier fed from a cPickle stream of raw
    tweets (keyword-labelling variant).

    NOTE(review): these are class-level mutable attributes, shared by all
    instances of RawClassifier."""
    statsData = {}  # per-language {'n': count, 'p': count} of ingested tweets
    dataDir = "/home/toni/git/financial-twitter-sentiment-analyzer/tracker/data"
    limit = {}      # per-language ingestion caps, filled in __init__
    skip = 0        # number of leading tweets to skip in classifiyRaw
    p2_f_limit = 0.75  # phase-2 feature-confidence threshold
    
    def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
        """Set up trainers, language detector, tweet file handle and the
        SentiWordNet reader."""
        self.clsP1 = MoodDetectTrainer(data_file = traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = traing_data_fileP2)
        
        self.langClassifier = LangDetect(supportedLangs)
        
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        
        # countRows reads the pickle stream to EOF, so the file is
        # reopened afterwards for the actual classification pass.
        self.tweetsFile = open(os.path.join(self.dataDir,data_file),'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(os.path.join(self.dataDir,data_file),'rb')

        # Per-language caps on how many tweets are ingested.
        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)

        
    
    def classifyP1(self,stripSmiles=False):
        """Run the raw classification pass and train the phase-1 model."""
        self.classifiyRaw(self.tweetsFile,stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"
        
        print self.statsData
        
    def classifyP2(self):
        """
            remove noisy n-grams 

            Re-labels the phase-1 training data keeping only features the
            phase-1 classifier is confident about (prob > p2_f_limit),
            drops rows with fewer than 3 surviving features, then frees
            the phase-1 data and trains the phase-2 model.
        """
        _st={'tf':0,'df':0}
        
        for feutures,label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()
            
            for f,v in feutures.items():
               # Confidence of the phase-1 classifier on this one feature.
               prob = self.clsP1.classifier.prob_classify({f:v,'x_lang':lang}) 
               
               
               _st['tf']+=1
               
               if max(prob.prob('n'),prob.prob('p')) <= self.p2_f_limit:
                   del feuturesP2[f]
                   _st['df']+=1
            
            if len(feuturesP2) >= 3:
                feuturesP2['x_lang']=lang
                self.training_data_p2.append((feuturesP2,label))
            else:
                pass
            
        print 'p2_length:' , len(self.training_data_p2), ' p1_lenght:' , len(self.training_data_p1)  
        print 'st:' , _st
        
        # Free phase-1 structures before the phase-2 training pass.
        print "deleting p1 set"
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)
        
    
    def stripSmiles(self,text):
        """Remove known emoticons from *text*."""
        emos = [':)',':-)',';-)',': )',':d','=)',':p',';)','<3',':(',':-(',': (']
        
        for item in emos:
            text = text.replace(item,"")
        return text         
    
    def stats(self,lang,mood):
        """Count one (lang, mood) tweet; return 1 if accepted, 0 when the
        language's ingestion limit has been reached."""
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n':0,'p':0}
        
        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']

        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood]+=1
            return 1

    def checkWithSentiwordnet(self, text):
        """Print the SentiWordNet synset of each token (debug helper;
        returns nothing)."""
        tokens = nltk.word_tokenize(text)
        for token in tokens: 
            synsets = self.swn.senti_synsets(token)
            if len(synsets) > 0: 
                synset = self.swn.senti_synset(str(synsets[0]))
                print synset
    
       
    def checkKeyWords(self,text):
        """Label *text* 'p'/'n'/'x' from the positive/negative keyword lists."""
        count = self.containsPositiveWord(text) + self.containsNegativeWord(text);
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self,text):
        """Return +1 per positive dictionary entry found as a substring."""
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1                
                #print 'p:',item
        return count


    def containsNegativeWord(self,text):
        """Return -1 per negative dictionary entry found as a substring."""
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1                
        return count

    def classifiyRaw(self,file,stripSmiles):
        """Stream pickled tweets, keyword-label them and add accepted rows
        to the phase-1 training data.

        NOTE(review): on a non-EOF unpickling error `tweet` keeps its
        previous value (or is unbound on the first iteration) — confirm
        whether a `continue` was intended after the generic except."""
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass
            
            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue
            
            if tweet:
                text = unicode(tweet.get('text'))
                
                # Drop retweets.
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue

                mood = self.checkKeyWords(text)
                if mood == 'x':
                    continue

                lang  = self.langClassifier.detect(text)

                if stripSmiles:
                    text = self.stripSmiles(text)
                
                sres = self.stats(lang[0], mood)
                if sres == 0:
                    # limite de idioma alcanzado (language limit reached)
                    print 'limit reached for ' , lang[0]
                    continue


                if sres == -1:
                    print "done for %s" % mood
                    break
                
                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1

                self.checkWithSentiwordnet(text)

                self.training_data_p1.addRow(text, mood, lang[0])

    
    def countRows(self,file):
        """Count pickled records in *file* (reads to EOF); unpicklable
        records are counted as breakes."""
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows +=1
            except EOFError:
                break
            except:
                breakes +=1
        print 'tweets:',rows,' breakes:',breakes
示例#20
0
def sentimentAnalysis(filename, outputFile):
    """Write per-line objectivity/subjectivity ratios to a CSV file.

    Each input line is expected to look like "<id> <text>".  Every
    POS-tagged word of <text> is looked up in SentiWordNet; a word counts
    as objective when its objectivity score exceeds 0.5 and as subjective
    when pos+neg exceeds 0.5.  One row [id, ratioObj, ratioSub] is written
    per input line that actually contains text.
    """
    swn_filename = 'SentiWordNet_3.0.0_20100705.txt'
    swn = SentiWordNetCorpusReader(swn_filename)

    # strip all punctuation before tokenizing
    regex = re.compile('[%s]' % re.escape(string.punctuation))

    with codecs.open(filename, 'r', 'utf-8') as f:
        data1 = f.readlines()
    # was: a redundant f.close() after the with-block

    # POS tags that carry no sentiment and are skipped outright
    tag = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'TO', 'UH', 'PDT', 'SYM', 'RP']
    noun = ['NN', 'NNS', 'NP', 'NPS']
    adj = ['JJ', 'JJR', 'JJS']
    pronoun = ['PP', 'PP$', 'WP', 'WP$']
    verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adverb = ['RB', 'RBR', 'RBS', 'WRB']

    # was: the output handle was opened inline and never closed (leak)
    out = open(outputFile, 'wb')
    try:
        writer = csv.writer(out)
        for text in data1:
            # split "<id> <text>" once instead of three times per line
            parts = re.split(r"\s+", text, 1)
            count = parts[0]
            if len(parts) > 1:
                Tex = regex.sub(u'', parts[1])
                words = word_tokenize(Tex.lower())
                tagged = nltk.pos_tag(words)
                objCount = 0
                subCount = 0
                for w in tagged:
                    if w[1] in tag:
                        continue
                    #print(w)
                    if w[1] in noun:
                        pos_Char = 'n'
                    elif w[1] in adj:
                        pos_Char = 'a'
                    elif w[1] in pronoun:
                        # NOTE(review): 'p' is not a SentiWordNet POS tag;
                        # preserved from the original -- confirm intent
                        pos_Char = 'p'
                    elif w[1] in verb:
                        pos_Char = 'v'
                    elif w[1] in adverb:
                        pos_Char = 'r'
                    else:
                        pos_Char = 'none'

                    # was: two nearly identical try/except blocks; merged
                    try:
                        if pos_Char == 'none':
                            s = swn.senti_synsets(w[0])
                        else:
                            s = swn.senti_synsets(w[0], pos_Char)
                        scores = list(s)[0]
                        if scores.obj_score > 0.5:
                            objCount += 1
                        elif scores.pos_score + scores.neg_score > 0.5:
                            subCount += 1
                    except Exception:
                        # word missing from SentiWordNet / empty synset list
                        print('Unexpected word')

                if objCount + subCount > 0:
                    ratioObj = float(objCount) / (objCount + subCount)
                    ratioSub = float(subCount) / (objCount + subCount)
                else:
                    ratioObj = 0.0
                    ratioSub = 0.0
                writer.writerow([count, ratioObj, ratioSub])
    finally:
        out.close()
# 示例 #21 ("Example #21") -- snippet separator left over from scraping; kept
# as a comment because the bare `示例#21` / `0` lines were not executable code.
 def __init__(self):
     """Load the bundled SentiWordNet 3.0 dictionary into self.swn."""
     dict_path = '../files/SentiWordNet_3.0.0_20120206.txt'
     self.swn = SentiWordNetCorpusReader(dict_path)
class polarsentenceSample:
    """Per-sentence record used during polarity ranking.

    NOTE(review): these are class-level defaults shared by all instances
    until shadowed by instance assignments.
    """
    # raw sentence text
    ssen = ''
    # accumulated positive polarity weight
    weightP = 0.000
    # accumulated negative polarity weight
    weightN = 0.000
    # position of the sentence within its document
    index = 0

def myS (myData) :
    """Sign of *myData*: -1.0 for negative values, otherwise 1.0."""
    return -1.00 if myData < 0.000 else 1.00



# Load the SentiWordNet 3.0 dictionary used for polarity scoring.
swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
# WordNet lemmatizer and English stop-word list used during preprocessing.
lmtzr = WordNetLemmatizer()
lemmaList = set()
engStopWords = set(stopwords.words('english'))
#############

# set test size, input directory and output directory

##############
testSize = 3 #no of sents. to be appeared in the summary and anti summary
testSentSize = 12 #maximum no of sents expected in the document
windowRange = 4 #window size
# DUC 2004 input documents plus the TextRank and polarity-rank output dirs.
inPath = "/Users/fahmida/Desktop/randomnessTesting/dataSetDUC2004"
outPath = "/Users/fahmida/Desktop/randomnessTesting/textRankDUC2004L"
outPath2 = "/Users/fahmida/Desktop/randomnessTesting/polarityRankDUC2004L/positive"
outPath3 = "/Users/fahmida/Desktop/randomnessTesting/polarityRankDUC2004L/negative"
class RawClassifier(object):
    """Labels raw pickled tweets and trains a two-phase mood classifier.

    Phase 1 labels each English tweet (via SentiWordNet or the financial
    word lists) and trains clsP1; phase 2 prunes low-information n-grams
    from the phase-1 data and trains clsP2 on what survives.
    """
    # per-language tallies {'n': count, 'p': count} of accepted tweets
    statsData = {}
    # per-language acceptance caps; 'default' covers unlisted languages
    limit = {}
    # number of leading tweets to discard before classification starts
    skip = 0
    # minimum class probability an n-gram must exceed to survive phase 2
    p2_f_limit = 0.6
    # choose SentiWordNet labelling vs. the financial dictionaries
    sentiwordnet = conf.USE_SENTIWORDNET_DICT
    
    def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
        """Open the tweet dump, count its rows, and set up both trainers."""
        if self.sentiwordnet:
            print "using sentiwordnet dictionary"
        else:
            print "not using sentiwordnet dictionary"

        self.clsP1 = MoodDetectTrainer(data_file = traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = traing_data_fileP2)
        
        self.langClassifier = LangDetect(supportedLangs)
        
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        
        # first pass only counts rows / broken records
        self.tweetsFile = open(os.path.join(os.curdir, os.path.normpath('../data/' + data_file)) ,'rb')
        self.countRows(self.tweetsFile)

        # re-open so classification starts from the beginning of the stream
        self.tweetsFile = open(os.path.join(os.curdir , os.path.normpath('../data/' + data_file)) ,'rb')

        self.limit['en'] = 300000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
        self.swn = SentiWordNetCorpusReader(swn_filename)

        
    
    def classifyP1(self,stripSmiles=False):
        """Phase 1: label the raw tweets and train the first classifier."""
        self.classifiyRaw(self.tweetsFile,stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"
        
        print self.statsData
        
    def classifyP2(self):
        """Phase 2: remove noisy n-grams, then train the second classifier.

        A feature survives only if phase 1 assigns it a class probability
        above p2_f_limit; rows keeping fewer than 3 features are dropped.
        """
        # tf = total features seen, df = features deleted
        _st={'tf':0,'df':0}
        
        for feutures,label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()
            
            for f,v in feutures.items():
               prob = self.clsP1.classifier.prob_classify({f:v,'x_lang':lang}) 
               
               
               _st['tf']+=1
               
               if max(prob.prob('n'),prob.prob('p')) <= self.p2_f_limit:
                   del feuturesP2[f]
                   _st['df']+=1
            
            if len(feuturesP2) >= 3:
                feuturesP2['x_lang']=lang
                self.training_data_p2.append((feuturesP2,label))
            else:
                pass
            
        print 'p2_length:' , len(self.training_data_p2), ' p1_lenght:' , len(self.training_data_p1)  
        print 'st:' , _st
        
        # free the phase-1 data before training phase 2
        print "deleting p1 set"
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)
        
    
    def stripSmiles(self,text):
        """Remove common emoticons from *text* and return the result."""
        emos = [':)',':-)',';-)',': )',':d','=)',':p',';)','<3',':(',':-(',': (']
        
        for item in emos:
            text = text.replace(item,"")
        return text         
    
    def stats(self,lang,mood):
        """Record one (lang, mood) acceptance; 0 once the cap is hit, else 1."""
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n':0,'p':0}
        
        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']

        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood]+=1
            return 1

    # CHECK WITH SENTIWORDNET
    def checkWithSentiwordnet(self, text):
        """Label *text* 'p'/'n' from summed SentiWordNet scores, else 'x'."""
        count = 0
        tokens = nltk.word_tokenize(text)
         #TODO more languages
        #tokens = [w for w in tokens if not w in nltk.corpus.stopwords.words('english')]
        if len(tokens) > 0:
            for token in tokens: 
                synsets = self.swn.senti_synsets(token)
                if len(synsets) > 0: 
                    # TODO: this need not be the right lemma -- check the POS category
                    lemma = synsets[0] 
                    count = count + lemma.pos_score - lemma.neg_score
            #print count, " points for tokens :", tokens
            if count > 0.5:
                return 'p'
            # NOTE(review): probably meant `count < -0.5`; as written any
            # total score <= 0.5 (even mildly positive) is labelled 'n' and
            # 'x' is only reachable when count == 0.5 -- confirm intent
            if count < 0.5:
                return 'n'
        return 'x'
    
    # CHECK WITH FINANCIAL DICTIONARIES 
    def checkWithFinancialDict(self,text):
        """Label *text* 'p'/'n' by the net count of dictionary hits, else 'x'."""
        # containsNegativeWord returns a non-positive count, so this is a net sum
        count = self.containsPositiveWord(text) + self.containsNegativeWord(text);
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self,text):
        """Return the number of positive dictionary terms found in *text*."""
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1                
                #print 'p:',item
        return count


    def containsNegativeWord(self,text):
        """Return minus the number of negative dictionary terms in *text*."""
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1                
        return count


    def classifiyRaw(self,file,stripSmiles):
        """Read pickled English tweets from *file* and build the phase-1 set.

        Retweets, non-English tweets, 'x'-labelled tweets, and tweets past
        the per-language limits are all skipped.
        """
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                # end of the pickle stream: all tweets consumed
                print "done classify"
                break
            except:
                # NOTE(review): a failed load leaves `tweet` at its previous
                # value (undefined on the very first iteration) -- confirm
                print "error"
                pass
            
            if self.skip > 0:
                # honour the configured number of leading tweets to discard
                print "skip"
                self.skip -= 1
                continue
            
            if tweet:
                text = unicode(tweet.get('text'))
                
                # drop retweets
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue


                lang  = self.langClassifier.detect(text)
                # TODO more languages
                if lang[0] != 'en':
                    continue

                if stripSmiles:
                    text = self.stripSmiles(text)

                if self.sentiwordnet:
                    mood = self.checkWithSentiwordnet(text)
                else:     
                    mood = self.checkWithFinancialDict(text)

                if mood == 'x':
                    continue
                
                sres = self.stats(lang[0], mood)
                if sres == 0:
                    # language limit reached
                    print 'limit reached for ' , lang[0]
                    continue


                # NOTE(review): stats() returns only 0 or 1, so this branch
                # appears unreachable -- confirm
                if sres == -1:
                    print "done for %s" % mood
                    break
                
                # progress report every 100 accepted tweets
                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1

                self.training_data_p1.addRow(text, mood, lang[0])

    
    def countRows(self,file):
        """Count loadable pickled records (and broken ones) in *file*."""
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows +=1
            except EOFError:
                break
            except:
                breakes +=1
        print 'tweets:',rows,' breakes:',breakes
class polarsentenceSample:
    """Per-sentence record used during polarity ranking.

    NOTE(review): these are class-level defaults shared by all instances
    until shadowed by instance assignments.
    """
    # raw sentence text
    ssen = ''
    # accumulated positive polarity weight
    weightP = 0.000
    # accumulated negative polarity weight
    weightN = 0.000
    # position of the sentence within its document
    index = 0


def myS(myData):
    """Sign of *myData*: -1.0 for negative values, otherwise 1.0."""
    if myData >= 0.000:
        return 1.00
    return -1.00


# Load the SentiWordNet 3.0 dictionary used for polarity scoring.
swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
# WordNet lemmatizer and English stop-word list used during preprocessing.
lmtzr = WordNetLemmatizer()
lemmaList = set()
engStopWords = set(stopwords.words('english'))
#############

# set test size, input directory and output directory

##############
testSize = 3  #no of sents. to be appeared in the summary and anti summary
testSentSize = 12  #maximum no of sents expected in the document
windowRange = 4  #window size
# DUC 2004 input documents plus the TextRank and polarity-rank output dirs.
inPath = "/Users/fahmida/Desktop/randomnessTesting/dataSetDUC2004"
outPath = "/Users/fahmida/Desktop/randomnessTesting/textRankDUC2004L"
outPath2 = "/Users/fahmida/Desktop/randomnessTesting/polarityRankDUC2004L/positive"
outPath3 = "/Users/fahmida/Desktop/randomnessTesting/polarityRankDUC2004L/negative"
# Map each ticker (column 0) to the list of integer row indices parsed from
# the comma-separated string stored in column 1.
tick_indices={}
for i in range(len(DF_tick_indices)):
    # assumes the stored string looks like "[, 3, 17, ]", so the first and
    # last fragments of the split are bracket junk -- TODO confirm with writer
    # NOTE(review): DataFrame.ix was removed in pandas 1.0; migrate to .iloc
    fields = DF_tick_indices.ix[i, 1].split(', ')[1:-1]
    # was: a list comprehension used only for its .append side effect
    tick_indices[DF_tick_indices.ix[i, 0]] = [int(x) for x in fields]

##################################################################################################
##################################################################################################
##################################################################################################
#manually setup sentiwordnet
# NOTE(review): this chunk is a pasted IPython session, not plain Python --
# the bare `cd` lines are IPython magics and are syntax errors in a .py file.
cd /home/sgolbeck/nltk_data/corpora/sentiwordnet
from sentiwordnet import SentiWordNetCorpusReader, SentiSynset
# SentiWordNet 3.0.0 data file; the dated 20100705 release is kept commented
swn_filename = "SentiWordNet_3.0.0.txt"
#swn_filename = "SentiWordNet_3.0.0_20100705.txt"
swn = SentiWordNetCorpusReader(swn_filename)
# smoke test: look up the synsets for 'slow'
swn.senti_synsets('slow')
cd /home/sgolbeck/workspace/PythonExercises/twitter/Tweets

##################################################################################################
##################################################################################################
##################################################################################################
#create a time window for September 2014 (inclusive start, exclusive end)
left_window=datetime(2014,9,1,0,0,0)
right_window=datetime(2014,10,1,0,0,0)
#test if each timestamp in column 11 falls inside the window
cond_l=(DF_tick[11]>=left_window)
cond_r=(DF_tick[11]<right_window)
# was: a per-row Python list comprehension with `and`; the vectorized boolean
# AND of the two masks selects exactly the same rows without a Python loop
cond = cond_l & cond_r
#select only those rows within the window
DF_sept=DF_tick[cond]
def sentimentAnalysis(filename, outputFile):
    """Write per-line objectivity/subjectivity ratios to a CSV file.

    Each input line is expected to look like "<id> <text>".  Every
    POS-tagged word of <text> is looked up in SentiWordNet; a word counts
    as objective when its objectivity score exceeds 0.5 and as subjective
    when pos+neg exceeds 0.5.  One row [id, ratioObj, ratioSub] is written
    per input line that actually contains text.
    """
    swn_filename = 'SentiWordNet_3.0.0_20100705.txt'
    swn = SentiWordNetCorpusReader(swn_filename)

    # strip all punctuation before tokenizing
    regex = re.compile('[%s]' % re.escape(string.punctuation))

    with codecs.open(filename, 'r', 'utf-8') as f:
        data1 = f.readlines()
    # was: a redundant f.close() after the with-block

    # POS tags that carry no sentiment and are skipped outright
    tag = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'TO', 'UH', 'PDT', 'SYM', 'RP']
    noun = ['NN', 'NNS', 'NP', 'NPS']
    adj = ['JJ', 'JJR', 'JJS']
    pronoun = ['PP', 'PP$', 'WP', 'WP$']
    verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adverb = ['RB', 'RBR', 'RBS', 'WRB']

    # was: the output handle was opened inline and never closed (leak)
    out = open(outputFile, 'wb')
    try:
        writer = csv.writer(out)
        for text in data1:
            # split "<id> <text>" once instead of three times per line
            parts = re.split(r"\s+", text, 1)
            count = parts[0]
            if len(parts) > 1:
                Tex = regex.sub(u'', parts[1])
                words = word_tokenize(Tex.lower())
                tagged = nltk.pos_tag(words)
                objCount = 0
                subCount = 0
                for w in tagged:
                    if w[1] in tag:
                        continue
                    #print(w)
                    if w[1] in noun:
                        pos_Char = 'n'
                    elif w[1] in adj:
                        pos_Char = 'a'
                    elif w[1] in pronoun:
                        # NOTE(review): 'p' is not a SentiWordNet POS tag;
                        # preserved from the original -- confirm intent
                        pos_Char = 'p'
                    elif w[1] in verb:
                        pos_Char = 'v'
                    elif w[1] in adverb:
                        pos_Char = 'r'
                    else:
                        pos_Char = 'none'

                    # was: two nearly identical try/except blocks; merged
                    try:
                        if pos_Char == 'none':
                            s = swn.senti_synsets(w[0])
                        else:
                            s = swn.senti_synsets(w[0], pos_Char)
                        scores = list(s)[0]
                        if scores.obj_score > 0.5:
                            objCount += 1
                        elif scores.pos_score + scores.neg_score > 0.5:
                            subCount += 1
                    except Exception:
                        # word missing from SentiWordNet / empty synset list
                        print('Unexpected word')

                if objCount + subCount > 0:
                    ratioObj = float(objCount) / (objCount + subCount)
                    ratioSub = float(subCount) / (objCount + subCount)
                else:
                    ratioObj = 0.0
                    ratioSub = 0.0
                writer.writerow([count, ratioObj, ratioSub])
    finally:
        out.close()