示例#1
0
def scores(preProData,emot,sentifile='SentiWordNet_3.0.0_20130122.txt'):
    swn = SentiWordNetCorpusReader(sentifile)
    res = list()
    bar = 0.0
    nm = NegMod()
    for tweet,emo in zip(preProData,emot):
        print bar / float(len(preProData))
        tweetneg = 0.0
        tweetpos = 0.0
        c = 0
        for word in tweet:
            try:
                w = str(wn.synsets(word)[0].name())
                temp = swn.senti_synset(w)
                plop = 0.0
                plopp = 0.0
                # Negation et modifieurs
                if c != 0:
                    if nm.neg_it(tweet[c-1]):#negation
                        tweetpos = temp[2]
                        tweetneg = temp[1]
                        break
                    if nm.mod_multiply(tweet[c-1]):#modifier
                        plop = temp[1]*2
                        plopp = temp[2]*2
                    else:
                        plop = temp[1]
                        plopp = temp[2]   
                else:
                    plop = temp[1]
                    plopp = temp[2]
                tweetpos = tweetpos + plop
                tweetneg = tweetneg + plopp
            except:
                pass
            c = c + 1 
        # Add emot feeling        
        tweetpos = tweetpos + emo[0]
        tweetneg = tweetneg + emo[1]
        res.append((tweetpos,tweetneg))
        bar = bar + 1.0
    return res    
示例#2
0
for w in words:
    for k, v in dico_slang.items():
        if k in w:
            #print(k)
            w = list(filter(lambda x: x != k, w))
            w.append(v)
            #print(w)
    words_noslang.append(w)
print(words_noslang)

tag = nltk.pos_tag
tag_list_tot = []
tag_list = [tag(words_noslang[i]) for i in range(N)]
print(tag_list)
for i in range(N):
    tag_list_tot.extend(tag(words_noslang[i]))

tot_pos = [tag_list_tot[i][1] for i in range(len(tag_list_tot))]
count_VB = Counter(tot_pos)
print(count_VB)
verb_tag = ['VB', 'VBZ', 'VBP', 'VBG', 'VBN', 'VBD']
sum_vb = 0
for k, v in count_VB.items():
    if k in verb_tag:
        sum_vb += v
print('Le nombre total de POS verbes sur l\'ensemble des tweets est: ', sum_vb)

swn_filename = '/home/audrey/Audrey/Cours/INF344/TP/TP_sentiment/SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
swn.senti_synset('breakdown.n.03')
class RawClassifier(object):
    statsData = {}
    dataDir = "/home/toni/git/financial-twitter-sentiment-analyzer/tracker/data"
    limit = {}
    skip = 0
    p2_f_limit = 0.75
    
    def __init__(self,traing_data_fileP1='mood_traing_p1.dat',traing_data_fileP2='mood_traing.dat',data_file='tweets_raw.dat'):
        
        self.clsP1 = MoodDetectTrainer(data_file = traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = traing_data_fileP2)
        
        self.langClassifier = LangDetect(supportedLangs)
        
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        
        self.tweetsFile = open(os.path.join(self.dataDir,data_file),'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(os.path.join(self.dataDir,data_file),'rb')

        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)

        
    
    def classifyP1(self,stripSmiles=False):
        self.classifiyRaw(self.tweetsFile,stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"
        
        print self.statsData
        
    def classifyP2(self):
        """
            remove noisy n-grams 
        """
        _st={'tf':0,'df':0}
        
        for feutures,label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()
            
            for f,v in feutures.items():
               prob = self.clsP1.classifier.prob_classify({f:v,'x_lang':lang}) 
               
               
               _st['tf']+=1
               
               if max(prob.prob('n'),prob.prob('p')) <= self.p2_f_limit:
                   del feuturesP2[f]
                   _st['df']+=1
            
            if len(feuturesP2) >= 3:
                feuturesP2['x_lang']=lang
                self.training_data_p2.append((feuturesP2,label))
            else:
                pass
            
        print 'p2_length:' , len(self.training_data_p2), ' p1_lenght:' , len(self.training_data_p1)  
        print 'st:' , _st
        
        print "deleting p1 set"
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)
        
    
    def stripSmiles(self,text):
        emos = [':)',':-)',';-)',': )',':d','=)',':p',';)','<3',':(',':-(',': (']
        
        for item in emos:
            text = text.replace(item,"")
        return text         
    
    def stats(self,lang,mood):
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n':0,'p':0}
        
        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']

        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood]+=1
            return 1

    def checkWithSentiwordnet(self, text):
        tokens = nltk.word_tokenize(text)
        for token in tokens: 
            synsets = self.swn.senti_synsets(token)
            if len(synsets) > 0: 
                synset = self.swn.senti_synset(str(synsets[0]))
                print synset
    
       
    def checkKeyWords(self,text):
        count = self.containsPositiveWord(text) + self.containsNegativeWord(text);
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self,text):
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1                
                #print 'p:',item
        return count


    def containsNegativeWord(self,text):
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1                
        return count

    def classifiyRaw(self,file,stripSmiles):
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass
            
            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue
            
            if tweet:
                text = unicode(tweet.get('text'))
                
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue

                mood = self.checkKeyWords(text)
                if mood == 'x':
                    continue

                lang  = self.langClassifier.detect(text)

                if stripSmiles:
                    text = self.stripSmiles(text)
                
                sres = self.stats(lang[0], mood)
                if sres == 0:
                    # limite de idioma alcanzado
                    print 'limit reached for ' , lang[0]
                    continue


                if sres == -1:
                    print "done for %s" % mood
                    break
                
                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1

                self.checkWithSentiwordnet(text)

                self.training_data_p1.addRow(text, mood, lang[0])

    
    def countRows(self,file):
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows +=1
            except EOFError:
                break
            except:
                breakes +=1
        print 'tweets:',rows,' breakes:',breakes
                                    s1.name = str(myWord)
                                    s1.pos = poswn

                                    if previousStopWord in [
                                            'no', 'not', 'neigther', 'nor',
                                            'though', 'but', 'except'
                                    ]:
                                        sign = -1
                                    else:
                                        sign = 1

                                    xBias = 0.000
                                    if wsynset:
                                        s1.definition = wsynset[0].definition
                                        s1.syns = [i for i in wsynset]
                                        x = swn.senti_synset(wsynset[0].name)
                                        if x:
                                            if x.pos_score >= x.neg_score:
                                                xBias = x.pos_score * sign

                                            else:
                                                xBias = x.neg_score * (
                                                    -1) * sign

                                    s1.val = xBias
                                    wordList.append(s1)  #global
                                    ids.append(
                                        s1)  #local --> for each sentence
                                    graph.add_node(s1.name,
                                                   pos=s1.pos,
                                                   rank=0.01)
class RawClassifier(object):
    statsData = {}
    dataDir = "/home/toni/git/financial-twitter-sentiment-analyzer/tracker/data"
    limit = {}
    skip = 0
    p2_f_limit = 0.75

    def __init__(self,
                 traing_data_fileP1='mood_traing_p1.dat',
                 traing_data_fileP2='mood_traing.dat',
                 data_file='tweets_raw.dat'):

        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')

        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)

    def classifyP1(self, stripSmiles=False):
        self.classifiyRaw(self.tweetsFile, stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"

        print self.statsData

    def classifyP2(self):
        """
            remove noisy n-grams 
        """
        _st = {'tf': 0, 'df': 0}

        for feutures, label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()

            for f, v in feutures.items():
                prob = self.clsP1.classifier.prob_classify({
                    f: v,
                    'x_lang': lang
                })

                _st['tf'] += 1

                if max(prob.prob('n'), prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df'] += 1

            if len(feuturesP2) >= 3:
                feuturesP2['x_lang'] = lang
                self.training_data_p2.append((feuturesP2, label))
            else:
                pass

        print 'p2_length:', len(self.training_data_p2), ' p1_lenght:', len(
            self.training_data_p1)
        print 'st:', _st

        print "deleting p1 set"
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self, text):
        emos = [
            ':)', ':-)', ';-)', ': )', ':d', '=)', ':p', ';)', '<3', ':(',
            ':-(', ': ('
        ]

        for item in emos:
            text = text.replace(item, "")
        return text

    def stats(self, lang, mood):
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n': 0, 'p': 0}

        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']

        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood] += 1
            return 1

    def checkWithSentiwordnet(self, text):
        tokens = nltk.word_tokenize(text)
        for token in tokens:
            synsets = self.swn.senti_synsets(token)
            if len(synsets) > 0:
                synset = self.swn.senti_synset(str(synsets[0]))
                print synset

    def checkKeyWords(self, text):
        count = self.containsPositiveWord(text) + self.containsNegativeWord(
            text)
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self, text):
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1
                #print 'p:',item
        return count

    def containsNegativeWord(self, text):
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1
        return count

    def classifiyRaw(self, file, stripSmiles):
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass

            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue

            if tweet:
                text = unicode(tweet.get('text'))

                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue

                mood = self.checkKeyWords(text)
                if mood == 'x':
                    continue

                lang = self.langClassifier.detect(text)

                if stripSmiles:
                    text = self.stripSmiles(text)

                sres = self.stats(lang[0], mood)
                if sres == 0:
                    # limite de idioma alcanzado
                    print 'limit reached for ', lang[0]
                    continue

                if sres == -1:
                    print "done for %s" % mood
                    break

                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1

                self.checkWithSentiwordnet(text)

                self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self, file):
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows += 1
            except EOFError:
                break
            except:
                breakes += 1
        print 'tweets:', rows, ' breakes:', breakes
                          #print wsynset
                          s1 = Sample() 
                          s1.wid = str(wid) + '#'+str(sid) #+'#' + poswn
                          s1.name = str(myWord)
                          s1.pos = poswn

                          if previousStopWord  in ['no', 'not', 'neigther', 'nor', 'though','but','except']:
                              sign = -1
                          else :
                              sign = 1
                          
                          xBias = 0.000
                          if wsynset :
                            s1.definition = wsynset[0].definition
                            s1.syns = [i for i in wsynset]
                            x = swn.senti_synset(wsynset[0].name)
                            if x :
                                  if x.pos_score >= x.neg_score :
                                      xBias = x.pos_score * sign
                                  
                                  else :
                                      xBias = x.neg_score * (-1)* sign

                          s1.val = xBias
                          wordList.append(s1)	#global
                          ids.append(s1) #local --> for each sentence
                          graph.add_node(s1.name, pos = s1.pos, rank = 0.01)
                          pgraph.add_node(s1.name, pos = s1.pos, bias = s1.val, prestige = 0.01)

                      wid += 1