def scores(preProData, emot, sentifile='SentiWordNet_3.0.0_20130122.txt'):
    """Score each tweet with SentiWordNet, handling negation and modifiers."""
    swn = SentiWordNetCorpusReader(sentifile)
    res = list()
    bar = 0.0
    nm = NegMod()
    for tweet, emo in zip(preProData, emot):
        print bar / float(len(preProData))  # progress indicator
        tweetneg = 0.0
        tweetpos = 0.0
        c = 0
        for word in tweet:
            try:
                w = str(wn.synsets(word)[0].name())
                temp = swn.senti_synset(w)
                plop = 0.0
                plopp = 0.0
                # Negation and modifiers
                if c != 0:
                    if nm.neg_it(tweet[c - 1]):  # negation: use the swapped scores and stop scanning
                        tweetpos = temp[2]
                        tweetneg = temp[1]
                        break
                    if nm.mod_multiply(tweet[c - 1]):  # modifier: double the scores
                        plop = temp[1] * 2
                        plopp = temp[2] * 2
                    else:
                        plop = temp[1]
                        plopp = temp[2]
                else:
                    plop = temp[1]
                    plopp = temp[2]
                tweetpos = tweetpos + plop
                tweetneg = tweetneg + plopp
            except:
                # word has no WordNet synset or no SentiWordNet entry; skip it
                pass
            c = c + 1
        # Add emoticon scores
        tweetpos = tweetpos + emo[0]
        tweetneg = tweetneg + emo[1]
        res.append((tweetpos, tweetneg))
        bar = bar + 1.0
    return res
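# A minimal usage sketch, not part of the original project: it assumes preProData
# is a list of tokenised tweets, emot is a parallel list of (positive, negative)
# emoticon scores, and the SentiWordNet data file plus the NegMod helper are
# available as in the function above. Sample values are illustrative only.
if __name__ == '__main__':
    sample_tweets = [['great', 'movie'], ['not', 'good', 'at', 'all']]
    sample_emot = [(1.0, 0.0), (0.0, 1.0)]
    for pos, neg in scores(sample_tweets, sample_emot):
        print (pos, neg)  # (positive score, negative score) per tweet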
# Replace slang tokens in each tokenised tweet with their expansion from dico_slang
words_noslang = []
for w in words:
    for k, v in dico_slang.items():
        if k in w:
            #print(k)
            w = list(filter(lambda x: x != k, w))
            w.append(v)
            #print(w)
    words_noslang.append(w)
print(words_noslang)

# POS-tag every tweet and count verb tags over the whole corpus
tag = nltk.pos_tag
tag_list_tot = []
tag_list = [tag(words_noslang[i]) for i in range(N)]
print(tag_list)
for i in range(N):
    tag_list_tot.extend(tag(words_noslang[i]))
tot_pos = [tag_list_tot[i][1] for i in range(len(tag_list_tot))]
count_VB = Counter(tot_pos)
print(count_VB)

verb_tag = ['VB', 'VBZ', 'VBP', 'VBG', 'VBN', 'VBD']
sum_vb = 0
for k, v in count_VB.items():
    if k in verb_tag:
        sum_vb += v
print('Total number of verb POS tags over all tweets:', sum_vb)

swn_filename = '/home/audrey/Audrey/Cours/INF344/TP/TP_sentiment/SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
swn.senti_synset('breakdown.n.03')
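# A possible next step, sketched here rather than taken from the original script:
# map the Penn Treebank tags collected above to WordNet POS codes so that synset
# lookups (and hence SentiWordNet lookups) use the right part of speech. The
# penn_to_wn helper is introduced here for illustration; the wn constants and
# tag prefixes are standard NLTK/WordNet.
from nltk.corpus import wordnet as wn

def penn_to_wn(tag):
    # Coarse mapping from Penn Treebank tag prefixes to WordNet POS constants
    if tag.startswith('VB'):
        return wn.VERB
    if tag.startswith('NN'):
        return wn.NOUN
    if tag.startswith('JJ'):
        return wn.ADJ
    if tag.startswith('RB'):
        return wn.ADV
    return None

# Example: look up the first synset of each tagged word with the matching POS,
# then feed its name to swn.senti_synset() as done above for 'breakdown.n.03'.
for word, tag in tag_list_tot[:10]:
    wn_pos = penn_to_wn(tag)
    syns = wn.synsets(word, pos=wn_pos) if wn_pos else wn.synsets(word)
    if syns:
        print(syns[0].name(), swn.senti_synset(syns[0].name()))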
class RawClassifier(object):

    statsData = {}
    dataDir = "/home/toni/git/financial-twitter-sentiment-analyzer/tracker/data"
    limit = {}
    skip = 0
    p2_f_limit = 0.75

    def __init__(self, traing_data_fileP1='mood_traing_p1.dat',
                 traing_data_fileP2='mood_traing.dat', data_file='tweets_raw.dat'):
        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)
        self.langClassifier = LangDetect(supportedLangs)
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()
        # open once to count rows, then reopen for classification
        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
        self.countRows(self.tweetsFile)
        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0
        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)

    def classifyP1(self, stripSmiles=False):
        self.classifiyRaw(self.tweetsFile, stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print "done training P1"
        print self.statsData

    def classifyP2(self):
        """Remove noisy n-grams from the P1 set and train the P2 classifier."""
        _st = {'tf': 0, 'df': 0}
        for feutures, label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()
            for f, v in feutures.items():
                prob = self.clsP1.classifier.prob_classify({f: v, 'x_lang': lang})
                _st['tf'] += 1
                # drop features the P1 classifier is not confident about
                if max(prob.prob('n'), prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df'] += 1
            if len(feuturesP2) >= 3:
                feuturesP2['x_lang'] = lang
                self.training_data_p2.append((feuturesP2, label))
        print 'p2_length:', len(self.training_data_p2), ' p1_length:', len(self.training_data_p1)
        print 'st:', _st
        print "deleting p1 set"
        del self.training_data_p1
        del self.clsP1
        print "Done deleting p1 set"
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self, text):
        emos = [':)', ':-)', ';-)', ': )', ':d', '=)', ':p', ';)', '<3', ':(', ':-(', ': (']
        for item in emos:
            text = text.replace(item, "")
        return text

    def stats(self, lang, mood):
        if not self.statsData.has_key(lang):
            self.statsData[lang] = {'n': 0, 'p': 0}
        if self.limit.has_key(lang):
            limit = self.limit[lang]
        else:
            limit = self.limit['default']
        if self.statsData[lang][mood] >= limit:
            return 0
        else:
            self.statsData[lang][mood] += 1
            return 1

    def checkWithSentiwordnet(self, text):
        tokens = nltk.word_tokenize(text)
        for token in tokens:
            synsets = self.swn.senti_synsets(token)
            if len(synsets) > 0:
                synset = self.swn.senti_synset(str(synsets[0]))
                print synset

    def checkKeyWords(self, text):
        count = self.containsPositiveWord(text) + self.containsNegativeWord(text)
        if count > 0:
            return 'p'
        if count < 0:
            return 'n'
        return 'x'

    def containsPositiveWord(self, text):
        count = 0
        for item in dictionary.positive:
            if item in text:
                count += 1
                #print 'p:', item
        return count

    def containsNegativeWord(self, text):
        count = 0
        for item in dictionary.negative:
            if item in text:
                #print 'n:', item
                count -= 1
        return count

    def classifiyRaw(self, file, stripSmiles):
        while True:
            try:
                tweet = cPickle.load(file)
            except EOFError:
                print "done classify"
                break
            except:
                print "error"
                pass
            if self.skip > 0:
                print "skip"
                self.skip -= 1
                continue
            if tweet:
                text = unicode(tweet.get('text'))
                if text.lower().find('rt ') != -1:
                    print 'rt'
                    continue
                mood = self.checkKeyWords(text)
                if mood == 'x':
                    continue
                lang = self.langClassifier.detect(text)
                if stripSmiles:
                    text = self.stripSmiles(text)
                sres = self.stats(lang[0], mood)
                if sres == 0:  # language limit reached
                    print 'limit reached for ', lang[0]
                    continue
                if sres == -1:
                    print "done for %s" % mood
                    break
                if self.count and self.count % 100 == 0:
                    print "classified %d tweets" % (self.count)
                self.count += 1
                self.checkWithSentiwordnet(text)
                self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self, file):
        rows = 0
        breakes = 0
        while True:
            try:
                tweet = cPickle.load(file)
                rows += 1
            except EOFError:
                break
            except:
                breakes += 1
        print 'tweets:', rows, ' breakes:', breakes
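# A minimal driver sketch, not present in the original repo: it assumes the
# MoodDetectTrainer, LangDetect, dictionary module and SentiWordNet data files
# referenced in __init__ exist at the hard-coded paths above.
if __name__ == '__main__':
    rc = RawClassifier()
    rc.classifyP1(stripSmiles=True)  # phase 1: keyword-label raw tweets and train the first classifier
    rc.classifyP2()                  # phase 2: drop noisy n-grams and retrain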
#print wsynset
s1 = Sample()
s1.wid = str(wid) + '#' + str(sid)  #+'#' + poswn
s1.name = str(myWord)
s1.pos = poswn
# flip the sign of the bias when the previous stop word is a negator/contrast marker
if previousStopWord in ['no', 'not', 'neither', 'nor', 'though', 'but', 'except']:
    sign = -1
else:
    sign = 1
xBias = 0.000
if wsynset:
    s1.definition = wsynset[0].definition
    s1.syns = [i for i in wsynset]
    x = swn.senti_synset(wsynset[0].name)
    if x:
        if x.pos_score >= x.neg_score:
            xBias = x.pos_score * sign
        else:
            xBias = x.neg_score * (-1) * sign
s1.val = xBias
wordList.append(s1)  # global
ids.append(s1)  # local --> for each sentence
graph.add_node(s1.name, pos=s1.pos, rank=0.01)
pgraph.add_node(s1.name, pos=s1.pos, bias=s1.val, prestige=0.01)
wid += 1
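# A possible follow-up sketch, not taken from the original code: assuming pgraph
# is a networkx graph and co-occurrence edges are added elsewhere, the per-node
# 'bias' set above can seed a personalized PageRank to propagate sentiment
# ("prestige") through the sentence graph. propagate_prestige is a hypothetical
# helper introduced here for illustration.
import networkx as nx

def propagate_prestige(pgraph, alpha=0.85):
    # Use each word's SentiWordNet bias (made positive, with a small floor so the
    # personalization vector never sums to zero) to weight the random restarts.
    biases = nx.get_node_attributes(pgraph, 'bias')
    personalization = {n: abs(biases.get(n, 0.0)) + 1e-4 for n in pgraph.nodes()}
    return nx.pagerank(pgraph, alpha=alpha, personalization=personalization)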