def testDetect(self):
    """Check that LangDetect reports the expected language for each sample.

    Each sample pairs a text with the language code expected as the first
    element of the detector's result; the empty string is expected to be
    reported as 'other'.
    """
    samples = [
        (u"The quick brown", 'en'),
        (u"Le renard brun rapide saute par-dessus le chien paresseux", 'fr'),
        (u"@Ja_Nina HERRLICH :) ich hab nix auf planeten gefunden..deine version klingt absolut logisch :D", 'de'),
        (u"En Google somos plenamente conscientes de la confianza que los usuarios depositan ", 'es'),
        (u"Noi di Google siamo perfettamente consapevoli della fiducia che riponi in noi e della ", 'it'),
        (u'русский язык', 'ru'),
        (u'', 'other'),
    ]
    detector = LangDetect(languages=supportedLangs)

    # Only the first element of each detection result is compared.
    detected = [detector.detect(sample)[0] for sample, _ in samples]
    expected = [code for _, code in samples]
    assert detected == expected
def __init__(self, dataDir="~",
             training_data_fileP1='mood_training_p1.dat',
             training_data_fileP2='mood_training.dat',
             data_p_file='tweets_positive_raw.dat',
             data_n_file='tweets_negative_raw.dat'):
    """Create the trainers, the language detector and open the raw tweet files.

    Args:
        dataDir: directory holding the raw tweet files; a leading "~" is
            expanded to the user's home directory before the files are opened.
        training_data_fileP1: data file name handed to the pass-1 trainer.
        training_data_fileP2: data file name handed to the pass-2 trainer.
        data_p_file: file name (inside dataDir) of the positive raw tweets.
        data_n_file: file name (inside dataDir) of the negative raw tweets.

    Raises:
        IOError: if either raw tweet file cannot be opened.
    """
    self.dataDir = dataDir
    self.clsP1 = MoodDetectTrainer(data_file=training_data_fileP1)
    self.clsP2 = MoodDetectTrainer(data_file=training_data_fileP2)
    self.langClassifier = LangDetect(supportedLangs)
    self.training_data_p1 = MoodDetectTrainData()
    self.training_data_p2 = MoodDetectTrainData()

    # os.path.join does not expand "~"; without expanduser the default
    # dataDir would point at a literal "~" directory under the cwd.
    base_dir = os.path.expanduser(self.dataDir)
    self.tweetsPFile = open(os.path.join(base_dir, data_p_file), 'rb')
    try:
        self.tweetsNFile = open(os.path.join(base_dir, data_n_file), 'rb')
    except Exception:
        # Do not leak the first handle when the second file is missing.
        self.tweetsPFile.close()
        raise

    # NOTE(review): assumes a `limit` mapping already exists on the class or
    # instance (it is not created here) - confirm against the enclosing class.
    self.limit['en'] = 150000
    self.limit['default'] = 1000
def loadCls():
    """Build the language and mood classifiers and attach them to ThreadedTCPServer.

    The instances are stored as the class attributes `langCls` and `moodCls`.
    """
    lang_detector = LangDetect(supportedLangs)
    ThreadedTCPServer.langCls = lang_detector

    trainer = MoodDetectTrainer()
    mood_detector = MoodDetect(trainer)
    ThreadedTCPServer.moodCls = mood_detector
class RawClassifier(object):
    """Two-pass mood trainer fed from pickled raw tweet files.

    Pass 1 (`classifyP1`) reads the negative and positive tweet files, collects
    training rows and trains `clsP1`. Pass 2 (`classifyP2`) drops features the
    pass-1 classifier is not confident about and trains `clsP2` on the rest.
    """

    # Class-level defaults are kept for backward compatibility; __init__ gives
    # every instance its own statsData/limit so instances do not share state.
    statsData = {}
    dataDir = "~"
    limit = {}
    skip = 0            # number of leading records classifyRaw should skip
    p2_f_limit = 0.75   # pass-2 threshold: features at or below it are dropped

    def __init__(self, dataDir="~",
                 training_data_fileP1='mood_training_p1.dat',
                 training_data_fileP2='mood_training.dat',
                 data_p_file='tweets_positive_raw.dat',
                 data_n_file='tweets_negative_raw.dat'):
        """Create the trainers, the language detector and open the tweet files.

        Args:
            dataDir: directory holding the raw tweet files; a leading "~" is
                expanded to the user's home directory before opening.
            training_data_fileP1: data file name for the pass-1 trainer.
            training_data_fileP2: data file name for the pass-2 trainer.
            data_p_file: file name (inside dataDir) of positive raw tweets.
            data_n_file: file name (inside dataDir) of negative raw tweets.

        Raises:
            IOError: if either raw tweet file cannot be opened.
        """
        self.dataDir = dataDir

        # Per-instance copies: mutating the class-level dicts through `self`
        # would leak counters and limits between instances.
        self.statsData = {}
        self.limit = dict(self.limit)

        self.clsP1 = MoodDetectTrainer(data_file=training_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=training_data_fileP2)
        self.langClassifier = LangDetect(supportedLangs)
        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # os.path.join does not expand "~"; without expanduser the default
        # dataDir would point at a literal "~" directory under the cwd.
        base_dir = os.path.expanduser(self.dataDir)
        self.tweetsPFile = open(os.path.join(base_dir, data_p_file), 'rb')
        try:
            self.tweetsNFile = open(os.path.join(base_dir, data_n_file), 'rb')
        except Exception:
            # Do not leak the first handle when the second file is missing.
            self.tweetsPFile.close()
            raise

        self.limit['en'] = 150000
        self.limit['default'] = 1000

    def close(self):
        """Close both raw tweet files opened by __init__."""
        self.tweetsPFile.close()
        self.tweetsNFile.close()

    def classifyP1(self, stripSmiles=False):
        """Collect pass-1 training rows from both tweet files and train clsP1.

        Args:
            stripSmiles: when true, emoticons are removed from each tweet text
                before it is added to the training data.
        """
        self.classifyRaw(self.tweetsNFile, 'n', stripSmiles)
        self.classifyRaw(self.tweetsPFile, 'p', stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print("done training P1")
        print(self.statsData)

    def classifyP2(self):
        """ remove noisy n-grams

        Every feature of every pass-1 row is classified on its own (together
        with the row's language). Features whose best class probability is not
        above `p2_f_limit` are dropped; rows left with fewer than three
        features are discarded. The pass-1 data and classifier are deleted
        afterwards and clsP2 is trained on the filtered rows.
        """
        _st = {'tf': 0, 'df': 0}  # tf: features tested, df: features dropped
        for feutures, label in self.training_data_p1:
            lang = feutures.pop('x_lang')
            feuturesP2 = feutures.copy()

            for f, v in feutures.items():
                prob = self.clsP1.classifier.prob_classify({f: v, 'x_lang': lang})
                _st['tf'] += 1
                if max(prob.prob('n'), prob.prob('p')) <= self.p2_f_limit:
                    del feuturesP2[f]
                    _st['df'] += 1

            if len(feuturesP2) >= 3:
                feuturesP2['x_lang'] = lang
                self.training_data_p2.append((feuturesP2, label))

        print("%s %s" % (len(self.training_data_p2), len(self.training_data_p1)))
        print(_st)
        print("deleting p1 set")
        del self.training_data_p1
        del self.clsP1
        print("Done deleting p1 set")
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self, text):
        """Return `text` with the known emoticons removed.

        Matching is case-sensitive, so e.g. ':d' is removed but ':D' is not.
        """
        emos = [':)', ':-)', ';-)', ': )', ':d', '=)', ':p', ';)', '<3', ':(', ':-(', ': (']
        for item in emos:
            text = text.replace(item, "")
        return text

    def stats(self, lang, mood):
        """Count one more row for (lang, mood) unless its limit is reached.

        Args:
            lang: language code; languages without their own entry in
                `self.limit` use the 'default' limit.
            mood: 'n' or 'p'.

        Returns:
            1 if the row was counted, 0 if the limit was already reached.
        """
        if lang not in self.statsData:
            self.statsData[lang] = {'n': 0, 'p': 0}

        if lang in self.limit:
            limit = self.limit[lang]
        else:
            limit = self.limit['default']

        if self.statsData[lang][mood] >= limit:
            return 0
        self.statsData[lang][mood] += 1
        return 1

    def checkDoubleEmo(self, mood, text):
        """Tell whether `text` contains the emoticon contradicting `mood`.

        Returns True when a negative ('n') text contains ':)' or a positive
        ('p') text contains ':('; False otherwise, including unknown moods.
        """
        if mood == 'n':
            return ':)' in text
        if mood == 'p':
            return ':(' in text
        return False

    def classifyRaw(self, file, mood, stripSmiles):
        """Read pickled tweets from `file` and add usable ones to pass-1 data.

        Args:
            file: open binary file containing consecutive pickled tweets.
            mood: label ('n' or 'p') assigned to every tweet from this file.
            stripSmiles: when true, emoticons are stripped from the text
                before it is stored.
        """
        while True:
            try:
                # SECURITY NOTE: unpickling executes arbitrary code; only feed
                # this method files produced by a trusted source.
                tweet = cPickle.load(file)
            except EOFError:
                print("done for %s" % mood)
                break
            except Exception:
                # A broken record must not leave `tweet` unbound or pointing
                # at the previous record (which would be processed twice).
                tweet = None

            if self.skip > 0:
                self.skip -= 1
                continue
            if not tweet:
                continue

            # presumably each record is a dict with a 'text' key - TODO confirm
            text = unicode(tweet.get('text'))

            # NOTE(review): substring match - this also drops tweets in which
            # any word ends in "rt" followed by a space, not only retweets.
            if text.lower().find('rt ') != -1:
                continue
            if self.checkDoubleEmo(mood, text):
                continue

            lang = self.langClassifier.detect(text)
            if stripSmiles:
                text = self.stripSmiles(text)

            sres = self.stats(lang[0], mood)
            if sres == 0:
                continue
            if sres == -1:
                # stats() in this class never returns -1; kept so a subclass
                # overriding stats() can still stop the loop this way.
                print("done for %s" % mood)
                break

            self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self, file):
        """Print how many pickled records `file` holds and how many failed to load.

        Args:
            file: open binary file containing consecutive pickled records.
        """
        rows = 0
        breakes = 0
        while True:
            try:
                # SECURITY NOTE: see classifyRaw - trusted input only.
                tweet = cPickle.load(file)
                rows += 1
            except EOFError:
                break
            except Exception:
                breakes += 1
        print(file)
        print("%s %s" % (rows, breakes))
import sys
sys.path.append('../../')
import socket
import os
from tracker.lib.moodClassifierClient import MoodClassifierTCPClient
from tracker.lib.lang_detection import LangDetect
from tracker.lib.supportedLangs import supportedLangs
import cPickle
import linecache

# Alternative remote endpoint, kept for reference:
#MCC = MoodClassifierTCPClient('srv1.cyhex.com',6666)
# Mood classifier client pointed at the local server on port 6666.
MCC = MoodClassifierTCPClient('127.0.0.1', 6666)

# NOTE(review): presumably per-category counters, all seeded with 1 - confirm
# the meaning of the keys against the code that updates them.
cls_data = {'nc': 1, 'pc': 1, 'n': 1, 'p': 1, 'n#': 1, 'p#': 1}

langClassifier = LangDetect(supportedLangs)

# Absolute paths of the positive / negative test tweet files.
tweetsPFile = "/home/gx/Sites/SMM/trunk/tracker/data/tweets_positive_test.dat"
tweetsNFile = "/home/gx/Sites/SMM/trunk/tracker/data/tweets_negative_test.dat"


def stripSmiles(text):
    """Return `text` with every known emoticon removed.

    Emoticons are removed one after another, in the listed order, using
    case-sensitive substring replacement.
    """
    emoticons = (
        ':)', ':-)', ';-)', ': )', ':d', '=)',
        ':p', ';)', '<3', ':(', ':-(', ': (',
    )
    cleaned = text
    for emoticon in emoticons:
        cleaned = cleaned.replace(emoticon, "")
    return cleaned