def processTweets(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, tweets):
    """
    Processes a list of tweets:
    1. Identify the target
    2. If the target is one of the politicians, infer the comment's polarity

    targetsFile -> path to the politicians list file
    sentiTokensFile -> path to the sentiTokens list file
    exceptSentiTokens -> path to the list of sentiTokens that cannot lose their
                         accents without causing ambiguity, e.g. más -> mas
    multiWordsFile -> path to the multiword expressions file
    tweets -> list of tweets
    """

    print "Loading resources...\nTargets: " + targetsFile

    # Caching is currently disabled: always load the targets from file.
    targets = None  # getFromCache(WIN_PERSONS_CACHE)

    if targets is not None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, WIN_PERSONS_CACHE)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens

    # Caching is currently disabled: always load the sentiTokens from file.
    sentiTokens = None  # getFromCache(WIN_SENTI_CACHE)

    if sentiTokens is not None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        putInCache(sentiTokens, WIN_SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile

    # Caching is currently disabled: always rebuild the multiword tokenizer.
    multiWordTokenizer = None  # getFromCache(WIN_MULTIWORD_CACHE)

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, WIN_MULTIWORD_CACHE)

    print "Inferring polarity..."

    naive = Naive(targets, sentiTokens)
    rules = Rules(targets, sentiTokens)

    analyzedTweets = []
    rejectedTweets = []

    for tweet in tweets:
        t0 = datetime.now()

        # Combine the scores of the rule-based, clue-based and lexicon-based classifiers.
        rulesScore, rulesInfo = rules.getRulesScore(tweet, True)
        cluesScore, clueInfo = rules.getCluesScore(tweet, True)
        sentiScore, sentiInfo = naive.getSentiScore(tweet, True)

        tweetScore = int(sentiScore) + int(rulesScore) + int(cluesScore)

        if tweetScore > 0:
            tweet.polarity = 1
        elif tweetScore < 0:
            tweet.polarity = -1
        else:
            tweet.polarity = 0

        tweet.metadata = sentiInfo + ";" + clueInfo + ";" + rulesInfo

        if tweet.polarity == 0:
            # A neutral tweet that matched no sentiTokens at all carries no
            # sentiment evidence, so it is rejected rather than kept as neutral.
            match = re.search(ur'(\W|^)sentiTokens:(.*?);(\W|$)', tweet.metadata)
            sentiTokensInfo = match.group(2) if match is not None else ''

            if len(sentiTokensInfo.strip(' ')) == 0:
                rejectedTweets.append(tweet)
            else:
                analyzedTweets.append(tweet)
        else:
            analyzedTweets.append(tweet)

        t1 = datetime.now()
        print tweet.id + " (" + str(t1 - t0) + ")"

    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")

    return analyzedTweets

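# A minimal usage sketch for processTweets. The resource paths and the tweet
# loader (loadTweetsFromCsv) are illustrative assumptions, not part of this
# module; substitute the project's actual file locations and Tweet loader.
#
#   tweets = loadTweetsFromCsv("./tweets.csv")      # hypothetical loader
#   analyzed = processTweets("./politicians.txt",
#                            "./sentiTokens.txt",
#                            "./exceptSentiTokens.txt",
#                            "./multiWords.txt",
#                            tweets)
#   for t in analyzed:
#       print t.id, t.polarity                      # -1, 0 or 1
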
def processTweets_old(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, tweets):
    """
    Legacy version of processTweets, kept for reference.

    Processes a list of tweets:
    1. Identify the target
    2. If the target is one of the politicians, infer the comment's polarity

    targetsFile -> path to the politicians list file
    sentiTokensFile -> path to the sentiTokens list file
    exceptSentiTokens -> path to the list of sentiTokens that cannot lose their
                         accents without causing ambiguity, e.g. más -> mas
    multiWordsFile -> path to the multiword expressions file
    tweets -> list of tweets
    """

    print "Loading resources...\nTargets: " + targetsFile

    targets = getFromCache(PERSONS_CACHE)

    if targets is not None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens

    sentiTokens = getFromCache(SENTI_CACHE)

    if sentiTokens is not None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile

    multiWordTokenizer = getFromCache(MULTIWORD_CACHE)

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)

    print "Inferring polarity..."

    naive = Naive(targets, sentiTokens)
    rules = Rules(targets, sentiTokens)

    analyzedTweets = []
    rejectedTweets = []

    for tweet in tweets:
        t0 = datetime.now()

        tweetsWithTarget = naive.inferTarget(tweet)

        if tweetsWithTarget is not None:
            # A tweet can have multiple targets; in that case the message is
            # replicated, one copy per target.
            for targetedTweet in tweetsWithTarget:
                # Try to classify with rules first...
                analyzedTweet = rules.inferPolarity(targetedTweet, False)

                # ...and fall back to the naive classifier when the rules
                # are inconclusive.
                if analyzedTweet.polarity == 0:
                    analyzedTweet = naive.inferPolarity(analyzedTweet, False)

                if analyzedTweet.polarity == 0:
                    # Reject neutral tweets that matched no sentiTokens at all.
                    match = re.search(ur'(\W|^)sentiTokens:(.*?);(\W|$)', analyzedTweet.metadata)
                    sentiTokensInfo = match.group(2) if match is not None else ''

                    if len(sentiTokensInfo.strip(' ')) == 0:
                        rejectedTweets.append(analyzedTweet)
                    else:
                        analyzedTweets.append(analyzedTweet)
                else:
                    analyzedTweets.append(analyzedTweet)

        t1 = datetime.now()
        print tweet.id + " (" + str(t1 - t0) + ")"

    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")

    return analyzedTweets

def getFeatures(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, listOfTweets):
    """
    Builds a feature vector per tweet (rule, clue and sentiToken features plus
    the irony flag) for use by an external classifier.

    Tweets whose features are all zero and whose text matched no sentiTokens
    are rejected instead of being turned into vectors.
    """

    print "Loading resources...\nTargets: " + targetsFile

    # Caching is currently disabled: always load the targets from file.
    targets = None  # getFromCache(PERSONS_CACHE)

    if targets is not None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens

    # Caching is currently disabled: always load the sentiTokens from file.
    sentiTokens = None  # getFromCache(SENTI_CACHE)

    if sentiTokens is not None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile

    # Caching is currently disabled: always rebuild the multiword tokenizer.
    multiWordTokenizer = None  # getFromCache(MULTIWORD_CACHE)

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)

    print "Calculating features..."

    naive = Naive(targets, sentiTokens)
    rules = Rules(targets, sentiTokens)

    analyzedTweets = []
    rejectedTweets = []

    for tweet in listOfTweets:
        t0 = datetime.now()

        rulesFeats = rules.getRulesFeats(tweet, True)
        cluesFeats = rules.getCluesFeats(tweet, True)
        sentiFeats = naive.getSentiFeats(tweet, True)

        featsSum = (int(rulesFeats[0]) + int(rulesFeats[1]) +
                    int(cluesFeats[0]) + int(cluesFeats[1]) +
                    int(sentiFeats[0]) + int(sentiFeats[1]) + int(sentiFeats[2]))

        rejected = False

        if featsSum == 0:
            # When every feature is zero, check whether the tweet matched any
            # sentiTokens at all; if not, reject it.
            tweetTest = naive.inferPolarity(tweet, True)
            match = re.search(ur'(\W|^)sentiTokens:(.*?);(\W|$)', tweetTest.metadata)
            sentiTokensInfo = match.group(2) if match is not None else ''
            rejected = len(sentiTokensInfo.strip(' ')) == 0

        if rejected:
            rejectedTweets.append(tweet)
            print "rejected: "
            print tweet.tostring()
        else:
            # Feature vector: id, rule feats, clue feats, senti feats, irony flag.
            feats = [tweet.id,
                     rulesFeats[0], rulesFeats[1],
                     cluesFeats[0], cluesFeats[1],
                     sentiFeats[0], sentiFeats[1], sentiFeats[2],
                     int(tweet.irony)]
            analyzedTweets.append(feats)

        t1 = datetime.now()
        print tweet.id + " (" + str(t1 - t0) + ")"

    # logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")

    return analyzedTweets

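# A minimal sketch of how the feature vectors returned by getFeatures could be
# dumped to a CSV file for an external classifier. The helper name, file name
# and header row are illustrative assumptions, not part of the original
# pipeline; the column order mirrors the vectors built above.
def writeFeaturesCsv(featureVectors, path="./tweetFeatures.csv"):
    import csv

    with open(path, "wb") as csvFile:  # binary mode for the Python 2 csv module
        writer = csv.writer(csvFile)
        writer.writerow(["id",
                         "rulesFeat0", "rulesFeat1",
                         "cluesFeat0", "cluesFeat1",
                         "sentiFeat0", "sentiFeat1", "sentiFeat2",
                         "irony"])
        for featureVector in featureVectors:
            writer.writerow(featureVector)
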