# -*- coding: utf-8 -*-
import re
from datetime import datetime

# Project modules. The exact import paths below are assumed from the calls in
# this file; adjust them to the actual package layout. The cache constants
# (PERSONS_CACHE, SENTI_CACHE, MULTIWORD_CACHE and their WIN_* variants), the
# debug flag and the helpers logClassifiedTweets/writeResults are defined
# elsewhere in the project.
import EuroOpinionizers
import Persons
import SentiTokens
import Utils
from EuroOpinionizers import MultiWordHandler, MultiWordHelper, Naive, Rules, TargetDetector
from Utils import getFromCache, putInCache


def getMultiWordsTokenizer(politicians, sentiTokens):
    """ Builds a multiword tokenizer seeded with the politicians' names
        and the multiword sentiTokens. """

    multiWordsFile = "../Resources/multiwords.txt"

    multiWordTokenizer = EuroOpinionizers.MultiWordHandler(multiWordsFile)
    multiWordTokenizer.addMultiWords(Persons.getMultiWords(politicians))
    multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))

    return multiWordTokenizer
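
# Usage sketch for the tokenizer above. The resource paths are hypothetical
# placeholders; the loadPoliticians/loadSentiTokens signatures are taken from
# processTweets below. Illustrative only, not the module's entry point.
def _demoMultiWordTokenizer():
    politicians = Persons.loadPoliticians("../Resources/politicians.txt")  # hypothetical path
    sentiTokens = SentiTokens.loadSentiTokens("../Resources/sentiTokens.txt",   # hypothetical path
                                              "../Resources/exceptTokens.txt")  # hypothetical path
    tokenizer = getMultiWordsTokenizer(politicians, sentiTokens)
    # multiword expressions such as "primeiro ministro" come back as one token
    print tokenizer.tokenizeMultiWords(u"o primeiro ministro respondeu")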

def processTweetsByTarget(politiciansFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, tweets):
    """ Processes a list of tweets:
        1. Identify the target
        2. If the target is one of the politicians, infer the comment's polarity

        politiciansFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose
                             their accents without causing ambiguity, e.g. más -> mas
        multiWordsFile -> path to a file with the words that should be treated
                          as a unit, e.g. "primeiro ministro"
        tweets -> list of tweets

        Returns a dictionary {target: list of classified tweets}.
    """

    print "Loading resources...\nPoliticians: " + politiciansFile
    politicians = Persons.loadPoliticians(politiciansFile)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)

    print "Multiword Tokenizer: " + multiWordsFile
    multiWordTokenizer = MultiWordHandler(multiWordsFile)
    multiWordTokenizer.addMultiWords(Persons.getMultiWords(politicians))
    multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))

    naive = Naive(politicians, sentiTokens)

    targetedTweets = {}
    classifiedTweets = {}

    # First step: infer targets and build a dictionary {target: listOfTweets}
    print "Identifying targets..."

    for tweet in tweets:
        tweetsWithTarget = naive.inferTarget(tweet)

        if tweetsWithTarget is not None:
            # a tweet can have multiple targets (the message is replicated for each)
            for targetedTweet in tweetsWithTarget:
                if targetedTweet.target not in targetedTweets:
                    targetedTweets[targetedTweet.target] = []

                targetedTweet.taggedSentence = multiWordTokenizer.tokenizeMultiWords(targetedTweet.sentence)
                targetedTweets[targetedTweet.target].append(targetedTweet)

    print len(targetedTweets), " targets identified! Inferring polarity..."

    rules = Rules(politicians, sentiTokens)

    # Second step: infer polarity
    for target, tweetList in targetedTweets.items():
        for tweet in tweetList:
            if target not in classifiedTweets:
                classifiedTweets[target] = []

            # try to classify with rules...
            classifiedTweet = rules.inferPolarity(tweet, True)

            # ...if not possible use the naive classifier
            if classifiedTweet.polarity == 0:
                classifiedTweet = naive.inferPolarity(classifiedTweet, True)

            classifiedTweets[target].append(classifiedTweet)

    return classifiedTweets
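
# Usage sketch: iterate the {target: tweets} dictionary returned by
# processTweetsByTarget. Resource paths are hypothetical placeholders;
# the caller supplies the list of tweet objects.
def _demoProcessTweetsByTarget(tweets):
    classified = processTweetsByTarget("../Resources/politicians.txt",   # hypothetical path
                                       "../Resources/sentiTokens.txt",   # hypothetical path
                                       "../Resources/exceptTokens.txt",  # hypothetical path
                                       "../Resources/multiwords.txt",    # hypothetical path
                                       tweets)
    for target, tweetList in classified.items():
        for tweet in tweetList:
            print target, tweet.polarity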

def processTweets_old(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, tweets):
    """ Legacy version. Processes a list of tweets:
        1. Identify the target
        2. If the target is one of the politicians, infer the comment's polarity

        targetsFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose
                             their accents without causing ambiguity, e.g. más -> mas
        multiWordsFile -> path to a file with the words that should be treated
                          as a unit, e.g. "primeiro ministro"
        tweets -> list of tweets
    """

    print "Loading resources...\nTargets: " + targetsFile
    targets = getFromCache(PERSONS_CACHE)

    if targets is not None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens
    sentiTokens = getFromCache(SENTI_CACHE)

    if sentiTokens is not None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile
    multiWordTokenizer = getFromCache(MULTIWORD_CACHE)

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)

    print "Inferring polarity..."
    naive = Naive(targets, sentiTokens)
    rules = Rules(targets, sentiTokens)

    analyzedTweets = []
    rejectedTweets = []

    for tweet in tweets:
        t0 = datetime.now()
        tweetsWithTarget = naive.inferTarget(tweet)

        if tweetsWithTarget is not None:
            # a tweet can have multiple targets (the message is replicated for each)
            for targetedTweet in tweetsWithTarget:
                # try to classify with rules...
                analyzedTweet = rules.inferPolarity(targetedTweet, False)

                # ...if not possible use the naive classifier
                if analyzedTweet.polarity == 0:
                    analyzedTweet = naive.inferPolarity(analyzedTweet, False)

                if analyzedTweet.polarity == 0:
                    # check whether any sentiTokens were matched at all
                    regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'
                    match = re.search(regex, analyzedTweet.metadata).group(2)
                    print "match: ", match

                    if len(match.strip(' ')) == 0:
                        rejectedTweets.append(analyzedTweet)
                    else:
                        analyzedTweets.append(analyzedTweet)
                else:
                    analyzedTweets.append(analyzedTweet)

        t1 = datetime.now()
        print tweet.id + " (" + str(t1 - t0) + ")"

    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")

    return analyzedTweets

def processTweetsByScores(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, tweets):
    """ Processes a list of tweets. For each tweet the rules, clues and
        sentiTokens scores are summed into a single score whose sign gives
        the polarity (1, -1 or 0).

        targetsFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose
                             their accents without causing ambiguity, e.g. más -> mas
        multiWordsFile -> path to a file with the words that should be treated
                          as a unit, e.g. "primeiro ministro"
        tweets -> list of tweets
    """

    print "Loading resources...\nTargets: " + targetsFile
    targets = None  # getFromCache(WIN_PERSONS_CACHE) -- cache lookup disabled

    if targets is not None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, WIN_PERSONS_CACHE)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens
    sentiTokens = None  # getFromCache(WIN_SENTI_CACHE) -- cache lookup disabled

    if sentiTokens is not None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        putInCache(sentiTokens, WIN_SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile
    multiWordTokenizer = None  # getFromCache(WIN_MULTIWORD_CACHE) -- cache lookup disabled

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, WIN_MULTIWORD_CACHE)

    print "Inferring polarity..."
    naive = Naive(targets, sentiTokens)
    rules = Rules(targets, sentiTokens)

    analyzedTweets = []
    rejectedTweets = []

    for tweet in tweets:
        t0 = datetime.now()

        rulesScore, rulesInfo = rules.getRulesScore(tweet, True)
        cluesScore, clueInfo = rules.getCluesScore(tweet, True)
        sentiScore, sentiInfo = naive.getSentiScore(tweet, True)

        tweetScore = int(sentiScore) + int(rulesScore) + int(cluesScore)

        if tweetScore > 0:
            tweet.polarity = 1
        elif tweetScore < 0:
            tweet.polarity = -1
        else:
            tweet.polarity = 0

        tweet.metadata = sentiInfo + ";" + clueInfo + ";" + rulesInfo

        if tweet.polarity == 0:
            # reject tweets for which no sentiTokens were matched at all
            regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'
            match = re.search(regex, tweet.metadata).group(2)

            if len(match.strip(' ')) == 0:
                rejectedTweets.append(tweet)
            else:
                analyzedTweets.append(tweet)
        else:
            analyzedTweets.append(tweet)

        t1 = datetime.now()
        print tweet.id + " (" + str(t1 - t0) + ")"

    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")

    return analyzedTweets
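
# The polarity assignment above is just the sign of the combined score.
# A minimal standalone sketch of that rule (pure Python, no project modules):
def _signOfScore(sentiScore, rulesScore, cluesScore):
    total = int(sentiScore) + int(rulesScore) + int(cluesScore)
    if total > 0:
        return 1
    elif total < 0:
        return -1
    return 0
# e.g. _signOfScore(2, -1, 0) == 1 while _signOfScore(1, -1, 0) == 0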

def getFeatures(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, listOfTweets):
    """ Computes, for each tweet, a feature vector
        [id, rules x2, clues x2, senti x3, irony] for use by an external
        classifier. Tweets where no feature fired and no sentiTokens were
        matched are rejected. """

    print "Loading resources...\nTargets: " + targetsFile
    targets = None  # getFromCache(PERSONS_CACHE) -- cache lookup disabled

    if targets is not None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens
    sentiTokens = None  # getFromCache(SENTI_CACHE) -- cache lookup disabled

    if sentiTokens is not None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile
    multiWordTokenizer = None  # getFromCache(MULTIWORD_CACHE) -- cache lookup disabled

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)

    print "Calculating features..."
    naive = Naive(targets, sentiTokens)
    rules = Rules(targets, sentiTokens)

    analyzedTweets = []
    rejectedTweets = []

    for tweet in listOfTweets:
        t0 = datetime.now()

        rulesFeats = rules.getRulesFeats(tweet, True)
        cluesFeats = rules.getCluesFeats(tweet, True)
        sentiFeats = naive.getSentiFeats(tweet, True)

        featSum = int(rulesFeats[0]) + int(rulesFeats[1]) + \
                  int(cluesFeats[0]) + int(cluesFeats[1]) + \
                  int(sentiFeats[0]) + int(sentiFeats[1]) + int(sentiFeats[2])

        feats = [tweet.id,
                 rulesFeats[0], rulesFeats[1],
                 cluesFeats[0], cluesFeats[1],
                 sentiFeats[0], sentiFeats[1], sentiFeats[2],
                 int(tweet.irony)]

        if featSum == 0:
            # no feature fired: reject unless some sentiToken was matched
            tweetTest = naive.inferPolarity(tweet, True)
            regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'
            match = re.search(regex, tweetTest.metadata).group(2)

            if len(match.strip(' ')) == 0:
                rejectedTweets.append(tweet)
                print "rejected: "
                print tweet.tostring()
            else:
                analyzedTweets.append(feats)
        else:
            analyzedTweets.append(feats)

        t1 = datetime.now()
        print tweet.id + " (" + str(t1 - t0) + ")"

    #logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")

    return analyzedTweets
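
# Hypothetical follow-up for getFeatures: dump the feature vectors to a CSV
# file for an external classifier. _writeFeaturesCsv, the output path and the
# header names are illustrative assumptions, not part of the original pipeline.
def _writeFeaturesCsv(featureRows, path):
    import csv
    with open(path, "wb") as out:  # "wb": the Python 2 csv module writes bytes
        writer = csv.writer(out)
        writer.writerow(["id", "rules1", "rules2", "clues1", "clues2",
                         "senti1", "senti2", "senti3", "irony"])
        for row in featureRows:
            writer.writerow(row)
# e.g. _writeFeaturesCsv(featureRows, "./features.csv")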

def processTweets(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, tweets):
    """ Processes a list of tweets, for each:
        1. Identifies the target
        2. If the message contains a target of interest, infers the polarity

        targetsFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose
                             their accents without causing ambiguity, e.g. más -> mas
        multiWordsFile -> path to a file with the words that should be treated
                          as a unit, e.g. "primeiro ministro"
        tweets -> list of tweets
    """

    print "Loading resources...\nTargets: " + targetsFile
    targets = Utils.getFromCache(PERSONS_CACHE)

    if targets is not None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        Utils.putInCache(targets, PERSONS_CACHE)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens
    sentiTokens = Utils.getFromCache(SENTI_CACHE)

    if sentiTokens is not None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        Utils.putInCache(sentiTokens, SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile
    multiWordTokenizer = Utils.getFromCache(MULTIWORD_CACHE)

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHelper(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        Utils.putInCache(multiWordTokenizer, MULTIWORD_CACHE)

    print "Resources loaded! Starting analysis..."

    targetDetector = TargetDetector(targets)
    # TODO: these classifiers no longer need to receive the targets
    naive = Naive(sentiTokens)
    rules = Rules(None, sentiTokens)

    analyzedTweets = []
    rejectedTweets = []

    for tweet in tweets:
        t0 = datetime.now()
        tweetsWithTarget = targetDetector.inferTarget(tweet)

        if tweetsWithTarget is not None:
            # a tweet can have multiple targets (the message is replicated for each)
            for targetedTweet in tweetsWithTarget:
                # try to classify with rules...
                analyzedTweet = rules.inferPolarity(targetedTweet, False)

                # ...if not possible use the naive classifier
                if analyzedTweet.polarity == 0:
                    analyzedTweet = naive.inferPolarity(analyzedTweet, False)

                # If the polarity is still 0 it can mean:
                # 1) the polarities of the matched sentiTokens sum to 0, or
                # 2) there was no usable evidence to assess the sentiment
                if analyzedTweet.polarity == 0:
                    # try to find evidence of matched sentiTokens
                    regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'
                    match = re.search(regex, analyzedTweet.metadata).group(2)

                    if debug:
                        print "match: ", match

                    if len(match.strip(' ')) == 0:
                        rejectedTweets.append(analyzedTweet)
                    else:
                        analyzedTweets.append(analyzedTweet)
                else:
                    analyzedTweets.append(analyzedTweet)

        t1 = datetime.now()
        print tweet.id + " (" + str(t1 - t0) + ")"

    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")

    return analyzedTweets
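
# Minimal usage sketch for the pipeline above. Resource paths are hypothetical
# placeholders; the caller supplies the list of tweet objects. processTweets
# returns a flat list (one entry per detected target), unlike the
# {target: tweets} dictionary of processTweetsByTarget.
def _demoProcessTweets(tweets):
    analyzed = processTweets("../Resources/politicians.txt",   # hypothetical path
                             "../Resources/sentiTokens.txt",   # hypothetical path
                             "../Resources/exceptTokens.txt",  # hypothetical path
                             "../Resources/multiwords.txt",    # hypothetical path
                             tweets)
    for tweet in analyzed:
        print tweet.id, tweet.target, tweet.polarity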

def processComments(sentiTokensFile, exceptSentiTokens, multiWordsFile, messages):
    """ Counts, over a list of messages, how often each positive and each
        negative sentiToken is matched, periodically dumping the counts to CSV.

        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose
                             their accents without causing ambiguity, e.g. más -> mas
        multiWordsFile -> path to a file with the words that should be treated
                          as a unit, e.g. "primeiro ministro"
        messages -> list of messages to analyze
    """

    if debug:
        print "DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG \n\n"

    print "Loading resources..."
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens

    sentiTokens = Utils.getFromCache(SENTI_CACHE)

    if sentiTokens is not None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        Utils.putInCache(sentiTokens, SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile
    multiWordTokenizer = Utils.getFromCache(MULTIWORD_CACHE)

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHelper(multiWordsFile)
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        Utils.putInCache(multiWordTokenizer, MULTIWORD_CACHE)

    print "Resources loaded! Starting analysis..."

    naive = Naive(sentiTokens)
    #rules = Rules(None, sentiTokens)

    rows = 0
    positiveTokens = {}
    negativeTokens = {}

    for message in messages:
        rows += 1
        t0 = datetime.now()

        # tokens[0] holds the positive matches, tokens[1] the negative ones
        tokens = naive.tokenizeSentiTokens(message, True)

        for token in tokens[0]:
            if token not in positiveTokens:
                positiveTokens[token] = 1
            else:
                positiveTokens[token] += 1

        for token in tokens[1]:
            if token not in negativeTokens:
                negativeTokens[token] = 1
            else:
                negativeTokens[token] += 1

        # dump intermediate counts every 1000 messages
        if rows % 1000 == 0:
            writeResults(positiveTokens, "./positive" + str(rows) + ".csv")
            writeResults(negativeTokens, "./negative" + str(rows) + ".csv")

        if debug:
            t1 = datetime.now()
            print "Time: " + str(t1 - t0)
            print message.sentence
            print "positive: ", tokens[0]
            print "negative: ", tokens[1]
            print "\n------------------\n"

    writeResults(positiveTokens, "./positive.csv")
    writeResults(negativeTokens, "./negative.csv")
    print "done!"
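
# Design note: the manual dictionary counting in processComments is equivalent
# to collections.Counter (Python 2.7+), shown here as a standalone sketch over
# (positiveTokens, negativeTokens) pairs as returned by tokenizeSentiTokens:
def _countTokens(tokenPairs):
    from collections import Counter
    positive = Counter()
    negative = Counter()
    for posTokens, negTokens in tokenPairs:
        positive.update(posTokens)
        negative.update(negTokens)
    return positive, negative
# e.g. _countTokens([([u"bom"], [u"mau", u"mau"])]) counts bom -> 1, mau -> 2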