def extract(self, text):
    """Scan *text* for emoticons and acronyms, recording matches.

    Matches accumulate in ``self.answer`` under ``'EMOTICONS'`` and
    ``'EXPANDED ACRONYMS'`` (each key is created only once at least one
    match exists).  Returns ``str(self.answer)``.
    """
    emo_list = []
    acr_list = []
    # Pass 1: emoticon detection on the raw, case-preserved text.
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                # Lexicon probe: raises KeyError for non-emoticons.
                self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(emo)
                self.answer['EMOTICONS'] = emo_list
            except Exception:  # narrowed from bare except so Ctrl-C still works
                if "@" in word:
                    word = "@user"  # anonymize user mentions
    # Pass 2: acronym expansion on lower-cased text
    # (acronym table keys are presumably lower case -- TODO confirm).
    text = text.lower()
    for word in twokenize.tokenize(text):
        if word != "":
            word = word.strip()
            try:
                word = self.acronyms[word]
                acr_list.append(word)
                self.answer['EXPANDED ACRONYMS'] = acr_list
            except Exception:  # narrowed from bare except
                if "@" in word:
                    word = "@user"
    return str(self.answer)
def Smileyoperation(self, text):
    """Collect emoticon analyses for every emoticon token in *text*.

    Stores the list under ``self.answer["SMILEYS"]`` (key created only
    when at least one emoticon was found) and returns ``self.answer``.

    NOTE(review): an identical ``Smileyoperation`` definition appears
    later in this file and will shadow this one if both live in the
    same class -- confirm which copy is intended and delete the other.
    """
    emo_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                # Probe the emoticon lexicon; KeyError -> not an emoticon.
                self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(emo)
                self.answer["SMILEYS"] = emo_list
            except Exception:  # narrowed from bare except
                if "@" in word:
                    word = "@user"  # anonymize mentions
    return self.answer
def Smileyoperation(self, text):
    """Record the analysis of each emoticon token found in *text*.

    Appends each analysis to ``self.answer['SMILEYS']`` (the key exists
    only when something matched) and returns ``self.answer``.

    NOTE(review): this duplicates an earlier, byte-equivalent
    ``Smileyoperation`` definition -- one of the two should be removed.
    """
    emo_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                self.emoticons[word]  # KeyError means "not an emoticon"
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(emo)
                self.answer['SMILEYS'] = emo_list
            except Exception:  # narrowed from bare except
                if "@" in word:
                    word = "@user"  # anonymize mentions
    return self.answer
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    """Extract emoticons and acronyms from a tweet into ``self.result``.

    URLs are stripped and '#' removed before tokenizing.  Per token:
    known dictionary words are left alone; emoticon matches go to
    ``self.result['EMOTICONS']``; acronym matches to
    ``self.result['ACRONYMS']``; remaining @mentions are anonymized.

    ``stopwordsF``, ``stemmerF`` and ``encode`` are accepted for
    signature compatibility with the other ``process`` variants in this
    file but are never used here.  Returns ``self.result``.
    """
    list1 = []
    list2 = []
    # Remove URLs, drop '#', lower-case, collapse whitespace runs.
    line = re.sub(twokenize.Url_RE, " ", text)
    temp = " ".join(line.replace("#", " ").lower().split())
    for word in twokenize.tokenize(temp):
        if word != " ":
            word = word.strip()
            # Dictionary probe kept for parity with the other variants;
            # the outcome is not used by this variant.
            try:
                if self.wordDict[word] == 1:
                    pass  # known dictionary word; nothing to normalize
            except Exception:
                pass
            try:
                self.emoticons[word]  # KeyError -> not an emoticon
                emo = emoticons.analyze_tweetHeavy(word)
                list1.append(emo)
                self.result['EMOTICONS'] = list1
            except Exception:  # narrowed from bare except
                try:
                    # Normalize acronyms
                    word = self.acronyms[word]
                    list2.append(word)
                    self.result['ACRONYMS'] = list2
                except Exception:
                    if "@" in word:
                        word = "@user"  # anonymize user mentions
    return self.result
def ExEmo(self, text):
    """Record word/analysis pairs for every emoticon token in *text*.

    For each emoticon both the token itself and its analysis are
    appended, so ``self.answer['EMOTICONS']`` alternates
    ``[word, analysis, word, analysis, ...]``.  Returns
    ``str(self.answer)``.
    """
    emo_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                self.emoticons[word]  # KeyError -> not an emoticon
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(word)
                emo_list.append(emo)
                self.answer['EMOTICONS'] = emo_list
            except Exception:  # narrowed from bare except
                if "@" in word:
                    word = "@user"  # anonymize user mentions
    return str(self.answer)
def EmoOperation(self, text):
    """Return a dict mapping each emoticon token in *text* to its analysis.

    Fix: ``d`` is now initialised up front, so input containing no known
    emoticon returns ``{}`` instead of raising NameError (the original
    only bound ``d`` inside the loop's success path).
    """
    emo_list = []
    d = {}
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                self.emoticons[word]  # KeyError -> not an emoticon
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(word)
                emo_list.append(emo)
                # Pair consecutive [word, analysis] entries into a dict.
                # NOTE(review): izip_longest is Python 2 only; under
                # Python 3 this must become itertools.zip_longest.
                d = dict(itertools.izip_longest(*[iter(emo_list)] * 2, fillvalue=""))
            except Exception:  # narrowed from bare except
                if "@" in word:
                    word = "@user"  # anonymize user mentions
    return d
def process(self, text):
    """Classify each token of *text* as dictionary word, emoticon or acronym.

    Tokens present in ``self.wordDict`` are left alone.  Otherwise the
    emoticon lexicon is tried (matches go to
    ``self.result['EMOTICONS']``), then the acronym table
    (``self.result['ACRONYMS']``); remaining @mention tokens are
    anonymized.  Returns ``self.result``.
    """
    list1 = []
    list2 = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                if self.wordDict[word] == 1:
                    pass  # known dictionary word; leave untouched
            except Exception:  # narrowed from bare except
                try:
                    self.emoticons[word]  # KeyError -> not an emoticon
                    emo = emoticons.analyze_tweetHeavy(word)
                    list1.append(emo)
                    self.result['EMOTICONS'] = list1
                except Exception:
                    try:
                        # Normalize acronyms
                        word = self.acronyms[word]
                        list2.append(word)
                        self.result['ACRONYMS'] = list2
                    except Exception:
                        if "@" in word:
                            word = "@user"  # anonymize user mentions
    return self.result
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    """Full tweet-normalization pipeline.

    Steps: strip URLs and '#', lower-case and collapse whitespace; then
    per token: keep dictionary words as-is, replace emoticons with
    ``analysis#(score)#``, expand acronyms and contractions, anonymize
    @mentions, and spell-correct remaining non-dictionary words.
    Afterwards optionally drop stop words (``stopwordsF=1``) and stem
    (``stemmerF=1``).  Returns the cleaned tweet, UTF-8 encoded unless
    ``encode == 0``.

    NOTE(review): a byte-identical ``process`` definition appears later
    in this file; if both live in the same class the later one shadows
    this one.
    """
    # remove URLs
    line = re.sub(twokenize.Url_RE, " ", text)
    # drop '#', lower-case, strip extra whitespace
    temp = " ".join(line.replace("#", " ").lower().split())
    tempTweet = ""
    for word in twokenize.tokenize(temp):
        if word != " ":
            word = word.strip()
            flagNonDict = 0
            # Check the dictionary; set the flag for unknown words so the
            # spell-checker only runs on tokens that need it.
            try:
                if self.wordDict[word] == 1:
                    pass  # dictionary word, keep as-is
            except Exception:  # narrowed from bare except
                flagNonDict = 1
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                word = emo + "#(" + str(score) + ")#"
            except Exception:
                try:
                    # Normalize acronyms
                    word = self.acronyms[word]
                except Exception:
                    try:
                        # Normalize contractions
                        word = self.contractions[word]
                    except Exception:
                        # Normalize spelling of non-dictionary words only.
                        if flagNonDict == 1:
                            if "@" in word:
                                # remove user mentions
                                word = "@user"
                            else:
                                corrected = self.spellCheck.correct(word)
                                if corrected != "a":
                                    word = corrected
            try:
                tempTweet = " ".join([tempTweet, word.strip()])
                tempTweet = tempTweet.lower().strip()
            except Exception:
                # Python 2 fallback: a mixed str/unicode join failed, so
                # decode the token before joining -- TODO confirm this
                # path is still needed on the target interpreter.
                tempTweet = " ".join([tempTweet, word.strip().decode("iso-8859-1")])
                tempTweet = tempTweet.lower().strip()
    if stemmerF == 1 and stopwordsF == 1:
        tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
    elif stemmerF == 1:
        tempTweet = " ".join(stemmer.stem(w.strip()) for w in tempTweet.split(" "))
    elif stopwordsF == 1:
        tempTweet = " ".join(w for w in tempTweet.split(" ") if w.strip() not in self.stop)
    if encode == 0:
        return tempTweet
    return tempTweet.encode("utf-8")
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    """Normalize a tweet: URLs, emoticons, acronyms, contractions, spelling.

    Pipeline: remove URLs and '#', lower-case; then per token keep
    dictionary words, turn emoticons into ``analysis#(score)#``, expand
    acronyms/contractions, anonymize @mentions and spell-correct other
    unknown words.  Optional stop-word removal (``stopwordsF=1``) and
    stemming (``stemmerF=1``) run afterwards.  Returns the cleaned text,
    UTF-8 encoded unless ``encode == 0``.

    NOTE(review): this duplicates the earlier ``process`` definition in
    this file -- one of the two copies should be removed.
    """
    # remove URLs
    line = re.sub(twokenize.Url_RE, " ", text)
    # drop '#', lower-case, strip extra whitespace
    temp = " ".join(line.replace("#", " ").lower().split())
    tempTweet = ""
    for word in twokenize.tokenize(temp):
        if word != " ":
            word = word.strip()
            flagNonDict = 0
            # Dictionary check sets the flag for unknown words so spell
            # correction only runs on tokens that need it.
            try:
                if self.wordDict[word] == 1:
                    pass  # known dictionary word, keep as-is
            except Exception:  # narrowed from bare except
                flagNonDict = 1
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                word = emo + "#(" + str(score) + ")#"
            except Exception:
                try:
                    # Normalize acronyms
                    word = self.acronyms[word]
                except Exception:
                    try:
                        # Normalize contractions
                        word = self.contractions[word]
                    except Exception:
                        # Spell-correct only non-dictionary words.
                        if flagNonDict == 1:
                            if "@" in word:
                                # remove user mentions
                                word = "@user"
                            else:
                                corrected = self.spellCheck.correct(word)
                                if corrected != "a":
                                    word = corrected
            try:
                tempTweet = " ".join([tempTweet, word.strip()])
                tempTweet = tempTweet.lower().strip()
            except Exception:
                # Python 2 fallback for mixed str/unicode joins -- TODO
                # confirm this path is still reachable on the target
                # interpreter.
                tempTweet = " ".join([tempTweet, word.strip().decode("iso-8859-1")])
                tempTweet = tempTweet.lower().strip()
    if stemmerF == 1 and stopwordsF == 1:
        tempTweet = " ".join(
            stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
    elif stemmerF == 1:
        tempTweet = " ".join(
            stemmer.stem(w.strip()) for w in tempTweet.split(" "))
    elif stopwordsF == 1:
        tempTweet = " ".join(
            w for w in tempTweet.split(" ") if w.strip() not in self.stop)
    if encode == 0:
        return tempTweet
    return tempTweet.encode("utf-8")