def extract(self, text):
    emo_list = []
    acr_list = []
    # First pass: collect a sentiment label for every known emoticon.
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(emo)
                self.answer['EMOTICONS'] = emo_list
            except:
                if "@" in word:
                    word = "@user"
    # Second pass: expand acronyms on the lower-cased text.
    text = text.lower()
    for word in twokenize.tokenize(text):
        if word != "":
            word = word.strip()
            try:
                word = self.acronyms[word]
                acr_list.append(word)
                self.answer['EXPANDED ACRONYMS'] = acr_list
            except:
                if "@" in word:
                    word = "@user"
    return str(self.answer)
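# Hypothetical usage of extract(); the class name 'Preprocessor' and the
# sample expansion are assumptions for illustration -- only extract() itself
# is from the source.
#
#   p = Preprocessor()
#   print(p.extract("lol that was great :)"))
#   # e.g. "{'EMOTICONS': ['POSITIVE'], 'EXPANDED ACRONYMS': ['laugh out loud']}"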
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    # Remove URLs.
    line = re.sub(twokenize.Url_RE, " ", text)
    # Strip '#' characters, lower-case, and collapse extra whitespace.
    temp = line.replace("#", " ").lower().split()
    temp = " ".join(temp)
    return temp
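# The split()/join() round-trip above is what collapses runs of whitespace.
# A minimal self-contained sketch of the same normalization; URL_RE below is
# a stand-in for twokenize.Url_RE, which is not reproduced here.
import re

URL_RE = re.compile(r'https?://\S+')  # stand-in pattern (assumption)
sample = "Check this   out http://t.co/abc #cool"
line = URL_RE.sub(" ", sample)  # drop URLs
norm = " ".join(line.replace("#", " ").lower().split())  # collapse whitespace
print(norm)  # "check this out cool"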
def process(self, stopwordsF=0, stemmerF=0, encode=1):
    best = 0.0
    f = open('tumblr.txt', 'r')
    for line in iter(f):
        item = line.rstrip()
        # Remove URLs, strip '#', lower-case, and collapse extra whitespace.
        line = re.sub(twokenize.Url_RE, " ", item)
        temp = " ".join(line.replace("#", " ").lower().split())
        print "TUMBLR POST:"
        print temp
        # Drop stopwords before scoring.
        out = [i for i in temp.split() if i not in stop]
        # Wu-Palmer similarity between every keyword in list1 and every
        # remaining word of the post; print the best score per keyword.
        for word1 in list1:
            for word2 in out:
                wordFromList1 = wordnet.synsets(word1)
                wordFromList2 = wordnet.synsets(word2)
                if wordFromList1 and wordFromList2:
                    s = wordFromList1[0].wup_similarity(wordFromList2[0])
                    list2.append(s)
            print word1
            if list2:
                print max(list2)
                best = max(best, max(list2))
            else:
                print '0.0'
            del list2[:]
    f.close()
    return best
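# The Wu-Palmer scoring used above can be exercised in isolation. A minimal
# self-contained sketch against NLTK's WordNet interface (requires the
# 'wordnet' corpus to be downloaded); the sample words are illustrative:
from nltk.corpus import wordnet

def best_wup(keyword, words):
    # Highest Wu-Palmer similarity between 'keyword' and any word in 'words'.
    scores = []
    for w in words:
        syns1 = wordnet.synsets(keyword)
        syns2 = wordnet.synsets(w)
        if syns1 and syns2:
            s = syns1[0].wup_similarity(syns2[0])
            if s is not None:  # wup_similarity can return None
                scores.append(s)
    return max(scores) if scores else 0.0

print(best_wup('happy', ['glad', 'car']))  # typically a value in (0.0, 1.0]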
def AcronymsOperation(self, text):
    acr_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                word = self.acronyms[word]
                acr_list.append(word)
                self.answer["EXPANDED ACRONYMS"] = acr_list
            except:
                if "@" in word:
                    word = "@user"
    return self.answer
def AcronymsOperation(self, text):
    acr_list = []
    # Lower-case first so lookups match the acronym table keys.
    text = text.lower()
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                word = self.acronyms[word]
                acr_list.append(word)
                self.answer['EXPANDED ACRONYMS'] = acr_list
            except:
                if "@" in word:
                    word = "@user"
    return self.answer
def Smileyoperation(self, text):
    emo_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(emo)
                self.answer["SMILEYS"] = emo_list
            except:
                if "@" in word:
                    word = "@user"
    return self.answer
def ExAcro(self, text):
    text = text.lower()
    store_acronyms = {}
    for word in twokenize.tokenize(text):
        if word != "":
            word = word.strip()
            try:
                # Map each recognized acronym to its expansion.
                word_after = self.acronyms[word]
                store_acronyms[word] = word_after
            except:
                if "@" in word:
                    word = "@user"
    self.answer = store_acronyms
    return str(self.answer)
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    list1 = []
    list2 = []
    # Remove URLs, strip '#', lower-case, and collapse extra whitespace.
    line = re.sub(twokenize.Url_RE, " ", text)
    temp = " ".join(line.replace("#", " ").lower().split())
    for word in twokenize.tokenize(temp):
        if word != " ":
            word = word.strip()
            flagNonDict = 0
            try:
                # Known dictionary word: leave it unchanged.
                if self.wordDict[word] == 1:
                    word = word
            except:
                flagNonDict = 1
            try:
                # Emoticon: record its sentiment label.
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                list1.append(emo)
                self.result['EMOTICONS'] = list1
            except:
                try:
                    # Normalize acronyms.
                    word = self.acronyms[word]
                    list2.append(word)
                    self.result['ACRONYMS'] = list2
                except:
                    if "@" in word:
                        word = "@user"
    return self.result
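# Hypothetical usage of the combined extractor above; the class name
# 'TweetProcessor' is an assumption, and the output is an example shape,
# not real data:
#
#   tp = TweetProcessor()
#   tp.process("omg that rocks :) http://t.co/x @bob")
#   # -> {'EMOTICONS': ['POSITIVE'], 'ACRONYMS': ['oh my god']}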
def ExEmo(self, text):
    emo_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(word)
                emo_list.append(emo)
                #d = dict(itertools.izip_longest(*[iter(emo_list)] * 2, fillvalue=""))
                self.answer['EMOTICONS'] = emo_list
            except:
                if "@" in word:
                    word = "@user"
    return str(self.answer)
def EmoOperation(self, text):
    emo_list = []
    d = {}  # guard against no emoticons being found
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(word)
                emo_list.append(emo)
                # Pair up the flat [emoticon, label, ...] list into a dict.
                d = dict(itertools.izip_longest(*[iter(emo_list)] * 2, fillvalue=""))
            except:
                if "@" in word:
                    word = "@user"
    return d
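# The izip_longest trick above pairs consecutive items of the flat
# [emoticon, label, emoticon, label, ...] list into key/value pairs. A
# self-contained Python 2 sketch of the idiom; the sample data is
# illustrative, not from the emoticon table:
import itertools

flat = [':)', 'POSITIVE', ':(', 'NEGATIVE', ':P']
# Reusing one iterator twice makes izip_longest consume two items per output
# tuple; fillvalue pads a trailing unpaired element.
pairs = dict(itertools.izip_longest(*[iter(flat)] * 2, fillvalue=""))
print(pairs)  # {':)': 'POSITIVE', ':(': 'NEGATIVE', ':P': ''} (order may vary)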
def process(self, text):
    list1 = []
    list2 = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                if self.wordDict[word] == 1:
                    word = word
            except:
                try:
                    score = self.emoticons[word]
                    emo = emoticons.analyze_tweetHeavy(word)
                    list1.append(emo)
                    self.result['EMOTICONS'] = list1
                except:
                    try:
                        #Normalize Acronyms
                        word = self.acronyms[word]
                        list2.append(word)
                        self.result['ACRONYMS'] = list2
                    except:
                        if "@" in word:
                            word = "@user"
    return self.result
def AcroOperation(self, text):
    text = text.lower()
    store_acronyms = {}
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                # Map each recognized acronym to its expansion.
                word_after = self.acronyms[word]
                store_acronyms[word] = word_after
            except:
                if "@" in word:
                    word = "@user"
    self.answer['ACRONYMS'] = store_acronyms
    return store_acronyms
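# A self-contained sketch of the mapping AcroOperation builds; the two-entry
# table and plain split() below are illustrative stand-ins for the repo's
# acronym list and twokenize:
acronyms = {'lol': 'laugh out loud', 'brb': 'be right back'}

def expand_acronyms(text):
    # Collect {acronym: expansion} for every token found in the table.
    store = {}
    for word in text.lower().split():
        if word in acronyms:
            store[word] = acronyms[word]
    return store

print(expand_acronyms("brb lol @bob"))
# -> {'brb': 'be right back', 'lol': 'laugh out loud'}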
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    # Remove URLs.
    line = re.sub(twokenize.Url_RE, " ", text)
    # Strip '#', lower-case, and collapse extra whitespace.
    temp = line.replace("#", " ").lower().split()
    temp = " ".join(temp)
    tempTweet = ""
    for word in twokenize.tokenize(temp):
        if word != " ":
            word = word.strip()
            flagNonDict = 0
            try:
                # Check the word dictionary and set the flag.
                if self.wordDict[word] == 1:
                    word = word
            except:
                flagNonDict = 1
            try:
                # Emoticon: replace with its label and score.
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                word = emo + "#(" + str(score) + ")#"
            except:
                try:
                    # Normalize acronyms.
                    word = self.acronyms[word]
                except:
                    try:
                        # Normalize contractions.
                        word = self.contractions[word]
                    except:
                        # Normalize spelling of non-dictionary words.
                        if flagNonDict == 1:
                            if "@" in word:
                                # Remove user mentions.
                                word = "@user"
                            else:
                                corrected = self.spellCheck.correct(word)
                                if corrected != "a":
                                    word = corrected
            try:
                tempTweet = " ".join([tempTweet, word.strip()])
                tempTweet = tempTweet.lower().strip()
            except:
                # Fall back for bytes that are not valid UTF-8.
                tempTweet = " ".join([tempTweet, word.strip().decode("iso-8859-1")])
                tempTweet = tempTweet.lower().strip()
    if stemmerF == 1 and stopwordsF == 1:
        tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
    elif stemmerF == 1:
        tempTweet = " ".join(stemmer.stem(w.strip()) for w in tempTweet.split(" "))
    elif stopwordsF == 1:
        tempTweet = " ".join(w for w in tempTweet.split(" ") if w.strip() not in self.stop)
    if encode == 0:
        return tempTweet
    return tempTweet.encode("utf-8")
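# Hypothetical call showing how the flags combine in the pipeline above; the
# class name 'Preprocess' and the sketched output are assumptions:
#
#   p = Preprocess()
#   p.process("cant wait lol :) http://t.co/x @bob", stopwordsF=1, encode=0)
#   # URL removed, '@bob' -> '@user', 'lol' expanded via the acronym table,
#   # ':)' -> e.g. 'POSITIVE#(1.0)#', and stopwords dropped from the result.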