def extract(self, text):
    emo_list = []
    acr_list = []
    # First pass: collect a sentiment label for every known emoticon.
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(emo)
                self.answer['EMOTICONS'] = emo_list
            except:
                if "@" in word:
                    word = "@user"
    # Second pass: expand acronyms on the lower-cased text.
    text = text.lower()
    for word in twokenize.tokenize(text):
        if word != "":
            word = word.strip()
            try:
                word = self.acronyms[word]
                acr_list.append(word)
                self.answer['EXPANDED ACRONYMS'] = acr_list
            except:
                if "@" in word:
                    word = "@user"
    return str(self.answer)
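# Hypothetical usage of extract(); the class name 'Preprocessor' and the
# sample expansion are assumptions for illustration -- only extract() itself
# is from the source.
#
#   p = Preprocessor()
#   print(p.extract("lol that was great :)"))
#   # e.g. "{'EMOTICONS': ['POSITIVE'], 'EXPANDED ACRONYMS': ['laugh out loud']}"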
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    # Remove URLs.
    line = re.sub(twokenize.Url_RE, " ", text)
    # Strip '#' characters, lower-case, and collapse extra whitespace.
    temp = line.replace("#", " ").lower().split()
    temp = " ".join(temp)
    return temp
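# The split()/join() round-trip above is what collapses runs of whitespace.
# A minimal self-contained sketch of the same normalization; URL_RE below is
# a stand-in for twokenize.Url_RE, which is not reproduced here.
import re

URL_RE = re.compile(r'https?://\S+')  # stand-in pattern (assumption)
sample = "Check this   out http://t.co/abc #cool"
line = URL_RE.sub(" ", sample)  # drop URLs
norm = " ".join(line.replace("#", " ").lower().split())  # collapse whitespace
print(norm)  # "check this out cool"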
def process(self, stopwordsF=0, stemmerF=0, encode=1):
    best = 0.0
    f = open('tumblr.txt', 'r')
    for line in iter(f):
        item = line.rstrip()
        # Remove URLs, strip '#', lower-case, and collapse extra whitespace.
        line = re.sub(twokenize.Url_RE, " ", item)
        temp = " ".join(line.replace("#", " ").lower().split())
        print "TUMBLR POST:"
        print temp
        # Drop stopwords before scoring.
        out = [i for i in temp.split() if i not in stop]
        # Wu-Palmer similarity between every keyword in list1 and every
        # remaining word of the post; print the best score per keyword.
        for word1 in list1:
            for word2 in out:
                wordFromList1 = wordnet.synsets(word1)
                wordFromList2 = wordnet.synsets(word2)
                if wordFromList1 and wordFromList2:
                    s = wordFromList1[0].wup_similarity(wordFromList2[0])
                    list2.append(s)
            print word1
            if list2:
                print max(list2)
                best = max(best, max(list2))
            else:
                print '0.0'
            del list2[:]
    f.close()
    return best
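# The Wu-Palmer scoring used above can be exercised in isolation. A minimal
# self-contained sketch against NLTK's WordNet interface (requires the
# 'wordnet' corpus to be downloaded); the sample words are illustrative:
from nltk.corpus import wordnet

def best_wup(keyword, words):
    # Highest Wu-Palmer similarity between 'keyword' and any word in 'words'.
    scores = []
    for w in words:
        syns1 = wordnet.synsets(keyword)
        syns2 = wordnet.synsets(w)
        if syns1 and syns2:
            s = syns1[0].wup_similarity(syns2[0])
            if s is not None:  # wup_similarity can return None
                scores.append(s)
    return max(scores) if scores else 0.0

print(best_wup('happy', ['glad', 'car']))  # typically a value in (0.0, 1.0]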
def AcronymsOperation(self, text):
    acr_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                word = self.acronyms[word]
                acr_list.append(word)
                self.answer["EXPANDED ACRONYMS"] = acr_list
            except:
                if "@" in word:
                    word = "@user"
    return self.answer
def AcronymsOperation(self, text):
    acr_list = []
    # Lower-case first so lookups match the acronym table keys.
    text = text.lower()
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                word = self.acronyms[word]
                acr_list.append(word)
                self.answer['EXPANDED ACRONYMS'] = acr_list
            except:
                if "@" in word:
                    word = "@user"
    return self.answer
def Smileyoperation(self, text):
    emo_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(emo)
                self.answer["SMILEYS"] = emo_list
            except:
                if "@" in word:
                    word = "@user"
    return self.answer
def ExAcro(self, text):
    text = text.lower()
    store_acronyms = {}
    for word in twokenize.tokenize(text):
        if word != "":
            word = word.strip()
            try:
                # Map each recognized acronym to its expansion.
                word_after = self.acronyms[word]
                store_acronyms[word] = word_after
            except:
                if "@" in word:
                    word = "@user"
    self.answer = store_acronyms
    return str(self.answer)
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    list1 = []
    list2 = []
    # Remove URLs, strip '#', lower-case, and collapse extra whitespace.
    line = re.sub(twokenize.Url_RE, " ", text)
    temp = " ".join(line.replace("#", " ").lower().split())
    for word in twokenize.tokenize(temp):
        if word != " ":
            word = word.strip()
            flagNonDict = 0
            try:
                # Known dictionary word: leave it unchanged.
                if self.wordDict[word] == 1:
                    word = word
            except:
                flagNonDict = 1
            try:
                # Emoticon: record its sentiment label.
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                list1.append(emo)
                self.result['EMOTICONS'] = list1
            except:
                try:
                    # Normalize acronyms.
                    word = self.acronyms[word]
                    list2.append(word)
                    self.result['ACRONYMS'] = list2
                except:
                    if "@" in word:
                        word = "@user"
    return self.result
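# Hypothetical usage of the combined extractor above; the class name
# 'TweetProcessor' is an assumption, and the output is an example shape,
# not real data:
#
#   tp = TweetProcessor()
#   tp.process("omg that rocks :) http://t.co/x @bob")
#   # -> {'EMOTICONS': ['POSITIVE'], 'ACRONYMS': ['oh my god']}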
def ExEmo(self, text):
    emo_list = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(word)
                emo_list.append(emo)
                #d = dict(itertools.izip_longest(*[iter(emo_list)] * 2, fillvalue=""))
                self.answer['EMOTICONS'] = emo_list
            except:
                if "@" in word:
                    word = "@user"
    return str(self.answer)
def EmoOperation(self, text):
    emo_list = []
    d = {}  # guard against no emoticons being found
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                emo_list.append(word)
                emo_list.append(emo)
                # Pair up the flat [emoticon, label, ...] list into a dict.
                d = dict(itertools.izip_longest(*[iter(emo_list)] * 2, fillvalue=""))
            except:
                if "@" in word:
                    word = "@user"
    return d
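# The izip_longest trick above pairs consecutive items of the flat
# [emoticon, label, emoticon, label, ...] list into key/value pairs. A
# self-contained Python 2 sketch of the idiom; the sample data is
# illustrative, not from the emoticon table:
import itertools

flat = [':)', 'POSITIVE', ':(', 'NEGATIVE', ':P']
# Reusing one iterator twice makes izip_longest consume two items per output
# tuple; fillvalue pads a trailing unpaired element.
pairs = dict(itertools.izip_longest(*[iter(flat)] * 2, fillvalue=""))
print(pairs)  # {':)': 'POSITIVE', ':(': 'NEGATIVE', ':P': ''} (order may vary)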
def process(self, text):
    list1 = []
    list2 = []
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                if self.wordDict[word] == 1:
                    word = word
            except:
                try:
                    score = self.emoticons[word]
                    emo = emoticons.analyze_tweetHeavy(word)
                    list1.append(emo)
                    self.result['EMOTICONS'] = list1
                except:
                    try:
                        #Normalize Acronyms
                        word = self.acronyms[word]
                        list2.append(word)
                        self.result['ACRONYMS'] = list2
                    except:
                        if "@" in word:
                            word = "@user"
    return self.result
def AcroOperation(self, text):
    text = text.lower()
    store_acronyms = {}
    for word in twokenize.tokenize(text):
        if word != " ":
            word = word.strip()
            try:
                # Map each recognized acronym to its expansion.
                word_after = self.acronyms[word]
                store_acronyms[word] = word_after
            except:
                if "@" in word:
                    word = "@user"
    self.answer['ACRONYMS'] = store_acronyms
    return store_acronyms
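# A self-contained sketch of the mapping AcroOperation builds; the two-entry
# table and plain split() below are illustrative stand-ins for the repo's
# acronym list and twokenize:
acronyms = {'lol': 'laugh out loud', 'brb': 'be right back'}

def expand_acronyms(text):
    # Collect {acronym: expansion} for every token found in the table.
    store = {}
    for word in text.lower().split():
        if word in acronyms:
            store[word] = acronyms[word]
    return store

print(expand_acronyms("brb lol @bob"))
# -> {'brb': 'be right back', 'lol': 'laugh out loud'}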
def process(self, text, stopwordsF=0, stemmerF=0, encode=1):
    # Remove URLs.
    line = re.sub(twokenize.Url_RE, " ", text)
    # Strip '#', lower-case, and collapse extra whitespace.
    temp = line.replace("#", " ").lower().split()
    temp = " ".join(temp)
    tempTweet = ""
    for word in twokenize.tokenize(temp):
        if word != " ":
            word = word.strip()
            flagNonDict = 0
            try:
                # Check the word dictionary and set the flag.
                if self.wordDict[word] == 1:
                    word = word
            except:
                flagNonDict = 1
            try:
                # Emoticon: replace with its label and score.
                score = self.emoticons[word]
                emo = emoticons.analyze_tweetHeavy(word)
                word = emo + "#(" + str(score) + ")#"
            except:
                try:
                    # Normalize acronyms.
                    word = self.acronyms[word]
                except:
                    try:
                        # Normalize contractions.
                        word = self.contractions[word]
                    except:
                        # Normalize spelling of non-dictionary words.
                        if flagNonDict == 1:
                            if "@" in word:
                                # Remove user mentions.
                                word = "@user"
                            else:
                                corrected = self.spellCheck.correct(word)
                                if corrected != "a":
                                    word = corrected
            try:
                tempTweet = " ".join([tempTweet, word.strip()])
                tempTweet = tempTweet.lower().strip()
            except:
                # Fall back for bytes that are not valid UTF-8.
                tempTweet = " ".join([tempTweet, word.strip().decode("iso-8859-1")])
                tempTweet = tempTweet.lower().strip()
    if stemmerF == 1 and stopwordsF == 1:
        tempTweet = " ".join(stemmer.stem(w) for w in tempTweet.split(" ") if w not in self.stop)
    elif stemmerF == 1:
        tempTweet = " ".join(stemmer.stem(w.strip()) for w in tempTweet.split(" "))
    elif stopwordsF == 1:
        tempTweet = " ".join(w for w in tempTweet.split(" ") if w.strip() not in self.stop)
    if encode == 0:
        return tempTweet
    return tempTweet.encode("utf-8")
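# Hypothetical call showing how the flags combine in the pipeline above; the
# class name 'Preprocess' and the sketched output are assumptions:
#
#   p = Preprocess()
#   p.process("cant wait lol :) http://t.co/x @bob", stopwordsF=1, encode=0)
#   # URL removed, '@bob' -> '@user', 'lol' expanded via the acronym table,
#   # ':)' -> e.g. 'POSITIVE#(1.0)#', and stopwords dropped from the result.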