def displayPageView(request):
    mycursor.execute('TRUNCATE table logs_c')
    filePath = request.GET['input-file']
    filePath = "C:/Users/Rhishabh/Documents/mithi hackathon/" + filePath
    log = readfile(filePath)
    line = log.readline()
    tk = SpaceTokenizer()
    tokens = tk.tokenize(line)
    while line:
        tokens = tk.tokenize(line)
        process(tokens)
        line = log.readline()
    mydb.commit()
    result1 = query_1()
    result2 = query2()
    result3 = query3()
    result4 = query4()
    result5 = query5()
    result7 = query7()
    # mydb.close()
    temp = [['test', 'test'], ['test', 'test']]
    test = 'sdsds'
    return render(request, 'display.htm', {'ipfile': filePath,
                                           'result1': result1,
                                           'result2': result2,
                                           'result3': result3,
                                           'result4': result4,
                                           'result5': result5,
                                           'result7': result7})
def tokenization(corpus, stop_words=nltk.corpus.stopwords.words('portuguese')):
    '''Input : corpus is a Series of documents (sentences)
    Output : a list of lists of words
    stop_words : list of words that should be removed
    '''
    # Tokenization
    spacetok = SpaceTokenizer()
    corpus = [spacetok.tokenize(phrases) for phrases in corpus]
    # Stop-word removal
    if stop_words is not None:
        tmp_corpus = list()
        tmp_words = list()
        for phrases in corpus:
            for word in phrases:
                if word not in stop_words:
                    tmp_words.append(word)
            tmp_corpus.append(tmp_words)
            tmp_words = list()
        corpus = tmp_corpus
    return corpus
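# Usage sketch (illustrative, not from the original source): requires the NLTK
# 'stopwords' corpus to be downloaded. Note that SpaceTokenizer leaves
# punctuation attached to the words.
sample_corpus = ["o time jogou muito bem hoje", "a torcida lotou o estádio"]
print(tokenization(sample_corpus))
# -> a list of token lists with Portuguese stop words ('o', 'a', 'muito', ...) removed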
class NLTKSpaceTokenizeBody(BaseEnrichment):
    def __init__(self):
        self.tokenizer = SpaceTokenizer()

    def enrichment_value(self, tweet):
        return self.tokenizer.tokenize(tweet['body'])

    def __repr__(self):
        return "Use the NLTK SpaceTokenizer to parse the Tweet body."
class NLTKSpaceTokenizeBody(BaseEnrichment):
    """Use the NLTK SpaceTokenizer to parse the Tweet body."""

    def __init__(self):
        self.tokenizer = SpaceTokenizer()

    def enrichment_value(self, tweet):
        return self.tokenizer.tokenize(tweet['body'])
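# Usage sketch (illustrative, not part of the original enrichment framework):
# assumes BaseEnrichment is importable and that a tweet payload is a dict
# with a 'body' field, as the class above expects.
enricher = NLTKSpaceTokenizeBody()
tweet = {'body': "just landed in SFO #travel :)"}
print(enricher.enrichment_value(tweet))
# -> ['just', 'landed', 'in', 'SFO', '#travel', ':)']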
def extract_name(tweet):
    token = SpaceTokenizer()
    toks = token.tokenize(tweet)
    pos = pos_tag(toks)
    chunked_nes = ne_chunk(pos)
    nes = [' '.join(map(lambda x: x[0], ne.leaves()))
           for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
    return nes
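# Usage sketch (illustrative input, not from the original source): needs the
# NLTK 'averaged_perceptron_tagger', 'maxent_ne_chunker' and 'words' resources.
print(extract_name("Barack Obama met Angela Merkel in Berlin"))
# -> e.g. ['Barack Obama', 'Angela Merkel', 'Berlin']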
def extract_entities(text):
    entities = []
    for sentence in sent_tokenize(text):
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(sentence)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        #model = {'_': 'RB','shikha':'NNP','Lots':'','bbnt':'NNP','Swati':'NNP','Sarkar':'NNP','Deepak':'NNP','Capgemini':'NNP','Swati':'NNP','Deepak Shete':'NNP','Melini':'NNP','Lots':'RB','Prashant Deshpande':'NNP','Deepak A. Shete':'NNP','Rajesh Achyut Patankar':'NNP','Shailesh V. Naik':'NNP','Prashant':'NNP','Kuldeep Vishnu Deshpande':'NNP','Kuldeep Deshpande':'NNP','Hi':'UH','From':'IN','Subject':'VB','RE':'SYM','Cc':'SYM','CC':'SYM','Start':'RB','All':'RB','PLEASE':'RB','Request':'RB','Add':'RB','Need':'RB','Completed':'VB','To':'RB','Dear':'RB','Thank':'RB','You':'PRP','We':'PRP','Here':'RB','Team':'RB','Please':'UH','Thanks':'UH','Regards':'UH','See':'VB','Test':'VB','ASAP':'SYM','Sent':'VB','mailto':'SYM','Together':'RB','Is':'VB','AS':'RB','Financial Services Strategic Business Unit':'NNP','fax':'RB','mobile':'RB','except':'RB','date':'RB','new':'RB','courier':'RB','extn':'RB'}
        model = {'extn': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        pos = pos_tag(toks)
        pos = tagger.tag(toks)
        #print pos
        chunks = ne_chunk(pos)
        #chunks = ne_chunk(pos_tag(word_tokenize(sentence)))
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
    return entities


# CSV-based version, kept commented out because the `with` block above it was
# already disabled in the original:
#with open("D:/R/BOA/PySrc/FGD1_18-25_Vodafone_Prepaid_BCUsers_Mumbai.csv", "r") as csvfile:
#    datareader = csv.reader(csvfile, quotechar='"', lineterminator='\n', quoting=csv.QUOTE_ALL)
#    csv_out = open('D:/R/BOA/Noun/FNoun.csv', 'wb')
#    mywriter = csv.writer(csv_out)
#    count = 0
#    for row in datareader:
#        count = count + 1
#        print "COUNT is :%d" % count
#        print ''.join(row)
#        #mywriter.writerow(extract_entities(''.join(row)))
#    #csv_out.close()

file = open('D:/R/BOA/txtfiles/FGD1_18-25_Vodafone_Prepaid_BCUsers_Mumbai.txt', 'r')
print file.read()
file.seek(0)  # rewind after read() so the loop below still sees the lines
filew = open('D:/R/BOA/Noun/FNoun.txt', "w")
for line in file:
    print line
    # extract_entities returns a list of chunks, so serialize before writing
    filew.write(str(extract_entities(line)) + '\n')
    #filew.write("yeah its me")
filew.close()
# Chapter 3: Preprocessing - Tokenization with NLTK's built-in tokenizers
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# Using LineTokenizer (split into 'lines')
lTokenizer = LineTokenizer()
print("Line tokenizer output :", lTokenizer.tokenize(
    "My name is " +
    "Maximus Decimus Meridius, commander of the Armies of the North, " +
    "General of the Felix Legions and loyal servant to the true emperor, " +
    "Marcus Aurelius. \nFather to a murdered son, husband to a murdered " +
    "wife. \nAnd I will have my vengeance, in this life or the next."))

# Using SpaceTokenizer (split on 'space characters')
rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

# Using word_tokenize (split into 'words' and 'punctuation')
print("Word Tokenizer output :", word_tokenize(rawText))

# Using TweetTokenizer (for handling 'special characters')
tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :", tTokenizer.tokenize("This is a coooool " +
                                                      "#dummysmiley: :-) :-P <3"))
# In[14]:

from nltk.tokenize import SpaceTokenizer

tm = SpaceTokenizer()
to_rank = []
key_words = []
for i in range(len(ranked_q)):
    yn = 0
    #ranked_q[i][yn]
    question[i] = untokenize(question[i])
    yy = "_____"
    to_rank.append(tm.tokenize(ranked_q[i][0]))
    print("Q:", question[i].replace(to_rank[i][len(to_rank[i]) // 2], yy))
    print('Ans - ', to_rank[i][len(to_rank[i]) // 2])
    #quita = question[i].index(to_rank[i][len(to_rank[i])//2])
    #key_words.append(question[i][quita])

#print(to_rank[0][len(to_rank[0])//2])
#question[0].remove(question[0][quita])
#question[0][quita] = to_rank[0][len(to_rank[0])//2]
#print(question[0][quita])

# In[ ]:
from nltk.tokenize import SpaceTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from functools import partial
#from gensim import corpora
#from gensim.models import TfidfModel
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()

# define steps
pipeline = [lambda s: re.sub('[\n]', '', s),
            lambda s: re.sub('[^\w\s]', '', s),
            lambda s: re.sub('[\d\n]', '', s),
            lambda s: s.lower(),
            lambda s: ' '.join(filter(lambda s: not (s in stopwords.words('english')), tokenizer.tokenize(s))),
            lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))
            ]

# function that carries out the pipeline step-by-step
def preprocess_text(text, pipeline):
    if len(pipeline) == 0:
        return text
    else:
        return preprocess_text(pipeline[0](text), pipeline[1:])

# This section reads in documents from the selected corpus as real text
from nltk.corpus import reuters

# This reads in all documents and finds all unique words
from nltk.tokenize import SpaceTokenizer

os.chdir("D:/R/BOA/txtfiles")
for fileName in glob.glob("*.txt"):
    count = 0
    file = open('D:/R/BOA/txtfiles/' + fileName, 'r')
    filew = open('D:/R/BOA/Noun/' + fileName, "wb")
    for line in file:
        count = count + 1
        print count
        print line
        line = re.sub('\\f', '', line)
        #line = line.decode("utf-8")
        line = unicode(line, errors='ignore')
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(line)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        model = {'Consumer': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        #pos = pos_tag(toks)
        pos = tagger.tag(toks)
        print pos
        chunked_nes = ne_chunk(pos)
        nes = [' '.join(map(lambda x: x[0], ne.leaves()))
               for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
        #data.append(nes)
        print nes
        filew.write((','.join(nes)) + '\n')
        #filew.write("yeah its me")
status = tokens[6]
size = tokens[7][:-1]
# print(ip + ' ' + date_time + ' ' + method + ' ' + url + ' ' + protocol + ' ' + status + ' ' + size)
val = (ip, date_time, method, url, protocol, status, size)
mycursor.execute(sql, val)


# Type 0 -> Tab Separated (Server 1)
# Type 1 -> Space Separated (Server 2)
log = readfile("access_log")
line = log.readline()
tk = SpaceTokenizer()
tokens = tk.tokenize(line)
while line:
    tokens = tk.tokenize(line)
    process(tokens)
    line = log.readline()
mydb.commit()
print("records inserted.")

# Top client IP addresses by number of requests
sql = "SELECT IP, count(*) FROM logs_c GROUP BY IP ORDER BY count(*) DESC LIMIT 5"
mycursor.execute(sql)
results = mycursor.fetchall()
for x in results:
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time        : 2020/7/11 17:37
# @Author      : 代登辉
# @Email       : [email protected]
# @File        : tokenizer.py
# @Software    : PyCharm
# @Description : Tokenization

# import the required classes
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

text = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
       "loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. " \
       "\nAnd I will have my vengeance, in this life or the next. "
ITokenizer = LineTokenizer()
print("Split on line breaks ", ITokenizer.tokenize(text))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Split on spaces :", sTokenizer.tokenize(rawText))  # punctuation stays attached to the words
print("Split into words :", word_tokenize(rawText))  # punctuation and words are separated

tweet = "This is a cooool #dummysmiley: :-) :-P <3"
tTokenizer = TweetTokenizer()
print("Handle special characters ", tTokenizer.tokenize(tweet))
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

line = "My name is Venkatram Veerareddy, technical architect.\n I am having 20 years of experience in " \
       " Software industry working \nfrom applications to products by using \n" \
       " C, C++, Java, Javascript and databases " \
       " like Oracle, MS SQL Server, Postgres, MySQL and OrientDB."

lTokenizer = LineTokenizer()
print("Line tokenizer output: ", lTokenizer.tokenize(line))

sTokenizer = SpaceTokenizer()
print("Space Tokenizer output: ", sTokenizer.tokenize(line))

print("Word Tokenizer output: ", word_tokenize(line))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output: ", tTokenizer.tokenize("This is a coooool #dummysmiley: :-) :-P <3"))
import nltk.tag, nltk.data
from nltk.tokenize import SpaceTokenizer

with open("D:/R/email_Analysis/FINAL/pyhton_mssg.csv", "r") as csvfile:
    datareader = csv.reader(csvfile, quotechar='"', lineterminator='\n', quoting=csv.QUOTE_ALL)
    csv_out = open('D:/R/email_Analysis/FINAL/Noun.csv.csv', 'wb')
    mywriter = csv.writer(csv_out)
    count = 0
    for row in datareader:
        count = count + 1
        print "COUNT is :%d" % count
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize((''.join(row)))
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        model = {
            'Almost': 'RB',
            'shikha': 'NNP',
            'Lots': '',
            'bbnt': 'NNP',
            'Swati': 'NNP',
            'Sarkar': 'NNP',
            'Deepak': 'NNP',
            'Capgemini': 'NNP',
            'Swati': 'NNP',
            'Deepak Shete': 'NNP',
            'Melini': 'NNP',
            'Lots': 'RB',
            'Prashant Deshpande': 'NNP',
from nltk.tokenize import SpaceTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from functools import partial
from gensim import corpora
from gensim.models import TfidfModel
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()

# define each step
pipeline = [
    lambda s: re.sub('[^\w\s]', '', s),
    lambda s: re.sub('[\d]', '', s),
    lambda s: s.lower(),
    lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s))),
    lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))
]

# function that carries out the pipeline step-by-step
def preprocess_text(text, pipeline):
    if len(pipeline) == 0:
        return text
    else:
        return preprocess_text(pipeline[0](text), pipeline[1:])

# preprocessing
preprocessed_texts = map(partial(preprocess_text, pipeline=pipeline), texts)
def text_pre_processing(text, remove_number=True, stop_word=True,
                        stop_word_language='english', remove_punctuation=True):
    # ---------------------------------------------
    # Patterns
    results_chunk = ''
    results_named_entity = ''
    patterns1 = r'@[A-Za-z0-9_]+'
    patterns2 = r'https?://[^ ]+'
    combined_patterns = r'|'.join((patterns1, patterns2))
    www_patterns = r'www.[^ ]+'
    negations_dic = {
        "isn't": "is not", "aren't": "are not", "wasn't": "was not",
        "weren't": "were not", "haven't": "have not", "hasn't": "has not",
        "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
        "don't": "do not", "doesn't": "does not", "didn't": "did not",
        "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
        "mightn't": "might not", "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
    # ---------------------------------------------
    # convert to lower case
    results = str(text)
    # ---------------------------------------------
    # Text Cleaning
    results = re.sub(combined_patterns, '', results)
    results = re.sub(www_patterns, '', results)
    results = results.lower()
    results = negations_pattern.sub(lambda x: negations_dic[x.group()], results)
    results = re.sub("[^a-zA-Z]", " ", results)
    results = results.replace("(<br/>)", "")
    results = results.replace('(<a).*(>).*(</a>)', '')
    results = results.replace('(&amp)', '')
    results = results.replace('(&gt)', '')
    results = results.replace('(&lt)', '')
    results = results.replace('(\xa0)', ' ')
    # ---------------------------------------------
    if (remove_number) & (results != ''):
        results = re.sub(r'\d+', '', results)
    # ---------------------------------------------
    if remove_punctuation & (results != ''):
        translator = str.maketrans('', '', string.punctuation)
        results = results.translate(translator)
    # ---------------------------------------------
    # Remove whitespaces
    results = results.strip()
    # ---------------------------------------------
    # Line Tokenize
    if results != '':
        line_tokenizer = LineTokenizer()
        results = line_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]
    # ---------------------------------------------
    # Tab Tokenize
    if results != '':
        tab_tokenizer = TabTokenizer()
        results = tab_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = results[0]
    # ---------------------------------------------
    # Space Tokenizer
    if results != '':
        space_tokenizer = SpaceTokenizer()
        results = space_tokenizer.tokenize(results)
        results = list(filter(None, results))
        results = ' '.join([w for w in results])
    # -----------------------------------------------
    # Lemmatization using NLTK
    if results != '':
        lemmatizer_of_text = WordNetLemmatizer()
        word_list = word_tokenize(results)
        results = ' '.join([
            lemmatizer_of_text.lemmatize(w, get_word_net_pos_tag(w))
            for w in word_list
        ])
    # ---------------------------------------------
    # Stemming using NLTK
    if results != '':
        stemmer = PorterStemmer()
        if type(results) == list:
            results = ' '.join(str(w) for w in results)
        results = word_tokenize(str(results))
        results = [stemmer.stem(word) for word in results]
        results = ' '.join(str(w) for w in results)
    # ---------------------------------------------
    # Remove Stop Words
    if stop_word & (results != ''):
        nltk.download('stopwords')
        stop_words = set(stopwords.words(stop_word_language))
        word_tokens = word_tokenize(results)
        results = ' '.join(str(w) for w in word_tokens if not w in stop_words)
    # ---------------------------------------------
    # Chunking of the input, will be used for coloring of the text
    if results != '':
        result_str = TextBlob(results)
        reg_exp = 'NP: {<DT>?<JJ>*<NN>}'
        rp = nltk.RegexpParser(reg_exp)
        results_chunk = rp.parse(result_str.tags)
        # results_chunk.draw()
    # ---------------------------------------------
    # Named Entity Recognition
    if results != '':
        results_named_entity = ne_chunk(pos_tag(word_tokenize(results)))

    return results, results_chunk, results_named_entity
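# Usage sketch (illustrative, not from the original source): assumes the
# imports used above (nltk, re, string, TextBlob, the tokenizers) plus the
# helper get_word_net_pos_tag() are available, and that the NLTK data packages
# (punkt, taggers, wordnet, stopwords, maxent_ne_chunker, words) are installed.
clean_text, chunks, named_entities = text_pre_processing(
    "I don't think the 2 new phones from @acme are worth $999! http://acme.example")
print(clean_text)        # lower-cased, stemmed text with stop words removed
print(chunks)            # noun-phrase chunk tree built from the cleaned text
print(named_entities)    # ne_chunk() tree over the cleaned text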
from nltk import pos_tag, ne_chunk
import nltk.tag, nltk.data
from nltk.tokenize import SpaceTokenizer

with open("D:/R/email_Analysis/FINAL/pyhton_mssg.csv", "r") as csvfile:
    datareader = csv.reader(csvfile, quotechar='"', lineterminator='\n', quoting=csv.QUOTE_ALL)
    csv_out = open('D:/R/email_Analysis/FINAL/Noun.csv.csv', 'wb')
    mywriter = csv.writer(csv_out)
    count = 0
    for row in datareader:
        count = count + 1
        print "COUNT is :%d" % count
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize((''.join(row)))
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        model = {'Almost': 'RB','shikha':'NNP','Lots':'','bbnt':'NNP','Swati':'NNP','Sarkar':'NNP','Deepak':'NNP','Capgemini':'NNP','Swati':'NNP','Deepak Shete':'NNP','Melini':'NNP','Lots':'RB','Prashant Deshpande':'NNP','Deepak A. Shete':'NNP','Rajesh Achyut Patankar':'NNP','Shailesh V. Naik':'NNP','Prashant':'NNP','Kuldeep Vishnu Deshpande':'NNP','Kuldeep Deshpande':'NNP','Hi':'UH','From':'IN','Subject':'VB','RE':'SYM','Cc':'SYM','CC':'SYM','Start':'RB','All':'RB','PLEASE':'RB','Request':'RB','Add':'RB','Need':'RB','Completed':'VB','To':'RB','Dear':'RB','Thank':'RB','You':'PRP','We':'PRP','Here':'RB','Team':'RB','Please':'UH','Thanks':'UH','Regards':'UH','See':'VB','Test':'VB','ASAP':'SYM','Sent':'VB','mailto':'SYM','Together':'RB','Is':'VB','AS':'RB','Financial Services Strategic Business Unit':'NNP','fax':'RB','mobile':'RB','except':'RB','date':'RB','new':'RB','courier':'RB','extn':'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        #pos = pos_tag(toks)
        pos = tagger.tag(toks)
        print pos
        chunked_nes = ne_chunk(pos)
        nes = [' '.join(map(lambda x: x[0], ne.leaves()))
               for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
        #data.append(nes)
        print nes
        mywriter.writerow(nes)
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# Line tokenizer
longSentence = 'My name is Maximus Decimus Meridius, Commander of the Armies ' \
               'of the North, General of the Felix Legions, loyal servant to ' \
               'the true emperor, Marcus Aurelius. Father to a murdered son, ' \
               'husband to a murdered wife. And I will have my vengeance, in ' \
               'this life or the next.'

lTokenizer = LineTokenizer()
sentenceTokens = lTokenizer.tokenize(longSentence)
print(sentenceTokens)

# Space tokenizer
sTokenizer = SpaceTokenizer()
spaceTokens = sTokenizer.tokenize(longSentence)
print(spaceTokens)

# Tweet tokenizer
tweet = 'This is a coool #dummysmiley: :-) :) :-P <3'
tTokenizer = TweetTokenizer()
tTokens = tTokenizer.tokenize(tweet)
print('Tweet tokenizer output:')
print(tTokens)

# Word tokenizer
wTokenizer = word_tokenize(longSentence)
print(wTokenizer)

################
### Stemming ###
indx = '\n'.join(res)
print("\nThe sentences containing '" + inp + "'" + " are : \n" + indx)

# conversations containing the input
con = re.findall(r'"(?:(?:(?!(?<!\\)").)*)"', str(res))
indx2 = '\n'.join(con)
print("\nThe conversations containing '" + inp + "'" + " are : \n" + indx2)

# count of conversations
count = len(list(filter(lambda x: inp in x, con)))
print("\nThe count of conversations containing '" + inp + "'" + " is :\n" + str(count))

# all conversations in the excerpt
allconv = re.findall(r'"(.*?)"', str(token_text))
indx3 = '\n'.join(allconv)
print("\nThe conversations in the excerpt are : \n" + indx3)

from nltk.tag import pos_tag
tagged_sent = pos_tag(text_string.split())
#propernouns = [word for word, pos in tagged_sent if pos == 'NNP']
#print(propernouns)

from nltk.tree import Tree
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(text_string)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
nes = [' '.join(map(lambda x: x[0], ne.leaves()))
       for ne in chunked_nes if isinstance(ne, Tree)]
indx4 = '\n'.join(nes)
print("\n Proper nouns used in the excerpt are:\n", indx4)
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
print chunked_nes

nes = [' '.join(map(lambda x: x[0], ne.leaves()))
       for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
print nes

'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
'''
'''
d = cmudict.dict()
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

lTokenizer = LineTokenizer()
print("Line tokenizer output :",
      lTokenizer.tokenize(
          "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. \nAnd I will have my vengeance, in this life or the next."
      ))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

print("Word Tokenizer output :", word_tokenize(rawText))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :", tTokenizer.tokenize("This is a cooool #dummysmiley: :-) :-P <3"))
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
print chunked_nes

nes = [' '.join(map(lambda x: x[0], ne.leaves()))
       for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
print nes

'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
from nltk.tokenize import SpaceTokenizer

os.chdir("D:/R/BOA/txtfiles")
for fileName in glob.glob("*.txt"):
    count = 0
    file = open('D:/R/BOA/txtfiles/' + fileName, 'r')
    filew = open('D:/R/BOA/Noun/' + fileName, "wb")
    for line in file:
        count = count + 1
        print count
        print line
        line = re.sub('\\f', '', line)
        #line = line.decode("utf-8")
        line = unicode(line, errors='ignore')
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(line)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        model = {'Consumer': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        #pos = pos_tag(toks)
        pos = tagger.tag(toks)
        print pos
        chunked_nes = ne_chunk(pos)
        nes = [' '.join(map(lambda x: x[0], ne.leaves()))
               for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
        #data.append(nes)
        print nes
        filew.write((','.join(nes)) + '\n')
        #filew.write("yeah its me")
tokenizer = SpaceTokenizer()
stop_words = set(stopwords.words("english"))
# adding more stop_words based on initial analysis
stop_words.update(['new', 'use', 'would', '-', 'using'])
#print stop_words

while (itr < 100):
    try:
        if (messages[itr][0:1] == "`"):
            itr += 1
            #print "code found at: " + itr
        else:
            lowercased = messages[itr].lower()
            lemmatized_lowercased = lemmatizer.lemmatize(lowercased)
            tokenized = tokenizer.tokenize(lowercased)
            filtered_sentence = [words for words in tokenized if not words in stop_words]
            tokenized_stopless_messages.append(filtered_sentence)
            #print "filtered_sentence added"
            #tokenized_messages.append(tokenizer.tokenize(messages[itr]))
            itr += 1
            #print itr
    except TypeError:
        #print "Skipped"
        itr += 1
        #print itr

#print itr
wiki_files = [
    "soccer_teams_wiki/resources/wikipedia_corinthians.txt",
    "soccer_teams_wiki/resources/wikipedia_palmeiras.txt",
    "soccer_teams_wiki/resources/wikipedia_portuguesa.txt",
    "soccer_teams_wiki/resources/wikipedia_santos.txt",
    "soccer_teams_wiki/resources/wikipedia_sao_paulo.txt"
]

for file in wiki_files:
    with open(file, "r") as wiki_file:
        wiki_text = wiki_file.readlines()
        # TODO text cleanup. Remove stop words and other text treatment for articles
        for line in wiki_text:
            phrase = [word.lower() for word in tokenizer.tokenize(line) if word not in stop_words]
            wiki_tokenized.append(phrase)

our_model = Word2Vec(wiki_tokenized, size=10, window=15, min_count=1, workers=4)

while True:
    query_word = input('Type Word: ')
    query_word = query_word.strip().lower()
    if our_model.__contains__(query_word):
        print(our_model.most_similar(query_word))
ml = len(messages)
print ml  #34467
itr = 0
tokenized_messages = []
tokenizer = SpaceTokenizer()
while (itr < 10):
    try:
        if (messages[itr][0:1] == "`"):
            itr += 1
            print "code found"
        else:
            tokenized_messages.append(tokenizer.tokenize(messages[itr]))
            itr += 1
            print itr
    except TypeError:
        print "Skipped"
        itr += 1
        print itr
#print itr
#error after 1741

print tokenized_messages[0]

eg_string = "This is a sample sentence, showing off the stop words filtration."
print eg_string[0:2]
# import all necessary libraries
from nltk.stem import PorterStemmer
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import stopwords
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()

# define each step
pipeline1 = [lambda s: re.sub('[^\w\s]', '', s),  # remove special characters
             lambda s: re.sub('[\d]', '', s),  # remove numbers
             lambda s: s.lower(),  # lower case
             lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s))),  # remove stop words
             lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))  # stem (using Porter stemmer)
             ]

pipeline2 = [lambda s: re.sub('[^\w\s]', '', s),
             lambda s: re.sub('[\d]', '', s),
             lambda s: s.lower(),
             lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s)))
             ]

stopword_removal_pipeline = [lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s)))]

# pipeline handling
def preprocess_text(text, pipeline):
    return text if len(pipeline) == 0 else preprocess_text(pipeline[0](text), pipeline[1:])
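# Usage sketch (illustrative, not from the original source): run pipeline1
# end-to-end on one sentence. Requires the NLTK 'stopwords' corpus; note that
# stopwords.words() with no argument checks the stop words of every language.
sample = "The 3 quick brown foxes are jumping over 2 lazy dogs!"
print(preprocess_text(sample, pipeline1))
# -> roughly 'quick brown fox jump lazi dog'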
REMOVE_PONCT = 3
LIST_OF_TASKS = [REMOVE_LINKS, REMOVE_ARROBA, REMOVE_PONCT]

#textMSG='testo com vários @davi @vito @maria https://www.google.com/ e links'
textMSG = 'será que vai cortar @davi . ; esse texto ? https://www.google.com vamos ver né?'
print(remove_ponctuation(textMSG))
print("Texto=", textMSG)

tk = SpaceTokenizer()
#s1 = tk.tokenize(textMSG)
#print(s1)

for task in LIST_OF_TASKS:
    s1 = tk.tokenize(textMSG)
    tam = len(s1)
    print("Num=", tam)

    # set the desired number of threads here
    numthreads = 10
    pedaco = int(tam / numthreads)
    threads = []
    for i in range(0, numthreads):
        inicio = i * pedaco
        if i == numthreads - 1:
            fim = tam
        else:
            fim = (i + 1) * pedaco