def tokenization(corpus, stop_words=nltk.corpus.stopwords.words('portuguese')):
    '''Input: corpus is a Series of documents (sentences).
    Output: a list of lists of words.
    stop_words: list of words to be removed.
    '''
    # Tokenization
    spacetok = SpaceTokenizer()
    corpus = [spacetok.tokenize(phrases) for phrases in corpus]

    # Stop-word removal
    if stop_words is not None:
        tmp_corpus = list()
        tmp_words = list()
        for phrases in corpus:
            for word in phrases:
                if word not in stop_words:
                    tmp_words.append(word)
            tmp_corpus.append(tmp_words)
            tmp_words = list()
        corpus = tmp_corpus
    return corpus
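A minimal usage sketch for the tokenization helper above, assuming pandas is available and the NLTK Portuguese stop-word list has been downloaded; the sample sentences are hypothetical.

# Hypothetical usage of tokenization(); assumes nltk.download('stopwords') has been run.
import nltk
import pandas as pd
from nltk.tokenize import SpaceTokenizer

frases = pd.Series(["eu gosto de futebol", "o time jogou bem"])
print(tokenization(frases))
# e.g. [['gosto', 'futebol'], ['time', 'jogou', 'bem']] (exact output depends on the stop-word list)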
def displayPageView(request):
    mycursor.execute('TRUNCATE table logs_c')
    filePath = request.GET['input-file']
    filePath = "C:/Users/Rhishabh/Documents/mithi hackathon/" + filePath
    log = readfile(filePath)
    line = log.readline()
    tk = SpaceTokenizer()
    tokens = tk.tokenize(line)
    while line:
        tokens = tk.tokenize(line)
        process(tokens)
        line = log.readline()
    mydb.commit()
    result1 = query_1()
    result2 = query2()
    result3 = query3()
    result4 = query4()
    result5 = query5()
    result7 = query7()
    # mydb.close()
    temp = [['test', 'test'], ['test', 'test']]
    test = 'sdsds'
    return render(request, 'display.htm',
                  {'ipfile': filePath, 'result1': result1, 'result2': result2,
                   'result3': result3, 'result4': result4, 'result5': result5,
                   'result7': result7})
def extract_name(tweet):
    token = SpaceTokenizer()
    toks = token.tokenize(tweet)
    pos = pos_tag(toks)
    chunked_nes = ne_chunk(pos)
    nes = [' '.join(map(lambda x: x[0], ne.leaves()))
           for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
    return nes
class NLTKSpaceTokenizeBody(BaseEnrichment):
    """Use the NLTK SpaceTokenizer to parse the Tweet body."""

    def __init__(self):
        self.tokenizer = SpaceTokenizer()

    def enrichment_value(self, tweet):
        return self.tokenizer.tokenize(tweet['body'])
def read_doc(doc, labels):
    doc = SpaceTokenizer().tokenize(doc.strip())
    # doc = doc.strip().split()
    labels = labels.strip().split('|')
    labels = [la.split() for la in labels]
    for i in range(len(labels)):
        for j in range(len(labels[i])):
            labels[i][j] = int(labels[i][j])
    res_labels = [0] * len(doc)
    for la in labels:
        if la[2] != 0:
            start = la[0]
            end = la[1]
            res_labels[start:end + 1] = [1] * (end + 1 - start)
    return [(doc[i], str(res_labels[i])) for i in range(len(doc))]
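A hypothetical call illustrating the label format read_doc appears to expect ('|'-separated "start end dataset_id" triples over space-separated tokens); this is inferred from the parsing above, not from a documented spec.

# Tokens 3..4 ("ANES 2016") are marked as belonging to dataset 42; everything else stays 0.
pairs = read_doc("We use the ANES 2016 survey data", "3 4 42")
# -> [('We', '0'), ('use', '0'), ('the', '0'), ('ANES', '1'), ('2016', '1'), ('survey', '0'), ('data', '0')]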
class NLTKSpaceTokenizeBody(BaseEnrichment):
    def __init__(self):
        self.tokenizer = SpaceTokenizer()

    def enrichment_value(self, tweet):
        return self.tokenizer.tokenize(tweet['body'])

    def __repr__(self):
        return "Use the NLTK SpaceTokenizer to parse the Tweet body."
def extract_entities(text):
    entities = []
    for sentence in sent_tokenize(text):
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(sentence)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        #model = {'_': 'RB','shikha':'NNP','Lots':'','bbnt':'NNP','Swati':'NNP','Sarkar':'NNP','Deepak':'NNP','Capgemini':'NNP','Swati':'NNP','Deepak Shete':'NNP','Melini':'NNP','Lots':'RB','Prashant Deshpande':'NNP','Deepak A. Shete':'NNP','Rajesh Achyut Patankar':'NNP','Shailesh V. Naik':'NNP','Prashant':'NNP','Kuldeep Vishnu Deshpande':'NNP','Kuldeep Deshpande':'NNP','Hi':'UH','From':'IN','Subject':'VB','RE':'SYM','Cc':'SYM','CC':'SYM','Start':'RB','All':'RB','PLEASE':'RB','Request':'RB','Add':'RB','Need':'RB','Completed':'VB','To':'RB','Dear':'RB','Thank':'RB','You':'PRP','We':'PRP','Here':'RB','Team':'RB','Please':'UH','Thanks':'UH','Regards':'UH','See':'VB','Test':'VB','ASAP':'SYM','Sent':'VB','mailto':'SYM','Together':'RB','Is':'VB','AS':'RB','Financial Services Strategic Business Unit':'NNP','fax':'RB','mobile':'RB','except':'RB','date':'RB','new':'RB','courier':'RB','extn':'RB'}
        model = {'extn': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        pos = pos_tag(toks)
        pos = tagger.tag(toks)
        #print pos
        chunks = ne_chunk(pos)
        #chunks = ne_chunk(pos_tag(word_tokenize(sentence)))
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
    return entities


# Disabled CSV path: the "with open(...)" line was already commented out, so the
# block below cannot run and is kept as a comment.
#with open("D:/R/BOA/PySrc/FGD1_18-25_Vodafone_Prepaid_BCUsers_Mumbai.csv", "r") as csvfile:
#    datareader = csv.reader(csvfile, quotechar='"', lineterminator='\n', quoting=csv.QUOTE_ALL)
#    csv_out = open('D:/R/BOA/Noun/FNoun.csv', 'wb')
#    mywriter = csv.writer(csv_out)
#    count = 0
#    for row in datareader:
#        count = count + 1
#        print "COUNT is :%d" % count
#        print ''.join(row)
#        #mywriter.writerow(extract_entities(''.join(row)))
#    #csv_out.close()

file = open('D:/R/BOA/txtfiles/FGD1_18-25_Vodafone_Prepaid_BCUsers_Mumbai.txt', 'r')
print file.read()
filew = open('D:/R/BOA/Noun/FNoun.txt', "w")
for line in file:
    print line
    filew.write(str(extract_entities(line)))
    #filew.write("yeah its me")
filew.close()
def tokenize(s):
    out = []
    tokens = SpaceTokenizer().tokenize(s)
    for w in tokens:
        if w[:1] == "\n":
            out.append("\n")
            out.append(w[1:])
        else:
            out.append(w)
    return out
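A small illustration of the newline handling in tokenize() above; the sample string is hypothetical.

from nltk.tokenize import SpaceTokenizer

print(tokenize("first line \nsecond line"))
# SpaceTokenizer keeps "\nsecond" as one token; tokenize() splits it into "\n" and "second":
# -> ['first', 'line', '\n', 'second', 'line']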
def read_sent(sent):
    sent = SpaceTokenizer().tokenize(sent)
    start = int(sent[0])
    end = int(sent[1])
    dataset = int(sent[2])
    sent = sent[4:]
    labels = [0] * len(sent)
    if dataset != 0:
        labels[start:end + 1] = [1] * (end + 1 - start)
    return [(sent[i], str(labels[i])) for i in range(len(sent))]
def analyze_line(line):
    tokens = pos_tag(SpaceTokenizer().tokenize(line))
    names = []
    for token in tokens:
        if token[1] == 'NNP':
            names.append(re.sub('[' + string.punctuation + ']', '', token[0]))
    return {
        "names": names,
        "sentiment": SentimentIntensityAnalyzer().polarity_scores(line)
    }
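A hedged usage sketch for analyze_line(); it assumes the NLTK tagger and VADER lexicon are installed (nltk.download('averaged_perceptron_tagger'), nltk.download('vader_lexicon')), and the exact tags and scores vary by NLTK version.

# Hypothetical input line; names come from tokens tagged NNP, sentiment from VADER.
result = analyze_line("Alice praised the new Acme Phone yesterday.")
print(result["names"])       # e.g. ['Alice', 'Acme', 'Phone']
print(result["sentiment"])   # e.g. {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}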
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer("[\w]+")
    print "RegexpTokenizer:", tokenizer.tokenize("Don't hesitate to ask questions")
    print "regexp_tokenizer:", regexp_tokenize("Don't hesitate to ask questions", pattern="\w+|\$[\d\.]+|\S+")

    # Split on whitespace gaps
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    print "RegexpTokenizer:", tokenizer.tokenize("Don't hesitate to ask questions")

    # Select words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer('[A-Z]\w+')
    print "RegexpTokenizer:", capt.tokenize(sent)

    # Subclasses of RegexpTokenizer use predefined regular expressions
    from nltk.tokenize import BlanklineTokenizer
    print "BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent)

    # Strings can be split on spaces, gaps, newlines, and so on
    from nltk.tokenize import WhitespaceTokenizer
    print "WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent)

    # WordPunctTokenizer splits text into alphabetic and non-alphabetic
    # characters using the regex \w+|[^\w\s]+
    from nltk.tokenize import WordPunctTokenizer
    print "WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent)

    # Splitting with the split() method
    print "split():", sent.split()
    print "split(' '):", sent.split(' ')
    print "split('\n'):", sent.split('\n')

    # Like sent.split('\n'), LineTokenizer splits text into lines
    from nltk.tokenize import LineTokenizer
    print "LineTokenizer:", LineTokenizer().tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent)

    # SpaceTokenizer works like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print "SpaceTokenizer:", SpaceTokenizer().tokenize(sent)

    # nltk.tokenize.util tokenizes by returning a sequence of tuples giving each
    # token's position and offset (span) in the sentence
    print "Token sequence:", list(WhitespaceTokenizer().span_tokenize(sent))

    # Given a sequence of spans, the relative offsets can be returned
    from nltk.tokenize.util import spans_to_relative
    print "Positions and offsets:", list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))

    # string_span_tokenize(sent, separator) returns the offsets of the tokens in
    # sent by splitting at each occurrence of the separator
    from nltk.tokenize.util import string_span_tokenize
    print "Token sequence:", list(string_span_tokenize(sent, " "))
def get_vocab(self, start_index=2, min_count=10):
    text = ''.join(list(self.publications['full_text'].values))
    all_words = SpaceTokenizer().tokenize(text + text.lower())
    vocab = Counter(all_words).most_common()
    vocab_out_json = {}
    for items in vocab:
        if items[1] > min_count:
            vocab_out_json[items[0].decode('utf-8', 'replace')] = len(vocab_out_json) + start_index
    print(len(vocab) - len(vocab_out_json), ' words are discarded as OOV')
    print(len(vocab_out_json), ' words are in vocab')
    with codecs.open(self.outdir + 'vocab.json', 'wb') as vocabfile:
        json.dump(vocab_out_json, vocabfile)
def space_tokenizer(text, strip=None):
    '''Split on the " " blank character only; same as s.split(" ").

    >>> s = "Good muffins cost $3.88\nin New York. It's inexpensive. Free-for-all. Please buy me\ntwo of them.\n\nThanks."
    >>> SpaceTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', "It's", 'inexpensive.', 'Free-for-all.', 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split(' ')
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', "It's", 'inexpensive.', 'Free-for-all.', 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    '''
    for token in SpaceTokenizer().tokenize(text):
        if token not in patterns.PUNCTUATION and not token.isspace():
            yield token.strip(strip)
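A hypothetical call to space_tokenizer(); patterns.PUNCTUATION is project-specific, so this assumes it is a collection of punctuation tokens such as ',' and '.'.

# Punctuation-only and whitespace-only tokens are skipped; the rest are yielded stripped.
print(list(space_tokenizer("Good muffins cost $3.88 , not more .")))
# e.g. -> ['Good', 'muffins', 'cost', '$3.88', 'not', 'more']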
def tokenizarPorTipo():
    cadena = "Sorry, I can't go to the meeting.\n"
    print("TreebankWordTokenizer - 1")
    print("WhitespaceTokenizer - 2")
    print("SpaceTokenizer - 3")
    print("WordPunctTokenizer - 4")
    num = input("Enter a tokenizer: ")
    if num == "1":
        tokenizer = TreebankWordTokenizer()
    elif num == "2":
        tokenizer = WhitespaceTokenizer()
    elif num == "3":
        tokenizer = SpaceTokenizer()
    elif num == "4":
        tokenizer = WordPunctTokenizer()
    else:
        return
    tokens = tokenizer.tokenize(cadena)
    print(tokens)
from nltk import CFG, ChartParser
from nltk.tokenize import SpaceTokenizer

grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> IV
Det -> 'the'
N -> 'man'
IV -> 'walks'
""")
#>>> grammar
#<Grammar with 6 productions>
#>>> grammar.start()
#S
#>>> grammar.productions()
#[S -> NP VP, NP -> Det N, VP -> IV, Det -> 'the', N -> 'man', IV -> 'walks']

parser = ChartParser(grammar)
parses = parser.parse_all(SpaceTokenizer().tokenize("the man walks"))
#>>> parses
#[Tree('S', [Tree('NP', [Tree('Det', ['the']), Tree('N', ['man'])]), Tree('VP', [Tree('IV', ['walks'])])])]
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time        : 2020/7/11 17:37
# @Author      : 代登辉
# @Email       : [email protected]
# @File        : tokenizer.py
# @Software    : PyCharm
# @Description : Tokenization

# Import the required libraries
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

text = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
       "loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. " \
       "\nAnd I will have my vengeance, in this life or the next. "
ITokenizer = LineTokenizer()
print("Tokenized by line: ", ITokenizer.tokenize(text))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Tokenized by space: ", sTokenizer.tokenize(rawText))  # punctuation stays attached to words
print("Tokenized by word: ", word_tokenize(rawText))  # punctuation split from words

tweet = "This is a cooool #dummysmiley: :-) :-P <3"
tTokenizer = TweetTokenizer()
print("Handling special characters: ", tTokenizer.tokenize(tweet))
import csv
from nltk import pos_tag, ne_chunk
import nltk.tag, nltk.data
from nltk.tokenize import SpaceTokenizer

with open("D:/R/email_Analysis/FINAL/pyhton_mssg.csv", "r") as csvfile:
    datareader = csv.reader(csvfile, quotechar='"', lineterminator='\n', quoting=csv.QUOTE_ALL)
    csv_out = open('D:/R/email_Analysis/FINAL/Noun.csv.csv', 'wb')
    mywriter = csv.writer(csv_out)
    count = 0
    for row in datareader:
        count = count + 1
        print "COUNT is :%d" % count
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize((''.join(row)))
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        # Hand-built tag overrides (duplicate keys removed)
        model = {'Almost': 'RB', 'shikha': 'NNP', 'Lots': 'RB', 'bbnt': 'NNP', 'Swati': 'NNP',
                 'Sarkar': 'NNP', 'Deepak': 'NNP', 'Capgemini': 'NNP', 'Deepak Shete': 'NNP',
                 'Melini': 'NNP', 'Prashant Deshpande': 'NNP', 'Deepak A. Shete': 'NNP',
                 'Rajesh Achyut Patankar': 'NNP', 'Shailesh V. Naik': 'NNP', 'Prashant': 'NNP',
                 'Kuldeep Vishnu Deshpande': 'NNP', 'Kuldeep Deshpande': 'NNP', 'Hi': 'UH',
                 'From': 'IN', 'Subject': 'VB', 'RE': 'SYM', 'Cc': 'SYM', 'CC': 'SYM',
                 'Start': 'RB', 'All': 'RB', 'PLEASE': 'RB', 'Request': 'RB', 'Add': 'RB',
                 'Need': 'RB', 'Completed': 'VB', 'To': 'RB', 'Dear': 'RB', 'Thank': 'RB',
                 'You': 'PRP', 'We': 'PRP', 'Here': 'RB', 'Team': 'RB', 'Please': 'UH',
                 'Thanks': 'UH', 'Regards': 'UH', 'See': 'VB', 'Test': 'VB', 'ASAP': 'SYM',
                 'Sent': 'VB', 'mailto': 'SYM', 'Together': 'RB', 'Is': 'VB', 'AS': 'RB',
                 'Financial Services Strategic Business Unit': 'NNP', 'fax': 'RB',
                 'mobile': 'RB', 'except': 'RB', 'date': 'RB', 'new': 'RB', 'courier': 'RB',
                 'extn': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        #pos = pos_tag(toks)
        pos = tagger.tag(toks)
        print pos
        chunked_nes = ne_chunk(pos)
        nes = [' '.join(map(lambda x: x[0], ne.leaves()))
               for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
        #data.append(nes)
        print nes
        mywriter.writerow(nes)
# import all necessary libraries
from nltk.stem import PorterStemmer
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import stopwords
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()

# define each step
pipeline1 = [lambda s: re.sub('[^\w\s]', '', s),   # remove special characters
             lambda s: re.sub('[\d]', '', s),      # remove numbers
             lambda s: s.lower(),                  # lower case
             lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s))),  # remove stop words
             lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))                   # stem (using Porter stemmer)
             ]

pipeline2 = [lambda s: re.sub('[^\w\s]', '', s),
             lambda s: re.sub('[\d]', '', s),
             lambda s: s.lower(),
             lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s)))
             ]

stopword_removal_pipeline = [lambda s: ' '.join(filter(lambda s: not (s in stopwords.words()), tokenizer.tokenize(s)))]


# pipeline handling
def preprocess_text(text, pipeline):
    return text if len(pipeline) == 0 else preprocess_text(pipeline[0](text), pipeline[1:])
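A usage sketch for the pipelines above; the sample sentence is made up, and the exact output depends on the NLTK stop-word lists and Porter stemmer version.

# Run the full pipeline (cleanup, lowercase, stop-word removal, stemming) on one string.
sample = "The quick brown foxes are jumping over the lazy dogs!"
print(preprocess_text(sample, pipeline1))
# e.g. -> 'quick brown fox jump lazi dog'
print(preprocess_text(sample, stopword_removal_pipeline))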
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged", r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
def __init__(self):
    self.tokenizer = SpaceTokenizer()
def get_test_docs(self):
    test_doc_ids = open(self.outdir + '/test_doc_ids', 'w+')
    test_docs = open(self.outdir + '/test_docs', 'w+')
    golden_data = open(self.outdir + '/test_doc_gold', 'w+')
    test_doc_list = []
    for doc in set(self.data_set_citations['publication_id']):
        if np.random.randint(0, 100) < 10 and doc not in self.zero_shot_doc_ids:
            test_doc_list.append(doc)
            test_doc_ids.write(str(doc) + '\n')
    logger.info(str(len(test_doc_list)) + ' test docs selected')

    pub_ids = list(self.data_set_citations['publication_id'])
    pos_tokens = 0
    neg_tokens = 0
    # to locate lines with relevant pubs
    for pub_id in test_doc_list:
        pub_text = self.full_text[str(pub_id) + '.txt']
        test_docs.write(pub_text + '\n')
        pub_text_tokens = list(SpaceTokenizer().tokenize(pub_text))
        pub_text_spans = list(SpaceTokenizer().span_tokenize(pub_text))
        cur_pos_tokens = 0
        cur_neg_tokens = len(pub_text_tokens)
        res_line = []
        rows = [pub_ids.index(i) for i in pub_ids if i == pub_id]
        for idx in rows:
            d_row = self.data_set_citations.loc[idx]
            for mention_text in d_row['mention_list']:
                mention_text = re.sub('\d', ' ', mention_text)
                # mention_text = re.sub('[^ ]- ', '', mention_text)
                mention_text_spans = list(SpaceTokenizer().span_tokenize(mention_text))
                index_finder_lower = findall_lower(mention_text, pub_text)
                found_indices = [idx for idx in index_finder_lower]
                for find_index in found_indices:
                    try:
                        if find_index != -1:
                            new_mention_text_spans = [(indices[0] + find_index, indices[1] + find_index)
                                                      for indices in mention_text_spans]
                            cur_pos_tokens += len(mention_text_spans)
                            res_line.append((pub_text_spans.index(new_mention_text_spans[0]),
                                             pub_text_spans.index(new_mention_text_spans[-1]),
                                             d_row['data_set_id'],
                                             d_row['publication_id']))
                    except:
                        pass
        res_line = list(set(res_line))
        if len(res_line) == 0:
            # no mentions at all
            res_line.append((-1, -1, 0, pub_id))
        i = 0
        for c in res_line:
            if i > 0:
                golden_data.write(' | ' + str(c[0]) + ' ' + str(c[1]) + ' ' + str(c[2]) + ' ' + str(c[3]))
            else:
                golden_data.write(str(c[0]) + ' ' + str(c[1]) + ' ' + str(c[2]) + ' ' + str(c[3]))
            i += 1
        golden_data.write('\n')
        pos_tokens += cur_pos_tokens
        neg_tokens += (cur_neg_tokens - cur_pos_tokens)

    test_doc_ids.close()
    test_docs.close()
    golden_data.close()
    logger.info(str(pos_tokens) + " pos tokens added.")
    logger.info(str(neg_tokens) + " neg tokens added.")
    logger.info("neg token percentage: {}".format(neg_tokens * 100 / (pos_tokens + neg_tokens)))
#print(question[1])
selected_k = []
for r in range(len(ranked_q)):
    pos = nltk.pos_tag(ranked_q[r])
    selective_pos = ['NN', 'VB']
    selective_pos_words = []
    for word, tag in pos:
        if tag in selective_pos:
            selective_pos_words.append((word, tag))
    selected_k.append(selective_pos_words)
#print(selected_k[1][0])

# In[14]:

from nltk.tokenize import SpaceTokenizer

tm = SpaceTokenizer()
to_rank = []
key_words = []
for i in range(len(ranked_q)):
    yn = 0
    #ranked_q[i][yn]
    question[i] = untokenize(question[i])
    yy = "_____"
    to_rank.append(tm.tokenize(ranked_q[i][0]))
    print("Q:", question[i].replace(to_rank[i][len(to_rank[i]) // 2], yy))
    print('Ans - ', to_rank[i][len(to_rank[i]) // 2])
    #quita = question[i].index(to_rank[i][len(to_rank[i])//2])
import nltk
import numpy
import os

# import all necessary libraries
from nltk.stem import PorterStemmer
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import stopwords
from functools import partial
#from gensim import corpora
#from gensim.models import TfidfModel
import re

# initialize the instances for various NLP tools
tokenizer = SpaceTokenizer()
stemmer = PorterStemmer()

# define steps
pipeline = [lambda s: re.sub('[\n]', '', s),
            lambda s: re.sub('[^\w\s]', '', s),
            lambda s: re.sub('[\d\n]', '', s),
            lambda s: s.lower(),
            lambda s: ' '.join(filter(lambda s: not (s in stopwords.words('english')), tokenizer.tokenize(s))),
            lambda s: ' '.join(map(lambda t: stemmer.stem(t), tokenizer.tokenize(s)))
            ]


# function that carries out the pipeline step-by-step
def preprocess_text(text, pipeline):
    if len(pipeline) == 0:
        return text
    else:
        return preprocess_text(pipeline[0](text), pipeline[1:])
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

lTokenizer = LineTokenizer()
print("Line tokenizer output :",
      lTokenizer.tokenize(
          "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. \nAnd I will have my vengeance, in this life or the next."
      ))

rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))
print("Word Tokenizer output :", word_tokenize(rawText))

tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :", tTokenizer.tokenize("This is a cooool #dummysmiley: :-) :-P <3"))
from nltk.tokenize import SpaceTokenizer
from gensim.models import Word2Vec
import nltk

nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('portuguese')
tokenizer = SpaceTokenizer()
wiki_tokenized = []
wiki_files = [
    "soccer_teams_wiki/resources/wikipedia_corinthians.txt",
    "soccer_teams_wiki/resources/wikipedia_palmeiras.txt",
    "soccer_teams_wiki/resources/wikipedia_portuguesa.txt",
    "soccer_teams_wiki/resources/wikipedia_santos.txt",
    "soccer_teams_wiki/resources/wikipedia_sao_paulo.txt"
]

for file in wiki_files:
    with open(file, "r") as wiki_file:
        wiki_text = wiki_file.readlines()
    # TODO text cleanup. Remove stop words and other text treatment for articles
    for line in wiki_text:
        phrase = [word.lower() for word in tokenizer.tokenize(line) if word not in stop_words]
        wiki_tokenized.append(phrase)

our_model = Word2Vec(wiki_tokenized,
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
print chunked_nes

nes = [' '.join(map(lambda x: x[0], ne.leaves()))
       for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
print nes

'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
'''

'''
d = cmudict.dict()
from nltk.tag import pos_tag
import nltk.tokenize
from nltk.corpus import cmudict
from wordgen import gen_word
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

sentence = "who is Mahatma Gandhi visiting I'm HIS PRETTY GIRLFRIEND a Denny's McDonalds in broad daylight Shtruus"
tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(sentence)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
print chunked_nes

nes = [' '.join(map(lambda x: x[0], ne.leaves()))
       for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
print nes

'''
qry = "who is Mahatma Gandhi"
tokens = nltk.tokenize.word_tokenize(qry)
pos = nltk.pos_tag(tokens)
sentt = nltk.ne_chunk(pos, binary = False)
print sentt
person = []
for subtree in sentt.subtrees(filter=lambda t: t.node == 'PERSON'):
    for leave in subtree.leaves():
        person.append(leave)
print "person=", person
# Chapter 3: Preprocessing - Tokenization - Using NLTK's built-in tokenizers
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# LineTokenizer (split into lines)
lTokenizer = LineTokenizer()
print("Line tokenizer output :",
      lTokenizer.tokenize(
          "My name is" +
          "Maximus Decimus Meridius, commander of the Armies of the North, " +
          "General of the Felix Legions and loyal servant to the true emperor," +
          "Marcus Aurlius. \nFather to a murdered son, husband to a murdered" +
          "wife. \nAnd I will have my vengeance, in this life or the next."))

# SpaceTokenizer (split on space characters)
rawText = "By 11 o'clock on sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :", sTokenizer.tokenize(rawText))

# word_tokenize (split into words and punctuation)
print("Word tokenizer output :", word_tokenize(rawText))

# TweetTokenizer (used for handling special characters)
tTokenizer = TweetTokenizer()
print("Tweet tokenizer output :", tTokenizer.tokenize("This is a coooool" + "#dummysmiley: :-) :-P <3"))
#import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

logs = pn.read_csv("logs_enhanced.csv", encoding='latin-1')
messages = logs['message_utf8']
print messages[0] + '\n' + messages[1]
ml = len(messages)
print ml  # 34467

itr = 0
tokenized_messages = []
tokenizer = SpaceTokenizer()
while (itr < 10):
    try:
        if (messages[itr][0:1] == "`"):
            itr += 1
            print "code found"
        else:
            tokenized_messages.append(tokenizer.tokenize(messages[itr]))
            itr += 1
            print itr
    except TypeError:
        print "Skipped"
        itr += 1
        print itr
def get_train_data(self, MAX_LENGTH=60, full_neg=True):
    ## MAX_LENGTH: max length of segments to be split into
    ## neg ratio: how many neg data to use (out of 100), should be an integer
    ## full_neg: whether to extract all neg data
    max_length_token = MAX_LENGTH

    ## avoid taking docs from test set
    test_doc_ids = []
    zero_shot_doc_ids = []
    with open('../data/all_test_docs/test_doc_ids') as f:
        fl = f.readlines()
        test_doc_ids = [int(line.strip()) for line in fl]
    with open('../data/all_test_docs/zero_shot_doc_ids') as f:
        fl = f.readlines()
        zero_shot_doc_ids = [int(line.strip()) for line in fl]

    train_doc_len = len(set(self.data_set_citations['publication_id'].values)) - len(test_doc_ids) - len(zero_shot_doc_ids)
    logger.info('sample from ' + str(train_doc_len) + ' train docs')

    pos_count = 0
    neg_count = 0
    pos_tokens = 0
    neg_tokens = 0
    sampled = []
    with codecs.open(self.outdir + 'pos_data', 'w') as pos_data, codecs.open(self.outdir + 'neg_data', 'w') as neg_data:
        for index, row in self.data_set_citations.iterrows():
            pub_id = row['publication_id']
            if pub_id in zero_shot_doc_ids or pub_id in test_doc_ids:
                continue
            if pub_id in sampled:
                continue
            else:
                sampled.append(pub_id)

            pub_ids = list(self.data_set_citations['publication_id'])
            rows = [pub_ids.index(i) for i in pub_ids if i == pub_id]
            mention_list = []
            for r in rows:
                d_row = self.data_set_citations.loc[r]
                mention_list.extend(d_row['mention_list'])
            mention_list = set(mention_list)
            logger.info('pub id: {}, mentions: {}'.format(pub_id, len(mention_list)))

            sample_text = self.full_text[str(pub_id) + '.txt']
            sample_text_tokens = list(SpaceTokenizer().tokenize(sample_text))
            sample_text_spans = list(SpaceTokenizer().span_tokenize(sample_text))

            pos_splits = []
            for mention_text in mention_list:
                mention_text = re.sub('\d', ' ', mention_text)
                # mention_text = re.sub('[^ ]- ', '', mention_text)
                mention_text_spans = list(SpaceTokenizer().span_tokenize(mention_text))
                index_finder_lower = findall_lower(mention_text, sample_text)
                all_found_indices = [idx for idx in index_finder_lower]
                for find_index in all_found_indices:
                    try:
                        if find_index != -1:
                            # logger.info('Found: ' + mention_text)
                            new_mention_text_spans = [(indices[0] + find_index, indices[1] + find_index)
                                                      for indices in mention_text_spans]
                            # write to training sample pointers here
                            for splits in range(len(sample_text_tokens) // max_length_token - 1):
                                if sample_text_spans.index(new_mention_text_spans[0]) > splits * (max_length_token) and \
                                   sample_text_spans.index(new_mention_text_spans[-1]) < (splits + 1) * (max_length_token):
                                    pos_splits.append(splits)
                                    pos_count += 1
                                    pos_tokens += len(new_mention_text_spans)
                                    neg_tokens += (MAX_LENGTH - len(new_mention_text_spans))
                                    # TODO Wrapper over full data reader
                                    pos_data.write(
                                        str(sample_text_spans.index(new_mention_text_spans[0]) - splits * (max_length_token)) + ' ' +
                                        str(sample_text_spans.index(new_mention_text_spans[-1]) - splits * (max_length_token)) + ' ' +
                                        str(row['data_set_id']) + ' ' +
                                        str(row['publication_id']) + ' ' +
                                        ' '.join(sample_text_tokens[splits * (max_length_token):(splits + 1) * (max_length_token) + 1]) + '\n')
                        else:
                            # print ('Annotation Error: Annotated gold standards not correct')
                            pass
                    except:
                        # print ('Indexing Logic Error: Some corner index case missed while parsing')
                        pass

            if not full_neg:
                continue
            ## NOTE: index starts from 0
            ## -1 - 1 means no mention
            for splits in range(len(sample_text_tokens) // (max_length_token) - 1):
                if splits not in pos_splits:
                    neg_data.write(
                        str(-1) + ' ' + str(-1) + ' ' + str(0) + ' ' + str(row['publication_id']) + ' ' +
                        ' '.join(sample_text_tokens[splits * (max_length_token):(splits + 1) * (max_length_token)]) + '\n')
                    neg_count += 1
                    neg_tokens += MAX_LENGTH

    logger.info(str(pos_count) + " mentions added.")
    logger.info(str(neg_count) + " no mentions added.")
    logger.info(str(pos_tokens) + " pos tokens added.")
    logger.info(str(neg_tokens) + " neg tokens added.")
    logger.info("neg token percentage: {}".format(neg_tokens * 100 / (pos_tokens + neg_tokens)))
from nltk.tokenize import SpaceTokenizer

tokenizer = SpaceTokenizer()
start_tag = 0


def write_sentence_in_format(sentence, input_file, out):
    list_of_words_from_sentence = tokenize(sentence)
    write(list_of_words_from_sentence, out, input_file)


def tokenize(sentence):
    return tokenizer.tokenize(sentence)


def write(list_of_words, out, input_file):
    global start_tag
    for line in list_of_words:
        # write line to output file
        pass


def write_value(sentence, value, out):
    out.write(sentence + " " + value + "\n")
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# Line tokenizer
longSentence = 'My name is Maximus Decimus Meridius, Commander of the Armies '\
               'of the North, General of the Felix Legions, loyal servant to '\
               'the true emperor, Marcus Aurelius. Father to a murdered son, '\
               'husband to a murdered wife. And I will have my vengeance, in '\
               'this life or the next.'
lTokenizer = LineTokenizer()
sentenceTokens = lTokenizer.tokenize(longSentence)
print(sentenceTokens)

# Space tokenizer
sTokenizer = SpaceTokenizer()
spaceTokens = sTokenizer.tokenize(longSentence)
print(spaceTokens)

# Tweet tokenizer
tweet = 'This is a coool #dummysmiley: :-) :) :-P <3'
tTokenizer = TweetTokenizer()
tTokens = tTokenizer.tokenize(tweet)
print('Tweet tokenizer output:')
print(tTokens)

# Word tokenizer
wTokenizer = word_tokenize(longSentence)
print(wTokenizer)

################
indx = '\n'.join(res)
print("\nThe sentences containing '" + inp + "'" + " are : \n" + indx)

# conversations containing input
con = re.findall(r'"(?:(?:(?!(?<!\\)").)*)"', str(res))
indx2 = '\n'.join(con)
print("\nThe conversations containing '" + inp + "'" + " are : \n" + indx2)

# count of conversations
count = len(list(filter(lambda x: inp in x, con)))
print("\nThe count of conversations containing '" + inp + "'" + " are :\n" + str(count))

# All conversations in the excerpt
allconv = re.findall(r'"(.*?)"', str(token_text))
indx3 = '\n'.join(allconv)
print("\nThe conversations in the excerpt are : \n" + indx3)

from nltk.tag import pos_tag

tagged_sent = pos_tag(text_string.split())
#propernouns = [word for word,pos in tagged_sent if pos == 'NNP']
#print( propernouns)

from nltk.tree import Tree
from nltk import pos_tag, ne_chunk
from nltk.tokenize import SpaceTokenizer

tokenizer = SpaceTokenizer()
toks = tokenizer.tokenize(text_string)
pos = pos_tag(toks)
chunked_nes = ne_chunk(pos)
nes = [' '.join(map(lambda x: x[0], ne.leaves()))
       for ne in chunked_nes if isinstance(ne, Tree)]
indx4 = '\n'.join(nes)
print("\nProper nouns used in the excerpt are:\n", indx4)
# -*- coding: utf-8 -*-
from nltk.tokenize import SpaceTokenizer
import sys

s = sys.argv[1].decode('utf-8')
dt = sys.argv[2]
#print "dt = "+dt

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ" ' '
tokens = []
if (dt == " "):
    tokens = SpaceTokenizer().tokenize(s)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \t ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ" '\t'
from nltk.tokenize import TabTokenizer
if (dt == '\\t'):
    print "dt = "+dt
    s = s.replace(u'\\t', '\t')
    tokens = TabTokenizer().tokenize(s)

#python telugu_tokenizer.py "భారతదేశపు దక్షిణ \tసముద్ర \nప్రాంతంలో అది \n ‘పార్సన్స్ పిగ్మాలియన్’ ప్రదేశ" '\n'
from nltk.tokenize import LineTokenizer
if (dt == '\\n'):
    s = s.replace(u'\\n', '\n')
import os
import glob
import re
import nltk.tag, nltk.data
from nltk import ne_chunk
from nltk.tokenize import SpaceTokenizer

os.chdir("D:/R/BOA/txtfiles")
for fileName in glob.glob("*.txt"):
    count = 0
    file = open('D:/R/BOA/txtfiles/' + fileName, 'r')
    filew = open('D:/R/BOA/Noun/' + fileName, "wb")
    for line in file:
        count = count + 1
        print count
        print line
        line = re.sub('\\f', '', line)
        #line = line.decode("utf-8")
        line = unicode(line, errors='ignore')
        tokenizer = SpaceTokenizer()
        toks = tokenizer.tokenize(line)
        default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)
        model = {'Consumer': 'RB'}
        tagger = nltk.tag.UnigramTagger(model=model, backoff=default_tagger)
        #pos = pos_tag(toks)
        pos = tagger.tag(toks)
        print pos
        chunked_nes = ne_chunk(pos)
        nes = [' '.join(map(lambda x: x[0], ne.leaves()))
               for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]
        #data.append(nes)
        print nes
        filew.write((','.join(nes)) + '\n')
        #filew.write("yeah its me")