class TextCleaner(object):
    """Clean multi-sentence text with NLTK's Punkt tokenizers.

    Takes in an iterable / sequence of multi-sentence text and returns
    cleaned text as requested.

    Author note: the goal is to have standardized text cleaning utilities
    that can be used for any text application with multi-language support.
    """

    def __init__(self, language='english'):
        """Load the Punkt sentence tokenizer for *language*.

        Args:
            language: Name of the Punkt pickle to load from
                ``tokenizers/punkt/``, e.g. ``'english'``.
        """
        self.tokenizer = nltk.data.load(
            'tokenizers/punkt/' + language + '.pickle')
        self.punkt_word_tokenizer = PunktWordTokenizer()

    def sentence_tokenize(self, text):
        """Split *text* into sentences.

        The result is stored on ``self.sentences`` and also returned.
        """
        self.sentences = self.tokenizer.tokenize(text)
        return self.sentences

    def remove_stop(self, sentence):
        """Tokenize *sentence* into words and return the tokens.

        NOTE(review): despite the name, no stop-word filtering has been
        implemented yet; the tokens are returned unfiltered. The previous
        code returned an undefined name here.
        """
        return self.punkt_word_tokenizer.tokenize(sentence)
class Cleaner:
    """Re-align gold-standard token offsets with parsed sentence tokens.

    Gold rows are read once at construction; parser output for each
    article is loaded on demand and cached in ``self.data_dict``.
    """

    def __init__(self, input_file, basedir):
        """Read the gold rows from *input_file*.

        Args:
            input_file: Path of the whitespace-separated gold file.
            basedir: Directory holding the ``<article>.raw.xml`` files.
        """
        self.original_data = self.open_gold_data(input_file)
        self.basedir = basedir
        self.data_dict = {}
        self.tokenizer = PunktWordTokenizer()

    def open_and_parse_xml_file(self, file_name):
        """Return ``parse_parser_xml_results`` applied to the file's text."""
        with open(file_name, "r") as f_in:
            return parse_parser_xml_results(f_in.read())

    def update_cache(self, file_name):
        """Parse ``<basedir>/<file_name>.raw.xml`` and cache the result."""
        # The cache lives on the instance; a bare `data_dict` is undefined.
        self.data_dict[file_name] = self.open_and_parse_xml_file(
            os.path.join(self.basedir, file_name + ".raw.xml"))

    def open_gold_data(self, gold_file):
        """Read *gold_file* into a list of ``FeatureRow`` objects.

        Blank lines are skipped. Rows with exactly 11 fields are padded
        with three empty strings, every other row with two.
        """
        original_data = []
        with open(gold_file, "r") as f_in:
            for line in f_in:
                fields = line.rstrip().split()
                if not fields:
                    continue
                if len(fields) == 11:
                    fields.extend(["", "", ""])
                else:
                    fields.extend(["", ""])
                original_data.append(FeatureRow(*fields))
        return original_data

    def get_correct_offset(self, token, sentence, offset_begin, offset_end):
        """Slide the offset window right until it covers *token*.

        Args:
            token: Token text with ``_`` separating its words.
            sentence: Sequence of sentence tokens to search.
            offset_begin: Initial start index of the window.
            offset_end: Initial (exclusive) end index of the window.

        Returns:
            ``(begin, end)`` such that ``sentence[begin:end]`` equals the
            tokenized *token*.

        Raises:
            IndexError: If the window runs past the end of *sentence*
                without matching.
        """
        token_list = self.tokenizer.tokenize(" ".join(token.split("_")))
        # Widen the window when the token has more words than it spans.
        if len(token_list) > offset_end - offset_begin:
            offset_end = len(token_list) + offset_begin
        while token_list != sentence[offset_begin:offset_end]:
            offset_begin += 1
            offset_end += 1
            # offset_end == len(sentence) is still a valid final window,
            # so only give up once the window extends past the end.
            if offset_end > len(sentence):
                raise IndexError("{:d} invalid index, token={:s}".format(
                    offset_end, token))
        return (offset_begin, offset_end)

    def build_new_data(self):
        """Print every gold token whose corrected offsets differ."""
        for fr in self.original_data:
            curr_article = fr.article
            try:
                nlp_data = self.data_dict[curr_article]
            except KeyError:
                self.update_cache(curr_article)
                nlp_data = self.data_dict[curr_article]
            # Gold fields are strings; convert once so the comparison
            # below is int-to-int instead of always unequal.
            old_offsets = (int(fr.offset_begin), int(fr.offset_end))
            new_offsets = self.get_correct_offset(
                fr.token,
                nlp_data["sentences"][int(fr.sentence)]["text"],
                old_offsets[0], old_offsets[1])
            if new_offsets != old_offsets:
                print("{} {}".format(fr.token, new_offsets))
class Cleaner:
    """Re-align gold-standard token offsets with parsed sentence tokens.

    Gold rows are read once at construction; parser output for each
    article is loaded on demand and cached in ``self.data_dict``.
    """

    def __init__(self, input_file, basedir):
        """Read the gold rows from *input_file*.

        Args:
            input_file: Path of the whitespace-separated gold file.
            basedir: Directory holding the ``<article>.raw.xml`` files.
        """
        self.original_data = self.open_gold_data(input_file)
        self.basedir = basedir
        self.data_dict = {}
        self.tokenizer = PunktWordTokenizer()

    def open_and_parse_xml_file(self, file_name):
        """Return ``parse_parser_xml_results`` applied to the file's text."""
        with open(file_name, "r") as f_in:
            return parse_parser_xml_results(f_in.read())

    def update_cache(self, file_name):
        """Parse ``<basedir>/<file_name>.raw.xml`` and cache the result."""
        # The cache lives on the instance; a bare `data_dict` is undefined.
        xml_path = os.path.join(self.basedir, file_name + ".raw.xml")
        self.data_dict[file_name] = self.open_and_parse_xml_file(xml_path)

    def open_gold_data(self, gold_file):
        """Read *gold_file* into a list of ``FeatureRow`` objects.

        Blank lines are skipped. Rows with exactly 11 fields are padded
        with three empty strings, every other row with two.
        """
        original_data = []
        with open(gold_file, "r") as f_in:
            for line in f_in:
                fields = line.rstrip().split()
                if not fields:
                    continue
                if len(fields) == 11:
                    fields.extend(["", "", ""])
                else:
                    fields.extend(["", ""])
                original_data.append(FeatureRow(*fields))
        return original_data

    def get_correct_offset(self, token, sentence, offset_begin, offset_end):
        """Slide the offset window right until it covers *token*.

        Args:
            token: Token text with ``_`` separating its words.
            sentence: Sequence of sentence tokens to search.
            offset_begin: Initial start index of the window.
            offset_end: Initial (exclusive) end index of the window.

        Returns:
            ``(begin, end)`` such that ``sentence[begin:end]`` equals the
            tokenized *token*.

        Raises:
            IndexError: If the window runs past the end of *sentence*
                without matching.
        """
        token_list = self.tokenizer.tokenize(" ".join(token.split("_")))
        # Widen the window when the token has more words than it spans.
        if len(token_list) > offset_end - offset_begin:
            offset_end = len(token_list) + offset_begin
        while token_list != sentence[offset_begin:offset_end]:
            offset_begin += 1
            offset_end += 1
            # offset_end == len(sentence) is still a valid final window,
            # so only give up once the window extends past the end.
            if offset_end > len(sentence):
                raise IndexError("{:d} invalid index, token={:s}".format(
                    offset_end, token))
        return (offset_begin, offset_end)

    def build_new_data(self):
        """Print every gold token whose corrected offsets differ."""
        for fr in self.original_data:
            curr_article = fr.article
            try:
                nlp_data = self.data_dict[curr_article]
            except KeyError:
                self.update_cache(curr_article)
                nlp_data = self.data_dict[curr_article]
            # Gold fields are strings; convert once so the comparison
            # below is int-to-int instead of always unequal.
            old_offsets = (int(fr.offset_begin), int(fr.offset_end))
            sentence_tokens = nlp_data["sentences"][int(fr.sentence)]["text"]
            new_offsets = self.get_correct_offset(
                fr.token, sentence_tokens, old_offsets[0], old_offsets[1])
            if new_offsets != old_offsets:
                print("{} {}".format(fr.token, new_offsets))
def tokenize(t, stop_language='french'):
    """Return the lower-cased words of *t* without stop words or punctuation.

    The text is split into sentences with ``sent_tokenize`` and each
    sentence into words with a ``PunktWordTokenizer``.

    Args:
        t: Text to tokenize.
        stop_language: Language name passed to ``stopwords.words``;
            defaults to ``'french'``, the value previously hard-coded.

    Returns:
        List of lower-cased tokens that are neither in the stop-word list
        nor found in ``punctuation``.
    """
    tokenizer = PunktWordTokenizer()
    # Load the stop-word list once; it was previously re-read per word.
    stop_words = set(stopwords.words(stop_language))

    refined_words = []
    for sentence in sent_tokenize(t):
        for token in tokenizer.tokenize(sentence):
            word = token.lower()
            if word not in stop_words and word not in punctuation:
                refined_words.append(word)
    return refined_words
def count_word_ngrams(n, processed_string):
    """Count every word n-gram in *processed_string*.

    Args:
        n: Number of words per n-gram; must be at least 1.
        processed_string: Text to tokenize with ``PunktWordTokenizer``.

    Returns:
        A ``defaultdict(int)`` mapping each space-joined n-gram to the
        number of times it occurs.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {!r}".format(n))

    tokens = PunktWordTokenizer().tokenize(processed_string)
    ngram_counts_dict = defaultdict(int)
    # Stop at the last full window so shorter tails are not counted as
    # n-grams, and accumulate rather than overwrite each count.
    for start in range(len(tokens) - n + 1):
        ngram = ' '.join(tokens[start:start + n])
        ngram_counts_dict[ngram] += 1
    return ngram_counts_dict
# Assign the current record to the train/dev/test output file at random.
partition = random.random()
if partition < train_fraction:
    f = train
elif partition < (train_fraction + dev_fraction):
    f = dev
else:
    f = test

# Write the entities; on failure fall back to their ASCII-encoded form.
try:
    ents_str = ','.join(ents) + ', '
    f.write(ents_str)
except Exception:
    # NOTE(review): if the join itself raised, ents_str is unbound (or
    # stale from an earlier record) at this point -- confirm that only
    # the write is expected to fail.
    ents_str = "".join(asciiEnts(ents_str))
    f.write(ents_str)

title = " ".join(cleanEnts(tokenizer.tokenize(title)))
f.write(title + '\n')
f.write(tagged_body + '\n')
f.flush()

# NOTE(review): presumably these run once after the enclosing loop, which
# is outside this view -- confirm the nesting against the full file.
train.close()
dev.close()
test.close()
class corpusParser():
    """Count vocabulary word frequencies and windowed co-occurrences.

    Counts are accumulated in ``self._wordcount`` and ``self._cooccur``
    and written to ``wordcount.txt`` / ``cooccurance.txt`` in the output
    directory by ``writeResult``.
    """

    def __init__(self, lang, vocab_dir, corpus_dir, window_size, output_dir):
        """Store the configuration and build the tokenizers.

        Args:
            lang: NOTE(review): accepted but not used; the language value
                handed to the stemmer is fixed at 0 -- confirm intent.
            vocab_dir: Path of the tab-separated vocabulary file.
            corpus_dir: Root directory of the corpus.
            window_size: Co-occurrence window in tokens; -1 means the
                rest of the document.
            output_dir: Directory the result files are written to.
        """
        self._lang = 0
        self._vocab_dir = vocab_dir
        self._corpus_dir = corpus_dir
        self._window_size = window_size
        self._output_dir = output_dir
        self._stemmer = Snowball()
        self._sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self._word_tokenizer = PunktWordTokenizer()
        self._cooccur = defaultdict()
        self._wordcount = defaultdict()
        self._vocab = set()
        self._doc_num = 0

    def loadVocab(self):
        """Read the vocabulary and zero-initialise all counters.

        The word is taken from the second tab-separated column of each
        non-blank line. Co-occurrence cells are created only for ordered
        pairs ``word < word2``.
        """
        with open(self._vocab_dir, 'r') as vocabfile:
            for line in vocabfile:
                line = line.strip()
                if not line:
                    continue
                self._vocab.add(line.split('\t')[1])

        # Initialize wordcount and cooccur
        for word in self._vocab:
            self._wordcount[word] = 0
            self._cooccur[word] = defaultdict()
            for word2 in self._vocab:
                if word2 > word:
                    self._cooccur[word][word2] = 0

    def parseDoc(self, doc_raw):
        """Update word and co-occurrence counts from one raw document.

        Args:
            doc_raw: Document text; it is sentence-split, word-tokenized
                and every token is passed through the stemmer.
        """
        tokens = []
        for sent in self._sent_tokenizer.tokenize(doc_raw):
            for token in self._word_tokenizer.tokenize(sent):
                tokens.append(self._stemmer(self._lang, token))

        tokens_len = len(tokens)
        for index1, w1 in enumerate(tokens):
            if w1 not in self._vocab:
                continue
            self._wordcount[w1] += 1
            if self._window_size == -1:
                index_end = tokens_len
            else:
                index_end = min(tokens_len, index1 + self._window_size)
            for index2 in range(index1 + 1, index_end):
                w2 = tokens[index2]
                if w2 not in self._vocab:
                    continue
                # Pairs are stored under the lexicographically smaller
                # word; identical words are not counted.
                if w1 < w2:
                    self._cooccur[w1][w2] += 1
                elif w1 > w2:
                    self._cooccur[w2][w1] += 1

    def _read_doc(self, path):
        """Return the file as one string of stripped, lower-cased lines,
        each prefixed with a single space."""
        with open(path, 'r') as infile:
            return "".join(" " + line.strip().lower() for line in infile)

    def _report_progress(self, doc_count):
        """Print a progress line after every 1000 parsed documents."""
        if doc_count % 1000 == 0:
            print("Finish parsing {} documents!".format(doc_count))

    def _finish(self, doc_count, announce_write=True):
        """Record the document total, report it and write the results."""
        self._doc_num = doc_count
        print("Total number of docunments:  {}".format(doc_count))
        if announce_write:
            print("writing results!")
        self.writeResult()

    def parseCorpus20news(self):
        """Parse every file below ``<corpus_dir>/train`` and ``/test``."""
        print("Loading vocab")
        self.loadVocab()
        doc_count = 0
        print("Parsing corpus")
        data_folders = [self._corpus_dir + "/train", self._corpus_dir + "/test"]
        print(data_folders)
        for data_folder in data_folders:
            # NOTE(review): "*^tgz" only matches names that literally end
            # in "^tgz"; glob has no negation -- confirm the intent.
            for folder in glob("%s/*^tgz" % data_folder):
                for ff in glob("%s/*" % folder):
                    doc_count += 1
                    self.parseDoc(self._read_doc(ff))
                    self._report_progress(doc_count)
        self._finish(doc_count)

    def parseCorpusNyt(self):
        """Parse the files below ``<corpus_dir>/<year>/<MM>`` for 1987-1996."""
        print("Loading vocab")
        self.loadVocab()
        doc_count = 0
        print("Parsing corpus")
        years = ["1987", "1988", "1989", "1990", "1991",
                 "1992", "1993", "1994", "1995", "1996"]
        # This method previously printed an undefined name; report the
        # per-year folders that are about to be scanned instead.
        data_folders = [self._corpus_dir + "/" + year for year in years]
        print(data_folders)
        for folder_year in data_folders:
            for month in glob("%s/[0-9][0-9]" % folder_year):
                # NOTE(review): glob(month) yields just the month folder,
                # so "day" is the month itself -- confirm the layout.
                for day in glob("%s" % month):
                    for ff in glob("%s/*" % day):
                        doc_count += 1
                        self.parseDoc(self._read_doc(ff))
                        self._report_progress(doc_count)
        self._finish(doc_count)

    def parseCorpusWiki(self):
        """Parse ``<doc ...> ... </doc>`` delimited documents.

        Every file two levels below the corpus directory is read line by
        line; the text between a ``<doc`` line and a ``</doc>`` line is
        handed to ``parseDoc``.
        """
        print("Loading vocab")
        self.loadVocab()
        print("Parsing corpus")
        doc_count = 0
        file_count = 0
        # Defined up front so text before the first "<doc" line trips the
        # assertion below instead of raising a NameError.
        doc_flag = False
        doc_parts = []
        for folder in glob("%s/*" % self._corpus_dir):
            for ff in glob("%s/*" % folder):
                with open(ff, 'r') as infile:
                    file_count += 1
                    if file_count % 100 == 0:
                        print("Finish parsing {} files or  {} documents!".format(
                            file_count, doc_count))
                    for line in infile:
                        line = line.strip().lower()
                        if line.startswith("<doc"):
                            doc_count += 1
                            doc_flag = True
                            doc_parts = []
                        elif line.startswith("</doc>"):
                            doc_flag = False
                            self.parseDoc("".join(doc_parts))
                        else:
                            assert doc_flag, "text outside of a <doc> element"
                            doc_parts.append(" " + line)
        self._finish(doc_count, announce_write=False)

    def writeResult(self):
        """Write the word counts and the non-zero co-occurrence counts."""
        # write wordcount
        with open(self._output_dir + "/wordcount.txt", 'w') as outfile:
            for word, count in self._wordcount.items():
                outfile.write(word + "\t" + str(count) + "\n")

        # write co-occurrence counts
        with open(self._output_dir + "/cooccurance.txt", 'w') as outfile:
            for w1, row in self._cooccur.items():
                for w2, count in row.items():
                    if count != 0:
                        outfile.write(w1 + "\t" + w2 + "\t" + str(count) + "\n")
def my_token(s):
    """Tokenize *s* with a freshly created ``PunktWordTokenizer``.

    Returns whatever that tokenizer's ``tokenize`` method returns for *s*.
    """
    return PunktWordTokenizer().tokenize(s)
# Demonstrates four NLTK tokenizers on the same sample text.
# print() is called with a single argument throughout so the output is the
# same under Python 2 and Python 3.
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

# Sentence tokenization
from nltk.tokenize import sent_tokenize
sent_tokenize_list = sent_tokenize(text)
print("\nSentence tokenizer:")
print(sent_tokenize_list)

# Create a new word tokenizer
from nltk.tokenize import word_tokenize
print("\nWord tokenizer:")
print(word_tokenize(text))

# Create a new punkt word tokenizer
from nltk.tokenize import PunktWordTokenizer
punkt_word_tokenizer = PunktWordTokenizer()
print("\nPunkt word tokenizer:")
print(punkt_word_tokenizer.tokenize(text))

# Create a new WordPunct tokenizer
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
print("\nWord punct tokenizer:")
print(word_punct_tokenizer.tokenize(text))
len(word_tokens)  # Returns the number of words in the tokenized list of text
len(sentence_tokens)  # Returns the number of sentences in the tokenized list of text
word_unique = list(set(word_tokens))  # Eliminates duplicated words in the tokenized list

# Word tokenization details
# When tokenizing words, punctuation and contraction symbols receive special treatment:
nltk.word_tokenize('Hello World.')  # Returns ['Hello', 'World', '.']
nltk.word_tokenize("can't")  # Returns ['ca', "n't"]

# Word tokenization alternatives

# PunktWordTokenizer
# Splits on punctuation, but keeps it with the word
from nltk.tokenize import PunktWordTokenizer  # Imports the tokenizer
tokenizer = PunktWordTokenizer()  # Instantiates the tokenizer
tokenizer.tokenize("Can't is a contraction")  # Returns ['Can', "'t", 'is', 'a', 'contraction']

# WordPunctTokenizer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction")  # Returns ['Can', "'", 't', 'is', 'a', 'contraction']

# Tokenizing (sentences) in different languages (Spanish)
para = "Hola amigos. Gracias por ver este video. Saludos"  # Defines the text to tokenize
tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')  # Loads the Spanish sentence tokenizer
print(tokenizer.tokenize(para))  # Tokenizes the text

# Tokenize based on lines, spaces or tweets (special class)
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize
# Runs four NLTK tokenizers over one sample paragraph and prints each result
# under its own heading.
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."


def _report(heading, tokenize):
    """Print *heading*, then the tokens *tokenize* produces for ``text``."""
    print(heading)
    print(tokenize(text))


# Sentences first; the list is kept in a module-level name.
from nltk.tokenize import sent_tokenize
sent_tokenize_list = sent_tokenize(text)
print("\nSentence tokenizer:", sent_tokenize_list, sep="\n")

# Plain word tokenizer function.
from nltk.tokenize import word_tokenize
_report("\nWord tokenizer:", word_tokenize)

# Punkt word tokenizer instance.
from nltk.tokenize import PunktWordTokenizer
punkt_word_tokenizer = PunktWordTokenizer()
_report("\nPunkt word tokenizer:", punkt_word_tokenizer.tokenize)

# WordPunct tokenizer instance.
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
_report("\nWord punct tokenizer:", word_punct_tokenizer.tokenize)
'''
Created on 06/05/2013

@author: Rodrigo
'''
from nltk.tokenize import word_tokenize, PunktWordTokenizer, RegexpTokenizer

# print() is called with a single argument so the output is the same under
# Python 2 and Python 3.
print(word_tokenize("Hello word!"))
print(word_tokenize("We can't do this"))

tokenizer = PunktWordTokenizer()
print(tokenizer.tokenize("We can't do this"))

# Raw strings keep the regex escapes from being treated as (invalid)
# string escapes; the pattern text itself is unchanged.
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("We can't do this"))

# Split instead of findall
tokenizer = RegexpTokenizer(r"\s+", gaps=True)
print(tokenizer.tokenize("We can't do this"))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Tokenizes one sentence containing a contraction with PunktWordTokenizer
# and prints the resulting token list.
from nltk.tokenize import PunktWordTokenizer

tokenizer = PunktWordTokenizer()
result = tokenizer.tokenize(
    "Can't is a contraction."
)
print(result)
# Output noted by the author:
# ['Can', "'t", 'is', 'a', 'contraction.']
words3 = penn_tokenizer.tokenize("Hello World.")
# print() is called with a single argument so the output is the same under
# Python 2 and Python 3.
print(words1)
print(words2)
print(words3)

# <markdowncell>

# <p> Or, since we have already broken <em>para</em> into sentences, we can create the word list by tokenizing each
# sentence and creating a <strong>flatmap</strong> as shown here: </p>

# <codecell>

from nltk.tokenize import PunktWordTokenizer
words3 = [word for sentence in sentences for word in word_tokenize(sentence)]
punkt_tokenizer = PunktWordTokenizer()
words4 = [word for sentence in sentences for word in punkt_tokenizer.tokenize(sentence)]
print(words3 == words4)
print(words3)
print(words4)

# <markdowncell>

# <p>Notice that there are <em><strong>subtle differences in the output</strong></em>. The first example did not separate the '<strong>.</strong>' from the words <em>World</em>
# and <em>you</em>, whereas the
# second example did. Both accounted for the '<strong>.</strong>' after <em>NLTK</em>. I'm not sure why this is the case. Notice how both examples resulted in the splitting of
# <em>It's</em> into two words. It seems that the <strong>TreebankWordTokenizer</strong>, for which <strong>word_tokenize</strong> is a wrapper, changes its behavior
# when working on a whole paragraph versus individual sentences.</p>
#
# <p>But I don't want contractions split into separate words. Use a <strong>RegexpTokenizer</strong> as shown next:</p>

# <codecell>
# Compares how PunktWordTokenizer and WordPunctTokenizer split the same
# sentence, then exercises a small print helper.
from nltk.tokenize import PunktWordTokenizer

strExSentence = "Can't is a contraction."

# Punkt word tokenizer.
tokenizer = PunktWordTokenizer()
lstWordPunkt = tokenizer.tokenize(strExSentence)
print(lstWordPunkt)
# Output noted by the author:
# ['Can', "'t", 'is', 'a', 'contraction.']

# WordPunct tokenizer; the same names are reused for the second run.
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
lstWordPunkt = tokenizer.tokenize(strExSentence)
print(lstWordPunkt)
# Output noted by the author:
# ['Can', "'", 't', 'is', 'a', 'contraction', '.']


def fnTest(strArgument):
    """Print *strArgument* to standard output."""
    print(strArgument)


fnTest("mark")
# Sentence tokenizers loaded from the Punkt pickles (English, then Spanish).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')

texto = "Un buen reportaje puede ser tan fascinante e instructivo sobre el mundo real como un gran cuento o una magnífica novela. Si alguien lo pone en duda, le ruego que lea la crónica de Ioan Grillo Bring On the Wall que apareció en The New York Times el pasado 7 de mayo. Cuenta la historia del Flaco, un contrabandista mexicano que, desde que estaba en el colegio, a los 15 años, se ha pasado la vida contrabandeando drogas e inmigrantes ilegales a Estados Unidos. Aunque estuvo cinco años en la cárcel no se ha arrepentido del oficio que practica y menos ahora, cuando, dice, su ilícita profesión está más floreciente que nunca."
spanish_tokenizer.tokenize(texto)

# Different word tokenizers (and how to call their methods in NLTK).
# Results are not stored; the calls are meant to be inspected interactively.
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("This's a test")

from nltk.tokenize import PunktWordTokenizer
punkt_word_tokenizer = PunktWordTokenizer()
punkt_word_tokenizer.tokenize("this's a test")

from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
word_punct_tokenizer.tokenize("this's a test")

# Stemming with the Porter algorithm: each sample word is stemmed in turn.
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
for _sample in ('maximum', 'presumably', 'multiply',
                'provision', 'owed', 'ear'):
    porter_stemmer.stem(_sample)