Example #1
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import PunktWordTokenizer


class TextCleaner(object):
    """
    Takes in an iterable / sequence of multi-sentence text.

    Returns cleaned text as requested.

    Author Note:
    The goal is to have standardized text cleaning utilities that I can use for
    any text application with multi-language support.
    """

    def __init__(self, language='english'):
        self.language = language
        # Punkt sentence tokenizer for the requested language.
        self.tokenizer = nltk.data.load('tokenizers/punkt/' + language + '.pickle')
        self.punkt_word_tokenizer = PunktWordTokenizer()

    def sentence_tokenize(self, text):
        """Split a block of text into sentences."""
        self.sentences = self.tokenizer.tokenize(text)
        return self.sentences

    def remove_stop(self, sentence):
        """Tokenize a sentence and drop stopwords for the configured language."""
        words = self.punkt_word_tokenizer.tokenize(sentence)
        return [w for w in words if w.lower() not in stopwords.words(self.language)]
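
# A brief usage sketch (not part of the original snippet; the sample sentence is made up)
# showing how the class above might be driven. It assumes the NLTK punkt and stopwords
# data packages are installed.
cleaner = TextCleaner(language='english')
for sent in cleaner.sentence_tokenize("This is the first sentence. And here is another one."):
    print(cleaner.remove_stop(sent))

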
import os

from nltk.tokenize import PunktWordTokenizer

# FeatureRow and parse_parser_xml_results are project-specific helpers that the
# original project imports from elsewhere; they are not defined in this snippet.


class Cleaner:
    """Realigns gold-standard token offsets with the cached parser output for each article."""
    def __init__(self, input_file, basedir):
        self.original_data = self.open_gold_data(input_file)
        self.basedir = basedir
        self.data_dict = {}
        self.tokenizer = PunktWordTokenizer()

    def open_and_parse_xml_file(self, file_name):
        with open(file_name, "r") as f_in:
            return parse_parser_xml_results(f_in.read())

    def update_cache(self, file_name):
        self.data_dict[file_name] = self.open_and_parse_xml_file(
            os.path.join(self.basedir, file_name + ".raw.xml"))

    def open_gold_data(self, gold_file):
        original_data = []
        with open(gold_file, "r") as f_in:
            for line in f_in:
                line = line.rstrip().split()
                if line == []:
                    continue
                if len(line) == 11:
                    line.extend(["", "", ""])
                else:
                    line.extend(["", ""])
                original_data.append(FeatureRow(*line))
        return original_data

    def get_correct_offset(self, token, sentence, offset_begin, offset_end):
        token_list = self.tokenizer.tokenize(" ".join(token.split("_")))
        if len(token_list) > offset_end - offset_begin:
            offset_end = len(token_list) + offset_begin

        if token_list == sentence[offset_begin:offset_end]:
            return (offset_begin, offset_end)
        while token_list != sentence[offset_begin:offset_end]:
            offset_begin += 1
            offset_end += 1
            if offset_end >= len(sentence):
                raise IndexError("{:d} invalid index, token={:s}".format(
                    offset_end, token))
        return (offset_begin, offset_end)

    def build_new_data(self):
        for fr in self.original_data:
            curr_article = fr.article
            curr_referent = (fr.token_ref, fr.sentence_ref,
                             fr.offset_begin_ref, fr.offset_end_ref)
            try:
                nlp_data = self.data_dict[curr_article]
            except KeyError:
                self.update_cache(curr_article)
                nlp_data = self.data_dict[curr_article]
            new_offsets = self.get_correct_offset(
                fr.token, nlp_data["sentences"][int(fr.sentence)]["text"],
                int(fr.offset_begin), int(fr.offset_end))
            if new_offsets != (int(fr.offset_begin), int(fr.offset_end)):
                print fr.token, new_offsets


from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import PunktWordTokenizer, sent_tokenize


def tokenize(t):
    tokenizer = PunktWordTokenizer()
    sentences = sent_tokenize(t)
    words = []
    refined_words = []
    for sentence in sentences:
        for token in tokenizer.tokenize(sentence):
            words.append(token.lower())

    # Removal of stopwords and punctuation
    # stopwords = open('stop-words-it-en.txt', 'r').read().split('\r\n')
    for word in words:
        if word not in stopwords.words('french') and word not in punctuation:
            refined_words.append(word)
    return refined_words
Example #5
from collections import defaultdict

from nltk.tokenize import PunktWordTokenizer


def count_word_ngrams(n, processed_string):
    """
    Counts all word ngrams in processed_string and returns a
    dictionary (ngram_counts_dict) mapping each ngram to its count.
    """
    pwt = PunktWordTokenizer()
    tokens = pwt.tokenize(processed_string)
    ngram_counts_dict = defaultdict(int)

    # Slide a window of n tokens across the text and count each ngram.
    for i in range(len(tokens) - n + 1):
        ngram = ' '.join(tokens[i:i + n])
        ngram_counts_dict[ngram] += 1

    return ngram_counts_dict
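
# A brief usage sketch (the sample sentence is hypothetical) of the counting function above:
bigram_counts = count_word_ngrams(2, "the cat sat on the mat with the cat")
print(bigram_counts["the cat"])  # "the cat" occurs twice in the sample sentence
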
Example #6
        

        # Assign to train/dev/test
        partition = random.random()
        if partition < train_fraction:
            f = train
        elif partition < (train_fraction + dev_fraction):
            f = dev
        else:
            f = test

        # ASCII-encode ents
        try:
            ents_str = ','.join(ents) + ', '
            f.write(ents_str)
        except Exception, e:
            # Fall back to an ASCII-only version if the raw string cannot be written.
            ents_str = "".join(asciiEnts(ents_str))
            f.write(ents_str)

        title = " ".join(cleanEnts(tokenizer.tokenize(title)))
        f.write(title + '\n')
        f.write(tagged_body + '\n')
        f.flush()

    train.close()
    dev.close()
    test.close()


Example #7
from collections import defaultdict
from glob import glob

import nltk
from nltk.tokenize import PunktWordTokenizer

# Snowball is a project-specific stemmer wrapper; its import is not shown in this snippet.


class corpusParser():

  def __init__(self, lang, vocab_dir, corpus_dir, window_size, output_dir):
    self._lang = lang
    self._vocab_dir = vocab_dir
    self._corpus_dir = corpus_dir
    self._window_size = window_size
    self._output_dir = output_dir
    self._stemmer = Snowball()
    self._sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    self._word_tokenizer = PunktWordTokenizer()
    self._cooccur = defaultdict()
    self._wordcount = defaultdict()
    self._vocab = set()
    self._doc_num = 0
    #self._vocab_word_index = defaultdict()
    #self._vocab_index_word = defaultdict()


  def loadVocab(self):
    vocabfile = open(self._vocab_dir, 'r')
    vocab_word_index = defaultdict()
    vocab_index_word = []
    #index = -1
    for line in vocabfile:
      line = line.strip()
      words = line.split('\t')
      word = words[1]
      self._vocab.add(word)
      #index += 1
      #self._vocab_word_index[word] = index
      #self._vocab_index_word.append(word)

    vocabfile.close()

    # Initialize wordcount and cooccur
    for word in self._vocab:
      self._wordcount[word] = 0
      self._cooccur[word] = defaultdict()
      for word2 in self._vocab:
        if word2 > word:
          self._cooccur[word][word2] = 0
    

  def parseDoc(self, doc_raw):
    tokens = []
    for sent in self._sent_tokenizer.tokenize(doc_raw):
      for token in self._word_tokenizer.tokenize(sent):
        tokens.append(self._stemmer(self._lang, token))

    tokens_len = len(tokens)
    for index1 in range(0, tokens_len):
      w1 = tokens[index1]
      if w1 in self._vocab:
        self._wordcount[w1] += 1

        if self._window_size == -1:
          index_end = tokens_len
        else:
          index_end = min(tokens_len, index1 + self._window_size)

        for index2 in range(index1 + 1, index_end):
          w2 = tokens[index2]
          if w2 in self._vocab:
            if w1 < w2:
              self._cooccur[w1][w2] += 1
            elif w1 > w2:
              self._cooccur[w2][w1] += 1


  def parseCorpus20news(self):

    print "Loading vocab"
    self.loadVocab()
    
    doc_count = 0

    print "Parsing corpus"
    data_folders = [self._corpus_dir + "/train", self._corpus_dir + "/test"]
    print data_folders
    for data_folder in data_folders:
      for folder in glob("%s/*^tgz" % data_folder):
        for ff in glob("%s/*" % folder):
          doc_count += 1
          infile = open(ff, 'r')
          doc_raw = ""
          for line in infile:
            line = line.strip().lower()
            doc_raw += " " + line
          self.parseDoc(doc_raw)
          infile.close()
          if doc_count % 1000 == 0:
            print "Finish parsing", doc_count, "documents!"

    self._doc_num = doc_count
    print "Total number of docunments: ", doc_count
    print "writing results!"
    self.writeResult()


  def parseCorpusNyt(self):

    print "Loading vocab"
    self.loadVocab()
    
    doc_count = 0

    print "Parsing corpus"

    years = ["1987", "1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995", "1996"]

    for year in years:
      folder_year = self._corpus_dir + "/" + year
      for month in glob("%s/[0-9][0-9]" % folder_year):
        for day in glob("%s" % month):
          for ff in glob("%s/*" % day):
            doc_count += 1
            infile = open(ff, 'r')
            doc_raw = ""
            for line in infile:
              line = line.strip().lower()
              doc_raw += " " + line
            self.parseDoc(doc_raw)
            infile.close()
            if doc_count % 1000 == 0:
              print "Finish parsing", doc_count, "documents!"

    self._doc_num = doc_count
    print "Total number of docunments: ", doc_count
    print "writing results!"
    self.writeResult()


  def parseCorpusWiki(self):
    print "Loading vocab"
    self.loadVocab()

    print "Parsing corpus"
    doc_count = 0
    file_count = 0
    for folder in glob("%s/*" % self._corpus_dir):
      for ff in glob("%s/*" % folder):
        infile = open(ff, 'r')
        file_count += 1
        if file_count % 100 == 0:
          print "Finish parsing", file_count, "files or ", doc_count, "documents!"

        for line in infile:
          line = line.strip().lower()

          if line.startswith("<doc"):
            doc_count += 1
            doc_flag = True
            doc_raw = ""
          elif line.startswith("</doc>"):
            doc_flag = False
            ### processing doc
            self.parseDoc(doc_raw)
          else:
            assert doc_flag == True
            doc_raw += " " + line
        infile.close()

    self._doc_num = doc_count
    print "Total number of docunments: ", doc_count
    self.writeResult()


  def writeResult(self):
    # write wordcount
    outputfile = self._output_dir + "/wordcount.txt"
    outfile = open(outputfile, 'w')
    for word in self._wordcount.keys():
      tmp = word + "\t" + str(self._wordcount[word]) + "\n"
      outfile.write(tmp)
    outfile.close()

    # write cooccurrence counts:
    outputfile = self._output_dir + "/cooccurance.txt"
    outfile = open(outputfile, 'w')
    for w1 in self._cooccur.keys():
      for w2 in self._cooccur[w1].keys():
        if self._cooccur[w1][w2] != 0:
          tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n"
          outfile.write(tmp)
    outfile.close()
def my_token(s):
    my_tokenizer = PunktWordTokenizer()
    return my_tokenizer.tokenize(s)
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

# Sentence tokenization
from nltk.tokenize import sent_tokenize

sent_tokenize_list = sent_tokenize(text)
print "\nSentence tokenizer:"
print sent_tokenize_list

# Create a new word tokenizer
from nltk.tokenize import word_tokenize

print "\nWord tokenizer:"
print word_tokenize(text)

# Create a new punkt word tokenizer
from nltk.tokenize import PunktWordTokenizer

punkt_word_tokenizer = PunktWordTokenizer()
print "\nPunkt word tokenizer:"
print punkt_word_tokenizer.tokenize(text)

# Create a new WordPunct tokenizer
from nltk.tokenize import WordPunctTokenizer

word_punct_tokenizer = WordPunctTokenizer()
print "\nWord punct tokenizer:"
print word_punct_tokenizer.tokenize(text)

Example #10
len(word_tokens)        # Returns the number of words in the tokenized list of text
len(sentence_tokens)    # Returns the number of sentences in the tokenized list of text
word_unique = list(set(word_tokens))  # Eliminates duplicated words in the tokenized list

# Word tokenization details
# When tokenizing words, the punctuation and contraction symbols receive special treatment:
nltk.word_tokenize('Hello World.')  # Returns ['Hello', 'World', '.']
nltk.word_tokenize("can't")         # Returns ['ca', "n't"]

# Word Tokenization alternatives

# PunktWordTokenizer
# Splits on punctuation, but keeps it with the word
from nltk.tokenize import PunktWordTokenizer        # Imports the tokenizer
tokenizer = PunktWordTokenizer()                    # Instantiates the tokenizer
tokenizer.tokenize("Can't is a contraction")        # Returns ['Can', "'t", 'is', 'a', 'contraction']

# WordPunctTokenizer
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction")        # Returns ['Can', "'", 't', 'is', 'a', 'contraction']

# Tokenizing (sentences) in different languages (Spanish)
para = "Hola amigos. Gracias por ver este video. Saludos"       # Defines the text to tokenize
tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')   # Loads the Spanish sentence tokenizer
print(tokenizer.tokenize(para))                                 # Tokenizes the text

# Tokenize based on lines, spaces or tweets (special class)
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize
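# An illustrative sketch (not part of the original notes) of the tokenizers imported above:
LineTokenizer().tokenize("first line\nsecond line")        # Splits on newlines: ['first line', 'second line']
SpaceTokenizer().tokenize("Hello World. How are you?")     # Splits on spaces only: ['Hello', 'World.', 'How', 'are', 'you?']
TweetTokenizer().tokenize("@user loving #NLTK :-)")        # Tweet-aware: ['@user', 'loving', '#NLTK', ':-)']
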
Example #11
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

# Sentence tokenization
from nltk.tokenize import sent_tokenize

sent_tokenize_list = sent_tokenize(text)
print("\nSentence tokenizer:")
print(sent_tokenize_list)

# Create a new word tokenizer
from nltk.tokenize import word_tokenize

print("\nWord tokenizer:")
print(word_tokenize(text))

# Create a new punkt word tokenizer
from nltk.tokenize import PunktWordTokenizer

punkt_word_tokenizer = PunktWordTokenizer()
print("\nPunkt word tokenizer:")
print(punkt_word_tokenizer.tokenize(text))

# Create a new WordPunct tokenizer
from nltk.tokenize import WordPunctTokenizer

word_punct_tokenizer = WordPunctTokenizer()
print("\nWord punct tokenizer:")
print(word_punct_tokenizer.tokenize(text))

Example #12
File: 02_words.py  Project: rmachado/nlp
'''
Created on 06/05/2013

@author: Rodrigo
'''

from nltk.tokenize import word_tokenize, PunktWordTokenizer, RegexpTokenizer
print word_tokenize("Hello word!")

print word_tokenize("We can't do this")

tokenizer = PunktWordTokenizer()
print tokenizer.tokenize("We can't do this")

tokenizer = RegexpTokenizer("[\w']+")
print tokenizer.tokenize("We can't do this")

# Split instead of findall
tokenizer = RegexpTokenizer("\s+", gaps=True)
print tokenizer.tokenize("We can't do this")
Example #13
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from nltk.tokenize import PunktWordTokenizer

tokenizer = PunktWordTokenizer()
result = tokenizer.tokenize("Can't is a contraction.")
print(result)
#['Can', "'t", 'is', 'a', 'contraction.']
Example #14
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

# Sentence tokenization
from nltk.tokenize import sent_tokenize

sent_tokenize_list = sent_tokenize(text)
print "\nSentence tokenizer:"
print sent_tokenize_list

# Create a new word tokenizer
from nltk.tokenize import word_tokenize

print "\nWord tokenizer:"
print word_tokenize(text)

# Create a new punkt word tokenizer
from nltk.tokenize import PunktWordTokenizer

punkt_word_tokenizer = PunktWordTokenizer()
print "\nPunkt word tokenizer:"
print punkt_word_tokenizer.tokenize(text)

# Create a new WordPunct tokenizer
from nltk.tokenize import WordPunctTokenizer

word_punct_tokenizer = WordPunctTokenizer()
print "\nWord punct tokenizer:"
print word_punct_tokenizer.tokenize(text)

# penn_tokenizer (presumably a TreebankWordTokenizer), words1 and words2 are defined in
# earlier notebook cells that are not part of this snippet.
words3 = penn_tokenizer.tokenize("Hello World.")
print words1
print words2
print words3

# <markdowncell>

# <p> Or, since we have already broken <em>para</em> into sentences, we can create the word list by tokenizing each 
#     sentence and creating a <strong>flatmap</strong> as shown here: </p>

# <codecell>

from nltk.tokenize import PunktWordTokenizer
words3 = [word for sentence in sentences for word in word_tokenize(sentence)]
punkt_tokenizer = PunktWordTokenizer()
words4 = [word for sentence in sentences for word in punkt_tokenizer.tokenize(sentence)]
print words3 == words4
print words3
print words4

# <markdowncell>

# <p>Notice that there are <em><strong>subtle differences in the output</strong></em>. The first example did not separate the '<strong>.</strong>' from the words <em>World</em>
#     and <em>you</em>, whereas the second example did. Both accounted for the '<strong>.</strong>' after <em>NLTK</em>. I'm not sure why this is the case. Notice how both examples
#     split <em>It's</em> into two words. It seems that the <strong>TreebankWordTokenizer</strong>, for which <strong>word_tokenize</strong> is a wrapper, changes behavior
#     when working on a whole paragraph versus individual sentences.</p>
#
# <p>But I don't want contractions split into separate words. Use a <strong>RegexpTokenizer</strong> as shown next:</p>

# <codecell>
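# A minimal sketch (not from the original notebook) of the RegexpTokenizer approach described
# above. The regex keeps apostrophes inside words, so contractions such as "It's" stay whole.
# It assumes the `sentences` list from the earlier notebook cells.
from nltk.tokenize import RegexpTokenizer

regexp_tokenizer = RegexpTokenizer(r"[\w']+")
words5 = [word for sentence in sentences for word in regexp_tokenizer.tokenize(sentence)]
print words5
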
Example #16
from nltk.tokenize import PunktWordTokenizer
tokenizer = PunktWordTokenizer()
strExSentence = "Can't is a contraction."
lstWordPunkt = tokenizer.tokenize(strExSentence)
print(lstWordPunkt)

#OUTPUT is ['Can', "'t", 'is', 'a', 'contraction.']


from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
lstWordPunkt = tokenizer.tokenize(strExSentence)
print(lstWordPunkt)

#OUTPUT is  ['Can', "'", 't', 'is', 'a', 'contraction', '.']


def fnTest(strArgument):
    print(strArgument)


fnTest("mark")




    
Example #17
File: day1.py  Project: julioadl/ds101
import nltk

# English tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#Spanish tokenizer
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
texto = "Un buen reportaje puede ser tan fascinante e instructivo sobre el mundo real como un gran cuento o una magnífica novela. Si alguien lo pone en duda, le ruego que lea la crónica de Ioan Grillo Bring On the Wall que apareció en The New York Times el pasado 7 de mayo. Cuenta la historia del Flaco, un contrabandista mexicano que, desde que estaba en el colegio, a los 15 años, se ha pasado la vida contrabandeando drogas e inmigrantes ilegales a Estados Unidos. Aunque estuvo cinco años en la cárcel no se ha arrepentido del oficio que practica y menos ahora, cuando, dice, su ilícita profesión está más floreciente que nunca."
spanish_tokenizer.tokenize(texto)

#Different types of tokenizers (and how to call different methods on NLTK)
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("This's a test")

from nltk.tokenize import PunktWordTokenizer
punkt_word_tokenizer = PunktWordTokenizer()
punkt_word_tokenizer.tokenize("this's a test")

from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
word_punct_tokenizer.tokenize("this's a test")

#Stemming
#Using the porter algorithm
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')
porter_stemmer.stem('presumably')
porter_stemmer.stem('multiply')
porter_stemmer.stem('provision')
porter_stemmer.stem('owed')
porter_stemmer.stem('ear')
        
