Example #1
from collatex.core_classes import WordPunctuationTokenizer


def tokenize_plain_text_file(plain_text_file):
    # plain_text_file is an already-open file object; the with block
    # closes it once the generator is exhausted
    tokenizer = WordPunctuationTokenizer()

    with plain_text_file as f:
        for line in f:
            for token in tokenizer.tokenize(line):
                yield token
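A minimal usage sketch; the file name here is a placeholder:

# hypothetical usage; 'witness_a.txt' is a placeholder path
tokens = list(tokenize_plain_text_file(open('witness_a.txt', encoding='utf-8')))

Because the generator wraps the file in its own with block, the file is closed once the tokens are exhausted.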
Example #2
import csv

# copy rows, skipping duplicate (Original, Normalised) pairs; file names are placeholders
with open('in.csv', newline='') as inf, open('out.csv', 'w', newline='') as outf:
    reader, writer = csv.reader(inf), csv.writer(outf)
    entries = set()
    writer.writerow(['Original', 'Normalised'])
    for row in reader:
        key = (row[0], row[1])
        if key not in entries:
            writer.writerow(row)
            entries.add(key)
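The entries set gives an O(1) membership check per row; rows are de-duplicated on their first two columns only, so any later row repeating an (Original, Normalised) pair is dropped wholesale.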


#############################################################################################

# read in the witnesses from your file system
import re

from collatex.core_classes import WordPunctuationTokenizer

tokenizer = WordPunctuationTokenizer()

# Normit maps original spellings to their normalised forms; the entry
# below is an illustrative placeholder for your own dictionary
Normit = {'colour': 'color'}


# build a function to tokenize, and to normalize by replacing keys found
# in the dictionary with the corresponding values
def tokennormalizer(witness):
    tokens_as_strings = tokenizer.tokenize(witness)
    token_list = []
    for token_string in tokens_as_strings:
        # strip trailing whitespace before looking up the normalised form
        normversion = re.sub(r'\s+$', "", token_string)
        replaceversion = Normit.get(normversion, normversion)
        token_list.append({'t': token_string, 'n': replaceversion})
    return token_list
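A hedged usage sketch: the {'t', 'n'} dicts produced here match the shape of CollateX's pre-tokenized witness input, so the result can sit directly in a witness entry (the witness text is illustrative):

# illustrative witness text; with Normit = {'colour': 'color'} the second
# token normalises to 'color'
witness = {"id": "A", "tokens": tokennormalizer("The colour of the sea")}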
Example #3
    def test_tokenize(self):
        # the tokenizer keeps trailing whitespace attached to each token
        contents = "a b c"
        tokenizer = WordPunctuationTokenizer()
        self.assertEqual(["a ", "b ", "c"], tokenizer.tokenize(contents))
Example #4
    def tokens(self):
        # TODO: the complete set of witnesses is retokenized here!
        tokenizer = WordPunctuationTokenizer()
        tokens = tokenizer.tokenize(self.get_combined_string())
        return tokens
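The TODO above flags that every call re-tokenizes the full combined string. One hedged way around that, assuming the combined string is fixed after construction (the class here is hypothetical):

from functools import cached_property

from collatex.core_classes import WordPunctuationTokenizer


class CachedTokens:
    def __init__(self, combined_string):
        self._combined_string = combined_string  # assumed immutable

    @cached_property
    def tokens(self):
        # tokenize once on first access; later accesses reuse the cached list
        return WordPunctuationTokenizer().tokenize(self._combined_string)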
Example #5
from collatex.core_classes import WordPunctuationTokenizer


def tokenize_text_node(text_node):
    # split on whitespace, punctuation and numerical values
    tokenizer = WordPunctuationTokenizer()
    return tokenizer.tokenize(text_node.data)
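A small usage sketch with a DOM text node, using xml.dom.minidom (the XML fragment is illustrative):

from xml.dom.minidom import parseString

doc = parseString("<p>Once upon a time.</p>")
# firstChild of <p> is the text node whose .data the function tokenizes
tokens = tokenize_text_node(doc.documentElement.firstChild)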