Example no. 1
0
    # NOTE(review): fragment — the enclosing scope that defines `writer`,
    # `reader`, and `entries` is not visible here; presumably a csv.writer,
    # a csv.reader, and a set, respectively — confirm against the full file.
    # Write a header row, then copy only rows whose first two columns form a
    # previously-unseen pair (deduplication on columns 0 and 1).
    writer.writerow(['Original', 'Normalised'])
    for row in reader:
        key = (row[0], row[1])
        if key not in entries:
            writer.writerow(row)
            entries.add(key)


#############################################################################################

#############################################################################################

#############################################################################################

#read in the witnesses  from your file system
from collatex.core_classes import WordPunctuationTokenizer

# Module-level tokenizer instance shared by the functions below; splits a
# witness string into word/punctuation tokens (trailing whitespace kept
# attached to each token — see the test at the bottom of the file).
tokenizer = WordPunctuationTokenizer()


# build a function to tokenize and to normalize by replace keys to be found in
# the dictionary by the corresponding values
def tokennormalizer(witness):
    """Tokenize *witness* and pair each raw token with its normalised form.

    Each token produced by the module-level ``tokenizer`` keeps its original
    text under key ``'t'``; the ``'n'`` value is the token with trailing
    whitespace stripped and then replaced via the module-level ``Normit``
    mapping (falling back to the stripped token when no entry exists).

    :param witness: the witness text to tokenize (a string).
    :return: list of ``{'t': original, 'n': normalised}`` dicts.
    """
    # NOTE: renamed the accumulator — the original shadowed the builtin `list`.
    tokens = []
    for raw in tokenizer.tokenize(witness):
        # Strip trailing whitespace so "word " and "word" normalise alike.
        trimmed = re.sub(r'\s+$', "", raw)
        tokens.append({'t': raw, 'n': Normit.get(trimmed, trimmed)})
    return tokens
Example no. 2
0
 def test_tokenize(self):
     """Tokenizing keeps each token's trailing whitespace attached."""
     sample = "a b c"
     splitter = WordPunctuationTokenizer()
     self.assertEqual(["a ", "b ", "c"], splitter.tokenize(sample))