# Write the header row, then copy over only the first occurrence of each
# (Original, Normalised) pair — `writer`, `reader`, and `entries` are set up
# earlier in the file (not visible here).
writer.writerow(['Original', 'Normalised'])
for row in reader:
    key = (row[0], row[1])
    if key not in entries:
        writer.writerow(row)
        entries.add(key)
#############################################################################################
#############################################################################################
#############################################################################################
#read in the witnesses from your file system
from collatex.core_classes import WordPunctuationTokenizer

tokenizer = WordPunctuationTokenizer()


# build a function to tokenize and to normalize by replace keys to be found in
# the dictionary by the corresponding values
def tokennormalizer(witness):
    """Tokenize *witness* and pair each token with its normalized form.

    Each token is stripped of trailing whitespace and then looked up in the
    module-level ``Normit`` dictionary; if present, the mapped value is used
    as the normalized form, otherwise the stripped token itself is kept.

    Parameters
    ----------
    witness : str
        The raw witness text to tokenize.

    Returns
    -------
    list of dict
        One dict per token with keys ``'t'`` (the original token, including
        any trailing whitespace) and ``'n'`` (the normalized form).
    """
    tokens_as_strings = tokenizer.tokenize(witness)
    # NOTE: renamed from `list` to avoid shadowing the builtin.
    tokens = []
    for token_string in tokens_as_strings:
        # Drop trailing whitespace so dictionary lookup keys are stable.
        normversion = re.sub(r'\s+$', "", token_string)
        # Fall back to the stripped token when no replacement is defined.
        replaceversion = Normit.get(normversion, normversion)
        tokens.append({'t': token_string, 'n': replaceversion})
    return tokens
def test_tokenize(self):
    """A simple space-separated string splits into tokens that keep their trailing space."""
    sample = "a b c"
    expected = ["a ", "b ", "c"]
    splitter = WordPunctuationTokenizer()
    self.assertEqual(expected, splitter.tokenize(sample))