def tokenize_plain_text_file(plain_text_file):
    tokenizer = WordPunctuationTokenizer()
    with plain_text_file as f:
        for line in f:
            for token in tokenizer.tokenize(line):
                yield token
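# A minimal usage sketch (assumed, not from the original source; the file
# name is hypothetical). The function enters the file object as a context
# manager itself, so the caller hands it an open file and the generator
# closes it once exhausted:
tokens = list(tokenize_plain_text_file(open("witness_a.txt")))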
writer.writerow(['Original', 'Normalised'])
for row in reader:
    key = (row[0], row[1])
    if key not in entries:
        writer.writerow(row)
        entries.add(key)

#############################################################################################

# read in the witnesses from your file system
import re

from collatex.core_classes import WordPunctuationTokenizer

tokenizer = WordPunctuationTokenizer()

# Build a function that tokenizes a witness and normalizes each token by
# replacing keys found in the Normit dictionary with their corresponding values.
def tokennormalizer(witness):
    tokens_as_strings = tokenizer.tokenize(witness)
    tokens = []  # renamed from "list" to avoid shadowing the builtin
    for token_string in tokens_as_strings:
        # strip trailing whitespace to get the form used for lookup
        normversion = re.sub(r'\s+$', "", token_string)
        # use the Normit replacement if one exists, else the stripped token
        replaceversion = Normit.get(normversion, normversion)
        tokens.append({'t': token_string, 'n': replaceversion})
    return tokens
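# A minimal sketch (assumed, not part of the original script) of feeding the
# output of tokennormalizer() to CollateX as pretokenized input. The witness
# strings and the Normit table below are hypothetical; the original script
# builds Normit elsewhere. This assumes a CollateX build whose collate()
# accepts a {"witnesses": [...]} dict of {'t': ..., 'n': ...} tokens.
from collatex import collate

Normit = {"colour": "color"}  # hypothetical normalisation table

json_input = {
    "witnesses": [
        {"id": "A", "tokens": tokennormalizer("The colour red")},
        {"id": "B", "tokens": tokennormalizer("The color red")},
    ]
}
# CollateX aligns on the 'n' field while displaying the original 't' field.
print(collate(json_input, segmentation=False))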
def test_tokenize(self):
    contents = "a b c"
    tokenizer = WordPunctuationTokenizer()
    # trailing whitespace stays attached to the preceding token
    self.assertEqual(["a ", "b ", "c"], tokenizer.tokenize(contents))
def tokens(self):
    # TODO: the complete set of witnesses is retokenized here!
    tokenizer = WordPunctuationTokenizer()
    return tokenizer.tokenize(self.get_combined_string())
def tokenize_text_node(text_node):
    # split on whitespace, punctuation and numerical values
    tokenizer = WordPunctuationTokenizer()
    return tokenizer.tokenize(text_node.data)
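# A usage sketch (assumed): the .data attribute suggests a DOM text node,
# e.g. from xml.dom.minidom, so the function can be exercised like this
# (the sample markup is hypothetical):
from xml.dom.minidom import parseString

doc = parseString("<p>Sing, O goddess, the anger of Achilles.</p>")
tokens = tokenize_text_node(doc.documentElement.firstChild)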