def python_basic_english_normalize(input):
    """Normalize a string with the basic_english scheme and split into tokens.

    The string is lowercased, then run through ``custom_replace`` with a
    fixed pattern list that pads most punctuation with spaces, drops double
    quotes, strips ``<br />`` tags, and collapses runs of whitespace to a
    single space. The result is whitespace-split into a token list.

    Args:
        input: the raw text to tokenize.

    Returns:
        list of str: the normalized tokens.
    """
    replacements = [
        (r'\'', ' \' '),
        (r'\"', ''),
        (r'\.', ' . '),
        (r'<br \/>', ' '),
        (r',', ' , '),
        (r'\(', ' ( '),
        (r'\)', ' ) '),
        (r'\!', ' ! '),
        (r'\?', ' ? '),
        (r'\;', ' '),
        (r'\:', ' '),
        (r'\s+', ' '),
    ]
    transform = custom_replace(replacements)
    # transform yields one normalized line per input line; we feed exactly one.
    normalized = next(iter(transform([input.lower()])))
    return normalized.split()
def test_custom_replace(self):
    """custom_replace applies its regex substitutions to every input element."""
    # Two rules: lowercase every literal 'S', then squeeze whitespace runs.
    transform = custom_replace([(r'S', 's'), (r'\s+', ' ')])
    samples = [
        'test cuStom replace',
        'with uSer instruction',
    ]
    expected = ['test custom replace', 'with user instruction']
    self.assertEqual(list(transform(samples)), expected)
(r'\|right', ''), (r'\|\d+px', ''), (r'\[\[image:[^\[\]]*\|', ''), (r'\[\[category:([^|\]]*)[^]]*\]\]', '[[$1]]'), (r'\[\[[a-z\-]*:[^\]]*\]\]', ''), (r'\[\[[^\|\]]*\|', '[['), (r'\{\{[^\}]*\}\}', ''), (r'\{[^\}]*\}', ''), (r'\[', ''), (r'\]', ''), (r'&[^;]*;', ' '), (r'A', 'a'), (r'B', 'b'), (r'C', 'c'), (r'D', 'd'), (r'E', 'e'), (r'F', 'f'), (r'G', 'g'), (r'H', 'h'), (r'I', 'i'), (r'J', 'j'), (r'K', 'k'), (r'L', 'l'), (r'M', 'm'), (r'N', 'n'), (r'O', 'o'), (r'P', 'p'), (r'Q', 'q'), (r'R', 'r'), (r'S', 's'), (r'T', 't'), (r'U', 'u'), (r'V', 'v'), (r'W', 'w'), (r'X', 'x'), (r'Y', 'y'), (r'Z', 'z'), (r'0', ' zero '), (r'1', ' one '), (r'2', ' two '), (r'3', ' three '), (r'4', ' four '), (r'5', ' five '), (r'6', ' six '), (r'7', ' seven '), (r'8', ' eight '), (r'9', ' nine '), (r'[^a-z\n]+', ' '), (r'\n ', ''), (r'\s+', ' '), (r'\n\s*\n', r'\n')] enwik9_norm_transform = custom_replace(_patterns) def generate_offsets(filename): offsets = [] with open(filename) as f: offsets.append(f.tell()) while f.readline(): offsets.append(f.tell()) return offsets def read_lines_from_iterator(data_path, offsets, begin_line, num_lines): with open(data_path) as f: f.seek(offsets[begin_line]) for i in range(num_lines):  # NOTE(review): generate_offsets records the byte offset of each line start; here we seek to begin_line's offset and iterate num_lines times — loop body continues beyond this chunk, presumably reading one line per iteration (confirm against full file)