def basic_english_normalize():
    r"""Build a transform applying basic English normalization to a sentence.

    The normalization lowercases the input and applies, in order:

    - add spaces before and after '\''
    - remove '\"'
    - add spaces before and after '.'
    - replace '<br \/>' with a single space
    - add spaces before and after ','
    - add spaces before and after '('
    - add spaces before and after ')'
    - add spaces before and after '!'
    - add spaces before and after '?'
    - replace ';' with a single space
    - replace ':' with a single space
    - collapse runs of whitespace into a single space

    Examples:
        >>> import torch
        >>> from torchtext.experimental.transforms import basic_english_normalize
        >>> test_sample = 'Basic English Normalization for a Line of Text'
        >>> basic_eng_norm = basic_english_normalize()
        >>> jit_basic_eng_norm = torch.jit.script(basic_eng_norm.to_ivalue())
        >>> tokens = jit_basic_eng_norm(test_sample)
    """
    # Ordered (pattern, replacement) pairs; order matters because later
    # rules (e.g. whitespace collapsing) clean up after earlier ones.
    patterns_list = [
        (r'\'', ' \' '),
        (r'\"', ''),
        (r'\.', ' . '),
        (r'<br \/>', ' '),
        (r',', ' , '),
        (r'\(', ' ( '),
        (r'\)', ' ) '),
        (r'\!', ' ! '),
        (r'\?', ' ? '),
        (r'\;', ' '),
        (r'\:', ' '),
        (r'\s+', ' '),
    ]
    # Split the pairs into parallel pattern/replacement lists for pybind.
    patterns, replacements = (list(seq) for seq in zip(*patterns_list))
    # The trailing True enables lowercasing in the C++ tokenizer.
    return BasicEnglishNormalize(RegexTokenizerPybind(patterns, replacements, True))
def regex_tokenizer(patterns_list):
    r"""Build a regex tokenizer that applies every replacement in ``patterns_list``.

    Args:
        patterns_list (List[Tuple[str, str]]): ordered pairs whose first
            element is a regex pattern string and whose second element is
            the replacement string applied for that pattern.

    Examples:
        >>> import torch
        >>> from torchtext.experimental.transforms import regex_tokenizer
        >>> test_sample = 'Basic Regex Tokenization for a Line of Text'
        >>> patterns_list = [
            (r'\'', ' \' '),
            (r'\"', '')]
        >>> reg_tokenizer = regex_tokenizer(patterns_list)
        >>> jit_reg_tokenizer = torch.jit.script(reg_tokenizer)
        >>> tokens = jit_reg_tokenizer(test_sample)
    """
    # Unzip the (pattern, replacement) pairs into the two parallel lists
    # expected by the pybind constructor.
    patterns = []
    replacements = []
    for pattern, replacement in patterns_list:
        patterns.append(pattern)
        replacements.append(replacement)
    # The trailing False disables lowercasing in the C++ tokenizer.
    return RegexTokenizer(RegexTokenizerPybind(patterns, replacements, False))