def get_recurrent_tokenizer(vocab, max_context_tokens, unk_token, pad_token, device="cpu"):
    """
    Build the question/context tokenizer pair used by recurrent-based models.

    Both tokenizers are word-level over ``vocab`` and share the same
    normalization, pre-tokenization and right-padding setup; only the
    context tokenizer is additionally truncated to ``max_context_tokens``.
    The pair is wrapped in a ``RecurrentSquadTokenizer`` bound to ``device``.
    """

    def _word_level_tokenizer():
        # Common recipe applied to both the question and the context side.
        base = Tokenizer(WordLevel(vocab, unk_token=unk_token))
        base.normalizer = Sequence([StripAccents(), Lowercase(), Strip()])
        base.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])
        base.enable_padding(
            direction="right",
            pad_id=vocab[pad_token],
            pad_type_id=1,
            pad_token=pad_token,
        )
        return base

    question_tokenizer = _word_level_tokenizer()

    # Truncation is only enabled for the context tokenizer.
    context_tokenizer = _word_level_tokenizer()
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(question_tokenizer, context_tokenizer, device=device)
def test_can_modify(self):
    """Strip exposes ``left`` and ``right`` as readable, writable attributes."""
    strip = Strip(left=True, right=True)

    # Both flags start out as passed to the constructor
    assert strip.left == True
    assert strip.right == True

    # Flip the left flag and read it back
    strip.left = False
    assert strip.left == False

    # Then do the same for the right flag
    strip.right = False
    assert strip.right == False
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    """
    Build a word-level tokenizer from ``vocab_file`` and hand it to the parent.

    Args:
        vocab_file: Vocabulary passed to ``WordLevel``.
        delimiter: Character to split on; when falsy, whitespace splitting
            is used instead.
        lowercase: When truthy, a ``Lowercase`` normalizer is added.
        unk_token: Unknown token given to ``WordLevel``.
        eos_token: End-of-sequence token, used by the post-processor when
            ``add_double_eos`` is set.
        add_eos: Recorded in the parameters passed to the parent class.
        add_double_eos: When truthy, installs a ``BertProcessing``
            post-processor using ``eos_token`` for both of its arguments.
        normalization: Optional name of a unicode normalizer, resolved with
            ``unicode_normalizer_from_str``.

    Raises:
        ValueError: If the vocabulary cannot be loaded. The underlying
            exception is chained as the cause.
    """
    try:
        tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=unk_token))
    except Exception as err:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizer, "
            "please note they are not compatible.".format(vocab_file)
        ) from err

    # Normalization pipeline: optional unicode form, optional lowercasing,
    # and always a final strip on both sides.
    steps = []
    if normalization:
        steps.append(unicode_normalizer_from_str(normalization))
    if lowercase:
        steps.append(Lowercase())
    steps.append(Strip(left=True, right=True))

    # ``steps`` is never empty (Strip is always present); only wrap in a
    # Sequence when there is more than one step.
    tokenizer.normalizer = Sequence(steps) if len(steps) > 1 else steps[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = (
        CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()
    )

    if add_double_eos:
        eos_id = tokenizer.token_to_id(eos_token)
        tokenizer.post_processor = BertProcessing(
            (eos_token, eos_id), (eos_token, eos_id)
        )

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def configure_tokenizers(self, padding, truncation, max_length, lower):
    """
    Create ``self.tokenizer`` and configure its normalization, template
    post-processing, padding and truncation.

    Args:
        padding: ``True`` / ``"longest"`` pads without a fixed length,
            ``"max_length"`` pads to ``max_length``, and ``False`` /
            ``"do_not_pad"`` disables padding.
        truncation: When truthy, truncation to ``max_length`` is enabled.
        max_length: Length used for ``"max_length"`` padding and truncation.
        lower: When truthy, a ``Lowercase`` normalizer is appended.

    Raises:
        ValueError: If ``padding`` is not one of the values listed above.
    """
    # Resolve the padding strategy into an explicit on/off flag. The raw
    # ``padding`` value cannot be used as the flag: "do_not_pad" is a
    # non-empty (truthy) string and would switch padding on.
    pad_length = None
    if padding in {True, "longest"}:
        use_padding = True
    elif padding in {"max_length"}:
        use_padding = True
        pad_length = max_length
    elif padding in {False, "do_not_pad"}:
        use_padding = False
    else:
        raise ValueError("Unknown padding type")

    # SRC tokenizer
    tok_normalizers = [NFD(), Strip()]
    if lower:
        tok_normalizers += [Lowercase()]

    self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
    self.tokenizer.add_special_tokens(self.special_tokens)
    self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
    self.tokenizer.normalizer = normalizers.Sequence(
        tok_normalizers)  # StripAccents requires NFD
    self.tokenizer.decoder = tok_decoder()

    # Define template (Needed for the sos/eos tokens)
    basic_template = TemplateProcessing(
        single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
        pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
        special_tokens=[
            (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
            (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
        ],
    )
    self.tokenizer.post_processor = basic_template

    if use_padding:
        self.tokenizer.enable_padding(
            pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
            pad_token=self.PAD_WORD,
            length=pad_length,
        )
    if truncation:
        self.tokenizer.enable_truncation(
            max_length, stride=0, strategy='longest_first')
def __init__(
    self,
    vocab_file,
    sep_token="<sep>",
    cls_token="<cls>",
    pad_token="<pad>",
    mask_token="<mask>",
    lowercase: bool = True,
    unk_token="<unk>",
):
    """
    Build a word-level tokenizer from ``vocab_file`` and hand it to the parent.

    Args:
        vocab_file: Vocabulary passed to ``WordLevel``.
        sep_token: Separator token; registered as special if in the vocab.
        cls_token: Classifier token; registered as special if in the vocab.
        pad_token: Padding token; registered as special if in the vocab.
        mask_token: Mask token; registered as special if in the vocab.
        lowercase: Only recorded in the parameters passed to the parent;
            no lowercasing normalizer is installed here.
        unk_token: Unknown token given to ``WordLevel``. Added as a trailing
            keyword so existing positional callers are unaffected; it was
            previously referenced without being defined in this scope.
            NOTE(review): the ``"<unk>"`` default is assumed from the naming
            of the other defaults -- confirm against the vocabulary.
    """
    tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=unk_token))
    tokenizer.normalizer = Strip()
    tokenizer.pre_tokenizer = CharDelimiterSplit(" ")

    # NOTE(review): the post-processor uses the literal "</s>" / "<s>"
    # tokens rather than ``sep_token`` / ``cls_token`` -- confirm intended.
    tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    # Let the tokenizer know about special tokens if they are part of the vocab
    for special in (unk_token, sep_token, cls_token, pad_token, mask_token):
        if tokenizer.token_to_id(str(special)) is not None:
            tokenizer.add_special_tokens([str(special)])

    parameters = {
        "model": "WordLevel",
        "unk_token": unk_token,
        "sep_token": sep_token,
        "cls_token": cls_token,
        "pad_token": pad_token,
        "mask_token": mask_token,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def test_can_make_sequences(self):
    """A Lowercase + Strip sequence lowercases and trims the input string."""
    pipeline = Sequence([Lowercase(), Strip()])
    assert pipeline.normalize_str(" HELLO ") == "hello"
def test_full_strip(self):
    """Strip with both sides enabled removes leading and trailing spaces."""
    both_sides = Strip(left=True, right=True)
    stripped = both_sides.normalize_str(" hello ")
    assert stripped == "hello"
def test_instantiate(self):
    """Strip is a Normalizer, is a Strip, and keeps its type through pickle."""
    for expected_type in (Normalizer, Strip):
        assert isinstance(Strip(), expected_type)

    # Round-trip a fresh instance through pickle
    round_tripped = pickle.loads(pickle.dumps(Strip()))
    assert isinstance(round_tripped, Strip)
def test_full_strip(self):
    """A tokenizer with a two-sided Strip normalizer trims both ends."""
    tok = Tokenizer(BPE.empty())
    tok.normalizer = Strip(left=True, right=True)

    assert tok.normalize(" hello ") == "hello"
def test_can_make_sequences(self):
    """A tokenizer with a Lowercase + Strip sequence lowercases and trims."""
    tok = Tokenizer(BPE.empty())
    tok.normalizer = Sequence([Lowercase(), Strip()])

    assert tok.normalize(" HELLO ") == "hello"
import string, re
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents, Strip, BertNormalizer

# Module-level normalizer shared by normalize(): BertNormalizer, then Strip.
normalizer = normalizers.Sequence([BertNormalizer(), Strip()])


def delete_punct(w: str) -> str:
    """Lowercase a string and replace each punctuation character with a space."""
    return w.lower().translate(
        str.maketrans(string.punctuation, len(string.punctuation) * " "))


def normalize(x):
    """Return ``x`` with punctuation removed, run through the shared
    normalizer, newlines turned into spaces and runs of spaces collapsed."""
    y = normalizer.normalize_str(delete_punct(x))
    y = y.replace("\n", " ")
    # collapse runs of spaces into one and trim the ends
    y = re.sub(' +', ' ', y).strip()
    return y


def get_str(x):
    res = ''
    if isinstance(x, dict):
        for f in x:
            if f not in ['lang']:
                res += ' ' + get_str(x[f])
    if isinstance(x, str):
        res = x.strip()
    if isinstance(x, list):
def test_right_strip(self):
    """A right-only Strip keeps the leading space and drops the trailing one."""
    tok = Tokenizer(BPE())
    tok.normalizer = Strip(left=False, right=True)

    assert tok.normalize(" hello ") == " hello"
import os

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFD, Lowercase, Strip


def train(dataset_path, output_dir='data/tokenizer/', vocab_size=30_000, min_frequency=3):
    """
    Train a WordPiece tokenizer on ``dataset_path`` and save it to ``output_dir``.

    Args:
        dataset_path: Path of the single training file.
        output_dir: Directory receiving the model files and ``tokenizer.json``.
            It is created if missing, and may be given with or without a
            trailing separator.
        vocab_size: Passed to ``WordPieceTrainer``.
        min_frequency: Passed to ``WordPieceTrainer``.
    """
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'],
    )

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), Strip()])

    # NOTE(review): this (trainer, files) argument order is kept as written;
    # confirm it matches the installed tokenizers version.
    tokenizer.train(trainer, [dataset_path])

    # Make sure the target directory exists before anything is written to it.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the trained model, then reload it so the unknown token is set.
    model_files = tokenizer.model.save(output_dir)
    tokenizer.model = WordPiece.from_file(*model_files, unk_token='[UNK]')

    # Join path components instead of relying on a trailing slash.
    tokenizer.save(os.path.join(output_dir, 'tokenizer.json'))


if __name__ == '__main__':
    # Local import: the CLI helper is only needed when run as a script.
    import fire

    fire.Fire(train)
def test_instantiate(self):
    """A freshly built Strip is both a Normalizer and a Strip."""
    for expected_type in (Normalizer, Strip):
        assert isinstance(Strip(), expected_type)