def test_has_expected_type_and_methods(self): tokenizer = Tokenizer(BPE.empty()) assert type(tokenizer) == Tokenizer assert callable(tokenizer.num_special_tokens_to_add) assert callable(tokenizer.get_vocab) assert callable(tokenizer.get_vocab_size) assert callable(tokenizer.enable_truncation) assert callable(tokenizer.no_truncation) assert callable(tokenizer.enable_padding) assert callable(tokenizer.no_padding) assert callable(tokenizer.normalize) assert callable(tokenizer.encode) assert callable(tokenizer.encode_batch) assert callable(tokenizer.decode) assert callable(tokenizer.decode_batch) assert callable(tokenizer.token_to_id) assert callable(tokenizer.id_to_token) assert callable(tokenizer.add_tokens) assert callable(tokenizer.add_special_tokens) assert callable(tokenizer.train) assert callable(tokenizer.post_process) assert isinstance(tokenizer.model, Model) assert tokenizer.normalizer is None assert tokenizer.pre_tokenizer is None assert tokenizer.post_processor is None assert tokenizer.decoder is None
def test_normalize(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) tokenizer.normalizer = Lowercase() output = tokenizer.normalize("My Name Is John") assert output == "my name is john"
def __init__(self, vocab_file: Optional[str] = None, merges_file: Optional[str] = None, unk_token: Optional[str] = "<unk>", suffix: Optional[str] = "</w>", dropout: Optional[float] = None): if vocab_file is not None and merges_file is not None: tokenizer = Tokenizer( BPE.from_files(vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix)) else: tokenizer = Tokenizer(BPE.empty()) tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()]) tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new() tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix) parameters = { "model": "BPE", "unk_token": unk_token, "suffix": suffix, "dropout": dropout, } super().__init__(tokenizer, parameters)
def __init__(self, vocab_file: Optional[str] = None, merges_file: Optional[str] = None, unk_token: str = "<unk>", replacement: str = "▁", add_prefix_space: bool = True, dropout: Optional[float] = None): if vocab_file is not None and merges_file is not None: tokenizer = Tokenizer( BPE.from_files(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)) else: tokenizer = Tokenizer(BPE.empty()) tokenizer.add_special_tokens([unk_token]) tokenizer.normalizer = NFKC.new() tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new( replacement=replacement, add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.Metaspace.new( replacement=replacement, add_prefix_space=add_prefix_space) parameters = { "model": "SentencePieceBPE", "unk_token": unk_token, "replacement": replacement, "add_prefix_space": add_prefix_space, "dropout": dropout, } super().__init__(tokenizer, parameters)
def test_strip_accents(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.normalizer = BertNormalizer( strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False ) output = tokenizer.normalize("Héllò") assert output == "Hello"
def __init__( self, vocab_file: Optional[str] = None, merges_file: Optional[str] = None, add_prefix_space: bool = False, lowercase: bool = False, dropout: Optional[float] = None, unicode_normalizer: Optional[str] = None, continuing_subword_prefix: Optional[str] = None, end_of_word_suffix: Optional[str] = None, trim_offsets: bool = False, ): if vocab_file is not None and merges_file is not None: tokenizer = Tokenizer( BPE.from_files( vocab_file, merges_file, dropout=dropout, continuing_subword_prefix=continuing_subword_prefix or "", end_of_word_suffix=end_of_word_suffix or "", )) else: tokenizer = Tokenizer(BPE.empty()) # Check for Unicode normalization first (before everything else) normalizers = [] if unicode_normalizer: normalizers += [unicode_normalizer_from_str(unicode_normalizer)] if lowercase: normalizers += [Lowercase()] # Create the normalizer structure if len(normalizers) > 0: if len(normalizers) > 1: tokenizer.normalizer = Sequence(normalizers) else: tokenizer.normalizer = normalizers[0] tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.ByteLevel( trim_offsets=trim_offsets) parameters = { "model": "ByteLevelBPE", "add_prefix_space": add_prefix_space, "lowercase": lowercase, "dropout": dropout, "unicode_normalizer": unicode_normalizer, "continuing_subword_prefix": continuing_subword_prefix, "end_of_word_suffix": end_of_word_suffix, "trim_offsets": trim_offsets, } super().__init__(tokenizer, parameters)
def test_processing(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_special_tokens(["<s>", "</s>"]) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0)) output = tokenizer.encode("my name", "pair") assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"] assert output.ids == [0, 2, 3, 1, 1, 6, 1]
def test_processing(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_special_tokens(["[SEP]", "[CLS]"]) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1)) output = tokenizer.encode("my name", "pair") assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"] assert output.ids == [1, 2, 3, 0, 6, 0]
def test_add_tokens(self): tokenizer = Tokenizer(BPE.empty()) added = tokenizer.add_tokens(["my", "name", "is", "john"]) assert added == 4 added = tokenizer.add_tokens( [AddedToken("the"), AddedToken("quick", rstrip=True)]) assert added == 2
def test_get_vocab_size(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) # Can retrieve vocab's size with added tokens size = tokenizer.get_vocab_size(with_added_tokens=True) assert size == 5 # Can retrieve vocab's size without added tokens size = tokenizer.get_vocab_size(with_added_tokens=False) assert size == 0
def test_get_vocab(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) # Can retrieve vocab with added tokens vocab = tokenizer.get_vocab(with_added_tokens=True) assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4} # Can retrieve vocab without added tokens vocab = tokenizer.get_vocab(with_added_tokens=False) assert vocab == {}
def test_decode(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) # Can decode single sequences output = tokenizer.decode([0, 1, 2, 3]) assert output == "my name is john" # Can decode batch output = tokenizer.decode_batch([[0, 1, 2, 3], [4]]) assert output == ["my name is john", "pair"]
def test_add_special_tokens(self): tokenizer = Tokenizer(BPE.empty()) # Can add special tokens as `str` added = tokenizer.add_special_tokens(["my", "name", "is", "john"]) assert added == 4 # Can add special tokens as `AddedToken` added = tokenizer.add_special_tokens( [AddedToken("the"), AddedToken("quick", rstrip=True)]) assert added == 2
def test_truncation(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) tokenizer.enable_truncation(2) # Can truncate single sequences output = tokenizer.encode("my name is john") assert output.tokens == ["my", "name"] # Can truncate pair sequences as well output = tokenizer.encode("my name is john", "pair") assert output.tokens == ["my", "pair"]
def test_post_process(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) tokenizer.enable_truncation(2) tokenizer.enable_padding(max_length=4) encoding = tokenizer.encode("my name is john") pair_encoding = tokenizer.encode("pair") # Can post process a single encoding output = tokenizer.post_process(encoding) assert output.tokens == ["my", "name", "[PAD]", "[PAD]"] # Can post process a pair of encodings output = tokenizer.post_process(encoding, pair_encoding) assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
def __init__( self, vocab_file: Optional[str] = None, merges_file: Optional[str] = None, unk_token: Optional[str] = "<unk>", suffix: Optional[str] = "</w>", dropout: Optional[float] = None, unicode_normalizer: Optional[str] = None, ): if vocab_file is not None and merges_file is not None: tokenizer = Tokenizer( BPE.from_files(vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix)) else: tokenizer = Tokenizer(BPE.empty()) # Check for Unicode normalization first (before everything else) normalizers = [] if unicode_normalizer: normalizers += [unicode_normalizer_from_str(unicode_normalizer)] # OpenAI normalization is the same as Bert normalizers += [BertNormalizer()] # Create the normalizer structure if len(normalizers) > 0: if len(normalizers) > 1: tokenizer.normalizer = Sequence(normalizers) else: tokenizer.normalizer = normalizers[0] tokenizer.pre_tokenizer = BertPreTokenizer() tokenizer.decoder = BPEDecoder(suffix=suffix) parameters = { "model": "BPE", "unk_token": unk_token, "suffix": suffix, "dropout": dropout, } super().__init__(tokenizer, parameters)
def __init__(self, vocab_file: Optional[str] = None, merges_file: Optional[str] = None, add_prefix_space: bool = False, do_lowercase: bool = False, unicode_normalizer: Optional[str] = None, continuing_subword_prefix: Optional[str] = None, end_of_word_suffix: Optional[str] = None): if vocab_file is not None and merges_file is not None: tokenizer = Tokenizer( BPE.from_files( vocab_file, merges_file, continuing_subword_prefix=continuing_subword_prefix or "", end_of_word_suffix=end_of_word_suffix or "")) else: tokenizer = Tokenizer(BPE.empty()) # Check for Unicode normalization first (before everything else) normalizers = [] if unicode_normalizer: normalizers += [unicode_normalizer_from_str(unicode_normalizer)] if do_lowercase: normalizers += [Lowercase.new()] # Create the normalizer structure if len(normalizers) > 0: if len(normalizers) > 1: tokenizer.normalizer = Sequence.new(normalizers) else: tokenizer.normalizer = normalizers[0] tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new( add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.ByteLevel.new() parameters = { "model": "ByteLevelBPE", "add_prefix_space": add_prefix_space, } super().__init__(tokenizer, parameters)
def __init__(self, vocab_file: Optional[str]=None, merges_file: Optional[str]=None, add_prefix_space: bool=False): if vocab_file is not None and merges_file is not None: tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file)) else: tokenizer = Tokenizer(BPE.empty()) tokenizer.normalizer = NFKC.new() tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.ByteLevel.new() parameters = { "model": "ByteLevelBPE", "add_prefix_space": add_prefix_space, } super().__init__(tokenizer, parameters)
def test_padding(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) # By default it does nothing when encoding single sequence tokenizer.enable_padding() output = tokenizer.encode("my name") assert output.tokens == ["my", "name"] # Can pad to the longest in a batch output = tokenizer.encode_batch(["my name", "my name is john"]) assert all([len(encoding) == 4 for encoding in output]) # Can pad to the specified max length otherwise tokenizer.enable_padding(max_length=4) output = tokenizer.encode("my name") assert output.tokens == ["my", "name", "[PAD]", "[PAD]"] output = tokenizer.encode("my name", "pair") assert output.tokens == ["my", "name", "pair", "[PAD]"]
def test_encode(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) # Can encode single sequence output = tokenizer.encode("my name is john") assert output.tokens == ["my", "name", "is", "john"] assert type(output.ids) == list assert type(output.type_ids) == list assert type(output.offsets) == list assert type(output.words) == list assert type(output.special_tokens_mask) == list assert type(output.attention_mask) == list assert type(output.overflowing) == list # Can encode a pair of sequences output = tokenizer.encode("my name is john", "pair") assert output.tokens == ["my", "name", "is", "john", "pair"] # Can encode a batch with both a single sequence and a pair of sequences output = tokenizer.encode_batch( ["my name is john", ("my name is john", "pair")]) assert len(output) == 2
def test_instantiate(self, roberta_files): assert isinstance(BPE.empty(), Model) assert isinstance( BPE.from_files(roberta_files["vocab"], roberta_files["merges"]), Model)
def test_full_strip(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.normalizer = Strip(left=True, right=True) output = tokenizer.normalize(" hello ") assert output == "hello"
def test_lowercase(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.normalizer = Lowercase() output = tokenizer.normalize("HELLO") assert output == "hello"
def test_can_make_sequences(self): tokenizer = Tokenizer(BPE.empty()) tokenizer.normalizer = Sequence([Lowercase(), Strip()]) output = tokenizer.normalize(" HELLO ") assert output == "hello"