def test_normalize(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.normalizer = Lowercase()

    output = tokenizer.normalize("My Name Is John")
    assert output == "my name is john"

def test_encode(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can encode single sequence
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name", "is", "john"]
    assert type(output.ids) == list
    assert type(output.type_ids) == list
    assert type(output.offsets) == list
    with pytest.warns(DeprecationWarning):
        assert type(output.words) == list
    assert type(output.word_ids) == list
    assert type(output.special_tokens_mask) == list
    assert type(output.attention_mask) == list
    assert type(output.overflowing) == list

    # Can encode a pair of sequences
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "name", "is", "john", "pair"]
    assert isinstance(pickle.loads(pickle.dumps(output)), Encoding)

    # Can encode a single pre-tokenized sequence
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
    assert output.tokens == ["my", "name", "is", "john"]

    # Can encode a batch with both a single sequence and a pair of sequences
    output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
    assert len(output) == 2

def test_add_tokens(self):
    tokenizer = Tokenizer(BPE())
    added = tokenizer.add_tokens(["my", "name", "is", "john"])
    assert added == 4

    added = tokenizer.add_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
    assert added == 2

def test_processing(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))

    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
    assert output.ids == [0, 2, 3, 1, 1, 6, 1]

def test_processing(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
    assert output.ids == [1, 2, 3, 0, 6, 0]

def test_roberta_parity(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
    original = tokenizer.encode("my name is john", "pair")

    tokenizer.post_processor = self.get_roberta()
    template = tokenizer.encode("my name is john", "pair")
    assert original.ids == template.ids

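# The get_roberta() helper used above is not included in this snippet. A minimal
# sketch of what it could look like, assuming it rebuilds the RoBERTa scheme with
# processors.TemplateProcessing and that the special-token ids match the vocab
# above ("<s>" -> 0, "</s>" -> 1); the helper name and template are illustrative,
# not the library's canonical implementation:
def get_roberta(self):
    return TemplateProcessing(
        single="<s> $A </s>",
        pair="<s> $A </s> </s> $B </s>",
        special_tokens=[("<s>", 0), ("</s>", 1)],
    )
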
def test_decode(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can decode single sequences
    output = tokenizer.decode([0, 1, 2, 3])
    assert output == "my name is john"

    # Can decode batch
    output = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
    assert output == ["my name is john", "pair"]

def test_get_vocab_size(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can retrieve vocab's size with added tokens
    size = tokenizer.get_vocab_size(with_added_tokens=True)
    assert size == 5

    # Can retrieve vocab's size without added tokens
    size = tokenizer.get_vocab_size(with_added_tokens=False)
    assert size == 0

def test_bert_parity(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
    original = tokenizer.encode("my name", "pair")

    tokenizer.post_processor = self.get_bert()
    template = tokenizer.encode("my name", "pair")
    assert original.ids == template.ids

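# Likewise, get_bert() is not shown here. A plausible sketch, assuming it mirrors
# BertProcessing via TemplateProcessing with the ids used above ("[SEP]" -> 0,
# "[CLS]" -> 1) and type_id 1 for the pair sequence; treat the helper as an
# assumption rather than the library's own code:
def get_bert(self):
    return TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )
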
def test_get_vocab(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can retrieve vocab with added tokens
    vocab = tokenizer.get_vocab(with_added_tokens=True)
    assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4}

    # Can retrieve vocab without added tokens
    vocab = tokenizer.get_vocab(with_added_tokens=False)
    assert vocab == {}

def test_add_tokens(self):
    tokenizer = Tokenizer(BPE())
    added = tokenizer.add_tokens(["my", "name", "is", "john"])
    assert added == 4

    tokens = [AddedToken("the"), AddedToken("quick", normalized=False), AddedToken()]
    assert tokens[0].normalized == True
    added = tokenizer.add_tokens(tokens)
    assert added == 2
    assert tokens[0].normalized == True
    assert tokens[1].normalized == False

def test_truncation(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)

    # Can truncate single sequences
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name"]

    # Can truncate pair sequences as well
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "pair"]

def test_post_process(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)
    tokenizer.enable_padding(length=4)

    encoding = tokenizer.encode("my name is john")
    pair_encoding = tokenizer.encode("pair")

    # Can post process a single encoding
    output = tokenizer.post_process(encoding)
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

    # Can post process a pair of encodings
    output = tokenizer.post_process(encoding, pair_encoding)
    assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]

def test_padding(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # By default it does nothing when encoding single sequence
    tokenizer.enable_padding()
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name"]

    # Can pad to the longest in a batch
    output = tokenizer.encode_batch(["my name", "my name is john"])
    assert all([len(encoding) == 4 for encoding in output])

    # Can pad to the specified length otherwise
    tokenizer.enable_padding(length=4)
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["my", "name", "pair", "[PAD]"]

def test_encode(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can encode single sequence
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name", "is", "john"]
    assert type(output.ids) == list
    assert type(output.type_ids) == list
    assert type(output.offsets) == list
    assert type(output.words) == list
    assert type(output.special_tokens_mask) == list
    assert type(output.attention_mask) == list
    assert type(output.overflowing) == list

    # Can encode a pair of sequences
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "name", "is", "john", "pair"]

    # Can encode a batch with both a single sequence and a pair of sequences
    output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
    assert len(output) == 2

def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files: {}".format(files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name), pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))

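# main() expects an argparse-style namespace and a get_smi_files() helper that is
# not shown here. A minimal sketch of the argument parser it assumes: the argument
# names are taken from the attributes used above, while the defaults and help
# strings are illustrative guesses, not values from the original script.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train/test a BPE tokenizer on SMILES data")
    parser.add_argument("--do_train", action="store_true")
    parser.add_argument("--do_test", action="store_true")
    parser.add_argument("--training_files", type=str, help="Location of the .smi training files")
    parser.add_argument("--test_string", type=str, help="SMILES string to encode when --do_test is set")
    parser.add_argument("--tokenizer_name", type=str, default="smiles_bpe.json")
    parser.add_argument("--vocab_size", type=int, default=1000)
    parser.add_argument("--min_frequency", type=int, default=2)
    parser.add_argument("--pad_len", type=int, default=128)
    main(parser.parse_args())
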
def test_truncation(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)

    # Can truncate single sequences
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name"]

    # Can truncate pair sequences as well
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "pair"]

    # Can get the params and give them to enable_truncation
    trunc = tokenizer.truncation
    tokenizer.enable_truncation(**trunc)

    # Left truncation direction
    tokenizer.enable_truncation(2, direction="left")
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["is", "john"]
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["john", "pair"]