def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len, strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name), pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
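# Hedged usage sketch (not part of the original script): main() expects an
# argparse namespace. The flag names below are assumptions inferred from the
# attributes it reads, and get_smi_files() is a project-specific helper assumed
# to return a list of SMILES text files.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train/test a BPE tokenizer on SMILES files")
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_test', action='store_true')
    parser.add_argument('--training_files', default='data/smiles/')
    parser.add_argument('--vocab_size', type=int, default=2000)
    parser.add_argument('--min_frequency', type=int, default=2)
    parser.add_argument('--pad_len', type=int, default=256)
    parser.add_argument('--tokenizer_name', default='smiles_bpe.json')
    parser.add_argument('--test_string', default='CC(=O)Oc1ccccc1C(=O)O')
    main(parser.parse_args())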
    # Tail of the tokenizer-training helper (the function header is not shown
    # in this excerpt):
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer


ids = bert_tokenizer.encode(sentences[10]).ids
bert_tokenizer.decode(ids)

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.UnigramTrainer(
    vocab_size=20000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
)
tokenizer.train_from_iterator(sentences, trainer=trainer)

tokenizer.encode(sentences[4]).ids
tokenizer.decode(tokenizer.encode(sentences[4]).ids)

tokenizer.save('bert_out/test2')
# save_pretrained() is not defined on a bare Tokenizer; wrap it in a
# transformers PreTrainedTokenizerFast first (see the sketch below).
# tokenizer.save_pretrained('bert_out/test')
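# Hedged sketch (assumes the transformers package is available): save_pretrained()
# lives on the transformers wrappers, so wrapping the trained Tokenizer in a
# PreTrainedTokenizerFast gives the usual Hugging Face saving/loading API.
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    pad_token="<PAD>",
    bos_token="<BOS>",
    eos_token="<EOS>",
)
hf_tokenizer.save_pretrained('bert_out/test')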
class LitTokenizer:
    def __init__(self, padding=False, truncation=False, max_length=None, lower=False, lang=None):
        super().__init__()
        self.UNK_WORD = '[UNK]'
        self.PAD_WORD = '[PAD]'
        self.MASK_WORD = '[MASK]'
        self.SOS_WORD = '[SOS]'
        self.EOS_WORD = '[EOS]'
        self.special_tokens = [
            self.UNK_WORD, self.PAD_WORD, self.MASK_WORD, self.SOS_WORD, self.EOS_WORD
        ]

        # Define tokenizer
        self.tokenizer = None
        self.configure_tokenizers(padding, truncation, max_length, lower)

        # Other
        self.lang = lang

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def configure_tokenizers(self, padding, truncation, max_length, lower):
        # Settings
        pad_length = None
        if padding in {True, "longest"}:
            pass
        elif padding in {"max_length"}:
            pad_length = max_length
        elif padding in {False, "do_not_pad"}:
            pass
        else:
            raise ValueError("Unknown padding type")

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (Needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
            ],
        )
        self.tokenizer.post_processor = basic_template

        if padding:
            self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
                                          pad_token=self.PAD_WORD,
                                          length=pad_length)
        if truncation:
            self.tokenizer.enable_truncation(max_length, stride=0, strategy='longest_first')

    def load_vocab(self, vocab, merges):
        vocab, merges = tok_model.read_file(vocab, merges)
        self.tokenizer.model = tok_model(vocab, merges)

    def train_vocab(self, files, vocab_size=32000, min_frequency=3):
        # Train trainer
        trainer = tok_trainer(vocab_size=vocab_size, min_frequency=min_frequency)
        self.tokenizer.train(files, trainer)

    def save_vocab(self, output_dir, prefix):
        self.tokenizer.model.save(output_dir, prefix)

    def pad(self, examples, keys=None):
        pad_idx = self.special_tokens.index(self.PAD_WORD)

        # Keys to modify
        if not keys:
            keys = list(examples[0].keys())

        d = {}
        for k in keys:
            # Collect same-type items (list of IDs, list of masks,...)
            d[k] = [x[k] for x in examples]

            # Get max length (value to pad)
            max_length = max([x.shape[-1] for x in d[k]])

            # Apply padding
            for i, x in enumerate(examples):
                unpadded_t = x[k]
                if k == "ids":
                    tmp = torch.full((max_length,), fill_value=pad_idx,
                                     device=unpadded_t.device)  # All padding
                elif k == "attention_mask":
                    tmp = torch.full((max_length,), fill_value=0,
                                     device=unpadded_t.device)  # No attention mask
                else:
                    raise TypeError("Unknown key")
                tmp[:unpadded_t.shape[-1]] = unpadded_t
                d[k][i] = tmp
        return d

    def encode(self, x):
        return self.tokenizer.encode(x)

    def decode(self, x):
        if isinstance(x, torch.Tensor):
            assert len(x.shape) == 2
            x = x.detach().cpu().numpy()
        return [self.tokenizer.decode(x_i) for x_i in x]
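# Hedged usage sketch: LitTokenizer relies on module-level aliases (tok_model,
# tok_trainer, tok_decoder) and imports that are not shown in this excerpt, and
# the training file path below is an assumption.
lit_tok = LitTokenizer(padding=True, truncation=True, max_length=128, lower=True, lang="en")
lit_tok.train_vocab(files=["data/train.en.txt"], vocab_size=32000, min_frequency=3)
print("Vocab size:", lit_tok.get_vocab_size())

enc = lit_tok.encode("Hello world!")
print(enc.tokens, enc.ids)        # [SOS] ... [EOS] added by the template
print(lit_tok.decode([enc.ids]))  # decode() expects a batch of id sequences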
tokenizer = Tokenizer(models.BPE.from_files(args.vocab, args.merges))

# Test the good custom classes
good_custom = GoodCustom()
good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
good_decoder = decoders.Decoder.custom(good_custom)

tokenizer.pre_tokenizer = good_pretok
tokenizer.decoder = good_decoder

print("Tokenization will work with good custom:")
encoding = tokenizer.encode("Hey friend!")
print(f"IDS: {encoding.ids}")
print(f"TOKENS: {encoding.tokens}")
print(f"OFFSETS: {encoding.offsets}")
decoded = tokenizer.decode(encoding.ids)
print(f"DECODED: {decoded}")

# Now test with the bad custom classes
bad_custom = BadCustom()
bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
bad_decoder = decoders.Decoder.custom(bad_custom)

tokenizer.pre_tokenizer = bad_pretok
tokenizer.decoder = bad_decoder

try:
    encoding = tokenizer.encode("Hey friend!")
except Exception:
    print("Bad tokenizer didn't work")
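# Hedged sketch (the excerpt above does not define GoodCustom/BadCustom): an
# object handed to PreTokenizer.custom() / Decoder.custom() is plain Python.
# Following the library's custom-components example, it exposes
# pre_tokenize(pretok) and decode(tokens); the whitespace splitting below is
# only an illustrative choice, and BadCustom simply lacks the required hooks.
class GoodCustom:
    def whitespace_split(self, i, normalized_string):
        # Split one NormalizedString on spaces by slicing it, which keeps the
        # offset alignments the library needs.
        text = str(normalized_string)
        splits, start = [], 0
        for pos, char in enumerate(text):
            if char == " ":
                if pos > start:
                    splits.append(normalized_string[start:pos])
                start = pos + 1
        if start < len(text):
            splits.append(normalized_string[start:])
        return splits

    def pre_tokenize(self, pretok):
        # pretok is a PreTokenizedString; split it in place
        pretok.split(self.whitespace_split)

    def decode(self, tokens):
        # Join decoded tokens back into a single string
        return " ".join(tokens)


class BadCustom:
    # No pre_tokenize / decode methods, so encode()/decode() fail at runtime
    pass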
tokenizer.model.save(str(PATH))

# %%
# Reload for use when needed (it integrates seamlessly with transformers).
# Note: in practice you must rebuild the tokenizer here exactly as it was
# configured for training, and only then load the model.
tokenizer.model = BPE(vocab=str(PATH / 'vocab.json'), merges=str(PATH / 'merges.txt'))

# %%
# Encode / decode
encoded = tokenizer.encode("This is a simple input to be tokenized.")
print("Encoded string: {}".format(encoded.tokens))
decoded = tokenizer.decode(encoded.ids)
print("Decoded string: {}".format(decoded))

# %%
from tokenizers import ByteLevelBPETokenizer

# tokenizers also ships high-level wrappers for some classic tokenization
# algorithms; for instance, the above can be rewritten simply with
# `ByteLevelBPETokenizer`.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=['data/big.txt'], vocab_size=25000, show_progress=True)

SAVE_PATH = Path('tokenizers')
PATH = SAVE_PATH / 'bytelevel-bpe-tokenizer-model'
if not PATH.exists():
    PATH.mkdir(parents=True, exist_ok=True)
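# %%
# Hedged continuation (not in the original notebook): persist the freshly
# trained ByteLevelBPETokenizer into the directory created above and reload it.
# save_model() writes vocab.json and merges.txt, mirroring tokenizer.model.save().
tokenizer.save_model(str(PATH))

reloaded = ByteLevelBPETokenizer.from_file(str(PATH / 'vocab.json'),
                                           str(PATH / 'merges.txt'))
print(reloaded.encode("This is a simple input to be tokenized.").tokens)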