def test_get_set_components(self):
    toki = Tokenizer(models.BPE())
    toki.normalizer = normalizers.NFC()
    toki.pre_tokenizer = pre_tokenizers.ByteLevel()
    toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
    toki.decoder = decoders.ByteLevel()

    tokenizer = BaseTokenizer(toki)

    assert isinstance(tokenizer.model, models.BPE)
    assert isinstance(tokenizer.normalizer, normalizers.NFC)
    assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
    assert isinstance(tokenizer.post_processor, processors.BertProcessing)
    assert isinstance(tokenizer.decoder, decoders.ByteLevel)

    tokenizer.model = models.Unigram()
    assert isinstance(tokenizer.model, models.Unigram)
    tokenizer.normalizer = normalizers.NFD()
    assert isinstance(tokenizer.normalizer, normalizers.NFD)
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
    tokenizer.post_processor = processors.ByteLevel()
    assert isinstance(tokenizer.post_processor, processors.ByteLevel)
    tokenizer.decoder = decoders.WordPiece()
    assert isinstance(tokenizer.decoder, decoders.WordPiece)
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Trains a tokenizer on all the jsonl files in `input_dir` and saves it to `save_path`

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """
    if tokenizer_type == "BPE":
        model = models.BPE()
    else:
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')
    tokenizer = Tokenizer(model)

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train_from_iterator(json_iterator(input_dir), trainer)

    # And Save it
    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
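# --- Usage sketch (not from the original source) ---
# A minimal, hypothetical driver for train_tokenizer above. The paths are
# placeholders, and json_iterator is assumed to be defined elsewhere in the
# same module, yielding raw text strings from the jsonl files in input_dir.
if __name__ == "__main__":
    train_tokenizer(
        input_dir="./data",            # hypothetical directory of jsonl files
        save_path="./tokenizer.json",  # hypothetical output file
        tokenizer_type="BPE",
        vocab_size=52000,
    )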
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
    add_prefix_space: bool = False,
    lowercase: bool = False,
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
    continuing_subword_prefix: Optional[str] = None,
    end_of_word_suffix: Optional[str] = None,
    trim_offsets: bool = False,
):
    if vocab is not None and merges is not None:
        tokenizer = Tokenizer(
            BPE(
                vocab,
                merges,
                dropout=dropout,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or "",
            ))
    else:
        tokenizer = Tokenizer(BPE())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(
        trim_offsets=trim_offsets)

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "lowercase": lowercase,
        "dropout": dropout,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
        "trim_offsets": trim_offsets,
    }

    super().__init__(tokenizer, parameters)
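# --- Usage sketch (assumptions noted) ---
# Assuming the __init__ above belongs to a ByteLevelBPETokenizer-style wrapper
# around BaseTokenizer, it could be constructed from existing vocab/merges
# files and used to encode text. The class name and file names here are
# hypothetical; the keyword arguments follow the signature shown above.
tok = ByteLevelBPETokenizer(
    vocab="vocab.json",    # hypothetical vocab file
    merges="merges.txt",   # hypothetical merges file
    add_prefix_space=True,
    lowercase=False,
)
print(tok.encode("Hello world").tokens)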
def setup_tokenizer(_):
    # Initialize a tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    normalizers = [NFKC()]
    tokenizer.normalizer = Sequence(normalizers)
    return tokenizer
def converted(self) -> Tokenizer:
    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        ))

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=self.original_tokenizer.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

    return tokenizer
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files: {}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
        for s in g:
            f.write(s)
            f.write("\n\n")
    elif args.file_type == 'txt':
        shutil.copyfile(str(arch), str(fp))

    data_files = glob(str(out_path / "*.txt"))
    data_files = random.sample(data_files, int(0.2 * len(data_files)))
    assert len(data_files) > 0, 'No data files found'

    # Initialize a tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        min_frequency=2,
        special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train(trainer, data_files)

    # And Save it
    tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
    tokenizer.save(str(tokenizer_path), pretty=True)
    print(f'tokenizer saved at {str(tokenizer_path)}')
    return tokenizer_path
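# --- Usage sketch (not from the original source) ---
# A minimal check, assuming the json file written above is on disk: the trained
# tokenizer can be reloaded with Tokenizer.from_file and exercised on a round trip.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("byte-level-bpe.tokenizer.json")  # path returned above (assumed local)
enc = tok.encode("Hello world")
print(enc.tokens)
print(tok.decode(enc.ids))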