def get_tokenizer(args):
    """Return a byte-level BPE tokenizer, loading it from disk or training it.

    The tokenizer normalizes text with NFKC, drops ``\\r`` and turns ``\\n``
    into a space, and uses byte-level pre-tokenization and decoding.

    If ``vocab.json`` and ``merges.txt`` are both present in
    ``args.tokenizer_dir`` the BPE model is loaded from them.  Otherwise a new
    model is trained on ``train.json``, ``val.json`` and ``test.json`` found in
    ``args.data_dir`` and saved into ``args.tokenizer_dir``.

    :param args: namespace providing ``tokenizer_dir``, ``data_dir`` and
        ``vocab_size`` (``vocab_size`` is only read when training).
    :return: the configured ``Tokenizer``.
    """
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''), Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    vocab_fn = os.path.join(args.tokenizer_dir, 'vocab.json')
    merge_fn = os.path.join(args.tokenizer_dir, 'merges.txt')

    # Decide on the model files themselves, not on the directory: a directory
    # left behind by an interrupted training run must trigger a retrain
    # instead of failing inside ``BPE.from_file`` on every later call.
    if os.path.isfile(vocab_fn) and os.path.isfile(merge_fn):
        tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
    else:
        os.makedirs(args.tokenizer_dir, exist_ok=True)
        trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"])
        files = [
            os.path.join(args.data_dir, split)
            for split in ['train.json', 'val.json', 'test.json']
        ]
        tokenizer.train(files=files, trainer=trainer)
        tokenizer.model.save(args.tokenizer_dir)

    return tokenizer
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Fit a byte-level tokenizer on the texts produced by
    `json_iterator(input_dir)` and write it to `save_path`.

    :param input_dir: directory handed to `json_iterator` (expected to hold
        the json/jsonl training files)
    :param save_path: file path the trained tokenizer is saved to
    :param tokenizer_type: kind of model to train; only "BPE" is supported
    :param vocab_size: int, target size of the tokenizer's vocab
    :return: None
    :raises NotImplementedError: if `tokenizer_type` is anything but "BPE"
    """
    # Reject unsupported model types before building anything.
    if tokenizer_type != "BPE":
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')

    tokenizer = Tokenizer(models.BPE())

    # Byte-level pipeline: NFKC normalization, byte-level pre-tokenization
    # (with a prefix space), byte-level decoding and offset trimming.
    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.decoder = decoders.ByteLevel()

    # Train from the iterator rather than from file paths.
    bpe_trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<|endoftext|>", "<|padding|>"],
    )
    tokenizer.train_from_iterator(json_iterator(input_dir), bpe_trainer)

    # Persist as pretty-printed JSON and report where it went.
    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
def train(
    self,
    files: Union[str, List[str]],
    vocab_size: int = 30000,
    min_frequency: int = 20,
    special_tokens: List[Union[str, AddedToken]] = [
        "<pad>",
        "<unk>",
        "<s>",
        "<nl>",
        "</s>",
        "<mask>",
    ],
    limit_alphabet: int = 6000,
    initial_alphabet: List[str] = [],
    show_progress: bool = True,
):
    """Train the wrapped tokenizer's BPE model on one or more files.

    Args:
        files: a single path or a list of paths to train on.
        vocab_size: forwarded to ``BpeTrainer``.
        min_frequency: forwarded to ``BpeTrainer``.
        special_tokens: forwarded to ``BpeTrainer``.
        limit_alphabet: forwarded to ``BpeTrainer``.
        initial_alphabet: forwarded to ``BpeTrainer``.
        show_progress: forwarded to ``BpeTrainer``.

    The list defaults are shared between calls; this method only forwards
    them and never modifies them itself.
    """
    # A lone path is promoted to a one-element list.
    file_list = [files] if isinstance(files, str) else files

    trainer_options = {
        "vocab_size": vocab_size,
        "min_frequency": min_frequency,
        "special_tokens": special_tokens,
        "limit_alphabet": limit_alphabet,
        "initial_alphabet": initial_alphabet,
        "show_progress": show_progress,
    }
    bpe_trainer = trainers.BpeTrainer(**trainer_options)

    # NOTE(review): the positional order (trainer, files) looks like the older
    # `tokenizers` API; newer releases appear to expect (files, trainer) --
    # confirm against the pinned library version.
    self._tokenizer.train(bpe_trainer, file_list)
def test_continuing_prefix_trainer_mistmatch(self):
    """Train, save and reload a BPE tokenizer whose model and trainer differ.

    The model is built with ``continuing_subword_prefix="##"`` while the
    trainer is created without one.  The test has no explicit assertions; it
    passes when training, saving and loading all complete without raising.
    """
    unk = "[UNK]"
    saved_path = "data/tokenizer.json"

    bpe_model = models.BPE(unk_token=unk, continuing_subword_prefix="##")
    tokenizer = Tokenizer(bpe_model)
    trainer = trainers.BpeTrainer(special_tokens=[unk])

    # Split on whitespace first, then isolate every digit.
    whitespace_split = pre_tokenizers.Whitespace()
    digit_split = pre_tokenizers.Digits(individual_digits=True)
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [whitespace_split, digit_split])

    tokenizer.train(files=["data/big.txt"], trainer=trainer)

    # Round-trip through disk: the saved file must be loadable again.
    tokenizer.save(saved_path)
    tokenizer.from_file(saved_path)
def train(cls,
          dataset: Sequence[str],
          vocab_size: int = 1000,
          min_frequency: int = 2,
          dropout: float = 0.0,
          max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
    """Create a new instance of ``cls`` and train its tokenizer on ``dataset``.

    Args:
        dataset: texts passed to ``train_from_iterator``.
        vocab_size: forwarded to ``BpeTrainer``.
        min_frequency: forwarded to ``BpeTrainer``.
        dropout: passed to the ``cls`` constructor.
        max_length: passed to the ``cls`` constructor.

    Returns:
        The trained instance.  Its model's ``dropout`` attribute is reset to
        ``None`` once training has finished.
    """
    trained = cls(dropout=dropout, max_length=max_length)

    trainer_options = {
        "vocab_size": vocab_size,
        "min_frequency": min_frequency,
        "special_tokens": [cls.pad_token, cls.unk_token],
    }
    bpe_trainer = trainers.BpeTrainer(**trainer_options)

    backend = trained.tokenizer
    backend.train_from_iterator(dataset, trainer=bpe_trainer)
    # Clear dropout on the trained model before handing the instance back.
    backend.model.dropout = None
    return trained
def main(args):
    """Train a byte-level BPE tokenizer on text files and save it.

    Steps:
      1. Seed ``random`` and collect input files via ``listfiles(args.input)``;
         log an error and return if there are none.
      2. Build the tokenizer with ``setup_tokenizer(args)`` and, when
         ``args.extra_tokens`` is set, add the tokens listed in that file
         (one per line).
      3. Train a BPE model and save the tokenizer as
         ``byte-level-bpe.tokenizer.json`` inside ``args.output``.
      4. Reload the saved file and check that it encodes a sample sentence
         exactly like the in-memory tokenizer, logging an error otherwise.
      5. Save the bare model files into ``args.output``.

    :param args: namespace providing ``random_seed``, ``input``, ``output``,
        ``extra_tokens`` and ``vocab_size``.

    Exits the process with status -1 when the extra-token list is longer than
    ``args.vocab_size``.
    """
    random.seed(args.random_seed)

    txt_files = listfiles(args.input)
    if not txt_files:
        logging.error("no data files found")
        return

    os.makedirs(args.output, exist_ok=True)

    # setup
    tokenizer = setup_tokenizer(args)
    if args.extra_tokens:
        with tf.io.gfile.GFile(args.extra_tokens) as fd:
            words = [line.strip() for line in fd.readlines()]
        # Validate before touching the tokenizer so an invalid configuration
        # aborts without having mutated anything.
        if args.vocab_size < len(words):
            logging.error(
                "vocab size is less than the provided tokens. aborting")
            sys.exit(-1)
        tokenizer.add_tokens(words)

    # train
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        min_frequency=2,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=[constants.PAD, constants.EOS],
    )
    # NOTE(review): the positional order (trainer, files) looks like the older
    # `tokenizers` API; newer releases appear to expect (files, trainer) --
    # confirm against the pinned library version.
    tokenizer.train(trainer, txt_files)

    # save
    tokenizer_path = os.path.join(args.output,
                                  "byte-level-bpe.tokenizer.json")
    tokenizer.save(tokenizer_path, pretty=True)
    encoded_gold = tokenizer.encode("I can feel the magic, can you?")
    logging.info("tokenizer saved at %s", tokenizer_path)

    # test
    tokenizer = Tokenizer.from_file(tokenizer_path)
    encoded = tokenizer.encode("I can feel the magic, can you?")
    # Compare the full id lists: a zip-based comparison stops at the shorter
    # sequence and would accept encodings that differ only in length.
    if encoded.ids != encoded_gold.ids:
        logging.error("saved tokenizer and trained tokenizer do not match")

    tokenizer.model.save(args.output)
    logging.info("tokenizer model saved at %s", args.output)
def train_from_iterator(
    self,
    iterator: Union[Iterator[str], Iterator[Iterator[str]]],
    vocab_size: int = 30000,
    min_frequency: int = 2,
    show_progress: bool = True,
    special_tokens: List[Union[str, AddedToken]] = [],
):
    """Train the wrapped tokenizer's BPE model from an iterator.

    Args:
        iterator: source of training text, given as strings or as iterators
            of strings; handed unchanged to the underlying tokenizer.
        vocab_size: forwarded to ``BpeTrainer``.
        min_frequency: forwarded to ``BpeTrainer``.
        show_progress: forwarded to ``BpeTrainer``.
        special_tokens: forwarded to ``BpeTrainer``.

    The trainer's initial alphabet is always the one returned by
    ``pre_tokenizers.ByteLevel.alphabet()``.
    """
    trainer_options = {
        "vocab_size": vocab_size,
        "min_frequency": min_frequency,
        "show_progress": show_progress,
        "special_tokens": special_tokens,
        "initial_alphabet": pre_tokenizers.ByteLevel.alphabet(),
    }
    bpe_trainer = trainers.BpeTrainer(**trainer_options)
    self._tokenizer.train_from_iterator(iterator, trainer=bpe_trainer)
def train(
    self,
    files: Union[str, List[str]],
    vocab_size: int = 30000,
    min_frequency: int = 2,
    show_progress: bool = True,
    special_tokens: List[Union[str, AddedToken]] = [],
):
    """Train the wrapped tokenizer's BPE model on one or more files.

    Args:
        files: a single path or a list of paths to train on.
        vocab_size: forwarded to ``BpeTrainer``.
        min_frequency: forwarded to ``BpeTrainer``.
        show_progress: forwarded to ``BpeTrainer``.
        special_tokens: forwarded to ``BpeTrainer``.

    The trainer's initial alphabet is always the one returned by
    ``pre_tokenizers.ByteLevel.alphabet()``.
    """
    # A lone path is promoted to a one-element list.
    file_list = [files] if isinstance(files, str) else files

    trainer_options = {
        "vocab_size": vocab_size,
        "min_frequency": min_frequency,
        "show_progress": show_progress,
        "special_tokens": special_tokens,
        "initial_alphabet": pre_tokenizers.ByteLevel.alphabet(),
    }
    bpe_trainer = trainers.BpeTrainer(**trainer_options)

    # NOTE(review): the positional order (trainer, files) looks like the older
    # `tokenizers` API; newer releases appear to expect (files, trainer) --
    # confirm against the pinned library version.
    self._tokenizer.train(bpe_trainer, file_list)
def test_train_parallelism_with_custom_pretokenizer(self, train_files):
    """Train a BPE tokenizer that uses a Python-defined pre-tokenizer.

    ``TOKENIZERS_PARALLELISM`` is removed from the environment before
    training.  The test has no explicit assertions; it passes when training
    on ``train_files["small"]`` completes without raising.
    """

    class GoodCustomPretok:
        def split(self, n, normalized):
            # The only point is to show that a List[NormalizedString] is an
            # accepted return value; returning the same piece twice has no
            # meaning beyond that.
            return [normalized, normalized]

        def pre_tokenize(self, pretok):
            pretok.split(self.split)

    bpe_tokenizer = Tokenizer(models.BPE())
    bpe_tokenizer.normalizer = normalizers.Lowercase()
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
        GoodCustomPretok())

    # Drop the variable if it is set; do nothing otherwise.
    os.environ.pop("TOKENIZERS_PARALLELISM", None)

    quiet_trainer = trainers.BpeTrainer(
        special_tokens=["<unk>"], show_progress=False)
    bpe_tokenizer.train([train_files["small"]], trainer=quiet_trainer)
def test_can_modify(self):
    """Check that every ``BpeTrainer`` option can be read back and reassigned.

    The first half verifies that constructor arguments are exposed as
    attributes; the second half reassigns each attribute and verifies that
    the new value is the one read back.
    """
    trainer = trainers.BpeTrainer(
        vocab_size=12345,
        min_frequency=12,
        show_progress=False,
        special_tokens=["1", "2"],
        limit_alphabet=13,
        initial_alphabet=["a", "b", "c"],
        continuing_subword_prefix="pref",
        end_of_word_suffix="suf",
    )

    assert trainer.vocab_size == 12345
    assert trainer.min_frequency == 12
    assert trainer.show_progress is False
    assert trainer.special_tokens == [
        AddedToken("1"),
        AddedToken("2"),
    ]
    assert trainer.limit_alphabet == 13
    # Sorted because the test does not rely on the order the alphabet is
    # returned in.
    assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]
    assert trainer.continuing_subword_prefix == "pref"
    assert trainer.end_of_word_suffix == "suf"

    # Modify these
    trainer.vocab_size = 20000
    assert trainer.vocab_size == 20000
    trainer.min_frequency = 1
    assert trainer.min_frequency == 1
    trainer.show_progress = True
    assert trainer.show_progress is True
    trainer.special_tokens = []
    assert trainer.special_tokens == []
    trainer.limit_alphabet = None
    assert trainer.limit_alphabet is None
    trainer.initial_alphabet = ["d", "z"]
    assert sorted(trainer.initial_alphabet) == ["d", "z"]
    trainer.continuing_subword_prefix = None
    assert trainer.continuing_subword_prefix is None
    trainer.end_of_word_suffix = None
    # Check the attribute that was just assigned; the previous version
    # re-checked continuing_subword_prefix and never verified this setter.
    assert trainer.end_of_word_suffix is None
def train_from_iterator(
    self,
    iterator: Union[Iterator[str], Iterator[Iterator[str]]],
    vocab_size: int = 30000,
    min_frequency: int = 2,
    special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
    limit_alphabet: int = 1000,
    initial_alphabet: List[str] = [],
    show_progress: bool = True,
):
    """Train the wrapped tokenizer's BPE model from an iterator.

    Args:
        iterator: source of training text, given as strings or as iterators
            of strings; handed unchanged to the underlying tokenizer.
        vocab_size: forwarded to ``BpeTrainer``.
        min_frequency: forwarded to ``BpeTrainer``.
        special_tokens: forwarded to ``BpeTrainer``.
        limit_alphabet: forwarded to ``BpeTrainer``.
        initial_alphabet: forwarded to ``BpeTrainer``.
        show_progress: forwarded to ``BpeTrainer``.

    The list defaults are shared between calls; this method only forwards
    them and never modifies them itself.
    """
    trainer_options = {
        "vocab_size": vocab_size,
        "min_frequency": min_frequency,
        "special_tokens": special_tokens,
        "limit_alphabet": limit_alphabet,
        "initial_alphabet": initial_alphabet,
        "show_progress": show_progress,
    }
    bpe_trainer = trainers.BpeTrainer(**trainer_options)
    self._tokenizer.train_from_iterator(iterator, trainer=bpe_trainer)
def main(args):
    """Train and/or test a byte-level BPE tokenizer for SMILES strings.

    When ``args.do_train`` is set, a tokenizer is trained on the files
    returned by ``get_smi_files(args.training_files)`` and saved under
    ``tokenizers/<args.tokenizer_name>``.  When ``args.do_test`` is set, the
    tokenizer is loaded from that same path and ``args.test_string`` is
    encoded and decoded, with the results printed.

    :param args: namespace providing ``do_train``, ``do_test``,
        ``training_files``, ``vocab_size``, ``pad_len``, ``min_frequency``,
        ``tokenizer_name`` and ``test_string``.
    """
    tokenizer_path = os.path.join('tokenizers', args.tokenizer_name)

    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        # NOTE(review): "<unk>" is named as the unknown token here but is not
        # passed to the trainer as a special token below -- confirm it ends
        # up in the vocabulary.
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        # NOTE(review): pad_id = vocab_size + 2 presumably assumes the trained
        # vocabulary reaches exactly vocab_size and that "<start>"/"<end>"
        # take the next two ids -- verify against the saved tokenizer.
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])

        # Make sure the output directory exists so a fresh checkout does not
        # lose the whole training run at the save step.
        os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
        tokenizer.save(tokenizer_path, pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(tokenizer_path)
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
for s in g: f.write(s) f.write("\n\n") elif args.file_type == 'txt': shutil.copyfile(str(arch), str(fp)) data_files = glob(str(out_path / "*.txt")) data_files = random.sample(data_files, int(0.2 * len(data_files))) assert len(data_files) > 0, 'No data files found' # Initialize a tokenizer tokenizer = Tokenizer(models.BPE()) # Customize pre-tokenization and decoding tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True) tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) tokenizer.normalizer = NFKC() # And then train trainer = trainers.BpeTrainer(vocab_size=args.vocab_size, min_frequency=2, special_tokens=["<|endoftext|>", "<|padding|>"]) tokenizer.train(trainer, data_files) # And Save it tokenizer_path = out_path / "byte-level-bpe.tokenizer.json" tokenizer.save(str(tokenizer_path), pretty=True) print(f'tokenizer saved at {str(tokenizer_path)}') return tokenizer_path
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#train-a-new-tokenizer from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors # Initialize a tokenizer tokenizer = Tokenizer(models.BPE()) # Customize pre-tokenization and decoding tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False) # TODO True tokenizer.decoder = decoders.ByteLevel() tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2) tokenizer.train(trainer, ['bar']) encoded = tokenizer.encode(seq) print(encoded.tokens) # TODO: Use a clustered set of proteins like UniRef50 # -- https://www.uniprot.org/help/uniref # TODO: Use an LSTM to train on sequences, then freeze early layers and add # classification backend, retrain. # https://github.com/huggingface/tokenizers/tree/master/bindings/python # https://github.com/huggingface/tokenizers/tree/master/bindings/python#provided-tokenizers from tokenizers import CharBPETokenizer tokenizer = CharBPETokenizer(bert_normalizer=False) tokenizer.train(['./bar'], vocab_size=1000, min_frequency=2) # tokenizer.encode(seq).tokens