Example #1
def get_tokenizer(args):
    # Byte-level BPE tokenizer: NFKC-normalize, strip '\r' and turn newlines into spaces
    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''),
         Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    if os.path.isdir(args.tokenizer_dir):
        # Reuse a previously trained vocabulary and merge rules
        vocab_fn = os.path.join(args.tokenizer_dir, 'vocab.json')
        merge_fn = os.path.join(args.tokenizer_dir, 'merges.txt')
        tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
    else:
        # Otherwise train a new BPE model on the train/val/test splits
        os.makedirs(args.tokenizer_dir)
        trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"])
        files = [
            os.path.join(args.data_dir, split)
            for split in ['train.json', 'val.json', 'test.json']
        ]
        tokenizer.train(files=files, trainer=trainer)
        tokenizer.model.save(args.tokenizer_dir)

    return tokenizer
Example #2
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Trains a tokenizer on all the jsonl files in `input_dir` and saves it to `save_path`.

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train.
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """

    if tokenizer_type == "BPE":
        model = models.BPE()
    else:
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')
    tokenizer = Tokenizer(model)

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train_from_iterator(json_iterator(input_dir), trainer)

    # And save it
    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
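A minimal usage sketch (the data directory and output path below are placeholder assumptions, not taken from the snippet above):

# Hypothetical call: "./data" is assumed to hold .jsonl files and
# "./tokenizer.json" is an arbitrary output path.
train_tokenizer(input_dir="./data",
                save_path="./tokenizer.json",
                tokenizer_type="BPE",
                vocab_size=52000)
# The saved file can later be reloaded with Tokenizer.from_file("./tokenizer.json").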
Example #3
    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 20,
        special_tokens: List[Union[str, AddedToken]] = [
            "<pad>",
            "<unk>",
            "<s>",
            "<nl>",
            "</s>",
            "<mask>",
        ],
        limit_alphabet: int = 6000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """ Train the model using the given files """

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)
Example #4
    def test_continuing_prefix_trainer_mismatch(self):
        UNK = "[UNK]"
        special_tokens = [UNK]
        tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
        trainer = trainers.BpeTrainer(special_tokens=special_tokens)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
        )
        tokenizer.train(files=["data/big.txt"], trainer=trainer)

        tokenizer.save("data/tokenizer.json")

        Tokenizer.from_file("data/tokenizer.json")
Example #5
    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        # Drop BPE dropout after training so encoding is deterministic
        instance.tokenizer.model.dropout = None
        return instance
Example #6
def main(args):

    random.seed(args.random_seed)

    txt_files = listfiles(args.input)
    if not txt_files:
        logging.error("no data files found")
        return

    os.makedirs(args.output, exist_ok=True)

    # setup
    tokenizer = setup_tokenizer(args)

    if args.extra_tokens:
        with tf.io.gfile.GFile(args.extra_tokens) as fd:
            words = [l.strip() for l in fd.readlines()]
        tokenizer.add_tokens(words)
        if args.vocab_size < len(words):
            logging.error(
                "vocab size is less than the provided tokens. aborting")
            sys.exit(-1)

    # train
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        min_frequency=2,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=[constants.PAD, constants.EOS],
    )
    tokenizer.train(txt_files, trainer=trainer)

    # save
    tokenizer_path = os.path.join(args.output, "byte-level-bpe.tokenizer.json")
    tokenizer.save(tokenizer_path, pretty=True)
    encoded_gold = tokenizer.encode("I can feel the magic, can you?")
    logging.info("tokenizer saved at %s", tokenizer_path)

    # test
    tokenizer = Tokenizer.from_file(tokenizer_path)
    encoded = tokenizer.encode("I can feel the magic, can you?")

    if encoded.ids != encoded_gold.ids:
        logging.error("saved tokenizer and trained tokenizer do not match")

    tokenizer.model.save(args.output)

    logging.info("tokenizer model saved at %s", args.output)
Example #7
    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
    ):
        """ Train the model using the given iterator """

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
Example #8
    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
    ):
        """ Train the model using the given files """

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)
Example #9
    def test_train_parallelism_with_custom_pretokenizer(self, train_files):
        class GoodCustomPretok:
            def split(self, n, normalized):
                # Here we just test that we can return a List[NormalizedString];
                # it does not really make sense to return the same piece twice otherwise.
                return [normalized, normalized]

            def pre_tokenize(self, pretok):
                pretok.split(self.split)

        custom = pre_tokenizers.PreTokenizer.custom(GoodCustomPretok())
        bpe_tokenizer = Tokenizer(models.BPE())
        bpe_tokenizer.normalizer = normalizers.Lowercase()
        bpe_tokenizer.pre_tokenizer = custom

        if "TOKENIZERS_PARALLELISM" in os.environ:
            del os.environ["TOKENIZERS_PARALLELISM"]

        trainer = trainers.BpeTrainer(special_tokens=["<unk>"], show_progress=False)
        bpe_tokenizer.train([train_files["small"]], trainer=trainer)
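A hedged follow-up sketch (assumed, not part of the original test): encoding after training still runs through the custom pre-tokenizer, so every piece it returns twice shows up twice in the output.

        # Assumed continuation of the test above; "Hello there" is an arbitrary sample string.
        output = bpe_tokenizer.encode("Hello there")
        # The Lowercase normalizer runs first, then GoodCustomPretok.split returns the
        # piece twice, so the token sequence for "hello there" appears twice in output.tokens.
        print(output.tokens)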
Example #10
    def test_can_modify(self):
        trainer = trainers.BpeTrainer(
            vocab_size=12345,
            min_frequency=12,
            show_progress=False,
            special_tokens=["1", "2"],
            limit_alphabet=13,
            initial_alphabet=["a", "b", "c"],
            continuing_subword_prefix="pref",
            end_of_word_suffix="suf",
        )

        assert trainer.vocab_size == 12345
        assert trainer.min_frequency == 12
        assert trainer.show_progress == False
        assert trainer.special_tokens == [
            AddedToken("1"),
            AddedToken("2"),
        ]
        assert trainer.limit_alphabet == 13
        assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]
        assert trainer.continuing_subword_prefix == "pref"
        assert trainer.end_of_word_suffix == "suf"

        # Modify these
        trainer.vocab_size = 20000
        assert trainer.vocab_size == 20000
        trainer.min_frequency = 1
        assert trainer.min_frequency == 1
        trainer.show_progress = True
        assert trainer.show_progress == True
        trainer.special_tokens = []
        assert trainer.special_tokens == []
        trainer.limit_alphabet = None
        assert trainer.limit_alphabet == None
        trainer.initial_alphabet = ["d", "z"]
        assert sorted(trainer.initial_alphabet) == ["d", "z"]
        trainer.continuing_subword_prefix = None
        assert trainer.continuing_subword_prefix == None
        trainer.end_of_word_suffix = None
        assert trainer.end_of_word_suffix == None
Example #11
    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """ Train the model using the given iterator """

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
Example #12
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
Example #13
                  for s in g:
                      f.write(s)
                      f.write("\n\n")
          elif args.file_type == 'txt':
              shutil.copyfile(str(arch), str(fp))

  data_files = glob(str(out_path / "*.txt"))
  data_files = random.sample(data_files, int(0.2 * len(data_files)))

  assert len(data_files) > 0, 'No data files found'

  # Initialize a tokenizer
  tokenizer = Tokenizer(models.BPE())

  # Customize pre-tokenization and decoding
  tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
  tokenizer.decoder = decoders.ByteLevel()
  tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
  tokenizer.normalizer = NFKC()

  # And then train
  trainer = trainers.BpeTrainer(vocab_size=args.vocab_size, min_frequency=2, special_tokens=["<|endoftext|>", "<|padding|>"])
  tokenizer.train(data_files, trainer=trainer)

  # And save it
  tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
  tokenizer.save(str(tokenizer_path), pretty=True)

  print(f'tokenizer saved at {str(tokenizer_path)}')
  return tokenizer_path
Example #14
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#train-a-new-tokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())
# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

# TODO: consider add_prefix_space=True

tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2)
tokenizer.train(['bar'], trainer=trainer)

encoded = tokenizer.encode(seq)
print(encoded.tokens)

# TODO: Use a clustered set of proteins like UniRef50
# -- https://www.uniprot.org/help/uniref

# TODO: Use an LSTM to train on sequences, then freeze early layers and add
# classification backend, retrain.

# https://github.com/huggingface/tokenizers/tree/master/bindings/python
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#provided-tokenizers
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer(bert_normalizer=False)
tokenizer.train(['./bar'], vocab_size=1000, min_frequency=2)
# tokenizer.encode(seq).tokens