def test_get_set_components(self):
    """Components set on a raw Tokenizer are visible through BaseTokenizer, and vice versa."""
    raw = Tokenizer(models.BPE())
    raw.normalizer = normalizers.NFC()
    raw.pre_tokenizer = pre_tokenizers.ByteLevel()
    raw.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
    raw.decoder = decoders.ByteLevel()

    wrapper = BaseTokenizer(raw)

    # The wrapper exposes exactly the components installed on the raw tokenizer.
    assert isinstance(wrapper.model, models.BPE)
    assert isinstance(wrapper.normalizer, normalizers.NFC)
    assert isinstance(wrapper.pre_tokenizer, pre_tokenizers.ByteLevel)
    assert isinstance(wrapper.post_processor, processors.BertProcessing)
    assert isinstance(wrapper.decoder, decoders.ByteLevel)

    # Assigning a replacement through the wrapper is observable on read-back.
    for attr, replacement in (
        ("model", models.Unigram()),
        ("normalizer", normalizers.NFD()),
        ("pre_tokenizer", pre_tokenizers.Whitespace()),
        ("post_processor", processors.ByteLevel()),
        ("decoder", decoders.WordPiece()),
    ):
        setattr(wrapper, attr, replacement)
        assert isinstance(getattr(wrapper, attr), type(replacement))
Exemplo n.º 2
0
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train.
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """
    # Only byte-level BPE is supported for now; bail out early otherwise.
    if tokenizer_type != "BPE":
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')

    tokenizer = Tokenizer(models.BPE())

    # Byte-level pre-tokenization/decoding with NFKC normalization.
    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

    # Train from a streaming iterator over the jsonl corpus.
    bpe_trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train_from_iterator(json_iterator(input_dir), bpe_trainer)

    # Persist the trained tokenizer as pretty-printed JSON.
    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
Exemplo n.º 3
0
def get_tokenizer(args):
    """Build a byte-level BPE tokenizer, training it if no cached vocab exists."""
    tokenizer = Tokenizer(models.BPE())
    # NFKC-normalize and flatten CR/LF into spaces before byte-level splitting.
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''),
         Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    if not os.path.isdir(args.tokenizer_dir):
        # No cached model yet: train on the dataset splits and persist.
        os.makedirs(args.tokenizer_dir)
        bpe_trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"])
        split_files = [
            os.path.join(args.data_dir, name)
            for name in ['train.json', 'val.json', 'test.json']
        ]
        tokenizer.train(files=split_files, trainer=bpe_trainer)
        tokenizer.model.save(args.tokenizer_dir)
    else:
        # Reuse the previously trained vocabulary and merges.
        tokenizer.model = models.BPE.from_file(
            os.path.join(args.tokenizer_dir, 'vocab.json'),
            os.path.join(args.tokenizer_dir, 'merges.txt'))

    return tokenizer
Exemplo n.º 4
0
    def converted(self) -> Tokenizer:
        """Convert the slow tokenizer into a fast byte-level BPE Tokenizer with BERT-style templates."""
        source = self.original_tokenizer

        bpe_model = BPE(
            vocab=source.encoder,
            merges=list(source.bpe_ranks.keys()),
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        )
        tokenizer = Tokenizer(bpe_model)

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=source.add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()

        # Wrap single/pair sequences with [CLS]/[SEP], bound to the slow
        # tokenizer's ids for those tokens.
        cls_id = self.original_tokenizer.convert_tokens_to_ids("[CLS]")
        sep_id = self.original_tokenizer.convert_tokens_to_ids("[SEP]")
        tokenizer.post_processor = processors.TemplateProcessing(
            single="[CLS]:0 $A:0 [SEP]:0",
            pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
            special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
        )

        return tokenizer
    def converted(self) -> Tokenizer:
        """Convert the slow tokenizer into a fast RoBERTa-style byte-level BPE Tokenizer."""
        source = self.original_tokenizer

        tokenizer = Tokenizer(
            BPE(
                vocab=source.encoder,
                merges=list(source.bpe_ranks.keys()),
                dropout=None,
                continuing_subword_prefix="",
                end_of_word_suffix="",
                fuse_unk=False,
            ))

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=source.add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        # RoBERTa post-processing with the slow tokenizer's sep/cls tokens.
        tokenizer.post_processor = processors.RobertaProcessing(
            sep=(source.sep_token, source.sep_token_id),
            cls=(source.cls_token, source.cls_token_id),
            add_prefix_space=source.add_prefix_space,
            trim_offsets=True,  # True by default on Roberta (historical)
        )

        return tokenizer
Exemplo n.º 6
0
def train_tokenizer(langs, dataset, vocab_size):
    """Train a tokenizer on given list of languages.
    Reserves a special token for each language which is
    [LANG] where LANG is the language tag. These are assigned
    to tokens 5, 6, ..., len(langs) + 4.
    """
    # Byte-pair encoding with an explicit unknown token.
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # Fixed specials first, then one [LANG] token per language.
    specials = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]']
    specials += ['[' + lang + ']' for lang in langs]
    trainer = BpeTrainer(
        special_tokens=specials,
        vocab_size=vocab_size)

    # NFD + lowercase + accent stripping, byte-level pre-tokenization.
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # Stream the multilingual corpus through the trainer.
    tokenizer.train_from_iterator(_MultilingualIterator(dataset, langs), trainer)

    # Wrap every sequence in [CLS] ... [SEP]; pairs get a second segment id.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ], )
    return tokenizer
Exemplo n.º 7
0
    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int,
                                                                int]]]] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        """Byte-level BPE tokenizer wrapper; starts untrained when vocab/merges are absent."""
        if vocab is None or merges is None:
            # Nothing pretrained supplied: begin with an untrained BPE model.
            tokenizer = Tokenizer(BPE())
        else:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
                ))

        # Optional normalization chain: Unicode normalizer first, lowercasing second.
        norm_steps = []
        if unicode_normalizer:
            norm_steps.append(unicode_normalizer_from_str(unicode_normalizer))
        if lowercase:
            norm_steps.append(Lowercase())
        if len(norm_steps) == 1:
            tokenizer.normalizer = norm_steps[0]
        elif norm_steps:
            tokenizer.normalizer = Sequence(norm_steps)

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(
            trim_offsets=trim_offsets)

        # Record construction parameters so the wrapper can be serialized.
        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
            "lowercase": lowercase,
            "dropout": dropout,
            "unicode_normalizer": unicode_normalizer,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
            "trim_offsets": trim_offsets,
        }

        super().__init__(tokenizer, parameters)
Exemplo n.º 8
0
 def configure(self):
     """Read converter options and build a byte-level BPE tokenizer from vocab/merges files."""
     get = self.get_value_from_config
     self.testing_file = get('testing_file')
     self.vocab_file = get('vocab_file')
     self.merges_file = get('merges_file')
     self.max_seq_length = int(get('max_seq_length'))
     self.tokenizer = Tokenizer(BPE(str(self.vocab_file), str(self.merges_file)))
     self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
     self.tokenizer.decoder = decoders.ByteLevel()
Exemplo n.º 9
0
def generate_tokenizer(equations, output, vocab_size):
    """Train a byte-level BPE tokenizer on `equations` and write it to `output`."""
    from tokenizers import Tokenizer, pre_tokenizers
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    bpe_trainer = BpeTrainer(special_tokens=["[PAD]", "[BOS]", "[EOS]"],
                             vocab_size=vocab_size,
                             show_progress=True)
    # NOTE(review): `train(trainer, files)` is the pre-0.10 tokenizers call
    # order; newer releases expect `train(files, trainer=...)` — confirm the
    # pinned library version before changing.
    tokenizer.train(bpe_trainer, equations)
    tokenizer.save(path=output, pretty=False)
Exemplo n.º 10
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
    ):
        """Byte-level BPE tokenizer wrapper loaded from files, or empty when none given."""
        if vocab_file is None or merges_file is None:
            # No pretrained files supplied: start from an empty BPE model.
            tokenizer = Tokenizer(BPE.empty())
        else:
            tokenizer = Tokenizer(
                BPE.from_files(
                    vocab_file,
                    merges_file,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
                ))

        # Optional normalization chain: Unicode normalizer first, lowercasing second.
        norm_steps = []
        if unicode_normalizer:
            norm_steps.append(unicode_normalizer_from_str(unicode_normalizer))
        if lowercase:
            norm_steps.append(Lowercase())
        if len(norm_steps) == 1:
            tokenizer.normalizer = norm_steps[0]
        elif norm_steps:
            tokenizer.normalizer = Sequence(norm_steps)

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()

        # Record construction parameters so the wrapper can be serialized.
        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
            "lowercase": lowercase,
            "dropout": dropout,
            "unicode_normalizer": unicode_normalizer,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
        }

        super().__init__(tokenizer, parameters)
Exemplo n.º 11
0
def setup_tokenizer(_):
    """Create a byte-level BPE tokenizer with NFKC normalization (argument is ignored)."""
    tokenizer = Tokenizer(models.BPE())

    # Byte-level pre-tokenization/decoding plus an NFKC normalization step.
    tokenizer.normalizer = Sequence([NFKC()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    return tokenizer
Exemplo n.º 12
0
 def configure(self):
     """Verify the tokenizers package is usable, then build the byte-level BPE tokenizer."""
     if isinstance(Tokenizer, UnsupportedPackage):
         Tokenizer.raise_error(self.__provider__)
     get = self.get_value_from_config
     self.testing_file = get('testing_file')
     self.vocab_file = get('vocab_file')
     self.merges_file = get('merges_file')
     self.max_seq_length = int(get('max_seq_length'))
     self.tokenizer = Tokenizer(BPE(str(self.vocab_file), str(self.merges_file)))
     self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
     self.tokenizer.decoder = decoders.ByteLevel()
Exemplo n.º 13
0
 def configure(self):
     """Require the tokenizers package, then build the byte-level BPE tokenizer."""
     if Tokenizer is None:
         raise ConfigError(
             "Annotation converter: wikitext2raw required tokenizers package installation. "
             "Please install it before usage.")
     get = self.get_value_from_config
     self.testing_file = get('testing_file')
     self.vocab_file = get('vocab_file')
     self.merges_file = get('merges_file')
     self.max_seq_length = int(get('max_seq_length'))
     self.tokenizer = Tokenizer(BPE(str(self.vocab_file), str(self.merges_file)))
     self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
     self.tokenizer.decoder = decoders.ByteLevel()
Exemplo n.º 14
0
    def get_tokenizer(self, tokenizer_dir):
        """Load a trained byte-level BPE tokenizer from `tokenizer_dir`."""
        tokenizer = Tokenizer(models.BPE())
        # NFKC plus CR/LF flattening, byte-level pre-tokenization/decoding.
        tokenizer.normalizer = Sequence(
            [NFKC(), Replace('\r', ''),
             Replace('\n', ' ')])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        tokenizer.decoder = decoders.ByteLevel()

        # Swap in the trained vocabulary/merges and register the specials.
        tokenizer.model = models.BPE.from_file(
            os.path.join(tokenizer_dir, 'vocab.json'),
            os.path.join(tokenizer_dir, 'merges.txt'))
        tokenizer.add_special_tokens(['[UNK]', '[PAD]', '[BOS]', '[EOS]'])
        return tokenizer
    def get_tokenizer_trainer():
        """Build an untrained Unigram tokenizer and its trainer.

        Returns:
            (tokenizer, trainer) ready to pass to `train_from_iterator`.
        """
        # START init_tokenizer_trainer
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

        tokenizer = Tokenizer(models.Unigram())
        tokenizer.normalizer = normalizers.NFKC()
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        # Fix: the Tokenizer attribute is `decoder` (singular); the original
        # assigned to `decoders`, so the ByteLevel decoder was never installed.
        tokenizer.decoder = decoders.ByteLevel()

        trainer = trainers.UnigramTrainer(
            vocab_size=20000,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
            special_tokens=["<PAD>", "<BOS>", "<EOS>"],
        )
        # END init_tokenizer_trainer
        trainer.show_progress = False

        return tokenizer, trainer
Exemplo n.º 16
0
    def converted(self) -> Tokenizer:
        """Convert the wrapped slow tokenizer into a fast CLIP-style `Tokenizer`."""
        vocab = self.original_tokenizer.encoder
        merges = list(self.original_tokenizer.bpe_ranks.keys())
        unk_token = self.original_tokenizer.unk_token

        # BPE with the "</w>" end-of-word marker and an explicit unk token.
        tokenizer = Tokenizer(
            BPE(
                vocab=vocab,
                merges=merges,
                dropout=None,
                continuing_subword_prefix="",
                end_of_word_suffix="</w>",
                fuse_unk=False,
                unk_token=str(unk_token),
            ))

        # NFC-normalize, collapse whitespace runs to one space, then lowercase.
        tokenizer.normalizer = normalizers.Sequence([
            normalizers.NFC(),
            normalizers.Replace(Regex(r"\s+"), " "),
            normalizers.Lowercase()
        ])
        # Split on the word pattern (contractions, letter runs, single digits,
        # punctuation runs), keeping matches (invert=True + "removed"), then
        # apply byte-level pre-tokenization without a leading space.
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.Split(
                Regex(
                    r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""
                ),
                behavior="removed",
                invert=True,
            ),
            pre_tokenizers.ByteLevel(add_prefix_space=False),
        ])
        tokenizer.decoder = decoders.ByteLevel()

        # Hack to have a ByteLevel and TemplaceProcessor
        # (RobertaProcessing wraps sequences in bos/eos from the slow tokenizer.)
        tokenizer.post_processor = processors.RobertaProcessing(
            sep=(self.original_tokenizer.eos_token,
                 self.original_tokenizer.eos_token_id),
            cls=(self.original_tokenizer.bos_token,
                 self.original_tokenizer.bos_token_id),
            add_prefix_space=False,
            trim_offsets=False,
        )
        return tokenizer
Exemplo n.º 17
0
    def converted(self) -> Tokenizer:
        """Convert the slow tokenizer into a fast GPT-2-style byte-level BPE Tokenizer."""
        slow = self.original_tokenizer

        tokenizer = Tokenizer(
            BPE(
                vocab=slow.encoder,
                merges=list(slow.bpe_ranks.keys()),
                dropout=None,
                continuing_subword_prefix="",
                end_of_word_suffix="",
                fuse_unk=False,
            ))

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=slow.add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        # Keep raw byte-level offsets: no trimming in post-processing.
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

        return tokenizer
Exemplo n.º 18
0
def main(args):
    """Train and/or smoke-test a byte-level BPE tokenizer on SMILES files, per CLI flags."""
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        # Pad/truncate every encoding to a fixed length; <pad> gets an id just
        # past the trained vocabulary (vocab_size + 2).
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        # Sequence markers are added after training so they are never merged.
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer: reload from disk and round-trip one string.
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
Exemplo n.º 19
0
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split("\n")

if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
    tok_p = GPT2Tokenizer.from_pretrained('gpt2')

    # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
Exemplo n.º 20
0
def main():
    """Interactive GPT-2 text generation demo on OpenVINO Runtime.

    Reads prompts (from CLI args or stdin), encodes them with a byte-level
    BPE tokenizer, and samples one token per inference until a stop
    criterion fires, then logs the decoded sequence.
    """
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check number inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    # Static-shape mode: force the input to [1, max_seq_len] when it differs.
    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic
            or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(args.max_seq_len)])
        })

    # Dynamic-shape mode: allow any sequence length up to max_seq_len.
    if args.dynamic_shape:
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(0, args.max_seq_len)])
        })

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = args.max_seq_len

        # NOTE(review): assumes EOS is the last vocab entry — confirm for the model.
        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        # Sampling loop: one new token appended per inference.
        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                pad_len = max_length - cur_input_len
                model_input = np.concatenate(
                    (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(model_input.shape[1], 1 / (t_end - t_start),
                        t_end - t_start))

            # Logits at the position of the last real (unpadded) token.
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id (sampled from the filtered distribution)
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".
            format(t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
Exemplo n.º 21
0
def main():
    """Interactive GPT-2 text generation demo on the legacy Inference Engine API.

    Reads prompts (from CLI args or stdin), encodes them with a byte-level
    BPE tokenizer, and samples one token per inference until a stop
    criterion fires, then logs the decoded sequence.
    """
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR (weights are expected next to the .xml with a .bin suffix)
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    if len(ie_net.input_info) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(ie_net.outputs)))
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]

        # NOTE(review): assumes EOS is the last vocab entry — confirm for the model.
        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        # Sampling loop: one new token appended per inference.
        while True:
            # pad the rest of the request
            pad_len = max_length - cur_input_len
            model_input = np.concatenate(
                (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }

            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(max_length, 1 / (t_end - t_start), t_end - t_start))

            outputs = res[output_names]
            # Logits at the position of the last real (unpadded) token.
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id (sampled from the filtered distribution)
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)"
            .format(t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
Exemplo n.º 22
0
                  for s in g:
                      f.write(s)
                      f.write("\n\n")
          elif args.file_type == 'txt':
              shutil.copyfile(str(arch), str(fp))

  data_files = glob(str(out_path / "*.txt"))
  data_files = random.sample(data_files, int(0.2 * len(data_files)))

  assert len(data_files) > 0, 'No data files found'

  # Initialize a tokenizer
  tokenizer = Tokenizer(models.BPE())

  # Customize pre-tokenization and decoding
  tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
  tokenizer.decoder = decoders.ByteLevel()
  tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
  tokenizer.normalizer = NFKC()

  # And then train
  trainer = trainers.BpeTrainer(vocab_size=args.vocab_size, min_frequency=2, special_tokens=["<|endoftext|>", "<|padding|>"])
  tokenizer.train(trainer, data_files)

  # And Save it
  tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
  tokenizer.save(str(tokenizer_path), pretty=True)

  print(f'tokenizer saved at {str(tokenizer_path)}')
  return tokenizer_path
Exemplo n.º 23
0
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer



# Quick round-trip check of the trained WordPiece tokenizer on one sentence.
ids = bert_tokenizer.encode(sentences[10]).ids
bert_tokenizer.decode(ids)


from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

# Untrained Unigram tokenizer with NFKC normalization and byte-level
# pre-tokenization/decoding.
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
# Fix: the Tokenizer attribute is `decoder` (singular); the original assigned
# to `decoders`, so the ByteLevel decoder was never installed.
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.UnigramTrainer(
    vocab_size=20000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
)

tokenizer.train_from_iterator(sentences, trainer=trainer)
tokenizer.encode(sentences[4]).ids
tokenizer.decode(tokenizer.encode(sentences[4]).ids)
tokenizer.save('bert_out/test2')

# NOTE(review): `tokenizers.Tokenizer` exposes `save`, not `save_pretrained`
# (a transformers API) — this call likely raises AttributeError; confirm intent.
tokenizer.save_pretrained('bert_out/test')