def inference():

    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing
    '''
    initialize tokenizer with saved model files
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )
    '''
    optional step: preprocess the strings
    Ex: add <s> and </s> as BOS and EOS tokens to the string,
        pad each string to a fixed max length, and truncate anything longer
    '''
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)
    '''
    tokenize/encode a single string
    '''
    encoding = tokenizer.encode("Hello World, Whats up!!!")
    input_ids = encoding.ids
    print("input ids", input_ids)
    tokens = encoding.tokens
    print("tokens", tokens)
    '''
    tokenize/encode a batch of strings
    '''
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
Example #2
    # NOTE: this method assumes the surrounding module provides `import os`,
    # `import torch`, `from tokenizers import ByteLevelBPETokenizer`, and a
    # `Dictionary` class exposing word2idx, idx2word, total, and
    # avg_characters_per_token.
    def __init__(self, path, vocab_size=-1, use_bpe=False, tokenizer_data=""):
        self.dictionary = Dictionary()

        if use_bpe:
            assert os.path.exists(path), "Path does not exist: " + path

            print(
                "-------------------------------------------------------------"
            )

            # train a fresh byte-level BPE tokenizer, either on an external
            # corpus (tokenizer_data) or on the dataset's own train split
            tokenizer = ByteLevelBPETokenizer()
            if len(tokenizer_data) != 0:
                print("Training tokenizer on: " +
                      os.path.join(tokenizer_data, 'train.txt'))
                tokenizer.train([os.path.join(tokenizer_data, 'train.txt')],
                                vocab_size=vocab_size,
                                show_progress=False)
            else:
                print("Training tokenizer on: " +
                      os.path.join(path, 'train.txt'))
                tokenizer.train(
                    [
                        os.path.join(path, 'train.txt')
                        # os.path.join(path, 'valid.txt'),
                        # os.path.join(path, 'test.txt')
                    ],
                    vocab_size=vocab_size,
                    show_progress=False)
            print(
                "-------------------------------------------------------------"
            )

            print("Encoding dataset at: " + path)
            with open(os.path.join(path, 'train.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                # convert the token ids to a LongTensor in a single step
                ids = torch.tensor(enc.ids, dtype=torch.long)
                self.train = ids
                # average number of characters per BPE token
                self.dictionary.avg_characters_per_token['train'] = len(
                    text) / len(enc.ids)

            with open(os.path.join(path, 'valid.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                # convert the token ids to a LongTensor in a single step
                ids = torch.tensor(enc.ids, dtype=torch.long)
                self.valid = ids
                # average number of characters per BPE token
                self.dictionary.avg_characters_per_token['valid'] = len(
                    text) / len(enc.ids)

            with open(os.path.join(path, 'test.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                # convert the token ids to a LongTensor in a single step
                ids = torch.tensor(enc.ids, dtype=torch.long)
                self.test = ids
                # average number of characters per BPE token
                self.dictionary.avg_characters_per_token['test'] = len(
                    text) / len(enc.ids)
            print(
                "-------------------------------------------------------------"
            )

            # mirror the trained BPE vocabulary into the Dictionary lookups
            self.dictionary.word2idx = tokenizer.get_vocab()
            self.dictionary.idx2word = [
                tokenizer.id_to_token(x)
                for x in range(tokenizer.get_vocab_size())
            ]
            self.dictionary.total = tokenizer.get_vocab_size()

        else:
            # fall back to the tokenize() method defined elsewhere in this class
            self.train = self.tokenize(os.path.join(path, 'train.txt'))
            self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
            self.test = self.tokenize(os.path.join(path, 'test.txt'))
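
A hedged usage sketch of the __init__ above; the wrapping class name (Corpus) and the example data directory are assumptions, only the train.txt/valid.txt/test.txt layout comes from the code itself:

corpus = Corpus('./data/my_dataset',  # placeholder directory holding train/valid/test .txt files
                vocab_size=10000,     # BPE vocab size forwarded to tokenizer.train()
                use_bpe=True)
print("train tokens:", corpus.train.size(0))
print("vocab size:", corpus.dictionary.total)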