Example #1
import torch
from tokenizers import ByteLevelBPETokenizer


def inference(checkpoint_path,
              hyperparameters_path,
              tokenizer_path,
              merges_path,
              prompt='In 1691 Moscow established ',
              generated_length=64,
              random_selection=True):

    # Initialize tokenizer from files
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path,
        merges_path,
        add_prefix_space=True,
    )

    # Initialize model (LMModel is the project's PyTorch Lightning module,
    # imported elsewhere in the source file)
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize the input prompt
    encoded_sample = tokenizer.encode(prompt).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(torch.tensor(encoded_sample,
                                                 dtype=torch.long),
                                    dim=0)

        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]  # logits for the last position

        if not random_selection:
            # Pick the highest-probability token (greedy decoding)
            prediction = torch.argmax(output,
                                      dim=2).squeeze(dim=0).tolist()[-1]
        else:
            # Sample a token in proportion to its probability; raising the
            # softmax output to the 10th power sharpens the distribution
            # (multinomial renormalizes internally, so this acts like a low
            # sampling temperature)
            prediction = torch.multinomial(torch.softmax(last_word, dim=0)**10,
                                           1).item()
        # Add prediction to the sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)

    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    return decoded_output, output_tokens, attn
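
A minimal sketch of calling the function above; every path here is hypothetical and depends on how the checkpoint and tokenizer files were actually saved:

text, tokens, attn = inference(
    checkpoint_path='checkpoints/last.ckpt',          # hypothetical path
    hyperparameters_path='checkpoints/hparams.yaml',  # hypothetical path
    tokenizer_path='example/vocab.json',              # hypothetical path
    merges_path='example/merges.txt',                 # hypothetical path
    prompt='In 1691 Moscow established ',
    generated_length=32,
    random_selection=True,
)
print(text)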
Example #2
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        self.special_tok_map = {}  # map from ParlAI dict ids to HF token ids
        # add_prefix_space defaults to True for HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter the vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )

        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def helper_encode(self, text: str) -> List[str]:
        """
        Encode text into a list of tokens.

        :param text:
            text to encode

        :return tokens:
            list of tokens
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(
        self, tokens: List[str], token_ids: List[int], delimiter: str
    ) -> str:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)

        return text

    def add_special_tokens(self, dict_agent, special_tokens: List[str]):
        """
        Add special tokens to the tokenizer and dict_agent.
        """
        logging.debug(f'adding the following special tokens: {special_tokens}')
        self.tokenizer.add_special_tokens(special_tokens)  # add to HF

        for tok in special_tokens:
            parlai_key = dict_agent[tok]
            hf_key = self.tokenizer.token_to_id(tok)
            self.special_tok_map[parlai_key] = hf_key

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.add_special_tokens(dict_agent, special_tokens)

        for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        self.tokenizer.save_model(dir_name, file_name)
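
The helper above is a thin wrapper around the tokenizers library; a standalone sketch of the same encode/decode round trip, assuming a pretrained vocab/merges pair exists at the hypothetical paths below:

from tokenizers import ByteLevelBPETokenizer

tok = ByteLevelBPETokenizer('mydict-vocab.json', 'mydict-merges.txt',
                            add_prefix_space=True)  # hypothetical files
enc = tok.encode('Hello world')
print(enc.tokens)           # what helper_encode returns
print(tok.decode(enc.ids))  # what helper_decode returns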
Example #3
File: bpe.py  Project: nii4u/ParlAI
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """
    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if os.path.isfile(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if os.path.isfile(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.lower:
            raise ValueError(
                'Only use --dict-lower false with --dict-tokenizer bytelevelbpe'
            )
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter the vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).')
        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)

    def helper_encode(self, text: str) -> List[str]:
        """
        Encode text into a list of tokens.

        :param text:
            text to encode

        :return tokens:
            list of tokens
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(self, tokens: List[str], token_ids: List[int],
                      delimiter: str) -> str:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        text = self.tokenizer.decode(token_ids)
        return text

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.tokenizer.add_special_tokens(special_tokens)
        # Skip the special tokens appended at the end of the vocab
        for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        self.tokenizer.save(dir_name, file_name)
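
The -vocab.json and -merges.txt pair that --bpe-vocab and --bpe-merge point to can be produced with the tokenizers library itself; a sketch with hypothetical paths (save_model is the current API, as in Example #2; older versions used save(directory, name) as in the snippet above):

from tokenizers import ByteLevelBPETokenizer

tok = ByteLevelBPETokenizer()
tok.train(['data/train.txt'], vocab_size=8000)  # hypothetical corpus
tok.save_model('out', 'mydict')  # writes out/mydict-vocab.json and
                                 # out/mydict-merges.txt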
Example #4
    def __init__(self, path, vocab_size=-1, use_bpe=False, tokenizer_data=""):
        self.dictionary = Dictionary()

        if use_bpe:
            assert os.path.exists(path), "Path does not exist: " + path

            print(
                "-------------------------------------------------------------"
            )

            tokenizer = ByteLevelBPETokenizer()
            if len(tokenizer_data) != 0:
                print("Training tokenizer on: " +
                      os.path.join(tokenizer_data, 'train.txt'))
                tokenizer.train([os.path.join(tokenizer_data, 'train.txt')],
                                vocab_size=vocab_size,
                                show_progress=False)
            else:
                print("Training tokenizer on: " +
                      os.path.join(path, 'train.txt'))
                tokenizer.train(
                    [
                        os.path.join(path, 'train.txt')
                        # os.path.join(path, 'valid.txt'),
                        # os.path.join(path, 'test.txt')
                    ],
                    vocab_size=vocab_size,
                    show_progress=False)
            print(
                "-------------------------------------------------------------"
            )

            print("Encoding dataset at: " + path)
            with open(os.path.join(path, 'train.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                # Build the id tensor directly instead of filling it element
                # by element
                self.train = torch.tensor(enc.ids, dtype=torch.long)
                self.dictionary.avg_characters_per_token['train'] = (
                    len(text) / len(enc.ids))

            with open(os.path.join(path, 'valid.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                self.valid = torch.tensor(enc.ids, dtype=torch.long)
                self.dictionary.avg_characters_per_token['valid'] = (
                    len(text) / len(enc.ids))

            with open(os.path.join(path, 'test.txt'), 'r',
                      encoding='utf-8') as f:
                text = f.read()
                enc = tokenizer.encode(text)
                self.test = torch.tensor(enc.ids, dtype=torch.long)
                self.dictionary.avg_characters_per_token['test'] = (
                    len(text) / len(enc.ids))
            print(
                "-------------------------------------------------------------"
            )

            self.dictionary.word2idx = tokenizer.get_vocab()
            self.dictionary.idx2word = [
                tokenizer.id_to_token(x)
                for x in range(tokenizer.get_vocab_size())
            ]
            self.dictionary.total = tokenizer.get_vocab_size()

        else:
            self.train = self.tokenize(os.path.join(path, 'train.txt'))
            self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
            self.test = self.tokenize(os.path.join(path, 'test.txt'))
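
A hypothetical instantiation, assuming this __init__ belongs to a class named Corpus and that path contains train.txt, valid.txt, and test.txt:

corpus = Corpus('data/wikitext-2', vocab_size=10000, use_bpe=True)
print(corpus.train.shape)       # 1-D LongTensor of BPE token ids
print(corpus.dictionary.total)  # BPE vocabulary size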
Example #5
class HuggingfaceTokenizerBPE(nn.Module):
    def __init__(self, text_files, dataset_info_path='', config_data=None):
        super().__init__()
        # The default vocab size in the BERT model is 30522. To use a larger
        # vocabulary, the BERT configuration must be changed as well.
        vocab_size = 30000
        self.info = f'hug{vocab_size}'

        with open(f'config/data/{config_data}.json') as json_file:
            tokenizer_from = json.load(json_file)['tokenizer_from']

        config_name = config_data if tokenizer_from == "" else tokenizer_from
        print(
            os.path.join(dataset_info_path,
                         f'tokenizer_{config_name}_{vocab_size}-vocab.json'))

        # The loading is only properly implemented starting from version 0.8. However, it makes the system use a lot of
        #  CPU for no reason (it is much slower). Maybe it will be fixed in the future.
        if not os.path.isfile(
                os.path.join(
                    dataset_info_path,
                    f'tokenizer_{config_name}_{vocab_size}-vocab.json')):
            text_files = text_files()
            self.tokenizer = ByteLevelBPETokenizer()
            # Join everything into a single file. This should NOT be
            # necessary, but training does not work properly with a large
            # number of files.
            with open('/tmp/text_files.txt', 'wb') as outfile:
                for filename in tqdm(
                        text_files,
                        desc='Joining all files into one for tokenization'):
                    with open(filename, 'rb') as readfile:
                        shutil.copyfileobj(readfile, outfile)
                text_files = '/tmp/text_files.txt'
            # `special_tokens` is a module-level list defined elsewhere in
            # the source file
            self.tokenizer.train(text_files,
                                 vocab_size=vocab_size,
                                 special_tokens=special_tokens)
            # Older tokenizers API: save(directory, name); versions >= 0.8
            # use save_model(directory, prefix) instead
            self.tokenizer.save(dataset_info_path,
                                f'tokenizer_{config_name}_{vocab_size}')

        # No "else", always load for consistency
        vocab_file = os.path.join(
            dataset_info_path,
            f'tokenizer_{config_name}_{vocab_size}-vocab.json')
        merges_file = os.path.join(
            dataset_info_path,
            f'tokenizer_{config_name}_{vocab_size}-merges.txt')
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                               merges_file=merges_file)
        self.tokenizer.add_special_tokens(special_tokens)

        self.index_special_tokens = {
            tok: self.tokenizer.encode(tok).ids[0]
            for tok in special_tokens
        }

    @property
    def device(self):
        # Assumes a `_float_tensor` buffer is registered on this module
        # elsewhere in the source
        return self._float_tensor.device

    def encode(self, sentence: str):
        output = self.tokenizer.encode(sentence)
        token_ids = output.ids
        tokens = output.tokens
        return torch.tensor(token_ids), tokens

    def decode(self, tokens: torch.LongTensor):
        assert tokens.dim() == 1
        tokens = list(tokens.cpu().numpy())
        sentences = self.tokenizer.decode(tokens)
        return sentences

    def id_to_token(self, token_id):
        # The tokenizers library expects a plain int, so unwrap tensors
        if isinstance(token_id, torch.Tensor):
            token_id = int(token_id)
        return self.tokenizer.id_to_token(token_id)

    def token_to_id(self, token):
        assert isinstance(token, str)
        return self.tokenizer.token_to_id(token)

    def __len__(self):
        return self.tokenizer.get_vocab_size()

    # This is simply so PyCharm can resolve references to the methods of the
    # class
    def __call__(self, *args, **kwargs) -> typing.Any:
        return super().__call__(*args, **kwargs)
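
A hypothetical round trip through the module above, assuming the config/data JSON and the pretrained tokenizer files it expects already exist under the names below:

tok = HuggingfaceTokenizerBPE(text_files=lambda: ['data/a.txt'],  # hypothetical
                              dataset_info_path='dataset_info/',  # hypothetical
                              config_data='my_dataset')           # hypothetical
ids, tokens = tok.encode('Hello world')
print(tokens)
print(tok.decode(ids))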