Example #1
import torch
from transformers import (BartModel, BartTokenizer, BertModel, BertTokenizer,
                          DistilBertModel, DistilBertTokenizer, ElectraModel,
                          ElectraTokenizer, MobileBertModel, MobileBertTokenizer,
                          XLMModel, XLMRobertaModel, XLMRobertaTokenizer, XLMTokenizer)


def get_embedding(type_embedding, data):
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split(
            '_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            data = [x.lower() for x in data]
            path = 'bert-base-multilingual-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    #########
    #ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        #load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        #load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        #load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        #load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True)

    # Set the device to GPU (cuda) if available, otherwise stick with CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Move the model to the device and switch to inference mode once, before the loop
    model = model.to(device)
    model.eval()

    list_of_four_last_embeddings = []
    list_of_mean = []

    for l in data:
        # Convert the sentence string to tokenized vocabulary IDs
        input_ids = tokenizer.encode(l)
        # Convert the list of IDs to a tensor of IDs and move it to the device
        input_ids = torch.LongTensor(input_ids).to(device)

        # unsqueeze IDs to get batch size of 1 as added dimension
        input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            out = model(input_ids=input_ids)

        # we only want the hidden_states
        if type_embedding == 'xlm':
            hidden_states = out[1]
        else:
            hidden_states = out[2]
        # mean over tokens of the last hidden layer
        sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        list_of_mean.append(sentence_embedding.tolist())

        # get last four layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cast layers to a tuple and concatenate over the last dimension
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)

        # take the mean of the concatenated vector over the token dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        list_of_four_last_embeddings.append(cat_sentence_embedding.tolist())

    #print('list of four last embeddings', np.array(list_of_four_last_embeddings).shape)
    #print('list of mean', np.array(list_of_mean).shape)

    return list_of_mean, list_of_four_last_embeddings
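A minimal usage sketch for the helper above; the model choice and the sample sentences are illustrative only:

# Illustrative call: any of the type_embedding values handled above would work.
sentences = ['oi, tudo bem?', 'preciso de ajuda com o meu pedido']
mean_embs, last_four_embs = get_embedding('xlmroberta_base', sentences)
# mean_embs[i]       -> mean of the last hidden layer for sentence i
# last_four_embs[i]  -> mean over tokens of the last four layers concatenated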
Example #2
import logging
from time import gmtime, strftime

import numpy
from sentence_transformers import SentenceTransformer
from transformers import XLMRobertaModel, XLMRobertaTokenizer


def generate_embeddings(sentences,
                        method='special_tokens',
                        target_dir_path='/ukp-storage-1/nbeck/evidence/data/',
                        batch_size=500,
                        logging_level=logging.INFO):
    """
    Base method for generating embeddings.
    Handles the batch-wise calculation of them, thus enabling more efficient calculations in case errors bring a given
    batch to halt.
    Resulting embeddings are stored in a file in the given directory.
    :param logging_level: desired level of logging (from logging library)
    :param batch_size: amount of sentences to be processes in one run/batch
    :param sentences: list or array of strings representing the DIRTY sentences, i.e. with markers for the lemmas
    :param method: how to calculate fixed-size word embeddings for the lemma (which might be tokenized as multiple tokens
    can be 'special_tokens' or 'mean'
    :param target_dir_path: path of the directory where embeddings are supposed to be stored
    """
    # set up the logger
    logging.basicConfig(level=logging_level,
                        format='%(asctime)s - %(message)s')

    logging.info('Beginning the calculation of embeddings.')
    # create an empty file
    file_name = 'word_plus_sen_embeddings_' + method + '_' + strftime(
        "%Y-%m-%d %H:%M:%S", gmtime()) + '.npy'
    numpy.save(target_dir_path + file_name, [])

    # load sentence embeddings model (just) once rather than in every batch loop
    sen_embd_model = SentenceTransformer('distiluse-base-multilingual-cased')

    # load lemma embeddings model (just) once rather than every time the respective sub-function is called
    lemma_embd_model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
    lemma_embd_model.eval()

    # load lemma embeddings tokenizer (just) once - you get it
    lemma_embd_tokenizer = XLMRobertaTokenizer.from_pretrained(
        'xlm-roberta-base')
    if method == 'special_tokens':
        lemma_embd_tokenizer = XLMRobertaTokenizer.\
            from_pretrained('xlm-roberta-base', additional_special_tokens=_extract_lemmas_from_sentences(sentences))

    # create a LongSentenceCounter
    lsc = LongSentenceCounter()

    # iterate over batches
    for i in range(int(numpy.ceil(len(sentences) / batch_size))):
        # determine batch length
        batch_len = numpy.min([batch_size, len(sentences) - i * batch_size])
        # determine current batch
        current_batch = sentences[i * batch_size:i * batch_size + batch_len]
        # generate contextualized word embedding for each lemma
        if method == 'special_tokens':
            lemma_embeddings = _generate_lemma_embs_with_special_tokens(
                current_batch, lemma_embd_model, lemma_embd_tokenizer, lsc)
        elif method == 'mean':
            lemma_embeddings = _generate_lemma_embs_with_mean(
                current_batch, lemma_embd_model, lemma_embd_tokenizer, lsc)
        else:
            raise Exception(
                'Specified method of dealing with multi-token lemma not found')

        # generate sentence embeddings for sentences
        sen_embeddings = sen_embd_model.encode(current_batch)

        # concatenate sentence and word embeddings
        current_embds = []
        for a, b in zip(sen_embeddings, lemma_embeddings):
            current_embds.append(numpy.append(a, b))

        # store/append embeddings to file
        arr = numpy.load(target_dir_path + file_name, allow_pickle=True)
        new_embds = numpy.append(arr, current_embds)
        numpy.save(target_dir_path + file_name, new_embds)
        # log that current batch was stored successfully
        logging.info('Batch #' + str(i) +
                     ' of embeddings successfully stored in ' +
                     target_dir_path + file_name + '.')

    logging.info('ALL EMBEDDINGS CALCULATED SUCCESSFULLY!')
    lsc.show()
    lsc.reset()

    # reshape embeddings, so they can be more intuitively used (up to this point they are one flat list)
    arr = numpy.load(target_dir_path + file_name, allow_pickle=True)
    arr = arr.reshape(int(len(arr) / 1280), 1280)
    numpy.save(target_dir_path + file_name, arr)
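A hedged usage sketch; the lemma-marker syntax is an assumption (the docstring only says the sentences carry markers), and helpers such as _generate_lemma_embs_with_special_tokens and LongSentenceCounter must come from the original module:

# Placeholder sentences: the exact lemma-marker syntax is an assumption.
sentences = ['The <lemma>bank</lemma> approved the loan .',
             'She sat on the river <lemma>bank</lemma> .']
generate_embeddings(sentences,
                    method='mean',
                    target_dir_path='./embeddings/',  # placeholder directory
                    batch_size=2)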
Example #3
    def __init__(self):

        super(TranslationModel, self).__init__()
        self.encoder = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        self.linear = nn.Linear(768, 768)
Example #4
from torch.utils.data import Dataset
import numpy as np
import utils
from torch.utils.data.dataloader import default_collate
import codecs
import json
import logging
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import Trainer, TrainingArguments, get_linear_schedule_with_warmup
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from nltk.tokenize import word_tokenize
from torchsummary import summary

bert_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base',
                                                     is_split_into_words=True)
bert_model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained("./model")


def load_json(json_file):
    with codecs.open(json_file, 'r', encoding='utf-8') as f:
        data_list = json.load(f)
    data_dict = {}
    for data in data_list:
        data_dict[data['id']] = data
    return data_dict


def extrace_range(string):
    ranges = []
Example #5
from transformers import (BartModel, BartTokenizer, BertModel, BertTokenizer,
                          DistilBertModel, DistilBertTokenizer, ElectraModel,
                          ElectraTokenizer, MobileBertModel, MobileBertTokenizer,
                          XLMModel, XLMRobertaModel, XLMRobertaTokenizer, XLMTokenizer)


def tokenizer_and_model(type_embedding):
    #########
    #PORTUGUESE
    #########
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split(
            '_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            path = 'bert-base-multilingual-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)
        special_tokens_dict = {
            'additional_special_tokens': ['[USER]', '[SYSTEM]']
        }
        orig_num_tokens = len(tokenizer)
        num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
        total_num_tokens = orig_num_tokens + num_added_tokens
        model.resize_token_embeddings(total_num_tokens)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path,
                                         output_hidden_states=True,
                                         return_dict=True)
    #########
    #ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path,
                                         output_hidden_states=True,
                                         return_dict=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        #load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        #load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        #load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path,
                                             output_hidden_states=True,
                                             return_dict=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        #load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)

    return tokenizer, model
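A short usage sketch for the factory above; torch is imported here only for the no-grad forward pass:

import torch

tokenizer, model = tokenizer_and_model('xlmroberta_base')
inputs = tokenizer('oi, tudo bem?', return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
# Available because the model was loaded with output_hidden_states=True and return_dict=True
hidden_states = outputs.hidden_states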
Example #6
    def __init__(self):
        super(xlmr_Classifier, self).__init__()
        self.xlmr = XLMRobertaModel.from_pretrained("xlm-roberta-large")
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.xlmr.config.hidden_size, 2)
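The snippet only shows the constructor; the forward pass below is a sketch under the assumption that the classifier pools via the model's pooler output, which is not shown in the original:

    def forward(self, input_ids, attention_mask):
        # Assumed forward pass: pooled <s> representation -> dropout -> 2-way classifier
        outputs = self.xlmr(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output  # outputs[1] on older transformers versions
        return self.out(self.drop(pooled))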
Example #7
        args.model_name[:args.model_name.index("-")],
        args.model_name[args.model_name.index("-") + 1:],
    )

if model_type == "roberta":
    embed = RobertaWordPieceEncoder(model_dir_or_name=args.model_name,
                                    requires_grad=True,
                                    num_aspect=1)
elif model_type == "bert":
    embed = BertWordPieceEncoder(model_dir_or_name=args.model_name,
                                 requires_grad=True)
elif model_type == "xlnet":
    embed = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=args.model_name)
elif model_type == "xlmroberta":
    embed = XLMRobertaModel.from_pretrained(
        pretrained_model_name_or_path=args.model_name)


class AspectModel(nn.Module):
    def __init__(self, embed, dropout, num_classes, pool="max"):
        super().__init__()
        assert pool in ("max", "mean")
        self.embed = embed
        self.embed_dropout = nn.Dropout(dropout)
        if hasattr(embed, "embedding_dim"):
            embed_size = embed.embedding_dim
        else:
            embed_size = embed.config.hidden_size
        self.ffn = nn.Sequential(
            nn.Linear(embed_size, embed_size),
            nn.ReLU(),
Example #8
from transformers import GPT2Model, GPT2Tokenizer, XLMRobertaModel, XLMRobertaTokenizer

# The Gulordava LSTM model can be found here:
# https://drive.google.com/open?id=1w47WsZcZzPyBKDn83cMNd0Hb336e-_Sy

# Initializing the LSTM
lstm, lstm_vocab = make_pretrained_lstm_and_tokenizer()

# Initializing the Transformer
trans_model_type: str = config.feature_model_type if config.feature_model_type != 'LSTM' else config.default_trans_model_type

if 'distilgpt2' in trans_model_type:
    trans_model = GPT2Model.from_pretrained(trans_model_type)
    trans_tokenizer = GPT2Tokenizer.from_pretrained(trans_model_type)
elif 'xlm-roberta-base' in trans_model_type:
    trans_model = XLMRobertaModel.from_pretrained(trans_model_type)
    trans_tokenizer = XLMRobertaTokenizer.from_pretrained(trans_model_type)

# %% [markdown]
# # Data
#
# For this assignment you will train your probes on __treebank__ corpora. A treebank is a corpus that has been *parsed*, and stored in a representation that allows the parse tree to be recovered. Next to a parse tree, treebanks also often contain information about part-of-speech tags, which is exactly what we are after now.
#
# The treebank you will use for now is part of the Universal Dependencies project. I provide a sample of this treebank as well, so you can test your setup on that before moving on to larger amounts of data.
#
# Make sure you familiarize yourself with the format produced by the `conllu` library that parses the treebank files before moving on. For example, make sure you understand how to access the POS tag of a token, and how to work with the tree structure created by the `to_tree()` functionality; a short sketch follows at the end of this example.

# %%
from typing import Callable, Dict, List, Optional, Union
from conllu import parse_incr, TokenList
from data_tools.data_inits import parse_corpus
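A hedged sketch of reading POS tags from a Universal Dependencies treebank with the conllu library; the file path is a placeholder, and the 'upos' key is named 'upostag' in older conllu releases:

from conllu import parse_incr

with open('sample-ud-treebank.conllu', encoding='utf-8') as f:  # placeholder path
    for tokenlist in parse_incr(f):
        for token in tokenlist:
            print(token['form'], token['upos'])  # word form and its POS tag
        tree = tokenlist.to_tree()               # recover the dependency tree
        print(tree.token['form'], [child.token['form'] for child in tree.children])
        break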
Example #9
    def __init__(
        self, vocabs: Dict[str, Vocabulary], config: Config, pre_load_model: bool = True
    ):
        super().__init__(config=config)

        if pre_load_model:
            self.xlm_roberta = XLMRobertaModel.from_pretrained(
                self.config.model_name, output_hidden_states=True
            )
        else:
            xlm_roberta_config = XLMRobertaConfig.from_pretrained(
                self.config.model_name, output_hidden_states=True
            )
            self.xlm_roberta = XLMRobertaModel(xlm_roberta_config)

        self.vocabs = {
            const.TARGET: vocabs[const.TARGET],
            const.SOURCE: vocabs[const.SOURCE],
        }

        self.mlp = None

        if self.config.use_mlp:
            self.mlp = nn.Sequential(
                nn.Linear(self.xlm_roberta.config.hidden_size, self.config.hidden_size),
                nn.Tanh(),
            )
            output_size = self.config.hidden_size
        else:
            output_size = self.xlm_roberta.config.hidden_size

        sentence_size = output_size
        if config.pooling == 'mixed':
            sentence_size *= 2

        self.scalar_mix = ScalarMixWithDropout(
            mixture_size=self.xlm_roberta.config.num_hidden_layers
            + 1,  # +1 for embeddings
            do_layer_norm=self.config.scalar_mix_layer_norm,
            dropout=self.config.scalar_mix_dropout,
        )

        self._sizes = {
            const.TARGET: output_size,
            const.TARGET_LOGITS: output_size,
            const.TARGET_SENTENCE: sentence_size,
            const.SOURCE: output_size,
        }

        self.output_embeddings = self.xlm_roberta.embeddings.word_embeddings

        self._training_steps_ran = 0
        self._is_frozen = False
        if self.config.freeze:
            logger.info(
                'Freezing XLMRoberta encoder weights; training will not update them'
            )
            for param in self.xlm_roberta.parameters():
                param.requires_grad = False
            self._is_frozen = True
        if self.config.freeze_for_number_of_steps > 0:
            # Done inside `forward()` to guarantee we can unfreeze (if the optimizer is
            #  built after this point, we cannot unfreeze without calling
            #  `optimizer.add_param_group({'params': self.xlm.parameters()})`)
            pass
Example #10
    def create(cls,
               model_type='camem',
               model_name="camembert-base",
               embedding_size=768,
               hidden_dim=512,
               rnn_layers=1,
               lstm_dropout=0.5,
               device="cuda",
               mode="weighted",
               key_dim=64,
               val_dim=64,
               num_heads=3,
               attn_dropout=0.3,
               self_attention=False,
               is_require_grad=False):
        configuration = {
            'model_type': model_type,
            "model_name": model_name,
            "device": device,
            "mode": mode,
            "self_attention": self_attention,
            "is_freeze": is_require_grad
        }

        if 'camem' in model_type:
            config_bert = CamembertConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = CamembertModel.from_pretrained(model_name,
                                                   config=config_bert)
            model.to(device)
        elif 'flaubert' in model_type:
            config_bert = FlaubertConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = FlaubertModel.from_pretrained(model_name,
                                                  config=config_bert)
            model.to(device)
        elif 'XLMRoberta' in model_type:
            config_bert = XLMRobertaConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = XLMRobertaModel.from_pretrained(model_name,
                                                    config=config_bert)
            model.to(device)
        elif 'M-Bert' in model_type:
            config_bert = BertConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, config=config_bert)
            model.to(device)

        lstm = BiLSTM.create(embedding_size=embedding_size,
                             hidden_dim=hidden_dim,
                             rnn_layers=rnn_layers,
                             dropout=lstm_dropout)

        attn = MultiHeadAttention(key_dim, val_dim, hidden_dim, num_heads,
                                  attn_dropout)
        model.train()
        self = cls(model=model, config=configuration, lstm=lstm, attn=attn)
        # if is_freeze:
        self.freeze()

        return self
Example #11
    def __init__(self,
                 model_dir_or_name: str,
                 vocab: Vocabulary,
                 layers: str = '-1',
                 pool_method: str = 'first',
                 include_cls_sep: bool = False,
                 pooled_cls: bool = False,
                 auto_truncate: bool = True,
                 min_freq=1,
                 only_use_pretrain_bpe=False,
                 truncate_embed=True):
        super().__init__()

        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir_or_name)
        self.encoder = XLMRobertaModel.from_pretrained(model_dir_or_name)

        self.encoder.resize_token_embeddings(len(self.tokenizer))
        self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
        encoder_layer_number = len(self.encoder.encoder.layer)
        if isinstance(layers, list):
            self.layers = [int(l) for l in layers]
        elif isinstance(layers, str):
            self.layers = list(map(int, layers.split(',')))
        else:
            raise TypeError("`layers` only supports str or list[int]")
        for layer in self.layers:
            if layer < 0:
                assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                       f"a bert model with {encoder_layer_number} layers."
            else:
                assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                    f"a bert model with {encoder_layer_number} layers."

        assert pool_method in ('avg', 'max', 'first', 'last')
        self.pool_method = pool_method
        self.include_cls_sep = include_cls_sep
        self.pooled_cls = pooled_cls
        self.auto_truncate = auto_truncate

        #        logger.info("Start to generate word pieces for word.")
        #        word_piece_dict = {'<s>': 1, '</s>': 1}
        #        found_count = 0
        #        new_add_to_bpe_vocab = 0
        #        unsegment_count = 0
        #        if "<s>" in vocab:
        #            warnings.warn("<s> detected in your vocabulary. RobertaEmbedding will add <s> and </s> to the begin "
        #                          "and end of the input automatically, make sure you don't add <s> and </s> at the begin"
        #                          " and end.")

        #        unique = []
        #        for word, index in vocab:
        #
        #            word_pieces = []
        #            word_pieces.extend(self.tokenizer.tokenize(
        #                word))#, add_prefix_space=True))
        #            if len(word_pieces) > 0:
        #                word_token_ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        #                if 3 in word_token_ids:
        #                    if word_pieces[word_token_ids.index(3)] not in unique:
        #                        unique.append(word_pieces[word_token_ids.index(3)])
        #                        unsegment_count += 1
        #                if not vocab._is_word_no_create_entry(word):
        #    #                import pdb;pdb.set_trace()
        #                    if index != vocab.unknown_idx and word_pieces[0] == '<unk>':
        #                        if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(
        #                                word) and not only_use_pretrain_bpe:
        #                            word_piece_dict[word] = 1
        #                            new_add_to_bpe_vocab += 1
        #                        unsegment_count += 1
        #                        continue
        #                found_count += 1
        #                for word_piece in word_pieces:
        #                    word_piece_dict[word_piece] = 1
        #
        #        if unsegment_count > 0:
        #            logger.info(f"{unsegment_count} words are unsegmented.")

        word_to_wordpieces = []
        word_pieces_lengths = []
        for word, index in vocab:
            if index == vocab.padding_idx:
                word = '<pad>'
            elif index == vocab.unknown_idx:
                word = '<unk>'
            word_pieces = self.tokenizer.tokenize(word)
            word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
            word_to_wordpieces.append(word_pieces)
            word_pieces_lengths.append(len(word_pieces))
        self._cls_index = self.tokenizer.convert_tokens_to_ids('<s>')
        self._sep_index = self.tokenizer.convert_tokens_to_ids('</s>')
        self._word_pad_index = vocab.padding_idx
        self._wordpiece_pad_index = self.tokenizer.convert_tokens_to_ids(
            '<pad>')
        self.word_to_wordpieces = np.array(word_to_wordpieces)
        self.register_buffer('word_pieces_lengths',
                             torch.LongTensor(word_pieces_lengths))
        self.encoder.resize_token_embeddings(len(self.tokenizer))
        logger.debug("Successfully generate word pieces.")