def __init__(
    self,
    config,
    class_labels,
    pretrained_model_path,
    dropout=0.1,
    freeze_pretrained_part=True,
    reinitialize=False,
    n_layers=6,
):
    super().__init__(config, class_labels)
    if reinitialize:
        logger.info('resetting model weights')
        config = GPT2Config.from_json_file(pretrained_model_path + '/config.json')
        config = config.to_dict()
        config['n_layer'] = n_layers
        config = GPT2Config.from_dict(config)
        self.gpt2 = GPT2Model(config)
    else:
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path)
    self.dropout = torch.nn.Dropout(dropout)
    self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim)
    if freeze_pretrained_part:
        for param in self.gpt2.parameters():
            param.requires_grad = False
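A minimal forward-pass sketch to show how this classification head might be used; the method name and the last-token pooling are assumptions for illustration, not taken from the source class:

def forward(self, input_ids, attention_mask=None):
    # older transformers versions return a tuple; element 0 is the hidden states
    hidden_states = self.gpt2(input_ids, attention_mask=attention_mask)[0]  # (batch, seq_len, n_embd)
    pooled = hidden_states[:, -1, :]      # pool the last position (one common choice)
    return self.fc(self.dropout(pooled))  # (batch, output_dim)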
def __init__(self, max_output_length=25, max_input_length=300, device='cpu',
             tokenizer_type='gpt2', bpe_model="", starter_model=None):
    if tokenizer_type == "gpt2":
        self.tokenizer = utils_tokenizer.GPT2Tokenizer()
        config = GPT2Config.from_pretrained("gpt2")
    elif tokenizer_type == "bpecap":
        self.tokenizer = utils_tokenizer.BPETokenizer(bpe_model)
        config = GPT2Config.from_dict({
            "finetuning_task": None,
            "initializer_range": 0.02,
            "layer_norm_epsilon": 1e-05,
            "n_ctx": 1024,
            "n_embd": 768,
            "n_head": 12,
            "n_layer": 12,
            "n_positions": 1024,
            "num_labels": 1,
            "resid_pdrop": 0.1,
            "use_bfloat16": False,
            "vocab_size": self.tokenizer.vocab_size
        })
    else:
        print("Tokenizer unrecognized. Should be gpt2 or bpecap.")
        exit()

    self.model = GPT2LMHeadModel(config)
    self.model.to(device)
    self.device = device
    if starter_model is not None:
        self.reload(starter_model)

    self.max_output_length = max_output_length
    self.max_input_length = max_input_length

    self.model.train()
    self.mode = "train"
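As a quick sketch of what the `bpecap` branch above builds: `GPT2Config.from_dict` turns a plain dict into a config whose `vocab_size` sizes the embedding table. The toy value below is a stand-in for `self.tokenizer.vocab_size`:

from transformers import GPT2Config, GPT2LMHeadModel

toy_vocab_size = 32000  # stand-in for self.tokenizer.vocab_size
config = GPT2Config.from_dict({"n_ctx": 1024, "n_embd": 768, "n_head": 12,
                               "n_layer": 12, "n_positions": 1024,
                               "vocab_size": toy_vocab_size})
model = GPT2LMHeadModel(config)
assert model.transformer.wte.num_embeddings == toy_vocab_size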
def korean_gpt_long_setence_life_test():
    config = get_config()
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    sent = '나는 밥을 먹었'  # "I ate a meal" (deliberately truncated prompt)
    toked = tok(sent)
    print(toked)
    input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
    input_ids = input_ids.to(device)
    outputs = kogpt2model.generate(input_ids=input_ids,
                                   max_length=100,
                                   min_length=50,
                                   repetition_penalty=1.2,
                                   do_sample=True,
                                   num_beams=3,
                                   bos_token_id=0,
                                   pad_token_id=3,
                                   eos_token_id=1,
                                   num_return_sequences=3)
    print("======== essay ===========")
    for i in range(3):  # 3 output sequences were generated
        toked = vocab.to_tokens(outputs[i].squeeze().tolist())
        ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                     ''.join(toked).replace('▁', ' ').strip())
        print('Generated {}: {}'.format(i, ret))
def get_kogpt2_model(model_file, vocab_file, ctx="cpu"):
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(model_file))
    device = torch.device(ctx)
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                         mask_token=None,
                                                         sep_token=None,
                                                         cls_token=None,
                                                         unknown_token='<unk>',
                                                         padding_token='<pad>',
                                                         bos_token='<s>',
                                                         eos_token='</s>')
    return kogpt2model, vocab_b_obj
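A hedged usage sketch for `get_kogpt2_model`; the file paths are placeholders, and the tokenization/decoding mirrors the test functions elsewhere in this section:

model, vocab = get_kogpt2_model('pytorch_kogpt2.params', 'kogpt2_vocab.spiece')  # placeholder paths
tok = SentencepieceTokenizer('kogpt2_vocab.spiece')  # placeholder path
toked = tok('나는 밥을 먹었')  # "I ate a meal" (deliberately truncated prompt)
input_ids = torch.tensor([[vocab[vocab.bos_token]] + vocab[toked]])
outputs = model.generate(input_ids=input_ids, max_length=50, do_sample=True,
                         bos_token_id=0, pad_token_id=3, eos_token_id=1)
print(''.join(vocab.to_tokens(outputs[0].tolist())).replace('▁', ' ').strip())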
def load_kogpt2_model_from_checkpoint(kogpt2, load_path, device, ctx='cpu'):
    try:
        checkpoint = torch.load(load_path, map_location=device)
        kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
        kogpt2model.load_state_dict(checkpoint['model_state_dict'])
        kogpt2model.eval()
    except Exception:
        # fall back to the base model when the checkpoint is missing or malformed
        count = 0
        kogpt2model, _ = load_kogpt2_model()
    else:
        # recover the count from the second number embedded in the checkpoint path
        count = int(re.findall(r"\d+", load_path)[1])
    print(count)
    return kogpt2model, count
def fine_tuning(config, fine_tune_num, AI_DIRECTORY):
    """ Train the model """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = config['kogpt_batch_size']
    train_path = AI_DIRECTORY + config['kogpt_story_train_data_path']
    num_train_epochs = config['kogpt_epoch']
    kogpt2_config = get_kog_config()
    kogpt2_model_path = AI_DIRECTORY + config['kogpt_model_path']
    kogpt2_vocab_path = AI_DIRECTORY + config['kogpt_vocab_path']

    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))
    kogpt2model.to(device)
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    loader = make_kogpt2_loader(train_path, batch_size)

    num_training_steps = len(loader) * num_train_epochs
    learning_rate = 5e-6
    adam_epsilon = 1e-8
    warmup_steps = 0
    no_decay = ["bias", "LayerNorm.weight"]
    # freeze_model(fine_tune_num, kogpt2model)
    # Note: both groups currently use weight_decay 0.0; the no_decay split only
    # takes effect if a non-zero decay is set for the first group.
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in kogpt2model.named_parameters()
                       if not any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in kogpt2model.named_parameters()
                       if any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=num_training_steps)

    global_step = 0
    epochs_trained = 0
    tr_loss = 0.0
    logging_loss = 0.0
    kogpt2model.zero_grad()
    train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch")
    logging_steps = 500
    loss_record = []
    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        for step, inputs in enumerate(epoch_iterator):
            kogpt2model.train()
            input_ids = inputs.to(device)
            labels = inputs.to(device)
            outputs = kogpt2model(input_ids=input_ids, labels=labels)
            loss = outputs[0]  # model outputs are always tuples in transformers (see docs)
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            scheduler.step()
            kogpt2model.zero_grad()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                logs = {}
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                learning_rate_scalar = scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))
    return kogpt2model, loss_record
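A hedged sketch of driving `fine_tuning` and persisting the result. The directory, the checkpoint name, and the `fine_tune_num` value (unused while `freeze_model` stays commented out) are placeholders:

config = get_config()  # assumed to return the same config dict used above
model, loss_record = fine_tuning(config, fine_tune_num=0, AI_DIRECTORY='/path/to/AI')
torch.save(model.state_dict(), '/path/to/AI/checkpoints/kogpt_finetuned.pth')
print(loss_record)  # one averaged loss value per 500 global steps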
def get_kogpt2_config():
    return GPT2Config.from_dict(kogpt2_config)
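For reference, this helper is consumed the same way the inline `from_dict` calls are used elsewhere in this section:

model = GPT2LMHeadModel(config=get_kogpt2_config())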
def predict(images, root_path, AI_directory_path, model_type="life"):
    config = get_config()

    # 0. Extract captions from images
    vocab = load_voca(AI_directory_path + config['caption_vocab_path'])
    caption_embed_size = config['caption_embed_size']
    caption_hidden_layer = config['caption_hidden_layer']
    caption_hidden_size = config['caption_hidden_size']
    caption_encoder_path = AI_directory_path + config['caption_encoder_path']
    caption_decoder_path = AI_directory_path + config['caption_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 30  # default value
    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833),
                                  std=(0.2738, 0.2664, 0.2766))])
    encoder = EncoderCNN(caption_embed_size)
    decoder = DecoderRNN(caption_embed_size, len(vocab), caption_hidden_layer, caption_hidden_size)
    encoder.load_state_dict(torch.load(caption_encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(caption_decoder_path, map_location=device))

    images = load_image(images, root_path, transform)
    encoder.eval()
    decoder.eval()
    encoder.to(device)
    decoder.to(device)
    images = images.to(device)
    features = encoder(images)
    states = None
    predicted_index = []
    lstm_inputs = features.unsqueeze(1)
    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # squeeze outputs to 2-D so they can be fed to the linear score layer
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)
    predicted_index = torch.stack(predicted_index, dim=1)
    predicted_index = predicted_index.cpu().numpy()
    result_captions = []
    for wordindices in predicted_index:
        text = ""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)
    print("result_caption : ", result_captions)

    # 1. Translate captions to Korean
    korean_sentences = []
    for sent in result_captions:
        translate_result = get_translate(sent)
        if translate_result != -1:
            translate_result = re.sub(r'\.', '', translate_result)
            korean_sentences.append(translate_result)
    print("result_korean : ", korean_sentences)

    # 2. Generate text from the translated captions with KoGPT2
    kogpt2_config = get_kog_config()
    if model_type == "life":
        kogpt2_model_path = AI_directory_path + config['kogpt_life_model_path']
    elif model_type == "story":
        kogpt2_model_path = AI_directory_path + config['kogpt_story_model_path']
    else:
        kogpt2_model_path = AI_directory_path + config['kogpt_model_path']
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path, map_location=device))
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    korean_preprocess(korean_sentences)
    gpt_result = naive_prediction(korean_sentences, tok, vocab, device, kogpt2model, model_type)
    korean_postprocess(gpt_result)
    result = []
    make_sentence(gpt_result, "", result, 0)
    result.sort(key=lambda item: (-len(item), item))
    result_len = len(result)
    if result_len > 11:
        result_len = 11
    result = result[1:result_len]
    return result
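A hedged invocation sketch for `predict`; the image names and directories are placeholders:

sentences = predict(['wedding.jpg', 'ladder.jpg'],    # placeholder image names
                    root_path='/path/to/images',      # placeholder
                    AI_directory_path='/path/to/AI',  # placeholder
                    model_type='story')
for sentence in sentences:
    print(sentence)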
save_path = '/kogpt2_article/KoGPT2_checkpoint/'

kogpt2_config = {
    "initializer_range": 0.02,
    "layer_norm_epsilon": 0.000025,
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "vocab_size": 50000
}

checkpoint = torch.load(save_path + 'KoGPT2_checkpoint.tar', map_location=PU)
kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
kogpt2model.load_state_dict(checkpoint['model_state_dict'])
kogpt2model.eval()
kogpt2model.to(torch.device(PU))
model = kogpt2model
Tokenizer = SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)

def make(start_msg):
    global Tokenizer
    sentence = start_msg
def kogpt_life_recursive_test():
    config = get_config()
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    # Caption sentences used to seed the recursive generation; in English:
    # "The bride and groom are posing for a photo in front of the wedding party.",
    # "A man in a blue shirt is standing on a ladder.", "Two men are standing".
    korean_sentences = [
        '신랑 신부가 결혼식 파티 앞에서 사진을 찍기 위해 포즈를 취하고 있다.',
        '파란 셔츠를 입은 남자가 사다리에 서 있다.',
        '두 남자가 서 있다'
    ]
    kogpt_input_sentences = []
    for korean in korean_sentences:
        korean_size = len(korean)
        if not kogpt_input_sentences:
            if korean_size > 3:
                kogpt_input_sentences.append(korean[:-2])
            elif korean_size > 1:
                kogpt_input_sentences.append(korean[:-1])
            else:
                kogpt_input_sentences.append(korean)
        else:
            for i in range(len(kogpt_input_sentences)):
                if korean_size > 3:
                    kogpt_input_sentences[i] += korean[:-2]
                elif korean_size > 1:
                    kogpt_input_sentences[i] += korean[:-1]
                else:
                    kogpt_input_sentences[i] += korean[:]
    kogpt_output_sentences = []
    print(kogpt_input_sentences)
    expected_length = 50
    for kogpt_input_sentence in kogpt_input_sentences:
        print(kogpt_input_sentence)
        toked = tok(kogpt_input_sentence)
        input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
        print(input_ids)
        input_ids = input_ids.to(device)
        input_length = input_ids.shape[1]
        outputs = kogpt2model.generate(input_ids=input_ids,
                                       max_length=input_length + expected_length,
                                       repetition_penalty=1.2,
                                       do_sample=True,
                                       num_beams=3,
                                       bos_token_id=0,
                                       pad_token_id=3,
                                       eos_token_id=1,
                                       num_return_sequences=3)
        for i in range(3):  # 3 output sequences were generated
            toked = vocab.to_tokens(outputs[i].squeeze().tolist())
            ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                         ''.join(toked).replace('▁', ' ').strip())
            kogpt_output_sentences.append(ret)
    kogpt_input_sentences = copy.deepcopy(kogpt_output_sentences)
    print(kogpt_input_sentences)
"모두의 연애": "<unused3>", "숭실대 에타": "<unused5>", "대학생 잡담방": "<unused4>" } os.system('ls') app = Flask(__name__) can_gpu = torch.cuda.is_available() # Model & Tokenizer loading tokenizer = sentencepiece.SentencePieceProcessor() tokenizer.load(tok_path) if can_gpu: device = torch.device('cuda') model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=None, config=GPT2Config.from_dict(kogpt2_config), state_dict=torch.load(model_file)) else: device = torch.device('cpu') model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=None, config=GPT2Config.from_dict(kogpt2_config), state_dict=torch.load(model_file, map_location=device)) model.to(device) requests_queue = Queue() # request queue. BATCH_SIZE = 1 # max request size. CHECK_INTERVAL = 0.1 ##
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("training_args: ", training_args)

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        if model_args.config_name == 'kogpt2':
            config = GPT2Config.from_dict(kogpt2_config)
        else:
            config = AutoConfig.from_pretrained(model_args.config_name,
                                                cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        if model_args.model_name_or_path == 'kogpt2':
            config = GPT2Config.from_dict(kogpt2_config)
        else:
            config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                                cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name and not model_args.use_gluonnlp_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path and not model_args.use_gluonnlp_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path,
                                                  cache_dir=model_args.cache_dir)
    elif model_args.use_gluonnlp_tokenizer:
        vocab_file = model_args.vocab_path
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                       mask_token=None,
                                                       sep_token=None,
                                                       cls_token=None,
                                                       unknown_token='<unk>',
                                                       padding_token='<pad>',
                                                       bos_token='<s>',
                                                       eos_token='</s>')
        tokenizer = nlp.data.BERTSPTokenizer(tokenizer_path, vocab, lower=False)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        if model_args.model_name_or_path == 'kogpt2':
            model = GPT2LMHeadModel.from_pretrained(
                pretrained_model_name_or_path=None,
                config=config,
                state_dict=torch.load(kogpt2_model_path))
        else:
            model = AutoModelWithLMHead.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    if not model_args.use_gluonnlp_tokenizer:
        model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if model_args.use_gluonnlp_tokenizer:
        max_len = model_args.max_len
    else:
        max_len = tokenizer.max_len

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = max_len
    else:
        data_args.block_size = min(data_args.block_size, max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer,
                                max_len=max_len) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, max_len=max_len,
                               evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        print("model_path: ", model_path)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master() and not model_args.use_gluonnlp_tokenizer:
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results