def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    """Instantiate a BertForMaskedLM from ``config``, run one forward pass in
    eval mode, and verify the prediction-score tensor shape via the parent
    test case.  Unused label arguments are part of the shared tester signature.
    """
    mlm_model = BertForMaskedLM(config=config)
    mlm_model.eval()
    mlm_loss, mlm_scores = mlm_model(input_ids, token_type_ids, input_mask, token_labels)
    result = {"loss": mlm_loss, "prediction_scores": mlm_scores}
    expected_shape = [self.batch_size, self.seq_length, self.vocab_size]
    self.parent.assertListEqual(list(result["prediction_scores"].size()), expected_shape)
    self.check_loss_output(result)
# ----- Example #2 -----
def main():
    """Continue masked-LM pre-training of a BERT model on pregenerated data.

    Reads ``epoch_{i}.json`` / ``epoch_{i}_metrics.json`` files produced by
    ``pregenerate_training_data.py`` from ``--pregenerated_data``, trains for
    ``--epochs`` epochs (looping over the available data epochs if fewer were
    pregenerated), and writes periodic checkpoints plus the final model to
    ``--output_dir``.  Supports single-GPU, DataParallel, distributed (NCCL)
    and apex fp16 training.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    # BUG FIX: `type=bool` converts ANY non-empty string (including "False")
    # to True.  Parse the string explicitly so `--wp False` really means
    # False; omitting the flag still defaults to False, so callers that never
    # passed a value are unaffected.
    parser.add_argument("--wp",
                        type=lambda s: s.lower() in ("true", "1", "yes"),
                        default=False,
                        help="if train on wp")
    parser.add_argument(
        '--from_scratch',
        action='store_true',
        help='do not load prtrain model, only random initialize')
    parser.add_argument("--output_step",
                        type=int,
                        default=100000,
                        help="Number of step to save model")

    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    # Discover how many epochs of pregenerated data actually exist; if fewer
    # than args.epochs, training loops over the available data epochs.
    samples_per_epoch = []
    num_data_epochs = args.epochs
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break

    # Device / distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    # Setup logging (INFO only on the main process, WARN on other ranks).
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)
    # Set seed
    set_seed(args)

    args.output_mode = "classification"

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Per-step micro-batch size after accounting for gradient accumulation.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Best-effort retry loop (e.g. transient tokenizer download failures).
    while True:
        try:
            tokenizer = BertTokenizer.from_pretrained(
                args.bert_model, do_lower_case=args.do_lower_case)
            # Use getattr: if the tokenizer class has no `_noi_token` attribute
            # at all, the old direct access raised AttributeError, which the
            # bare `except:` below turned into an infinite retry loop.
            if getattr(tokenizer, '_noi_token', None) is None:
                tokenizer._noi_token = '[NOI]'
                # BUG FIX: the original condition
                #   args.bert_model == 'bert-base-uncased' or 'bert-large-uncased'
                # was always truthy (non-empty string literal), so the
                # [unused1] branch was dead code.  Use tuple membership.
                if args.bert_model in ('bert-base-uncased', 'bert-large-uncased'):
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused0]')
                else:
                    tokenizer.vocab['[NOI]'] = tokenizer.vocab.pop('[unused1]')
                tokenizer.ids_to_tokens[1] = '[NOI]'
                logger.info("Adding [NOI] to the vocabulary 1")
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still terminate the script; everything else retries as before.
            continue
        break

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    if args.from_scratch:
        # NOTE(review): BertForMaskedLM() with no config relies on the local
        # class providing defaults -- confirm against the model definition.
        model = BertForMaskedLM()
    else:
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory,
            args=args)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            outputs = model(
                input_ids,
                segment_ids,
                input_mask,
                lm_label_ids,
            )
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                # apex FP16_Optimizer handles loss scaling internally.
                optimizer.backward(loss)
            else:
                loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # BUG FIX: step the optimizer BEFORE the LR scheduler.  The
                # original called scheduler.step() first, so every update was
                # taken with the learning rate of the FOLLOWING schedule step.
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                optimizer.zero_grad()
                global_step += 1

            # BUG FIX: require global_step > 0 -- the original condition was
            # true for every micro-batch before the first optimizer update
            # (0 % output_step == 0), saving redundant checkpoints.
            if global_step > 0 and global_step % args.output_step == 0 \
                    and args.local_rank in [-1, 0]:
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir,
                                          'checkpoint-{}'.format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(
                    model, 'module'
                ) else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info("Saving model checkpoint to %s", output_dir)

        if args.local_rank in [-1, 0]:
            # Save model checkpoint at the end of every epoch.
            output_dir = os.path.join(args.output_dir,
                                      'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(
                model, 'module'
            ) else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
        logger.info("PROGRESS: {}%".format(
            round(100 * (epoch + 1) / args.epochs, 4)))
        logger.info("EVALERR: {}%".format(tr_loss))

    # Save a trained model
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
# ----- Example #3 -----
def pretrain(args, data_path):
    """Run masked-LM pre-training for one of three BERT variants.

    Selects tokenizer, config and model weights according to ``args.model``
    ('bert', 'biobert' or 'bert-tiny'), trains with random token masking for
    ``args.pretrain_epochs`` epochs, and saves the model to
    ``args.pretrain_ckpt_dir``.

    NOTE(review): if ``args.model`` is none of the three supported values,
    ``bert_tokenizer``/``model`` are never bound and the code below raises
    NameError -- confirm callers only pass the three known names.
    """
    print('[pretrain] create config, model')
    # Tokenizer selection: uncased for bert / bert-tiny, cased for biobert.
    if args.model == 'bert':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                           do_lower_case=True)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/bert-base-uncased-vocab.txt',
                do_lower_case=True)
    elif args.model == 'biobert':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                           do_lower_case=False)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/biobert_pretrain_output_all_notes_150000/vocab.txt',
                do_lower_case=False)
    elif args.model == 'bert-tiny':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                           do_lower_case=True)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/bert-tiny-uncased-vocab.txt',
                do_lower_case=True)

    # Config + model selection.  The three branches below are near-identical;
    # kept as-is because the biobert branch has order-dependent quirks (see
    # the review notes inside it).
    if args.model == 'bert':
        config = BertConfig.from_pretrained(
            './pretrained_weights/bert-base-uncased-config.json')
        if args.Y == 'full':
            config.Y = 8921
        else:
            config.Y = int(args.Y)
        config.gpu = args.gpu
        config.redefined_vocab_size = len(bert_tokenizer)
        if args.max_sequence_length is None:
            config.redefined_max_position_embeddings = MAX_LENGTH
        else:
            config.redefined_max_position_embeddings = args.max_sequence_length
        config.last_module = args.last_module
        config.model = args.model

        if args.from_scratch:
            model = BertForMaskedLM(config=config)
        else:
            model = BertForMaskedLM.from_pretrained(
                './pretrained_weights/bert-base-uncased-pytorch_model.bin',
                config=config)
    elif args.model == 'biobert':
        config = BertConfig.from_pretrained(
            './pretrained_weights/biobert_pretrain_output_all_notes_150000/bert_config.json'
        )
        if args.Y == 'full':
            config.Y = 8921
        else:
            config.Y = int(args.Y)
        config.gpu = args.gpu
        config.redefined_vocab_size = len(bert_tokenizer)
        if args.max_sequence_length is None:
            config.redefined_max_position_embeddings = MAX_LENGTH
        else:
            config.redefined_max_position_embeddings = args.max_sequence_length
        config.last_module = args.last_module
        config.model = args.model
        # NOTE(review): this RE-CREATES the tokenizer, discarding the one
        # chosen in the first if/elif chain above -- and keys the choice on
        # `from_scratch` rather than `redefined_tokenizer`.  Looks like a
        # copy-paste slip; confirm intent before changing.
        if args.from_scratch:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                           do_lower_case=False)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/biobert_pretrain_output_all_notes_150000/vocab.txt',
                do_lower_case=False)
        # Re-derived from the (possibly new) tokenizer above.
        config.redefined_vocab_size = len(bert_tokenizer)
        if args.max_sequence_length is None:
            config.redefined_max_position_embeddings = MAX_LENGTH
        else:
            config.redefined_max_position_embeddings = args.max_sequence_length
        config.model = args.model
        if args.from_scratch:
            model = BertForMaskedLM(config=config)
        else:
            model = BertForMaskedLM.from_pretrained(
                './pretrained_weights/biobert_pretrain_output_all_notes_150000/pytorch_model.bin',
                config=config)
    elif args.model == 'bert-tiny':
        config = BertConfig.from_pretrained(
            './pretrained_weights/bert-tiny-uncased-config.json')
        if args.Y == 'full':
            config.Y = 8921
        else:
            config.Y = int(args.Y)
        config.gpu = args.gpu
        config.redefined_vocab_size = len(bert_tokenizer)
        if args.max_sequence_length is None:
            config.redefined_max_position_embeddings = MAX_LENGTH
        else:
            config.redefined_max_position_embeddings = args.max_sequence_length
        config.last_module = args.last_module
        config.model = args.model
        if args.from_scratch:
            model = BertForMaskedLM(config=config)
        else:
            model = BertForMaskedLM.from_pretrained(
                './pretrained_weights/bert-tiny-uncased-pytorch_model.bin',
                config=config)

    if args.gpu:
        model.cuda()

    print('[pretrain] prepare optimizer, scheduler')
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # NOTE(review): BOTH groups use weight_decay 0.0 here, and per-group
    # values override the `weight_decay=args.weight_decay` kwarg passed to
    # optim.Adam below -- i.e. args.weight_decay is effectively ignored.
    # Confirm whether the first group was meant to be 0.01 (as in Example #2).
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    pretrain_optimizer = optim.Adam(optimizer_grouped_parameters,
                                    weight_decay=args.weight_decay,
                                    lr=args.lr)
    # NOTE(review): uses args.data_path, not the `data_path` parameter passed
    # to this function -- verify they are always the same.
    length = datasets.data_length(args.data_path, args.version)
    t_total = length // args.pretrain_batch_size * args.pretrain_epochs
    pretrain_scheduler = get_linear_schedule_with_warmup(pretrain_optimizer, \
                                                         num_warmup_steps=args.warmup_steps, \
                                                         num_training_steps=t_total, \
                                                        )

    # Print running loss every `print_every` batches.
    print_every = 25

    model.train()
    model.zero_grad()

    print('[pretrain] create dataloader')
    train_dataset = datasets.pretrain_data_generator(
        args,
        data_path,
        args.pretrain_batch_size,
        version=args.version,
        bert_tokenizer=bert_tokenizer)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.pretrain_batch_size)

    print('[pretrain] start epoch')
    for epoch in range(args.pretrain_epochs):
        losses = []
        for batch_idx, data in tqdm(enumerate(train_dataloader)):
            # Randomly mask tokens to build (inputs, labels) MLM pairs.
            inputs, labels = random_mask_tokens(args, data, bert_tokenizer)
            if args.gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            # Single-segment input: token_type_ids are all zeros; attention
            # mask and position ids are zeroed wherever the input is padding
            # (token id 0).
            token_type_ids = (inputs > 0).long() * 0
            attention_mask = (inputs > 0).long()
            position_ids = torch.arange(inputs.size(1)).expand(
                inputs.size(0), inputs.size(1))
            if args.gpu:
                position_ids = position_ids.cuda()
            position_ids = position_ids * (inputs > 0).long()

            outputs = model(input_ids=inputs, \
                            token_type_ids=token_type_ids, \
                            attention_mask=attention_mask, \
                            position_ids=position_ids, \
                            masked_lm_labels=labels, \
                           )
            loss = outputs[0]
            losses.append(loss.item())

            # Standard step: backward, clip, optimize, schedule, reset grads.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            pretrain_optimizer.step()
            pretrain_scheduler.step()
            model.zero_grad()

            if batch_idx % print_every == 0:
                # print the average loss of the last 10 batches
                print(
                    "Train epoch: {} [batch #{}, batch_size {}, seq length {}]\tLoss: {:.6f}"
                    .format(epoch, batch_idx,
                            data.size()[0],
                            data.size()[1], np.mean(losses[-10:])))

        loss = sum(losses) / len(losses)
        print('Epoch %d: %.4f' % (epoch, loss))

    model.save_pretrained(args.pretrain_ckpt_dir)
    print('Save pretrained model --> %s' % (args.pretrain_ckpt_dir))
# ----- Example #4 -----
        # NOTE(review): this example is truncated -- the enclosing function's
        # signature, the parser construction, and the if-branch matching this
        # else-style logging setup are not visible in this chunk.
        logging.basicConfig(filename=log_path,
                            format="%(asctime)s %(message)s",
                            level=logging.DEBUG)
    logging.info(f"script_path: {script_path}")
    logging.info(f"soft labels will be saved to {save_path}")

    # Model hyper-parameters come from an INI config file (configparser).
    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.read(args.conf)
    vocab_size = int(config["vocab"]["vocab_size"])
    hidden_size = int(config["model"]["hidden_size"])
    num_hidden_layers = int(config["model"]["num_hidden_layers"])
    num_attention_heads = int(config["model"]["num_attention_heads"])
    intermediate_size = int(config["model"]["intermediate_size"])
    max_position_embeddings = int(config["model"]["max_position_embeddings"])

    # `vocab_size_or_config_json_file` is the old pytorch-pretrained-bert
    # style BertConfig constructor argument.
    bertconfig = modeling_bert.BertConfig(vocab_size_or_config_json_file=vocab_size,
                                          hidden_size=hidden_size,
                                          num_hidden_layers=num_hidden_layers,
                                          num_attention_heads=num_attention_heads,
                                          intermediate_size=intermediate_size,
                                          max_position_embeddings=max_position_embeddings)
    model = BertForMaskedLM(config=bertconfig)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Load trained weights onto the chosen device, then run label extraction.
    state_dict = torch.load(args.model, map_location=device)
    model.load_state_dict(state_dict)
    logging.info(f"load model from {args.model}")
    model.to(device)

    get_label(model, device, script_path, save_path, args.temp)
# ----- Example #5 -----
def train():
    """Train a BertForMaskedLM from scratch using settings in an INI config.

    Reads model/vocab/log/data/save/train/mask sections from the file given
    by ``-conf``, optionally resumes from ``[train] model_path``, supports
    multi-GPU via a binary ``--gpu`` mask (e.g. '101' = devices 0 and 2),
    and saves model + optimizer state every ``save_epoch`` epochs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-conf", type=str)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument(
        "--gpu",
        type=str,
        default=None,
        help=
        "binary flag which gpu to use (For example '10100000' means use device_id=0 and 2)"
    )

    args = parser.parse_args()
    config = configparser.ConfigParser()
    config.read(args.conf)

    # [model] section: BERT architecture hyper-parameters.
    hidden_size = int(config["model"]["hidden_size"])
    num_hidden_layers = int(config["model"]["num_hidden_layers"])
    num_attention_heads = int(config["model"]["num_attention_heads"])
    intermediate_size = int(config["model"]["intermediate_size"])
    max_position_embeddings = int(config["model"]["max_position_embeddings"])
    #
    vocab_size = int(config["vocab"]["vocab_size"])
    mask_id = int(config["vocab"]["mask_id"])
    #
    log_path = config["log"]["log_path"]
    log_dir = os.path.dirname(log_path)
    os.makedirs(log_dir, exist_ok=True)
    log_step = int(config["log"]["log_step"])
    #
    train_size = int(config["data"]["train_size"])
    #
    save_prefix = config["save"]["save_prefix"]
    save_dir = os.path.dirname(save_prefix)
    os.makedirs(save_dir, exist_ok=True)
    save_epoch = int(config["save"]["save_epoch"])
    #
    batch_size = int(config["train"]["batch_size"])
    if args.debug:
        # Tiny batch for quick local debugging.
        batch_size = 10
    num_epochs = int(config["train"]["num_epochs"])
    learning_rate = float(config["train"]["learning_rate"])
    warmup_proportion = float(config["train"]["warmup_proportion"])
    weight_decay = float(config["train"]["weight_decay"])
    #
    num_to_mask = int(config["mask"]["num_to_mask"])
    max_seq_len = int(config["mask"]["max_seq_len"])

    # Debug mode logs to stderr; normal mode logs to the configured file.
    if args.debug:
        logging.basicConfig(format="%(asctime)s %(message)s",
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename=log_path,
                            format="%(asctime)s %(message)s",
                            level=logging.DEBUG)

    # Old pytorch-pretrained-bert style BertConfig constructor.
    bertconfig = modeling_bert.BertConfig(
        vocab_size_or_config_json_file=vocab_size,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        max_position_embeddings=max_position_embeddings)
    model = BertForMaskedLM(config=bertconfig)
    # NOTE(review): total_params is computed but never logged or used.
    total_params = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)

    if args.gpu is not None:
        # Decode the binary GPU mask, e.g. "101" -> device_ids [0, 2].
        # NOTE(review): a mask with no "1" leaves device_ids empty and
        # device_ids[0] below raises IndexError -- confirm inputs.
        device_ids = []
        for device_id, flag in enumerate(args.gpu):
            if flag == "1":
                device_ids.append(device_id)
        multi_gpu = True
        device = torch.device("cuda:{}".format(device_ids[0]))
    else:
        multi_gpu = False
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    logging.info(f"device: {device}")
    # Optionally resume from a previously saved state dict.
    if "model_path" in config["train"]:
        model_path = config["train"]["model_path"]
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        logging.info(f"load model from {model_path}")
    model.to(device)
    if multi_gpu:
        logging.info(f"GPU: device_id={device_ids}")
        model = torch.nn.DataParallel(model, device_ids=device_ids)
    model.train()

    # optimizer: no weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = (train_size // batch_size) * num_epochs
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         weight_decay=weight_decay,
                         t_total=t_total)
    logging.info("start training...")

    for epoch in range(num_epochs):
        # Two data layouts: a directory of shuffled dataset shards, or a
        # single training file.
        if "train_dir" in config["data"]:
            train_dir = config["data"]["train_dir"]
            datpaths = os.listdir(train_dir)
            random.shuffle(datpaths)
            for step_ds, path in enumerate(datpaths):
                path = os.path.join(train_dir, path)
                dataset = LMDataset(path)
                num_steps = (len(dataset) // batch_size) + 1
                logging.info(f"dataset from: {path}")
                loss_ds = train_dataset(dataset=dataset,
                                        model=model,
                                        optimizer=optimizer,
                                        multi_gpu=multi_gpu,
                                        device=device,
                                        epoch=epoch,
                                        batch_size=batch_size,
                                        num_steps=num_steps,
                                        log_step=log_step,
                                        num_to_mask=num_to_mask,
                                        mask_id=mask_id,
                                        max_seq_len=max_seq_len)
                logging.info(
                    f"step {step_ds + 1} / {len(datpaths)}: {(loss_ds / num_steps):.6f}"
                )
        else:
            train_path = config["data"]["train_path"]
            dataset = LMDataset(train_path)
            num_steps = (len(dataset) // batch_size) + 1
            loss_epoch = train_dataset(dataset=dataset,
                                       model=model,
                                       optimizer=optimizer,
                                       multi_gpu=multi_gpu,
                                       device=device,
                                       epoch=epoch,
                                       batch_size=batch_size,
                                       num_steps=num_steps,
                                       log_step=log_step,
                                       num_to_mask=num_to_mask,
                                       mask_id=mask_id,
                                       max_seq_len=max_seq_len)
            logging.info(
                f"epoch {epoch + 1} / {num_epochs} : {(loss_epoch / num_steps):.6f}"
            )

        if (epoch + 1) % save_epoch == 0:
            # The epoch number is already baked into the path by the f-string,
            # so the .format(epoch + 1) calls below are no-ops (the strings
            # contain no '{}' placeholders) -- harmless but confusing.
            save_path = f"{save_prefix}.network.epoch{(epoch + 1):d}"
            optimizer_save_path = f"{save_prefix}.optimizer.epoch{(epoch + 1):d}"
            if multi_gpu:
                # Unwrap DataParallel so the checkpoint loads without it.
                torch.save(model.module.state_dict(),
                           save_path.format(epoch + 1))
            else:
                torch.save(model.state_dict(), save_path.format(epoch + 1))
            logging.info(f"model saved: {save_path}")
            torch.save(optimizer.state_dict(),
                       optimizer_save_path.format(epoch + 1))
            logging.info(f"optimizer saved: {optimizer_save_path}")