Example #1
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    task_types = []
    dropout_list = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
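        # with --mtl_opt > 0, tasks with the same number of classes share a
        # single output head, so the task id is keyed by class count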
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)

        task_type = task_defs.task_type_map[prefix]

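        # merge decoder options for tasks that share a head: keep the smaller
        # (more conservative) option when this task id was already assigned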
        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix], opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        task_types.append(task_type)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path, True, task_type=task_type, maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              data_type=data_type,
                              task_type=task_type,
                              encoder_type=encoder_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['task_types'] = task_types
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[task_defs.n_class_map[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = task_defs.task_type_map[prefix]

        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True

        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, task_type=task_type, maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda, is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                data_type=data_type,
                                task_type=task_type,
                                encoder_type=encoder_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, task_type=task_type, maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda, is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 data_type=data_type,
                                 task_type=task_type,
                                 encoder_type=encoder_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]

    # divide the total step count by the number of gradient accumulation steps
    num_all_batches = args.epochs * sum(all_lens) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of steps: {}'.format(args.epochs * sum(all_lens)))
    logger.info('number of gradient accumulation steps: {}'.format(args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

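    # when auxiliary data is subsampled via --ratio, the step budget is derived
    # from the main (first) dataset rather than the full batch count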
    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) * (1 + args.ratio)))

    bert_model_path = args.init_checkpoint
    state_dict = None

    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path)
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error('Could not find the init model!\n The parameters will be initialized randomly!')
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
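            # remap fairseq checkpoint keys into the MT-DNN namespace so the
            # shared encoder weights load under 'bert.model.*'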
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}

    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    # tensorboard
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir, args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        all_indices = []
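        # build this epoch's task schedule: keep every batch of the main (first)
        # dataset and randomly subsample the auxiliary datasets by --ratio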
        if len(train_data_list) > 1 and args.ratio > 0:
            main_indices = [0] * len(train_data_list[0])
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks = int(min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices, random_picks, replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()

        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.local_updates) % (args.log_per_updates * args.grad_accumulation_step) == 0 or model.local_updates == 1:
                remaining_time = str((datetime.now() - start) / (i + 1) * (len(all_indices) - i - 1)).split('.')[0]
                logger.info('Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(task_id,
                                                                                                    model.updates,
                                                                                                    model.train_loss.avg,
                                                                                                    remaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss', model.train_loss.avg, global_step=model.updates)


            if args.save_per_updates_on and ((model.local_updates) % (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(model,
                                                                                      dev_data,
                                                                                      metric_meta=task_defs.metric_meta_map[prefix],
                                                                                      use_cuda=args.cuda,
                                                                                      label_mapper=task_defs.global_map[prefix])
                for key, val in dev_metrics.items():
                    if args.tensorboard:
                        tensorboard.add_scalar('dev/{}/{}'.format(dataset, key), val, global_step=epoch)
                    if isinstance(val, str):
                        logger.warning('Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(dataset, epoch, key, val))
                    else:
                        logger.warning('Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(dataset, epoch, key, val))
                score_file = os.path.join(output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids, 'scores': scores}
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(model, test_data,
                                                                                         metric_meta=task_defs.metric_meta_map[prefix],
                                                                                         use_cuda=args.cuda, with_label=False,
                                                                                         label_mapper=task_defs.global_map[prefix])
                score_file = os.path.join(output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores}
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
    if args.tensorboard:
        tensorboard.close()
Example #2
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)

        task_type = TASK_TYPE[prefix]
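        # pairwise (ranking) tasks are loaded and batched as candidate pairs
        # rather than single examples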
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path, True, pairwise=pw_task, maxlen=args.max_seq_len),
                                batch_size=batch_size,
                                dropout_w=args.dropout_w,
                                gpu=args.cuda,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]

        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task, maxlen=args.max_seq_len),
                                  batch_size=args.batch_size_eval,
                                  gpu=args.cuda, is_train=False,
                                  task_id=task_id,
                                  maxlen=args.max_seq_len,
                                  pairwise=pw_task,
                                  data_type=data_type,
                                  task_type=task_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task, maxlen=args.max_seq_len),
                                  batch_size=args.batch_size_eval,
                                  gpu=args.cuda, is_train=False,
                                  task_id=task_id,
                                  maxlen=args.max_seq_len,
                                  pairwise=pw_task,
                                  data_type=data_type,
                                  task_type=task_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    num_all_batches = args.epochs * sum(all_lens)

    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) * (1 + args.ratio)))

    model_path = args.init_checkpoint
    state_dict = None

    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
        opt.update(config)

    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)

    if args.cuda:
        model.cuda()
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        all_indices = []
        if len(train_data_list) > 1 and args.ratio > 0:
            main_indices = [0] * len(train_data_list[0])
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks = int(min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices, random_picks, replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()

        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates) % args.log_per_updates == 0 or model.updates == 1:
                logger.info('Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(task_id,
                    model.updates, model.train_loss.avg,
                    str((datetime.now() - start) / (i + 1) * (len(all_indices) - i - 1)).split('.')[0]))

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(model, dev_data, dataset=prefix,
                                                                                  use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    logger.warning("Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(dataset, epoch, key, val))
                score_file = os.path.join(output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids, 'scores': scores}
                dump(score_file, results)
                official_score_file = os.path.join(output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids = eval_model(model, test_data, dataset=prefix,
                                                                                     use_cuda=args.cuda, with_label=False)
                score_file = os.path.join(output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores}
                dump(score_file, results)
                official_score_file = os.path.join(output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
Example #3
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)

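        # when training on a fraction of the data, the file name carries the
        # percentage as a suffix, e.g. '{dataset}_train50p.json' for 50%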
        train_data_ratio_string = str(
            args.train_data_ratio) + "p" if args.train_data_ratio < 100 else ""

        train_path = os.path.join(
            data_dir, '{0}_train{1}.json'.format(dataset,
                                                 train_data_ratio_string))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path,
                                            True,
                                            pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]

        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True

        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path,
                                              False,
                                              pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda,
                                is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path,
                                               False,
                                               pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda,
                                 is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    num_all_batches = args.epochs * sum(all_lens)

    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) *
                                             (1 + args.ratio)))

    model_path = args.init_checkpoint
    state_dict = None

    if os.path.exists(model_path):
        state_dict = torch.load(model_path, map_location='cpu')
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n Exit application!')
        logger.error('#' * 20)
        try:
            shutil.rmtree(output_dir)
        except Exception as e:
            print(e)
        exit(1)

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)

    if args.cuda:
        model.cuda()

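    # track the best average macro F1 over all dev sets; checkpoints are only
    # written when it improves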
    best_F1_macro = -1.0
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        all_indices = []
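        # optionally shrink the main (first) dataset via
        # --reduce_first_dataset_ratio before mixing in the auxiliary batches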
        if len(train_data_list) > 1 and (args.ratio > 0 or
                                         args.reduce_first_dataset_ratio > 0):
            main_indices = [0] * (int(args.reduce_first_dataset_ratio * len(
                train_data_list[0])) if args.reduce_first_dataset_ratio > 0
                                  else len(train_data_list[0]))
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            if args.ratio > 0:
                random_picks = int(
                    min(
                        len(train_data_list[0]) * args.ratio,
                        len(extra_indices)))
                extra_indices = np.random.choice(extra_indices,
                                                 random_picks,
                                                 replace=False).tolist()
            if args.mix_opt > 0:
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices
            logger.info(
                "Main batches loaded (first dataset in list): {}".format(
                    len(main_indices)))
            logger.info(
                "Extra batches loaded (all except first dataset in list): {}".
                format(len(extra_indices)))

        else:  # shuffle the index of the train sets whose batches will be trained on in the order: e.g. if train_set[1] is large, it will get trained on more often
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if model.updates % args.log_per_updates == 0 or model.updates == 1:
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(
                        task_id, model.updates, model.train_loss.avg,
                        str((datetime.now() - start) / (i + 1) *
                            (len(all_indices) - i - 1)).split('.')[0]))

        temp_dev_F1s = []
        dev_dump_list = []
        test_dump_list = []
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids, premises, hypotheses = eval_model(
                    model, dev_data, dataset=prefix, use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    if not isinstance(val, dict):
                        logger.warning(
                            "Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores,
                    'golds': golds,
                    'premises': premises,
                    'hypotheses': hypotheses
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

                # for checkpoint
                temp_dev_F1s.append(dev_metrics['F1_macro'])
                dev_dump_list.append({
                    "output_dir": output_dir,
                    "dev_metrics": dev_metrics,
                    "dev_predictions": dev_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids, premises, hypotheses = eval_model(
                    model,
                    test_data,
                    dataset=prefix,
                    use_cuda=args.cuda,
                    with_label=True)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores,
                    'golds': golds,
                    'premises': premises,
                    'hypotheses': hypotheses
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

                # for checkpoint
                test_dump_list.append({
                    "output_dir": output_dir,
                    "test_metrics": test_metrics,
                    "test_predictions": test_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })

        # save checkpoint
        avg_dev_F1 = np.average(temp_dev_F1s)
        if avg_dev_F1 > best_F1_macro:
            print("Save new model! Current best F1 macro over all dev sets: " +
                  "{0:.2f}".format(best_F1_macro) + ". New: " +
                  "{0:.2f}".format(avg_dev_F1))
            best_F1_macro = avg_dev_F1

            # override current dump file
            for l in dev_dump_list:
                dump_result_files(l['dataset'])(l['output_dir'], epoch,
                                                l['dev_metrics'],
                                                str(l['dev_predictions']),
                                                str(l['golds']), "dev",
                                                l['opt'], l['dataset'])

            for l in test_dump_list:
                dump_result_files(l['dataset'])(l['output_dir'], epoch,
                                                l['test_metrics'],
                                                str(l['test_predictions']),
                                                str(l['golds']), "test",
                                                l['opt'], l['dataset'])

            # save model
            model_file = os.path.join(output_dir, 'model.pt')
            model.save(model_file)
Example #4
def main():
    # set up the compute device: prefer the GPU when available
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    print_message(logger, 'Launching the MT-DNN training')
    tasks = {}
    task_def_list = []
    dropout_list = []
    printable = args.local_rank in [-1, 0]

    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        task_id = len(tasks)
        tasks[prefix] = task_id
        task_def = task_defs.get_task_def(prefix)
        task_def_list.append(task_def)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        print_message(logger,
                      'Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_def=task_def,
                                           printable=printable)
        #print("train_data_set={}".format(train_data_set))
        train_datasets.append(train_data_set)
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type,
                              soft_label=args.mkd_opt > 0,
                              max_seq_len=args.max_seq_len,
                              do_padding=args.do_padding)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
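    # under torch.distributed (local_rank != -1) each rank draws a disjoint
    # shard of the multi-task batches; otherwise use the single-process sampler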
    if args.local_rank != -1:
        multi_task_batch_sampler = DistMultiTaskBatchSampler(
            train_datasets,
            args.batch_size,
            args.mix_opt,
            args.ratio,
            rank=args.local_rank,
            world_size=args.world_size)
    else:
        multi_task_batch_sampler = MultiTaskBatchSampler(
            train_datasets,
            args.batch_size,
            args.mix_opt,
            args.ratio,
            bin_on=args.bin_on,
            bin_size=args.bin_size,
            bin_grow_ratio=args.bin_grow_ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['task_def_list'] = task_def_list

    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False,
                             encoder_type=encoder_type,
                             max_seq_len=args.max_seq_len,
                             do_padding=args.do_padding)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_def = task_defs.get_task_def(prefix)
        print("prefix={}".format(prefix))
        task_id = tasks[prefix]
        task_type = task_def.task_type
        data_type = task_def.data_type

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_def=task_def,
                                             printable=printable)
            if args.local_rank != -1:
                dev_data_set = DistTaskDataset(dev_data_set, task_id)
                single_task_batch_sampler = DistSingleTaskBatchSampler(
                    dev_data_set,
                    args.batch_size_eval,
                    rank=args.local_rank,
                    world_size=args.world_size)
                dev_data = DataLoader(dev_data_set,
                                      batch_sampler=single_task_batch_sampler,
                                      collate_fn=test_collater.collate_fn,
                                      pin_memory=args.cuda)
            else:
                dev_data = DataLoader(dev_data_set,
                                      batch_size=args.batch_size_eval,
                                      collate_fn=test_collater.collate_fn,
                                      pin_memory=args.cuda)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_def=task_def,
                                              printable=printable)
            if args.local_rank != -1:
                test_data_set = DistTaskDataset(test_data_set, task_id)
                single_task_batch_sampler = DistSingleTaskBatchSampler(
                    test_data_set,
                    args.batch_size_eval,
                    rank=args.local_rank,
                    world_size=args.world_size)
                test_data = DataLoader(test_data_set,
                                       batch_sampler=single_task_batch_sampler,
                                       collate_fn=test_collater.collate_fn,
                                       pin_memory=args.cuda)
            else:
                test_data = DataLoader(test_data_set,
                                       batch_size=args.batch_size_eval,
                                       collate_fn=test_collater.collate_fn,
                                       pin_memory=args.cuda)
        test_data_list.append(test_data)

    print_message(logger, '#' * 20)
    print_message(logger, opt)
    print_message(logger, '#' * 20)

    # divide the total step count by the number of gradient accumulation steps
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    print_message(logger,
                  '############# Gradient Accumulation Info #############')
    print_message(
        logger,
        'number of steps: {}'.format(args.epochs * len(multi_task_train_data)))
    print_message(
        logger, 'number of gradient accumulation steps: {}'.format(
            args.grad_accumulation_step))
    print_message(logger,
                  'adjusted number of step: {}'.format(num_all_batches))
    print_message(logger,
                  '############# Gradient Accumulation Info #############')

    init_model = args.init_checkpoint
    state_dict = None

    if os.path.exists(init_model):
        if encoder_type == EncoderModelType.BERT or \
            encoder_type == EncoderModelType.DEBERTA or \
            encoder_type == EncoderModelType.ELECTRA:
            state_dict = torch.load(init_model, map_location=device)
            config = state_dict['config']
        elif encoder_type == EncoderModelType.ROBERTA or encoder_type == EncoderModelType.XLM:
            model_path = '{}/model.pt'.format(init_model)
            state_dict = torch.load(
                model_path,
                map_location=lambda storage, loc: storage.cuda(device))
            arch = state_dict['args'].arch
            arch = arch.replace('_', '-')
            if encoder_type == EncoderModelType.XLM:
                arch = "xlm-{}".format(arch)
            # convert model arch
            from data_utils.roberta_utils import update_roberta_keys
            from data_utils.roberta_utils import patch_name_dict
            state = update_roberta_keys(
                state_dict['model'], nlayer=state_dict['args'].encoder_layers)
            state = patch_name_dict(state)
            literal_encoder_type = EncoderModelType(
                opt['encoder_type']).name.lower()
            config_class, model_class, tokenizer_class = MODEL_CLASSES[
                literal_encoder_type]
            config = config_class.from_pretrained(arch).to_dict()
            state_dict = {'state': state}
        # elif encoder_type == EncoderModelType.NEZHA:
        #     config = BertConfig(os.path.join(init_config))
        #     model = BertForSequenceClassification(config, num_labels=2)
        #     model.load_state_dict(torch.load(init_model))
        #     state_dict = model.state_dict
    else:
        if opt['encoder_type'] not in EncoderModelType._value2member_map_:
            raise ValueError("encoder_type is out of pre-defined types")
        literal_encoder_type = EncoderModelType(
            opt['encoder_type']).name.lower()
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            literal_encoder_type]
        config = config_class.from_pretrained(init_model).to_dict()

    print("[chunhui] 2 args.test_datasets={}".format(args.test_datasets))

    config['attention_probs_dropout_prob'] = args.bert_dropout_p
    config['hidden_dropout_prob'] = args.bert_dropout_p
    config['multi_gpu_on'] = opt["multi_gpu_on"]
    if args.num_hidden_layers > 0:
        config['num_hidden_layers'] = args.num_hidden_layers

    if args.debug:
        config['log_per_updates'] = 5
        config['save_per_updates'] = 10
        config['save_per_updates_on'] = True

    if "test_datasets" in config:
        del config["test_datasets"]
    if "epochs" in config:
        del config["epochs"]
    if "resume" in config:
        del config["resume"]
    if "model_ckpt" in config:
        del config["model_ckpt"]
    opt.update(config)

    model = MTDNNModel(opt,
                       device=device,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        print_message(logger, 'loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    print_message(logger, '\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    print_message(logger,
                  "Total number of params: {}".format(model.total_param))
    # tensorboard
    tensorboard = None
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

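    # encode-only mode: dump the encoder representations for every test set
    # and exit without training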
    if args.encode_mode:
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            test_data = test_data_list[idx]
            with torch.no_grad():
                encoding = extract_encoding(model,
                                            test_data,
                                            use_cuda=args.cuda)
            torch.save(
                encoding,
                os.path.join(output_dir, '{}_encoding.pt'.format(dataset)))
        return

    print_message(logger, 'args.epochs={}'.format(args.epochs), level=1)
    for epoch in range(0, args.epochs):
        print_message(logger, 'At epoch {}'.format(epoch), level=1)
        start = datetime.now()

        #print("multi_task_train_data={}".format(type(multi_task_train_data)))
        min_loss = 100000  # arbitrarily large starting value
        max_acc = 0.0
        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                device, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            #print("batch_meta={}".format(batch_meta))
            #print("batch_data={}".format(batch_data))
            model.update(batch_meta, batch_data)

            if model.updates % args.log_per_updates == 0 or model.updates == 1:
                remaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                if args.adv_train and args.debug:
                    debug_info = ' adv loss[%.5f] emb val[%.8f] eff_perturb[%.8f] ' % (
                        model.adv_loss.avg, model.emb_val.avg,
                        model.eff_perturb.avg)
                else:
                    debug_info = ' '
                print_message(
                    logger,
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}]{3}remaining[{4}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            debug_info, remaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)

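            # intra-epoch checkpointing (rank 0 only): evaluate on dev/test and
            # save when train loss is below min_loss and dev accuracy above max_acc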
            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step)
                    == 0) and args.local_rank in [-1, 0]:
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                dev_acc = evaluation(model,
                                     args.test_datasets,
                                     dev_data_list,
                                     task_defs,
                                     output_dir,
                                     epoch,
                                     n_updates=args.save_per_updates,
                                     with_label=True,
                                     tensorboard=tensorboard,
                                     glue_format_on=args.glue_format_on,
                                     test_on=False,
                                     device=device,
                                     logger=logger)
                evaluation(model,
                           args.test_datasets,
                           test_data_list,
                           task_defs,
                           output_dir,
                           epoch,
                           n_updates=args.save_per_updates,
                           with_label=False,
                           tensorboard=tensorboard,
                           glue_format_on=args.glue_format_on,
                           test_on=True,
                           device=device,
                           logger=logger)
                if (model.train_loss.avg < min_loss and dev_acc > max_acc):
                    print_message(
                        logger, 'Saving mt-dnn model to {}'.format(model_file))
                    model.save(model_file)
        # dev_acc = evaluation(model, args.test_datasets, dev_data_list, task_defs, output_dir, epoch, with_label=True, tensorboard=tensorboard, glue_format_on=args.glue_format_on, test_on=False, device=device, logger=logger)
        # evaluation(model, args.test_datasets, test_data_list, task_defs, output_dir, epoch, with_label=False, tensorboard=tensorboard, glue_format_on=args.glue_format_on, test_on=True, device=device, logger=logger)
        # print_message(logger, '[new test scores at {} saved.]'.format(epoch))
        # if args.local_rank in [-1, 0] and model.train_loss.avg < min_loss and dev_acc > max_acc:
        #     model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        #     model.save(model_file)
    if args.tensorboard:
        tensorboard.close()
Example #5
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size

    tasks = {}
    task_def_list = []
    dropout_list = []

    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        task_id = len(tasks)
        tasks[prefix] = task_id
        task_def = task_defs.get_task_def(prefix)
        task_def_list.append(task_def)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_def=task_def)
        train_datasets.append(train_data_set)
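    # soft_label switches the collater to teacher soft targets when
    # --mkd_opt > 0 (multi-task knowledge distillation)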
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type,
                              soft_label=args.mkd_opt > 0)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    multi_task_batch_sampler = MultiTaskBatchSampler(train_datasets,
                                                     args.batch_size,
                                                     args.mix_opt, args.ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['task_def_list'] = task_def_list

    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_def = task_defs.get_task_def(prefix)
        task_id = tasks[prefix]
        task_type = task_def.task_type
        data_type = task_def.data_type

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_def=task_def)
            dev_data = DataLoader(dev_data_set,
                                  batch_size=args.batch_size_eval,
                                  collate_fn=test_collater.collate_fn,
                                  pin_memory=args.cuda)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_def=task_def)
            test_data = DataLoader(test_data_set,
                                   batch_size=args.batch_size_eval,
                                   collate_fn=test_collater.collate_fn,
                                   pin_memory=args.cuda)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # divide by the number of gradient accumulation steps
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of steps: {}'.format(args.epochs *
                                             len(multi_task_train_data)))
    logger.info('number of gradient accumulation steps: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of steps: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')
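    # Worked example (hypothetical numbers): with epochs=3, 1000 batches per
    # epoch and grad_accumulation_step=4, the loop performs 3 * 1000 = 3000
    # forward/backward steps but only 3000 // 4 = 750 optimizer updates, which
    # is the num_all_batches value passed to the model below.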

    init_model = args.init_checkpoint
    state_dict = None

    if os.path.exists(init_model):
        state_dict = torch.load(init_model)
        config = state_dict['config']
    else:
        if opt['encoder_type'] not in EncoderModelType._value2member_map_:
            raise ValueError("encoder_type is out of pre-defined types")
        literal_encoder_type = EncoderModelType(
            opt['encoder_type']).name.lower()
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            literal_encoder_type]
        config = config_class.from_pretrained(init_model).to_dict()

    config['attention_probs_dropout_prob'] = args.bert_dropout_p
    config['hidden_dropout_prob'] = args.bert_dropout_p
    config['multi_gpu_on'] = opt['multi_gpu_on']
    if args.num_hidden_layers != -1:
        config['num_hidden_layers'] = args.num_hidden_layers
    opt.update(config)
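    # opt now holds the flattened pretrained config (recovered either from the
    # checkpoint or via from_pretrained) plus the CLI overrides applied above
    # (bert_dropout_p, num_hidden_layers), so a single flat dict reaches
    # MTDNNModel.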

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
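    # num_train_step is the adjusted optimizer-step count computed above; it
    # is most likely consumed by the learning-rate scheduler (e.g. linear
    # warmup), though that is an assumption about MTDNNModel internals.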
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    # model meta string
    headline = '############# Model Arch of MT-DNN #############'
    # print the network architecture
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    # tensorboard
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    if args.encode_mode:
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            test_data = test_data_list[idx]
            with torch.no_grad():
                encoding = extract_encoding(model,
                                            test_data,
                                            use_cuda=args.cuda)
            torch.save(
                encoding,
                os.path.join(output_dir, '{}_encoding.pt'.format(dataset)))
        return
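    # encode_mode (handled above) only exports encoder outputs for each test
    # set and returns before any training; extract_encoding presumably runs
    # the shared encoder without the task-specific heads (an assumption about
    # its internals).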

    for epoch in range(args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        start = datetime.now()

        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                args.cuda, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)
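            # local_updates seems to count every forward/backward micro-batch,
            # while model.updates counts optimizer steps; multiplying the log
            # interval by grad_accumulation_step keeps logging aligned with
            # optimizer updates (an inference from the code, not documented).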
            if (model.local_updates) % (args.log_per_updates *
                                        args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                remaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            task_def = task_defs.get_task_def(prefix)
            label_dict = task_def.label_vocab
            dev_data = dev_data_list[idx]
            if dev_data is not None:
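                # eval_model returns per-metric scores plus raw predictions,
                # probability scores, gold labels and example uids; label_dict
                # maps label ids back to strings (used by the GLUE submit
                # helper below when --glue_format_on is set).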
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                        model,
                        dev_data,
                        metric_meta=task_def.metric_meta,
                        use_cuda=args.cuda,
                        label_mapper=label_dict,
                        task_type=task_def.task_type)
                for key, val in dev_metrics.items():
                    if args.tensorboard:
                        tensorboard.add_scalar('dev/{}/{}'.format(
                            dataset, key),
                                               val,
                                               global_step=epoch)
                    if isinstance(val, str):
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(
                                dataset, epoch, key, val))
                    else:
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                        model,
                        test_data,
                        metric_meta=task_def.metric_meta,
                        use_cuda=args.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=task_def.task_type)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
    if args.tensorboard:
        tensorboard.close()