Example #1
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_dir",
        default="",
        type=str,
        # required=True,
        help="The input train corpus.",
    )
    parser.add_argument(
        "--val_data_dir",
        default="",
        type=str,
        # required=True,
        help="The input val corpus.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        # required=True,
        help=
        "The output directory where the model checkpoints will be written.",
    )

    parser.add_argument(
        "--config_file",
        default="config/bert_base_6layer_interbert.json",
        type=str,
        # required=True,
        help="The config file which specified the model details.",
    )
    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=36,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.",
    )
    parser.add_argument("--predict_feature",
                        action="store_true",
                        help="visual target.")

    parser.add_argument(
        "--train_batch_size",
        default=512,
        type=int,
        help="Total batch size for training.",
    )
    parser.add_argument(
        "--learning_rate",
        default=1e-4,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--start_epoch",
        default=0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--continue_training",
        action="store_true",
        help="if we need to continue a stopped pretraining procedure, add this"
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--img_weight",
                        default=1,
                        type=float,
                        help="weight for image loss")
    parser.add_argument("--itm_weight",
                        default=1,
                        type=float,
                        help="weight for itm loss")
    parser.add_argument("--text_weight",
                        default=1,
                        type=float,
                        help="weight for text loss")
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        type=bool,
        default=True,  # NOTE: argparse's type=bool treats any non-empty string as True
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=3,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of the model need to fixed.")
    parser.add_argument("--distributed",
                        action="store_true",
                        help="whether use chunck for parallel training.")
    parser.add_argument("--without_coattention",
                        action="store_true",
                        help="whether pair loss.")
    parser.add_argument("--span_mask",
                        action="store_true",
                        help="whether to use span_masking.")
    parser.add_argument("--cond_mask",
                        action="store_true",
                        help="Whether to use conditional masking method.")
    parser.add_argument("--dynamic_masking",
                        action="store_true",
                        help="whether to use dynamic masking")
    args = parser.parse_args()

    print(args)
    if args.save_name != '':
        timeStamp = args.save_name
    else:
        timeStamp = strftime("%d-%b-%y-%X-%a", gmtime())
        timeStamp += "_{:0>6d}".format(random.randint(0, 10e6))

    savePath = os.path.join(args.output_dir, timeStamp)

    if not os.path.exists(savePath):
        os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)

    if args.freeze > config.t_biattention_id[0]:
        config.fixed_t_layer = config.t_biattention_id[0]

    if args.without_coattention:
        config.with_coattention = False
    # # save all the hidden parameters.
    # with open(os.path.join(savePath, 'command.txt'), 'w') as f:
    #     print(args, file=f)  # Python 3.x
    #     print('\n', file=f)
    #     print(config, file=f)

    bert_weight_name = json.load(
        open("config/" + "bert-base-uncased_weight_name.json", "r"))
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    num_train_optimization_steps = None

    viz = TBlogger("logs", timeStamp)

    train_dataset = ConceptCapLoaderTrain(args.train_data_dir,
                                          tokenizer,
                                          seq_len=args.max_seq_length,
                                          batch_size=args.train_batch_size,
                                          predict_feature=args.predict_feature,
                                          num_workers=args.num_workers,
                                          distributed=args.distributed,
                                          span_mask=args.span_mask,
                                          cond_mask=args.cond_mask)

    validation_dataset = ConceptCapLoaderVal(
        args.val_data_dir,
        tokenizer,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        predict_feature=args.predict_feature,
        num_workers=2,
        distributed=args.distributed,
        span_mask=args.span_mask,
        cond_mask=args.cond_mask)

    if args.continue_training:
        assert args.start_epoch > 0  # must have pretrained at least one epoch
        num_train_optimization_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) * args.num_train_epochs)
        if args.cond_mask:
            num_train_optimization_steps *= 2
        finished_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) * args.start_epoch)
        if args.cond_mask:
            finished_steps *= 2
    else:
        num_train_optimization_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) *
            (args.num_train_epochs - args.start_epoch))
        if args.cond_mask:
            num_train_optimization_steps *= 2
        finished_steps = 0
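    # Illustrative arithmetic (hypothetical numbers, not taken from the code above):
    # with num_dataset = 3,000,000, train_batch_size = 512,
    # gradient_accumulation_steps = 1 and num_train_epochs = 10, the fresh-run
    # branch yields int(3000000 / 512 / 1) * 10 = 5859 * 10 = 58590 optimization
    # steps, doubled to 117180 when --cond_mask is set.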

    default_gpu = False
    if dist.is_available() and args.distributed:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, 'command.txt'), 'w') as f:
            print(args, file=f)  # Python 3.x
            print('\n', file=f)
            print(config, file=f)

    # pdb.set_trace()
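    # NOTE: v_target_size selects the visual prediction target. The specific
    # values are presumably the pooled region-feature dimensionality (2048,
    # feature regression) and the detector's object-class distribution
    # (1601 = 1600 categories plus background).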
    if args.predict_feature:
        config.v_target_size = 2048
        config.predict_feature = True
    else:
        config.v_target_size = 1601
        config.predict_feature = False

    if args.from_pretrained:
        if args.continue_training:
            ckpt_load_path = os.path.join(
                args.from_pretrained,
                "pytorch_model_{}.bin".format(int(args.start_epoch) - 1))
            model = InterBertForMultiModalPreTraining.from_pretrained(
                ckpt_load_path, config)
        else:
            model = InterBertForMultiModalPreTraining.from_pretrained(
                args.from_pretrained, config)
    else:
        model = InterBertForMultiModalPreTraining(config)

    model.cuda()

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if 'embeddings' in name:
                bert_weight_name_filtered.append(name)
            elif 'encoder' in name:
                layer_num = name.split('.')[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    if not args.from_pretrained:
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
    else:
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate

                if any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [{
                        "params": [value],
                        "lr": lr,
                        "weight_decay": 0.01
                    }]

                if not any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [{
                        "params": [value],
                        "lr": lr,
                        "weight_decay": 0.0
                    }]
        if default_gpu:
            print(len(list(model.named_parameters())),
                  len(optimizer_grouped_parameters))

    # set different parameters for the vision branch and the language branch.
    if args.fp16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.contrib.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            bias_correction=False,
            max_grad_norm=1.0,
        )
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        if args.from_pretrained:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps,
            )
        else:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps,
            )

        if args.continue_training:
            opt_state_dict_path = os.path.join(
                args.from_pretrained,
                "optimizer_state_{}.bin".format(int(args.start_epoch) - 1))
            optimizer.load_state_dict(
                torch.load(opt_state_dict_path, map_location='cpu'))

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_dataset.num_dataset)
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d",
                num_train_optimization_steps - finished_steps)

    startIterID = 0
    global_step = finished_steps
    masked_loss_v_tmp = 0
    masked_loss_t_tmp = 0
    next_sentence_loss_tmp = 0
    loss_tmp = 0
    start_t = timer()

    for epochId in range(int(args.start_epoch), int(args.num_train_epochs)):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # iter_dataloader = iter(train_dataloader)
        for step, batch in enumerate(train_dataset):
            iterId = startIterID + step + (epochId * len(train_dataset))
            # batch = iter_dataloader.next()
            batch = tuple(
                t.cuda(device=device, non_blocking=True) for t in batch)

            (input_ids, input_mask, segment_ids, lm_label_ids, is_next,
             image_feat, image_loc, image_target, image_label, image_mask,
             multimodal_mask, image_ids) = batch

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                multimodal_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            if args.without_coattention:
                next_sentence_loss = next_sentence_loss * 0

            masked_loss_v = masked_loss_v * args.img_weight
            next_sentence_loss = next_sentence_loss * args.itm_weight
            loss = masked_loss_t * args.text_weight + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if math.isnan(loss.item()):
                pdb.set_trace()

            tr_loss += loss.item()

            if dist.is_available() and args.distributed:
                rank = dist.get_rank()
            else:
                rank = 0

            viz.linePlot(iterId, loss.item(), "loss_" + str(rank), "train")
            viz.linePlot(iterId, masked_loss_t.item(),
                         "masked_loss_t_" + str(rank), "train")
            viz.linePlot(iterId, masked_loss_v.item(),
                         "masked_loss_v_" + str(rank), "train")
            viz.linePlot(iterId, next_sentence_loss.item(),
                         "next_sentence_loss_" + str(rank), "train")
            # viz.linePlot(iterId, optimizer.get_lr()[0], 'learning_rate', 'train')

            loss_tmp += loss.item()
            masked_loss_v_tmp += masked_loss_v.item()
            masked_loss_t_tmp += masked_loss_t.item()
            next_sentence_loss_tmp += next_sentence_loss.item()

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used, which handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion,
                    )
                    for param_group in optimizer.param_groups:
                        param_group["lr"] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if step % 20 == 0 and step != 0:
                masked_loss_t_tmp = masked_loss_t_tmp / 20.0
                masked_loss_v_tmp = masked_loss_v_tmp / 20.0
                next_sentence_loss_tmp = next_sentence_loss_tmp / 20.0
                loss_tmp = loss_tmp / 20.0

                end_t = timer()
                timeStamp = strftime("%a %d %b %y %X", gmtime())

                Ep = epochId + nb_tr_steps / float(len(train_dataset))
                printFormat = "[%s][Ep: %.2f][Iter: %d][Time: %5.2fs][Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g][LR: %.8g]"

                printInfo = [
                    timeStamp,
                    Ep,
                    nb_tr_steps,
                    end_t - start_t,
                    loss_tmp,
                    masked_loss_v_tmp,
                    masked_loss_t_tmp,
                    next_sentence_loss_tmp,
                    optimizer.get_lr()[0],
                ]

                start_t = end_t
                print(printFormat % tuple(printInfo))

                masked_loss_v_tmp = 0
                masked_loss_t_tmp = 0
                next_sentence_loss_tmp = 0
                loss_tmp = 0

        # Do the evaluation
        torch.set_grad_enabled(False)
        start_t = timer()
        numBatches = len(validation_dataset)
        eval_masked_loss_t = 0
        eval_masked_loss_v = 0
        eval_next_sentence_loss = 0
        eval_total_loss = 0

        model.eval()
        for step, batch in enumerate(validation_dataset):
            batch = tuple(
                t.cuda(device=device, non_blocking=True) for t in batch)

            (input_ids, input_mask, segment_ids, lm_label_ids, is_next,
             image_feat, image_loc, image_target, image_label, image_mask,
             multimodal_mask, image_ids) = batch

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids,
                image_feat,
                image_loc,
                segment_ids,
                input_mask,
                image_mask,
                multimodal_mask,
                lm_label_ids,
                image_label,
                image_target,
                is_next,
            )

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            eval_masked_loss_t += masked_loss_t.item()
            eval_masked_loss_v += masked_loss_v.item()
            eval_next_sentence_loss += next_sentence_loss.item()
            eval_total_loss += loss.item()

            end_t = timer()
            delta_t = " Time: %5.2fs" % (end_t - start_t)
            start_t = end_t
            progressString = "\r Evaluating split '%s' [%d/%d]\t" + delta_t
            sys.stdout.write(progressString % ('val', step + 1, numBatches))
            sys.stdout.flush()

        eval_masked_loss_t = eval_masked_loss_t / float(numBatches)
        eval_masked_loss_v = eval_masked_loss_v / float(numBatches)
        eval_next_sentence_loss = eval_next_sentence_loss / float(numBatches)
        eval_total_loss = eval_total_loss / float(numBatches)

        printFormat = "Evaluation: [Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g]"
        printInfo = [
            eval_total_loss, eval_masked_loss_v, eval_masked_loss_t,
            eval_next_sentence_loss
        ]

        print(printFormat % tuple(printInfo))
        torch.set_grad_enabled(True)

        viz.linePlot(epochId, eval_total_loss, "loss_" + str(rank), "val")
        viz.linePlot(epochId, eval_masked_loss_t, "masked_loss_t_" + str(rank),
                     "val")
        viz.linePlot(epochId, eval_masked_loss_v, "masked_loss_v_" + str(rank),
                     "val")
        viz.linePlot(epochId, eval_next_sentence_loss,
                     "next_sentence_loss_" + str(rank), "val")

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model ** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model it-self
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            torch.save(model_to_save.state_dict(), output_model_file)
            output_opt_state_dict_file = os.path.join(
                savePath, "optimizer_state_" + str(epochId) + ".bin")
            torch.save(optimizer.state_dict(), output_opt_state_dict_file)

        if args.dynamic_masking and epochId + 1 < int(args.num_train_epochs):
            del train_dataset
            gc.collect()
            train_dataset = ConceptCapLoaderTrain(
                args.train_data_dir,
                tokenizer,
                seq_len=args.max_seq_length,
                batch_size=args.train_batch_size,
                predict_feature=args.predict_feature,
                num_workers=args.num_workers,
                distributed=args.distributed,
                span_mask=args.span_mask)
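
The fp16 branch of the training loop above sets the learning rate by hand through warmup_linear, which is not defined in the code shown here. A minimal sketch of the schedule it is expected to implement (linear warmup followed by linear decay, in the style of the classic pytorch_pretrained_bert helper; treat the exact decay shape as an assumption):

def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed (global_step / total_steps).
    # Ramp linearly from 0 to 1 over the first `warmup` fraction of training,
    # then decay linearly back towards 0 for the rest of the run.
    if x < warmup:
        return x / warmup
    return 1.0 - x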
Example #2
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if args.do_train:
        logger.addHandler(logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w'))
    else:
        logger.addHandler(logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w'))
    logger.info(args)
    logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format(
        device, n_gpu, args.fp16))

    processor = DataProcessor()
    label_list = processor.get_labels(args.data_dir, args.negative_label)
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    num_labels = len(label_list)
    tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case)

    special_tokens = {}
    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir, tokenizer)
        eval_features = convert_examples_to_features(
            eval_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode)
        logger.info("***** Dev *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_span_ids = torch.tensor([f.span_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_span_ids)
        eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size)
        eval_label_ids = all_label_ids

    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir,tokenizer)
        train_features = convert_examples_to_features(
                train_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode)

        if args.train_mode == 'sorted' or args.train_mode == 'random_sorted':
            train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask))
        else:
            random.shuffle(train_features)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_span_ids = torch.tensor([f.span_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_span_ids)
        train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size)
        train_batches = [batch for batch in train_dataloader]

        num_train_optimization_steps = \
            len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        logger.info("***** Training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_result = None
        eval_step = max(1, len(train_batches) // args.eval_per_epoch)
        lrs = [args.learning_rate] if args.learning_rate else \
            [1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5]
        for lr in lrs:
            model = BertForMTB.from_pretrained(
                args.model, model_name=args.model, num_labels=num_labels,
                examples=train_examples, mode=args.repre_mode)
            # BertForMTB.from_pretrained(args.model ,model_name = args.model,num_labels=num_labels,examples = train_examples,mode = args.repre_mode)
            # BertForSequenceClassification.from_pretrained(
                # args.model, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE), num_labels=num_labels,examples = train_examples)
            if args.fp16:
                model.half()
            model.to(device)
            if n_gpu > 1:
                model = torch.nn.DataParallel(model)

            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
            if args.fp16:
                try:
                    # from apex.optimizers import FP16_Optimizer
                    # from apex.optimizers import FusedAdam
                    from apex.contrib.optimizers import FP16_Optimizer
                    from apex.contrib.optimizers import FusedAdam
                except ImportError:
                    raise ImportError("Please install apex from https://www.github.com/nvidia/apex"
                                      "to use distributed and fp16 training.")

                optimizer = FusedAdam(optimizer_grouped_parameters,
                                      lr=lr,
                                      bias_correction=False,
                                      max_grad_norm=1.0)
                if args.loss_scale == 0:
                    optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
                else:
                    optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

            else:
                optimizer = BertAdam(optimizer_grouped_parameters,
                                     lr=lr,
                                     warmup=args.warmup_proportion,
                                     t_total=num_train_optimization_steps)

            start_time = time.time()
            global_step = 0
            tr_loss = 0
            nb_tr_examples = 0
            nb_tr_steps = 0
            for epoch in range(int(args.num_train_epochs)):
                model.train()
                logger.info("Start epoch #{} (lr = {})...".format(epoch, lr))
                if args.train_mode == 'random' or args.train_mode == 'random_sorted':
                    random.shuffle(train_batches)
                for step, batch in enumerate(train_batches):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids, span_ids = batch
                    model_output = model(input_ids, input_mask, segment_ids,span_ids ,label_ids,output_attentions = True,output_hidden_states=True)
                    # print(model_output)  # debug: prints the full model output every step
                    loss = model_output.loss
                    if n_gpu > 1:
                        loss = loss.mean()
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1

                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            lr_this_step = lr * \
                                warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                    if (step + 1) % eval_step == 0:
                        logger.info('Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'.format(
                                     epoch, step + 1, len(train_batches),
                                     time.time() - start_time, tr_loss / nb_tr_steps))
                        save_model = False
                        if args.do_eval:
                            preds, result = evaluate(model, device, eval_dataloader, eval_label_ids, num_labels)
                            model.train()
                            result['global_step'] = global_step
                            result['epoch'] = epoch
                            result['learning_rate'] = lr
                            result['batch_size'] = args.train_batch_size
                            logger.info("First 20 predictions:")
                            for pred, label in zip(preds[:20], eval_label_ids.numpy()[:20]):
                                sign = u'\u2713' if pred == label else u'\u2718'
                                logger.info("pred = %s, label = %s %s" % (id2label[pred], id2label[label], sign))
                            if (best_result is None) or (result[args.eval_metric] > best_result[args.eval_metric]):
                                best_result = result
                                save_model = True
                                logger.info("!!! Best dev %s (lr=%s, epoch=%d): %.2f" %
                                            (args.eval_metric, str(lr), epoch, result[args.eval_metric] * 100.0))
                        else:
                            save_model = True

                        if save_model:
                            model_to_save = model.module if hasattr(model, 'module') else model
                            output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                            torch.save(model_to_save.state_dict(), output_model_file)
                            model_to_save.config.to_json_file(output_config_file)
                            tokenizer.save_vocabulary(args.output_dir)
                            if best_result:
                                output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                                with open(output_eval_file, "w") as writer:
                                    for key in sorted(result.keys()):
                                        writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_eval:
        if args.eval_test:
            eval_examples = processor.get_test_examples(args.data_dir, tokenizer)
            eval_features = convert_examples_to_features(
                eval_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode)
            logger.info("***** Test *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            all_span_ids = torch.tensor([f.span_id for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_span_ids)
            eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size)
            eval_label_ids = all_label_ids
        model = BertForMTB.from_pretrained(
            args.model, model_name=args.model, num_labels=num_labels,
            examples=eval_examples, mode=args.repre_mode)
        # from_pretrained(args.output_dir, num_labels=num_labels)
        if args.fp16:
            model.half()
        model.to(device)
        preds, result = evaluate(model, device, eval_dataloader, eval_label_ids, num_labels)
        with open(os.path.join(args.output_dir, "predictions.txt"), "w") as f:
            for ex, pred in zip(eval_examples, preds):
                f.write("%s\t%s\n" % (ex.guid, id2label[pred]))
        with open(os.path.join(args.output_dir, "test_results.txt"), "w") as f:
            for key in sorted(result.keys()):
                f.write("%s = %s\n" % (key, str(result[key])))
Example #3
def main(opts):
    if opts.local_rank == -1:
        assert torch.cuda.is_available()
        device = torch.device("cuda")
        n_gpu = 1
    else:
        torch.cuda.set_device(opts.local_rank)
        device = torch.device("cuda", opts.local_rank)
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.distributed.get_world_size()
        logger.info("device: {} n_gpu: {}, distributed training: {}, "
                    "16-bits training: {}".format(
                        device, n_gpu, bool(opts.local_rank != -1), opts.fp16))
    opts.n_gpu = n_gpu

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                            opts.gradient_accumulation_steps))

    is_master = opts.local_rank == -1 or torch.distributed.get_rank() == 0

    # if is_master:
    #     save_training_meta(opts)

    random.seed(opts.seed)
    np.random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(opts.seed)

    tokenizer = BertTokenizer.from_pretrained(
        opts.bert_model, do_lower_case='uncased' in opts.bert_model)



    # train_examples = None
    print("Loading Train Dataset", opts.train_file)
    vocab_dump = torch.load(opts.vocab_file)
    print('vocab dump', vocab_dump)

    # vocab = vocab_dump['tgt'].fields[0][1].vocab.stoi
    vocab = vocab_dump['src'].fields[0][1].vocab.stoi
    # f = open("./vocab_test_tgt.txt", "w")
    # f.write(str(vocab))
    # f.close()

    # f = open("./vocab_bert.txt", "w")
    # f.write(str(tokenizer.get_vocab()))
    # f.close()

    train_dataset = BertDataset(opts.train_file, tokenizer, vocab,
                                seq_len=opts.max_seq_length,
                                max_len=opts.max_sent_length)
    print('train dataset', train_dataset[0])
    # Prepare model
    print('len train dataset', len(train_dataset))

    model = BertForSeq2seq.from_pretrained(opts.bert_model)

    embedding = convert_embedding(
        tokenizer, vocab, model.bert.embeddings.word_embeddings.weight)
    # replace the output embedding layer so it covers the new target vocabulary instead of the original BERT vocabulary
    model.update_output_layer(embedding)

    if opts.fp16:
        model.half()
    model.to(device)

    if opts.local_rank != -1:
        # need to make sure models are the same in the beginning
        params = [p.data for p in model.parameters()]
        broadcast_tensors(params)
    for name, module in model.named_modules():
        # we might want to tune dropout for smaller dataset
        if isinstance(module, torch.nn.Dropout):
            module.p = opts.dropout

    # Prepare optimizer
    param_optimizer = [(n, p) for n, p in model.named_parameters()
                       if 'pooler' not in n]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    if opts.fp16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.contrib.optimizers import FusedAdam

        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=opts.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if opts.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=opts.loss_scale)

    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                             lr=opts.learning_rate)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Batch size = %d", opts.train_batch_size)
    logger.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    logger.info("  Num steps = %d", opts.num_train_steps)

    if opts.local_rank == -1:
        train_sampler = TokenBucketSampler(
            train_dataset.lens,
            bucket_size=8192,
            batch_size=opts.train_batch_size,
            droplast=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_sampler=train_sampler,
                                      num_workers=4,
                                      collate_fn=BertDataset.pad_collate)
    else:
        train_sampler = DistributedTokenBucketSampler(
            n_gpu, opts.local_rank,
            train_dataset.lens,
            bucket_size=8192,
            batch_size=opts.train_batch_size,
            droplast=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_sampler=train_sampler,
                                      num_workers=4,
                                      collate_fn=BertDataset.pad_collate)

    if is_master:
        TB_LOGGER.create(join(opts.output_dir, 'log'))
    running_loss = RunningMeter('loss')
    model.train()
    if is_master:
        pbar = tqdm(total=opts.num_train_steps)
    else:
        logger.disabled = True
        pbar = None
    n_examples = 0
    n_epoch = 0
    start = time()
    while True:
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) if t is not None else t for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            n_examples += input_ids.size(0)
            mask = lm_label_ids != -1
            loss = model(input_ids, segment_ids, input_mask,
                         lm_label_ids, mask, True)
            if opts.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            running_loss(loss.item())
            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1
                if opts.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if opts.fp16 is False, AdamW is used that handles
                    # this automatically
                    lr_this_step = opts.learning_rate * warmup_linear(
                        global_step/opts.num_train_steps,
                        opts.warmup_proportion)
                    if lr_this_step < 0:
                        # safeguard against a possible miscalculation of train steps
                        lr_this_step = 1e-8
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    TB_LOGGER.add_scalar('lr',
                                         lr_this_step, global_step)

                # NOTE running loss not gathered across GPUs for speed
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                if opts.local_rank != -1:
                    # gather gradients from every processes
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))
                optimizer.step()
                optimizer.zero_grad()
                if pbar is not None:
                    pbar.update(1)
                if global_step % 5 == 0:
                    torch.cuda.empty_cache()
                if global_step % 100 == 0:
                    if opts.local_rank != -1:
                        total = sum(all_gather_list(n_examples))
                    else:
                        total = n_examples
                    if is_master:
                        ex_per_sec = int(total / (time()-start))
                        logger.info(f'{total} examples trained at '
                                    f'{ex_per_sec} ex/s')
                        TB_LOGGER.add_scalar('ex_per_s', ex_per_sec,
                                             global_step)

                if global_step % opts.valid_steps == 0:
                    logger.info(f"start validation at Step {global_step}")
                    with torch.no_grad():
                        val_log = validate(model,
                                           opts.valid_src, opts.valid_tgt,
                                           tokenizer, vocab, device,
                                           opts.local_rank)
                    logger.info(f"Val Acc: {val_log['val_acc']}; "
                                f"Val Loss: {val_log['val_loss']}")
                    TB_LOGGER.log_scaler_dict(val_log)
                    if is_master:
                        output_model_file = join(
                            opts.output_dir, 'ckpt',
                            f"model_step_{global_step}.pt")
                        # save cpu checkpoint
                        state_dict = {k: v.cpu() if isinstance(v, torch.Tensor)
                                      else v
                                      for k, v in model.state_dict().items()}
                        torch.save(state_dict, output_model_file)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        if is_master:
            logger.info(f"finished {n_epoch} epochs")
    if opts.num_train_steps % opts.valid_steps != 0:
        with torch.no_grad():
            val_log = validate(model, opts.valid_src, opts.valid_tgt,
                               tokenizer, vocab, device, opts.local_rank)
        TB_LOGGER.log_scaler_dict(val_log)
        if is_master:
            output_model_file = join(opts.output_dir, 'ckpt',
                                     f"model_step_{global_step}.pt")
            # save cpu checkpoint
            state_dict = {k: v.cpu() if isinstance(v, torch.Tensor)
                          else v
                          for k, v in model.state_dict().items()}
            torch.save(state_dict, output_model_file)
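
Both checkpoint saves in Example #3 repeat the same "move every tensor to CPU, then torch.save" pattern. A small helper along these lines could factor it out (a sketch, not code from the original repository):

import torch

def save_cpu_checkpoint(model, output_model_file):
    # Copy the state dict to CPU so the checkpoint can be loaded on a
    # machine or process without the same GPU layout.
    state_dict = {k: v.cpu() if isinstance(v, torch.Tensor) else v
                  for k, v in model.state_dict().items()}
    torch.save(state_dict, output_model_file)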