Пример #1
0
def init_optimizer(args, model, optimization_step):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=int(optimization_step *
                                                      args.warmup),
                                     t_total=optimization_step)

    return optimizer, scheduler
 def get_optimizers(num_training_steps: int, model):
     # no_decay = ['bias', 'gamma', 'beta']
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in model.named_parameters()
                 if not any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             args.weight_decay,
         },
         {
             "params": [
                 p for n, p in model.named_parameters()
                 if any(nd in n for nd in no_decay)
             ],
             "weight_decay":
             0.0,
         },
     ]
     optimizer = AdamW(optimizer_grouped_parameters,
                       lr=args.learning_rate,
                       eps=args.adam_epsilon)
     warmup_steps = args.warmup_proportion * num_training_steps
     scheduler = get_linear_schedule_with_warmup(
         optimizer,
         num_warmup_steps=warmup_steps,
         num_training_steps=num_training_steps)
     return optimizer, scheduler
Пример #3
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'align_mask': batch[2],
                'labels': batch[4]
            }
            inputs['token_type_ids'] = batch[3]
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    if args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            with open(
                                    os.path.join(args.output_dir,
                                                 "{}.txt".format(key)),
                                    'a+') as w:
                                w.write("%d\t%f\n" % (global_step, value))
                    with open(os.path.join(args.output_dir, "loss.txt"),
                              'a+') as w:
                        w.write(
                            "%d\t%f\n" %
                            (global_step,
                             (tr_loss - logging_loss) / args.logging_steps))
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
Пример #4
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )

    parser.add_argument(
        "--xlnet_model",
        default=None,
        type=str,
        required=True,
        help="Either one of the two: 'xlnet-large-cased', 'xlnet-base-cased'.")

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_steps",
        default=100,
        type=int,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    processor = dreamProcessor()
    label_list = processor.get_labels()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    os.makedirs(args.output_dir, exist_ok=True)

    ## only use cased model
    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model,
                                               do_lower_case=False)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / n_class / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    ## prepare model
    model = XLNetForSequenceClassification.from_pretrained(
        args.xlnet_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank),
        num_choices=3)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.weight']
    ## note: no weight decay according to XLNet paper
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        # Adam Epsilon fixed at 1e-6 according to XLNet paper
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        warmup_steps = args.warmup_steps
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        input_ids = []
        input_mask = []
        segment_ids = []
        label_id = []
        for f in train_features:
            input_ids.append([])
            input_mask.append([])
            segment_ids.append([])
            for i in range(n_class):
                ## put three input sequences tgt
                input_ids[-1].append(f[i].input_ids)
                input_mask[-1].append(f[i].input_mask)
                segment_ids[-1].append(f[i].segment_ids)
            label_id.append([f[0].label_id])

        all_input_ids = torch.tensor(input_ids, dtype=torch.long)
        all_input_mask = torch.tensor(input_mask, dtype=torch.long)
        all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        all_label_ids = torch.tensor(label_id, dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for ep in range(int(args.num_train_epochs)):
            max_score = 0
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _, _ = model(input_ids=input_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=input_mask,
                                   labels=label_ids,
                                   n_class=n_class)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    scheduler.step()
                    optimizer.step()  # We have accumulated enought gradients
                    model.zero_grad()
                    global_step += 1

                if step % 800 == 0:
                    logger.info("Training loss: {}, global step: {}".format(
                        tr_loss / nb_tr_steps, global_step))

            if args.do_eval:
                eval_examples = processor.get_dev_examples(args.data_dir)
                eval_features = convert_examples_to_features(
                    eval_examples, label_list, args.max_seq_length, tokenizer)

                logger.info("***** Running Dev Evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)

                input_ids = []
                input_mask = []
                segment_ids = []
                label_id = []

                for f in eval_features:
                    input_ids.append([])
                    input_mask.append([])
                    segment_ids.append([])
                    for i in range(n_class):
                        input_ids[-1].append(f[i].input_ids)
                        input_mask[-1].append(f[i].input_mask)
                        segment_ids[-1].append(f[i].segment_ids)
                    label_id.append([f[0].label_id])

                all_input_ids = torch.tensor(input_ids, dtype=torch.long)
                all_input_mask = torch.tensor(input_mask, dtype=torch.long)
                all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
                all_label_ids = torch.tensor(label_id, dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)
                if args.local_rank == -1:
                    eval_sampler = SequentialSampler(eval_data)
                else:
                    eval_sampler = DistributedSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                logits_all = []
                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits, _ = model(
                            input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids,
                            n_class=n_class)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    for i in range(len(logits)):
                        logits_all += [logits[i]]

                    tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = eval_accuracy / nb_eval_examples

                if args.do_train:
                    result = {
                        'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy,
                        'global_step': global_step,
                        'loss': tr_loss / nb_tr_steps
                    }
                else:
                    result = {
                        'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy
                    }

                output_eval_file = os.path.join(args.output_dir,
                                                "eval_results_test.txt")
                with open(output_eval_file, "a+") as writer:
                    logger.info(" Epoch: %d", (ep + 1))
                    logger.info("***** Eval results *****")
                    writer.write(" Epoch: " + str(ep + 1))
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                # output_eval_file = os.path.join(args.output_dir, "logits_test.txt")
                # with open(output_eval_file, "w") as f:
                #     for i in range(len(logits_all)):
                #         for j in range(len(logits_all[i])):
                #             f.write(str(logits_all[i][j]))
                #             if j == len(logits_all[i])-1:
                #                 f.write("\n")
                #             else:
                #                 f.write(" ")

                if eval_accuracy > max_score:
                    max_score = eval_accuracy
                    ## save trained model
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self
                    output_model_file = os.path.join(
                        args.output_dir,
                        "pytorch_model_{}epoch.bin".format(ep + 1))
                    torch.save(model_to_save.state_dict(), output_model_file)
            else:
                ## save trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(
                    args.output_dir,
                    "pytorch_model_{}epoch.bin".format(ep + 1))
                torch.save(model_to_save.state_dict(), output_model_file)
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    warm_up_steps = int(args.warmup_steps * t_total)
    save_steps = int(args.save_steps * t_total)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    a = []
    b = []
    c = []
    d = []
    optimizer_grouped_parameters = []
    for n, p in model.named_parameters():
        if 'classifier' in n or 'linear_r' in n or 'linear_g' in n:
            if any(nd in n for nd in no_decay):
                a.append(p)
            else:
                b.append(p)
        else:
            if any(nd in n for nd in no_decay):
                c.append(p)
            else:
                d.append(p)
    optimizer_grouped_parameters.append({
        "params": a,
        "weight_decay": 0,
        "lr": 2e-3
    })
    optimizer_grouped_parameters.append({
        "params": b,
        "weight_decay": args.weight_decay,
        "lr": 2e-3
    })
    optimizer_grouped_parameters.append({"params": c, "weight_decay": 0})
    optimizer_grouped_parameters.append({
        "params": d,
        "weight_decay": args.weight_decay
    })
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warm_up_steps,
                                     t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'align_mask': batch[2],
                'labels': batch[4]
            }
            inputs['token_type_ids'] = None
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar('lr_n',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('lr_o',
                                         scheduler.get_lr()[2], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    if args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    tb_writer.close()

    return global_step, tr_loss / global_step
Пример #6
0
def main():
    parser = argparse.ArgumentParser()
    # Required parameters

    parser.add_argument(
        "--output_dir",
        default='output',
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )
    parser.add_argument("--checkpoint",
                        default='pretrain_ckpt/bert_small_ckpt.bin',
                        type=str,
                        help="checkpoint")
    parser.add_argument("--model_config",
                        default='data/bert_small.json',
                        type=str)
    # Other parameters
    parser.add_argument("--train_file",
                        default='data/KorQuAD_v1.0_train.json',
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=96,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=8.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O2',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )

    args = parser.parse_args()

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(
        device, n_gpu, args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer('data/ko_vocab_32k.txt',
                              max_len=args.max_seq_length,
                              do_basic_tokenize=True)
    # Prepare model
    config = Config.from_json_file(args.model_config)
    model = QuestionAnswering(config)
    model.bert.load_state_dict(torch.load(args.checkpoint))
    num_params = count_parameters(model)
    logger.info("Total Parameter: %d" % num_params)
    model.to(device)
    model = torch.nn.DataParallel(model)

    cached_train_features_file = args.train_file + '_{0}_{1}_{2}'.format(
        str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length))
    train_examples = read_squad_examples(input_file=args.train_file,
                                         is_training=True,
                                         version_2_with_negative=False)
    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)
    except:
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("  Saving train features into cached file %s",
                    cached_train_features_file)
        with open(cached_train_features_file, "wb") as writer:
            pickle.dump(train_features, writer)

    num_train_optimization_steps = int(
        len(train_features) / args.train_batch_size) * args.num_train_epochs

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=num_train_optimization_steps * 0.1,
        t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(train_examples))
    logger.info("  Num split examples = %d", len(train_features))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    num_train_step = num_train_optimization_steps

    input_ids = np.load('input_ids2.npy')
    input_mask = np.load('input_mask.npy')
    input_segments = np.load('input_segments.npy')
    start_prob = np.load('start_prob.npy')
    end_prob = np.load('end_prob.npy')
    start_label = np.load('input_start.npy')
    stop_label = np.load('input_stop.npy')
    """
    for i in range(1000):
        print(input_ids[i])
        print(max(start_prob[i]))
        print(sum(start_prob[i]))

        input()
    """
    paragraph = torch.tensor(input_ids.astype(
        np.int64)).type(dtype=torch.long).cuda()
    paragraph_mask = torch.tensor(input_mask.astype(
        np.int64)).type(dtype=torch.long).cuda()
    paragraph_segments = torch.tensor(input_segments.astype(
        np.int64)).type(dtype=torch.long).cuda()
    start_prob = torch.tensor(start_prob.astype(
        np.float32)).type(dtype=torch.float32).cuda()
    end_prob = torch.tensor(end_prob.astype(
        np.float32)).type(dtype=torch.float32).cuda()
    start_label = torch.tensor(start_label.astype(
        np.int64)).type(dtype=torch.long).cuda()
    stop_label = torch.tensor(stop_label.astype(
        np.int64)).type(dtype=torch.long).cuda()

    train_data = TensorDataset(paragraph, paragraph_mask, paragraph_segments,
                               start_label, stop_label, start_prob, end_prob)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    model.train()
    global_step = 0
    epoch = 0
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        iter_bar = tqdm(
            train_dataloader,
            desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)")
        tr_step, total_loss, mean_loss = 0, 0., 0.
        for step, batch in enumerate(iter_bar):
            if n_gpu == 1:
                batch = tuple(
                    t.to(device)
                    for t in batch)  # multi-gpu does scattering it-self
            input_ids, input_mask, segment_ids, start_positions, end_positions, start_probs, end_probs = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions,
                         end_positions, start_probs, end_probs)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            scheduler.step()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
            tr_step += 1
            total_loss += loss
            mean_loss = total_loss / tr_step
            iter_bar.set_description(
                "Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" %
                (global_step, num_train_step, mean_loss, loss.item()))

        logger.info("** ** * Saving file * ** **")
        model_checkpoint = "korquad_%d.bin" % (epoch)
        logger.info(model_checkpoint)
        output_model_file = os.path.join(args.output_dir, model_checkpoint)
        if n_gpu > 1:
            torch.save(model.module.state_dict(), output_model_file)
        else:
            torch.save(model.state_dict(), output_model_file)
        epoch += 1
Пример #7
0
def main(args):
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        logger.info("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    args.data_dir = os.path.join(args.data_dir, args.task_name)
    args.output_dir = os.path.join(args.output_dir, args.task_name)
    logger.info("args = %s", args)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        logger.info("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except:
            logger.info("catch a error")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # use bert to aug train_examples
    ori_train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)

    if args.double_ori == 0:
        num_train_optimization_steps = int(
            len(ori_train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
    else:
        num_train_optimization_steps = int(
            len(ori_train_examples) * 2 / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs

    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    config_class, tokenizer_class = (RobertaConfig, RobertaTokenizer)

    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.use_saved == 1:
        bert_saved_dir = args.ckpt
        if args.co_training:
            model_class = RobertaForNSP_co
            model = model_class.from_pretrained(bert_saved_dir, args=args)

        elif args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(bert_saved_dir)
            tokenizer = tokenizer_class.from_pretrained(bert_saved_dir)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(bert_saved_dir, args=args)

    else:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=task_name,
            cache_dir=args.cache_dir if args.cache_dir else None)
        if args.only_bert:
            model_class = RobertaForSequenceClassification
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None)
        else:
            model_class = RobertaForNSPAug
            model = model_class.from_pretrained(
                args.model_name_or_path,
                from_tf=bool('.ckpt' in args.model_name_or_path),
                config=config,
                cache_dir=args.cache_dir if args.cache_dir else None,
                args=args)

    model.cuda()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    cnt = np.sum(np.prod(v.size())
                 for name, v in model.named_parameters()) / 1e6
    logger.info("cnt %s", str(cnt))

    if args.do_first_eval:
        args.do_train = False
        res_file = os.path.join(args.output_dir, "first_test.tsv")
        eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
            do_evaluate(args, processor, label_list, tokenizer, model, 0, output_mode, num_labels, task_name,
                    eval_examples, type="dev")

        eval_res.update(res_parts)
        for key in sorted(eval_res.keys()):
            logger.info("first evaluation:  %s = %s", key, str(eval_res[key]))

        idx, preds = do_test(args, label_list, task_name, processor, tokenizer,
                             output_mode, model)

        dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
        dataframe.to_csv(res_file, index=False, sep='\t')
        logger.info("  Num test length = %d", idx)
        logger.info("  Done ")

        # write mm test results
        if task_name == "mnli":
            res_file = os.path.join(args.output_dir, "first_test_mm.tsv")

            idx, preds = do_test(args,
                                 label_list,
                                 task_name,
                                 processor,
                                 tokenizer,
                                 output_mode,
                                 model,
                                 do_mm=True)

            dataframe = pd.DataFrame({
                'index': range(idx),
                'prediction': preds
            })
            dataframe.to_csv(res_file, index=False, sep='\t')
            logger.info("  Num test length = %d", idx)
            logger.info("  Done write mm")

    if args.do_train:
        # Prepare optimizer
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(args.warmup_rate *
                                 num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)

        global_step = 0
        best_val_acc = 0.0
        first_time = time.time()

        logger.info("***** Running training *****")
        logger.info("  Num original examples = %d", len(ori_train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        aug_ratio = 0.0
        aug_seed = np.random.randint(0, 1000)
        for epoch in range(int(args.num_train_epochs)):
            if args.only_bert:
                train_features = convert_examples_to_features(
                    ori_train_examples,
                    label_list,
                    args.max_seq_length,
                    tokenizer,
                    num_show=args.num_show,
                    output_mode=output_mode,
                    args=args,
                    pad_token=tokenizer.convert_tokens_to_ids(
                        [tokenizer.pad_token])[0],
                    do_roberta=1)
            else:
                logger.info("epoch=%d,  aug_ratio = %f,  aug_seed=%d", epoch,
                            aug_ratio, aug_seed)
                train_examples = Aug_each_ckpt(
                    ori_train_examples,
                    label_list,
                    model,
                    tokenizer,
                    args=args,
                    num_show=args.num_show,
                    output_mode=output_mode,
                    seed=aug_seed,
                    aug_ratio=aug_ratio,
                    use_bert=False,
                    do_roberta=1,
                    ssa_roberta=1,
                    pad_token=tokenizer.convert_tokens_to_ids(
                        [tokenizer.pad_token])[0])
                if aug_ratio + args.aug_ratio_each < 1.0:
                    aug_ratio += args.aug_ratio_each
                aug_seed += 1

                train_features = convert_examples_to_features(
                    train_examples,
                    label_list,
                    args.max_seq_length,
                    tokenizer,
                    num_show=args.num_show,
                    output_mode=output_mode,
                    args=args,
                    pad_token=tokenizer.convert_tokens_to_ids(
                        [tokenizer.pad_token])[0],
                    do_roberta=1)

            logger.info("Done convert features")
            all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float)

            token_real_label = torch.tensor(
                [f.token_real_label for f in train_features], dtype=torch.long)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_label_ids,
                                       token_real_label)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            logger.info("begin training")
            tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0
            nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0
            preds = []
            all_labels = []
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, token_real_label = batch
                if args.only_bert:
                    outputs = model(input_ids, input_mask)
                    seq_logits = outputs[0]
                else:
                    seq_logits, aug_logits, aug_loss = model(
                        input_ids,
                        input_mask,
                        labels=None,
                        token_real_label=token_real_label)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    seq_loss = loss_fct(seq_logits.view(-1, num_labels),
                                        label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    seq_loss = loss_fct(seq_logits.view(-1),
                                        label_ids.view(-1))

                token_real_label = token_real_label.detach().cpu().numpy()

                w = args.aug_loss_weight
                if args.only_bert:
                    loss = seq_loss
                else:
                    loss = (1 - w) * seq_loss + w * aug_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                total_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), 10000.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                tr_seq_loss += seq_loss.mean().item()
                seq_logits = seq_logits.detach().cpu().numpy()
                label_ids = label_ids.detach().cpu().numpy()
                if len(preds) == 0:
                    preds.append(seq_logits)
                    all_labels.append(label_ids)
                else:
                    preds[0] = np.append(preds[0], seq_logits, axis=0)
                    all_labels[0] = np.append(all_labels[0], label_ids, axis=0)

                if args.only_bert == 0:
                    aug_logits = aug_logits.detach().cpu().numpy()
                    tmp_train_aug_accuracy, tmp_tokens = accuracy(
                        aug_logits, token_real_label, type="aug")
                    train_aug_accuracy += tmp_train_aug_accuracy
                    nb_tr_tokens += tmp_tokens
                    tr_aug_loss += aug_loss.mean().item()

                if global_step % 20 == 0:
                    loss = tr_loss / nb_tr_steps
                    seq_loss = tr_seq_loss / nb_tr_steps
                    aug_loss = tr_aug_loss / nb_tr_steps
                    tmp_pred = preds[0]
                    tmp_labels = all_labels[0]
                    if output_mode == "classification":
                        tmp_pred = np.argmax(tmp_pred, axis=1)
                    elif output_mode == "regression":
                        tmp_pred = np.squeeze(tmp_pred)
                    res = accuracy(tmp_pred, tmp_labels, task_name=task_name)

                    if nb_tr_tokens != 0:
                        aug_avg = train_aug_accuracy / nb_tr_tokens
                    else:
                        aug_avg = 0.0
                    log_string = ""
                    log_string += "epoch={:<5d}".format(epoch)
                    log_string += " step={:<9d}".format(global_step)
                    log_string += " total_loss={:<9.7f}".format(loss)
                    log_string += " seq_loss={:<9.7f}".format(seq_loss)
                    log_string += " aug_loss={:<9.7f}".format(aug_loss)
                    log_string += " lr={:<9.7f}".format(scheduler.get_lr()[0])
                    log_string += " |g|={:<9.7f}".format(total_norm)
                    #log_string += " tr_seq_acc={:<9.7f}".format(seq_avg)
                    log_string += " tr_aug_acc={:<9.7f}".format(aug_avg)
                    log_string += " mins={:<9.2f}".format(
                        float(time.time() - first_time) / 60)
                    for key in sorted(res.keys()):
                        log_string += "  " + key + "= " + str(res[key])
                    logger.info(log_string)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

            train_loss = tr_loss / nb_tr_steps

            if args.do_eval and (args.local_rank == -1
                                 or torch.distributed.get_rank()
                                 == 0) and epoch % 1 == 0:
                eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
                    do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode, num_labels, task_name, eval_examples, type="dev")

                if "acc" in eval_res:
                    tmp_acc = eval_res["acc"]
                elif "mcc" in eval_res:
                    tmp_acc = eval_res["mcc"]
                else:
                    tmp_acc = eval_res["corr"]

                if tmp_acc >= best_val_acc:
                    best_val_acc = tmp_acc
                    dev_test = "dev"

                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self
                    output_model_dir = os.path.join(args.output_dir,
                                                    "dev_" + str(tmp_acc))
                    if not os.path.exists(output_model_dir):
                        os.makedirs(output_model_dir)
                    model_to_save.save_pretrained(output_model_dir)
                    tokenizer.save_pretrained(output_model_dir)
                    output_model_file = os.path.join(output_model_dir,
                                                     'pytorch_model.bin')
                    torch.save(model_to_save.state_dict(), output_model_file)

                    result = {
                        'eval_total_loss': eval_loss,
                        'eval_seq_loss': eval_seq_loss,
                        'eval_aug_loss': eval_aug_loss,
                        'eval_aug_accuracy': eval_aug_accuracy,
                        'global_step': global_step,
                        'train_loss': train_loss,
                        'best_epoch': epoch,
                        'train_batch_size': args.train_batch_size,
                        'args': args
                    }

                    result.update(eval_res)
                    result.update(res_parts)

                    output_eval_file = os.path.join(
                        args.output_dir,
                        dev_test + "_results_" + str(tmp_acc) + ".txt")
                    with open(output_eval_file, "w") as writer:
                        logger.info("***** Test results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    # write test results
                    if args.do_test:
                        res_file = os.path.join(
                            args.output_dir, "test_" + str(tmp_acc) + ".tsv")

                        idx, preds = do_test(args, label_list, task_name,
                                             processor, tokenizer, output_mode,
                                             model)

                        dataframe = pd.DataFrame({
                            'index': range(idx),
                            'prediction': preds
                        })
                        dataframe.to_csv(res_file, index=False, sep='\t')
                        logger.info("  Num test length = %d", idx)
                        logger.info("  Done ")

                        # write mm test results
                        if task_name == "mnli":
                            res_file = os.path.join(
                                args.output_dir, "mm_roberta_results_b_" +
                                str(tmp_acc) + ".tsv")

                            idx, preds = do_test(args,
                                                 label_list,
                                                 task_name,
                                                 processor,
                                                 tokenizer,
                                                 output_mode,
                                                 do_mm=True)

                            dataframe = pd.DataFrame({
                                'index': range(idx),
                                'prediction': preds
                            })
                            dataframe.to_csv(res_file, index=False, sep='\t')
                            logger.info("  Num test length = %d", idx)
                            logger.info("  Done write mm")

                else:
                    logger.info("  tmp_val_acc = %f", tmp_acc)
Пример #8
0
    def train(self):

        from datetime import datetime
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        task = self.args.task
        tb_writer = SummaryWriter(log_dir='./runs/' + task + "/" +
                                  current_time + self.args.prefix,
                                  comment=self.args.prefix)

        vocabs, lexical_mapping = self._build_model()

        train_data = DataLoader(self.args,
                                vocabs,
                                lexical_mapping,
                                self.args.train_data,
                                self.args.batch_size,
                                for_train=True)
        dev_data = DataLoader(self.args,
                              vocabs,
                              lexical_mapping,
                              self.args.dev_data,
                              self.args.batch_size,
                              for_train=False)
        test_data = DataLoader(self.args,
                               vocabs,
                               lexical_mapping,
                               self.args.test_data,
                               self.args.batch_size,
                               for_train='Eval')

        train_data.set_unk_rate(self.args.unk_rate)

        # WRITE PARAMETERS
        with open('./' + 'param' + '.txt', 'w') as f:

            for name, param in self.model.named_parameters():
                f.writelines('name:' + name + "\n")
                f.writelines(str(param))
                f.writelines('size:' + str(param.size()) + '\n')

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.
        }, {
            'params': [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        gradient_accumulation_steps = 1
        t_total = len(
            train_data) // gradient_accumulation_steps * self.args.epochs

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.args.lr,
                          eps=self.args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.args.warmup_steps,
                                         t_total=t_total)

        self.model.zero_grad()

        set_seed(42, self.args.gpus)

        batches_acm, loss_acm = 0, 0

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Task: %s", self.args.task)
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Num Epochs = %d", self.args.epochs)
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Running Language Model = %s", self.args.lm_model)
        logger.info("  Running Model = %s", self.args.encoder_type)

        best_acc = 0
        best_model_wts = copy.deepcopy(self.model.state_dict())
        total_steps = 0

        train_iterator = trange(int(self.args.epochs), desc="Epoch")

        # initialize the early_stopping object
        early_stopping = EarlyStopping(patience=self.args.patience,
                                       verbose=True)

        for _ in train_iterator:
            epoch_iterator = tqdm(train_data, desc="Iteration")

            running_loss = 0.0
            running_corrects = 0

            batch_count = self.args.batch_multiplier

            # Turn on the train mode
            for step, batch in enumerate(epoch_iterator):

                self.model.train()
                batch = move_to_cuda(batch, self.device)

                logits, labels, ans_ids = self.model(batch, train=True)
                logits_for_pred = logits.clone().detach()
                loss = self.criterion(logits, labels)
                loss_value = loss.item()

                pred_values, pred_indices = torch.max(logits_for_pred, 1)
                labels = labels.tolist()
                pred = pred_indices.tolist()
                corrects = [i for i, j in zip(labels, pred) if i == j]

                # Statistics
                running_loss += loss.item()
                running_corrects += len(corrects)

                if batch_count == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   1.0)

                    optimizer.step()
                    scheduler.step()

                    total_steps += 1
                    optimizer.zero_grad()
                    self.model.zero_grad()

                    batch_count = self.args.batch_multiplier

                loss_acm += loss_value

                loss.backward()
                batch_count -= 1

                if (batches_acm %
                    (self.args.batch_multiplier * self.args.batch_size)
                        == 0) & (batches_acm != 0) & (step != 0):
                    logger.info(
                        'Train Epoch %d, Batch %d, loss %.3f, Accuracy %.3f',
                        _, batches_acm, loss_acm / batches_acm,
                        running_corrects / (self.args.batch_size * step))
                    tb_writer.add_scalar('Training_loss',
                                         loss_acm / batches_acm, batches_acm)
                    tb_writer.add_scalar(
                        'Training_Accuracy',
                        running_corrects / (self.args.batch_size * step))
                    torch.cuda.empty_cache()
                batches_acm += 1

            epoch_loss = running_loss / batches_acm
            epoch_acc = running_corrects / len(train_data)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(_, epoch_loss,
                                                       epoch_acc))

            tb_writer.add_scalar('Training_Epoch_loss', epoch_loss, _)
            tb_writer.add_scalar('Training_Epoch_Accuracy', epoch_acc, _)

            # Evaluate on Development Set
            eval_epoch_acc, eval_epoch_loss = self._run_evaluate(
                dev_data, _, write_answer=False)

            print('Overall_Dev Acc: {:.4f}'.format(eval_epoch_acc))

            tb_writer.add_scalar('Dev_Epoch_Accuracy', eval_epoch_acc, _)

            ##################################

            # Evaluate on Test Set
            test_epoch_acc, test_epoch_loss = self._run_evaluate(
                test_data, _, write_answer=True)

            print('Overall_Test Acc: {:.4f}'.format(test_epoch_acc))
            tb_writer.add_scalar('Test_Epoch_Accuracy', test_epoch_acc, _)

            # Save only best accuracy model on dev set
            if eval_epoch_acc > best_acc:
                best_acc = eval_epoch_acc
                best_model_wts = copy.deepcopy(self.model.state_dict())

            # early_stopping needs the validation loss to check if it has decresed,
            # and if it has, it will make a checkpoint of the current model
            early_stopping(epoch_acc, self.model)
            if early_stopping.early_stop:
                print("Early stopping")
                break

            self.model.train()

        logger.info('Best val Acc: {:4f}'.format(best_acc))

        torch.save(
            {
                'args': self.save_args,
                'model': best_model_wts
            }, '%s/epoch%d_batch%d_model_best_%s' %
            (self.args.ckpt, self.args.epochs, batches_acm, self.args.prefix))
Пример #9
0
def train(config):
    # electra config 객체 생성
    electra_config = ElectraConfig.from_pretrained(config["train_model_path"], num_labels=config["num_labels"], cache_dir=config["cache_dir_path"])
    
    # electra tokenizer 객체 생성
    electra_tokenizer = ElectraTokenizer.from_pretrained(config["train_model_path"], do_lower_case=False, cache_dir=config["cache_dir_path"])

    # electra model 객체 생성
    electra_model = ElectraForSequenceClassification.from_pretrained(config["train_model_path"], config=electra_config, cache_dir=config["cache_dir_path"])

    # electra_model.cuda()

    # 학습 데이터 읽기
    train_datas = read_data(file_path=config["train_data_path"])

    # 학습 데이터 전처리
    train_dataset = convert_data2dataset(datas=train_datas, tokenizer=electra_tokenizer, max_length=config["max_length"])

    # 학습 데이터를 batch 단위로 추출하기 위한 DataLoader 객체 생성
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config["batch_size"])

    # 평가 데이터 읽기
    test_datas = read_data(file_path=config["test_data_path"])

    # 평가 데이터 전처리
    test_dataset = convert_data2dataset(datas=test_datas, tokenizer=electra_tokenizer, max_length=config["max_length"])

    # 평가 데이터를 batch 단위로 추출하기 위한 DataLoader 객체 생성
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100)

    # 전체 학습 횟수(batch 단위)
    t_total = len(train_dataloader) // config["gradient_accumulation_steps"] * config["epoch"]

    # 모델 학습을 위한 optimizer
    optimizer = AdamW(electra_model.parameters(), lr=config["learning_rate"])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config["warmup_steps"], num_training_steps=t_total)

    if os.path.isfile(os.path.join(config["model_dir_path"], "optimizer.pt")) and os.path.isfile(
            os.path.join(config["model_dir_path"], "scheduler.pt")):
        # 기존에 학습했던 optimizer와 scheduler의 정보 불러옴
        optimizer.load_state_dict(torch.load(os.path.join(config["model_dir_path"], "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(config["model_dir_path"], "scheduler.pt")))

    global_step = 0
    electra_model.zero_grad()
    max_test_accuracy = 0
    for epoch in range(config["epoch"]):
        electra_model.train()

        # 학습 데이터에 대한 정확도와 평균 loss
        train_accuracy, average_loss, global_step = do_train(config=config, electra_model=electra_model,
                                                             optimizer=optimizer, scheduler= scheduler,
                                                             train_dataloader=train_dataloader,
                                                             epoch=epoch+1, global_step=global_step)

        print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4)))

        electra_model.eval()

        # 평가 데이터에 대한 정확도
        test_accuracy = do_evaluate(electra_model=electra_model, test_dataloader=test_dataloader, mode=config["mode"])

        print("test_accuracy : {}\n".format(round(test_accuracy, 4)))

        # 현재의 정확도가 기존 정확도보다 높은 경우 모델 파일 저장
        if(max_test_accuracy < test_accuracy):
            max_test_accuracy = test_accuracy

            output_dir = os.path.join(config["model_dir_path"], "checkpoint-{}".format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            print("save model in checkpoint-{}\n".format(global_step))
            
            electra_config.save_pretrained(output_dir)
            electra_tokenizer.save_pretrained(output_dir)
            electra_model.save_pretrained(output_dir)
            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
Пример #10
0
def main():
    parser = argparse.ArgumentParser()

    # Required Parameters
    parser.add_argument(
        "--output_dir",
        default='output',
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )
    parser.add_argument("--checkpoint",
                        default='pretrain_ckpt/bert_small_ckpt.bin',
                        type=str,
                        help="checkpoint")
    parser.add_argument("--resume_checkpoint",
                        default=False,
                        type=bool,
                        help="resume")
    parser.add_argument('--log_dir', default='./runs', type=str)

    # Other Parameters
    parser.add_argument("--train_feature",
                        default='./rsc/train_features.hdf5',
                        type=str,
                        help="SQuAD corpus for post-training.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=4.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument("--num_workers",
                        default=8,
                        type=int,
                        help="Proportion of workers of DataLoader")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O2',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")

    args = parser.parse_args()

    summary_writer = SummaryWriter(args.log_dir)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(
        device, n_gpu, args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Prepare model
    model = SampleCNN()

    # Multi-GPU Setting
    # if n_gpu > 1:
    #     model = nn.DataParallel(model)

    num_params = count_parameters(model)
    logger.info("Total Parameter: %d" % num_params)
    model.to(device)

    post_training_dataset = SpeechDataset('./rsc/train.hdf5')

    num_train_optimization_steps = int(
        len(post_training_dataset) /
        args.train_batch_size) * args.num_train_epochs

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(
        optimizer,
        warmup_steps=num_train_optimization_steps * 0.1,
        t_total=num_train_optimization_steps)

    loss_fn = nn.KLDivLoss(reduction='batchmean').to(device)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    logger.info("***** Running training *****")
    logger.info("  Num orig examples = %d", len(post_training_dataset))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    num_train_step = num_train_optimization_steps

    train_dataloader = DataLoader(post_training_dataset,
                                  batch_size=args.train_batch_size,
                                  num_workers=16,
                                  pin_memory=True)
    model.train()
    global_step = 0
    epoch = 0

    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        iter_bar = tqdm(
            train_dataloader,
            desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)")
        tr_step, total_loss, mean_loss = 0, 0., 0.

        for step, batch in enumerate(iter_bar):
            feature = batch['feature'].float().to(device)
            label = batch['label'].float().to(device)
            output = model(feature)

            # loss = -F.kl_div(output, label, reduction='batchmean')

            loss = loss_fn(output, label)
            # if n_gpu > 1:
            #     loss = loss.mean()

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            tr_step += 1
            total_loss += loss
            mean_loss = total_loss / tr_step
            iter_bar.set_description(
                "Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" %
                (global_step, num_train_step, mean_loss, loss.item()))

            if global_step % 100 == 0:
                print('output ', output)
                summary_writer.add_scalar('Train/Total_Mean_Loss', mean_loss,
                                          global_step)
                summary_writer.add_scalar('Train/Total_Loss', loss.item(),
                                          global_step)

        logger.info("***** Saving file *****")

        if args.resume_checkpoint:
            model_checkpoint = "pt_bert_from_checkpoint_%d.bin" % (epoch)
        else:
            model_checkpoint = "pt_scnn_%d.bin" % (epoch)

        logger.info(model_checkpoint)
        output_model_file = os.path.join(args.output_dir, model_checkpoint)
        # if n_gpu > 1:
        #     torch.save(model.module.state_dict(), output_model_file)
        # else:
        torch.save(model.state_dict(), output_model_file)
        epoch += 1
Пример #11
0
            print(" >> Distribute the model")
        else:
            model_ft.cuda()

        params_to_update = model_ft.parameters()
        # Observe that all parameters are being optimized
        if optim_type == 'Adam':
            optimizer_ft = optim.Adam(params_to_update, lr=begin_lr)
        if optim_type == 'SGD':
            optimizer_ft = optim.SGD(params_to_update,
                                     lr=begin_lr,
                                     momentum=0.9)
        else:
            optimizer_ft = AdamW(params_to_update,
                                 lr=begin_lr,
                                 betas=(0.9, 0.999),
                                 eps=1e-8,
                                 weight_decay=1e-2)

        if lr_schedule == 'plateau':
            # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_ft,
            #                                                         mode='min',
            #                                                         factor=0.2,
            #                                                         patience=1,
            #                                                         verbose=False,
            #                                                         threshold=1e-4,
            #                                                         threshold_mode='rel',
            #                                                         cooldown=2,
            #                                                         min_lr=0,
            #                                                         eps=1e-8)
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_ft,
Пример #12
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    warm_up_steps = int(args.warmup_steps * t_total)
    save_steps = int(args.save_steps * t_total)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warm_up_steps,
                                     t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            inputs[
                'token_type_ids'] = None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]

            label_ids = torch.nn.functional.one_hot(batch[3]).float()
            tsa_start = 0.5
            tsa_threshold = get_tsa_threshold("exp_schedule",
                                              global_step,
                                              t_total,
                                              tsa_start,
                                              end=1)
            larger_than_threshold = torch.exp(-loss) > tsa_threshold
            loss_mask = torch.ones_like(label_ids) * (
                1 - larger_than_threshold.float())
            loss = torch.sum(loss * loss_mask) / torch.max(
                torch.sum(loss_mask),
                torch.tensor(1.0).cuda())

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    if args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    tb_writer.close()

    return global_step, tr_loss / global_step