예제 #1
0
class OneCycleOptimizer(BertAdam):
    """BertAdam subclass whose reported learning rates are rescaled by a cyclic factor.

    A scalar ``curr`` starts at ``start`` and is moved by
    ``2 * (stop - start) / steps`` on every call to :meth:`get_lr`. The
    direction flips once ``curr`` exceeds ``stop`` (while rising) or drops
    below ``start`` (while falling).

    A second, internal ``BertAdam`` (``self.bertadam``) is built over the same
    parameters with the same settings; :meth:`get_lr` reads its learning rates.
    NOTE(review): nothing in this class steps ``self.bertadam`` -- confirm that
    reading its rates (rather than this instance's) is intended.
    """

    def __init__(self,
                 params,
                 start,
                 stop,
                 warmup=-1,
                 t_total=-1,
                 schedule='warmup_linear',
                 b1=0.9,
                 b2=0.999,
                 e=1e-6,
                 weight_decay_rate=0.01,
                 steps=200,
                 max_grad_norm=1.0):
        """
        :param params: parameters (or parameter groups) to optimize
        :param start: initial learning rate, also the lower end of the cycle
        :param stop: upper end of the cycle
        :param steps: number of ``get_lr`` calls for one full start->stop->start cycle
        The remaining arguments are forwarded unchanged to ``BertAdam``.
        """
        # `params` is consumed by two optimizer constructors below; a one-shot
        # iterable such as `model.parameters()` would be empty for the second.
        params = list(params)
        defaults = dict(lr=start,
                        schedule=schedule,
                        warmup=warmup,
                        t_total=t_total,
                        b1=b1,
                        b2=b2,
                        e=e,
                        weight_decay_rate=weight_decay_rate,
                        max_grad_norm=max_grad_norm)
        self.bertadam = BertAdam(params, **defaults)
        self.dir = 1            # +1 while rising towards `stop`, -1 while falling
        self.start = start
        self.stop = stop
        self.curr = start
        self.incr = 2 * (stop - start) / steps
        super(OneCycleOptimizer, self).__init__(params, **defaults)

    def get_lr(self):
        """Advance the cycle by one increment and return the rescaled learning rates.

        :return: list with one entry per learning rate reported by the inner
            BertAdam, each divided by the updated cycle value ``curr``
        """
        lr_list = self.bertadam.get_lr()
        if self.dir == 1:
            self.curr += self.incr
            if self.curr > self.stop:
                self.dir = -1
        elif self.dir == -1:
            self.curr -= self.incr
            if self.curr < self.start:
                self.dir = 1
        # Divide element-wise: a plain Python list does not support `/`, so
        # `lr_list / self.curr` raised TypeError. This form works for any iterable.
        return [lr / self.curr for lr in lr_list]
def main():
    """Parse command-line options, then run inference or train a BertTransformer on DROP data.

    Flow, as implemented below:

    1. Parse arguments, choose the device (single process, or distributed when
       ``--local_rank`` is set) and seed the RNGs.
    2. Load the tokenizer and the model (from ``--init_weights_dir`` when given,
       otherwise from ``--model``).
    3. Build the eval dataloader; with ``--do_eval --do_inference`` run
       ``inference`` and exit.
    4. With ``--do_train``: build the train dataloader and a BertAdam optimizer,
       optionally add one masked-LM batch per training step
       (``--mlm_batch_size > 0``), log scalars to TensorBoard, and evaluate
       periodically, saving a checkpoint when the eval error improves.

    Raises:
        ValueError: if neither ``--do_train`` nor ``--do_eval`` is given.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--examples_n_features_dir",
                        default='data/examples_n_features/',
                        type=str,
                        help="Dir containing drop examples and features.")
    parser.add_argument("--mlm_dir",
                        default='../data/MLM_train/',
                        type=Path,
                        help="The data dir with MLM taks. Should contain the .jsonl files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default='./out_drop_finetune',
                        type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--model", default="bert-base-uncased", type=str)
    parser.add_argument("--init_weights_dir",
                        default='',
                        type=str,
                        help="The directory where init model wegihts an config are stored.")
    parser.add_argument("--max_seq_length",
                        default=-1,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_inference",
                        action='store_true',
                        help="Whether to run inference on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--mlm_batch_size",
                        default=-1,
                        type=int,
                        help="Total batch size for mlm train data.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_samples",
                        default=-1,
                        type=int,
                        help="Total number of training samples used.")
    parser.add_argument("--num_train_epochs",
                        default=6.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--freeze_encoder',
                        action='store_true',
                        help="Whether to freeze the bert encoder, embeddings.")
#     parser.add_argument('--indiv_digits',
#                         action='store_true',
#                         help="Whether to tokenize numbers as digits.")
    parser.add_argument('--rand_init',
                        action='store_true',
                        help="Whether to use random init instead of BERT.")
    parser.add_argument('--random_shift',
                        action='store_true',
                        help="Whether to randomly shift position ids of encoder input.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--mlm_scale',
                        type=float, default=1.0,
                        help="mlm loss scaling factor.")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    # Single-process mode uses every visible GPU (or the CPU); distributed mode
    # pins this process to the GPU given by --local_rank.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # --train_batch_size is the total per update; each forward pass gets this share of it.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if args.do_train:
        make_output_dir(args, scripts_to_save=[sys.argv[0], 'modeling.py', 'create_examples_n_features.py'])

    # NOTE(review): the tokenizer is hard-coded; --bert_model and --do_lower_case
    # are parsed above but not read anywhere in this function.
    #tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    
    if args.init_weights_dir:
        model = BertTransformer.from_pretrained(args.init_weights_dir)
    else:
        # prepare model
        model = BertTransformer.from_pretrained(args.model,
            cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
    
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    
    # The eval set is loaded (and the banner below logged) unconditionally,
    # because training also evaluates on it.
    eval_data = DropDataset(args, 'eval')
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_data))
    logger.info("  Batch size = %d", args.eval_batch_size)
    
    # Inference-only run: terminates the process afterwards, so --do_train is
    # ignored when both --do_eval and --do_inference are given.
    if args.do_eval and args.do_inference:
        inference(args, model, eval_dataloader, device, tokenizer)
        exit()
        
    if args.do_train:
        # Prepare data loader
        train_data = DropDataset(args, 'train')
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # --num_train_epochs is parsed as float, so this product is a float.
        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
        
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it produce None grad that break apex
        # NOTE(review): as written this comprehension copies the list without
        # filtering anything out.
        param_optimizer = [n for n in param_optimizer]

        # Parameters whose name contains one of these substrings get no weight decay.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=args.learning_rate,
                     warmup=args.warmup_proportion,
                     t_total=num_train_optimization_steps)
        
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        
        '''
        ------------------------------------------------------------------------------
        TODO: check training resume, fp16, use --random_shift for short inputs
        ------------------------------------------------------------------------------
        '''
        # using fp16
        # fp16 is hard-wired off: the apex import below is commented out, so the
        # `if fp16:` branches in the loop (which reference `amp`) are never taken
        # and --fp16 / --loss_scale only affect the log line above.
        fp16 = False
#         try:
#             from apex import amp
#             model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
#             fp16 = True
#         except ImportError:
#             logger.info("Not using 16-bit training due to apex import error.")
        
#         if n_gpu > 1:
#             model = torch.nn.DataParallel(model)
        
        tb_writer = SummaryWriter(os.path.join(args.output_dir, 'log'))  # tensorboard
        
        # masked LM data
        do_mlm_task = False 
        if args.mlm_batch_size > 0:
            mlm_dataset = MLMDataset(training_path=args.mlm_dir, tokenizer=tokenizer)
            mlm_dataloader = DataLoader(mlm_dataset, sampler=RandomSampler(mlm_dataset), batch_size=args.mlm_batch_size)
            mlm_iter = iter(mlm_dataloader)
            do_mlm_task = True
            
        model.train()
        # Per-forward-pass histories; `best` / `best_mlm` start at 1000 so the
        # first evaluation always counts as an improvement.
        (global_step, all_losses, all_errors, all_dec_losses, all_dec_errors, eval_errors,
         best, best_mlm, t_prev, do_eval) = 0, [], [], [], [], [], 1000, 1000, time.time(), False
        mlm_losses, mlm_errors, all_span_losses, all_span_errors = [], [], [], []
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # grads wrt to train data
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, head_type, q_spans, p_spans = batch
                
                losses = model(input_ids, segment_ids, input_mask, random_shift=args.random_shift, target_ids=label_ids,
                               target_mask=None, answer_as_question_spans=q_spans, answer_as_passage_spans=p_spans,
                               head_type=head_type)
                loss, errs, dec_loss, dec_errors, span_loss, span_errors, type_loss, type_errors, type_preds = losses
                
                # aggregate on multi-gpu
                # Losses are averaged and error counts summed; values that are
                # None or whose dimension sizes sum to <= 1 pass through unchanged.
                take_mean = lambda x: x.mean() if x is not None and sum(x.size()) > 1 else x
                take_sum = lambda x: x.sum() if x is not None and sum(x.size()) > 1 else x
                [loss, dec_loss, span_loss, type_loss] = list(map(take_mean, [loss, dec_loss, span_loss, type_loss]))
                [errs, dec_errors, span_errors, type_errors] = list(map(take_sum, [errs, dec_errors, span_errors, type_errors]))
                
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                # Error counts are stored as a fraction of the batch size.
                all_losses.append(loss.item()); all_dec_losses.append(dec_loss.item()); 
                all_errors.append(errs.item() / input_ids.size(0))
                all_dec_errors.append(dec_errors.item() / input_ids.size(0))
                all_span_losses.append(span_loss.item()); all_span_errors.append(span_errors.item() / input_ids.size(0))
         
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                
                if do_mlm_task:
                    # grads wrt to mlm data
                    while True:
                        try:
                            batch = next(mlm_iter)  # sample next mlm batch
                            break
                        except StopIteration:       # end of epoch: reset and shuffle
                            mlm_iter = iter(mlm_dataloader)
                    batch = tuple(t.to(device) for t in batch)
                    # Note: this rebinds input_ids / label_ids / loss / errs from the DROP batch above.
                    input_ids, input_mask, label_ids = batch
                    loss, errs = model(input_ids, None, input_mask, target_ids=label_ids, 
                                       ignore_idx=IGNORE_IDX, task='mlm')
                    loss, err_sum = take_mean(loss), take_sum(errs)      # for multi-gpu
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    loss = args.mlm_scale * loss
                    mlm_losses.append(loss.item()); mlm_errors.append(err_sum.item() / input_ids.size(0))

                    # Second backward pass: MLM gradients accumulate on top of the task gradients.
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                else:
                    # -1 is a placeholder so the MLM histories stay aligned with all_losses.
                    mlm_losses.append(-1); mlm_errors.append(-1)
                
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()          # update step
                    optimizer.zero_grad()
                    global_step += 1
                    
                # Scalars are indexed by forward-pass count (len(all_losses)), not by global_step.
                train_result = {'trn_loss': all_losses[-1], 'trn_dec_loss': all_dec_losses[-1], 
                                'trn_err': all_errors[-1], 'trn_dec_err': all_dec_errors[-1], 
                                'lr': optimizer.get_lr()[0], 'trn_span_loss': all_span_losses[-1], 
                                'trn_span_err': all_span_errors[-1], 'epoch': epoch}
                mlm_result = {'trn_mlm_loss': mlm_losses[-1], 'trn_mlm_err': mlm_errors[-1]}
                tb_writer.add_scalars('train', train_result, len(all_losses))
                tb_writer.add_scalars('mlm', mlm_result, len(all_losses)) if do_mlm_task else None
    
                if time.time() - t_prev > 60*60: # evaluate every hr
                    do_eval = True
                    t_prev = time.time()
            
                if do_eval:
                    do_eval = False
                    eval_result = evaluate(args, model, eval_dataloader, device, len(train_data))
                    eval_err = eval_result['eval_err']
                    # Save on a new best eval error, or when within 0.005 of the best
                    # and the mean of the last 1000 MLM errors beats best_mlm.
                    if eval_err < best or (eval_err < best + 0.005 and np.mean(mlm_errors[-1000:]) < best_mlm):
                        # if eval err is in range of best, look at MLM err
                        train_state = {'global_step': global_step, 'optimizer_state_dict': optimizer.state_dict()}
                        train_state.update(train_result)
                        save(args, model, tokenizer, train_state)
                        best_mlm = min(best_mlm, np.mean(mlm_errors[-1000:]))
                    best = min(best, eval_err)
                    eval_errors.append((len(all_losses), eval_err))
                    # Return to training mode after the evaluation call.
                    model.train()
            
                    tb_writer.add_scalars('eval', eval_result, len(all_losses))
#                     for name, param in model.named_parameters():
#                         tb_writer.add_histogram(name, param.clone().cpu().data.numpy(), len(all_losses))
            # end of epoch
            # NOTE(review): this flag is only checked inside the batch loop, so the
            # evaluation runs after the first batch of the NEXT epoch and is never
            # triggered after the final epoch -- confirm whether that is intended.
            do_eval = True

        # training complete
        tb_writer.export_scalars_to_json(os.path.join(args.output_dir, 'training_scalars.json'))
        tb_writer.close()
예제 #3
0
    def do_training(train_fs, train_exs):
        """Runs BERT fine-tuning."""
        # Allows to write to enclosed variables global_step
        nonlocal global_step

        # Create the batched training data out of the features.
        train_data = create_tensor_dataset(train_fs)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Calculate the number of optimization steps.
        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        # Log some information about the training.
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_exs))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        # Set the model to training mode and train for X epochs.
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            # Iterate over all batches.
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # Get the Logits and calculate the loss.
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
                loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))

                # Scale the loss in gradient accumulation mode.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                # Calculate the gradients.
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Update the weights every gradient_accumulation_steps steps.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', loss.item(), global_step)
class BertTrainer:
    def __init__(self, hypers: Hypers, model_name, checkpoint,
                 **extra_model_args):
        """
        build the model and its optimizer, and restore optimizer state and progress
        when a multi-part checkpoint is supplied
        :param hypers: the core hyperparameters for the bert model
        :param model_name: the fully qualified name of the bert model we will train
            like pytorch_pretrained_bert.modeling.BertForQuestionAnswering
        :param checkpoint: if resuming training, a dict holding the optimizer state
            under 'optimizer' and optionally the progress under 'seen_instances'
        :param extra_model_args: forwarded to get_model
        """
        self.init_time = time.time()
        self.model = self.get_model(hypers, model_name, checkpoint, **extra_model_args)

        self.step = 0
        self.hypers = hypers
        self.train_stats = TrainStats(hypers)

        self.model.train()
        logger.info('configured model for training')

        # Collect the (name, parameter) pairs to optimize; the pooler can be
        # left out, see https://github.com/NVIDIA/apex/issues/131
        if getattr(hypers, 'exclude_pooler', False):
            self.param_optimizer = [
                pair for pair in self.model.named_parameters()
                if '.pooler.' not in pair[0]
            ]
        else:
            self.param_optimizer = list(self.model.named_parameters())

        # Biases and LayerNorm parameters are exempt from weight decay.
        decay_exempt = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        decayed, undecayed = [], []
        for name, param in self.param_optimizer:
            if any(marker in name for marker in decay_exempt):
                undecayed.append(param)
            else:
                decayed.append(param)
        optimizer_grouped_parameters = [
            {'params': decayed, 'weight_decay_rate': 0.01},
            {'params': undecayed, 'weight_decay_rate': 0.0},
        ]

        self.t_total = hypers.num_train_steps
        self.global_step = hypers.global_step

        if hypers.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            fused = FusedAdam(optimizer_grouped_parameters,
                              lr=hypers.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
            # loss_scale == 0 selects dynamic loss scaling, anything else is a static scale
            if hypers.loss_scale == 0:
                scaling = {'dynamic_loss_scale': True}
            else:
                scaling = {'static_loss_scale': hypers.loss_scale}
            self.optimizer = FP16_Optimizer(fused,
                                            verbose=(hypers.global_rank == 0),
                                            **scaling)
        else:
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=hypers.learning_rate,
                                      warmup=hypers.warmup_proportion,
                                      t_total=self.t_total)
        logger.info('created optimizer')

        # Nothing to restore unless we were given a multi-part checkpoint dict
        if not (checkpoint and type(checkpoint) is dict
                and 'optimizer' in checkpoint):
            return

        self.optimizer.load_state_dict(checkpoint['optimizer'])
        if not hypers.fp16:
            # the loaded state carries the saved t_total; use the one we were passed instead
            self.optimizer.set_t_total(self.t_total)
            # show state of optimizer
            rates = self.optimizer.get_lr()
            logger.info('Min and max learn rate:    %s',
                        str([min(rates), max(rates)]))
            logger.info('Min and max step in state: %s',
                        str(self.optimizer.get_steps()))

        instances_per_step = hypers.train_batch_size * hypers.gradient_accumulation_steps * hypers.world_size
        if 'seen_instances' in checkpoint:
            seen = checkpoint['seen_instances']
            # derive the step counter from the instances already consumed
            self.global_step = int(seen / instances_per_step)
            self.train_stats.previous_instances = seen
            logger.info('got global step from checkpoint = %i',
                        self.global_step)

        logger.info('Loaded optimizer state:')
        logger.info(repr(self.optimizer))

    def reset(self):
        """
        reset any gradient accumulation
        :return:
        """
        self.model.zero_grad()
        self.step = 0

    def should_continue(self):
        """
        :return: True if training should continue
        """
        if self.global_step >= self.t_total:
            logger.info(
                'stopping due to train step %i >= target train steps %i',
                self.global_step, self.t_total)
            return False
        if 0 < self.hypers.time_limit <= (time.time() - self.init_time):
            logger.info('stopping due to time out %i seconds',
                        self.hypers.time_limit)
            return False
        return True

    def save_simple(self, filename):
        """
        save only the model parameters (no optimizer state); a no-op on ranks other than global rank 0
        :param filename: destination passed to torch.save
        """
        if self.hypers.global_rank != 0:
            logger.info('skipping save in %i', torch.distributed.get_rank())
            return
        # a parallel wrapper exposes the real model as .module; save that one
        unwrapped = getattr(self.model, 'module', self.model)
        torch.save(unwrapped.state_dict(), filename)
        logger.info(f'saved model only to {filename}')

    def save(self, filename, **extra_checkpoint_info):
        """
        save a checkpoint with the model parameters, the optimizer state and any additional checkpoint info
        :param filename: destination file; its parent directory is created when missing
        :param extra_checkpoint_info: extra key/value pairs stored alongside
            'state_dict', 'optimizer', 'seen_instances' and 'num_instances'
        :return: None; does nothing on ranks other than global rank 0
        """
        # only local_rank 0, in fact only global rank 0
        if self.hypers.global_rank != 0:
            logger.info('skipping save in %i', torch.distributed.get_rank())
            return
        start_time = time.time()
        checkpoint = extra_checkpoint_info
        model_to_save = self.model.module if hasattr(
            self.model, 'module') else self.model  # Only save the model itself
        # A bare filename has an empty dirname and os.makedirs('') raises
        # FileNotFoundError, so only create the parent when there is one.
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # also save the optimizer state, since we will likely resume from partial pre-training
        checkpoint['state_dict'] = model_to_save.state_dict()
        checkpoint['optimizer'] = self.optimizer.state_dict()
        # include world size in instances_per_step calculation
        instances_per_step = self.hypers.train_batch_size * \
                             self.hypers.gradient_accumulation_steps * \
                             self.hypers.world_size
        # progress is stored in instances so a resume can use a different batch configuration
        checkpoint['seen_instances'] = self.global_step * instances_per_step
        checkpoint['num_instances'] = self.t_total * instances_per_step
        # CONSIDER: also save hypers?
        torch.save(checkpoint, filename)
        logger.info(
            f'saved model to {filename} in {time.time()-start_time} seconds')

    def get_instance_count(self):
        instances_per_step = self.hypers.train_batch_size * \
                             self.hypers.gradient_accumulation_steps * \
                             self.hypers.world_size
        return self.global_step * instances_per_step

    def step_loss(self, loss):
        """
        accumulates the gradient, tracks the loss and applies the gradient to the model
        :param loss: the loss from evaluating the model
        """
        if self.global_step == 0:
            logger.info('first step_loss')
        if self.hypers.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu.
        self.train_stats.note_loss(loss.item())

        if self.hypers.gradient_accumulation_steps > 1:
            loss = loss / self.hypers.gradient_accumulation_steps

        if self.hypers.fp16:
            self.optimizer.backward(loss)
        else:
            loss.backward()

        if (self.step + 1) % self.hypers.gradient_accumulation_steps == 0:
            lr_this_step = self.hypers.learning_rate * warmup_linear(
                self.global_step / self.t_total, self.hypers.warmup_proportion)
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = lr_this_step
            self.optimizer.step()
            self.model.zero_grad()
            self.global_step += 1

        self.step += 1

    @classmethod
    def get_files(cls, train_file, completed_files):
        """
        work out which training files are still to be processed
        :param train_file: a single file, or a directory that is searched recursively
        :param completed_files: files already processed in the current epoch
        :return: (train_files, completed_files); when every file has been completed
            a new epoch starts: all files are returned and completed_files is emptied
        """
        logger.info('completed files = %s, count = %i',
                    str(completed_files[:5]),
                    len(completed_files))

        if os.path.isdir(train_file):
            # directory: take every regular file underneath it
            root = train_file if train_file.endswith('/') else train_file + '/'
            train_files = [
                path for path in glob.glob(root + '**', recursive=True)
                if not os.path.isdir(path)
            ]
        else:
            train_files = [train_file]

        finished = set(completed_files)
        if set(train_files) == finished:
            completed_files = []  # new epoch
        else:
            # drop whatever was already handled
            train_files = [path for path in train_files if path not in finished]

        logger.info('train files = %s, count = %i',
                    str(train_files[:5]),
                    len(train_files))

        return train_files, completed_files

    @classmethod
    def get_model(cls, hypers, model_name, checkpoint, **extra_model_args):
        """
        instantiate the model class named by model_name and place it on the configured device(s)
        :param hypers: hyperparameters; reads bert_model, fp16, device, local_rank, no_apex, world_size, n_gpu
        :param model_name: fully qualified class name, 'package.module.ClassName'
        :param checkpoint: falsy for none; a dict with 'state_dict' (multi-part checkpoint);
            anything else is used directly as the state dict
        :param extra_model_args: extra keyword arguments for from_pretrained
        :return: the model, possibly wrapped for distributed or multi-gpu training
        """
        if not checkpoint:
            override_state_dict = None
        elif type(checkpoint) is dict and 'state_dict' in checkpoint:
            logger.info('loading from multi-part checkpoint')
            override_state_dict = checkpoint['state_dict']
        else:
            logger.info('loading from saved model parameters')
            override_state_dict = checkpoint

        # create the model object by name
        # https://stackoverflow.com/questions/4821104/python-dynamic-instantiation-from-string-name-of-a-class-in-dynamically-imported
        import importlib
        last_dot = model_name.rfind('.')
        module_obj = importlib.import_module(model_name[0:last_dot])
        model_cls = getattr(module_obj, model_name[last_dot + 1:])

        # caller-supplied arguments take precedence over the two defaults
        model_args = {
            'state_dict': override_state_dict,
            'cache_dir': PYTORCH_PRETRAINED_BERT_CACHE,
            **extra_model_args,
        }
        model = model_cls.from_pretrained(hypers.bert_model, **model_args)
        logger.info('built model')

        # configure model for fp16, multi-gpu and/or distributed training
        if hypers.fp16:
            model.half()
            logger.info('model halved')
        logger.info('sending model to %s', str(hypers.device))
        model.to(hypers.device)
        logger.info('sent model to %s', str(hypers.device))

        if hypers.local_rank != -1:
            if hypers.no_apex:
                model = torch.nn.parallel.DistributedDataParallel(
                    model,
                    device_ids=[hypers.local_rank],
                    output_device=hypers.local_rank)
            else:
                try:
                    from apex.parallel import DistributedDataParallel as DDP
                    model = DDP(model)
                except ImportError:
                    raise ImportError("Please install apex")
            logger.info('using DistributedDataParallel for world size %i',
                        hypers.world_size)
        elif hypers.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        return model

    @classmethod
    def get_base_parser(cls):
        """Create the argument parser holding the options shared by training scripts.

        Returns:
            An ``argparse.ArgumentParser`` with ``--bert_model`` as the only
            required option; callers can add their own arguments to it.
        """
        parser = argparse.ArgumentParser()

        # Required parameters
        parser.add_argument(
            "--bert_model",
            default=None,
            type=str,
            required=True,
            help=
            "Bert pre-trained model selected in the list: bert-base-uncased, "
            "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
        )

        # Other parameters
        parser.add_argument(
            "--num_instances",
            default=-1,
            type=int,
            help="Total number of training instances to train over.")
        parser.add_argument(
            "--seen_instances",
            default=-1,
            type=int,
            help=
            "When resuming training, the number of instances we have already trained over."
        )
        parser.add_argument("--train_batch_size",
                            default=32,
                            type=int,
                            help="Total batch size for training.")
        parser.add_argument("--learning_rate",
                            default=5e-5,
                            type=float,
                            help="The initial learning rate for Adam.")
        # argparse %-formats help strings, so a literal percent sign has to
        # be written as '%%'; a bare '%' makes --help raise instead of print
        parser.add_argument(
            "--warmup_proportion",
            default=0.1,
            type=float,
            help=
            "Proportion of training to perform linear learning rate warmup for. "
            "E.g., 0.1 = 10%% of training.")
        parser.add_argument("--no_cuda",
                            default=False,
                            action='store_true',
                            help="Whether not to use CUDA when available")
        parser.add_argument("--no_apex",
                            default=False,
                            action='store_true',
                            help="Whether not to use apex when available")
        parser.add_argument('--seed',
                            type=int,
                            default=42,
                            help="random seed for initialization")
        parser.add_argument(
            '--gradient_accumulation_steps',
            type=int,
            default=1,
            help=
            "Number of updates steps to accumulate before performing a backward/update pass."
        )
        parser.add_argument(
            '--optimize_on_cpu',
            default=False,
            action='store_true',
            help=
            "Whether to perform optimization and keep the optimizer averages on CPU"
        )
        parser.add_argument(
            '--fp16',
            default=False,
            action='store_true',
            help="Whether to use 16-bit float precision instead of 32-bit")
        parser.add_argument(
            '--loss_scale',
            type=float,
            default=0,
            help=
            'Loss scaling, positive power of 2 values can improve fp16 convergence. '
            'Leave at zero to use dynamic loss scaling')
        return parser
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--vocab_file",
                        default='bert-base-uncased-vocab.txt',
                        type=str,
                        required=True)
    parser.add_argument("--model_file",
                        default='bert-base-uncased.tar.gz',
                        type=str,
                        required=True)
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )
    parser.add_argument(
        "--predict_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the predictions will be written.")

    # Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=2.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--view_id',
                        type=int,
                        default=1,
                        help="view id of multi-view co-training(two-view)")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--save_all', default=False, action='store_true')

    # Base setting
    parser.add_argument('--pretrain', type=str, default=None)
    parser.add_argument('--max_ctx', type=int, default=2)
    parser.add_argument('--task_name', type=str, default='race')
    parser.add_argument('--bert_name', type=str, default='pool-race')
    parser.add_argument('--reader_name', type=str, default='race')
    parser.add_argument('--per_eval_step', type=int, default=10000000)
    # model parameters
    parser.add_argument('--evidence_lambda', type=float, default=0.8)
    # Parameters for running labeling model
    parser.add_argument('--do_label', default=False, action='store_true')
    parser.add_argument('--sentence_id_file', nargs='*')
    parser.add_argument('--weight_threshold', type=float, default=0.0)
    parser.add_argument('--only_correct', default=False, action='store_true')
    parser.add_argument('--label_threshold', type=float, default=0.0)
    parser.add_argument('--multi_evidence', default=False, action='store_true')
    parser.add_argument('--metric', default='accuracy', type=str)
    parser.add_argument('--num_evidence', default=1, type=int)
    parser.add_argument('--power_length', default=1., type=float)
    parser.add_argument('--num_choices', default=4, type=int)
    parser.add_argument('--split_type', default=0, type=int)
    parser.add_argument('--use_gumbel', default=False, action='store_true')
    parser.add_argument('--sample_steps', type=int, default=10)
    parser.add_argument('--reward_func', type=int, default=0)
    parser.add_argument('--freeze_bert', default=False, action='store_true')

    args = parser.parse_args()

    logger = setting_logger(args.output_dir)
    logger.info('================== Program start. ========================')
    logger.info(
        f'================== Running with seed {args.seed} =========================='
    )

    model_params = prepare_model_params(args)
    read_params = prepare_read_params(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict and not args.do_label:
        raise ValueError(
            "At least one of `do_train` or `do_predict` or `do_label` must be True."
        )

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if args.do_train:
        if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
            raise ValueError(
                "Output directory () already exists and is not empty.")
        os.makedirs(args.output_dir, exist_ok=True)

    if args.do_predict or args.do_label:
        os.makedirs(args.predict_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_file)

    data_reader = initialize_reader(args.reader_name)

    num_train_steps = None
    if args.do_train or args.do_label:
        train_examples = data_reader.read(input_file=args.train_file,
                                          **read_params)

        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}_{5}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length), str(args.max_ctx), str(args.task_name))

        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except FileNotFoundError:
            train_features = data_reader.convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        num_train_steps = int(
            len(train_features) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    if args.pretrain is not None:
        logger.info('Load pretrained model from {}'.format(args.pretrain))
        model_state_dict = torch.load(args.pretrain, map_location='cuda:0')
        model = initialize_model(args.bert_name,
                                 args.model_file,
                                 state_dict=model_state_dict,
                                 **model_params)
    else:
        model = initialize_model(args.bert_name, args.model_file,
                                 **model_params)

    # if args.fp16:
    #     model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # Remove frozen parameters
    param_optimizer = [n for n in param_optimizer if n[1].requires_grad]

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps if num_train_steps is not None else -1
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    # if args.fp16:
    #     try:
    #         from apex.optimizers import FP16_Optimizer
    #         from apex.optimizers import FusedAdam
    #     except ImportError:
    #         raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
    #
    #     optimizer = FusedAdam(optimizer_grouped_parameters,
    #                           lr=args.learning_rate,
    #                           bias_correction=False,
    #                           max_grad_norm=1.0)
    #     if args.loss_scale == 0:
    #         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    #     else:
    #         optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    #     warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=t_total)
    #     logger.info(f"warm up linear: warmup = {warmup_linear.warmup}, t_total = {warmup_linear.t_total}.")
    # else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    if args.fp16:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    # Prepare data
    eval_examples = data_reader.read(input_file=args.predict_file,
                                     **read_params)
    eval_features = data_reader.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    eval_tensors = data_reader.data_to_tensors(eval_features)
    eval_data = TensorDataset(*eval_tensors)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.predict_batch_size)

    if args.do_train:

        if args.do_label:
            logger.info('Training in State Wise.')
            sentence_label_file = args.sentence_id_file
            if sentence_label_file is not None:
                for file in sentence_label_file:
                    train_features = data_reader.generate_features_sentence_ids(
                        train_features, file)
            else:
                logger.info('No sentence id supervision is found.')
        else:
            logger.info('Training in traditional way.')

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Num train total optimization steps = %d", t_total)
        logger.info("  Batch size = %d", args.predict_batch_size)
        train_loss = AverageMeter()
        best_acc = 0.0
        best_loss = 1000000
        summary_writer = SummaryWriter(log_dir=args.output_dir)
        global_step = 0
        eval_loss = AverageMeter()
        eval_accuracy = CategoricalAccuracy()
        eval_epoch = 0

        train_tensors = data_reader.data_to_tensors(train_features)
        train_data = TensorDataset(*train_tensors)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for epoch in range(int(args.num_train_epochs)):
            logger.info(f'Running at Epoch {epoch}')
            # Train
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Iteration",
                         dynamic_ncols=True)):
                model.train()
                if n_gpu == 1:
                    batch = batch_to_device(
                        batch, device)  # multi-gpu does scattering it-self
                inputs = data_reader.generate_inputs(
                    batch, train_features, model_state=ModelState.Train)
                model_output = model(**inputs)
                loss = model_output['loss']
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    # optimizer.backward(loss)
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used and handles this automatically
                    # if args.fp16:
                    #     lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step)
                    #     for param_group in optimizer.param_groups:
                    #         param_group['lr'] = lr_this_step
                    #     summary_writer.add_scalar('lr', lr_this_step, global_step)
                    # else:
                    summary_writer.add_scalar('lr',
                                              optimizer.get_lr()[0],
                                              global_step)

                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    train_loss.update(loss.item(), 1)
                    summary_writer.add_scalar('train_loss', train_loss.avg,
                                              global_step)
                    # logger.info(f'Train loss: {train_loss.avg}')

                if (step + 1) % args.per_eval_step == 0 or step == len(
                        train_dataloader) - 1:
                    # Evaluation
                    model.eval()
                    logger.info("Start evaluating")
                    for _, eval_batch in enumerate(
                            tqdm(eval_dataloader,
                                 desc="Evaluating",
                                 dynamic_ncols=True)):
                        if n_gpu == 1:
                            eval_batch = batch_to_device(
                                eval_batch,
                                device)  # multi-gpu does scattering it-self
                        inputs = data_reader.generate_inputs(
                            eval_batch,
                            eval_features,
                            model_state=ModelState.Evaluate)
                        with torch.no_grad():
                            output_dict = model(**inputs)
                            loss, choice_logits = output_dict[
                                'loss'], output_dict['choice_logits']
                            eval_loss.update(loss.item(), 1)
                            eval_accuracy(choice_logits, inputs["labels"])

                    eval_epoch_loss = eval_loss.avg
                    summary_writer.add_scalar('eval_loss', eval_epoch_loss,
                                              eval_epoch)
                    eval_loss.reset()
                    current_acc = eval_accuracy.get_metric(reset=True)
                    summary_writer.add_scalar('eval_acc', current_acc,
                                              eval_epoch)
                    torch.cuda.empty_cache()

                    if args.save_all:
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            args.output_dir, f"pytorch_model_{eval_epoch}.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)

                    if current_acc > best_acc:
                        best_acc = current_acc
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                    if eval_epoch_loss < best_loss:
                        best_loss = eval_epoch_loss
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_loss_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)

                    logger.info(
                        'Eval Epoch: %d, Accuracy: %.4f (Best Accuracy: %.4f)'
                        % (eval_epoch, current_acc, best_acc))
                    eval_epoch += 1
            logger.info(
                f'Epoch {epoch}: Accuracy: {best_acc}, Train Loss: {train_loss.avg}'
            )
        summary_writer.close()

    for output_model_name in ["pytorch_model.bin", "pytorch_loss_model.bin"]:
        # Loading trained model
        output_model_file = os.path.join(args.output_dir, output_model_name)
        model_state_dict = torch.load(output_model_file, map_location='cuda:0')
        model = initialize_model(args.bert_name,
                                 args.model_file,
                                 state_dict=model_state_dict,
                                 **model_params)
        model.to(device)

        # Write Yes/No predictions
        if args.do_predict and (args.local_rank == -1
                                or torch.distributed.get_rank() == 0):

            test_examples = data_reader.read(args.test_file)
            test_features = data_reader.convert_examples_to_features(
                test_examples, tokenizer, args.max_seq_length)

            test_tensors = data_reader.data_to_tensors(test_features)
            test_data = TensorDataset(*test_tensors)
            test_sampler = SequentialSampler(test_data)
            test_dataloader = DataLoader(test_data,
                                         sampler=test_sampler,
                                         batch_size=args.predict_batch_size)

            logger.info("***** Running predictions *****")
            logger.info("  Num orig examples = %d", len(test_examples))
            logger.info("  Num split examples = %d", len(test_features))
            logger.info("  Batch size = %d", args.predict_batch_size)

            model.eval()
            all_results = []
            test_acc = CategoricalAccuracy()
            logger.info("Start predicting yes/no on Dev set.")
            for batch in tqdm(test_dataloader, desc="Testing"):
                if n_gpu == 1:
                    batch = batch_to_device(
                        batch, device)  # multi-gpu does scattering it-self
                inputs = data_reader.generate_inputs(
                    batch, test_features, model_state=ModelState.Evaluate)
                with torch.no_grad():
                    batch_choice_logits = model(**inputs)['choice_logits']
                    test_acc(batch_choice_logits, inputs['labels'])
                example_indices = batch[-1]
                for i, example_index in enumerate(example_indices):
                    choice_logits = batch_choice_logits[i].detach().cpu(
                    ).tolist()

                    test_feature = test_features[example_index.item()]
                    unique_id = int(test_feature.unique_id)

                    all_results.append(
                        RawResultChoice(unique_id=unique_id,
                                        choice_logits=choice_logits))

            if "loss" in output_model_name:
                logger.info(
                    'Predicting question choice on test set using model with lowest loss on validation set.'
                )
                output_prediction_file = os.path.join(args.predict_dir,
                                                      'loss_predictions.json')
            else:
                logger.info(
                    'Predicting question choice on test set using model with best accuracy on validation set,'
                )
                output_prediction_file = os.path.join(args.predict_dir,
                                                      'predictions.json')
            data_reader.write_predictions(test_examples, test_features,
                                          all_results, output_prediction_file)
            logger.info(
                f"Accuracy on Test set: {test_acc.get_metric(reset=True)}")

    # Loading trained model.
    if args.metric == 'accuracy':
        logger.info("Load model with best accuracy on validation set.")
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    elif args.metric == 'loss':
        logger.info("Load model with lowest loss on validation set.")
        output_model_file = os.path.join(args.output_dir,
                                         "pytorch_loss_model.bin")
    else:
        raise RuntimeError(
            f"Wrong metric type for {args.metric}, which must be in ['accuracy', 'loss']."
        )
    model_state_dict = torch.load(output_model_file, map_location='cuda:0')
    model = initialize_model(args.bert_name,
                             args.model_file,
                             state_dict=model_state_dict,
                             **model_params)
    model.to(device)

    # Labeling sentence id.
    if args.do_label and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):

        f = open('debug_log.txt', 'w')

        def softmax(x):
            """Return the softmax of ``x``, normalised over all of its entries.

            The maximum of ``x`` is subtracted before exponentiating, which
            leaves the result unchanged but keeps large scores from
            overflowing.
            """
            shifted = x - np.max(x)
            exponentials = np.exp(shifted)
            normaliser = exponentials.sum()
            return exponentials / normaliser

        def topk(sentence_sim):
            """Pick the best-scoring prefix of sentences as evidence.

            Sentences are ranked by score. For each rank ``i`` (up to
            ``args.num_evidence``) the log-probability of the ``i``-th best
            score is taken under a softmax over that score and every lower
            one. The number of sentences kept is the prefix length whose mean
            log-probability is highest.

            :param sentence_sim: 1-D numpy array of sentence scores.
            :return: dict with ``'sentences'`` (indices into ``sentence_sim``
                of the selected sentences, best first) and ``'value'`` (the
                exponential of the winning mean log-probability, as a float).
            """
            limit = min(args.num_evidence, len(sentence_sim))
            descending = np.array(sorted(sentence_sim, reverse=True))

            # Log-probability of each ranked sentence among itself and all
            # lower-ranked ones.
            step_log_probs = [
                np.log(softmax(descending[rank:])[0]) for rank in range(limit)
            ]
            # Mean log-probability of every prefix of the ranking.
            prefix_means = [
                np.mean(step_log_probs[:end]) for end in range(1, limit + 1)
            ]
            top_k = int(np.argmax(prefix_means) + 1)

            ranked = sorted(enumerate(sentence_sim),
                            key=lambda pair: pair[1],
                            reverse=True)
            return {
                'sentences': [index for index, _ in ranked[:top_k]],
                'value': float(np.exp(prefix_means[top_k - 1]))
            }

        def batch_topk(sentence_sim, sentence_mask):
            """Run ``topk`` for every choice of every example in a batch.

            :param sentence_sim: tensor of sentence scores, indexed below as
                ``[example][choice][sentence]``. May live on any device.
            :param sentence_mask: tensor laid out like ``sentence_sim``; the
                sum of each row is used as the number of leading scores to
                keep (presumably a 0/1 validity mask -- confirm against the
                model that produces it).
            :return: nested list ``[example][choice]`` holding the dict
                returned by ``topk`` for that choice.
            """
            batch_size = sentence_sim.size(0)
            num_choices = sentence_sim.size(1)
            # ``Tensor.numpy()`` raises for CUDA tensors and for tensors that
            # require grad, so detach and copy to host memory first. Both
            # calls are no-ops for plain CPU tensors.
            sim_array = sentence_sim.detach().cpu().numpy() + 1e-15
            mask_array = sentence_mask.detach().cpu().numpy()
            sentence_ids = []
            for b in range(batch_size):
                choice_sentence_ids = [
                    topk(_sim[:int(sum(_mask))])
                    for _sim, _mask in zip(sim_array[b], mask_array[b])
                ]
                assert len(choice_sentence_ids) == num_choices
                sentence_ids.append(choice_sentence_ids)
            return sentence_ids

        test_examples = train_examples
        test_features = train_features

        test_tensors = data_reader.data_to_tensors(test_features)
        test_data = TensorDataset(*test_tensors)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.predict_batch_size)

        logger.info("***** Running labeling *****")
        logger.info("  Num orig examples = %d", len(test_examples))
        logger.info("  Num split examples = %d", len(test_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start labeling.")
        for batch in tqdm(test_dataloader, desc="Testing"):
            if n_gpu == 1:
                batch = batch_to_device(batch, device)
            inputs = data_reader.generate_inputs(batch,
                                                 test_features,
                                                 model_state=ModelState.Test)
            with torch.no_grad():
                output_dict = model(**inputs)
                batch_choice_logits, batch_sentence_logits = output_dict[
                    "choice_logits"], output_dict["sentence_logits"]
                batch_sentence_mask = output_dict["sentence_mask"]
            example_indices = batch[-1]
            # batch_beam_results = batch_choice_beam_search(batch_sentence_logits, batch_sentence_mask)
            batch_topk_results = batch_topk(batch_sentence_logits,
                                            batch_sentence_mask)
            for i, example_index in enumerate(example_indices):
                choice_logits = batch_choice_logits[i].detach().cpu()
                evidence_list = batch_topk_results[i]

                test_feature = test_features[example_index.item()]
                unique_id = int(test_feature.unique_id)

                all_results.append(
                    RawOutput(unique_id=unique_id,
                              model_output={
                                  "choice_logits": choice_logits,
                                  "evidence_list": evidence_list
                              }))

        output_prediction_file = os.path.join(args.predict_dir,
                                              'sentence_id_file.json')
        data_reader.predict_sentence_ids(
            test_examples,
            test_features,
            all_results,
            output_prediction_file,
            weight_threshold=args.weight_threshold,
            only_correct=args.only_correct,
            label_threshold=args.label_threshold)
# Top-level training script: trains `model` for epochs current_epoch..EPOCH-1,
# logging per-batch loss, accuracy and learning rate to TensorBoard.
# NOTE(review): this fragment is truncated in the file -- the evaluation loop
# at the bottom has no body.
print('Start training!')
writer = SummaryWriter(dir+'models/logs/%s/'%LOG_NAME)

loss_fct = torch.nn.CrossEntropyLoss()
model.train()
for epoch in range(current_epoch, EPOCH):
    # Running sum of the per-batch losses for this epoch.
    loss_train = 0

    # Batch indices start at 1.
    for idx, batch in enumerate(tqdm(train_loader), 1):

        loss, correct = train(batch, model, loss_fct, optimizer, idx, gradient_accumulation_steps)

        writer.add_scalar('loss_train_batch/loss_train_batch', loss, global_batch_counter_train)
        # Accuracy = correct predictions / size of the first tensor in the batch.
        writer.add_scalar('accuracy_train_batch/accuracy_train_batch', correct/batch[0].shape[0], global_batch_counter_train)
        # Logs the last entry of the learning-rate list reported by the optimizer.
        writer.add_scalar('epoch/lr', optimizer.get_lr()[-1], global_batch_counter_train)

        global_batch_counter_train += 1
        loss_train += loss


        # Mid-epoch evaluation: every `test_per_n_batch_one_epoch` batches, or
        # at batch 4 when DEBUG is set (both only if that interval is truthy).
        if (test_per_n_batch_one_epoch and (idx % test_per_n_batch_one_epoch == 0)) or (DEBUG and idx == 4 and test_per_n_batch_one_epoch):
            print('Start testing...')
            loss_test = 0
            total = 0
            # NOTE: rebinds `correct` from the training step above.
            correct = 0
            y_pred = []
            y_true = []
            model.eval()
            with torch.no_grad():
                # NOTE: rebinds `batch` from the enclosing training loop.
                for test_idx, batch in enumerate(tqdm(test_loader), 1):
예제 #7
0
def train(**kwargs):
    """Train, evaluate or run submission inference for the BERT-LSTM-CRF tagger.

    A fresh ``Config`` is created and overridden with ``kwargs``. The train
    and dev corpora are always read and the model is always built; what
    happens next depends on the configuration:

    * ``config.load_model`` truthy and ``config.flag == 'submit'``: read
      ``args.submit_test_path``, call ``load_model`` and then ``test``.
    * ``config.load_model`` truthy and ``config.flag == 'test'``: read
      ``args.pretrain_test_path``, call ``load_model`` and then ``dev`` on
      the test set.
    * ``config.load_model`` falsy: train for ``config.base_epoch`` epochs
      with ``BertAdam``, call ``dev`` on the dev set after every epoch and
      call ``save_model`` whenever the returned F-measure improves.

    Besides ``config`` this function reads the module-level ``args``
    namespace (paths, batch sizes, warmup proportion, ...).

    :param kwargs: overrides forwarded to ``Config.update``.
    :return: None.
    """
    config = Config()
    config.update(**kwargs)
    print('当前设置为:\n', config)
    if args.use_cuda:
        torch.cuda.set_device(config.gpu)
    print('loading corpus')
    vocab = load_vocab(args.vocab_file)
    label_dic = load_vocab(config.label_file)
    index2label = {v: k for k, v in label_dic.items()}
    tagset_size = len(label_dic)
    train_data, _ = read_corpus(args.pretrain_train_path, max_length=args.max_seq_length, label_dic=label_dic, vocab=vocab)
    dev_data, dev_len = read_corpus(args.pretrain_dev_path, max_length=args.max_seq_length, label_dic=label_dic, vocab=vocab)
    # Total number of optimizer steps, handed to BertAdam as ``t_total``.
    num_train_optimization_steps = int(
        len(train_data) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    train_ids = torch.LongTensor([temp.input_id for temp in train_data])
    train_masks = torch.LongTensor([temp.input_mask for temp in train_data])
    train_tags = torch.LongTensor([temp.label_id for temp in train_data])

    train_dataset = TensorDataset(train_ids, train_masks, train_tags)
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=args.train_batch_size)

    dev_ids = torch.LongTensor([temp.input_id for temp in dev_data])
    dev_masks = torch.LongTensor([temp.input_mask for temp in dev_data])
    dev_tags = torch.LongTensor([temp.label_id for temp in dev_data])
    dev_dataset = TensorDataset(dev_ids, dev_masks, dev_tags)
    dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=args.eval_batch_size)
    model = BERT_LSTM_CRF(args, tagset_size, config.bert_embedding, config.rnn_hidden, config.rnn_layer, dropout_ratio=config.dropout_ratio, dropout1=config.dropout1, use_cuda=config.use_cuda)

    if config.use_cuda:
        model = model.cuda()
    if config.load_model:
        if config.flag == 'submit':
            # Submission data is read with flag='submit' and has no label
            # tensor, so the dataset only holds ids and masks.
            assert config.load_path is not None
            test_data, test_len = read_corpus(args.submit_test_path, max_length=args.max_seq_length, label_dic=label_dic,
                                              vocab=vocab, flag='submit')
            test_ids = torch.LongTensor([temp.input_id for temp in test_data])
            test_masks = torch.LongTensor([temp.input_mask for temp in test_data])
            test_dataset = TensorDataset(test_ids, test_masks)
            test_loader = DataLoader(test_dataset, shuffle=False, batch_size=args.eval_batch_size)
            model = load_model(model, name=None)
            test(model, test_loader, config, index2label, test_len)
        if config.flag == 'test':
            assert config.load_path is not None
            test_data, test_len = read_corpus(args.pretrain_test_path, max_length=args.max_seq_length, label_dic=label_dic,
                                              vocab=vocab)
            test_ids = torch.LongTensor([temp.input_id for temp in test_data])
            test_masks = torch.LongTensor([temp.input_mask for temp in test_data])
            test_tags = torch.LongTensor([temp.label_id for temp in test_data])
            test_dataset = TensorDataset(test_ids, test_masks, test_tags)
            test_loader = DataLoader(test_dataset, shuffle=False, batch_size=args.eval_batch_size)
            model = load_model(model, name=None)
            # Evaluate with the length info read for the *test* corpus; the
            # dev-set value does not belong to the data in test_loader.
            dev(model, test_loader, 0, config, index2label, test_len)

    else:
        model.train()
        bert_param_optimizer = list(model.word_embeds.named_parameters())
        lstm_param_optimizer = list(model.lstm.named_parameters())
        liner_param_optimizer = list(model.liner.named_parameters())
        crf_param_optimizer = list(model.crf.named_parameters())
        # Parameters whose name contains one of these get no weight decay.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # One decay / no-decay pair per sub-module, each with its own
        # learning-rate multiplier (embeddings x1, LSTM x5, linear x2, CRF x3).
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01, 'lr': config.lr},
            {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': config.lr},
            {'params': [p for n, p in lstm_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001, 'lr': config.lr * 5},
            {'params': [p for n, p in lstm_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': config.lr * 5},
            {'params': [p for n, p in liner_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001, 'lr': config.lr * 2},
            {'params': [p for n, p in liner_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': config.lr * 2},
            {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001, 'lr': config.lr * 3},
            {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': config.lr * 3},
        ]
        # No global ``lr`` is passed: every group above carries its own.
        optimizer = BertAdam(optimizer_grouped_parameters,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
        eval_f1 = 0.0
        for epoch in range(config.base_epoch):
            print(optimizer.get_lr())
            # ``step`` counts batches within the epoch, starting at 1.
            for step, batch in enumerate(train_loader, 1):
                model.zero_grad()
                inputs, masks, tags = batch
                inputs, masks, tags = Variable(inputs), Variable(masks), Variable(tags)
                if config.use_cuda:
                    inputs, masks, tags = inputs.cuda(), masks.cuda(), tags.cuda()

                feats = model(inputs, masks)
                loss = model.loss(feats, masks, tags)
                loss.backward()
                optimizer.step()
                if step % 50 == 0:
                    print('step: {} |  epoch: {}|  loss: {}'.format(step, epoch, loss.item()))
            f_measure = dev(model, dev_loader, epoch, config, index2label, dev_len)
            # Keep only checkpoints that improve on the best dev F-measure.
            if eval_f1 < f_measure:
                eval_f1 = f_measure
                save_model(model, epoch, f_measure)
예제 #8
0
def train(params,
          datasets,
          student,
          val_iterator,
          cuda_device=-1,
          teachers=None):
    """Train ``student``, optionally distilling from per-dataset teachers.

    For every batch the dataset name is read from the first instance's
    metadata. If a teacher is registered for that dataset, the student's gold
    loss is replaced by the sum of a start-logit and an end-logit
    distillation loss (``probability_scaling`` when ``args.method == "CR"``,
    ``loss_reweighting`` otherwise). After each epoch the student is
    evaluated on ``datasets["validation"]``, metrics are written to
    TensorBoard, a per-epoch checkpoint is saved, and ``best.th`` is
    overwritten when F1 improves.

    Relies on the module-level names ``args``, ``num_epochs`` and ``logger``.

    :param params: configuration object; ``t_total`` is read from
        ``params.params['trainer']['optimizer']`` and the ``"iterator"``
        entry is popped to build the training iterator.
    :param datasets: mapping with at least ``"train"`` and ``"validation"``.
    :param student: model being trained; called as ``student(**batch)`` and
        expected to return ``span_start_logits``, ``span_end_logits`` and
        ``loss``.
    :param val_iterator: iterator passed through to ``evaluate``.
    :param cuda_device: device index passed to ``nn_util.move_to_device`` and
        ``evaluate``.
    :param teachers: mapping from dataset name to teacher model. ``None`` or
        an empty mapping disables distillation; a non-empty mapping must
        contain every dataset name seen in training.
    :return: ``(global_step, total_training_loss / global_step)``.
    :raises ZeroDivisionError: if no training batch was processed.
    """
    # The default ``None`` means "no teachers"; normalise it so the
    # ``.keys()`` / ``.get()`` lookups in the batch loop work.
    if teachers is None:
        teachers = {}

    tb_writer = SummaryWriter()
    t_total = params.params.get('trainer').get('optimizer').get("t_total")

    # Parameters whose name contains one of these get no weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in student.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in student.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    # BertAdam already has a scheduler
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         eps=args.adam_epsilon,
                         t_total=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(list(datasets["train"])))
    logger.info("  Num Epochs = %d", num_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                params.get("iterator").get("batch_size"))
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    total_training_loss = 0.0
    logging_loss = 0.0
    best_f1 = 0.0
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(student.vocab)

    for epoch in range(num_epochs):
        tr_loss = 0.0
        student.zero_grad()
        batches_this_epoch = 0
        epoch_start_time = time.time()
        logger.info("Training")
        logger.info("Epoch %d/%d", epoch, num_epochs - 1)

        # Get tqdm for the training batches
        train_generator = iterator(datasets["train"],
                                   num_epochs=1,
                                   shuffle=True)
        num_training_batches = math.ceil(
            iterator.get_num_batches(datasets["train"]))
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)

        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            # The dataset name is taken from the first instance only
            # (presumably a batch never mixes datasets -- confirm).
            set_name = batch_group["metadata"][0]["dataset"]
            if teachers != {}:
                assert set_name in teachers.keys()
            teacher = teachers.get(set_name, None)

            student.train()
            if teacher is not None:
                teacher.eval()

            batch_group = nn_util.move_to_device(batch_group, cuda_device)
            output_dict = student(**batch_group)
            start_logits_stu = output_dict["span_start_logits"]
            end_logits_stu = output_dict["span_end_logits"]
            loss = output_dict["loss"]  # pure student loss, gold loss
            # Distillation loss: replaces the gold loss when a teacher exists.
            if teacher is not None:
                with torch.no_grad():
                    teacher_output_dict = teacher(**batch_group)
                start_logits_tea = teacher_output_dict["span_start_logits"]
                end_logits_tea = teacher_output_dict["span_end_logits"]
                assert start_logits_stu.size() == start_logits_tea.size()
                assert end_logits_stu.size() == end_logits_tea.size()

                # Column 0 weights the start loss, column 1 the end loss.
                bias_weights = get_bias_weight(args.bias_type, batch_group,
                                               set_name)

                # confidence  regularization method
                if args.method == "CR":
                    loss_start = probability_scaling(start_logits_stu,
                                                     bias_weights[:, 0],
                                                     start_logits_tea)
                    loss_end = probability_scaling(end_logits_stu,
                                                   bias_weights[:, 1],
                                                   end_logits_tea)
                else:
                    # the WL method
                    loss_start = loss_reweighting(start_logits_stu,
                                                  bias_weights[:, 0],
                                                  start_logits_tea)
                    loss_end = loss_reweighting(end_logits_stu,
                                                bias_weights[:, 1],
                                                end_logits_tea)
                loss = loss_start + loss_end

            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            student.zero_grad()
            global_step += 1
            metrics = training_util.get_metrics(student, tr_loss,
                                                batches_this_epoch)
            description = training_util.description_from_metrics(
                metrics) + "\n"
            train_generator_tqdm.set_description(description, refresh=False)
        # Return value is unused; called with reset=True at the end of the
        # epoch.
        training_util.get_metrics(student,
                                  tr_loss,
                                  batches_this_epoch,
                                  reset=True)

        # evaluate on the validation dataset
        with torch.no_grad():
            logging.info("validation student")
            metrics = evaluate(student,
                               datasets["validation"],
                               val_iterator,
                               cuda_device,
                               batch_weight_key="")
            current_f1 = metrics["f1"]
            for key, value in metrics.items():
                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
            # Written once per evaluation, not once per metric key.
            tb_writer.add_scalar("lr", optimizer.get_lr()[0], global_step)
            # NOTE(review): tr_loss restarts at 0 every epoch while
            # logging_loss keeps the previous epoch's total, so from the
            # second epoch on this is a difference between two epoch totals.
            # Confirm the intended quantity before relying on this curve.
            tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                 args.logging_steps, global_step)
            logging_loss = tr_loss
            logger.info("{'Epoch %d/%d, student exact_match': %s, 'f1': %s}",
                        epoch, num_epochs - 1, metrics["EM"], metrics["f1"])

        # Save model checkpoint
        model_path = os.path.join(args.output_dir,
                                  "model_state_epoch_{}.th".format(epoch))
        best_path = os.path.join(args.output_dir, "best.th")
        torch.save(student.state_dict(), model_path)
        if current_f1 > best_f1:
            torch.save(student.state_dict(), best_path)
            best_f1 = current_f1
        logger.info("Saving model checkpoint to %s", args.output_dir)

        epoch_elapsed_time = time.time() - epoch_start_time
        logger.info("Epoch duration: %s",
                    datetime.timedelta(seconds=epoch_elapsed_time))
        total_training_loss += tr_loss
        student.get_metrics(reset=True)

    tb_writer.close()
    return global_step, total_training_loss / global_step
예제 #9
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    parser.add_argument(
        '--log_path',
        type=str,
        default="./log",
        help="The path for saving tensorboard logs. Default is ./log")
    args = parser.parse_args()
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)
    processors = {
        "qe": MyProcessor,
    }
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    model_collections = Collections()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank))
    model.to(device)

    # fine-tuning fine-tuning model
    # output_config_file = os.path.join(args.bert_model, CONFIG_NAME)
    # config = BertConfig(output_config_file)
    # model = BertForSequenceClassification(config)
    #
    # output_model_file = os.path.join(args.bert_model, WEIGHTS_NAME)
    # model_state_dict = torch.load(output_model_file)
    # model.load_state_dict(model_state_dict)
    # model.to(device)
    #-----------------------------

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    # ignores_names=['classifier.weight','classifier.bias']
    #
    # base_params = [p for n, p in model.named_parameters() if not any(nd in n for nd in ignores_names)]
    # ignores_params=[p for n, p in model.named_parameters() if any(nd in n for nd in ignores_names)]
    #
    # optimizer = torch.optim.Adam([{'params': base_params},
    #                               {'params': ignores_params, 'lr': args.learning_rate * 10}],
    #                              lr=args.learning_rate)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        # Timer for computing speed
        timer_for_speed = Timer()
        timer_for_speed.tic()
        summary_writer = SummaryWriter(log_dir=args.log_path)
        is_early_stop = False
        disp_freq = 100
        loss_valid_freq = 100
        early_stop_patience = 10
        bad_count = 0

        nb_tr_examples, nb_tr_steps = 0, 0

        for eidx in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                try:

                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch
                    with torch.enable_grad():
                        loss = model(input_ids, segment_ids, input_mask,
                                     label_ids)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                    # model_collections.add_to_collection("train_losses", loss.item())
                    # summary_writer.add_scalar("train_losses", loss.item(), global_step=nb_tr_steps)
                    # display some information
                    if (nb_tr_steps % disp_freq == 0):
                        lrate = list(optimizer.get_lr())[0]
                        result = {'train_loss': loss.item(), "lrate": lrate}
                        logger.info("***** train results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                except RuntimeError as e:
                    if 'out of memory' in str(e):
                        print('| WARNING: ran out of memory, skipping batch')
                        # optimizer.zero_grad()
                        if hasattr(torch.cuda, 'empty_cache'):
                            torch.cuda.empty_cache()
                    else:
                        raise e

                # calculate dev loss
                if (nb_tr_steps % loss_valid_freq == 0):
                    if args.do_eval and (args.local_rank == -1
                                         or torch.distributed.get_rank() == 0):
                        eval_examples = processor.get_dev_examples(
                            args.data_dir)
                        eval_features = convert_examples_to_features(
                            eval_examples, args.max_seq_length, tokenizer)
                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", args.eval_batch_size)
                        all_input_ids = torch.tensor(
                            [f.input_ids for f in eval_features],
                            dtype=torch.long)
                        all_input_mask = torch.tensor(
                            [f.input_mask for f in eval_features],
                            dtype=torch.long)
                        all_segment_ids = torch.tensor(
                            [f.segment_ids for f in eval_features],
                            dtype=torch.long)
                        all_label_ids = torch.tensor(
                            [f.label_id for f in eval_features],
                            dtype=torch.float)
                        eval_data = TensorDataset(all_input_ids,
                                                  all_input_mask,
                                                  all_segment_ids,
                                                  all_label_ids)
                        # Run prediction for full data
                        eval_sampler = SequentialSampler(eval_data)
                        eval_dataloader = DataLoader(
                            eval_data,
                            sampler=eval_sampler,
                            batch_size=args.eval_batch_size)
                        model.eval()
                        eval_loss = 0
                        nb_eval_steps, nb_eval_examples = 0, 0

                        for bacth_eval in eval_dataloader:
                            bacth_eval = tuple(
                                t.to(device) for t in bacth_eval)
                            input_ids, input_mask, segment_ids, label_ids = bacth_eval
                            with torch.no_grad():
                                tmp_eval_loss = model(input_ids, segment_ids,
                                                      input_mask, label_ids)

                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1

                        eval_loss = eval_loss / nb_eval_steps
                        model_collections.add_to_collection(
                            "history_losses", eval_loss)
                        min_history_loss = np.array(
                            model_collections.get_collection(
                                "history_losses")).min()
                        summary_writer.add_scalar("loss",
                                                  eval_loss,
                                                  global_step=nb_tr_steps)
                        summary_writer.add_scalar("best_loss",
                                                  min_history_loss,
                                                  global_step=nb_tr_steps)
                        lrate = list(optimizer.get_lr())[0]
                        summary_writer.add_scalar("lrate",
                                                  scalar_value=lrate,
                                                  global_step=nb_tr_steps)
                        best_eval_loss = min_history_loss
                        # If model get new best valid loss
                        # save model & early stop
                        if eval_loss <= best_eval_loss:
                            bad_count = 0
                            if is_early_stop is False:
                                # Save a trained model
                                # Only save the model it-self
                                model_to_save = model.module if hasattr(
                                    model, 'module') else model
                                output_model_file = os.path.join(
                                    args.output_dir, "pytorch_model.bin")
                                torch.save(model_to_save.state_dict(),
                                           output_model_file)

                                output_config_file = os.path.join(
                                    args.output_dir, CONFIG_NAME)
                                with open(output_config_file, 'w') as f:
                                    f.write(
                                        model_to_save.config.to_json_string())
                        else:
                            bad_count += 1
                            # At least one epoch should be traversed
                            if bad_count >= early_stop_patience and eidx > 0:
                                is_early_stop = True
                                logger.info("Early Stop!")
                        summary_writer.add_scalar("bad_count", bad_count,
                                                  nb_tr_steps)

                        logger.info("{0} Loss: {1:.4f}   patience: {2}".format(
                            nb_tr_steps, eval_loss, bad_count))
                if is_early_stop == True:
                    break
예제 #10
0
def main(args):
    """Train the MSmarco model with BertAdam, validating along the way.

    The function builds the tokenizer, the train/dev iterators, the model
    and the optimizer from ``args``, then runs ``args.num_train_epochs``
    epochs of training with gradient accumulation.  ``validation`` is called
    every ``args.validate_updates`` optimizer updates and once more at the
    end of every epoch.

    Args:
        args: Run configuration.  Attributes read directly here are
            ``log_name``, ``question_first``, ``train_batch_size``,
            ``valid_batch_size``, ``pre_dir``, ``seed``,
            ``gradient_accumulation_steps``, ``num_train_epochs``, ``lr``,
            ``warmup_proportion``, ``validate_updates`` and
            ``loss_interval``.  The object is also handed as-is to the
            tokenizer/model builders, the data iterators and ``validation``.
    """
    # NOTE: this local is the logger returned by ``config.get_logging``; it
    # shadows any module-level ``logging`` name for the rest of the function.
    logging = config.get_logging(args.log_name)
    banner = "##" * 20
    for _ in range(3):
        logging.info(banner)
    logging.info(args)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    logging.info("| question first :: {}".format(args.question_first))
    logging.info("| gpu count : {}".format(n_gpu))
    logging.info("| train batch size in each gpu : {}".format(args.train_batch_size))
    logging.info("| biuid tokenizer and model in : {}".format(args.pre_dir))

    # Seed every RNG used by the pipeline with the same value.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(args.seed)

    tokenizer = BertTokenizer.build_tokenizer(args)
    train_data_iter = MSmarco_iterator(
        args,
        tokenizer,
        batch_size=args.train_batch_size,
        world_size=n_gpu,
        accumulation_steps=args.gradient_accumulation_steps,
        name="msmarco_train.pk",
    )
    dev_data_iter = MSmarco_iterator(
        args,
        tokenizer,
        batch_size=args.valid_batch_size,
        world_size=n_gpu,
        name="msmarco_dev.pk",
    )

    # Number of batches yielded per epoch and the resulting update budget
    # handed to the optimizer as ``t_total``.
    batches_per_epoch = len(train_data_iter)
    accum_steps = args.gradient_accumulation_steps
    num_train_steps = (args.num_train_epochs * batches_per_epoch) // accum_steps

    logging.info("| train data size {}".format(
        batches_per_epoch * n_gpu * args.train_batch_size))
    dev_batches = len(dev_data_iter)
    logging.info("| dev data size {}".format(
        dev_batches * n_gpu * args.valid_batch_size))
    logging.info("| train batch data size {}".format(batches_per_epoch))
    logging.info("| dev batch data size {}".format(dev_batches))
    logging.info("| update in each train data {}".format(
        batches_per_epoch // accum_steps))
    logging.info("| total update {}".format(num_train_steps))

    model = MSmarco.build_model(args)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Split parameters in one pass: any parameter whose name contains one of
    # these markers gets a weight decay rate of 0.0, the rest get 0.01.
    no_decay_markers = ['bias', 'gamma', 'beta', 'layer_norm']
    decayed_params = []
    undecayed_params = []
    for param_name, param in model.named_parameters():
        if any(marker in param_name for marker in no_decay_markers):
            undecayed_params.append(param)
        else:
            decayed_params.append(param)
    grouped_params = [
        {'params': decayed_params, 'weight_decay_rate': 0.01},
        {'params': undecayed_params, 'weight_decay_rate': 0.0},
    ]

    optimizer = BertAdam(
        grouped_params,
        lr=args.lr,
        warmup=args.warmup_proportion,
        t_total=num_train_steps,
    )
    logging.info("| init lr is {}".format(optimizer.get_lr()))

    global_update = 0
    for epochs in range(args.num_train_epochs):
        # Both values are assigned but never read below; the original only
        # referenced ``merge_batch`` from a disabled batch-merging experiment.
        total_loss = 0
        merge_batch = []

        progress = tqdm(train_data_iter, desc="Train Iteration")
        # ``micro_step`` is 1-based so the modulo checks need no ``+ 1``.
        for micro_step, batch in enumerate(progress, start=1):
            # Re-enter train mode on every batch.
            model.train()

            if n_gpu == 1:
                # Only in the single-GPU case are the batch tensors moved
                # to the device explicitly.
                for field in batch.keys():
                    batch[field] = batch[field].to(device)

            loss = model(**batch)
            if n_gpu > 1:
                # Reduce the multi-GPU loss to a single value.
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()

            if micro_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_update += 1
                if global_update % args.validate_updates == 0:
                    validation(args, model, dev_data_iter, n_gpu, epochs,
                               global_update, logging)

            if micro_step % args.loss_interval == 0:
                # The reported loss is the (possibly accumulation-scaled)
                # loss of the current batch only.
                logging.info("TRAIN ::Epoch {} updates {}, train loss {}".format(
                    epochs, global_update, loss.item()))

        # End-of-epoch validation pass.
        validation(args, model, dev_data_iter, n_gpu, epochs, global_update, logging)