Example #1
 def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
     self.tokenizer = tokenizer
     self.model = model
     self.device = next(model.parameters()).device
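A minimal, self-contained sketch of how a class with this constructor might be used; the class name Predictor and the distilbert checkpoint are assumptions for illustration, not from the source:

from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          PreTrainedModel, PreTrainedTokenizer)

class Predictor:  # hypothetical name; the original class is not shown
    def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
        self.tokenizer = tokenizer
        self.model = model
        # pick up whichever device the model's weights currently live on
        self.device = next(model.parameters()).device

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
print(Predictor(model, tokenizer).device)  # cpu, unless model.to("cuda") was called first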
Example #2
    def train(self,
              model: transformers.PreTrainedModel,
              training_tasks: typing.List[Task],
              validation_tasks: typing.List[Task],
              num_epochs: int,
              batch_size: int,
              steps_per_epoch: int,
              prefetch_size: int,
              eval_batch_size: typing.Optional[int] = None,
              eval_batches: typing.Optional[int] = None,
              checkpoint_file: typing.Optional[str] = None) -> None:
        logging.info('Preparing kitchen sink with %d training tasks: %s', len(training_tasks), training_tasks)

        # Train the model & return its training history
        logging.info('Beginning training...')
        training_data, data_sizes = self.load_train_data(training_tasks,
                                                         batch_size=batch_size,
                                                         prefetch_size=prefetch_size)

        if validation_tasks:
            logging.info('Preparing kitchen sink with %d validation tasks: %s', len(validation_tasks), validation_tasks)
            validation_data = self.load_valid_data(validation_tasks,
                                                   batch_size=eval_batch_size or batch_size,
                                                   prefetch_size=prefetch_size,
                                                   num_batches=eval_batches)
        else:
            validation_data = None
            logging.info('Preparing kitchen sink without validation')

        num_epochs += self.warmup_epochs
        optimizer, scheduler = get_optimizer(model,
                                             num_warmup_steps=self.warmup_epochs * steps_per_epoch,
                                             num_training_steps=num_epochs * steps_per_epoch)

        model.to(self.device)
        if self.use_amp:
            if not is_apex_available():
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

        global_step = 0
        tr_loss = 0.0
        logging_loss = 0.0
        model.zero_grad()
        train_itr = tqdm.trange(0, num_epochs * steps_per_epoch, desc="Training", unit="batch")
        tasks = [task.dataset for task in training_tasks]
        mixing_rates = self.get_mixing_rate(tasks, data_sizes)
        total_task_steps = Counter({task: np.float32(0.) for task in tasks})
        for epoch in range(1, num_epochs + 1):
            epoch_itr = tqdm.trange(0, steps_per_epoch, desc="Epoch %d" % epoch, leave=False, unit="batch")
            epoch_task_steps = Counter({task: np.float32(0.) for task in tasks})
            running_task_losses = {task: np.float32(0.) for task in tasks}
            for step in epoch_itr:
                inputs, labels, _ = next(np.random.choice(training_data, p=mixing_rates))
                step_loss = self._train_step(model, inputs, labels, optimizer)
                tr_loss += step_loss
                train_itr.update()
                task = inputs['task'][0].decode('UTF-8')
                epoch_task_steps[task] += 1
                running_task_losses[task] += step_loss

                if (step + 1) % self.gradient_accumulation_steps == 0 or (
                        # last step in the epoch when steps_per_epoch is smaller than gradient_accumulation_steps
                        self.gradient_accumulation_steps >= steps_per_epoch == (step + 1)):
                    if self.use_amp:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)

                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

            total_tasks = sum(epoch_task_steps.values())

            print('Epoch %d: Empirical Mixing Rates: %s' % (
                epoch,
                '; '.join('{:s}: {:0>5.2f}%'.format(task, rate * 100. / total_tasks)
                          for task, rate in epoch_task_steps.items())
            ))

            print('Epoch %d: Expected Mixing Rates: %s' % (
                epoch,
                '; '.join('{:s}: {:0>5.2f}%'.format(task, rate * 100.)
                          for task, rate in zip(tasks, mixing_rates))
            ))

            mixing_losses = [loss / epoch_task_steps[task] for task, loss in running_task_losses.items()]
            print('Epoch %d: Training Losses: %s' % (
                epoch,
                '; '.join('{:s}: {:g}'.format(task, loss) for task, loss in zip(tasks, mixing_losses))
            ))

            if epoch > self.warmup_epochs:
                total_task_steps += epoch_task_steps
                exploration_ratios = np.array([total_task_steps.get(task, np.float32(0)) / size
                                               for task, size in zip(tasks, data_sizes)])
                print('Epoch %d: Exploration Ratios: %s' % (
                    epoch,
                    '; '.join('{:s}: {:0>5.2f}%'.format(task, ratio * 100.)
                              for task, ratio in zip(tasks, exploration_ratios))
                ))

                if not self.mix_from_validation:
                    avg_loss = np.nanmean(mixing_losses)
                    mixing_losses = [er * loss + (1. - er) * avg_loss
                                     for er, loss in zip(exploration_ratios, np.nan_to_num(mixing_losses))]

            valid_steps = 0
            running_valid_loss = 0.
            if validation_data:
                epoch_task_steps = {task: np.float32(0.) for task in tasks}
                running_task_losses = {task: np.float32(0.) for task in tasks}
                with torch.no_grad():
                    for step, (inputs, labels, _) in enumerate(validation_data.as_numpy_iterator(), 1):
                        model.eval()
                        # Run the forward pass
                        valid_step_loss = model(**self.prepare_forward_inputs(model, inputs, labels))[0].item()
                        running_valid_loss += valid_step_loss
                        valid_task = inputs['task'][0].decode('UTF-8')
                        if valid_task in tasks:
                            epoch_task_steps[valid_task] += 1
                            running_task_losses[valid_task] += valid_step_loss
                        valid_steps += 1

                avg_val_loss = running_valid_loss / valid_steps
                # Save checkpoint if validation loss decreases and checkpoint dir has been provided
                if checkpoint_file:
                    if epoch == 1:
                        best_val_loss = avg_val_loss
                        logging.info("Saving best model with initial validation loss {0})".format(best_val_loss))
                        self.save_model(model, "{0}_best".format(checkpoint_file))
                    else:
                        if avg_val_loss < best_val_loss:
                            best_val_loss = avg_val_loss
                            logging.info(
                                "Saving new best model with validation loss {0} (epoch {1})".format(best_val_loss,
                                                                                                    epoch))
                            self.save_model(model, "{0}_best".format(checkpoint_file))

                print('Epoch {:d}: Validation Losses: {:s}'.format(
                    epoch,
                    '; '.join('{:s}: {:g}'.format(task, loss / epoch_task_steps[task])
                              for task, loss in running_task_losses.items())
                ))

                if self.mix_from_validation:
                    mixing_losses = [loss / epoch_task_steps[task] for task, loss in running_task_losses.items()]

            if epoch > self.warmup_epochs and self.dynamic_mixing:
                new_mixing_rates = self.get_mixing_rate(
                    tasks=tasks,
                    rates=mixing_losses,
                    normalize=False,
                    temperature=(1. / self.temperature)
                )
                print('Epoch {:d}: Updating Mixing Rate: {:s}'.format(
                    epoch,
                    '; '.join(
                        '{:s}: {:0>5.2f}%->{:0>5.2f}% (Δ={:0>5.2f})'.format(
                            task,
                            old_rate * 100.,
                            smooth_rate * 100.,
                            (smooth_rate-old_rate) * 100.)
                        for task, old_rate, smooth_rate in zip(tasks, mixing_rates, new_mixing_rates))
                ))
                mixing_rates = new_mixing_rates
                logging.debug('Mixing rates (shape=%s; |tasks|=%d): %s', mixing_rates.shape, len(tasks), mixing_rates)

            lr = scheduler.get_last_lr()[0]
            loss_scalar = (tr_loss - logging_loss) / steps_per_epoch
            logging_loss = tr_loss
            train_itr.write('Global step: %d, lr: %g, loss: %g, val_loss: %g' % (
                global_step,
                lr,
                loss_scalar,
                running_valid_loss / valid_steps if valid_steps > 0 else np.NaN))

            if not np.isfinite(loss_scalar):
                logging.info('Loss was NaN, ending training after %d epochs.', epoch)
                train_itr.close()
                return

        train_itr.close()
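get_mixing_rate above is a method of the surrounding class and its body is not shown; a temperature-scaled softmax over per-task losses is one common way to produce such sampling rates. A hypothetical, self-contained sketch of that idea (the helper name and exact semantics are assumptions):

import numpy as np

def mixing_rates_from_losses(losses, temperature=1.0):
    # Hypothetical helper: map per-task losses to sampling probabilities with a
    # temperature-scaled softmax, so harder (higher-loss) tasks are sampled more often.
    scaled = np.asarray(losses, dtype=np.float64) / temperature
    scaled -= scaled.max()            # subtract the max for numerical stability
    weights = np.exp(scaled)
    return weights / weights.sum()

print(mixing_rates_from_losses([0.5, 1.0, 2.0], temperature=0.5))  # heaviest weight on the 2.0 task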
Example #3
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            input_ids, attention, token_ids, child, head = batch[0], batch[1], batch[2], batch[3], batch[4]
            dep_labels, num_dependency, arcs, arc_labels = batch[5], batch[6], batch[7], batch[8]
            arc_label_lengths, sent_labels = batch[9], batch[10]

            inputs = {
                'input_ids': input_ids,
                'attention': attention,
                'token_ids': token_ids,
                'child': child,
                'head': head,
                'dep_labels': dep_labels,
                'arcs': arc_labels,
                'arc_label_lengths': arc_label_lengths,
                'device': args.device
            }

            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()

        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = dep_labels.detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids,
                                      dep_labels.detach().cpu().numpy(),
                                      axis=0)

    f_out = open(os.path.join(eval_output_dir, 'dev_out.txt'), 'w')
    k = 0
    for batch in eval_dataloader:
        for inp, arc_list in zip(batch[0], batch[8]):
            text = tokenizer.decode(inp)
            text = text.replace(tokenizer.pad_token, '').strip()
            f_out.write(text + '\n')

            for j, arc in enumerate(arc_list):
                arc_text = tokenizer.decode(arc)
                arc_text = arc_text.replace(tokenizer.pad_token, '').strip()

                if arc_text == '':  # for bert
                    break

                pred_temp = softmax([preds[k][j]])

                f_out.write(text + '\n')
                f_out.write(arc_text + '\n')
                f_out.write('gold:\t' + str(out_label_ids[k][j]) + '\n')
                f_out.write('pred:\t' + str(np.argmax(pred_temp)) + '\n')
                f_out.write(
                    str(pred_temp[0][0]) + '\t' + str(pred_temp[0][1]) + '\n')
                f_out.write('\n')

            k += 1

    f_out.close()

    preds = preds.reshape(-1, 2)
    preds = softmax(preds)
    out_label_ids = out_label_ids.reshape(-1)
    preds = np.argmax(preds, axis=1)

    result = compute_metrics_intermediate(preds, out_label_ids)
    print(result)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("dep level %s = %s", key, str(result[key]))
            writer.write("dep level  %s = %s\n" % (key, str(result[key])))
        writer.write('\n')

    return result
Example #4
def freeze_all_tokens(model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    model.get_input_embeddings().weight.requires_grad = False
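A hedged usage sketch for the helper above; the masked-LM auto class and the bert-base-uncased checkpoint are assumptions:

from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

freeze_all_tokens(model, tokenizer)
# The input embedding matrix no longer receives gradients, so token vectors
# stay fixed while the rest of the network is fine-tuned.
assert model.get_input_embeddings().weight.requires_grad is False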
Example #5
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
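The loop above interleaves gradient accumulation with clipping, optimizer and scheduler steps; below is a stripped-down, runnable sketch of just that pattern on toy tensors (every name in it is a stand-in, not from the source):

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

model = nn.Linear(10, 1)  # toy stand-in for the transformer
dataloader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randn(64, 1)), batch_size=4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
accumulation_steps, max_grad_norm = 4, 1.0

model.zero_grad()
for step, (x, y) in enumerate(dataloader):
    loss = nn.functional.mse_loss(model(x), y) / accumulation_steps  # scale so summed grads average out
    loss.backward()                                                  # gradients accumulate in .grad
    if (step + 1) % accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()  # one optimizer update per accumulation_steps micro-batches
        model.zero_grad()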
Example #6
def evaluate(args,
             eval_dataset,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             run_batch_fn,
             desc="") -> Dict:
    if args.local_rank in [-1, 0]:
        eval_output_dir = args.output_dir
        os.makedirs(eval_output_dir, exist_ok=True)

    # eval_batch_size for selection must be 1 to handle variable number of candidates
    if args.task == "selection":
        args.eval_batch_size = 1
    else:
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=eval_dataset.collate_fn)

    # multi-gpu evaluate
    if args.n_gpu > 1 and (args.task != "selection"
                           or eval_dataset.args.eval_all_snippets):
        if not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    data_infos = []
    all_preds = []
    all_labels = []
    for batch in tqdm(eval_dataloader,
                      desc="Evaluating",
                      disable=args.local_rank not in [-1, 0]):
        with torch.no_grad():
            loss, lm_logits, mc_logits, mc_labels = run_batch_fn(
                args, model, batch)
            if args.task == "detection":
                mc_logits = mc_logits.sigmoid()
            if args.task in ["selection", "detection"]:
                data_infos.append(batch[-1])
            all_preds.append(mc_logits.detach().cpu().numpy())
            all_labels.append(mc_labels.detach().cpu().numpy())
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    if args.task.lower() == "generation":
        perplexity = torch.exp(torch.tensor(eval_loss))
        result = {"perplexity": perplexity, "loss": eval_loss}
    elif args.task.lower() == "selection":
        all_labels = np.array(all_labels).reshape(-1)
        all_pred_ids = np.array([np.argmax(logits) for logits in all_preds])
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        logger.info("Avg. # of candidates: %f",
                    sum([len(arr[0]) for arr in all_preds]) / len(all_preds))
        result = {"loss": eval_loss, "accuracy": accuracy}
        if args.output_file:
            sorted_pred_ids = [
                np.argsort(logits.squeeze())[::-1] for logits in all_preds
            ]
            write_selection_preds(eval_dataset.dataset_walker,
                                  args.output_file,
                                  data_infos,
                                  sorted_pred_ids,
                                  topk=5)
    elif args.task.lower() == "detection":
        all_labels = np.concatenate(all_labels)
        all_pred_ids = (np.concatenate(all_preds) > 0.5)
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        precision = sklearn.metrics.precision_score(all_labels, all_pred_ids)
        recall = sklearn.metrics.recall_score(all_labels, all_pred_ids)
        result = {
            "loss": eval_loss,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall
        }
        if args.output_file:
            write_detection_preds(eval_dataset.dataset_walker,
                                  args.output_file, data_infos, all_pred_ids)
    else:
        raise ValueError(
            "args.task not in ['generation', 'selection', 'detection'], got %s"
            % args.task)

    if args.local_rank in [-1, 0]:
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results %s *****" % desc)
            writer.write("***** Eval results %s *****\n" % desc)
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #7
def evaluate(args,
             data_generator,
             tb_writer,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             global_step,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    criterion = nn.BCEWithLogitsLoss()

    eval_dataset = data_generator.instance_a_valid_dataset()

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(batch):
        # if tokenizer._pad_token is None:
        #     return pad_sequence(examples, batch_first=True)
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        targets = [b[2] for b in batch]
        inputs = [b[3] for b in batch]

        lens = [len(x) for x in inputs]

        inputs = pad_sequence(inputs,
                              batch_first=True,
                              padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()

        tokens, features, targets = [
            torch.tensor(x) for x in [tokens, features, targets]
        ]

        return tokens, features, targets, inputs, attention_mask, torch.tensor(
            lens).unsqueeze(1)

    if args.use_bucket_iterator:
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        eval_sampler = BySequenceLengthSampler(eval_dataset,
                                               bucket_boundaries,
                                               batch_size=args.eval_batch_size,
                                               drop_last=False)
        eval_dataloader = DataLoader(eval_dataset,
                                     batch_size=1,
                                     batch_sampler=eval_sampler,
                                     collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #    model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    preds, labels = [], []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # evaluation loop
        tokens, features, targets, inputs, attention_mask, lens = batch

        tokens, features, targets, inputs, attention_mask, lens = [
            x.to(args.device)
            for x in [tokens, features, targets, inputs, attention_mask, lens]
        ]

        tokens, features, targets = [
            x.float() for x in [tokens, features, targets]
        ]

        with torch.no_grad():
            logit = model(tokens, features, inputs, attention_mask, lens)
            loss = criterion(logit, targets)

            pred = torch.sigmoid(logit).detach().cpu().numpy()
            labels.append(targets.long().detach().cpu().numpy())
            preds.append(pred)

            eval_loss += loss.mean().item()

        nb_eval_steps += 1

    labels = np.vstack(labels)
    preds = np.float64(np.vstack(preds))

    aucprs = []

    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        _prauc = compute_prauc(preds[:, i], labels[:, i])
        _rce = compute_rce(preds[:, i], labels[:, i])

        aucprs.append(_prauc)

        print(engage + ":", _prauc, _rce)

        tb_writer.add_scalar('PRAUC/{}_val'.format(engage), _prauc,
                             global_step)
        tb_writer.add_scalar('RCE/{}_val'.format(engage), _rce, global_step)

    print("Mean AUCPR : {}".format(sum(aucprs) / 4.0))
    tb_writer.add_scalar('PRAUC/mean', sum(aucprs) / 4.0, global_step)
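compute_prauc and compute_rce above are project helpers that are not shown; in the usual engagement-prediction setup (reply/retweet/comment/like) they denote area under the precision-recall curve and relative cross-entropy against a constant-CTR baseline. A hedged sketch of those definitions, assuming scikit-learn:

import numpy as np
from sklearn.metrics import average_precision_score, log_loss

def compute_prauc_sketch(pred, gt):
    # area under the precision-recall curve (average precision)
    return average_precision_score(gt, pred)

def compute_rce_sketch(pred, gt):
    # relative cross-entropy: improvement over always predicting the dataset CTR
    cross_entropy = log_loss(gt, pred, labels=[0, 1])
    data_ctr = float(np.mean(gt))
    baseline_ce = log_loss(gt, np.full(len(gt), data_ctr), labels=[0, 1])
    return (1.0 - cross_entropy / baseline_ce) * 100.0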
Example #8
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        fact_token_ids, fact_embedding_ids = zip(
            *[get_inputs(seq, mask) for seq, mask, genre in examples])
        seqs = [seq for seq, mask, genre in examples]
        pad_seqs = pad_sequence(seqs,
                                batch_first=True,
                                padding_value=tokenizer.pad_token_id)
        pad_facts = pad_sequence(fact_token_ids,
                                 batch_first=True,
                                 padding_value=tokenizer.pad_token_id)
        pad_factsembeds = pad_sequence(fact_embedding_ids,
                                       batch_first=True,
                                       padding_value=tokenizer.pad_token_id)
        return list(zip(pad_facts, pad_factsembeds, pad_seqs))

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):

        if args.mlm:
            inputs, labels = mask_tokens(batch, tokenizer, args)
            with torch.no_grad():
                outputs = model(
                    inputs, masked_lm_labels=labels) if args.mlm else model(
                        inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        elif args.xlnet:
            with torch.no_grad():

                pad_facts, pad_factsembeds, pad_seqs = zip(*batch)
                tfacts = torch.stack(pad_facts).to(args.device)
                tfact_embeds = torch.stack(pad_factsembeds).to(args.device)
                facts_padding_masks = torch.where(
                    tfacts == tokenizer.pad_token_id, torch.ones_like(tfacts),
                    torch.zeros_like(tfacts)).to(args.device)
                tseqs = torch.stack(pad_seqs).to(args.device)
                tseqs_padding_masks = torch.where(
                    tseqs == tokenizer.pad_token_id, torch.ones_like(tseqs),
                    torch.zeros_like(tseqs)).to(args.device)

                perm_masks = get_perm_masks(torch.zeros_like(tseqs),
                                            order="L2R")
                target_mapping = get_target_mapping(torch.zeros_like(tseqs),
                                                    device=args.device)

                outputs = model(input_ids=tseqs,
                                facts_tokens=tfacts,
                                facts_embeds=tfact_embeds,
                                input_mask=tseqs_padding_masks,
                                facts_input_mask=facts_padding_masks,
                                perm_mask=perm_masks,
                                target_mapping=target_mapping)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        else:
            inputs, labels = (batch, batch)
            with torch.no_grad():
                outputs = model(
                    inputs, masked_lm_labels=labels) if args.mlm else model(
                        inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()

        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    print(f"validation loss value at step is {eval_loss}")
    logger.info(f"validation loss value at step is {eval_loss}")
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #9
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    def collate(examples):
        src_id, tgt_id, src_am, tgt_am = list(zip(*examples))
        # print (src_id)
        src_id, tgt_id, src_am, tgt_am = torch.stack(src_id), torch.stack(tgt_id), torch.stack(src_am), torch.stack(tgt_am)
        # padding_value = 0 if tokenizer._pad_token is None else tokenizer.pad_token_id
        # input_ids_src = pad_sequence(src_examples, batch_first=True, padding_value=padding_value)
        # input_ids_tgt = pad_sequence(tgt_examples, batch_first=True, padding_value=padding_value)
        # max_length = input_ids.shape[1]
        # attention_mask_src = torch.stack(
        #     [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) for t
        #      in src_examples])
        # attention_mask_tgt = torch.stack(
        #     [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) for t
        #      in tgt_examples])

        return src_id, tgt_id, src_am, tgt_am
    # def collate(examples: List[torch.Tensor]):
    #     padding_value = 0 if tokenizer._pad_token is None else tokenizer.pad_token_id
    #     input_ids=pad_sequence(examples, batch_first=True,padding_value=padding_value)
    #     max_length = input_ids.shape[1]
    #     attention_mask = torch.stack(
    #         [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) for t
    #          in examples])

    #     return input_ids, attention_mask
        # if tokenizer._pad_token is None:
        #     max_length = input_ids.shape[1]
        #     attention_mask = torch.stack(
        #         [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)])
        #          for t in examples])
        #     return pad_sequence(examples, batch_first=True)
        #
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs_src, inputs_tgt, attention_mask_src, attention_mask_tgt = batch
        inputs_src, inputs_tgt = inputs_src.to(args.device), inputs_tgt.to(args.device)
        attention_mask_src, attention_mask_tgt = attention_mask_src.to(args.device), attention_mask_tgt.to(args.device)
        with torch.no_grad():
            loss = model(input_ids_src=inputs_src, input_ids_tgt=inputs_tgt,
                         attention_mask_src=attention_mask_src, attention_mask_tgt=attention_mask_tgt)
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    # perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"loss": eval_loss}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #10
def train(args, train_dataset, corrects, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    log_dir = os.path.join(
        config.output_dir, 'runs', args.relation,
        os.path.basename(args.output_dir) + '_' + current_time)
    tb_writer = SummaryWriter(log_dir=log_dir)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=args.batch_size,
                                  collate_fn=collate)
    t_total = len(
        train_dataloader) // args.gradient_accumulation_steps * args.epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.batch_size * args.gradient_accumulation_steps,
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.epochs),
                            desc="Epoch",
                            disable=False)
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=False)
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    results = evaluate(args, corrects, model, tokenizer)

                    for key, value in results.items():
                        tb_writer.add_scalar("{}".format(key), value,
                                             global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    # print((tr_loss - logging_loss) / args.logging_steps)

                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

    tb_writer.close()

    return global_step, tr_loss / global_step
Example #11
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    if args.add_fake_english:
        add_shifted_input(eval_dataset.examples, args.special_token_indices, model.config.shift)
    # removing parallel data is not meaningful for evaluation
    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    mycounter = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        if mycounter == 0 or mycounter == 68:
            pass
        if args.invert_order:
            invert(batch, model.config.shift)
        if args.language_specific_positions:
            if args.block_size > 256:
                raise ValueError("Language specific posiiton embeddings can only be <256.")
            position_ids, segment_ids = get_language_specific_positions(batch, model.config.shift, args.block_size)
            position_ids = position_ids.to(args.device)
            segment_ids = segment_ids.to(args.device)
        else:
            position_ids, segment_ids = None, None
        inputs, labels = mask_tokens(batch, tokenizer, args, model) if args.mlm else (batch, batch)
        if args.shift_special_tokens:
            shift_special_tokens(inputs, model.config.shift, args.special_token_indices)
        mycounter += 1
        if mycounter < 5:
            logger.info("")
            logger.info("#" * 10 + " {} ".format(mycounter) + "#"*10)
            logger.info("-" * 30 + " INPUTS")
            logger.info(inputs)
            logger.info("-" * 30 + " POSITIONS")
            logger.info(position_ids)
            logger.info("-" * 30 + " TOKENS")
            logger.info(segment_ids)
            logger.info("-" * 30 + " LABELS")
            logger.info(labels)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels, position_ids=position_ids, token_type_ids=segment_ids) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    if args.eval_output_file is not None:
        output_eval_file = args.eval_output_file
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("{} {} {} {}\n".format(args.output_dir, args.seed, key, result[key]))
    else:
        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #12
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    if args.wiki_dataset:
        collate_fn = functools.partial(collate_wiki, tokenizer)
    else:
        collate_fn = functools.partial(collate, tokenizer)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=args.eval_batch_size,
        collate_fn=collate_fn,
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        if args.eval_subsampling != 1.0 and random.random() >= args.eval_subsampling:
            continue

        if args.wiki_dataset:
            if args.mlm:
                raise RuntimeError("Can't do mlm for wiki dataset")

            tokens, loss_mask = batch
            inputs, labels = (tokens, tokens)

            loss_mask = loss_mask.to(args.device)
            loss_weights = (~loss_mask) + loss_mask * args.title_scale
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            outputs = model(inputs, labels=labels, loss_weights=loss_weights)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        else:
            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)

            with torch.no_grad():
                outputs = model(
                    inputs, masked_lm_labels=labels) if args.mlm else model(
                        inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()

        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    loss = torch.tensor(eval_loss)

    result = {"perplexity": perplexity, "loss": loss}

    if args.eval_creativity_blacklist:
        if not args.parsed_dictionary_dataset:
            raise RuntimeError(
                "Evaluating creativity blacklist with non-parsed dictionary dataset"
            )

        blacklist = datasets.Blacklist.load(args.eval_creativity_blacklist)

        print(
            f"Evaluating creativity over {args.num_eval_creativity} words with {args.eval_creativity_batch_size} batch size"
        )
        s = time.time()
        result.update(
            datasets.ParsedDictionaryDefinitionDataset.evaluate_creativity(
                tokenizer,
                model,
                blacklist,
                args.num_eval_creativity,
                args.eval_creativity_batch_size,
                max_length=args.block_size,
            ))
        print(f"Done evaluating creativity in {time.time() - s}s")

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #13
def export_pytorch(
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    config: OnnxConfig,
    opset: int,
    output: Path,
) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch model to an ONNX Intermediate Representation (IR)

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model to export.
        config ([`~onnx.config.OnnxConfig`]):
            The ONNX configuration associated with the exported model.
        opset (`int`):
            The version of the ONNX operator set to use.
        output (`Path`):
            Directory to store the exported ONNX model.

    Returns:
        `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from
        the ONNX configuration.
    """
    if issubclass(type(model), PreTrainedModel):
        import torch
        from torch.onnx import export as onnx_export

        logger.info(f"Using framework PyTorch: {torch.__version__}")
        with torch.no_grad():
            model.config.return_dict = True
            model.eval()

            # Check if we need to override certain configuration items
            if config.values_override is not None:
                logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
                for override_config_key, override_config_value in config.values_override.items():
                    logger.info(f"\t- {override_config_key} -> {override_config_value}")
                    setattr(model.config, override_config_key, override_config_value)

            # Ensure inputs match
            # TODO: Check when exporting QA we provide "is_pair=True"
            model_inputs = config.generate_dummy_inputs(tokenizer, framework=TensorType.PYTORCH)
            inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
            onnx_outputs = list(config.outputs.keys())

            if not inputs_match:
                raise ValueError("Model and config inputs doesn't match")

            config.patch_ops()

            # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
            # so we check the torch version for backwards compatibility
            if parse(torch.__version__) < parse("1.10"):
                # export can work with named args but the dict containing named args
                # has to be the last element of the args tuple.
                try:
                    onnx_export(
                        model,
                        (model_inputs,),
                        f=output.as_posix(),
                        input_names=list(config.inputs.keys()),
                        output_names=onnx_outputs,
                        dynamic_axes={
                            name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())
                        },
                        do_constant_folding=True,
                        use_external_data_format=config.use_external_data_format(model.num_parameters()),
                        enable_onnx_checker=True,
                        opset_version=opset,
                    )
                except RuntimeError as err:
                    message = str(err)
                    if (
                        message
                        == "Exporting model exceed maximum protobuf size of 2GB. Please call torch.onnx.export without setting use_external_data_format parameter."
                    ):
                        message = "Exporting model exceed maximum protobuf size of 2GB. Please call torch.onnx.export without setting use_external_data_format parameter or try with torch 1.10+."
                        raise RuntimeError(message)
                    else:
                        raise err
            else:
                onnx_export(
                    model,
                    (model_inputs,),
                    f=output.as_posix(),
                    input_names=list(config.inputs.keys()),
                    output_names=onnx_outputs,
                    dynamic_axes={name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())},
                    do_constant_folding=True,
                    opset_version=opset,
                )

            config.restore_ops()

    return matched_inputs, onnx_outputs
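A hedged usage sketch for the helper above. The checkpoint name, the FeaturesManager lookup, the opset value and the output path are illustrative assumptions and may differ across transformers versions:

from pathlib import Path

from transformers import AutoModel, AutoTokenizer
from transformers.onnx.features import FeaturesManager

checkpoint = "distilbert-base-uncased"                      # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Look up an OnnxConfig class suitable for this architecture (API as of transformers ~4.x)
_, onnx_config_cls = FeaturesManager.check_supported_model_or_raise(model, feature="default")
onnx_config = onnx_config_cls(model.config)

onnx_inputs, onnx_outputs = export_pytorch(
    tokenizer, model, onnx_config, opset=13, output=Path("model.onnx")
)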
Example #14
0
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """

    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch")
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, label_ids = batch
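            # Tile each example's label id across the sequence; only [MASK] positions will keep a label below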
            labels = label_ids.repeat((inputs.shape[1], 1)).T
            masks = inputs.eq(tokenizer.mask_token_id)
            labels[~masks] = -100
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:

                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()

    return global_step, tr_loss / global_step
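The label tensor built in the training loop above relies on -100 being the default ignore_index of PyTorch's cross-entropy, so only [MASK] positions contribute to the masked-LM loss. A toy illustration:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)                     # 4 token positions, toy vocabulary of 10
labels = torch.tensor([3, -100, 7, -100])       # positions 1 and 3 carry no label
loss = F.cross_entropy(logits, labels)          # averaged over the two non-ignored positions only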
Example #15
0
def save_preds(args,
               data_generator,
               tb_writer,
               model: PreTrainedModel,
               tokenizer: PreTrainedTokenizer,
               global_step,
               prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    criterion = nn.BCEWithLogitsLoss()

    eval_dataset = data_generator.instance_a_lb_dataset()

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(batch):
        # if tokenizer._pad_token is None:
        #     return pad_sequence(examples, batch_first=True)
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        tweet_ids = [b[3] for b in batch]
        user_ids = [b[4] for b in batch]
        inputs = [b[2] for b in batch]

        lens = [len(x) for x in inputs]

        inputs = pad_sequence(inputs,
                              batch_first=True,
                              padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()

        tokens, features = [torch.tensor(x) for x in [tokens, features]]

        return tokens, features, tweet_ids, user_ids, inputs, attention_mask, torch.tensor(
            lens).unsqueeze(1)

    if args.use_bucket_iterator:
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        eval_sampler = BySequenceLengthSampler(eval_dataset,
                                               bucket_boundaries,
                                               batch_size=args.eval_batch_size,
                                               drop_last=False)
        eval_dataloader = DataLoader(eval_dataset,
                                     batch_size=1,
                                     batch_sampler=eval_sampler,
                                     collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #    model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    nb_eval_steps = 0
    model.eval()

    tweets, users, preds = [], [], []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # training loop
        tokens, features, tweet_ids, user_ids, inputs, attention_mask, lens = batch

        tokens, features, inputs, attention_mask, lens = [
            x.to(args.device)
            for x in [tokens, features, inputs, attention_mask, lens]
        ]

        tokens, features = [x.float() for x in [tokens, features]]

        with torch.no_grad():
            logit = model(tokens, features, inputs, attention_mask, lens)
            pred = torch.sigmoid(logit).detach().cpu().numpy()
            tweets += tweet_ids
            users += user_ids
            preds.append(pred)

        nb_eval_steps += 1

        #if nb_eval_steps == 10:
        #    break

    tweets = np.array(tweets)
    users = np.array(users)
    preds = np.float64(np.vstack(preds))
    print(tweets.shape, users.shape, preds.shape)
    print(tweets[0:10])
    print(users[0:10])

    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        preds_i = preds[:, i]
        print(preds_i.shape)
        with open(
                args.test_inference_path + "submission_{}.csv".format(engage),
                "w") as f:
            for k in range(preds_i.shape[0]):
                f.write(
                    str(tweets[k]) + "," + str(users[k]) + "," +
                    str(preds_i[k]) + "\n")
            print("Saved to csv the predictions for task {}".format(engage))
Example #16
0
    def __init__(self,
                 model: SentenceTransformer,
                 decoder_name_or_path: str = None,
                 tie_encoder_decoder: bool = True):
        """
        :param model: SentenceTransformer model
        :param decoder_name_or_path: Model name or path for initializing a decoder (compatible with Huggingface's Transformers)
        :param tie_encoder_decoder: whether to tie the trainable parameters of encoder and decoder
        """
        super(DenoisingAutoEncoderLoss, self).__init__()
        self.encoder = model  # This will be the final model used during the inference time.
        self.tokenizer_encoder = model.tokenizer

        encoder_name_or_path = model[0].auto_model.config._name_or_path
        if decoder_name_or_path is None:
            assert tie_encoder_decoder, "Must indicate the decoder_name_or_path argument when tie_encoder_decoder=False!"
        if tie_encoder_decoder:
            if decoder_name_or_path:
                logger.warning(
                    'When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.'
                )
            decoder_name_or_path = encoder_name_or_path

        self.tokenizer_decoder = AutoTokenizer.from_pretrained(
            decoder_name_or_path)
        self.need_retokenization = not (type(self.tokenizer_encoder) == type(
            self.tokenizer_decoder))

        decoder_config = AutoConfig.from_pretrained(decoder_name_or_path)
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True
        kwargs_decoder = {'config': decoder_config}
        try:
            self.decoder = AutoModelForCausalLM.from_pretrained(
                decoder_name_or_path, **kwargs_decoder)
        except ValueError as e:
            logger.error(
                f'Model name or path "{decoder_name_or_path}" does not support being used as a decoder. Please make sure the decoder model has an "XXXLMHead" class.'
            )
            raise e
        assert model[
            0].auto_model.config.hidden_size == decoder_config.hidden_size, 'Hidden sizes do not match!'
        if self.tokenizer_decoder.pad_token is None:
            # Needed by GPT-2, etc.
            self.tokenizer_decoder.pad_token = self.tokenizer_decoder.eos_token
            self.decoder.config.pad_token_id = self.decoder.config.eos_token_id

        if len(AutoTokenizer.from_pretrained(encoder_name_or_path)) != len(
                self.tokenizer_encoder):
            logger.warning(
                'WARNING: The vocabulary of the encoder has been changed. One might need to change the decoder vocabulary, too.'
            )

        if tie_encoder_decoder:
            assert not self.need_retokenization, "The tokenizers should be the same when tie_encoder_decoder=True."
            if len(self.tokenizer_encoder) != len(
                    self.tokenizer_decoder
            ):  # The vocabulary has been changed.
                self.tokenizer_decoder = self.tokenizer_encoder
                self.decoder.resize_token_embeddings(
                    len(self.tokenizer_decoder))
                logger.warning(
                    'Since the encoder vocabulary has been changed and --tie_encoder_decoder=True, now the new vocabulary has also been used for the decoder.'
                )
            decoder_base_model_prefix = self.decoder.base_model_prefix
            PreTrainedModel._tie_encoder_decoder_weights(
                model[0].auto_model,
                self.decoder._modules[decoder_base_model_prefix],
                self.decoder.base_model_prefix)
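A hedged usage sketch for the constructor above, following the sentence-transformers TSDAE recipe; the checkpoint name, the DenoisingAutoEncoderDataset helper and the fit() arguments are assumptions, not part of this snippet:

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, datasets, losses

model = SentenceTransformer("bert-base-uncased")            # assumed encoder checkpoint
train_sentences = ["A first example sentence.", "A second example sentence."]
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

loss = losses.DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)
model.fit(train_objectives=[(train_dataloader, loss)], epochs=1, weight_decay=0, scheduler="constantlr")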
Example #17
0
def train(args, data, datasets, model: PreTrainedModel, original_model,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    train_datasets = datasets['train']
    dev_datasets = datasets['dev']

    train_dataloaders, train_example_num, train_distribution = create_dataloader(
        args, train_datasets, tokenizer, train=True)
    dev_dataloaders, dev_example_num, dev_distribution = create_dataloader(
        args, dev_datasets, tokenizer, train=False)

    train_iter_num = sum(
        [len(dataloader) for dataloader in train_dataloaders.values()])
    dev_iter_num = sum(
        [len(dataloader) for dataloader in dev_dataloaders.values()])

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            train_iter_num // args.gradient_accumulation_steps) + 1
    else:
        t_total = train_iter_num // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    original_model = original_model.module if hasattr(
        original_model, "module"
    ) else original_model  # Take care of distributed/parallel training
    original_model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
        original_model = torch.nn.DataParallel(original_model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
        original_model = torch.nn.parallel.DistributedDataParallel(
            original_model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_example_num)
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    best_loss = float('inf')
    best_step = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (train_iter_num //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                train_iter_num // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    model.zero_grad()
    original_model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])

    def inner_product(x, y):
        return torch.mean(torch.sum(y * x, 3))

    def mean_square(x, y, idx):
        return torch.mean(torch.mean((y - x)**2, idx))
        #return torch.mean(torch.sum((y - x) ** 2, 3))

    def save_best_model(best_loss, best_step, dev_dataloaders):
        if (
                args.local_rank == -1 and args.evaluate_during_training
        ):  # Only evaluate when single GPU otherwise metrics may not average well
            eval_loss = evaluate(model, attributes_hiddens, dev_dataloaders)
            #eval_loss = evaluate(args, model, original_model, dev_dataloaders, dev_example_num, dev_distribution, criterion_mse, criterion_ip, feminine_hiddens, masculine_hiddens, gender_hiddens)
            logger.info(" global_step = %s, evaluate loss = %s", global_step,
                        eval_loss)
            tb_writer.add_scalar("eval_loss", eval_loss, global_step)
        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)

        if eval_loss < best_loss:
            best_loss = eval_loss
            best_step = global_step
            checkpoint_prefix = "checkpoint"
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "checkpoint-best")
            os.makedirs(output_dir, exist_ok=True)
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            torch.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to %s", output_dir)

            #_rotate_checkpoints(args, checkpoint_prefix)

            torch.save(optimizer.state_dict(),
                       os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(),
                       os.path.join(output_dir, "scheduler.pt"))
            logger.info("Saving optimizer and scheduler states to %s",
                        output_dir)
        logger.info(" best_step = %s, best loss = %s", best_step, best_loss)

        return best_loss, best_step

    def get_hiddens_of_model(input):
        model.zero_grad()
        if args.model_type == 'roberta':
            _, _, hiddens = model.roberta(input)
        elif args.model_type == 'bert':
            _, _, hiddens = model.bert(input)
        elif args.model_type == 'albert':
            _, _, hiddens = model.albert(input)
        elif args.model_type == 'dbert':
            _, hiddens = model.distilbert(input)
        elif args.model_type == 'electra':
            _, hiddens = model.electra(input)
        elif args.model_type == 'gpt2':
            _, _, hiddens = model.transformer(input)
        elif args.model_type == 'gpt':
            _, hiddens = model.transformer(input)

        return hiddens

    def attribute_vector_example():
        attributes_hiddens = {f'attribute{i}': [] for i in range(2)}

        dataloaders, _, distribution = create_dataloader(args,
                                                         train_datasets,
                                                         tokenizer,
                                                         train=True)
        for key in distribution:
            if key != 'neutral':
                inputs, labels = next(dataloaders[key])
                inputs = inputs.to(args.device)
                hiddens = get_hiddens_of_model(inputs)
                hiddens = torch.stack(hiddens, 2)
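                # Build a (batch, tokens, 1, 1) one-hot mask from the label positions (index 0 = padding)
                # and average the hidden states at those positions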
                if labels.size(1) > 1:
                    onehot = torch.eye(hiddens.size(1))
                    zeros = torch.zeros(1, onehot.size(0))
                    onehot = torch.cat((zeros, onehot), 0)
                    onehot = onehot[labels]
                    onehot = torch.sum(onehot, 1)
                    onehot = onehot.view(hiddens.size(0), -1, 1, 1)
                else:
                    onehot = torch.eye(hiddens.size(1))[labels].view(
                        hiddens.size(0), -1, 1, 1)
                onehot = onehot.to(args.device)
                attributes_hiddens[key].append(
                    torch.sum(hiddens * onehot, 1) / labels.size(1))

        # 'neutral' is also included here
        attribute_size = len(data['train']['example'])
        for i in range(attribute_size - 1):
            attributes_hiddens[f'attribute{i}'] = torch.mean(
                torch.cat(attributes_hiddens[f'attribute{i}'], 0),
                0).detach().unsqueeze(0)

        return attributes_hiddens

    def forward(attributes_hiddens, dataloaders, key):
        inputs = next(dataloaders[key])
        if len(inputs) == 2:
            inputs, labels = inputs
            labels = labels.to(args.device)
        else:
            labels = None
        inputs = inputs.to(args.device)
        if args.model_type == 'roberta':
            final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.roberta(
                inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.roberta(
                        inputs)
                if args.token_loss:
                    token_predicts = model.lm_head(final_layer_hiddens)
                    token_original = original_model.lm_head(
                        final_layer_original_hiddens)
        elif args.model_type == 'bert':
            final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.bert(
                inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.bert(
                        inputs)
                if args.token_loss:
                    token_predicts = model.cls(final_layer_hiddens)
                    token_original = original_model.cls(
                        final_layer_original_hiddens)
        elif args.model_type == 'albert':
            final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.albert(
                inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.albert(
                        inputs)
                if args.token_loss:
                    token_predicts = model.classifier(final_layer_hiddens)
                    token_original = original_model.classifier(
                        final_layer_original_hiddens)
        elif args.model_type == 'dbert':
            final_layer_hiddens, all_layer_hiddens = model.distilbert(inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, all_layer_original_hiddens = original_model.distilbert(
                        inputs)
                if args.token_loss:
                    token_predicts = model.classifier(final_layer_hiddens)
                    token_original = original_model.classifier(
                        final_layer_original_hiddens)
        elif args.model_type == 'electra':
            final_layer_hiddens, all_layer_hiddens = model.electra(inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, all_layer_original_hiddens = original_model.electra(
                        inputs)
                if args.token_loss:
                    hiddens = model.generator_predictions(final_layer_hiddens)
                    token_predicts = model.generator_lm_head(hiddens)
                    original_hiddens = original_model.generator_predictions(
                        final_layer_original_hiddens)
                    token_original = original_model.generator_lm_head(
                        original_hiddens)
        elif args.model_type == 'gpt2':
            final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.transformer(
                inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.transformer(
                        inputs)
                if args.token_loss:
                    token_predicts = model.lm_head(final_layer_hiddens)
                    token_original = original_model.lm_head(
                        final_layer_original_hiddens)
        elif args.model_type == 'gpt':
            final_layer_hiddens, all_layer_hiddens = model.transformer(inputs)
            if 'neutral' != key:
                with torch.no_grad():
                    final_layer_original_hiddens, all_layer_original_hiddens = original_model.transformer(
                        inputs)
                if args.token_loss:
                    token_predicts = model.lm_head(final_layer_hiddens)
                    token_original = original_model.lm_head(
                        final_layer_original_hiddens)

        all_layer_hiddens = torch.stack(all_layer_hiddens, 2)
        if 'neutral' != key:
            all_original_hiddens = torch.stack(all_layer_original_hiddens, 2)
            all_original_hiddens = all_original_hiddens.detach()
            if args.token_loss:
                original_hiddens = original_hiddens.detach()
                token_original = token_original.detach()
        if args.debias_layer == 'all':
            target_layer_hiddens = all_layer_hiddens
            target_original_hiddens = all_layer_hiddens
        else:
            if args.debias_layer == 'first':
                idx = 0
            elif args.debias_layer == 'last':
                idx = -1
            target_layer_hiddens = all_layer_hiddens[:, :, idx]
            target_layer_hiddens = target_layer_hiddens.unsqueeze(2)
            if 'neutral' != key:
                target_original_hiddens = all_original_hiddens[:, :, idx]
                target_original_hiddens = target_original_hiddens.unsqueeze(2)
            else:
                attributes_hiddens = {
                    key: value[:, idx, :].unsqueeze(1)
                    for key, value in attributes_hiddens.items()
                }

        if args.loss_target == 'sentence' or labels is None:
            attributes_hiddens = {
                key: value.unsqueeze(1)
                for key, value in attributes_hiddens.items()
            }
        #elif args.loss_target == 'token' and key == 'neutral':
        elif args.loss_target == 'token':
            if labels.size(1) > 1:
                onehot = torch.eye(target_layer_hiddens.size(1))
                zeros = torch.zeros(1, onehot.size(0))
                onehot = torch.cat((zeros, onehot), 0)
                onehot = onehot[labels]
                onehot = torch.sum(onehot, 1)
                onehot = onehot.view(target_layer_hiddens.size(0), -1, 1, 1)
            else:
                onehot = torch.eye(target_layer_hiddens.size(1))[labels].view(
                    target_layer_hiddens.size(0), -1, 1, 1)
            onehot = onehot.to(args.device)
            target_layer_hiddens = torch.sum(target_layer_hiddens * onehot,
                                             1).unsqueeze(1) / labels.size(1)
            if 'neutral' != key:
                target_original_hiddens = torch.sum(
                    target_original_hiddens * onehot,
                    1).unsqueeze(1) / labels.size(1)
            else:
                attributes_hiddens = {
                    key: value.expand(target_layer_hiddens.size(0), 1,
                                      value.size(1), value.size(2))
                    for key, value in attributes_hiddens.items()
                }

        if 'neutral' == key:
            loss = 0
            for attribute_hiddens in attributes_hiddens.values():
                tmp_loss = criterion_ip(target_layer_hiddens,
                                        attribute_hiddens)
                if args.square_loss:
                    tmp_loss = tmp_loss**2
                tmp_loss *= alpha
                loss += tmp_loss
        else:
            #loss = criterion_ms(target_layer_hiddens, target_original_hiddens)
            loss = criterion_ms(all_layer_hiddens, all_original_hiddens, 3)
            if args.token_loss:
                loss += criterion_ms(token_predicts, token_original, 2)
                #loss += criterion_ms(hiddens, original_hiddens, 2)
            loss *= beta

        return loss

    #def evaluate(args, model: PreTrainedModel, original_model, dev_dataloaders, dev_example_num, dev_distribution, criterion_mse, criterion_ip, feminine_hiddens, masculine_hiddens, gender_hiddens, prefix="") -> Dict:
    def evaluate(model, attributes_hiddens, dev_dataloaders, prefix=""):
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_output_dir = args.output_dir

        if args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir, exist_ok=True)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly

        # multi-gpu evaluate
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", dev_example_num)
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        model.eval()
        #criterion.eval()

        for key in tqdm(dev_distribution):
            with torch.no_grad():
                loss = forward(attributes_hiddens, dev_dataloaders, key)

                eval_loss += loss.item()

                model.zero_grad()
                original_model.zero_grad()

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        '''
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            logger.info("  Loss = %s", eval_loss)
            writer.write("Loss = %s\n" % (eval_loss))
        '''

        return eval_loss

    #criterion_ms = torch.nn.MSELoss()
    criterion_ms = mean_square
    #criterion.train()
    criterion_ip = inner_product
    original_model.eval()

    alpha, beta = args.weighted_loss
    alpha = float(alpha)
    beta = float(beta)

    train_loss = 0.0

    for _ in train_iterator:

        random.shuffle(train_distribution)
        epoch_iterator = tqdm(train_distribution,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])

        model.eval()
        with torch.no_grad():
            attributes_hiddens = attribute_vector_example()

        for step, key in enumerate(epoch_iterator):
            model.train()

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            loss = forward(attributes_hiddens, train_dataloaders, key)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            train_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                original_model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logger.info(" global_step = %s, train loss = %s",
                                global_step, train_loss)
                    train_loss = 0.0
                    # Log metrics
                    best_loss, best_step = save_best_model(
                        best_loss, best_step, dev_dataloaders)
                    dev_dataloaders, dev_example_num, dev_distribution = create_dataloader(
                        args, dev_datasets, tokenizer, train=False)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
            train_dataloaders, train_example_num, train_distribution = create_dataloader(
                args, train_datasets, tokenizer, train=True)

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    dev_dataloaders, dev_example_num, dev_distribution = create_dataloader(
        args, dev_datasets, tokenizer, train=False)
    best_loss, best_step = save_best_model(best_loss, best_step,
                                           dev_dataloaders)

    if args.local_rank in [-1, 0]:
        tb_writer.close()
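The two criteria defined in the example above reduce to simple tensor reductions over the layer-stacked hidden states. A toy illustration with assumed (batch, tokens, layers, hidden) shapes:

import torch

hiddens = torch.randn(2, 5, 13, 768)         # assumed layout: (batch, tokens, layers, hidden)
attribute = torch.randn(2, 5, 13, 768)       # an attribute vector broadcast to the same shape

ip = torch.mean(torch.sum(attribute * hiddens, 3))           # same reduction as inner_product(attribute, hiddens)
mse = torch.mean(torch.mean((attribute - hiddens) ** 2, 3))  # same reduction as mean_square(hiddens, attribute, 3)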
Example #18
0
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)

    evaluation_loss = dict()

    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    # Adjusting the eval batch size to the number of train epochs would let the
    # train and eval loss curves be plotted over the same number of points and
    # keep the eval-loss curve within the plot frame:
    # args.eval_batch_size = int(len(eval_dataset) / args.num_train_epochs)

    # Instead, keep the standard per-GPU eval batch size:
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) \
            if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) \
                if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    # write for each batch
    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    print('\n--------------------------')

    result = {
        "perplexity": perplexity,
        "eval_loss": eval_loss,
        "eval_steps": nb_eval_steps
    }

    output_eval_file = os.path.join(FINETUNE_DIR, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    print('----------------------------')

    return result
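The collate function above pads variable-length examples to a common length. A toy illustration (padding value assumed to be 0):

import torch
from torch.nn.utils.rnn import pad_sequence

examples = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]
batch = pad_sequence(examples, batch_first=True, padding_value=0)
# batch -> tensor([[5, 6, 7],
#                  [8, 9, 0]])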
Example #19
0
def train(args, train_dataset, eval_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer, run_batch_fn_train,
          run_batch_fn_eval) -> Tuple[int, float]:
    if args.local_rank in [-1, 0]:
        log_dir = os.path.join("runs",
                               args.exp_name) if args.exp_name else None
        tb_writer = SummaryWriter(log_dir)
        args.output_dir = log_dir

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=train_dataset.collate_fn)

    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs

    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    global_step = 0
    model.zero_grad()
    train_iterator = trange(0,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # for reproducibility

    for _ in train_iterator:
        local_steps = 0
        tr_loss = 0.0
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            loss, _, _, _ = run_batch_fn_train(args, model, batch)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                local_steps += 1
                epoch_iterator.set_postfix(Loss=tr_loss / local_steps)

        results = evaluate(args,
                           eval_dataset,
                           model,
                           tokenizer,
                           run_batch_fn_eval,
                           desc=str(global_step))
        if args.local_rank in [-1, 0]:
            for key, value in results.items():
                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
            tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
            tb_writer.add_scalar("loss", tr_loss / local_steps, global_step)

            checkpoint_prefix = "checkpoint"
            # Save model checkpoint
            output_dir = os.path.join(
                args.output_dir, "{}-{}".format(checkpoint_prefix,
                                                global_step))
            os.makedirs(output_dir, exist_ok=True)
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training

            logger.info("Saving model checkpoint to %s", output_dir)
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            torch.save(args, os.path.join(output_dir, "training_args.bin"))
            with open(os.path.join(output_dir, "params.json"),
                      "w") as jsonfile:
                json.dump(args.params,
                          jsonfile,
                          indent=2,
                          default=lambda x: str(x))
            logger.info("Saving model checkpoint to %s", output_dir)

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / local_steps
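The training loops in these examples all follow the same gradient-accumulation pattern: scale the loss by 1/accumulation_steps and step the optimizer only every accumulation_steps micro-batches, emulating a larger effective batch size. A minimal self-contained sketch with toy data:

import torch
import torch.nn.functional as F

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
accumulation_steps = 4

for step in range(16):
    x, y = torch.randn(8, 4), torch.randn(8, 1)
    loss = F.mse_loss(model(x), y) / accumulation_steps   # scale so summed gradients match one large batch
    loss.backward()                                       # gradients accumulate across micro-batches
    if (step + 1) % accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()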
Example #20
0
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, train_dataset_second, DP_classifier) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
    

    train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )
    
    correct_sampler = SequentialSampler(train_dataset_second) if args.local_rank == -1 else DistributedSampler(train_dataset_second)
    correct_dataloader = DataLoader(
        train_dataset_second, sampler=correct_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    wrong_sampler = RandomSampler(train_dataset_second) if args.local_rank == -1 else DistributedSampler(train_dataset_second)
    wrong_dataloader = DataLoader(
        train_dataset_second, sampler=wrong_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )


    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)] + [p for n, p in DP_classifier.named_parameters()],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    #scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    DP_classifier.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    
    zipped_data = zip(train_dataloader, correct_dataloader, wrong_dataloader)
    
    correct_mc_tensor = torch.ones(args.train_batch_size, dtype=torch.float)
    correct_mc_tensor = correct_mc_tensor.to(args.device)
    wrong_mc_tensor = torch.zeros(args.train_batch_size, dtype=torch.float)
    wrong_mc_tensor = wrong_mc_tensor.to(args.device)
    
    print(correct_mc_tensor)
    print(wrong_mc_tensor)
    
    accumulated_lm_loss = 0.0
    accumulated_mc_loss = 0.0
    
    for _ in train_iterator:
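        # Rebuild the three dataloaders every epoch: sequential sampling keeps the original and
        # correct-next streams aligned with each other, while the wrong stream is reshuffled so
        # each epoch pairs sentences with fresh random negatives.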
        train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
        )
        
        correct_sampler = SequentialSampler(train_dataset_second) if args.local_rank == -1 else DistributedSampler(train_dataset_second)
        correct_dataloader = DataLoader(
            train_dataset_second, sampler=correct_sampler, batch_size=args.train_batch_size, collate_fn=collate
        )
    
        wrong_sampler = RandomSampler(train_dataset_second) if args.local_rank == -1 else DistributedSampler(train_dataset_second)
        wrong_dataloader = DataLoader(
            train_dataset_second, sampler=wrong_sampler, batch_size=args.train_batch_size, collate_fn=collate
        )
        zipped_data = zip(train_dataloader, correct_dataloader, wrong_dataloader)
        epoch_iterator = tqdm(zipped_data, desc="Iteration", disable=args.local_rank not in [-1, 0], total=len(train_dataloader))
        for step, zipped_batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            DP_classifier.train()

            # Unpack the zipped batch: original sentence, aligned correct next sentence,
            # and a randomly sampled wrong sentence.
            batch, correct_batch, wrong_batch = zipped_batch

            # First: original sentence
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            labels = inputs.clone()
            
            # Find the first [CLS] token in each sequence: exclude it from the LM loss (-100)
            # and remember its position so its hidden state can be pooled as a sentence embedding.
            cls_pos = []
            for curr in labels:
                for idx, tk in enumerate(curr):
                    if tk == tokenizer.cls_token_id:
                        curr[idx] = -100
                        cls_pos.append(idx)
                        break

            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            
            outputs = model(inputs, lm_labels=labels)
            loss_lm_1 = outputs[0]
            hidden_1 = outputs[3]
            
            sentence_embed_1_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_1)]
            sentence_embed_1 = torch.cat(sentence_embed_1_pieces)

            # Second: correct next sentence
            correct_input = correct_batch
            correct_labels = correct_input.clone()
            
            cls_pos = []
            for curr in correct_labels:
                for idx, tk in enumerate(curr):
                    if tk == tokenizer.cls_token_id:
                        curr[idx] = -100
                        cls_pos.append(idx)
                        break

            correct_input = correct_input.to(args.device)
            correct_labels = correct_labels.to(args.device)
            
            outputs = model(correct_input, lm_labels=correct_labels)

            loss_lm_2 = outputs[0]
            hidden_2 = outputs[3]
            sentence_embed_2_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_2)]
            sentence_embed_2 = torch.cat(sentence_embed_2_pieces)

            # Get correct loss
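            # Randomize the argument order so the classifier cannot key on which slot
            # holds the original sentence.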
            if random.randint(0, 1) == 1:
                outputs = DP_classifier(sentence_embed_1, sentence_embed_2, correct_mc_tensor)
            else:
                outputs = DP_classifier(sentence_embed_2, sentence_embed_1, correct_mc_tensor)
            loss_mc = outputs[0]
            
            # MC_LOSS SCALING
            SCALING = 0.05
            loss_lm = loss_lm_1 + loss_lm_2
            
            
            #loss = loss_lm
            loss_first = loss_lm + SCALING * loss_mc
            #print("loss_mc: ", loss_mc.item())
            #print("loss_lm: ", loss_lm.item())
            
            accumulated_lm_loss += loss_lm.item() / 2.0
            accumulated_mc_loss += SCALING * loss_mc.item()
            
            # Second loss: wrong next sentence randomly sampled from training set
            wrong_input = wrong_batch
            wrong_labels = wrong_input.clone()
            
            cls_pos = []
            for curr in wrong_labels:
                for idx, tk in enumerate(curr):
                    if tk == tokenizer.cls_token_id:
                        curr[idx] = -100
                        cls_pos.append(idx)
                        break

            
            wrong_input = wrong_input.to(args.device)
            wrong_labels = wrong_labels.to(args.device)
            
            outputs = model(wrong_input, lm_labels=wrong_labels)

            loss_lm_3 = outputs[0]
            hidden_3 = outputs[3]
            sentence_embed_3_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_3)]
            sentence_embed_3 = torch.cat(sentence_embed_3_pieces)

            
            if random.randint(0, 1) == 1:
                outputs = DP_classifier(sentence_embed_1, sentence_embed_3, wrong_mc_tensor)
            else:
                outputs = DP_classifier(sentence_embed_3, sentence_embed_1, wrong_mc_tensor)
            loss_mc = outputs[0]
            
            #loss = loss_lm
            loss_second = loss_lm_3 + SCALING * loss_mc
            #print("loss_mc: ", loss_mc.item())
            #print("loss_lm: ", loss_lm.item())
            accumulated_mc_loss += SCALING * loss_mc.item()
            
            # Total loss
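            # Combined objective: LM loss on all three inputs plus the scaled classifier
            # loss on one positive and one negative sentence pair.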
            loss = loss_first + loss_second

            SKIP_STEP = 50
            if step % SKIP_STEP == 0:
                print(' iter %d, avg. lm_loss %.2f, avg. mc_loss %.2f, avg. ppl %.2f ' % (
                    step,
                    accumulated_lm_loss / SKIP_STEP,
                    accumulated_mc_loss / SKIP_STEP,
                    math.exp(loss_lm.item() / 2),
                ), file=sys.stderr)
                tb_writer.add_scalar("training_lm_loss", accumulated_lm_loss / SKIP_STEP, global_step)
                tb_writer.add_scalar("training_mc_loss", accumulated_mc_loss / SKIP_STEP, global_step)
                accumulated_lm_loss = 0.0
                accumulated_mc_loss = 0.0
                

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                    torch.nn.utils.clip_grad_norm_(DP_classifier.parameters(), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    torch.nn.utils.clip_grad_norm_(DP_classifier.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                DP_classifier.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, DP_classifier)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(DP_classifier, os.path.join(output_dir, "DP_classifier.bin"))
                    
                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
def export(tokenizer: PreTrainedTokenizer, model: PreTrainedModel,
           config: OnnxConfig, opset: int,
           output: Path) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR

    Args:
        tokenizer:
        model:
        config:
        opset:
        output:

    Returns:

    """
    if not is_torch_available():
        raise ImportError(
            "Cannot convert because PyTorch is not installed. Please install torch first."
        )

    import torch
    from torch.onnx import export

    from ..file_utils import torch_version

    if not is_torch_onnx_dict_inputs_support_available():
        raise AssertionError(
            f"Unsupported PyTorch version, minimum required is 1.8.0, got: {torch_version}"
        )

    logger.info(f"Using framework PyTorch: {torch.__version__}")
    with torch.no_grad():
        model.config.return_dict = True
        model.eval()

        # Check if we need to override certain configuration item
        if config.values_override is not None:
            logger.info(
                f"Overriding {len(config.values_override)} configuration item(s)"
            )
            for override_config_key, override_config_value in config.values_override.items():
                logger.info(
                    f"\t- {override_config_key} -> {override_config_value}")
                setattr(model.config, override_config_key,
                        override_config_value)

        # Ensure inputs match
        # TODO: Check when exporting QA we provide "is_pair=True"
        model_inputs = config.generate_dummy_inputs(
            tokenizer, framework=TensorType.PYTORCH)
        inputs_match, matched_inputs = ensure_model_and_config_inputs_match(
            model, model_inputs.keys())
        onnx_outputs = list(config.outputs.keys())

        if not inputs_match:
            raise ValueError("Model and config inputs do not match")

        # Temporarily patch operators that the ONNX exporter cannot trace; restored below.
        config.patch_ops()

        # export can work with named args, but the dict containing the named args has to be the last element of the args tuple
        export(
            model,
            (model_inputs, ),
            f=output.as_posix(),
            input_names=list(config.inputs.keys()),
            output_names=onnx_outputs,
            dynamic_axes={
                name: axes
                for name, axes in chain(config.inputs.items(),
                                        config.outputs.items())
            },
            do_constant_folding=True,
            use_external_data_format=config.use_external_data_format(
                model.num_parameters()),
            enable_onnx_checker=True,
            opset_version=opset,
        )

        config.restore_ops()

    return matched_inputs, onnx_outputs
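Below is a minimal sketch of how this export helper might be invoked. The checkpoint name is
purely illustrative, and DistilBertOnnxConfig together with its default_onnx_opset property are
assumptions about the installed transformers version (the transformers.onnx machinery has been
reorganized across releases), so verify the imports against your local install before relying on it.

from pathlib import Path

from transformers import AutoModel, AutoTokenizer
# Assumption: DistilBertOnnxConfig is exposed by the models subpackage in this transformers
# version; substitute the OnnxConfig subclass that matches your model if it is not.
from transformers.models.distilbert import DistilBertOnnxConfig

checkpoint = "distilbert-base-uncased"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# OnnxConfig subclass describing the model's ONNX input/output signature.
onnx_config = DistilBertOnnxConfig(model.config)

matched_inputs, onnx_outputs = export(
    tokenizer,
    model,
    onnx_config,
    opset=onnx_config.default_onnx_opset,  # assumed property; a literal opset such as 11 also works
    output=Path("distilbert.onnx"),
)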
Example #22
0
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, DP_classifier, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True, doubling=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        
        # Exclude [CLS] tokens in each sequence from the LM loss
        labels = inputs.clone()
        for curr in labels:
            for idx, tk in enumerate(curr):
                if tk == tokenizer.cls_token_id:
                    curr[idx] = -100

        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, lm_labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}
    
    ###### Evaluate NSP accuracy
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    eval_dataset_second = load_and_cache_examples(args, tokenizer, evaluate=True, second=True)
    
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )
    
    eval_correct_sampler = SequentialSampler(eval_dataset_second)
    eval_correct_dataloader = DataLoader(
        eval_dataset_second, sampler=eval_correct_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )
    
    eval_wrong_sampler = RandomSampler(eval_dataset_second)
    eval_wrong_dataloader = DataLoader(
        eval_dataset_second, sampler=eval_wrong_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )
    
    nb_eval_steps = 0
    num_correctly_predicted = 0
    num_wrongly_predicted = 0
    for zipped_batch in tqdm(zip(eval_dataloader, eval_correct_dataloader, eval_wrong_dataloader), desc="Evaluating", total=len(eval_dataloader)):
        batch, correct_batch, wrong_batch = zipped_batch        
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        
        second_input = None
        if_correct = False
        if random.randint(0, 1) == 1:
            second_input = correct_batch
            if_correct = True
        else:
            second_input = wrong_batch
            if_correct = False
        
        cls_pos = []
        for curr in inputs:
            for idx, tk in enumerate(curr):
                if tk == tokenizer.cls_token_id:
                    cls_pos.append(idx)
                    break
        inputs = inputs.to(args.device)            
        with torch.no_grad():
            outputs = model(inputs)
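        # With no labels passed, the hidden states come back at index 2 of the output tuple
        # (index 3 when a loss is also returned, as in the training loop above).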
        hidden_1 = outputs[2]
        sentence_embed_1_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_1)]
        sentence_embed_1 = torch.cat(sentence_embed_1_pieces)
        

        cls_pos = []
        for curr in second_input:
            for idx, tk in enumerate(curr):
                if tk == tokenizer.cls_token_id:
                    cls_pos.append(idx)
                    break
        
        second_input = second_input.to(args.device)
        with torch.no_grad():
            outputs = model(second_input)
        hidden_2 = outputs[2]
        
        sentence_embed_2_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_2)]
        sentence_embed_2 = torch.cat(sentence_embed_2_pieces)

        with torch.no_grad():        
            if random.randint(0, 1) == 1:
                outputs = DP_classifier(sentence_embed_1, sentence_embed_2)
            else:
                outputs = DP_classifier(sentence_embed_2, sentence_embed_1)
        
        mc_logits = outputs[0].cpu()
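        # A positive logit is read as "predicted to be a true consecutive pair"; the prediction
        # is scored against whether the correct or the randomly sampled batch was actually used.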
        
        for jj in range(mc_logits.shape[0]):
            if (mc_logits[jj, 0] > 0) == if_correct:
                num_correctly_predicted += 1
            else:
                num_wrongly_predicted += 1
            
        nb_eval_steps += 1
        
    total_predicted = num_correctly_predicted + num_wrongly_predicted
    accuracy = num_correctly_predicted / total_predicted
    result["accuracy"] = accuracy

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #23
0
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer,
                                     args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
Example #24
0
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, tr_loss_sent, logging_loss, logging_loss_sent = 0.0, 0.0, 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch")
    set_seed(args)  # Added here for reproducibility

    results = {}
    acc_prev = 0.
    preds = None
    labels = None
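    # preds/labels accumulate dependency-label predictions between checkpoints so that
    # intermediate training metrics can be reported alongside each evaluation.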

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            batch = tuple(t.to(args.device) for t in batch)
            input_ids, attention, token_ids, child, head = batch[0], batch[
                1], batch[2], batch[3], batch[4]
            dep_labels, num_dependency, arcs, arc_labels = batch[5], batch[
                6], batch[7], batch[8]
            arc_label_lengths, sent_labels = batch[9], batch[10]

            inputs = {
                'input_ids': input_ids,
                'attention': attention,
                'token_ids': token_ids,
                'child': child,
                'head': head,
                'dep_labels': dep_labels,
                'arcs': arc_labels,
                'arc_label_lengths': arc_label_lengths,
                'device': args.device
            }

            model.train()
            outputs = model(**inputs)

            loss = outputs[0]
            logits = outputs[1]

            tr_loss += loss.item()

            loss.backward()

            if preds is None:
                preds = logits.detach().cpu().numpy()
                labels = dep_labels.view(-1).cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                labels = np.append(labels,
                                   dep_labels.view(-1).cpu().numpy(),
                                   axis=0)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.save_steps > 0 and global_step % args.save_steps == 0:

                    logs = {}
                    loss_scalar_dep = (tr_loss -
                                       logging_loss) / args.save_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss_dep"] = loss_scalar_dep
                    logging_loss = tr_loss

                    print(json.dumps({**logs, **{"step": global_step}}))
                    logger.info(json.dumps({**logs, **{"step": global_step}}))

                    preds = preds.reshape(-1, 2)
                    preds = softmax(preds)
                    preds = np.argmax(preds, axis=1)
                    res_train = compute_metrics_intermediate(preds, labels)
                    preds = None
                    labels = None

                    print(res_train)
                    # Evaluation
                    result = evaluate(args, model, tokenizer)
                    results.update(result)

                    save_checkpoints(args, args.output_dir, model, tokenizer)

                    if result['acc'] > acc_prev:
                        acc_prev = result['acc']
                        # Save model checkpoint best
                        output_dir = os.path.join(args.output_dir,
                                                  "model-best")
                        save_checkpoints(args, output_dir, model, tokenizer)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
Example #25
0
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    set_seed(args)  # Added here for reproducibility

    """ Train the model """
    if args.gpu == 0:
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        tb_writer = SummaryWriter(args.output_dir + '/runs/' + current_time)

    args.train_batch_size = args.per_gpu_train_batch_size

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    if args.shuffle:
        logger.info(f"Shuffle the dataset in training, "
                    f"GPU: {args.gpu}, "
                    f"Rank: {args.rank}, "
                    f"Total: {args.world_size}")
    train_sampler = DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=args.rank,
        shuffle=args.shuffle,
    )
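    # Each rank draws its own shard of the dataset; per-epoch shuffling is controlled by
    # args.shuffle, so the DataLoader itself must not shuffle on top of the sampler.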
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, shuffle=False, num_workers=0,
        batch_size=args.train_batch_size, collate_fn=collate, pin_memory=True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      # betas=(0.9, 0.98),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    if args.warmup_ratio > 0.:
        assert args.warmup_steps == 0
        args.warmup_steps = int(t_total * args.warmup_ratio)
    if args.gpu == 0:
        print("Optimized with lr %f, steps %d, warmup steps %d, epsilon %0.8f, betas:" % (
            args.learning_rate, t_total, args.warmup_steps, optimizer.defaults['eps']
        ), optimizer.defaults['betas'])
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level,
                                          verbosity=0)
        from apex.parallel import DistributedDataParallel as DDP
        model = DDP(model)
    else:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * args.world_size
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    # Check if continuing training from a checkpoint
    # if args.model_name_or_path and os.path.exists(args.model_name_or_path):
    #     try:
    #         # set global_step to gobal_step of last saved checkpoint from model path
    #         checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
    #         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
    #         steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
    #         logger.info("  Continuing training from checkpoint, will skip to saved global_step")
    #         logger.info("  Continuing training from epoch %d", epochs_trained)
    #     except ValueError:
    #         logger.info("  Do not load model from %s, restart training" % args.model_name_or_path)

    # model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    # model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.gpu != 0
    )
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.gpu != 0)
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()       # Support of accumulating gradients
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            # If some of the input is padded, then the attention mask is needed
            attention_mask = (inputs != tokenizer.pad_token_id)         # word_tokens --> 1, pad_token --> 0
            if attention_mask.all():
                attention_mask = None

            if epoch == 0 and step < 3 and args.gpu == 0:
                print(inputs.shape)
                print(inputs[0])
                print(tokenizer.convert_ids_to_tokens(inputs[0].cpu().numpy()))
                print(labels[0])
                print(attention_mask)

            model.train()
            outputs = model(inputs,
                            attention_mask=attention_mask,
                            masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.max_grad_norm > 0.:
                    if args.fp16:
                        total_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.gpu == 0 and args.logging_steps > 0 and (step + 1) % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    if args.fp16:
                        try:
                            from apex.amp import _amp_state
                            tb_writer.add_scalar("loss_scale", _amp_state.loss_scalers[0]._loss_scale, global_step)
                            tb_writer.add_scalar("scaled_loss", scaled_loss.item(), global_step)
                        except ImportError:
                            logger.warning("Cannot import apex.amp._amp_state, "
                                           "so the loss_scale will not be logged")
                    if args.max_grad_norm > 0.:  # Only clip the grad when it is valid
                        tb_writer.add_scalar("grad_norm", total_norm, global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

            if args.max_steps > 0 and global_step >= args.max_steps:
                break

        # Save it each epoch
        if args.gpu == 0:
            # Save checkpoints
            checkpoint_name = "checkpoint-epoch%04d" % epoch
            save_model(args, checkpoint_name, model, tokenizer, optimizer, scheduler)
            last_path = os.path.join(args.output_dir, 'checkpoint-last')
            # if os.path.exists(last_path):
            #     print(last_path)
            #     os.remove(last_path)
            # os.symlink(os.path.join(args.output_dir, checkpoint_name), last_path)

            # Evaluate the model
            logger.info(" Training loss of Epoch %d: %0.4f" % (epoch, tr_loss / step))
            logger.info(" Evaluation Results of Epoch %d: " % epoch)
            results = evaluate(args, model, tokenizer)
            for key, value in results.items():
                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                logger.info("\t %s: %0.4f" % (key, value))
            output_eval_file = os.path.join(args.output_dir, checkpoint_name, "eval_results.json")
            json.dump(results, open(output_eval_file, 'w'), sort_keys=True, indent=4)

        if args.max_steps > 0 and global_step >= args.max_steps:
            epoch_iterator.close()
            train_iterator.close()
            break

    if args.gpu == 0:
        tb_writer.close()
Example #26
0
def evaluate(args,
             model: PreTrainedModel,
             tokenizer: PreTrainedTokenizer,
             prefix="") -> Dict:
    def cal_simple_nll(pred, gold):
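        # Per-token NLL computed manually: softmax over the vocabulary, add a small epsilon
        # for numerical stability, take the log, then gather gold-token losses without
        # reduction so the caller can average them per sequence.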

        loss_fct = nn.NLLLoss(reduction='none')  # 'reduce=False' is deprecated; 'none' keeps per-token losses

        batch_size = gold.size(0)
        gold = gold.contiguous()
        norm = nn.Softmax(dim=1)

        pred = pred.contiguous().view(-1, pred.size(2))
        pred = norm(pred)
        pred_prob_t = pred.contiguous().view(batch_size, -1,
                                             pred.size(1)) + 1e-16

        pred_prob_t_log = torch.log(pred_prob_t)
        pred_prob_t_log = pred_prob_t_log.view(-1, pred_prob_t_log.size(2))
        loss = loss_fct(pred_prob_t_log, gold.view(-1))

        return loss

    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    reward_fct = nn.NLLLoss(reduction='none')

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        raw_text = batch.clone().detach().to(args.device)
        inputs = raw_text[:, :-1].contiguous()
        gold = raw_text[:, 1:].contiguous()
        masks = inputs.ne(0).type(torch.float)

        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks)
            lm_loss = cal_simple_nll(outputs[0], gold)
            lm_loss = lm_loss.contiguous().view(inputs.size(0),
                                                -1).mean(dim=-1)

            eval_loss += lm_loss.sum().item()
        nb_eval_steps += lm_loss.size(0)

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix,
                                    "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result