def adding_lr_decay_handler(optimizer, trainer, model_parameters):
    lr_decay_epoch_frequency = model_parameters['lr_decay_epoch_frequency']
    lr_decay = model_parameters['lr_decay']
    step_scheduler = StepLR(optimizer,
                            step_size=lr_decay_epoch_frequency,
                            gamma=lr_decay)
    scheduler = LRScheduler(step_scheduler)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)
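# A minimal wiring sketch for the handler above, assuming a toy linear model; the
# torch/ignite imports and values below are illustrative assumptions.
import torch
from torch.optim.lr_scheduler import StepLR
from ignite.engine import Engine, Events
from ignite.contrib.handlers import LRScheduler

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
trainer = Engine(lambda engine, batch: None)  # placeholder update function
model_parameters = {'lr_decay_epoch_frequency': 10, 'lr_decay': 0.5}
adding_lr_decay_handler(optimizer, trainer, model_parameters)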
Example #2
def find_lr_add_one_cycle(engine: Engine, lr_finder: LRFinder,
                          optimizer: Optimizer):
    train_dl = engine.state.dataloader
    lr_finder.range_test(train_dl, num_iter=1000)
    max_lr = lr_finder.lr_suggestion
    lr_finder.reset()
    one_cycle_scheduler = LRScheduler(
        OneCycleLR(optimizer,
                   max_lr,
                   train_dl=train_dl,
                   num_epochs=engine.state.max_epochs))
    engine.add_event_handler(Events.ITERATION_STARTED, one_cycle_scheduler)
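# A sketch of how the handler above might be registered so it runs once when
# training starts; `trainer`, `lr_finder` and `optimizer` are assumed to exist,
# and ignite forwards the extra positional args to the handler.
trainer.add_event_handler(Events.STARTED, find_lr_add_one_cycle, lr_finder, optimizer)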
Example #3
    def schedule_lr(self, optimizer, name, params, warmup_start=None,
                    warmup_end=None, warmup_duration=None):
        if name is None:
            return None
        lr_scheduler = self._get_lr_scheduler(name)(optimizer, **params)
        if warmup_start and warmup_end and warmup_duration:
            scheduler = create_lr_scheduler_with_warmup(
                lr_scheduler,
                warmup_start_value=warmup_start,
                warmup_end_value=warmup_end,
                warmup_duration=warmup_duration)
        else:
            scheduler = LRScheduler(lr_scheduler)
        self.trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)
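# A standalone sketch of the warmup branch above: wrap a torch StepLR with
# ignite's create_lr_scheduler_with_warmup and attach the result to the trainer
# (the toy model, optimizer and values below are assumptions for illustration).
import torch
from torch.optim.lr_scheduler import StepLR
from ignite.engine import Engine, Events
from ignite.contrib.handlers import create_lr_scheduler_with_warmup

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
step_lr = StepLR(optimizer, step_size=5, gamma=0.5)
scheduler = create_lr_scheduler_with_warmup(step_lr,
                                            warmup_start_value=1e-4,
                                            warmup_end_value=0.1,
                                            warmup_duration=3)
trainer = Engine(lambda engine, batch: None)
trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)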
Example #4
    def _run(self, engine):
        engine.state.state_cache = _StateCacher(self._memory_cache,
                                                cache_dir=self._cache_dir)
        engine.state.state_cache.store("model", self._model.state_dict())
        engine.state.state_cache.store("optimizer",
                                       self._optimizer.state_dict())

        self._history = {"lr": [], "loss": []}
        self._best_loss = None
        self._diverge_flag = False

        # attach loss and lr logging
        if not engine.has_event_handler(self._log_lr_and_loss):
            engine.add_event_handler(Events.ITERATION_COMPLETED,
                                     self._log_lr_and_loss)

        # Attach the LRScheduler to the engine; this can only be done after engine.run() has been called because num_iter is needed.
        required_epochs = self.num_iter / len(engine.state.dataloader)
        if engine.state.max_epochs < required_epochs:
            engine.state.max_epochs = int(np.ceil(required_epochs))

        self._logger.debug("Running LR finder for {} iterations".format(
            self.num_iter))
        # Initialize the proper learning rate policy
        if self._step_mode.lower() == "exp":
            self._lr_schedule = LRScheduler(
                _ExponentialLR(self._optimizer, self._end_lr, self.num_iter))
        else:
            self._lr_schedule = LRScheduler(
                _LinearLR(self._optimizer, self._end_lr, self.num_iter))
        if not engine.has_event_handler(self._lr_schedule):
            engine.add_event_handler(Events.ITERATION_COMPLETED,
                                     self._lr_schedule, self.num_iter)

        if not engine.has_event_handler(self._reached_num_iterations):
            engine.add_event_handler(Events.ITERATION_COMPLETED,
                                     self._reached_num_iterations)
Example #5
    def _run(self, trainer, optimizer, output_transform, num_iter, end_lr,
             step_mode, smooth_f, diverge_th):

        self._history = {"lr": [], "loss": []}
        self._best_loss = None
        self._diverge_flag = False

        # attach LRScheduler to trainer.
        if num_iter is None:
            num_iter = trainer.state.epoch_length * trainer.state.max_epochs
        else:
            max_iter = trainer.state.epoch_length * trainer.state.max_epochs
            if num_iter > max_iter:
                warnings.warn(
                    "Desired num_iter {} is unreachable with the current run setup of {} iteration "
                    "({} epochs)".format(num_iter, max_iter,
                                         trainer.state.max_epochs),
                    UserWarning,
                )

        if not trainer.has_event_handler(self._reached_num_iterations):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._reached_num_iterations, num_iter)

        # attach loss and lr logging
        if not trainer.has_event_handler(self._log_lr_and_loss):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._log_lr_and_loss, output_transform,
                                      smooth_f, diverge_th)

        self.logger.debug(
            "Running LR finder for {} iterations".format(num_iter))
        # Initialize the proper learning rate policy
        if step_mode.lower() == "exp":
            self._lr_schedule = LRScheduler(
                _ExponentialLR(optimizer, end_lr, num_iter))
        else:
            start_lr = optimizer.param_groups[0]["lr"]
            self._lr_schedule = PiecewiseLinear(optimizer,
                                                param_name="lr",
                                                milestones_values=[
                                                    (0, start_lr),
                                                    (num_iter, end_lr)
                                                ])
        if not trainer.has_event_handler(self._lr_schedule):
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      self._lr_schedule, num_iter)
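# A small sketch of the linear branch above: PiecewiseLinear interpolates the
# "lr" param between the listed (event_count, value) milestones, so these two
# points ramp the lr linearly from start_lr to end_lr over num_iter events
# (the toy optimizer and values here are assumptions).
import torch
from ignite.engine import Engine, Events
from ignite.contrib.handlers import PiecewiseLinear

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)
num_iter, end_lr = 100, 1.0
start_lr = optimizer.param_groups[0]["lr"]
lr_schedule = PiecewiseLinear(optimizer,
                              param_name="lr",
                              milestones_values=[(0, start_lr), (num_iter, end_lr)])
trainer = Engine(lambda engine, batch: None)
trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_schedule)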
Example #6
def _test_setup_common_training_handlers(
    dirname, device, rank=0, local_rank=0, distributed=False, lr_scheduler=None, save_handler=None
):

    lr = 0.01
    step_size = 100
    gamma = 0.5
    num_iters = 100
    num_epochs = 10

    model = DummyModel().to(device)
    if distributed and "cuda" in device:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank,], output_device=local_rank)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    if lr_scheduler is None:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
    elif isinstance(lr_scheduler, str) and lr_scheduler == "ignite|LRScheduler":
        from ignite.contrib.handlers import LRScheduler

        lr_scheduler = LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma))
    elif isinstance(lr_scheduler, str) and lr_scheduler == "ignite":
        from ignite.contrib.handlers import PiecewiseLinear

        milestones_values = [(0, 0.0), (step_size, lr), (num_iters * (num_epochs - 1), 0.0)]
        lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)
    else:
        raise ValueError(f"Unknown lr_scheduler: {lr_scheduler}")

    def update_fn(engine, batch):
        optimizer.zero_grad()
        x = torch.tensor([batch], requires_grad=True, device=device)
        y_pred = model(x)
        loss = y_pred.mean()
        loss.backward()
        optimizer.step()
        return loss

    train_sampler = None
    if distributed and idist.get_world_size() > 1:
        train_sampler = MagicMock(spec=DistributedSampler)
        train_sampler.set_epoch = MagicMock()

    trainer = Engine(update_fn)
    setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save={"model": model, "optimizer": optimizer},
        save_every_iters=75,
        output_path=dirname,
        save_handler=save_handler,
        lr_scheduler=lr_scheduler,
        with_gpu_stats=False,
        output_names=["batch_loss",],
        with_pbars=True,
        with_pbar_on_iters=True,
        log_every_iters=50,
    )

    data = [i * 0.1 for i in range(num_iters)]
    trainer.run(data, max_epochs=num_epochs)

    # check handlers
    handlers = trainer._event_handlers[Events.ITERATION_COMPLETED]
    for cls in [
        TerminateOnNan,
    ]:
        assert any([isinstance(h[0], cls) for h in handlers]), f"{handlers}"
    assert "batch_loss" in trainer.state.metrics

    # Check saved checkpoint
    if rank == 0:
        if save_handler is not None:
            dirname = save_handler.dirname
        checkpoints = list(os.listdir(dirname))
        assert len(checkpoints) == 1
        for v in [
            "training_checkpoint",
        ]:
            assert any([v in c for c in checkpoints])

    # Check LR scheduling
    assert optimizer.param_groups[0]["lr"] <= lr * gamma ** (
        num_iters * num_epochs / step_size
    ), f"{optimizer.param_groups[0]['lr']} vs {lr * gamma ** (num_iters * num_epochs / step_size)}"
Example #7
def train():
    os.environ['CUDA_VISIBLE_DEVICES'] = '7'

    parser = ArgumentParser()
    parser.add_argument('--gpt2', action='store_true', help="use gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="uer/gpt2-chinese-cluecorpussmall", help="Path or URL of the model")
    parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step")
    parser.add_argument('--pretrained', action='store_true', help="If False train from scratch")
    parser.add_argument("--data_path", type=str, default="data/autocloze.json",
                        help="Path or url of the dataset. ")
    parser.add_argument("--train_path", type=str, default="data/toy_train.txt",
                        help="Path of the train dataset for dist dataset. ")
    parser.add_argument("--valid_path", type=str, default="data/toy_valid.txt",
                        help="Path of the valid dataset for dist dataset. ")
    #--------------------------------------------------------------
    parser.add_argument("--dataset_cache", type=str, default="dataset_zh",
                        help="Path or url of the dataset cache")
    parser.add_argument('--log_file', '-log_file', type=str, default="", help="Output logs to a file under this path")
    parser.add_argument("--num_workers", type=int, default=8, help="Number of subprocesses for data loading")
    parser.add_argument("--n_epochs", type=int, default=40, help="Number of training epochs")
    parser.add_argument("--train_batch_size", type=int, default=1, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=1, help="Batch size for validation")
    parser.add_argument("--max_history", type=int, default=15, help="Number of previous exchanges to keep in history")
    parser.add_argument("--scheduler", type=str, default="noam", choices=['noam', 'linear'], help="method of optim")
    parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)")
    parser.add_argument("--lr", type=float, default=5e-5, help="Learning rate")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps")
    parser.add_argument("--valid_steps", type=int, default=5000, help="Perfom validation every X steps")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=64,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()
    print('cuda ',torch.cuda.is_available())
    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    '''if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    '''
    args.device = torch.device("cuda")
    print('device ',args.device)
    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    #model_class = OpenAIGPTLMHeadModel if not args.gpt2 else GPT2LMHeadModel
    #config_class = OpenAIGPTConfig if not args.gpt2 else GPT2Config
    model_class = GPT2LMHeadModel
    config_class = GPT2Config
    tokenizer_class = BertTokenizer
    print('pretrained:',args.pretrained)
    if args.pretrained:
        print("----------------pretrained")
        tokenizer = BertTokenizer.from_pretrained(args.model_checkpoint, do_lower_case=True)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
        model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall",from_tf=True)
        #print('generate')
        #print(text_generator("这是很久之前的事情了", max_length=100, do_sample=True))

    #args.device=torch.device("cuda", 2)
    
    model.to(args.device)
    
    optimizer = AdamW([{'params': model.parameters(), 'initial_lr': args.lr}], lr=args.lr, correct_bias=True)

    logger.info("Prepare datasets")
    loader_class = build_dist_loaders if not args.data_path else build_dataloaders
    train_loader, val_loader, train_sampler, valid_sampler = loader_class(args, tokenizer, logger)

    logger.info("Prepare datasets ends")
    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
        model=model.module
    #if isinstance(model,torch.nn.DataParallel):
    
    #print('params:',params_count(model))

    #tokens_embed = model.transformer.get_input_embeddings()
    # Training function and trainer
    def update(engine, batch):
        input_ids, token_type_ids, lm_labels = tuple(input_tensor.to(args.device) for input_tensor in batch)
        
        #for i in range(input_ids.size()[0]):
        #    for j in range(input_ids.size()[1]):
        #        if input_ids[i,j]==-1:
        #            input_ids[i,j]=-100
        #        if lm_labels[i,j]==-1:
        #            lm_labels[i,j]=-100
        #one=torch.tensor(-100)
        #input_ids=torch.where(input_ids==-1,one,input_ids)
        #lm_labels=torch.where(lm_labels==-1,one,lm_labels)
        #print('traindata',input_ids,lm_labels)

        #lm_labels=input_ids
        r'''input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        inputs_embeds = tokens_embed(input_ids) * math.sqrt(tokens_embed.embedding_dim)'''

        model.train()
        #(lm_loss), *_ = model(inputs_embeds=inputs_embeds, labels=lm_labels,return_dict=0)
        (lm_loss), *_ = model(input_ids=input_ids, labels=lm_labels,return_dict=False)
        #print('lm_loss',lm_loss)
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item(), optimizer.param_groups[0]['lr']

    trainer = Engine(update)
    

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    cntepoch=0
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            input_ids, token_type_ids, lm_labels = tuple(input_tensor.to(args.device) for input_tensor in batch)
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            #one = torch.tensor(-100)
            #input_ids=torch.where(input_ids==-1,one,input_ids)
            #print('validdata',input_ids,lm_labels)
            #lm_labels=input_ids
            r'''input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            inputs_embeds = tokens_embed(input_ids) * math.sqrt(tokens_embed.embedding_dim)'''
            

            #lm_logits, *_ = model(inputs_embeds=inputs_embeds,return_dict=0)
            lm_logits, *_ = model(input_ids=input_ids,return_dict=False)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return lm_logits_flat_shifted, lm_labels_flat_shifted
        cntepoch+=1
        torch.save(args, tb_logger.writer.logdir + '_%s/model_training_args.bin'%(str(cntepoch)))

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Evaluation during training
    @trainer.on(Events.ITERATION_STARTED)
    def log_iterations(engine):
        # if engine.state.iteration % max(int(0.1 * len(train_loader)), 1) == 0:
        if engine.state.iteration % args.valid_steps == 0:
            evaluator.run(val_loader)

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # noam decrease the learning rate
    # model_size = model.config.n_embd
    model_size = args.n_emd
    noam_lambda = lambda step: (
            model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * args.warmup_steps ** (-1.5)))
    noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(output_transform=lambda x: x[1]).attach(trainer, "lr")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0], x[1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints
    # And save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True, mininterval=2)
        pbar.attach(trainer, metric_names=["loss", "lr"])
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                              another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint', save_interval=1, n_saved=6)
        # save model after evaluation
        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {
            'mymodel': getattr(model, 'module', model)})
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {
            'mymodel': getattr(model, 'module', model)})  # "getattr" take care of distributed encapsulation

        torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.logdir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.logdir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)
    
    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1],
                  os.path.join(tb_logger.writer.logdir,
                               WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
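# A small sketch of the noam schedule used above: the LambdaLR multiplier rises
# roughly linearly over the first warmup_steps updates and then decays like
# step ** -0.5, peaking where step + 1 == warmup_steps (the values below reuse
# the script's defaults and are only illustrative).
model_size, warmup_steps, base_lr = 768, 5000, 5e-5
noam_lambda = lambda step: (
    model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * warmup_steps ** (-1.5)))
for step in (0, 999, 4999, 49999):
    print(step + 1, base_lr * noam_lambda(step))  # effective lr = initial_lr * multiplier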
Example #8
    def fit(self,
            train_dataset: TileFeaturesDataset,
            val_dataset: TileFeaturesDataset = None,
            epochs: int = 10,
            batch_size: int = 10,
            num_workers: int = 10,
            evaluate_every: int = 300,
            save_every: int = 1000):
        """
        Args:
            train_dataset: The dataset object for training data
            val_dataset: The dataset object for validation data, optional
            epochs: number of epochs to train the network
            batch_size: batch size for the network
            num_workers: number of workers for the network
            evaluate_every: every how many steps to run evaluation
            save_every: every how many steps to save the model

        Returns:
            a trained pytorch model
        """

        # create data loader
        train_data_loader = DataLoader(train_dataset,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=num_workers)
        if val_dataset is not None:
            val_data_loader = DataLoader(val_dataset,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         num_workers=num_workers)
        else:
            val_data_loader = None

        # create the model
        criterion = MultiheadLoss(self.losses,
                                  use_log=self.log_loss,
                                  weights=self.losses_weights).to(self.device)

        # create tensorboard
        writer = create_summary_writer(self.model,
                                       train_data_loader,
                                       log_dir=TENSORBOARD_DIR)

        def multihead_loss_func(y_pred, y):
            return criterion(y_pred[1], torch.split(y, 1, dim=1))[0]

        def multihead_output_transform(x, y, y_pred, *args):
            embedding, output = y_pred
            y_pred_tensor = torch.stack(output).squeeze(2).transpose(0, 1)
            y_tensor = y
            data = x
            with torch.no_grad():
                loss, multi_losses = criterion(output, torch.split(y, 1,
                                                                   dim=1))
            return data, embedding, loss, multi_losses, y_pred_tensor, y_tensor

        eval_metrics = {
            'rmse': RootMeanSquaredError(),  # 'corr': DistanceCorrelation(),
            # 'embedding_data': EmbeddingData()
        }
        train_metrics = {
            'rmse': RootMeanSquaredError()  # , 'corr': DistanceCorrelation()
        }
        trainer = create_supervised_trainer(
            self.model,
            self.optimizer,
            multihead_loss_func,
            device=self.device,
            output_transform=multihead_output_transform)
        for name, metric in train_metrics.items(
        ):  # Calculate metrics also on trainer
            metric.attach(trainer, name)

        evaluator = create_supervised_evaluator(
            self.model,
            metrics=eval_metrics,
            device=self.device,
            output_transform=multihead_output_transform)

        if self.model_save_path is not None:
            # do we want to use it ? from Ignite
            checkpoint_handler = ModelCheckpoint(self.model_save_path,
                                                 'checkpoint',
                                                 save_interval=save_every,
                                                 n_saved=10,
                                                 require_empty=False,
                                                 create_dir=True)

        pbar = ProgressBar()
        # RunningAverage(output_transform=lambda x: x[2])
        pbar.attach(trainer)

        scheduler = LRScheduler(self.step_scheduler)
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
        if self.model_save_path is not None:
            trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                      {'mymodel': self.model})

        @trainer.on(Events.EPOCH_STARTED)
        def init_state_params(engine):
            engine.state.plusplus_ex, engine.state.plusminus_ex = [
                None
            ] * self.n_features, [None] * self.n_features
            engine.state.minusminus_ex, engine.state.minusplus_ex = [
                None
            ] * self.n_features, [None] * self.n_features

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_training_loss(engine):
            writer.add_scalar('General/LR',
                              scheduler.get_param(),
                              global_step=engine.state.iteration)
            _, embedding, loss, multi_losses, y_pred_tensor, y_tensor = engine.state.output
            images_batch, features_batch = engine.state.batch
            plusplus_ex, plusminus_ex = engine.state.plusplus_ex, engine.state.plusminus_ex
            minusminus_ex, minusplus_ex = engine.state.minusminus_ex, engine.state.minusplus_ex

            writer.add_scalar('General/Train Loss',
                              loss,
                              global_step=engine.state.iteration)

            feat_diff = (y_pred_tensor - y_tensor)  # / y_tensor + 1
            feat_sum = y_pred_tensor + y_tensor
            for j in range(self.n_features):
                writer.add_scalar(f'Multiple Losses/{self.feature_names[j]}',
                                  multi_losses[j],
                                  global_step=engine.state.iteration)
                for i in range(len(images_batch)):
                    itm_diff, itm_sum = feat_diff[i][j].item(
                    ), feat_sum[i][j].item()
                    itm_pred, itm_actual = y_pred_tensor[i][j].item(
                    ), y_tensor[i][j].item()
                    ex = TrainExample(images_batch[i],
                                      predicted=itm_pred,
                                      actual=itm_actual,
                                      sum=itm_sum,
                                      diff=itm_diff)
                    if minusminus_ex[j] is None or minusminus_ex[
                            j].sum > itm_sum:
                        engine.state.minusminus_ex[j] = ex
                    elif plusminus_ex[j] is None or plusminus_ex[
                            j].diff < itm_diff:
                        engine.state.plusminus_ex[j] = ex
                    elif minusplus_ex[j] is None or minusplus_ex[
                            j].diff > itm_diff:
                        engine.state.minusplus_ex[j] = ex
                    elif plusplus_ex[j] is None or plusplus_ex[j].sum < itm_sum:
                        engine.state.plusplus_ex[j] = ex

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_training_results(engine):
            global_step = engine.state.iteration
            metrics = engine.state.metrics  # already attached to the trainer engine to save
            # can add more metrics here
            add_metrics_to_tensorboard(metrics,
                                       writer,
                                       self.feature_names,
                                       global_step,
                                       log_str="train")

            # plot min-max examples
            plusplus_ex, plusminus_ex = engine.state.plusplus_ex, engine.state.plusminus_ex
            minusminus_ex, minusplus_ex = engine.state.minusminus_ex, engine.state.minusplus_ex

            for j in range(self.n_features):
                if plusplus_ex[j] is None:
                    continue
                writer.add_figure(tag=f"{self.feature_names[j]}/plusplus",
                                  figure=build_example_image_figure(
                                      plusplus_ex[j]),
                                  global_step=global_step)

                writer.add_figure(tag=f"{self.feature_names[j]}/plusminus",
                                  figure=build_example_image_figure(
                                      plusminus_ex[j]),
                                  global_step=global_step)

                writer.add_figure(tag=f"{self.feature_names[j]}/minusminus",
                                  figure=build_example_image_figure(
                                      minusminus_ex[j]),
                                  global_step=global_step)

                writer.add_figure(tag=f"{self.feature_names[j]}/minusplus",
                                  figure=build_example_image_figure(
                                      minusplus_ex[j]),
                                  global_step=global_step)

        @trainer.on(Events.ITERATION_COMPLETED)
        def log_validation_results(engine):
            global_step = engine.state.iteration
            if val_data_loader is not None and global_step % evaluate_every == 0:
                evaluator.run(val_data_loader)
                metrics = evaluator.state.metrics
                # can add more metrics here
                add_metrics_to_tensorboard(metrics,
                                           writer,
                                           self.feature_names,
                                           global_step,
                                           log_str="validation")
                # add_embedding_visualization(writer, metrics, global_step)
            if global_step % save_every == 0:
                self.save_trained_model(
                    f"{self.model_save_path}/{global_step}_model.pth")

        trainer.run(train_data_loader, max_epochs=epochs)

        return self.model
Example #9
#                        9.5608062744141,
#                        7.8698215484619,
#                        9.5168733596802,
#                        10.373730659485,
#                        6.6616044044495,
#                        10.260489463806,
#                        10.287888526917,
#                        10.289801597595,
#                        10.405355453491,
#                        10.138095855713])
# loss_fn = torch.nn.CrossEntropyLoss(ignore_index=255, weight=weight)
loss_fn = OHEMLoss(ignore_index=255, numel_frac=1 / 16)
loss_fn = loss_fn.cuda()

scheduler = LRScheduler(
    PolyLR(optimizer,
           args.learning_rate,
           total_steps=args.epochs * len(train_loader) - 1000))
scheduler = create_lr_scheduler_with_warmup(scheduler, 0, args.learning_rate,
                                            1000)

original_optimizer = optimizer

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.checkpoint:
    amp.load_state_dict(checkpoint['amp'])

if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_segmentation_trainer(
Example #10
def create_lr_finder(
    model,
    criterion,
    optim_fn=optim.SGD,
    create_engine=None,
    lr_init=1e-11,
    lr_final=10,
    optim_fn_kwargs=None,
    device=None,
    non_blocking=False,
):
    """
    create_lr_finder(
        model,
        criterion,
        optim_fn=optim.SGD,
        create_engine=None,
        lr_init=1e-11,
        lr_final=10,
        optim_fn_kwargs=None,
        device=None,
    )

    Parameters
    ----------
    model : nn.Module
    criterion : nn.Loss
    optim_fn : torch.optim instance
        Default: optim.SGD
    lr_init : float
    lr_final : float
    optim_fn_kwargs : dict (optional)
    device : torch.device

    Returns
    -------
    find_lr : callable

    Example
    -------
    >>> model = nn.Sequential(nn.Linear(5, 2), nn.ReLU(), nn.Linear(2, 2))
    >>> model_parameter = next(model.parameters())
    >>> # initial value for model_parameter:
    >>> print(model_parameter)
    >>> ## <some tensor>
    >>> criterion = nn.CrossEntropyLoss()
    >>> find_lr = create_lr_finder(model, criterion)
    >>> # plotting does not require GUI
    >>> output = find_lr(loader, plot_fpath="./lr_finder_plot.pdf")
    >>> # the original model's parameters are not modified!
    >>> print(model_parameter)
    >>> ## <the same tensor>
    >>> print(output.keys())
    >>> ## ('lr', 'batch_loss')
    >>> print(len(output["lr"]))
    >>> ## <number of iterations>

    Notes
    -----
    See this article
      https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html
    for what might be a better implementation: exponential smoothing and runs
    over a single epoch only. Maybe look at this one too:
      https://forums.fast.ai/t/automated-learning-rate-suggester/44199/15
    which talks about going the final step and choosing an lr automagically.
    """
    from copy import deepcopy

    # Old code:
    # new_model = deepcopy(model)
    if hasattr(model, "_args"):
        new_model = type(model)(*model._args)
    else:
        new_model = deepcopy(model)
    if create_engine is None:
        create_engine = create_lr_finder_engine
    if optim_fn_kwargs is None:
        optim_fn_kwargs = {}
    elif isinstance(optim_fn_kwargs, dict):
        optim_fn_kwargs = {
            key: value
            for key, value in optim_fn_kwargs.items() if key != "lr"
        }
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    new_model = new_model.to(device)
    optimizer = optim_fn(new_model.parameters(), lr=lr_init, **optim_fn_kwargs)

    lr_finder = create_engine(
        new_model,
        optimizer,
        criterion,
        device=device,
        non_blocking=non_blocking,
    )
    exp_scheduler = ExponentialLR(optimizer, gamma=1.1)
    scheduler = LRScheduler(exp_scheduler, save_history=True)
    lr_finder.add_event_handler(Events.ITERATION_COMPLETED, scheduler)
    logger = Logger()

    @lr_finder.on(Events.ITERATION_STARTED)
    def log_lr(engine):
        logger("lr", scheduler.get_param())

    @lr_finder.on(Events.ITERATION_COMPLETED)
    def log_batch_loss(engine):
        logger("batch_loss", engine.state.output)

    @lr_finder.on(Events.ITERATION_COMPLETED)
    def terminate_maybe(engine):
        loss_upper_bound = logger["batch_loss"][0] * 100
        if engine.state.output >= loss_upper_bound:
            engine.terminate()
            return
        if scheduler.get_param() > lr_final:
            engine.terminate()
            return

    @lr_finder.on(Events.COMPLETED)
    def attach_logger(engine):
        if not hasattr(engine.state, "logger"):
            setattr(engine.state, "logger", logger)

    def _get_smoothed_data(output, lr_min, lr_max):
        df = pd.DataFrame(output)
        df["log_lr"] = np.log(df.lr.values)
        df["log_loss"] = np.log(df.batch_loss.values)
        smoothed_log_loss = (df.set_index("log_lr")["log_loss"].rolling(
            10, center=True).mean().reset_index())
        df["lr_smooth"] = np.exp(smoothed_log_loss.log_lr)
        df["batch_loss_smooth"] = np.exp(smoothed_log_loss.log_loss)
        df = df.dropna()
        df = df.loc[(df.lr >= lr_min) & (df.lr <= lr_max)]
        return df

    def _plot_helper(plot_fpath, df, lr_min, lr_max, figsize=None):
        import matplotlib.pyplot as plt

        if figsize is None:
            figsize = (8, 5)
        fig, ax = plt.subplots(1, 1, figsize=figsize)
        ax.plot(df.lr, df.batch_loss, label="unsmoothed")
        ax.plot(df.lr_smooth, df.batch_loss_smooth, label="smooth")
        ax.set_xscale("log")
        ax.set_yscale("log")
        ax.set_xlim(lr_min, lr_max)
        ax.set_xlabel("batch loss")
        ax.set_ylabel("lr value")
        ax.legend()
        fig.savefig(plot_fpath)
        del fig, ax
        return

    def _auto_lr_finder(output):
        from scipy.ndimage import gaussian_filter1d

        lr_vec = np.array(output["lr"])
        loss_vec = np.array(output["batch_loss"])
        idx = np.argmin(loss_vec)
        lr_centre = lr_vec[idx]
        lr_min = np.maximum(lr_centre / 100, np.min(lr_vec))
        lr_max = np.minimum(lr_centre * 1000, np.max(lr_vec))

        lr_values = lr_vec[(lr_vec >= lr_min) & (lr_vec <= lr_max)]
        batch_loss = loss_vec[(lr_vec >= lr_min) & (lr_vec <= lr_max)]

        batch_loss_sm = gaussian_filter1d(batch_loss, 1)
        d_batch_loss_sm = gaussian_filter1d(batch_loss, 1, order=1)

        idx_min = np.argmin(batch_loss_sm)
        idx_dec = np.argmin(d_batch_loss_sm[:idx_min])
        lr_star = lr_values[idx_dec]

        if lr_star > 1:
            print("warning: found lr_star > 1. returning 1e-2")
            lr_star = 1e-2
        return lr_star

    def find_lr(dataloader, max_epochs=100, plot_fpath=None, figsize=None):
        """
        find_lr(dataloader, max_epochs=100, plot_fpath=None, figsize=None)

        Parameters
        ----------
        dataloader: torch.utils.data.DataLoader
            dataloader
        max_epochs: int
            upper bound on number of epochs for which to run
        plot_fpath: string
            location of saved plot

        Returns
        -------
        output : dict
            Has keys 'lr' and 'batch_loss'.

        """
        final_state = lr_finder.run(dataloader, max_epochs)

        output = final_state.logger.log
        if isinstance(plot_fpath, str):
            lr_vec = output["lr"]
            loss_vec = output["batch_loss"]
            idx = np.argmin(loss_vec)
            lr_centre = lr_vec[idx]
            lr_min = np.maximum(lr_centre / 100, np.min(lr_vec))
            lr_max = np.minimum(lr_centre * 1000, np.max(lr_vec))
            df = _get_smoothed_data(output, lr_min, lr_max)
            _plot_helper(plot_fpath, df, lr_min, lr_max, figsize=figsize)

        lr_star = _auto_lr_finder(output)
        return lr_star

    return find_lr
Example #11
    def run_once(self):
        # self.log_path = 'log/%s/' % self.dataset
        # self.model_name = 'efficientnet-b0_MSI_{0}fold_random_tile_patch'.format(self.fold_idx)
        # self.log_dir = self.log_path + self.model_name

        log_dir = self.log_dir
        check_manual_seed(self.seed)
        train_pairs, valid_pairs = dataset.prepare_PAIP2020_PANDA(
            self.fold_idx)
        print(len(train_pairs))
        print(len(valid_pairs))

        train_augmentors = self.train_augmentors()
        train_dataset = dataset.DatasetSerial(train_pairs[:],
                                              self.tile_size,
                                              self.num_tile,
                                              train_mode=True)

        infer_augmentors = self.infer_augmentors()  # HACK at has_aux
        infer_dataset = dataset.DatasetSerial(valid_pairs[:],
                                              self.tile_size,
                                              self.num_tile,
                                              train_mode=False)

        train_loader = data.DataLoader(train_dataset,
                                       num_workers=self.nr_procs_train,
                                       batch_size=self.train_batch_size,
                                       shuffle=True,
                                       drop_last=True)

        valid_loader = data.DataLoader(infer_dataset,
                                       num_workers=self.nr_procs_valid,
                                       batch_size=self.infer_batch_size,
                                       shuffle=True,
                                       drop_last=False)

        # --------------------------- Training Sequence

        if self.logging:
            check_log_dir(log_dir)
        #
        device = 'cuda'

        # networks
        input_chs = 3  # TODO: dynamic config
        # ### VGGNet

        net = EfficientNet.from_pretrained('efficientnet-b0', num_classes=2)

        #net =DenseNet(3,2)
        # load pre-trained models
        net = torch.nn.DataParallel(net).to(device)

        if self.load_network:
            saved_state = torch.load(self.save_net_path)
            net.load_state_dict(saved_state)

        # optimizers
        optimizer = optim.Adam(net.parameters(), lr=self.init_lr)
        scheduler = StepLR(optimizer, self.lr_steps, gamma=0.1)
        scheduler = LRScheduler(scheduler)
        #
        trainer = Engine(lambda engine, batch: self.train_step(
            net, batch, optimizer, device))
        valider = Engine(
            lambda engine, batch: self.infer_step(net, batch, device))

        infer_output = ['prob', 'true']
        ##

        if self.logging:
            checkpoint_handler = ModelCheckpoint(log_dir,
                                                 self.chkpts_prefix,
                                                 save_interval=1,
                                                 n_saved=100,
                                                 require_empty=False)
            # adding handlers using `trainer.add_event_handler` method API
            trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED,
                                      handler=checkpoint_handler,
                                      to_save={'net': net})

        timer = Timer(average=True)
        timer.attach(trainer,
                     start=Events.EPOCH_STARTED,
                     resume=Events.ITERATION_STARTED,
                     pause=Events.ITERATION_COMPLETED,
                     step=Events.ITERATION_COMPLETED)
        timer.attach(valider,
                     start=Events.EPOCH_STARTED,
                     resume=Events.ITERATION_STARTED,
                     pause=Events.ITERATION_COMPLETED,
                     step=Events.ITERATION_COMPLETED)

        # attach running average metrics computation
        # decay of EMA to 0.95 to match tensorpack default
        # TODO: refactor this
        RunningAverage(alpha=0.95, output_transform=lambda x: x['acc']).attach(
            trainer, 'acc')
        RunningAverage(alpha=0.95,
                       output_transform=lambda x: x['loss']).attach(
                           trainer, 'loss')

        # attach progress bar
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=['loss'])
        pbar.attach(valider)

        # #early Stopping
        # def score_function(engine):
        #     val_acc=engine.state.metrics["valid-acc"]
        #     return val_acc
        # early_stopping_handler=EarlyStopping(patience=10,score_function=score_function,trainer=trainer)

        # adding handlers using `trainer.on` decorator API
        @trainer.on(Events.EXCEPTION_RAISED)
        def handle_exception(engine, e):
            if isinstance(e,
                          KeyboardInterrupt) and (engine.state.iteration > 1):
                engine.terminate()
                warnings.warn('KeyboardInterrupt caught. Exiting gracefully.')
                checkpoint_handler(engine, {'net_exception': net})
            else:
                raise e

        # writer for tensorboard logging
        tfwriter = None  # HACK temporary
        json_log_file = None  # only set when logging is enabled
        if self.logging:
            tfwriter = SummaryWriter(log_dir)
            json_log_file = log_dir + '/stats.json'
            with open(json_log_file, 'w') as json_file:
                json.dump({}, json_file)  # create empty file

        ### TODO refactor again
        log_info_dict = {
            'logging': self.logging,
            'optimizer': optimizer,
            'tfwriter': tfwriter,
            'json_file': json_log_file,
            'nr_classes': self.nr_classes,
            'metric_names': infer_output,
            'infer_batch_size': self.infer_batch_size  # too cumbersome
        }

        trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                  log_train_ema_results, log_info_dict)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, inference, valider,
                                  valid_loader, log_info_dict)
        valider.add_event_handler(Events.ITERATION_COMPLETED,
                                  accumulate_outputs)

        # Setup is done. Now let's run the training
        trainer.run(train_loader, self.nr_epochs)
        return
Example #12
def attach_handlers(run, model, optimizer, learning_rule, trainer, evaluator, train_loader, val_loader, params):
    # Metrics
    UnitConvergence(model[0], learning_rule.norm).attach(trainer.engine, 'unit_conv')

    # Tqdm logger
    pbar = ProgressBar(persist=True, bar_format=config.IGNITE_BAR_FORMAT)
    pbar.attach(trainer.engine, metric_names='all')
    tqdm_logger = TqdmLogger(pbar=pbar)
    # noinspection PyTypeChecker
    tqdm_logger.attach_output_handler(
        evaluator.engine,
        event_name=Events.COMPLETED,
        tag="validation",
        global_step_transform=global_step_from_engine(trainer.engine),
    )

    # Evaluator
    evaluator.attach(trainer.engine, Events.EPOCH_COMPLETED(every=100), train_loader, val_loader)

    # Learning rate scheduling
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                     lr_lambda=lambda epoch: 1 - epoch / params['epochs'])
    lr_scheduler = LRScheduler(lr_scheduler)
    trainer.engine.add_event_handler(Events.EPOCH_COMPLETED, lr_scheduler)

    # Early stopping
    mc_handler = ModelCheckpoint(config.MODELS_DIR, run.replace('/', '-'), n_saved=1, create_dir=True,
                                 require_empty=False,
                                 global_step_transform=global_step_from_engine(trainer.engine))
    trainer.engine.add_event_handler(Events.EPOCH_COMPLETED, mc_handler, {'m': model})

    # Create a TensorBoard logger
    tb_logger = TensorboardLogger(log_dir=os.path.join(config.TENSORBOARD_DIR, run))
    images, labels = next(iter(train_loader))
    tb_logger.writer.add_graph(copy.deepcopy(model).cpu(), images)
    tb_logger.writer.add_hparams(params, {})

    # noinspection PyTypeChecker
    tb_logger.attach_output_handler(
        evaluator.engine,
        event_name=Events.COMPLETED,
        tag="validation",
        metric_names="all",
        global_step_transform=global_step_from_engine(trainer.engine),
    )
    # noinspection PyTypeChecker
    tb_logger.attach_output_handler(
        trainer.engine,
        event_name=Events.EPOCH_COMPLETED,
        tag="train",
        metric_names=["unit_conv"]
    )
    input_shape = tuple(next(iter(train_loader))[0].shape[1:])
    tb_logger.attach(trainer.engine,
                     log_handler=WeightsImageHandler(model, input_shape),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(trainer.engine, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.EPOCH_STARTED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=WeightsScalarHandler(model, layer_names=['linear1', 'linear2']),
    #                  event_name=Events.EPOCH_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=WeightsHistHandler(model, layer_names=['linear1', 'linear2']),
    #                  event_name=Events.EPOCH_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=ActivationsHistHandler(model, layer_names=['batch_norm', 'repu']),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=NumActivationsScalarHandler(model, layer_names=['repu']),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=ActivationsScalarHandler(model, reduction=torch.mean,
    #                                                       layer_names=['batch_norm', 'repu']),
    #                  event_name=Events.ITERATION_COMPLETED)
    # tb_logger.attach(trainer.engine,
    #                  log_handler=ActivationsScalarHandler(model, reduction=torch.std,
    #                                                       layer_names=['batch_norm', 'repu']),
    #                  event_name=Events.ITERATION_COMPLETED)

    return tb_logger
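# The LambdaLR factor used above decays linearly with the epoch index; a quick
# sketch of the multipliers it yields (params['epochs'] = 10 is an assumption):
epochs = 10
print([round(1 - epoch / epochs, 2) for epoch in range(epochs)])
# -> [1.0, 0.9, ..., 0.1]; the lr ends at 10% of its base value in the last epoch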
Example #13
model = model.to(device)

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=args.learning_rate,
    weight_decay=args.weight_decay,
    momentum=0.9,
)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

scheduler = LRScheduler(
    torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                        args.learning_rate,
                                        steps_per_epoch=len(train_loader),
                                        epochs=args.epochs))

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_classification_trainer(
    model,
    optimizer,
    loss_fn,
    device=device,
    use_f16=True,
)
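# Torch's OneCycleLR is stepped once per batch, so the wrapped scheduler above is
# typically attached per iteration (other examples in this collection attach at
# Events.ITERATION_STARTED); a sketch of that attachment for the trainer above:
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)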
Example #14
    def range_test(self,
                   train_loader,
                   val_loader=None,
                   end_lr=10,
                   num_iter=100,
                   step_mode="exp",
                   smooth_f=0.05,
                   diverge_th=5,
                   suggestion=True):
        """Performs the learning rate range test.

        Arguments:
            train_loader (torch.utils.data.DataLoader): the training set data loader.
            val_loader (torch.utils.data.DataLoader, optional): if `None` the range test
                will only use the training loss. When given a data loader, the model is
                evaluated after each iteration on that dataset and the evaluation loss
                is used. Note that in this mode the test takes significantly longer but
                generally produces more precise results. Default: None.
            end_lr (float, optional): the maximum learning rate to test. Default: 10.
            num_iter (int, optional): the number of iterations over which the test
                occurs. Default: 100.
            step_mode (str, optional): one of the available learning rate policies,
                linear or exponential ("linear", "exp"). Default: "exp".
            smooth_f (float, optional): the loss smoothing factor within the [0, 1[
                interval. Disabled if set to 0, otherwise the loss is smoothed using
                exponential smoothing. Default: 0.05.
            diverge_th (int, optional): the test is stopped when the loss surpasses the
                threshold:  diverge_th * best_loss. Default: 5.
            suggestion (bool, optional): whether to compute suggested learning rate (minimal grad) and store value into
                {lr_finder_name}.lr_suggestion. Default: True

        """

        self.logger.info("Learning rate search started")
        # Reset test results
        self.history = {"lr": [], "loss": []}
        self.best_loss = None

        # Initialize the proper learning rate policy
        if step_mode.lower() == "exp":
            lr_schedule = LRScheduler(
                ExponentialLR(self.optimizer, end_lr, num_iter))
        elif step_mode.lower() == "linear":
            lr_schedule = LRScheduler(
                LinearLR(self.optimizer, end_lr, num_iter))
        else:
            raise ValueError(f"expected one of (exp, linear), got {step_mode}")

        if smooth_f < 0 or smooth_f >= 1:
            raise ValueError("smooth_f is outside the range [0, 1]")

        trainer = create_supervised_trainer(self.model,
                                            self.optimizer,
                                            self.criterion,
                                            self.device,
                                            non_blocking=True)

        # if val_loader provided, calculates average loss across entire validation set, accurate but very very slow
        if val_loader:
            evaluator = create_supervised_evaluator(
                self.model,
                metrics={"Loss": Loss(self.criterion)},
                device=self.device,
                non_blocking=True)
            trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                      lambda engine: evaluator.run(val_loader))

        # log the loss at the end of every train iteration

        def log_lr_and_loss(finder):
            loss = evaluator.state.metrics[
                "Loss"] if val_loader else trainer.state.output
            lr = lr_schedule.lr_scheduler.get_lr()[0]
            finder.history["lr"].append(lr)
            if trainer.state.iteration == 1:
                finder.best_loss = loss
            else:
                if smooth_f > 0:
                    loss = smooth_f * loss + (
                        1 - smooth_f) * finder.history["loss"][-1]
                if loss < finder.best_loss:
                    finder.best_loss = loss
            finder.history["loss"].append(loss)

        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  lambda engine: log_lr_and_loss(self))

        # increase lr with every iteration
        trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_schedule)

        # Check if the loss has diverged; if it has, stop the trainer
        def loss_diverged(engine: Engine, finder):
            if finder.history["loss"][-1] > diverge_th * finder.best_loss:
                engine.terminate()
                finder.logger.info("Stopping early, the loss has diverged")

        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  lambda engine: loss_diverged(engine, self))

        # run lr finder
        trainer.run(train_loader, 999)

        if suggestion:
            self.lr_suggestion = self._suggestion()

        self.logger.info(
            "Learning rate search finished. See the graph with {finder_name}.plot()"
        )
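# A usage sketch for range_test above, assuming `finder` is an instance of the
# (not shown) LR-finder class holding the model, optimizer, criterion and device,
# and `train_loader` is a torch DataLoader:
finder.range_test(train_loader, end_lr=10, num_iter=100, step_mode="exp")
print(finder.lr_suggestion)  # populated because suggestion=True by default
finder.plot()                # plotting helper referenced in the log message above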
Example #15
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default=DATA_FOLDER, help="Path of the dataset.")
    parser.add_argument("--image_path", type=str, default=IMG_FOLDER, help="Path of the images.")
    parser.add_argument("--images_feature_path", type=str, default=IMG_FEATURE_FOLDER, help="Path of the images.")
    parser.add_argument("--dataset_cache", type=str, default=DATA_CACHE, help="Path of the dataset cache_no_pretrained")
    parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model")
    parser.add_argument('--dhead_gpt2', action='store_true', default=False, help="use double head gpt2")
    parser.add_argument("--from_step", type=int, default=-1, help="Init learning rate from this step")
    parser.add_argument('--pretrained', action='store_true', default=True, help="If False train from scratch")
    parser.add_argument("--num_candidates", type=int, default=1, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=3, help="Number of previous turns to keep in history")
    parser.add_argument("--max_length", type=int, default=256, help="Max length of input sentence")
    parser.add_argument("--train_batch_size", type=int, default=58, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=32, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=9, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--scheduler", type=str, default="linear", choices=['noam', 'linear'], help="method of optim")
    parser.add_argument("--n_emd", type=int, default=768, help="Number of n_emd in config file (for noam)")
    parser.add_argument("--warmup_steps", type=int, default=5000, help="Warm up steps")
    parser.add_argument("--lm_coef", type=float, default=2.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=50, help="Number of training epochs")
    parser.add_argument("--num_workers", type=int, default=0, help="Number of subprocesses for data loading")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="O1", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = BertTokenizer
    config_class = GPT2Config  # GPT2Config if "gpt2" in args.model_checkpoint else OpenAIGPTConfig
    model_class = GPT2LMHeadModel  # GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    if args.pretrained:
        tokenizer = tokenizer_class.from_pretrained(MODEL_CHECKPOINT, do_lower_case=False)
        # tokenizer = tokenizer_class(vocab_file=VOCAB_PATH, do_lower_case=True)
        model = model_class.from_pretrained(MODEL_CHECKPOINT)
    else:
        tokenizer = tokenizer_class(vocab_file=VOCAB_PATH, do_lower_case=False)
        tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
        config = config_class.from_json_file(CONFIG_PATH)
        model = model_class(config)
    model.to(args.device)
    # Add special tokens if they are not already added
    # add_special_tokens_(model, tokenizer)
    # optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)
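    # 'initial_lr' is stored in the param group so the LambdaLR below can be created with last_epoch=args.from_step
    # (PyTorch schedulers expect 'initial_lr' when resuming from a non-default last_epoch)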
    optimizer = AdamW([{'params': model.parameters(), 'initial_lr': args.lr}], lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = build_dataloader(args, tokenizer, logger)

    def update(engine, batch):
        model.train()
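        # indices 2 and 3 (input_images, image_ids) are passed through unchanged; the rest are moved to the device as tensors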
        batch = tuple(torch.tensor(input_data).to(args.device) if idx not in [2, 3] else input_data for idx, input_data in enumerate(batch))
        input_ids, token_type_ids, input_images, image_ids, lm_labels, mc_token_ids, mc_labels = batch
        if args.dhead_gpt2:
            (lm_loss), (mc_loss), *_ = model(input_ids,
                                             token_type_ids=token_type_ids,
                                             mc_token_ids=mc_token_ids,
                                             mc_labels=mc_labels,
                                             lm_labels=lm_labels)
            loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        else:
            (lm_loss), *_ = model(input_ids,
                                  labels=lm_labels,
                                  token_type_ids=token_type_ids,
                                  input_images=input_images,
                                  image_ids=image_ids)
            loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
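        # gradients accumulate across iterations; the optimizer only steps every gradient_accumulation_steps iterations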
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item() #, optimizer.param_groups[0]['lr']
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
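            # logits and labels are shifted by one position so each token is scored against the next token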
            if args.dhead_gpt2:
                lm_logits, mc_logits, *_ = model(
                    input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
                )
                lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
                lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
                return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
            else:
                lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids)
                lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
                lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
                return lm_logits_flat_shifted, lm_labels_flat_shifted
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    # trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Learning rate schedule: noam warmup/decay, or a linear decrease from lr to zero when --scheduler linear is selected
    model_size = args.n_emd
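    # Noam schedule: the learning rate warms up linearly for warmup_steps, then decays with the inverse square root of the step, scaled by n_emd ** -0.5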
    noam_lambda = lambda step: (
            model_size ** (-0.5) * min((step + 1) ** (-0.5), (step + 1) * args.warmup_steps ** (-1.5)))
    noam_scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda, last_epoch=args.from_step)
    scheduler = LRScheduler(noam_scheduler)
    if args.scheduler == "linear":
        scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

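        # n_saved=None keeps every saved checkpoint instead of rotating only the most recent ones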
        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', n_saved=None)
        trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(os.path.join(log_dir, checkpoint_handler._saved[-1][1]), os.path.join(log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Example #16
        else:
            logger.info(
                'Current model CANNOT BEAT the previous best model, previous best accuracy is %.5f',
                best_res['acc'])

    def score_function(engine):
        return engine.state.metrics['accuracy']

    if not args.evaluation_mode:
        # If current run is training
        train_data_loader, _, _, _ = get_pytorch_dataloader(
            args, train_file_name_prefix, shuffle=True)
        optimizer = Adam(model.parameters(), lr=args.lr)
        # Learning rate decays every 5 epochs
        optimizer_scheduler = StepLR(optimizer, step_size=5, gamma=0.5)
        scheduler = LRScheduler(optimizer_scheduler)
        trainer = Engine(train)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, scheduler)
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  lambda _: evaluator.run(dev_data_loader))

        pbar = ProgressBar(persist=True, desc='Training')
        pbar.attach(trainer, metric_names=["loss"])
        RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(every=args.loss_log_interval),
            lambda engine: logger.info('Loss at iteration %d is %.5f',
                                       engine.state.iteration,
                                       engine.state.metrics['loss']))
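        # stop training when the validation accuracy has not improved for args.patience consecutive evaluations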
        early_stop_handler = EarlyStopping(patience=args.patience,
                                           score_function=score_function,
                                           trainer=trainer)
        evaluator.add_event_handler(Events.COMPLETED,