def _init_scheduler(self):
     if self.hparams.scheduler_name == "none":
         self.scheduler = None
     elif self.hparams.scheduler_name == "warmup_with_cosine":
         from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler, ConcatScheduler
         lr = self.hparams.lr
         if self.hparams.run_params["epoch_length"]:
             epoch_length = self.hparams.run_params["epoch_length"]
         else:
             epoch_length = len(self.train_loader)
         num_epochs = self.hparams.run_params["max_epochs"]
         scheduler_1 = LinearCyclicalScheduler(self.optimizer, "lr", start_value=lr*0.01, end_value=lr, cycle_size=epoch_length*2)
         scheduler_2 = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.001, cycle_size=num_epochs*epoch_length)
         durations = [epoch_length, ]
         self.scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations)
     elif self.hparams.scheduler_name == "warmup_with_cosine_100":
         from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler, ConcatScheduler
         lr = self.hparams.lr
         if self.hparams.run_params["epoch_length"]:
             epoch_length = self.hparams.run_params["epoch_length"]
         else:
             epoch_length = len(self.train_loader)
         num_epochs = self.hparams.run_params["max_epochs"]
         scheduler_1 = LinearCyclicalScheduler(self.optimizer, "lr", start_value=lr*0.01, end_value=lr, cycle_size=epoch_length*2)
         scheduler_2 = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.01, cycle_size=num_epochs*epoch_length)
         durations = [epoch_length, ]
         self.scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations)
     elif self.hparams.scheduler_name == "warmup_with_cosine_10":
         from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler, ConcatScheduler
         lr = self.hparams.lr
         if self.hparams.run_params["epoch_length"]:
             epoch_length = self.hparams.run_params["epoch_length"]
         else:
             epoch_length = len(self.train_loader)
         num_epochs = self.hparams.run_params["max_epochs"]
         scheduler_1 = LinearCyclicalScheduler(self.optimizer, "lr", start_value=lr*0.1, end_value=lr, cycle_size=epoch_length*2)
         scheduler_2 = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.1, cycle_size=num_epochs*epoch_length)
         durations = [epoch_length, ]
         self.scheduler = ConcatScheduler(schedulers=[scheduler_1, scheduler_2], durations=durations)
     elif self.hparams.scheduler_name == "one_cycle_cosine_10":
         from ignite.contrib.handlers import CosineAnnealingScheduler
         lr = self.hparams.lr
         if self.hparams.run_params["epoch_length"]:
             epoch_length = self.hparams.run_params["epoch_length"]
         else:
             epoch_length = len(self.train_loader)
         num_epochs = self.hparams.run_params["max_epochs"]
         self.scheduler  = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.1, cycle_size=num_epochs*epoch_length)
     elif self.hparams.scheduler_name == "one_cycle_cosine_100":
         from ignite.contrib.handlers import CosineAnnealingScheduler
         lr = self.hparams.lr
         if self.hparams.run_params["epoch_length"]:
             epoch_length = self.hparams.run_params["epoch_length"]
         else:
             epoch_length = len(self.train_loader)
         num_epochs = self.hparams.run_params["max_epochs"]
         self.scheduler  = CosineAnnealingScheduler(self.optimizer, "lr", start_value=lr, end_value=lr*0.01, cycle_size=num_epochs*epoch_length)
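
The branches above only construct self.scheduler; attaching it to the engine happens elsewhere in the class. Below is a minimal sketch, not part of the original, of how such a warmup-plus-cosine ConcatScheduler is typically wired to an ignite trainer (the names trainer, optimizer, lr, epoch_length and num_epochs are assumed to exist):

from ignite.engine import Events
from ignite.contrib.handlers import LinearCyclicalScheduler, CosineAnnealingScheduler, ConcatScheduler

def attach_warmup_with_cosine(trainer, optimizer, lr, epoch_length, num_epochs):
    # Linear warmup from lr*0.01 to lr over the first epoch (the rising half of a
    # two-epoch cycle), then cosine decay from lr down to lr*0.001.
    warmup = LinearCyclicalScheduler(optimizer, "lr", start_value=lr * 0.01,
                                     end_value=lr, cycle_size=epoch_length * 2)
    cosine = CosineAnnealingScheduler(optimizer, "lr", start_value=lr,
                                      end_value=lr * 0.001,
                                      cycle_size=num_epochs * epoch_length)
    scheduler = ConcatScheduler(schedulers=[warmup, cosine], durations=[epoch_length])
    # Update the learning rate before every iteration, as the other examples on this page do.
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    return scheduler
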
Example #2
def get_scheduler(optimizer, epochs, learning_rate, train_loader_size):
    scheduler = CosineAnnealingScheduler(optimizer, 'lr', learning_rate,
                                         learning_rate / 1000,
                                         epochs * train_loader_size)
    scheduler = create_lr_scheduler_with_warmup(scheduler, 0, 1000,
                                                learning_rate)
    return scheduler
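
A hedged usage sketch for the helper above (trainer, optimizer and train_loader are assumed to exist; the epoch count and learning rate are placeholder values):

from ignite.engine import Events

# Placeholder hyperparameters, for illustration only.
scheduler = get_scheduler(optimizer, epochs=50, learning_rate=1e-3,
                          train_loader_size=len(train_loader))
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
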
Example #3
def get_lr_scheduler(
    config: ConfigSchema, optimizer: Optimizer, trainer: Engine, evaluator: Engine
):
    if config.num_warmup_epochs:
        length = config.num_epochs - config.num_warmup_epochs
    else:
        length = config.num_epochs

    if config.lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingScheduler(
            optimizer,
            "lr",
            config.learning_rate,
            0.001 * config.learning_rate,
            cycle_size=length + 1,
        )
        if config.num_warmup_epochs:
            lr_scheduler = create_lr_scheduler_with_warmup(
                lr_scheduler, 0.0, config.num_warmup_epochs
            )
    elif config.lr_scheduler == "reduce_at_plateau":
        lr_scheduler = LRReductionEarlyStopping(
            optimizer,
            trainer=trainer,
            reduction_rate=0.1,
            num_reduction=2,
            patience=config.patience,
            score_function=lambda _: evaluator.state.metrics["accuracy"],
            num_warmup_epochs=config.num_warmup_epochs,
            warmup_start_value=0.001 * config.learning_rate,
        )
    else:
        raise ValueError(f"unknown lr scheduler {config.lr_scheduler}")

    return lr_scheduler
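
Note that cycle_size and the warmup duration in this example are measured in epochs rather than iterations, so the returned scheduler is presumably advanced once per epoch. A minimal sketch under that assumption (config, optimizer, trainer and evaluator are assumed to exist; the reduce_at_plateau branch is wired differently and is omitted):

from ignite.engine import Events

lr_scheduler = get_lr_scheduler(config, optimizer, trainer, evaluator)
if config.lr_scheduler == "cosine":
    # One scheduler event per epoch, matching the epoch-based cycle_size above.
    trainer.add_event_handler(Events.EPOCH_STARTED, lr_scheduler)
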
Example #4
val_evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device)

# #### EarlyStopping

# In[10]:

# handler = EarlyStopping(patience=30, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer)
# val_evaluator.add_event_handler(Events.COMPLETED, handler)

# #### LR Scheduler

# In[11]:

scheduler = CosineAnnealingScheduler(optimizer, 'lr', init_lr, end_lr,
                                     len(loader))
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


@trainer.on(Events.ITERATION_COMPLETED)
def print_lr(engine):
    epoch = engine.state.epoch
    iteration = engine.state.iteration

    if epoch < 2 and iteration % 100 == 0:
        print(f'Iteration {iteration} | LR {optimizer.param_groups[0]["lr"]}')


# #### Compute and display metrics

# In[12]:
Example #5
def main():
  parser = argparse.ArgumentParser()

  # Required parameters
  parser.add_argument("--model", type=str, default='ffn', help="model's name")
  parser.add_argument("--mode", type=int, choices=[0, 1, 2], default=None)
  parser.add_argument("--SNRdb", type=float, default=None)
  parser.add_argument("--pilot_version", type=int, choices=[1, 2], default=1)
  parser.add_argument("--loss_type", type=str, default="BCELoss")
  parser.add_argument("--train_batch_size", type=int, default=128)
  parser.add_argument("--valid_batch_size", type=int, default=128)
  parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
  parser.add_argument("--max_norm", type=float, default=-1)
  parser.add_argument("--lr", type=float, default=1e-3)
  parser.add_argument("--noise_lambda", type=float, default=1.0)
  parser.add_argument("--lr_scheduler", type=str, choices=["linear", "cycle", "cosine"], default="linear")
  parser.add_argument("--reset_lr_scheduler", type=str, choices=["linear", "cycle", "cosine"], default=None)
  parser.add_argument("--reset_trainer", action='store_true')
  parser.add_argument("--modify_model", action='store_true')
  parser.add_argument("--wd", type=float, default=1e-4, help="weight decay")
  parser.add_argument("--eval_iter", type=int, default=10)
  parser.add_argument("--save_iter", type=int, default=10)
  parser.add_argument("--n_epochs", type=int, default=10)
  parser.add_argument("--flush_dataset", type=int, default=0)
  parser.add_argument("--no_cache", action='store_true')
  parser.add_argument("--with_pure_y", action='store_true') 
  parser.add_argument("--with_h", action='store_true') 
  parser.add_argument("--only_l1", action='store_true', help="Only loss 1")
  parser.add_argument("--interpolation", action='store_true', help="if interpolate between pure and reconstruction.") 
  parser.add_argument("--data_dir", type=str, default="data")
  parser.add_argument("--cache_dir", type=str, default="train_cache")
  parser.add_argument("--output_path", type=str, default="runs", help="model save")
  parser.add_argument("--resume_from", type=str, default=None, help="resume training.")
  parser.add_argument("--first_cache_index", type=int, default=0)
  parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                      help="Device (cuda or cpu)")
  parser.add_argument("--local_rank", type=int, default=-1,
                      help="Local rank for distributed training (-1: not distributed)")
  parser.add_argument("--seed", type=int, default=43)
  parser.add_argument("--debug", action='store_true')
  args = parser.parse_args()

  args.output_path = os.path.join(args.output_path, f'pilot_{args.pilot_version}')
  args.cache_dir = os.path.join(args.data_dir, args.cache_dir)
  # Setup CUDA, GPU & distributed training
  args.distributed = (args.local_rank != -1)
  if not args.distributed:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method='env://')
  args.n_gpu = torch.cuda.device_count() if not args.distributed else 1
  args.device = device

  # Set seed
  set_seed(args)
  logger = setup_logger("trainer", distributed_rank=args.local_rank)

  # Model construction
  model = getattr(models, args.model)(args)
  model = model.to(device)
  optimizer = AdamW(model.parameters(), lr = args.lr, weight_decay=args.wd)

  if args.loss_type == "MSELoss":
    criterion = nn.MSELoss(reduction='sum').to(device)
  else:
    criterion = getattr(nn, args.loss_type, getattr(auxiliary, args.loss_type, None))().to(device)
  criterion2 = nn.MSELoss(reduction='sum').to(device)

  if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
    )

  train_dataset = SIGDataset(args, data_type="train")
  valid_dataset = SIGDataset(args, data_type="valid")
  train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
  valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
  train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, pin_memory=True, shuffle=(not args.distributed))
  valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, pin_memory=True, shuffle=False)
  
  lr_scheduler = None
  if args.lr_scheduler == "linear":
    lr_scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
  elif args.lr_scheduler == "cycle":
    lr_scheduler = LinearCyclicalScheduler(optimizer, 'lr', 0.0, args.lr, args.eval_iter * len(train_loader))
  elif args.lr_scheduler == "cosine":
    lr_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, args.eval_iter * len(train_loader))

  # Training function and trainer
  def update(engine, batch):
      model.train()
      y, x_label, y_pure, H = train_dataset.prepare_batch(batch, device=args.device)

      if args.with_pure_y and args.with_h:
        x_pred, y_pure_pred, H_pred = model(y, pure=y_pure, H=H, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        if args.loss_type == "MSELoss":
          loss_1 = loss_1 / x_pred.size(0)
        loss_noise = criterion2(y_pure_pred, y_pure) / y.size(0) / args.gradient_accumulation_steps
        loss_noise_h = criterion2(H_pred, H) / H.size(0) / args.gradient_accumulation_steps
        if args.only_l1:
          loss = loss_1
        else:
          loss = loss_1 + loss_noise * args.noise_lambda + loss_noise_h
        output = (loss.item(), loss_1.item(), loss_noise.item(), loss_noise_h.item())
      elif args.with_pure_y:
        x_pred, y_pure_pred = model(y, pure=y_pure if args.interpolation else None, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss_noise = criterion2(y_pure_pred, y_pure) / y.size(0) / args.gradient_accumulation_steps
        loss = loss_1 + loss_noise * args.noise_lambda
        output = (loss.item(), loss_1.item(), loss_noise.item())
      elif args.with_h:
        x_pred, H_pred = model(y, opp=True)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss_noise = criterion2(H_pred, H) / H.size(0) / args.gradient_accumulation_steps
        loss = loss_1 + loss_noise * args.noise_lambda
        output = (loss.item(), loss_1.item(), loss_noise.item())
      else:
        x_pred = model(y)
        loss_1 = criterion(x_pred, x_label) / args.gradient_accumulation_steps
        loss = loss_1
        output = (loss.item(), loss_1.item(), torch.zeros_like(loss_1).item())

      loss.backward()
      if args.max_norm > 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
      if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
      return output
  trainer = Engine(update)

  to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
  metric_names = ["loss", "l1", "ln"]
  if args.with_pure_y and args.with_h:
    metric_names.append("lnH")

  common.setup_common_training_handlers(
    trainer=trainer,
    train_sampler=train_loader.sampler,
    to_save=to_save,
    save_every_iters=len(train_loader) * args.save_iter,
    lr_scheduler=lr_scheduler,
    output_names=metric_names,
    with_pbars=False,
    clear_cuda_cache=False,
    output_path=args.output_path,
    n_saved=2,
  )

  resume_from = args.resume_from
  if resume_from is not None:
    checkpoint_fp = Path(resume_from)
    assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
    logger.info("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
    checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu")
    if args.reset_trainer:
      to_save.pop("trainer")
    checkpoint_to_load = to_save if 'validation' not in resume_from else {"model": model}
    Checkpoint.load_objects(to_load=checkpoint_to_load, checkpoint=checkpoint)
    if args.reset_lr_scheduler is not None:
      if args.reset_lr_scheduler == "linear":
        lr_scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
      elif args.reset_lr_scheduler == "cycle":
        lr_scheduler = LinearCyclicalScheduler(optimizer, 'lr', 0.0, args.lr, args.eval_iter * len(train_loader))
      elif args.reset_lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, args.eval_iter * len(train_loader))

  metrics = {
    "accuracy": Accuracy(lambda output: (torch.round(output[0][0]), output[1][0])), 
    "loss_1": Loss(criterion, output_transform=lambda output: (output[0][0], output[1][0])),
    "loss_noise": Loss(criterion2, output_transform=lambda output: (output[0][1], output[1][1]))
  }
  if args.with_pure_y and args.with_h:
    metrics["loss_noise_h"] = Loss(criterion2, output_transform=lambda output: (output[0][2], output[1][2]))

  def _inference(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]:
    model.eval()
    with torch.no_grad():
      x, y, x_pure, H = valid_dataset.prepare_batch(batch, device=args.device, non_blocking=True)
      if args.with_pure_y and args.with_h:
        y_pred, x_pure_pred, h_pred = model(x, opp=True)
        outputs = (y_pred, x_pure_pred, h_pred), (y, x_pure, H)
      elif args.with_pure_y:
        y_pred, x_pure_pred = model(x, opp=True)
        outputs = (y_pred, x_pure_pred), (y, x_pure)
      elif args.with_h:
        y_pred, h_pred = model(x, opp=True)
        outputs = (y_pred, h_pred), (y, H)
      else:
        y_pred = model(x)
        x_pure_pred = x_pure
        outputs = (y_pred, x_pure_pred), (y, x_pure)       
      return outputs
  evaluator = Engine(_inference)
  for name, metric in metrics.items():
      metric.attach(evaluator, name)

  trainer.add_event_handler(Events.EPOCH_COMPLETED(every=args.eval_iter), lambda _: evaluator.run(valid_loader))

  if args.flush_dataset > 0:
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=args.n_epochs//args.flush_dataset), 
                  lambda _: train_loader.dataset.reset() if args.no_cache else train_loader.dataset.reload())

  # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
  if args.local_rank in [-1, 0]:
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=metric_names, output_transform=lambda _: {"lr": f"{optimizer.param_groups[0]['lr']:.2e}"})
    evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    tb_logger = common.setup_tb_logging(args.output_path, trainer, optimizer, evaluators={'validation': evaluator}, log_every_iters=1)

  # Store 3 best models by validation accuracy:
  common.gen_save_best_models_by_val_score(
    save_handler=DiskSaver(args.output_path, require_empty=False),
    evaluator=evaluator,
    models={"model": model},
    metric_name="accuracy",
    n_saved=3,
    trainer=trainer,
    tag="validation"
  )

  # Run the training
  trainer.run(train_loader, max_epochs=args.n_epochs)

  if args.local_rank in [-1, 0]:
    tb_logger.close()
Example #6
        'params': parameters_of(model, (nn.BatchNorm2d, nn.PReLU)),
    }],
    lr=args.learning_rate,
    momentum=0.9)

class_freq = torch.from_numpy(Cityscapes.CLASS_FREQ).float()
weight = 1 / torch.log(1.02 + class_freq)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=255, weight=weight)
loss_fn = loss_fn.cuda()

warmup_iterations = 1000

scheduler = CosineAnnealingScheduler(
    optimizer,
    'lr',
    args.learning_rate,
    args.learning_rate * 1e-4,
    cycle_size=args.epochs * len(train_loader) - warmup_iterations,
)
scheduler = create_lr_scheduler_with_warmup(scheduler, 0, args.learning_rate,
                                            warmup_iterations)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_segmentation_trainer(
    model,
    optimizer,
    loss_fn,
Example #7
            'weight_decay': args.weight_decay,
        },
        {
            'params': (p for p in model.parameters() if len(p.shape) == 1),
        },
    ],
    lr=args.learning_rate,
)

loss_fn = OHEMLoss(ignore_index=255)
loss_fn = loss_fn.cuda()

scheduler = CosineAnnealingScheduler(
    optimizer,
    'lr',
    args.learning_rate,
    args.learning_rate / 1000,
    args.epochs * len(train_loader) - 1000,
)
scheduler = create_lr_scheduler_with_warmup(scheduler, 0, args.learning_rate,
                                            1000)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

trainer = create_segmentation_trainer(
    model,
    optimizer,
    loss_fn,
Example #8
def train(model_name: str, dir_dataset: Path, dir_model: Path, img_type: str,
          architecture: str, epoch: int, batch_size: int,
          multi_sample_aug: str, seed: int, debug: bool):

    logger = getLogger('root')

    tic = time.time()

    random.seed(seed)
    np.random.seed(seed)

    train_csv = pd.read_csv(dir_dataset / 'train.csv')

    dir_images = Path('../../input/train_raw')

    train_images_all = sorted(os.listdir(str(dir_images)))
    random.shuffle(train_images_all)

    num_train = int(len(train_images_all) * 0.9)
    train_images = train_images_all[:num_train]
    valid_images = train_images_all[num_train:]

    if debug:
        train_images = train_images[:4000]
        valid_images = valid_images[:2000]

    logger.info('train images: {}'.format(train_images[:5]))
    logger.info('valid images: {}'.format(valid_images[:5]))

    logger.info('==> prepare dataset')
    logger.info(f'==> use {multi_sample_aug}')

    if multi_sample_aug == 'mixup':
        train_mixup_dataset = BengaliMixUpDataset(
            1.0,
            dir_images,
            train_csv=train_csv,
            list_images=train_images,
            is_aug=True,
            get_augmenter_func=get_augmenter)
    elif multi_sample_aug == 'cutmix':
        train_mixup_dataset = BengaliCutMixDataset(
            1.0,
            dir_images,
            train_csv=train_csv,
            list_images=train_images,
            is_aug=True,
            get_augmenter_func=get_augmenter)
    elif multi_sample_aug == 'cutmixup':
        train_mixup_dataset = BengaliCutMixUpDataset(
            0.5,
            0.5,
            dir_images,
            train_csv=train_csv,
            list_images=train_images,
            is_aug=True,
            get_augmenter_func=get_augmenter)
    else:
        raise ValueError('Unknown Augmentation')

    train_dataset = BengaliDataset(dir_images,
                                   train_csv=train_csv,
                                   list_images=train_images,
                                   is_aug=False)
    valid_dataset = BengaliDataset(dir_images,
                                   train_csv=train_csv,
                                   list_images=valid_images,
                                   is_aug=False)

    logger.info('==> create data loader')
    train_loader = DataLoader(train_mixup_dataset,
                              num_workers=7,
                              batch_sampler=BalancedSampler(
                                  train_csv, train_images,
                                  len(train_images) // batch_size, batch_size))
    valid_loader = DataLoader(valid_dataset,
                              batch_size=batch_size,
                              num_workers=7,
                              shuffle=False)
    eval_train_loader = DataLoader(train_dataset,
                                   batch_size=batch_size,
                                   num_workers=7,
                                   shuffle=False,
                                   sampler=SequentialSampler(
                                       train_dataset,
                                       num_samples=len(valid_images)))

    logger.info('==> build model')
    model = build_model(architecture, img_type, pretrained=True)
    model.cuda()

    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=1e-3)
    loss_f = BengaliLoss()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        loss_f,
                                        device=device)

    ProgressBar(persist=True, desc='Train').attach(trainer)

    def extract_grapheme_root(output):
        y_pred, (y, _) = output
        return y_pred[:, :NUM_GRAPHEME_ROOT], y[:, 0]

    def extract_vowel_diacritic(output):
        y_pred, (y, _) = output
        return y_pred[:, NUM_GRAPHEME_ROOT:NUM_GRAPHEME_ROOT + NUM_VOWEL], y[:,
                                                                             1]

    def extract_consonant_diacritic(output):
        y_pred, (y, _) = output
        return y_pred[:, NUM_GRAPHEME_ROOT + NUM_VOWEL:], y[:, 2]

    metrics = {
        'accuracy_gr':
        Accuracy(extract_grapheme_root),
        'accuracy_vd':
        Accuracy(extract_vowel_diacritic),
        'accuracy_cd':
        Accuracy(extract_consonant_diacritic),
        'recall_gr':
        Recall(extract_grapheme_root, average=True),
        'recall_vd':
        Recall(extract_vowel_diacritic, average=True),
        'recall_cd':
        Recall(extract_consonant_diacritic, average=True),
        'ave_recall':
        0.5 * Recall(extract_grapheme_root, average=True) +
        0.25 * Recall(extract_vowel_diacritic, average=True) +
        0.25 * Recall(extract_consonant_diacritic, average=True)
    }

    evaluator_train = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device)

    ProgressBar(persist=True, desc='Train Evaluation').attach(evaluator_train)
    ProgressBar(persist=True, desc='Valid Evaluation').attach(evaluator)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(trainer):
        evaluator_train.run(eval_train_loader)
        eval_print('Train', trainer.state.epoch, evaluator_train.state.metrics)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        evaluator.run(valid_loader)
        eval_print('Valid', trainer.state.epoch, evaluator.state.metrics)

    handler = ModelCheckpoint(dirname=dir_model,
                              filename_prefix=f'temp',
                              create_dir=True,
                              require_empty=False,
                              n_saved=1,
                              score_function=score_function)
    evaluator.add_event_handler(Events.COMPLETED, handler, {'model': model})

    lr_handler = CosineAnnealingScheduler(optimizer,
                                          'lr',
                                          1e-3,
                                          1e-7,
                                          len(train_loader) * epoch,
                                          save_history=True)

    trainer.add_event_handler(Events.ITERATION_COMPLETED, lr_handler)

    trainer.run(train_loader, max_epochs=epoch)

    saved_model = handler._saved[0][1][0]
    model_new_name = str(dir_model / f'{model_name}.pth')
    logger.info(f'rename model {saved_model} to {model_new_name}')
    os.rename(saved_model, model_new_name)

    plt.plot(trainer.state.param_history['lr'])
    plt.xlabel('batch')
    plt.ylabel('learning rate')
    plt.savefig('_log/learning_rate.png')
    plt.close()

    model_conf = {
        'model_name': model_name,
        'architecture': architecture,
        'seed': seed,
        'img_type': img_type,
        'multi_sample_aug': multi_sample_aug
    }

    with open(str(params.dir_model / f'{model_name}.json'), 'w') as f:
        json.dump(model_conf, f, indent=2)

    elapsed_time = time.time() - tic
    logger.info(f'elapsed time: {elapsed_time / 60.0:.1f} [min]')
Example #9
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default='wikitext-2',
        help="One of ('wikitext-103', 'wikitext-2') or a dict of splits paths."
    )
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")

    parser.add_argument("--embed_dim",
                        type=int,
                        default=410,
                        help="Embeddings dim")
    parser.add_argument("--hidden_dim",
                        type=int,
                        default=2100,
                        help="Hidden dimension")
    parser.add_argument("--num_max_positions",
                        type=int,
                        default=256,
                        help="Max input length")
    parser.add_argument("--num_heads",
                        type=int,
                        default=10,
                        help="Number of heads")
    parser.add_argument("--num_layers",
                        type=int,
                        default=16,
                        help="NUmber of layers")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--initializer_range",
                        type=float,
                        default=0.02,
                        help="Dropout")

    parser.add_argument("--train_batch_size",
                        type=int,
                        default=8,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=8,
                        help="Batch size for validation")
    parser.add_argument("--lr",
                        type=float,
                        default=2.5e-4,
                        help="Learning rate")
    parser.add_argument("--max_norm",
                        type=float,
                        default=0.25,
                        help="Clipping gradient norm")
    parser.add_argument("--weight_decay",
                        type=float,
                        default=0.0,
                        help="Weight decay")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=200,
                        help="Number of training epochs")
    parser.add_argument("--n_warmup",
                        type=int,
                        default=1000,
                        help="Number of warmup iterations")
    parser.add_argument("--eval_every",
                        type=int,
                        default=-1,
                        help="Evaluate every X steps (-1 => end of epoch)")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Accumulate gradient")

    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log on main process only, logger.warning => log on all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(
        args))  # This is a logger.info: only printed on the first process

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, model and optimizer")
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-cased',
        do_lower_case=False)  # Let's use a pre-defined tokenizer
    args.num_embeddings = len(
        tokenizer.vocab
    )  # We need this to create the model at next line (number of embeddings to use)
    model = TransformerWithLMHead(args)
    model.to(args.device)
    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)
    logger.info("Model has %s parameters",
                sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler, train_num_words, valid_num_words = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = batch.transpose(0, 1).contiguous().to(
            args.device)  # to shape [seq length, batch]
        logits, loss = model(batch, labels=batch)
        loss = loss / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = batch.transpose(0, 1).contiguous().to(
                args.device)  # to shape [seq length, batch]
            logits = model(batch)
            shift_logits = logits[:-1]
            shift_labels = batch[1:]
            return shift_logits.view(-1,
                                     logits.size(-1)), shift_labels.view(-1)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda engine: evaluator.run(val_loader)
            if engine.state.iteration % args.eval_every == 0 else None)
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine schedule
    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0,
                                             len(train_loader) * args.n_epochs)
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr,
                                                args.n_warmup)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we average distributed metrics using average_distributed_scalar
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    metrics["average_word_ppl"] = MetricsLambda(
        lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words),
        metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train
    if args.local_rank in [-1, 0]:
        checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving(
            trainer, evaluator, metrics, model, optimizer, args)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint for easy re-loading
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Example #10
def run(*options, cfg=None, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from the cfg file
        Options passed in via the options argument will override options loaded from the cfg file

    Args:
        *options (str, int, optional): Options used to override what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.        
        debug (bool): Places scripts in debug/test mode and only executes a few iterations
    """
    # Configuration:
    update_config(config, options=options, config_file=cfg)

    # The model will be saved under: outputs/<config_file_name>/<model_dir>
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]
    try:
        output_dir = generate_path(
            config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),
        )
    except Exception:
        output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),)

    # Logging:
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)

    # Set CUDNN benchmark mode:
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    # We will write the model under outputs / config_file_name / model_dir
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

    # Fix random seeds:
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Augmentation:
    basic_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=1),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=255,
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=255,
            ),
        ]
    )
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    # Training and Validation Loaders:
    TrainPatchLoader = get_patch_loader(config)
    logging.info(f"Using {TrainPatchLoader}")
    train_set = TrainPatchLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split="train",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=train_aug,
        debug=debug,
    )
    logger.info(train_set)
    n_classes = train_set.n_classes
    val_set = TrainPatchLoader(
        config.DATASET.ROOT,
        config.DATASET.NUM_CLASSES,
        split="val",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=val_aug,
        debug=debug,
    )
    logger.info(val_set)

    if debug:
        logger.info("Running in debug mode..")
        train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES))
        val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU))

    train_loader = data.DataLoader(
        train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True
    )
    val_loader = data.DataLoader(
        val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=1
    )  # config.WORKERS)

    # Model:
    model = getattr(models, config.MODEL.NAME).get_seg_model(config)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Optimizer and LR Scheduler:
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader)
    scheduler = CosineAnnealingScheduler(
        optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration
    )

    # Tensorboard writer:
    summary_writer = create_summary_writer(log_dir=path.join(output_dir, "logs"))

    # class weights are inversely proportional to the frequency of the classes in the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)

    # Loss:
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean")

    # Ignite trainer and evaluator:
    trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device)
    transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze())
    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll": Loss(criterion, output_transform=transform_fn),
            "pixacc": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device),
            "cacc": class_accuracy(n_classes, output_transform=transform_fn),
            "mca": mean_class_accuracy(n_classes, output_transform=transform_fn),
            "ciou": class_iou(n_classes, output_transform=transform_fn),
            "mIoU": mean_iou(n_classes, output_transform=transform_fn),
        },
        device=device,
    )
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Logging:
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer))

    # Tensorboard and Logging:
    trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer))
    trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer))

    # add specific logger which also triggers printed metrics on training set
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training")
        logging_handlers.log_metrics(engine, evaluator, stage="Training")

    # add specific logger which also triggers printed metrics on validation set
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation")
        logging_handlers.log_metrics(engine, evaluator, stage="Validation")
        # dump validation set metrics at the very end for debugging purposes
        if engine.state.epoch == config.TRAIN.END_EPOCH and debug:
            fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
            metrics = evaluator.state.metrics
            out_dict = {x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"]}
            with open(fname, "w") as fid:
                json.dump(out_dict, fid)
            log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys())
            logging.info(log_msg)

    # Checkpointing: snapshotting trained models to disk
    checkpoint_handler = SnapshotHandler(
        output_dir,
        config.MODEL.NAME,
        extract_metric_from("mIoU"),
        lambda: (trainer.state.iteration % snapshot_duration) == 0,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model})

    logger.info("Starting training")
    trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)

    summary_writer.close()
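
The cosine schedule here doubles as a snapshot-ensembling schedule: the learning rate anneals from TRAIN.MAX_LR to TRAIN.MIN_LR over one cycle of snapshot_duration iterations, then restarts at MAX_LR, and SnapshotHandler saves a model each time the iteration count reaches a cycle boundary. A small arithmetic sketch with hypothetical values:

# Hypothetical numbers, only to illustrate how snapshot_duration is derived above.
end_epoch, snapshots, iters_per_epoch = 60, 6, 500
epochs_per_cycle = end_epoch // snapshots               # 10 epochs per cosine cycle
snapshot_duration = epochs_per_cycle * iters_per_epoch  # 5000 iterations per cycle/snapshot
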
Example #11
# #### EarlyStopping

# In[8]:


handler = EarlyStopping(patience=10, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer)
val_evaluator.add_event_handler(Events.COMPLETED, handler)


# #### LR Scheduler

# In[9]:


scheduler = CosineAnnealingScheduler(optimizer, 'lr', initial_lr, 1e-7, len(loader))
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

@trainer.on(Events.ITERATION_STARTED)
def warmup_lr(engine):
    epoch = engine.state.epoch
    if epoch < 6:
        for param_group in optimizer.param_groups:
            # at the 6th epoch the lr will be the initial lr
            param_group['lr'] = 1e-7 + (epoch - 1)*(1./5.)*initial_lr

printed_warmup_epochs = []
@trainer.on(Events.ITERATION_STARTED)
def print_warmup_lr(engine):
    global printed_warmup_epochs
    epoch = engine.state.epoch
Example #12
                         mode='bilinear',
                         align_corners=True)
    return loss_fn(y_pred, y)


def supervised_loss_fn(y_pred, y):
    y_pred, aux_y_pred = y_pred
    return \
        loss_fn(y_pred, y) \
        + 0.4 * sum((aux_loss(y_pred, y) for y_pred in aux_y_pred))


scheduler1 = CosineAnnealingScheduler(
    optimizer,
    param_name='lr',
    start_value=args.learning_rate / 10,
    end_value=args.learning_rate / 10 * 1e-4,
    cycle_size=args.epochs * len(train_loader) - 1000,
    param_group_index=0,
)
scheduler1 = create_lr_scheduler_with_warmup(scheduler1, 0,
                                             args.learning_rate / 10, 1000)
scheduler2 = CosineAnnealingScheduler(
    optimizer,
    param_name='lr',
    start_value=args.learning_rate / 10,
    end_value=args.learning_rate / 10 * 1e-4,
    cycle_size=args.epochs * len(train_loader) - 1000,
    param_group_index=1,
)
scheduler2 = create_lr_scheduler_with_warmup(scheduler2, 0,
                                             args.learning_rate / 10, 1000)
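
Example #12 drives two optimizer parameter groups independently via param_group_index; the optimizer construction and the attach calls fall outside the excerpt. A minimal sketch of the assumed surrounding code (backbone_params, head_params, args and trainer are placeholders, not from the original):

import torch
from ignite.engine import Events

# Hypothetical two-group optimizer: group 0 (e.g. decayed weights) and group 1
# (e.g. norm/bias parameters) are scheduled separately by scheduler1 / scheduler2.
optimizer = torch.optim.SGD(
    [{'params': backbone_params}, {'params': head_params}],
    lr=args.learning_rate / 10, momentum=0.9)

# ... scheduler1 / scheduler2 built as in the excerpt above ...
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler1)
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler2)
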
Example #13
def add_events(engines, dataloaders, model, optimizer, device, save_dir, args):
    trainer, valid_evaluator, test_evaluator = engines
    train_dl, valid_dl, test_dl = dataloaders

    if args.valid_on == 'Loss':
        score_fn = lambda engine: -engine.state.metrics[args.valid_on]
    elif args.valid_on == 'Product':
        score_fn = lambda engine: engine.state.metrics[
            'MRR'] * engine.state.metrics['HR@10']
    elif args.valid_on == 'RMS':
        score_fn = lambda engine: engine.state.metrics[
            'MRR']**2 + engine.state.metrics['HR@10']**2
    else:
        score_fn = lambda engine: engine.state.metrics[args.valid_on]

    # LR Scheduler
    if args.lr_scheduler == 'restart':
        scheduler = CosineAnnealingScheduler(optimizer,
                                             'lr',
                                             start_value=args.lr,
                                             end_value=args.lr * 0.01,
                                             cycle_size=len(train_dl),
                                             cycle_mult=args.cycle_mult)
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler,
                                  'lr_scheduler')
    elif args.lr_scheduler == 'triangle':
        scheduler = make_slanted_triangular_lr_scheduler(
            optimizer, n_events=args.n_epochs * len(train_dl), lr_max=args.lr)
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler,
                                  'lr_scheduler')
    elif args.lr_scheduler == 'none':
        pass
    else:
        raise NotImplementedError

    # EarlyStopping
    trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())
    valid_evaluator.add_event_handler(
        Events.COMPLETED,
        EarlyStopping(args.patience, score_function=score_fn, trainer=trainer))
    # Training Loss
    RunningAverage(output_transform=lambda x: x,
                   alpha=args.avg_alpha).attach(trainer, 'loss')
    # Checkpoint
    ckpt_handler = ModelCheckpoint(save_dir,
                                   'best',
                                   score_function=score_fn,
                                   score_name=args.valid_on,
                                   n_saved=1)
    valid_evaluator.add_event_handler(Events.COMPLETED, ckpt_handler,
                                      {'model': model})
    # Timer
    timer = Timer(average=True)
    timer.attach(trainer,
                 resume=Events.EPOCH_STARTED,
                 step=Events.EPOCH_COMPLETED)
    # Progress Bar
    if args.pbar:
        pbar = ProgressBar()
        pbar.attach(trainer, ['loss'])
        log_msg = pbar.log_message
    else:
        log_msg = print

    cpe_valid = CustomPeriodicEvent(n_epochs=args.valid_every)
    cpe_valid.attach(trainer)
    valid_metrics_history = []

    @trainer.on(
        getattr(cpe_valid.Events, f'EPOCHS_{args.valid_every}_COMPLETED'))
    def evaluate_on_valid(engine):
        state = valid_evaluator.run(valid_dl)
        metrics = state.metrics
        valid_metrics_history.append(metrics)
        msg = f'Epoch: {engine.state.epoch:3d} AvgTime: {timer.value():3.1f}s TrainLoss: {engine.state.metrics["loss"]:.4f} '
        msg += ' '.join([
            f'{k}: {v:.4f}' for k, v in metrics.items()
            if k in ['Loss', 'MRR', 'HR@10']
        ])
        log_msg(msg)

    @trainer.on(Events.COMPLETED)
    def evaluate_on_test(engine):
        pth_file = [
            f for f in pathlib.Path(save_dir).iterdir()
            if f.name.endswith('pth')
        ][0]
        log_msg(f'Load Best Model: {str(pth_file)}')
        model.load_state_dict(torch.load(pth_file, map_location=device))
        # Rerun on Valid for log.
        valid_state = valid_evaluator.run(valid_dl)
        engine.state.valid_metrics = valid_state.metrics
        # Test
        test_state = test_evaluator.run(test_dl)
        engine.state.test_metrics = test_state.metrics
        engine.state.valid_metrics_history = valid_metrics_history
        msg = f'[Test] '
        msg += ' '.join([
            f'{k}: {v:.4f}' for k, v in test_state.metrics.items()
            if k in ['Loss', 'MRR', 'HR@10']
        ])
        log_msg(msg)

    # Tensorboard
    if args.tensorboard:
        tb_logger = TensorboardLogger(log_dir=str(save_dir / 'tb_log'))
        # Loss
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(
                             tag='training', output_transform=lambda x: x),
                         event_name=Events.ITERATION_COMPLETED)
        # Metrics
        tb_logger.attach(valid_evaluator,
                         log_handler=OutputHandler(
                             tag='validation',
                             metric_names=['Loss', 'MRR', 'HR@10'],
                             another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)
        # Optimizer
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        # Parameters
        # tb_logger.attach(trainer,
        #                  log_handler=WeightsScalarHandler(model),
        #                  event_name=Events.ITERATION_COMPLETED)
        # tb_logger.attach(trainer,
        #                  log_handler=GradsScalarHandler(model),
        #                  event_name=Events.ITERATION_COMPLETED)

        @trainer.on(Events.COMPLETED)
        def close_tb(engine):
            tb_logger.close()
Example #14
        {'params': parameters_of(model, (nn.BatchNorm2d, nn.PReLU)), },
    ],
    lr=args.learning_rate,
)


class_freq = torch.from_numpy(Cityscapes.CLASS_FREQ).float()
weight = 1 / torch.log(1.02 + class_freq)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=255, weight=weight)
# loss_fn = OHEMLoss(ignore_index=255)
loss_fn = loss_fn.cuda()


scheduler1 = CosineAnnealingScheduler(
    optimizer, 'lr',
    args.learning_rate, args.learning_rate * 1e-4,
    cycle_size=args.epochs * len(train_loader),
)
scheduler1 = create_lr_scheduler_with_warmup(
    scheduler1, 0, args.learning_rate, 1000)


model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)


trainer = create_segmentation_trainer(
    model, optimizer, loss_fn,
    device=device,
Example #15
def train(args):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=False)
    args.num_embeddings = len(
        tokenizer.vocab
    )  # We need this to create the model at next line (number of embeddings to use)
    model = TransformerWithLMHead(args)
    model.to(args.device)
    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)

    logger.info("Model has %s parameters",
                sum(p.numel() for p in model.parameters() if p.requires_grad))

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler, train_num_words, valid_num_words = get_data_loaders(
        args, tokenizer)

    # Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original
    def mask_tokens(inputs):
        labels = inputs.clone()
        masked_indices = torch.bernoulli(
            torch.full(labels.shape, args.mlm_probability)).byte()
        labels[~masked_indices] = -1  # We only compute loss on masked tokens
        indices_replaced = torch.bernoulli(torch.full(
            labels.shape, 0.8)).byte() & masked_indices
        inputs[indices_replaced] = tokenizer.vocab[
            "[MASK]"]  # 80% of the time, replace masked input tokens with [MASK]
        indices_random = torch.bernoulli(torch.full(
            labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
        random_words = torch.randint(args.num_embeddings,
                                     labels.shape,
                                     dtype=torch.long,
                                     device=args.device)
        inputs[indices_random] = random_words[
            indices_random]  # 10% of the time, replace masked input tokens with random word
        return inputs, labels

    def update(engine, batch):
        model.train()
        inputs = batch.transpose(0, 1).contiguous().to(args.device)
        inputs, labels = mask_tokens(inputs) if args.mlm else (inputs, inputs)
        logits, loss = model(inputs, labels=labels)
        loss = loss / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            inputs = batch.transpose(0, 1).contiguous().to(args.device)
            inputs, labels = mask_tokens(inputs) if args.mlm else (
                inputs,
                inputs)  # Prepare masked input/labels if we use masked LM
            logits = model(inputs)
            shift_logits = logits[:-1] if not args.mlm else logits
            shift_labels = labels[1:] if not args.mlm else labels
            return shift_logits.view(-1,
                                     logits.size(-1)), shift_labels.view(-1)

    evaluator = Engine(inference)

    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda engine: evaluator.run(val_loader)
            if engine.state.iteration % args.eval_every == 0 else None)
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))

    # Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine schedule
    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0,
                                             len(train_loader) * args.n_epochs)
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr,
                                                args.n_warmup)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we average distributed metrics using average_distributed_scalar
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    # Let's convert sub-word perplexities in word perplexities. If you need details: http://sjmielke.com/comparing-perplexities.htm
    metrics["average_word_ppl"] = MetricsLambda(
        lambda x: math.exp(x * val_loader.dataset.numel() / valid_num_words),
        metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train
    if args.local_rank in [-1, 0]:
        checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving(
            trainer, evaluator, metrics, model, optimizer, args)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)
Example #16
    print(model)
    wandb.watch(model)

    loss = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=cfg.lr,
                                 weight_decay=cfg.weight_decay)
    trainer = create_supervised_trainer(model, optimizer, loss, device)
    RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED,
        create_lr_scheduler_with_warmup(CosineAnnealingScheduler(
            optimizer,
            param_name='lr',
            start_value=cfg.lr,
            end_value=0,
            cycle_size=len(train_loader) * cfg.n_epochs,
            start_value_mult=0,
            end_value_mult=0),
                                        warmup_start_value=0.0,
                                        warmup_end_value=cfg.lr,
                                        warmup_duration=len(train_loader)))
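    # As I read ignite's cyclical schedulers (sketch): start_value_mult=0 and
    # end_value_mult=0 mean that if training runs past one full cosine cycle the lr does
    # not restart at cfg.lr but stays at 0; the warmup wrapper handles the first
    # len(train_loader) iterations, ramping the lr from 0 up to cfg.lr before the
    # cosine decay to 0 begins.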

    evaluator = create_supervised_evaluator(
        model,
        metrics={
            'loss': Loss(loss),
            'acc_smpl': Accuracy(threshold_output, is_multilabel=True),
            'p': Precision(threshold_output, average=True),
            'r': Recall(threshold_output, average=True),
            'f1': Fbeta(1.0, output_transform=threshold_output),
Example #17
def run(*options, cfg=None, local_rank=0, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override options loaded from cfg file

    Args:
        *options (str, int, optional): Options used to override what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """
    update_config(config, options=options, config_file=cfg)

    # we will write the model under outputs / config_file_name / model_dir
    config_file_name = "default_config" if not cfg else cfg.split(
        "/")[-1].split(".")[0]

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    silence_other_ranks = True
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    distributed = world_size > 1

    if distributed:
        # FOR DISTRIBUTED: Set the device according to local_rank.
        torch.cuda.set_device(local_rank)

        # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will
        # provide environment variables, and requires that you use init_method=`env://`.
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")

    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)
    # Setup Augmentations
    basic_aug = Compose([
        Normalize(mean=(config.TRAIN.MEAN, ),
                  std=(config.TRAIN.STD, ),
                  max_pixel_value=1),
        PadIfNeeded(
            min_height=config.TRAIN.PATCH_SIZE,
            min_width=config.TRAIN.PATCH_SIZE,
            border_mode=config.OPENCV_BORDER_CONSTANT,
            always_apply=True,
            mask_value=255,
        ),
        Resize(
            config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT,
            config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH,
            always_apply=True,
        ),
        PadIfNeeded(
            min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
            min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
            border_mode=config.OPENCV_BORDER_CONSTANT,
            always_apply=True,
            mask_value=255,
        ),
    ])
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    TrainPatchLoader = get_patch_loader(config)

    train_set = TrainPatchLoader(
        config.DATASET.ROOT,
        split="train",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=train_aug,
    )

    val_set = TrainPatchLoader(
        config.DATASET.ROOT,
        split="val",
        is_transform=True,
        stride=config.TRAIN.STRIDE,
        patch_size=config.TRAIN.PATCH_SIZE,
        augmentations=val_aug,
    )

    logger.info(f"Validation examples {len(val_set)}")
    n_classes = train_set.n_classes

    if debug:
        val_set = data.Subset(val_set,
                              range(config.VALIDATION.BATCH_SIZE_PER_GPU))
        train_set = data.Subset(train_set,
                                range(config.TRAIN.BATCH_SIZE_PER_GPU * 2))

    logger.info(f"Training examples {len(train_set)}")
    logger.info(f"Validation examples {len(val_set)}")

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set, num_replicas=world_size, rank=local_rank)

    train_loader = data.DataLoader(
        train_set,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        sampler=train_sampler,
    )

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_set, num_replicas=world_size, rank=local_rank)

    val_loader = data.DataLoader(
        val_set,
        batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        sampler=val_sampler,
    )

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model = model.to(device)  # Send to GPU

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS,
                                 device=device,
                                 requires_grad=False)
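    # How such weights are typically obtained (sketch, not taken from this config): one
    # common recipe, used in an earlier example above, is weight_c = 1 / log(1.02 + freq_c)
    # where freq_c is the fraction of training pixels belonging to class c, so rarer
    # classes contribute more to the loss.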

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights,
                                          ignore_index=255,
                                          reduction="mean")

    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[device], find_unused_parameters=True)

    snapshot_duration = epochs_per_cycle * len(
        train_loader) if not debug else 2 * len(train_loader)

    warmup_duration = 5 * len(train_loader)

    warmup_scheduler = LinearCyclicalScheduler(
        optimizer,
        "lr",
        start_value=config.TRAIN.MAX_LR,
        end_value=config.TRAIN.MAX_LR * world_size,
        cycle_size=10 * len(train_loader),
    )
    cosine_scheduler = CosineAnnealingScheduler(
        optimizer,
        "lr",
        config.TRAIN.MAX_LR * world_size,
        config.TRAIN.MIN_LR * world_size,
        cycle_size=snapshot_duration,
    )

    scheduler = ConcatScheduler(
        schedulers=[warmup_scheduler, cosine_scheduler],
        durations=[warmup_duration])
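    # Scheduler shape (sketch): for the first warmup_duration (5 epochs of) iterations the
    # linear scheduler ramps the lr from MAX_LR towards MAX_LR * world_size, the usual
    # linear-scaling heuristic for multi-GPU training; the ConcatScheduler then switches
    # to the cosine schedule, which anneals from MAX_LR * world_size down to
    # MIN_LR * world_size over one snapshot cycle.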

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch,
                                        device=device)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    # Set to update the epoch parameter of our distributed data sampler so that we get
    # different shuffles
    trainer.add_event_handler(Events.EPOCH_STARTED,
                              update_sampler_epoch(train_loader))

    if silence_other_ranks and local_rank != 0:
        logging.getLogger("ignite.engine.engine.Engine").setLevel(
            logging.WARNING)

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(),
                model_out_dict["mask"].squeeze())

    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll":
            Loss(criterion,
                 output_transform=_select_pred_and_mask,
                 device=device),
            "pixa":
            pixelwise_accuracy(n_classes,
                               output_transform=_select_pred_and_mask,
                               device=device),
            "cacc":
            class_accuracy(n_classes,
                           output_transform=_select_pred_and_mask,
                           device=device),
            "mca":
            mean_class_accuracy(n_classes,
                                output_transform=_select_pred_and_mask,
                                device=device),
            "ciou":
            class_iou(n_classes,
                      output_transform=_select_pred_and_mask,
                      device=device),
            "mIoU":
            mean_iou(n_classes,
                     output_transform=_select_pred_and_mask,
                     device=device),
        },
        device=device,
    )

    # Set the validation run to start on the epoch completion of the training run

    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              Evaluator(evaluator, val_loader))

    if local_rank == 0:  # Run only on master process

        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            logging_handlers.log_training_output(
                log_interval=config.TRAIN.BATCH_SIZE_PER_GPU),
        )
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  logging_handlers.log_lr(optimizer))

        try:
            output_dir = generate_path(
                config.OUTPUT_DIR,
                git_branch(),
                git_hash(),
                config_file_name,
                config.TRAIN.MODEL_DIR,
                current_datetime(),
            )
        except TypeError:
            output_dir = generate_path(
                config.OUTPUT_DIR,
                config_file_name,
                config.TRAIN.MODEL_DIR,
                current_datetime(),
            )

        summary_writer = create_summary_writer(
            log_dir=path.join(output_dir, config.LOG_DIR))
        logger.info(
            f"Logging Tensorboard to {path.join(output_dir, config.LOG_DIR)}")
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"),
        )
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            tensorboard_handlers.log_training_output(summary_writer),
        )
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            logging_handlers.log_metrics(
                "Validation results",
                metrics_dict={
                    "nll": "Avg loss :",
                    "mIoU": " Avg IoU :",
                    "pixa": "Pixelwise Accuracy :",
                    "mca": "Mean Class Accuracy :",
                },
            ),
        )
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            tensorboard_handlers.log_metrics(
                summary_writer,
                trainer,
                "epoch",
                metrics_dict={
                    "mIoU": "Validation/IoU",
                    "nll": "Validation/Loss",
                    "mca": "Validation/MCA",
                },
            ),
        )

        def _select_max(pred_tensor):
            return pred_tensor.max(1)[1]

        def _tensor_to_numpy(pred_tensor):
            return pred_tensor.squeeze().cpu().numpy()

        transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes),
                                 _tensor_to_numpy)
        transform_pred = compose(transform_func, _select_max)
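        # Image logging pipeline (sketch): transform_pred takes raw logits, picks the
        # argmax class per pixel (_select_max), moves the result to a numpy array,
        # colourises it with decode_segmap and converts it to a tensorboard-friendly
        # image via np_to_tb; transform_func does the same for ground-truth masks,
        # which are already class indices and so skip the argmax step.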
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            create_image_writer(summary_writer, "Validation/Image", "image"),
        )
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            create_image_writer(summary_writer,
                                "Validation/Mask",
                                "mask",
                                transform_func=transform_func),
        )
        evaluator.add_event_handler(
            Events.EPOCH_COMPLETED,
            create_image_writer(
                summary_writer,
                "Validation/Pred",
                "y_pred",
                transform_func=transform_pred,
            ),
        )

        def snapshot_function():
            return (trainer.state.iteration % snapshot_duration) == 0

        checkpoint_handler = SnapshotHandler(
            output_dir,
            config.MODEL.NAME,
            extract_metric_from("mIoU"),
            snapshot_function,
        )
        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                    {"model": model})
        logger.info("Starting training")

        if debug:
            trainer.run(
                train_loader,
                max_epochs=config.TRAIN.END_EPOCH,
                epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU * 2,
                seed=config.SEED,
            )
        else:
            trainer.run(train_loader,
                        max_epochs=config.TRAIN.END_EPOCH,
                        epoch_length=len(train_loader),
                        seed=config.SEED)
Example #18
val_evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device)

# #### EarlyStopping

# In[8]:

# handler = EarlyStopping(patience=30, score_function=lambda engine: engine.state.metrics['accuracy'], trainer=trainer)
# val_evaluator.add_event_handler(Events.COMPLETED, handler)

# #### LR Scheduler

# In[9]:

scheduler = CosineAnnealingScheduler(optimizer, 'lr', 10e-4, 1e-7, len(loader))
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


@trainer.on(Events.ITERATION_COMPLETED)
def print_lr(engine):
    epoch = engine.state.epoch
    iteration = engine.state.iteration

    if epoch < 2 and iteration % 100 == 0:
        print(f'Iteration {iteration} | LR {optimizer.param_groups[0]["lr"]}')


# #### Compute and display metrics

# In[10]:
Example #19
def run(*options, cfg=None, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options loaded from default.py will be overridden by those loaded from cfg file
        Options passed in via options argument will override those loaded from cfg file
    
    Args:
        *options (str, int, optional): Options used to override what is loaded from the
                                    config. To see what options are available consult
                                    default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
        debug (bool): Places scripts in debug/test mode and only executes a few iterations
    """

    update_config(config, options=options, config_file=cfg)

    # we will write the model under outputs / config_file_name / model_dir
    config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0]

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"

    # Setup Augmentations
    basic_aug = Compose(
        [
            Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,),
            PadIfNeeded(
                min_height=config.TRAIN.PATCH_SIZE,
                min_width=config.TRAIN.PATCH_SIZE,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
            Resize(
                config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True,
            ),
            PadIfNeeded(
                min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
                min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
                border_mode=config.OPENCV_BORDER_CONSTANT,
                always_apply=True,
                mask_value=mask_value,
                value=0,
            ),
        ]
    )
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    PenobscotDataset = get_patch_dataset(config)

    train_set = PenobscotDataset(
        config.DATASET.ROOT,
        config.TRAIN.PATCH_SIZE,
        config.TRAIN.STRIDE,
        split="train",
        transforms=train_aug,
        n_channels=config.MODEL.IN_CHANNELS,
        complete_patches_only=config.TRAIN.COMPLETE_PATCHES_ONLY,
    )

    val_set = PenobscotDataset(
        config.DATASET.ROOT,
        config.TRAIN.PATCH_SIZE,
        config.TRAIN.STRIDE,
        split="val",
        transforms=val_aug,
        n_channels=config.MODEL.IN_CHANNELS,
        complete_patches_only=config.VALIDATION.COMPLETE_PATCHES_ONLY,
    )
    logger.info(train_set)
    logger.info(val_set)
    n_classes = train_set.n_classes

    train_loader = data.DataLoader(
        train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True,
    )

    if debug:
        val_set = data.Subset(val_set, range(3))

    val_loader = data.DataLoader(val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS)

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)

    model = model.to(device)  # Send to GPU

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    try:
        output_dir = generate_path(
            config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),
        )
    except TypeError:
        output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),)

    summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR))
    snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader)
    scheduler = CosineAnnealingScheduler(
        optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration
    )

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False)

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean")

    trainer = create_supervised_trainer(model, optimizer, criterion, _prepare_batch, device=device)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU),
    )
    trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer))
    trainer.add_event_handler(
        Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"),
    )
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer),
    )

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze())

    evaluator = create_supervised_evaluator(
        model,
        _prepare_batch,
        metrics={
            "pixacc": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "nll": Loss(criterion, output_transform=_select_pred_and_mask),
            "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask),
            "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask),
            "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask),
        },
        device=device,
    )

    # Set the validation run to start on the epoch completion of the training run
    trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Validation results",
            metrics_dict={
                "nll": "Avg loss :",
                "pixacc": "Pixelwise Accuracy :",
                "mca": "Avg Class Accuracy :",
                "mIoU": "Avg Class IoU :",
            },
        ),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            trainer,
            "epoch",
            metrics_dict={
                "mIoU": "Validation/mIoU",
                "nll": "Validation/Loss",
                "mca": "Validation/MCA",
                "pixacc": "Validation/Pixel_Acc",
            },
        ),
    )

    def _select_max(pred_tensor):
        return pred_tensor.max(1)[1]

    def _tensor_to_numpy(pred_tensor):
        return pred_tensor.squeeze().cpu().numpy()

    transform_func = compose(np_to_tb, decode_segmap, _tensor_to_numpy,)

    transform_pred = compose(transform_func, _select_max)

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func),
    )
    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred),
    )

    def snapshot_function():
        return (trainer.state.iteration % snapshot_duration) == 0
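    # Snapshot timing (sketch): with snapshot_duration = epochs_per_cycle * len(train_loader),
    # this predicate is true exactly at the end of each cosine cycle, so the handler below
    # saves one snapshot checkpoint per cycle (config.TRAIN.SNAPSHOTS of them in total
    # when not in debug mode).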

    checkpoint_handler = SnapshotHandler(output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function,)
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model})

    logger.info("Starting training")
    if debug:
        trainer.run(
            train_loader,
            max_epochs=config.TRAIN.END_EPOCH,
            epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU,
            seed=config.SEED,
        )
    else:
        trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
Example #20
def run(*options,
        cfg=None,
        local_rank=0,
        debug=False,
        input=None,
        distributed=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override options loaded from cfg file
    
    Args:
        *options (str, int, optional): Options used to override what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.        
        debug (bool): Places scripts in debug/test mode and only executes a few iterations
        input (str, optional): Location of the data when run as an Azure ML pipeline;
            for local runs input is config.DATASET.ROOT
        distributed (bool): This flag tells the training script to run in distributed mode
            if more than one GPU exists.
    """

    # if AML training pipeline supplies us with input
    if input is not None:
        data_dir = input
        output_dir = data_dir + config.OUTPUT_DIR

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)

    # Configuration:
    update_config(config, options=options, config_file=cfg)
    silence_other_ranks = True

    world_size = int(os.environ.get("WORLD_SIZE", 1))
    distributed = world_size > 1

    if distributed:
        # FOR DISTRIBUTED: Set the device according to local_rank.
        torch.cuda.set_device(local_rank)

        # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will
        # provide environment variables, and requires that you use init_method=`env://`.
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        logging.info(f"Started train.py using distributed mode.")
    else:
        logging.info(f"Started train.py using local mode.")

    # Set CUDNN benchmark mode:
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    # Fix random seeds:
    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Augmentation:
    basic_aug = Compose([
        Normalize(mean=(config.TRAIN.MEAN, ),
                  std=(config.TRAIN.STD, ),
                  max_pixel_value=1),
        PadIfNeeded(
            min_height=config.TRAIN.PATCH_SIZE,
            min_width=config.TRAIN.PATCH_SIZE,
            border_mode=config.OPENCV_BORDER_CONSTANT,
            always_apply=True,
            mask_value=255,
            value=0,
        ),
        Resize(
            config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT,
            config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH,
            always_apply=True,
        ),
        PadIfNeeded(
            min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT,
            min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH,
            border_mode=config.OPENCV_BORDER_CONSTANT,
            always_apply=True,
            mask_value=255,
        ),
    ])
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    # Training and Validation Loaders:
    TrainPatchLoader = get_patch_loader(config)
    logging.info(f"Using {TrainPatchLoader}")

    train_set = TrainPatchLoader(
        config,
        split="train",
        is_transform=True,
        augmentations=train_aug,
        debug=debug,
    )
    logger.info(train_set)

    n_classes = train_set.n_classes
    val_set = TrainPatchLoader(
        config,
        split="val",
        is_transform=True,
        augmentations=val_aug,
        debug=debug,
    )

    logger.info(val_set)

    if debug:
        data_flow_dict = dict()

        data_flow_dict["train_patch_loader_length"] = len(train_set)
        data_flow_dict["validation_patch_loader_length"] = len(val_set)
        data_flow_dict["train_input_shape"] = train_set.seismic.shape
        data_flow_dict["train_label_shape"] = train_set.labels.shape
        data_flow_dict["n_classes"] = n_classes

        logger.info("Running in debug mode..")
        train_range = min(
            config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES,
            len(train_set))
        logging.info(f"train range in debug mode {train_range}")
        train_set = data.Subset(train_set, range(train_range))
        valid_range = min(config.VALIDATION.BATCH_SIZE_PER_GPU, len(val_set))
        val_set = data.Subset(val_set, range(valid_range))

        data_flow_dict["train_length_subset"] = len(train_set)
        data_flow_dict["validation_length_subset"] = len(val_set)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set, num_replicas=world_size, rank=local_rank)
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_set, num_replicas=world_size, rank=local_rank)

    train_loader = data.DataLoader(
        train_set,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        sampler=train_sampler,
    )
    val_loader = data.DataLoader(
        val_set,
        batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU,
        num_workers=config.WORKERS,
        sampler=val_sampler)

    if debug:
        data_flow_dict["train_loader_length"] = len(train_loader)
        data_flow_dict["validation_loader_length"] = len(val_loader)
        config_file_name = "default_config" if not cfg else cfg.split(
            "/")[-1].split(".")[0]
        fname = f"data_flow_train_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
        with open(fname, "w") as f:
            json.dump(data_flow_dict, f, indent=2)

    # Model:
    model = getattr(models, config.MODEL.NAME).get_seg_model(config)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Optimizer and LR Scheduler:
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    snapshot_duration = epochs_per_cycle * len(
        train_loader) if not debug else 2 * len(train_loader)
    cosine_scheduler = CosineAnnealingScheduler(
        optimizer,
        "lr",
        config.TRAIN.MAX_LR * world_size,
        config.TRAIN.MIN_LR * world_size,
        cycle_size=snapshot_duration,
    )

    if distributed:
        warmup_duration = 5 * len(train_loader)
        warmup_scheduler = LinearCyclicalScheduler(
            optimizer,
            "lr",
            start_value=config.TRAIN.MAX_LR,
            end_value=config.TRAIN.MAX_LR * world_size,
            cycle_size=10 * len(train_loader),
        )
        scheduler = ConcatScheduler(
            schedulers=[warmup_scheduler, cosine_scheduler],
            durations=[warmup_duration])
    else:
        scheduler = cosine_scheduler

    # class weights are inversely proportional to the frequency of the classes in the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS,
                                 device=device,
                                 requires_grad=False)

    # Loss:
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights,
                                          ignore_index=255,
                                          reduction="mean")

    # Model:
    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)
        if silence_other_ranks and local_rank != 0:
            logging.getLogger("ignite.engine.engine.Engine").setLevel(
                logging.WARNING)

    # Ignite trainer and evaluator:
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch,
                                        device=device)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
    # Set to update the epoch parameter of our distributed data sampler so that we get
    # different shuffles
    trainer.add_event_handler(Events.EPOCH_STARTED,
                              update_sampler_epoch(train_loader))

    transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(),
                                        output_dict["mask"].squeeze())
    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll":
            Loss(criterion, output_transform=transform_fn, device=device),
            "pixacc":
            pixelwise_accuracy(n_classes,
                               output_transform=transform_fn,
                               device=device),
            "cacc":
            class_accuracy(n_classes,
                           output_transform=transform_fn,
                           device=device),
            "mca":
            mean_class_accuracy(n_classes,
                                output_transform=transform_fn,
                                device=device),
            "ciou":
            class_iou(n_classes, output_transform=transform_fn, device=device),
            "mIoU":
            mean_iou(n_classes, output_transform=transform_fn, device=device),
        },
        device=device,
    )

    # The model will be saved under: outputs/<config_file_name>/<model_dir>
    config_file_name = "default_config" if not cfg else cfg.split(
        "/")[-1].split(".")[0]
    try:
        output_dir = generate_path(
            config.OUTPUT_DIR,
            git_branch(),
            git_hash(),
            config_file_name,
            config.TRAIN.MODEL_DIR,
            current_datetime(),
        )
    except TypeError:
        output_dir = generate_path(
            config.OUTPUT_DIR,
            config_file_name,
            config.TRAIN.MODEL_DIR,
            current_datetime(),
        )

    if local_rank == 0:  # Run only on master process
        # Logging:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            logging_handlers.log_training_output(
                log_interval=config.PRINT_FREQ),
        )
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  logging_handlers.log_lr(optimizer))

        # Checkpointing: snapshotting trained models to disk
        checkpoint_handler = SnapshotHandler(
            output_dir,
            config.MODEL.NAME,
            extract_metric_from("mIoU"),
            lambda: (trainer.state.iteration % snapshot_duration) == 0,
        )

        evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                    {"model": model})

        # Tensorboard and Logging:
        summary_writer = create_summary_writer(
            log_dir=path.join(output_dir, "logs"))
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"))
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            tensorboard_handlers.log_training_output(summary_writer))
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            tensorboard_handlers.log_validation_output(summary_writer))

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        evaluator.run(train_loader)
        if local_rank == 0:  # Run only on master process
            tensorboard_handlers.log_results(engine,
                                             evaluator,
                                             summary_writer,
                                             n_classes,
                                             stage="Training")
            logging_handlers.log_metrics(engine, evaluator, stage="Training")
            logger.info("Logging training results..")

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        if local_rank == 0:  # Run only on master process
            tensorboard_handlers.log_results(engine,
                                             evaluator,
                                             summary_writer,
                                             n_classes,
                                             stage="Validation")
            logging_handlers.log_metrics(engine, evaluator, stage="Validation")
            logger.info("Logging validation results..")
            # dump validation set metrics at the very end for debugging purposes
            if engine.state.epoch == config.TRAIN.END_EPOCH and debug:
                fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json"
                metrics = evaluator.state.metrics
                out_dict = {
                    x: metrics[x]
                    for x in ["nll", "pixacc", "mca", "mIoU"]
                }
                with open(fname, "w") as fid:
                    json.dump(out_dict, fid)
                log_msg = " ".join(f"{k}: {out_dict[k]}"
                                   for k in out_dict.keys())
                logging.info(log_msg)

    logger.info("Starting training")
    trainer.run(train_loader,
                max_epochs=config.TRAIN.END_EPOCH,
                epoch_length=len(train_loader),
                seed=config.SEED)
    if local_rank == 0:
        summary_writer.close()
Example #21
# Define training function
def update(engine, batch):
    model.train()
    batch = batch.transpose(0, 1).contiguous().to(args.device)  # to shape [seq length, batch]
    logits, loss = model(batch, labels=batch)
    loss = loss / args.gradient_accumulation_steps
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
    if engine.state.iteration % args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()
trainer = Engine(update)

# Add progressbar with loss
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

# Learning rate schedule: linearly warm-up to lr and then decrease the learning rate to zero with cosine
cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(dataloader) * args.n_epochs)
scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.n_warmup)
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

# Save checkpoints and training config
checkpoint_handler = ModelCheckpoint(args.log_dir, 'checkpoint', save_interval=1, n_saved=5)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': model})
torch.save(args, os.path.join(args.log_dir, 'training_args.bin'))

trainer.run(dataloader, max_epochs=args.n_epochs)
Example #22
     'lr': args.learning_rate,
     'weight_decay': args.weight_decay, },
    {'params': parameters_of(model.decoder, nn.BatchNorm2d),
     'lr': args.learning_rate, }, ],
    momentum=0.9,
)

loss_fn = OHEMLoss(ignore_index=255).cuda()

lr = args.learning_rate
lrs = [lr / 10, lr/10, lr, lr]

schedulers = [
    CosineAnnealingScheduler(
        optimizer, 'lr',
        lr, lr * 1e-4,
        args.epochs * len(train_loader),
        param_group_index=index)
    for index, lr in enumerate(lrs)]
schedulers = [
    create_lr_scheduler_with_warmup(scheduler, 0, lr, 1000)
    for scheduler, lr in zip(schedulers, lrs)
]
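# One scheduler per parameter group (sketch): lrs lists a peak lr for each of the
# optimizer's four param groups (the first two, presumably the encoder, at lr / 10), and
# param_group_index ties each CosineAnnealingScheduler to its own group; each scheduler is
# then wrapped with a 1000-iteration linear warmup and would be attached to the trainer
# individually.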


model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
    model = convert_syncbn_model(model)
    model = DistributedDataParallel(model)

Example #23
def run(*options, cfg=None, debug=False):
    """Run training and validation of model

    Notes:
        Options can be passed in via the options argument and loaded from the cfg file
        Options from default.py will be overridden by options loaded from cfg file
        Options passed in via options argument will override options loaded from cfg file

    Args:
        *options (str, int, optional): Options used to override what is loaded from the
                                      config. To see what options are available consult
                                      default.py
        cfg (str, optional): Location of config file to load. Defaults to None.
    """

    update_config(config, options=options, config_file=cfg)

    # Start logging
    load_log_configuration(config.LOG_CONFIG)
    logger = logging.getLogger(__name__)
    logger.debug(config.WORKERS)
    scheduler_step = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS
    torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK

    torch.manual_seed(config.SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.SEED)
    np.random.seed(seed=config.SEED)

    # Setup Augmentations
    basic_aug = Compose([
        Normalize(mean=(config.TRAIN.MEAN, ),
                  std=(config.TRAIN.STD, ),
                  max_pixel_value=1)
    ])
    if config.TRAIN.AUGMENTATION:
        train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)])
        val_aug = basic_aug
    else:
        train_aug = val_aug = basic_aug

    TrainLoader = get_section_loader(config)

    train_set = TrainLoader(
        data_dir=config.DATASET.ROOT,
        split="train",
        is_transform=True,
        augmentations=train_aug,
    )

    val_set = TrainLoader(
        data_dir=config.DATASET.ROOT,
        split="val",
        is_transform=True,
        augmentations=val_aug,
    )

    class CustomSampler(torch.utils.data.Sampler):
        def __init__(self, data_source):
            self.data_source = data_source

        def __iter__(self):
            char = ["i" if np.random.randint(2) == 1 else "x"]
            self.indices = [
                idx for (idx, name) in enumerate(self.data_source)
                if char[0] in name
            ]
            return (self.indices[i] for i in torch.randperm(len(self.indices)))

        def __len__(self):
            return len(self.data_source)
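    # Sampler behaviour (sketch): each epoch the sampler flips a coin and keeps only the
    # section names containing "i" or only those containing "x" (presumably inline vs.
    # crossline sections), then yields that subset in a fresh random order; __len__ still
    # reports the full dataset size, so an epoch may iterate fewer batches than expected.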

    n_classes = train_set.n_classes

    val_list = val_set.sections
    train_list = train_set.sections

    train_loader = data.DataLoader(
        train_set,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        sampler=CustomSampler(train_list),
        num_workers=config.WORKERS,
        shuffle=False,
    )

    val_loader = data.DataLoader(
        val_set,
        batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU,
        sampler=CustomSampler(val_list),
        num_workers=config.WORKERS,
    )

    model = getattr(models, config.MODEL.NAME).get_seg_model(config)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    model = model.to(device)  # Send to GPU

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.TRAIN.MAX_LR,
        momentum=config.TRAIN.MOMENTUM,
        weight_decay=config.TRAIN.WEIGHT_DECAY,
    )

    try:
        output_dir = generate_path(
            config.OUTPUT_DIR,
            git_branch(),
            git_hash(),
            config.MODEL.NAME,
            current_datetime(),
        )
    except TypeError:
        output_dir = generate_path(
            config.OUTPUT_DIR,
            config.MODEL.NAME,
            current_datetime(),
        )

    summary_writer = create_summary_writer(
        log_dir=path.join(output_dir, config.LOG_DIR))

    snapshot_duration = scheduler_step * len(train_loader)
    scheduler = CosineAnnealingScheduler(optimizer, "lr", config.TRAIN.MAX_LR,
                                         config.TRAIN.MIN_LR,
                                         snapshot_duration)

    # weights are inversely proportional to the frequency of the classes in
    # the training set
    class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS,
                                 device=device,
                                 requires_grad=False)

    criterion = torch.nn.CrossEntropyLoss(weight=class_weights,
                                          ignore_index=255,
                                          reduction="mean")

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        prepare_batch,
                                        device=device)

    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED,
        logging_handlers.log_training_output(log_interval=config.PRINT_FREQ),
    )

    trainer.add_event_handler(Events.EPOCH_STARTED,
                              logging_handlers.log_lr(optimizer))

    trainer.add_event_handler(
        Events.EPOCH_STARTED,
        tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"),
    )

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED,
        tensorboard_handlers.log_training_output(summary_writer),
    )

    def _select_pred_and_mask(model_out_dict):
        return (model_out_dict["y_pred"].squeeze(),
                model_out_dict["mask"].squeeze())

    evaluator = create_supervised_evaluator(
        model,
        prepare_batch,
        metrics={
            "nll":
            Loss(criterion,
                 output_transform=_select_pred_and_mask,
                 device=device),
            "pixacc":
            pixelwise_accuracy(n_classes,
                               output_transform=_select_pred_and_mask,
                               device=device),
            "cacc":
            class_accuracy(n_classes,
                           output_transform=_select_pred_and_mask,
                           device=device),
            "mca":
            mean_class_accuracy(n_classes,
                                output_transform=_select_pred_and_mask,
                                device=device),
            "ciou":
            class_iou(n_classes,
                      output_transform=_select_pred_and_mask,
                      device=device),
            "mIoU":
            mean_iou(n_classes,
                     output_transform=_select_pred_and_mask,
                     device=device),
        },
        device=device,
    )

    if debug:
        logger.info("Running Validation in Debug/Test mode")
        val_loader = take(3, val_loader)
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              Evaluator(evaluator, val_loader))

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_metrics(
            "Validation results",
            metrics_dict={
                "nll": "Avg loss :",
                "pixacc": "Pixelwise Accuracy :",
                "mca": "Avg Class Accuracy :",
                "mIoU": "Avg Class IoU :",
            },
        ),
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        logging_handlers.log_class_metrics(
            "Per class validation results",
            metrics_dict={
                "ciou": "Class IoU :",
                "cacc": "Class Accuracy :"
            },
        ),
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        tensorboard_handlers.log_metrics(
            summary_writer,
            trainer,
            "epoch",
            metrics_dict={
                "mIoU": "Validation/mIoU",
                "nll": "Validation/Loss",
                "mca": "Validation/MCA",
                "pixacc": "Validation/Pixel_Acc",
            },
        ),
    )

    def _select_max(pred_tensor):
        return pred_tensor.max(1)[1]

    def _tensor_to_numpy(pred_tensor):
        return pred_tensor.squeeze().cpu().numpy()

    transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes),
                             _tensor_to_numpy)

    transform_pred = compose(transform_func, _select_max)

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer, "Validation/Image", "image"),
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer,
                            "Validation/Mask",
                            "mask",
                            transform_func=transform_func),
    )

    evaluator.add_event_handler(
        Events.EPOCH_COMPLETED,
        create_image_writer(summary_writer,
                            "Validation/Pred",
                            "y_pred",
                            transform_func=transform_pred),
    )

    def snapshot_function():
        return (trainer.state.iteration % snapshot_duration) == 0

    checkpoint_handler = SnapshotHandler(
        path.join(output_dir, config.TRAIN.MODEL_DIR),
        config.MODEL.NAME,
        extract_metric_from("mIoU"),
        snapshot_function,
    )

    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                {"model": model})

    logger.info("Starting training")
    if debug:
        logger.info("Running Validation in Debug/Test mode")
        train_loader = take(3, train_loader)
    trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH)
Example #24
def train():
    parser = ArgumentParser()
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--dataset_key",
                        type=str,
                        default='wikitext-2',
                        help="key from DATASETS global")
    parser.add_argument("--train_file",
                        type=str,
                        help='Optional file path to use for train file')
    parser.add_argument("--valid_file",
                        type=str,
                        help='Optional file path to use for valid file')
    parser.add_argument("--dataset_cache",
                        type=str,
                        default=os.path.expanduser('~/.bl-data'),
                        help="Path or url of the dataset cache")
    parser.add_argument("--cache_features", type=str2bool, default=True)
    parser.add_argument("--d_model",
                        type=int,
                        default=410,
                        help="Model dimension (and embedding dsz)")
    parser.add_argument("--d_ff", type=int, default=2100, help="FFN dimension")
    parser.add_argument("--num_heads",
                        type=int,
                        default=10,
                        help="Number of heads")
    parser.add_argument("--num_layers",
                        type=int,
                        default=8,
                        help="Number of layers")
    parser.add_argument("--nctx",
                        type=int,
                        default=256,
                        help="Max input length")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch Size")
    parser.add_argument("--tokens",
                        choices=["words", "chars", "subwords"],
                        default="subwords",
                        help="What tokens to use")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--lr",
                        type=float,
                        default=4.0e-4,
                        help="Learning rate")
    parser.add_argument("--clip",
                        type=float,
                        default=0.25,
                        help="Clipping gradient norm")
    parser.add_argument("--weight_decay",
                        type=float,
                        default=0.0,
                        help="Weight decay")
    parser.add_argument("--epochs",
                        type=int,
                        default=20,
                        help="Num training epochs")
    parser.add_argument("--warmup_steps",
                        type=int,
                        default=1000,
                        help="Num warmup steps")
    parser.add_argument("--eval_every",
                        type=int,
                        default=-1,
                        help="Evaluate every X steps (-1 => end of epoch)")

    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--distributed",
                        type=str2bool,
                        default=False,
                        help="Are we doing distributed training?")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help=
        "Local rank for distributed training (-1 means use the environment variables to find)"
    )
    parser.add_argument("--chars_per_word",
                        type=int,
                        default=40,
                        help="How many max characters per word")
    parser.add_argument(
        "--accum_grad_steps",
        type=int,
        default=1,
        help="Create effective batch size by accumulating grads without updates"
    )
    args = parser.parse_args()

    if args.train_file and not args.valid_file:
        logger.error(
            "If you provide a train_file, you must provide a valid_file")
        return

    if not args.train_file and args.valid_file:
        logger.error(
            "If you provide a valid_file, you must also provide a train_file")
        return

    if args.basedir is None:
        args.basedir = 'transformer-{}-{}-{}'.format(args.dataset_key,
                                                     args.tokens, os.getpid())
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("Cache directory [%s]", args.dataset_cache)

    args.distributed = args.distributed or int(os.environ.get("WORLD_SIZE",
                                                              1)) > 1

    if args.distributed:
        if args.local_rank == -1:
            # https://github.com/kubeflow/pytorch-operator/issues/128
            # https://github.com/pytorch/examples/blob/master/imagenet/main.py
            logger.info("Setting local rank to RANK env variable")
            args.local_rank = int(os.environ['RANK'])
        logger.warning("Local rank (%d)", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    if args.train_file:
        dataset = {
            'train_file': args.train_file,
            'valid_file': args.valid_file
        }
    else:
        dataset = DataDownloader(DATASETS[args.dataset_key],
                                 args.dataset_cache).download()
    reader = create_reader(args.tokens, args.nctx, args.chars_per_word)

    preproc_data = load_embed_and_vocab(args.tokens, reader, dataset,
                                        args.dataset_key, args.d_model,
                                        args.cache_features)

    vocabs = preproc_data['vocabs']
    os.makedirs(args.basedir, exist_ok=True)
    # We want to make sure to save our input vocab into the basedir for reuse later
    write_json(vocabs['x'], os.path.join(args.basedir, 'vocabs.json'))
    embeddings = preproc_data['embeddings']
    valid_num_words = preproc_data['valid_num_words']
    tgt_key = preproc_data['tgt_key']
    logger.info("Loaded embeddings")

    train_set = load_data(args.tokens, reader, dataset, 'train_file', vocabs,
                          args.cache_features)
    valid_set = load_data(args.tokens, reader, dataset, 'valid_file', vocabs,
                          args.cache_features)
    logger.info("valid. tokens [%s], valid. words [%s]",
                valid_set.tensors[-1].numel(), valid_num_words)

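    # In distributed mode, DistributedSampler gives each process a distinct shard of the data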
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_set) if args.distributed else None
    train_loader = DataLoader(train_set,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              shuffle=(train_sampler is None))

    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_set) if args.distributed else None
    valid_loader = DataLoader(valid_set,
                              sampler=valid_sampler,
                              batch_size=args.batch_size,
                              shuffle=False)

    logger.info("Loaded datasets")

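    # Weight tying is disabled for char inputs, where the input embeddings and
    # the output projection do not share a vocabulary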
    model = TransformerLanguageModel.create(
        embeddings,
        hsz=args.d_model,
        d_ff=args.d_ff,
        tie_weights=(args.tokens != 'chars'),
        dropout=args.dropout,
        gpu=False,
        num_heads=args.num_heads,
        layers=args.num_layers,
        src_keys=['x'],
        tgt_key=tgt_key)
    model.to(args.device)
    train_loss = model.create_loss()
    train_loss.to(args.device)

    logger.info("Loaded model and loss")

    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)
    logger.info("Model has %s parameters",
                sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)
        logger.info("Model located on %d", args.local_rank)

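    # One training step: forward pass, shifted next-token LM loss, backward,
    # and an optimizer update every accum_grad_steps iterations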
    def update(engine, batch):
        model.train()
        x, y = batch
        inputs = {'x': x.to(args.device)}
        labels = y.to(args.device).transpose(0, 1).contiguous()
        logits = model(inputs, None)[0].transpose(0, 1).contiguous()
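        # Shift so that positions < n predict token n (next-token objective)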
        shift_logits = logits[:-1]
        shift_labels = labels[1:]
        loss = train_loss(shift_logits, shift_labels)
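        # Scale the loss so gradients accumulated over accum_grad_steps
        # micro-batches approximate one larger-batch update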
        loss = loss / args.accum_grad_steps
        loss.backward()
        if engine.state.iteration % args.accum_grad_steps == 0:
            # Clip the fully accumulated gradients once per optimizer step
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

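    # Evaluation step: returns flattened (logits, labels) so the attached
    # Loss metric can compute the validation NLL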
    def inference(_, batch):
        model.eval()
        with torch.no_grad():
            x, y = batch
            inputs = {'x': x.to(args.device)}
            labels = y.to(args.device).transpose(0, 1).contiguous()
            logits = model(inputs, None)[0].transpose(0, 1).contiguous()
            shift_logits = logits[:-1]
            shift_labels = labels[1:]
            return shift_logits.view(-1,
                                     logits.size(-1)), shift_labels.view(-1)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(valid_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda engine: evaluator.run(valid_loader)
            if engine.state.iteration % args.eval_every == 0 else None)
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

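    # Cosine-decay the learning rate from args.lr to 0 over the full run,
    # preceded by a linear warmup of args.warmup_steps iterations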
    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0,
                                             len(train_loader) * args.epochs)
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr,
                                                args.warmup_steps)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

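    # Validation NLL, averaged across workers when running distributed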
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })

    if args.tokens == 'subwords':
        # Subword NLL is renormalized by the word count to report word-level perplexity
        metrics["average_subword_ppl"] = MetricsLambda(math.exp,
                                                       metrics["average_nll"])
        metrics["average_word_ppl"] = MetricsLambda(
            lambda x: math.exp(x * valid_set.tensors[-1].numel() /
                               valid_num_words), metrics["average_nll"])
    else:
        metrics["average_word_ppl"] = MetricsLambda(math.exp,
                                                    metrics["average_nll"])

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

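    # Only rank 0 (or single-process runs) logs progress and saves checkpoints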
    if args.local_rank < 1:
        RunningAverage(output_transform=lambda x: x).attach(
            trainer, "avg_train_loss")
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, lambda _: print(
                "Epoch[{}] Training Loss: {:.2f}, Perplexity {:.2f}".format(
                    trainer.state.epoch,
                    trainer.state.metrics["avg_train_loss"],
                    np.exp(trainer.state.metrics["avg_train_loss"]))))
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: print("Validation: %s" % pformat(
                evaluator.state.metrics)))
        checkpoint_handler = ModelCheckpoint(args.basedir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3,
                                             create_dir=False)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})
    trainer.run(train_loader, max_epochs=args.epochs)