def experiment(logdir: Path, device: torch.device) -> None:
    """Experiment function.

    Args:
        logdir (Path): directory where logs should be placed
        device (torch.device): device to use
    """
    tb_dir = logdir / "tensorboard"
    main_metric = "gap"
    minimize_metric = False

    seed_all()

    model = EncoderWithHead(
        EfficientNetEncoder("efficientnet-b0", EMBEDDING_SIZE, bias=False),
        CosFace(EMBEDDING_SIZE, NUM_CLASSESS, None),
    )
    load_checkpoint("./logs/full_set5/stage_0/best.pth", model)
    # model.head.s = 64.0  # np.sqrt(2) * np.log(NUM_CLASSESS - 1)
    # model.head.m = 0.35
    # # freeze backbone of encoder
    # for parameter in model.encoder.base._blocks.parameters():
    #     parameter.requires_grad = False
    model = nn.DataParallel(model)
    model = model.to(device)

    optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9, weight_decay=1e-5)
    criterion = nn.CrossEntropyLoss(weight=get_class_weights()).to(device)
    scheduler = None

    with TensorboardLogger(tb_dir) as tb:
        stage = "stage_0"
        n_epochs = 5
        print(f"Stage - '{stage}'")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=5,
        )

        train_loader, valid_loader = get_loaders(stage, train_bs=32, valid_bs=64)

        for epoch in range(1, n_epochs + 1):
            epoch_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(f"[{epoch_start_time}]\n[Epoch {epoch}/{n_epochs}]")

            train_metrics = train_fn(model, train_loader, device, criterion, optimizer)
            log_metrics(stage, train_metrics, tb, "train", epoch)

            valid_metrics = valid_fn(model, valid_loader, device, criterion)
            log_metrics(stage, valid_metrics, tb, "valid", epoch)

            checkpointer.process(
                metric_value=valid_metrics[main_metric],
                epoch=epoch,
                checkpoint=make_checkpoint(
                    stage,
                    epoch,
                    model,
                    optimizer,
                    scheduler,
                    metrics={"train": train_metrics, "valid": valid_metrics},
                ),
            )
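# `seed_all` is called at the start of every experiment but is not defined in this
# file. A minimal sketch of such a reproducibility helper follows; the default seed
# value and the cuDNN flags are assumptions, not the project's actual settings.
def seed_all(seed: int = 42) -> None:
    """Seed python, numpy and torch RNGs so experiment runs are repeatable."""
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # deterministic cuDNN kernels trade some speed for reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False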
def experiment(logdir, device) -> None:
    """Experiment function.

    Args:
        logdir (Path): directory where logs should be placed
        device (str): device name to use
    """
    tb_dir = logdir / "tensorboard"
    main_metric = "loss"
    minimize_metric = True

    seed_all()

    history_n_frames = cfg["model_params"]["history_num_frames"]
    future_n_frames = cfg["model_params"]["future_num_frames"]
    n_trajectories = 3
    model = ModelWithConfidence(
        backbone=resnet18(
            pretrained=True,
            in_channels=3 + 2 * (history_n_frames + 1),
            num_classes=2 * future_n_frames * n_trajectories + n_trajectories,
        ),
        future_num_frames=future_n_frames,
        num_trajectories=n_trajectories,
    )
    # model = nn.DataParallel(model)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = neg_multi_log_likelihood_batch
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

    with TensorboardLogger(tb_dir) as tb:
        stage = "stage_0"
        n_epochs = 1
        print(f"Stage - {stage}")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=5,
        )

        train_loader, valid_loader = get_loaders(train_batch_size=32, valid_batch_size=32)

        for epoch in range(1, n_epochs + 1):
            epoch_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(f"[{epoch_start_time}]\n[Epoch {epoch}/{n_epochs}]")

            train_metrics = train_fn(model, train_loader, device, criterion, optimizer)
            log_metrics(stage, train_metrics, tb, "train", epoch)

            valid_metrics = valid_fn(model, valid_loader, device, criterion)
            log_metrics(stage, valid_metrics, tb, "valid", epoch)

            checkpointer.process(
                metric_value=valid_metrics[main_metric],
                epoch=epoch,
                checkpoint=make_checkpoint(
                    stage,
                    epoch,
                    model,
                    optimizer,
                    scheduler,
                    metrics={"train": train_metrics, "valid": valid_metrics},
                ),
            )

            scheduler.step()
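# `neg_multi_log_likelihood_batch` (the criterion above) is assumed to be the
# multi-modal trajectory loss from the Lyft motion-prediction setup: the negative
# log-likelihood of the ground truth under a mixture of unit-variance Gaussians,
# one mode per predicted trajectory. Below is a sketch consistent with how
# `train_fn` calls it - loss_fn(targets, predictions, confidences, availabilities).
def neg_multi_log_likelihood_batch(gt, predictions, confidences, avails):
    """
    Args:
        gt: (bs, time, 2) ground-truth positions
        predictions: (bs, modes, time, 2) predicted trajectories
        confidences: (bs, modes) unnormalized mode scores
        avails: (bs, time) 1.0 where the target frame is valid
    """
    gt = torch.unsqueeze(gt, 1)        # (bs, 1, time, 2)
    avails = avails[:, None, :, None]  # (bs, 1, time, 1)
    # squared error per mode and timestep, masked by availability
    error = torch.sum(((gt - predictions) * avails) ** 2, dim=-1)  # (bs, modes, time)
    # log of the mixture likelihood, computed stably via logsumexp
    log_probs = torch.log_softmax(confidences, dim=1) - 0.5 * torch.sum(error, dim=-1)
    return torch.mean(-torch.logsumexp(log_probs, dim=1))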
def train_fn(
    model,
    loader,
    device,
    loss_fn,
    optimizer,
    scheduler=None,
    accumulation_steps=1,
    verbose=True,
    tensorboard_logger=None,
    logdir=None,
):
    """Train step.

    Args:
        model (nn.Module): model to train
        loader (DataLoader): loader with data
        device (str or torch.device): device to use for placing batches
        loss_fn (nn.Module): loss function, should be callable
        optimizer (torch.optim.Optimizer): model parameters optimizer
        scheduler ([type], optional): batch scheduler to use.
            Default is `None`.
        accumulation_steps (int, optional): number of steps to accumulate gradients.
            Default is `1`.
        verbose (bool, optional): verbosity mode.
            Default is `True`.
        tensorboard_logger (TensorboardLogger, optional): logger for batch-level metrics.
            Default is `None`.
        logdir (Path, optional): directory where intermediate checkpoints are saved.
            Default is `None`.

    Returns:
        dict with metrics computed during the training on loader
    """
    model.train()

    metrics = {"loss": 0.0}
    n_batches = len(loader)
    indices_to_save = [int(n_batches * pcnt) for pcnt in np.arange(0.1, 1, 0.1)]

    with tqdm(total=len(loader), desc="train", disable=not verbose) as progress:
        for idx, batch in enumerate(loader):
            images, targets, target_availabilities = t2d(
                (
                    batch["image"],
                    batch["target_positions"],
                    batch["target_availabilities"],
                ),
                device,
            )

            zero_grad(optimizer)

            predictions, confidences = model(images)
            loss = loss_fn(targets, predictions, confidences, target_availabilities)

            _loss = loss.detach().item()
            metrics["loss"] += _loss
            if tensorboard_logger is not None:
                tensorboard_logger.metric("loss", _loss, idx)

            loss.backward()

            progress.set_postfix_str(f"loss - {_loss:.5f}")
            progress.update(1)

            if (idx + 1) in indices_to_save and logdir is not None:
                checkpoint = make_checkpoint("train", idx + 1, model)
                save_checkpoint(checkpoint, logdir, f"train_{idx}.pth")

            if (idx + 1) % accumulation_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

            if idx == DEBUG:
                break

    metrics["loss"] /= idx + 1
    return metrics
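# `t2d` and `zero_grad` used inside `train_fn` are small project helpers that are not
# defined in this file; minimal sketches under that assumption:
def t2d(tensors, device):
    """Move a tensor, or any (nested) tuple/list of tensors, to `device`."""
    if isinstance(tensors, torch.Tensor):
        return tensors.to(device)
    return type(tensors)(t2d(t, device) for t in tensors)


def zero_grad(optimizer):
    """Reset gradients of all parameters tracked by the optimizer."""
    optimizer.zero_grad()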
def experiment(logdir, device) -> None:
    """Experiment function.

    Args:
        logdir (Path): directory where logs should be placed
        device (str): device name to use
    """
    tb_dir = logdir / "tensorboard"
    main_metric = "score"
    minimize_metric = True

    seed_all()

    history_n_frames = cfg["model_params"]["history_num_frames"]
    future_n_frames = cfg["model_params"]["future_num_frames"]
    n_trajectories = 3
    model = ModelWithConfidence(
        backbone=resnet34_accel(
            pretrained=True,
            in_channels=3 + 3,
            num_classes=2 * future_n_frames * n_trajectories + n_trajectories,
            in_accel_features=(history_n_frames - 1) * 2,
            num_accel_features=32,
        ),
        future_num_frames=future_n_frames,
        num_trajectories=n_trajectories,
    )
    load_checkpoint(
        "./logs/resnet34_frast_fulldata_confidence_25hist_accel/epoch_1/train_689999.pth",
        model,
    )
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = neg_multi_log_likelihood_batch
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

    with TensorboardLogger(tb_dir) as tb:
        stage = "stage_0"
        n_epochs = 1
        print(f"Stage - {stage}")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=5,
        )

        train_loader, (valid_loader, valid_gt_path) = get_loaders(
            train_batch_size=32, valid_batch_size=32
        )
        valid_func = partial(
            valid_fn,
            loader=valid_loader,
            ground_truth_file=valid_gt_path,
            logdir=logdir,
            verbose=True,
        )

        for epoch in range(1, n_epochs + 1):
            epoch_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(f"[{epoch_start_time}]\n[Epoch {epoch}/{n_epochs}]")

            # try:
            train_metrics = train_fn(
                model,
                train_loader,
                device,
                criterion,
                optimizer,
                tensorboard_logger=tb,
                logdir=logdir / f"epoch_{epoch}",
                validation_fn=valid_func,
            )
            log_metrics(stage, train_metrics, tb, "train", epoch)
            # except BaseException:
            #     train_metrics = {"message": "An exception occurred!"}

            valid_metrics = valid_fn(model, valid_loader, device, valid_gt_path, logdir)
            log_metrics(stage, valid_metrics, tb, "valid", epoch)

            checkpointer.process(
                metric_value=valid_metrics[main_metric],
                epoch=epoch,
                checkpoint=make_checkpoint(
                    stage,
                    epoch,
                    model,
                    optimizer,
                    scheduler,
                    metrics={"train": train_metrics, "valid": valid_metrics},
                ),
            )
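# `make_checkpoint`, `save_checkpoint` and `load_checkpoint` are used in the
# experiments and in `train_fn` but defined elsewhere. Sketches matching the call
# sites (optimizer/scheduler/metrics optional, checkpoints stored as plain dicts)
# follow; the exact dict keys are assumptions.
def make_checkpoint(stage, epoch, model, optimizer=None, scheduler=None, metrics=None):
    """Collect everything needed to resume or inspect a run into one dict."""
    state = model.module if isinstance(model, nn.DataParallel) else model
    checkpoint = {
        "stage": stage,
        "epoch": epoch,
        "model_state_dict": state.state_dict(),
    }
    if optimizer is not None:
        checkpoint["optimizer_state_dict"] = optimizer.state_dict()
    if scheduler is not None:
        checkpoint["scheduler_state_dict"] = scheduler.state_dict()
    if metrics is not None:
        checkpoint["metrics"] = metrics
    return checkpoint


def save_checkpoint(checkpoint, logdir, name):
    """Write a checkpoint dict to `<logdir>/<name>`, creating the directory if needed."""
    logdir = Path(logdir)
    logdir.mkdir(parents=True, exist_ok=True)
    torch.save(checkpoint, logdir / name)


def load_checkpoint(path, model, optimizer=None):
    """Restore model (and optionally optimizer) state from a saved checkpoint."""
    checkpoint = torch.load(path, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])
    if optimizer is not None and "optimizer_state_dict" in checkpoint:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])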
def experiment(logdir: Path, device: torch.device) -> None:
    """Experiment function.

    Args:
        logdir (Path): directory where logs should be placed
        device (torch.device): device to use
    """
    tb_dir = logdir / "tensorboard"
    main_metric = "gap"
    minimize_metric = False

    seed_all()

    model = EncoderWithHead(
        EfficientNetEncoder("efficientnet-b5", EMBEDDING_SIZE, bias=True),
        CosFace(
            EMBEDDING_SIZE,
            NUM_CLASSESS,
            device_id=None,
            s=np.sqrt(2) * np.log(NUM_CLASSESS - 1),
            m=0.1,
        ),
    )
    model = nn.DataParallel(model)
    model = model.to(device)

    optimizer = optim.SGD(
        model.parameters(),
        lr=1e-3,
        momentum=0.9,
        weight_decay=1e-5,
    )
    criterion = nn.CrossEntropyLoss()
    scheduler = None

    with TensorboardLogger(tb_dir) as tb:
        stage = "stage_0"
        n_epochs = 10
        print(f"Stage - '{stage}'")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=5,
        )

        train_loader, valid_loader = get_loaders(stage, train_bs=48, valid_bs=128)

        for epoch in range(1, n_epochs + 1):
            epoch_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(f"[{epoch_start_time}]\n[Epoch {epoch}/{n_epochs}]")

            train_metrics = train_fn(model, train_loader, device, criterion, optimizer)
            log_metrics(stage, train_metrics, tb, "train", epoch)

            valid_metrics = valid_fn(model, valid_loader, device, criterion)
            log_metrics(stage, valid_metrics, tb, "valid", epoch)

            checkpointer.process(
                metric_value=valid_metrics[main_metric],
                epoch=epoch,
                checkpoint=make_checkpoint(
                    stage,
                    epoch,
                    model,
                    optimizer,
                    scheduler,
                    metrics={"train": train_metrics, "valid": valid_metrics},
                ),
            )
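# `CosFace` is constructed above with (embedding_size, n_classes, device_id, s, m) but
# is not defined in this file. Below is a sketch of the standard large-margin cosine
# (CosFace / AM-Softmax) head: logits = s * (cos(theta) - m) for the target class and
# s * cos(theta) otherwise. The unused `device_id` argument is kept only to mirror the
# constructor call; whether the project's head receives labels in `forward` is an
# assumption.
class CosFace(nn.Module):
    def __init__(self, in_features, out_features, device_id=None, s=30.0, m=0.35):
        super().__init__()
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, embeddings, labels):
        # cosine similarity between L2-normalized embeddings and class weights
        cosine = F.linear(F.normalize(embeddings), F.normalize(self.weight))
        # subtract the margin only from the target-class cosine
        margin = F.one_hot(labels, num_classes=cosine.size(1)).float() * self.m
        return self.s * (cosine - margin)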
def train_fn(
    model,
    loader,
    device,
    loss_fn,
    optimizer,
    scheduler=None,
    accumulation_steps=1,
    verbose=True,
    tensorboard_logger=None,
    logdir=None,
    validation_fn=None,
):
    """Train step.

    Args:
        model (nn.Module): model to train
        loader (DataLoader): loader with data
        device (str or torch.device): device to use for placing batches
        loss_fn (nn.Module): loss function, should be callable
        optimizer (torch.optim.Optimizer): model parameters optimizer
        scheduler ([type], optional): batch scheduler to use.
            Default is `None`.
        accumulation_steps (int, optional): number of steps to accumulate gradients.
            Default is `1`.
        verbose (bool, optional): verbosity mode.
            Default is `True`.
        tensorboard_logger (TensorboardLogger, optional): logger for batch-level metrics.
            Default is `None`.
        logdir (Path, optional): directory where intermediate checkpoints are saved.
            Default is `None`.
        validation_fn (callable, optional): function to run periodic validation.
            Default is `None`.

    Returns:
        dict with metrics computed during the training on loader
    """
    model.train()

    metrics = {"regression_loss": 0.0, "mask_loss": 0.0, "loss": 0.0}
    n_batches = len(loader)
    indices_to_save = [int(n_batches * pcnt) for pcnt in np.arange(0.1, 1, 0.1)]
    last_score = 0.0

    with tqdm(total=len(loader), desc="train", disable=not verbose) as progress:
        for idx, batch in enumerate(loader):
            images, targets, target_availabilities, masks = t2d(
                (
                    batch["image"],
                    batch["target_positions"],
                    batch["target_availabilities"],
                    batch["mask"],
                ),
                device,
            )

            zero_grad(optimizer)

            predictions, confidences, masks_logits = model(images)
            rloss = loss_fn(targets, predictions, confidences, target_availabilities)
            mloss = 1e4 * F.binary_cross_entropy_with_logits(masks_logits, masks)
            loss = rloss + mloss

            _rloss = rloss.detach().item()
            _mloss = mloss.detach().item()
            _loss = loss.detach().item()
            metrics["regression_loss"] += _rloss
            metrics["mask_loss"] += _mloss
            metrics["loss"] += _loss

            if (idx + 1) % 30_000 == 0 and validation_fn is not None:
                score = validation_fn(model=model, device=device)
                model.train()
                last_score = score
                if logdir is not None:
                    checkpoint = make_checkpoint("train", idx + 1, model)
                    save_checkpoint(checkpoint, logdir, f"train_{idx}.pth")
            else:
                score = None

            if tensorboard_logger is not None:
                tensorboard_logger.metric("regression_loss", _rloss, idx)
                tensorboard_logger.metric("mask_loss", _mloss, idx)
                tensorboard_logger.metric("loss", _loss, idx)
                if score is not None:
                    tensorboard_logger.metric("score", score, idx)
                if (idx + 1) % 1_000 == 0:
                    # masks (ground truth) - (bs)x(1)x(h)x(w)
                    # masks_logits (predictions) - (bs)x(1)x(h)x(w)
                    tensorboard_logger.writer.add_images(
                        "gt_vs_mask",
                        torch.cat([masks, torch.sigmoid(masks_logits)], dim=-1),
                        idx,
                    )

            loss.backward()

            progress.set_postfix_str(
                f"rloss - {_rloss:.5f}, "
                f"mloss - {_mloss:.5f}, "
                f"loss - {_loss:.5f}, "
                f"score - {last_score:.5f}"
            )
            progress.update(1)

            if (idx + 1) % accumulation_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

            if idx == DEBUG:
                break

    metrics = {name: value / (idx + 1) for name, value in metrics.items()}
    return metrics
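# `log_metrics` is called after every epoch but is not defined here. A minimal sketch,
# assuming it simply writes each scalar from the metrics dict to the TensorboardLogger
# (whose `.metric(name, value, step)` method is also used in `train_fn` above); the
# tag layout is an assumption.
def log_metrics(stage, metrics, logger, loader_name, epoch):
    for name, value in metrics.items():
        if isinstance(value, (int, float)):
            logger.metric(f"{stage}/{loader_name}/{name}", value, epoch)
        print(f"  {loader_name} {name}: {value}")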
def experiment(logdir, device) -> None:
    """Experiment function.

    Args:
        logdir (Path): directory where logs should be placed
        device (str): device name to use
    """
    tb_dir = logdir / "tensorboard"
    main_metric = "score"
    minimize_metric = True

    seed_all()

    history_n_frames = cfg["model_params"]["history_num_frames"]
    future_n_frames = cfg["model_params"]["future_num_frames"]
    n_trajectories = 3
    model = ModelWithConfidence(
        backbone=resnet18(
            pretrained=True,
            in_channels=3 + 2 * (history_n_frames + 1),
            num_classes=2 * future_n_frames * n_trajectories + n_trajectories,
        ),
        future_num_frames=future_n_frames,
        num_trajectories=n_trajectories,
    )
    # model = nn.DataParallel(model)
    model = model.to(device)

    # optimizer = optim.Adam(model.parameters(), lr=1e-3)
    optimizer = optim.SGD(model.parameters(), lr=1e-4)
    scheduler = optim.lr_scheduler.CyclicLR(
        optimizer,
        base_lr=1e-4,
        max_lr=1e-3,
        step_size_up=120_000,
        cycle_momentum=True,
        mode="triangular2",
    )
    load_checkpoint(
        "./logs/resnet18_bigerimages_continue4_chopped/epoch_1/train_25868.pth",
        model,
        # optimizer,
    )
    criterion = neg_multi_log_likelihood_batch

    with TensorboardLogger(tb_dir) as tb:
        stage = "stage_0"
        n_epochs = 1
        print(f"Stage - {stage}")

        checkpointer = CheckpointManager(
            logdir=logdir / stage,
            metric=main_metric,
            metric_minimization=minimize_metric,
            save_n_best=5,
        )

        train_loader, (valid_loader, valid_gt_path) = get_loaders(
            train_batch_size=32, valid_batch_size=32
        )
        valid_func = partial(
            valid_fn,
            loader=valid_loader,
            ground_truth_file=valid_gt_path,
            logdir=logdir,
            verbose=True,
        )

        for epoch in range(1, n_epochs + 1):
            epoch_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(f"[{epoch_start_time}]\n[Epoch {epoch}/{n_epochs}]")

            try:
                train_metrics = train_fn(
                    model,
                    train_loader,
                    device,
                    criterion,
                    optimizer,
                    scheduler=scheduler,
                    tensorboard_logger=tb,
                    logdir=logdir / f"epoch_{epoch}",
                    validation_fn=valid_func,
                )
                log_metrics(stage, train_metrics, tb, "train", epoch)
            except BaseException:
                train_metrics = {"message": "An exception occurred!"}

            # valid_metrics = train_metrics
            valid_metrics = valid_fn(model, valid_loader, device, valid_gt_path, logdir)
            log_metrics(stage, valid_metrics, tb, "valid", epoch)

            checkpointer.process(
                metric_value=valid_metrics[main_metric],
                epoch=epoch,
                checkpoint=make_checkpoint(
                    stage,
                    epoch,
                    model,
                    optimizer,
                    scheduler,
                    metrics={"train": train_metrics, "valid": valid_metrics},
                ),
            )
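# `CheckpointManager` is used in every experiment above but defined elsewhere. A sketch
# of the behaviour implied by its constructor and `process` call - keep the
# `save_n_best` checkpoints with the best metric value - under the assumption that
# checkpoints are written as `epoch_<n>.pth` plus a `best.pth` copy of the current best:
class CheckpointManager:
    def __init__(self, logdir, metric="loss", metric_minimization=True, save_n_best=5):
        self.logdir = Path(logdir)
        self.metric = metric
        self.minimize = metric_minimization
        self.save_n_best = save_n_best
        self.best = []  # list of (metric_value, path), best first

    def process(self, metric_value, epoch, checkpoint):
        self.logdir.mkdir(parents=True, exist_ok=True)
        path = self.logdir / f"epoch_{epoch}.pth"
        torch.save(checkpoint, path)

        self.best.append((metric_value, path))
        self.best.sort(key=lambda item: item[0], reverse=not self.minimize)
        # drop checkpoints that fell out of the top-n
        for _, stale in self.best[self.save_n_best:]:
            stale.unlink(missing_ok=True)
        self.best = self.best[: self.save_n_best]
        if self.best[0][1] == path:
            # this epoch is the new best - also store it under a stable name
            torch.save(checkpoint, self.logdir / "best.pth")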