def make_progressive_loader(stage, model, conf):
    adapt = progressive_adaptive_regularization(
        stage,
        conf.training.epoch // conf.training.progressive.step,
        conf.training.progressive.train_sizes,
        conf.training.progressive.valid_sizes,
        conf.training.progressive.randaug_layers,
        conf.training.progressive.randaug_magnitudes,
        conf.training.progressive.mixups,
        conf.training.progressive.cutmixes,
        conf.training.progressive.dropouts,
        conf.training.progressive.drop_paths,
        conf.training.progressive.verbose,
    )
    train_set, valid_set = make_dataset(
        conf.dataset_path,
        adapt.train_size,
        adapt.valid_size,
        {
            "n_augment": adapt.randaug_layer,
            "magnitude": adapt.randaug_magnitude,
            "increasing": conf.training.randaug_increasing,
            "magnitude_std": conf.training.randaug_magnitude_std,
        },
        {
            "mixup": adapt.mixup,
            "cutmix": adapt.cutmix,
            "mix_before_aug": conf.training.mix_before_aug,
        },
        conf.training.erasing,
    )

    # not every backbone exposes adjustable dropout / drop path
    try:
        model.set_dropout(adapt.dropout, adapt.drop_path)

    except AttributeError:
        pass

    if conf.training.progressive.grad_accumulation is not None:
        grad_accum = conf.training.progressive.grad_accumulation[stage]

    else:
        grad_accum = conf.training.grad_accumulation

    batch_size = conf.training.dataloader.batch_size // grad_accum
    get_logger(mode=conf.logger).info(f"Using gradient accumulation {grad_accum}")

    train_loader, valid_loader, train_sampler = make_dataloader(
        train_set,
        valid_set,
        batch_size,
        conf.distributed,
        conf.training.dataloader.num_workers,
    )

    return train_loader, valid_loader, train_sampler, grad_accum
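# `make_dataloader` is not defined in this section. A minimal sketch of what it is
# assumed to do: wrap both datasets in torch DataLoaders, with a DistributedSampler
# for the training set when running multi-GPU. The name and exact behavior (e.g.
# whether the validation set is also sharded across ranks) are assumptions.
from torch.utils import data


def make_dataloader_sketch(train_set, valid_set, batch_size, distributed, num_workers):
    train_sampler = data.distributed.DistributedSampler(train_set) if distributed else None
    train_loader = data.DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        sampler=train_sampler,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = data.DataLoader(
        valid_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )

    return train_loader, valid_loader, train_sampler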
def valid(conf, loader, model, criterion):
    device = "cuda"
    batch_time = Meter()
    losses = Meter()
    top1 = Meter()
    top5 = Meter()

    model.eval()
    logger = get_logger(mode=conf.logger)

    start = perf_counter()

    for i, (input, label) in enumerate(loader):
        input = input.to(device)
        label = label.to(device)

        out = model(input)
        loss = criterion(out, label)
        prec1, prec5 = accuracy(out, label, topk=(1, 5))

        batch = input.shape[0]
        loss_dict = {
            "prec1": prec1 * batch,
            "prec5": prec5 * batch,
            "loss": loss * batch,
            "batch": torch.tensor(batch, dtype=torch.float32).to(device),
        }
        loss_reduced = dist.reduce_dict(loss_dict, average=False)
        batch = loss_reduced["batch"].to(torch.int64).item()
        losses.update(loss_reduced["loss"].item() / batch, batch)
        top1.update(loss_reduced["prec1"].item() / batch, batch)
        top5.update(loss_reduced["prec5"].item() / batch, batch)

        batch_time.update(perf_counter() - start)
        start = perf_counter()

        if dist.is_primary() and i % conf.log_freq == 0:
            logger.info(
                f"valid: {i}/{len(loader)}; time: {batch_time.val:.3f} ({batch_time.avg:.3f}); "
                f"loss: {losses.val:.4f} ({losses.avg:.4f}); "
                f"prec@1: {top1.val:.3f} ({top1.avg:.3f}); "
                f"prec@5: {top5.val:.3f} ({top5.avg:.3f})"
            )

    if dist.is_primary():
        logger.info(
            f"validation finished: prec@1 {top1.avg:.3f}, prec@5 {top5.avg:.3f}"
        )

    return top1.avg, top5.avg, losses
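# `Meter` and `accuracy` are not shown in this section. Hypothetical sketches of the
# assumed behavior: a running-average meter with `.val` / `.avg`, and standard
# top-k accuracy returned in percent.
class MeterSketch:
    def __init__(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy_sketch(output, target, topk=(1,)):
    # fraction of samples whose target is among the top-k predictions, as a percentage
    maxk = max(topk)
    _, pred = output.topk(maxk, dim=1)
    correct = pred.t().eq(target.view(1, -1))

    return [correct[:k].reshape(-1).float().sum() * (100.0 / target.shape[0]) for k in topk]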
def progressive_adaptive_regularization(
    stage,
    max_stage,
    train_sizes,
    valid_sizes,
    randaug_layers,
    randaug_magnitudes,
    mixups,
    cutmixes,
    dropouts,
    drop_paths,
    verbose=True,
):
    train_size = int(lerp(*train_sizes, stage, max_stage))
    valid_size = int(lerp(*valid_sizes, stage, max_stage))
    randaug_layer = int(lerp(*randaug_layers, stage, max_stage))
    randaug_magnitude = lerp(*randaug_magnitudes, stage, max_stage)
    mixup = lerp(*mixups, stage, max_stage)
    cutmix = lerp(*cutmixes, stage, max_stage)
    dropout = lerp(*dropouts, stage, max_stage)
    drop_path = lerp(*drop_paths, stage, max_stage)

    if verbose:
        logger = get_logger()
        log = f"""Progressive Training with Adaptive Regularization
Stage: {stage + 1} / {max_stage}
Image Size: train={train_size}, valid={valid_size}
RandAugment: n_augment={randaug_layer}, magnitude={randaug_magnitude}
Mixup: {mixup}, Cutmix: {cutmix}, Dropout={dropout}, DropPath={drop_path}"""
        logger.info(log)

    return SimpleNamespace(
        train_size=train_size,
        valid_size=valid_size,
        randaug_layer=randaug_layer,
        randaug_magnitude=randaug_magnitude,
        mixup=mixup,
        cutmix=cutmix,
        dropout=dropout,
        drop_path=drop_path,
    )
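# `lerp` is not defined here. Assumed to linearly interpolate a regularization value
# from `start` (stage 0) to `end` (final stage); the exact endpoint handling
# (max_step vs. max_step - 1 in the denominator) is a guess.
def lerp_sketch(start, end, step, max_step):
    t = step / max(max_step - 1, 1)

    return start + (end - start) * t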
def main(conf):
    device = "cuda"
    conf.distributed = conf.n_gpu > 1
    torch.backends.cudnn.benchmark = True

    logger = get_logger(mode=conf.logger)
    logger.info(conf.dict())

    model = conf.arch.make().to(device)
    model_ema = conf.arch.make().to(device)
    logger.info(model)

    if conf.distributed:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[dist.get_local_rank()],
            output_device=dist.get_local_rank(),
        )
        model_module = model.module
        # initialize the EMA model as an exact copy of the current weights
        accumulate(model_ema, model_module, 0)

    else:
        model_module = model
        accumulate(model_ema, model, 0)

    grad_accum = conf.training.grad_accumulation

    if conf.training.progressive.step > 0:
        progressive_stage = 0
        train_loader, valid_loader, train_sampler, grad_accum = make_progressive_loader(
            progressive_stage, model_module, conf
        )

    else:
        train_set, valid_set = make_dataset(
            conf.dataset_path,
            conf.training.train_size,
            conf.training.valid_size,
            {
                "n_augment": conf.training.randaug_layer,
                "magnitude": conf.training.randaug_magnitude,
                "increasing": conf.training.randaug_increasing,
                "magnitude_std": conf.training.randaug_magnitude_std,
                "cutout": conf.training.randaug_cutout,
            },
            {
                "mixup": conf.training.mixup,
                "cutmix": conf.training.cutmix,
                "mix_before_aug": conf.training.mix_before_aug,
            },
            conf.training.erasing,
        )
        batch_size = conf.training.dataloader.batch_size // grad_accum
        train_loader, valid_loader, train_sampler = make_dataloader(
            train_set,
            valid_set,
            batch_size,
            conf.distributed,
            conf.training.dataloader.num_workers,
        )

    criterion_train = MixLoss(eps=0.1)
    criterion_valid = nn.CrossEntropyLoss()

    parameters, names = add_weight_decay(
        model.named_parameters(),
        conf.training.weight_decay,
        wd_skip_fn(conf.training.wd_skip),
    )

    optimizer = make_optimizer(conf.training, parameters)
    epoch_len = math.ceil(len(train_loader) / grad_accum)
    scheduler = make_scheduler(conf.training, optimizer, epoch_len)

    step = 0
    scaler = amp.GradScaler(enabled=conf.fp16)
    checker = conf.checker.make()

    for epoch in range(conf.training.epoch):
        if conf.distributed:
            train_sampler.set_epoch(epoch)

        train(
            conf,
            step,
            epoch,
            train_loader,
            model,
            model_ema,
            criterion_train,
            optimizer,
            scheduler,
            scaler,
            grad_accum,
        )
        step += epoch_len

        # validate the raw weights when EMA is disabled, otherwise the EMA model
        if conf.training.ema == 0:
            prec1, prec5, losses = valid(conf, valid_loader, model_module, criterion_valid)

        else:
            prec1, prec5, losses = valid(conf, valid_loader, model_ema, criterion_valid)

        checker.log(
            step=epoch + 1,
            prec1=prec1,
            prec5=prec5,
            loss=losses.avg,
            lr=optimizer.param_groups[0]["lr"],
        )

        try:
            checker.checkpoint(
                {
                    "model": model_module.state_dict(),
                    "ema": model_ema.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "conf": conf.dict(),
                },
                f"epoch-{str(epoch + 1).zfill(3)}.pt",
            )

        except Exception as e:
            print(e)

        # advance to the next progressive-resizing stage once its epoch budget is used up
        if (
            conf.training.progressive.step > 0
            and (epoch + 1) % conf.training.progressive.step == 0
        ):
            progressive_stage += 1

            if progressive_stage < conf.training.epoch // conf.training.progressive.step:
                train_loader, valid_loader, train_sampler, grad_accum = make_progressive_loader(
                    progressive_stage, model_module, conf
                )
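# `add_weight_decay` / `wd_skip_fn` are not shown here. A hypothetical sketch under the
# assumption that parameters matched by the skip function (typically biases and norm
# weights) go into a zero-weight-decay group tagged with a "no_decay" key, which is what
# the DINO training loop below checks when it re-schedules weight decay. The skip
# function's exact signature is also an assumption.
def add_weight_decay_sketch(named_parameters, weight_decay, skip_fn):
    decay, no_decay, names = [], [], []

    for name, param in named_parameters:
        if not param.requires_grad:
            continue

        names.append(name)

        if skip_fn(name, param):
            no_decay.append(param)

        else:
            decay.append(param)

    groups = [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0, "no_decay": True},
    ]

    return groups, names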
def train(
    conf,
    step,
    epoch,
    loader,
    model,
    model_ema,
    criterion,
    optimizer,
    scheduler,
    scaler,
    grad_accum,
):
    device = "cuda"
    batch_time = Meter()
    data_time = Meter()
    losses = Meter()
    top1 = Meter()
    top5 = Meter()

    model.train()

    # adaptive gradient clipping is applied to everything except the classifier ("linear") weights
    agc_params = [p[1] for p in model.named_parameters() if "linear" not in p[0]]
    params = list(model.parameters())

    logger = get_logger(mode=conf.logger)

    start = perf_counter()

    for i, (input, label1, label2, ratio) in enumerate(loader):
        # measure data loading time
        input = input.to(device)
        label1 = label1.to(device)
        label2 = label2.to(device)
        ratio = ratio.to(device=device, dtype=torch.float32)
        data_time.update(perf_counter() - start)

        with amp.autocast(enabled=conf.fp16):
            out = model(input)
            loss = criterion(out, label1, label2, ratio) / grad_accum

        prec1, prec5 = accuracy(out, label1, topk=(1, 5))
        batch = input.shape[0]
        losses.update(loss.item() * grad_accum, batch)
        top1.update(prec1.item(), batch)
        top5.update(prec5.item(), batch)

        scaler.scale(loss).backward()

        # take an optimizer step only every `grad_accum` micro-batches (or at the end of the epoch)
        if ((i + 1) % grad_accum == 0) or (i + 1) == len(loader):
            if conf.training.agc > 0 or conf.training.clip_grad_norm > 0:
                if conf.fp16:
                    scaler.unscale_(optimizer)

                if conf.training.agc > 0:
                    adaptive_grad_clip(agc_params, conf.training.agc)

                if conf.training.clip_grad_norm > 0:
                    nn.utils.clip_grad_norm_(params, conf.training.clip_grad_norm)

            scheduler.step()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        t = step + i

        if conf.training.ema > 0:
            if conf.distributed:
                model_module = model.module

            else:
                model_module = model

            # warm up the EMA decay from (1 + t) / (10 + t) toward conf.training.ema
            accumulate(
                model_ema,
                model_module,
                min(conf.training.ema, (1 + t) / (10 + t)),
                ema_bn=conf.training.ema_bn,
            )

        batch_time.update(perf_counter() - start)
        start = perf_counter()

        if dist.is_primary() and i % conf.log_freq == 0:
            lr = optimizer.param_groups[0]["lr"]
            logger.info(
                f"epoch: {epoch} ({i}/{len(loader)}); time: {batch_time.val:.3f} ({batch_time.avg:.2f}); "
                f"data: {data_time.val:.3f} ({data_time.avg:.2f}); "
                f"loss: {losses.val:.3f} ({losses.avg:.3f}); "
                f"prec@1: {top1.val:.2f} ({top1.avg:.2f}); "
                f"prec@5: {top5.val:.2f} ({top5.avg:.2f})"
            )

    return losses
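# `accumulate` is not shown in this section. Assumed to be the usual exponential moving
# average update (decay=0 copies the source model, as in the initialization in main);
# whether buffers such as BatchNorm statistics are averaged or copied is assumed to be
# controlled by `ema_bn`. Name and details here are a sketch, not the repo's implementation.
import torch


@torch.no_grad()
def accumulate_sketch(model_ema, model, decay=0.999, ema_bn=False):
    ema_params = dict(model_ema.named_parameters())

    for name, param in model.named_parameters():
        ema_params[name].mul_(decay).add_(param.detach(), alpha=1 - decay)

    ema_buffers = dict(model_ema.named_buffers())

    for name, buf in model.named_buffers():
        if ema_bn and buf.dtype.is_floating_point:
            ema_buffers[name].mul_(decay).add_(buf, alpha=1 - decay)

        else:
            ema_buffers[name].copy_(buf)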
def __init__(self, formatter=None):
    if formatter is None:
        formatter = default_formatter

    self.logger = get_logger()
    self.formatter = formatter
def make_dataset(
    path, train_size, valid_size, randaug_params, mix_params, erasing, verbose=True
):
    train_dir = os.path.join(nsml.DATASET_PATH, path, "train.lmdb")
    valid_dir = os.path.join(nsml.DATASET_PATH, path, "valid.lmdb")

    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )

    transform_list = [
        transforms.RandomResizedCrop(train_size, interpolation=Image.BICUBIC),
        transforms.RandomHorizontalFlip(),
        RandAugment(**randaug_params),
        transforms.ToTensor(),
        normalize,
    ]

    if erasing > 0:
        transform_list += [
            RandomErasing(
                erasing, mode="pixel", max_count=1, num_splits=0, device="cpu"
            )
        ]

    if mix_params["mix_before_aug"]:
        preprocess = transform_list[:2]
        postprocess = transform_list[2:]

    else:
        preprocess = transform_list
        postprocess = []

    if verbose:
        logger = get_logger()
        log = f"""Transforms
Transform before Mixes: {preprocess}
Mixes: mixup={mix_params["mixup"]}, cutmix={mix_params["cutmix"]}"""

        if mix_params["mix_before_aug"]:
            log += f"""
Transform after Mixes: {postprocess}"""

        logger.info(log)

    train_preprocess = transforms.Compose(preprocess)
    train_postprocess = transforms.Compose(postprocess)

    train_set = LMDBDataset(train_dir, train_preprocess)
    train_set = MixDataset(
        train_set, train_postprocess, mix_params["mixup"], mix_params["cutmix"]
    )

    valid_preprocess = transforms.Compose(
        [
            transforms.Resize(valid_size + 32, interpolation=Image.BICUBIC),
            transforms.CenterCrop(valid_size),
            transforms.ToTensor(),
            normalize,
        ]
    )
    valid_set = LMDBDataset(valid_dir, valid_preprocess)

    return train_set, valid_set
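# `MixDataset` is not shown here. A rough, hypothetical sketch of the assumed contract:
# pair each sample with a randomly drawn second sample, blend them with a Beta-sampled
# ratio, and yield (input, label1, label2, ratio) -- the 4-tuple unpacked by the
# supervised train() loop above. Mixup only for brevity (the real class also supports
# cutmix), and it assumes the wrapped dataset already returns tensors; the real class may
# blend PIL images before the post-mix transforms when mix_before_aug is enabled.
import random

import numpy as np
import torch
from torch.utils.data import Dataset


class MixDatasetSketch(Dataset):
    def __init__(self, dataset, postprocess, mixup_alpha, cutmix_alpha):
        self.dataset = dataset
        self.postprocess = postprocess
        self.mixup_alpha = mixup_alpha
        self.cutmix_alpha = cutmix_alpha  # ignored in this sketch

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        img1, label1 = self.dataset[index]
        img2, label2 = self.dataset[random.randrange(len(self.dataset))]

        ratio = 1.0
        if self.mixup_alpha > 0:
            ratio = float(np.random.beta(self.mixup_alpha, self.mixup_alpha))

        mixed = ratio * img1 + (1 - ratio) * img2

        if self.postprocess is not None:
            mixed = self.postprocess(mixed)

        return mixed, label1, label2, torch.tensor(ratio, dtype=torch.float32)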
def main(conf):
    device = "cuda"
    conf.distributed = conf.n_gpu > 1
    torch.backends.cudnn.benchmark = True

    logger = get_logger(mode=conf.logger)
    logger.info(conf.dict())

    student = conf.arch.make().to(device)
    student.set_drop_path(conf.task.student_drop_path)
    teacher = conf.arch.make().to(device)
    logger.info(student)

    if conf.distributed:
        teacher = nn.parallel.DistributedDataParallel(
            teacher,
            device_ids=[dist.get_local_rank()],
            output_device=dist.get_local_rank(),
        )
        student = nn.parallel.DistributedDataParallel(
            student,
            device_ids=[dist.get_local_rank()],
            output_device=dist.get_local_rank(),
        )
        teacher_module = teacher.module
        student_module = student.module
        # the teacher starts as an exact copy of the student and is only updated via EMA
        teacher_module.load_state_dict(student_module.state_dict())

    else:
        teacher_module = teacher
        student_module = student
        teacher_module.load_state_dict(student.state_dict())

    for p in teacher.parameters():
        p.requires_grad = False

    grad_accum = conf.training.grad_accumulation

    train_set, valid_set = make_augment_dataset(
        conf.dataset_path,
        DINOAugment(
            conf.task.global_crop_size,
            conf.task.local_crop_size,
            conf.task.global_crop_scale,
            conf.task.local_crop_scale,
            conf.task.n_local_crop,
        ),
        None,
    )
    batch_size = conf.training.dataloader.batch_size // grad_accum
    train_loader, valid_loader, train_sampler = make_dataloader(
        train_set,
        valid_set,
        batch_size,
        conf.distributed,
        conf.training.dataloader.num_workers,
    )

    criterion_train = DINOLoss(
        conf.arch.dim_head_out,
        conf.task.n_local_crop + 2,
        conf.task.warmup_teacher_temperature,
        conf.task.teacher_temperature,
        conf.task.warmup_teacher_temperature_epoch,
        conf.training.epoch,
    ).to(device)

    parameters, names = add_weight_decay(
        student.named_parameters(),
        conf.training.weight_decay,
        wd_skip_fn(conf.training.wd_skip),
    )

    def make_scheduler(train_conf, optimizer, epoch_len):
        warmup = train_conf.scheduler.warmup * epoch_len
        n_iter = epoch_len * train_conf.epoch
        # linear scaling rule: scale the base learning rate with the batch size
        lr = train_conf.base_lr * train_conf.dataloader.batch_size / 256

        if train_conf.scheduler.type == "exp_epoch":
            return train_conf.scheduler.make(
                optimizer, epoch_len, lr=lr, max_iter=train_conf.epoch, warmup=warmup
            )

        else:
            return train_conf.scheduler.make(optimizer, lr=lr, n_iter=n_iter, warmup=warmup)

    optimizer = make_optimizer(conf.training, parameters)
    epoch_len = math.ceil(len(train_loader) / grad_accum)
    scheduler = make_scheduler(conf.training, optimizer, epoch_len)
    wd_schedule = cosine_schedule(
        conf.training.weight_decay,
        conf.task.weight_decay_end,
        epoch_len * conf.training.epoch,
    )
    momentum_schedule = cosine_schedule(
        conf.task.teacher_momentum, 1, epoch_len * conf.training.epoch
    )
    scaler = amp.GradScaler(enabled=conf.fp16)
    checker = conf.checker.make()

    step = 0

    for epoch in range(conf.training.epoch):
        if conf.distributed:
            train_sampler.set_epoch(epoch)

        train(
            conf,
            step,
            epoch,
            train_loader,
            teacher,
            student,
            criterion_train,
            optimizer,
            scheduler,
            wd_schedule,
            momentum_schedule,
            scaler,
            grad_accum,
            checker,
        )
        step += epoch_len

        try:
            checker.checkpoint(
                {
                    "student": student_module.state_dict(),
                    "teacher": teacher_module.state_dict(),
                    "scheduler": scheduler.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "conf": conf.dict(),
                },
                f"epoch-{str(epoch + 1).zfill(3)}.pt",
            )

        except Exception as e:
            print(e)
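# `cosine_schedule` is not defined in this section. Assumed to return a per-iteration
# array that moves from `base` to `final` along a half cosine (decaying weight decay,
# or growing teacher momentum toward 1); any warmup segment of the real helper is
# omitted in this sketch.
import numpy as np


def cosine_schedule_sketch(base, final, n_iter):
    steps = np.arange(n_iter)

    return final + 0.5 * (base - final) * (1 + np.cos(np.pi * steps / n_iter))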
def train(
    conf,
    step,
    epoch,
    loader,
    teacher,
    student,
    criterion,
    optimizer,
    scheduler,
    wd_schedule,
    momentum_schedule,
    scaler,
    grad_accum,
    checker,
):
    device = "cuda"
    batch_time = Meter()
    data_time = Meter()
    losses = Meter()

    student.train()

    agc_params = [p[1] for p in student.named_parameters() if "linear" not in p[0]]
    params = list(student.parameters())

    logger = get_logger(mode=conf.logger)

    start = perf_counter()

    for i, (inputs, _) in enumerate(loader):
        # measure data loading time
        inputs = [i.to(device) for i in inputs]
        data_time.update(perf_counter() - start)

        with amp.autocast(enabled=conf.fp16):
            with torch.no_grad():
                # the teacher only sees the two global crops
                teacher_out = teacher(inputs[:2])

            student_out = student(inputs)
            loss = criterion(student_out, teacher_out, epoch) / grad_accum

        losses.update(loss.item() * grad_accum, inputs[0].shape[0])

        scaler.scale(loss).backward()

        # apply the scheduled weight decay to every group except the no-decay one
        for param_group in optimizer.param_groups:
            if "no_decay" not in param_group:
                param_group["weight_decay"] = wd_schedule[step]

        if ((i + 1) % grad_accum == 0) or (i + 1) == len(loader):
            if conf.training.agc > 0 or conf.training.clip_grad_norm > 0:
                if conf.fp16:
                    scaler.unscale_(optimizer)

                if conf.training.agc > 0:
                    adaptive_grad_clip(agc_params, conf.training.agc)

                if conf.training.clip_grad_norm > 0:
                    nn.utils.clip_grad_norm_(params, conf.training.clip_grad_norm)

            cancel_last_layer_grad(epoch, student, conf.task.freeze_last_layer)
            scheduler.step()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        # momentum (EMA) update of the teacher from the student
        with torch.no_grad():
            m = momentum_schedule[step]

            for param_q, param_k in zip(student.parameters(), teacher.parameters()):
                param_k.detach().mul_(m).add_(param_q.detach(), alpha=1 - m)

        batch_time.update(perf_counter() - start)
        start = perf_counter()

        if dist.is_primary() and i % conf.log_freq == 0:
            lr = optimizer.param_groups[0]["lr"]
            """logger.info(
                f"epoch: {epoch} ({i}/{len(loader)}); time: {batch_time.val:.3f} ({batch_time.avg:.2f}); "
                f"data: {data_time.val:.3f} ({data_time.avg:.2f}); "
                f"loss: {losses.val:.3f} ({losses.avg:.3f}); "
                f"lr: {lr:.5f}; "
                f"wd: {wd_schedule[step]:4f}; "
                f"moment: {momentum_schedule[step]:.4f}"
            )"""
            checker.log(
                step=step,
                weight_decay=wd_schedule[step],
                momentum=momentum_schedule[step],
                loss=losses.avg,
                lr=optimizer.param_groups[0]["lr"],
            )

        step += 1

    return losses
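# `cancel_last_layer_grad` is not shown here. DINO freezes the last layer of the
# projection head during the first `freeze_last_layer` epochs; a sketch assuming those
# parameters can be identified by "last_layer" in their names, which is an assumption
# about this codebase.
def cancel_last_layer_grad_sketch(epoch, model, freeze_last_layer):
    if epoch >= freeze_last_layer:
        return

    for name, param in model.named_parameters():
        if "last_layer" in name:
            param.grad = None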