def cosine_schedule_with_warmup(k, num_epochs, batch_size, dataset_size):
    """Learning-rate multiplier at iteration *k*: linear warmup, then cosine decay.

    The per-GPU ``batch_size`` is scaled by the world size to get the global
    batch, from which iterations-per-epoch is derived (ceiling division).
    Warmup lasts ``1000 // world_size`` iterations in distributed runs and is
    skipped entirely on a single process.
    """
    world_size = dist.size()
    global_batch = batch_size * world_size
    warmup_iters = 0 if world_size == 1 else 1000 // world_size
    if k < warmup_iters:
        # Linear ramp: 1/warmup_iters .. 1 over the warmup window.
        return (k + 1) / warmup_iters
    # Ceiling division: partial final batch still counts as an iteration.
    iters_per_epoch = -(-dataset_size // global_batch)
    progress = (k - warmup_iters) / (num_epochs * iters_per_epoch)
    return 0.5 * (1 + np.cos(np.pi * progress))
def set_run_dir(dirpath: str) -> None:
    """Register *dirpath* as the global run directory and attach a log sink there.

    Creates the directory if needed and adds a time-stamped log file under
    ``<run_dir>/logging``; in distributed runs each rank gets its own file
    suffixed with its zero-padded rank.
    """
    global _run_dir
    _run_dir = fs.normpath(dirpath)
    fs.makedir(_run_dir)
    # One log file per process when running with more than one rank.
    if dist.size() > 1:
        prefix = '{time}' + '_{:04d}'.format(dist.rank())
    else:
        prefix = '{time}'
    log_file = os.path.join(_run_dir, 'logging', prefix + '.log')
    log_format = ('{time:YYYY-MM-DD HH:mm:ss.SSS} | '
                  '{name}:{function}:{line} | '
                  '{level} | {message}')
    logger.add(log_file, format=log_format)
def main():
    """Entry point: parse args/config, build data + model (optionally NetAug-augmented),
    profile it, wrap in DDP, and launch training."""
    warnings.filterwarnings("ignore")
    # parse args
    args, opt = parser.parse_known_args()
    opt = parse_unknown_args(opt)
    # setup gpu and distributed training
    if args.gpu is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    if not torch.distributed.is_initialized():
        dist.init()
    torch.backends.cudnn.benchmark = True
    torch.cuda.set_device(dist.local_rank())
    # setup path
    os.makedirs(args.path, exist_ok=True)
    # setup random seed
    # NOTE(review): the seed is re-randomized from wall-clock time when
    # resuming — this looks inverted (one would expect a fresh seed on a
    # *new* run); confirm intent against the resume workflow.
    if args.resume:
        args.manual_seed = int(time.time())
    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed_all(args.manual_seed)
    # load config
    exp_config = yaml.safe_load(open(args.config, "r"))
    partial_update_config(exp_config, opt)
    # save config to run directory
    yaml.dump(exp_config, open(os.path.join(args.path, "config.yaml"), "w"),
              sort_keys=False)
    # build data_loader
    image_size = exp_config["data_provider"]["image_size"]
    data_provider, n_classes = build_data_loader(
        exp_config["data_provider"]["dataset"],
        image_size,
        exp_config["data_provider"]["base_batch_size"],
        exp_config["data_provider"]["n_worker"],
        exp_config["data_provider"]["data_path"],
        dist.size(),
        dist.rank(),
    )
    # build model
    model = build_model(
        exp_config["model"]["name"],
        n_classes,
        exp_config["model"]["dropout_rate"],
    )
    print(model)
    # netaug: wrap the model with network augmentation when configured,
    # then activate the minimal sub-network as the starting state.
    if exp_config.get("netaug", None) is not None:
        use_netaug = True
        model = augemnt_model(model, exp_config["netaug"], n_classes,
                              exp_config["model"]["dropout_rate"])
        model.set_active(mode="min")
    else:
        use_netaug = False
    # load init: warm-start from a checkpoint if given, otherwise random init.
    if args.init_from is not None:
        init = load_state_dict_from_file(args.init_from)
        load_state_dict(model, init, strict=False)
        print("Loaded init from %s" % args.init_from)
    else:
        init_modules(model, init_type=exp_config["run_config"]["init_type"])
        print("Random Init")
    # profile a deep copy so BN removal does not touch the training model
    profile_model = copy.deepcopy(model)
    # during inference, bn will be fused into conv
    remove_bn(profile_model)
    print(f"Params: {trainable_param_num(profile_model)}M")
    print(
        f"MACs: {inference_macs(profile_model, data_shape=(1, 3, image_size, image_size))}M"
    )
    # train
    exp_config["generator"] = torch.Generator()
    exp_config["generator"].manual_seed(args.manual_seed)
    model = nn.parallel.DistributedDataParallel(model.cuda(),
                                                device_ids=[dist.local_rank()])
    train(model, data_provider, exp_config, args.path, args.resume, use_netaug)
def train(
    model: nn.Module,
    data_provider: Dict,
    exp_config: Dict,
    path: str,
    resume=False,
    use_netaug=False,
):
    """Full training loop: SGD with no-weight-decay bias/norm groups, cosine LR
    with warmup, optional resume, per-epoch eval/checkpointing, and (with
    NetAug) a staged augmentation schedule plus final export of the minimal
    sub-network.
    """
    # build optimizer: bias and normalization parameters get no weight decay
    params_without_wd = []
    params_with_wd = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            if np.any([key in name for key in ["bias", "norm"]]):
                params_without_wd.append(param)
            else:
                params_with_wd.append(param)
    net_params = [
        {
            "params": params_without_wd,
            "weight_decay": 0
        },
        {
            "params": params_with_wd,
            "weight_decay": exp_config["run_config"]["weight_decay"],
        },
    ]
    # linear LR scaling: base_lr is multiplied by the number of workers
    optimizer = torch.optim.SGD(
        net_params,
        lr=exp_config["run_config"]["base_lr"] * dist.size(),
        momentum=0.9,
        nesterov=True,
    )
    # build lr scheduler (warmup and total horizons measured in iterations)
    lr_scheduler = CosineLRwithWarmup(
        optimizer,
        exp_config["run_config"]["warmup_epochs"] * len(data_provider["train"]),
        exp_config["run_config"]["base_lr"],
        exp_config["run_config"]["n_epochs"] * len(data_provider["train"]),
    )
    # train criterion
    train_criterion = CrossEntropyWithLabelSmooth(
        smooth_ratio=exp_config["run_config"]["label_smoothing"])
    # init bookkeeping and output directories
    best_val = 0.0
    start_epoch = 0
    checkpoint_path = os.path.join(path, "checkpoint")
    log_path = os.path.join(path, "logs")
    os.makedirs(checkpoint_path, exist_ok=True)
    os.makedirs(log_path, exist_ok=True)
    logs_writer = open(os.path.join(log_path, "exp.log"), "a")
    # resume: restore weights and, when present, training state
    if resume and os.path.isfile(os.path.join(checkpoint_path,
                                              "checkpoint.pt")):
        checkpoint = torch.load(os.path.join(checkpoint_path, "checkpoint.pt"),
                                map_location="cpu")
        model.module.load_state_dict(checkpoint["state_dict"])
        if "best_val" in checkpoint:
            best_val = checkpoint["best_val"]
        if "epoch" in checkpoint:
            start_epoch = checkpoint["epoch"] + 1
        if "optimizer" in checkpoint:
            optimizer.load_state_dict(checkpoint["optimizer"])
        if "lr_scheduler" in checkpoint:
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
    # start training
    for epoch in range(
            start_epoch,
            exp_config["run_config"]["n_epochs"] +
            exp_config["run_config"]["warmup_epochs"],
    ):
        remaining_epochs = (exp_config["run_config"]["n_epochs"] +
                            exp_config["run_config"]["warmup_epochs"] - epoch)
        # NetAug schedule: "default" augmentation early, then progressively
        # disable width ("min_w") / expand ("min_e") augmentation, and finally
        # turn augmentation off entirely for the last epochs.
        netaug_mode = None
        if use_netaug:
            netaug_mode = "default"
            if remaining_epochs <= exp_config["netaug"]["stop_aug_w_epoch"]:
                netaug_mode = "min_w"
            elif remaining_epochs <= exp_config["netaug"]["stop_aug_e_epoch"]:
                netaug_mode = "min_e"
            if remaining_epochs <= exp_config["netaug"]["stop_netaug_epoch"]:
                netaug_mode = None
            # sort channel
            if exp_config["netaug"][
                    "sort_channel"] and netaug_mode == "default":
                model.module.sort_channels()
                print("sort channels")
            if netaug_mode is None:
                model.module.set_active(mode="min")
        train_info_dict = train_one_epoch(
            model,
            data_provider,
            epoch,
            optimizer,
            train_criterion,
            lr_scheduler,
            exp_config,
            netaug_mode,
        )
        # always validate the minimal sub-network when NetAug is on
        if use_netaug:
            model.module.set_active(mode="min")
        val_info_dict = eval(model, data_provider, use_netaug)
        is_best = val_info_dict["val_top1"] > best_val
        best_val = max(best_val, val_info_dict["val_top1"])
        # log (epoch index is shifted so warmup epochs show as <= 0)
        epoch_log = f"[{epoch + 1 - exp_config['run_config']['warmup_epochs']}/{exp_config['run_config']['n_epochs']}]"
        epoch_log += f"\tval_top1={val_info_dict['val_top1']:.2f} ({best_val:.2f})"
        epoch_log += f"\ttrain_top1={train_info_dict['train_top1']:.2f}\tlr={optimizer.param_groups[0]['lr']:.2E}"
        if dist.is_master():
            logs_writer.write(epoch_log + "\n")
            logs_writer.flush()
        # save checkpoint (master rank only)
        checkpoint = {
            "state_dict": model.module.state_dict(),
            "epoch": epoch,
            "best_val": best_val,
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
        }
        if dist.is_master():
            torch.save(
                checkpoint,
                os.path.join(checkpoint_path, "checkpoint.pt"),
                _use_new_zipfile_serialization=False,
            )
            if is_best:
                torch.save(
                    checkpoint,
                    os.path.join(checkpoint_path, "best.pt"),
                    _use_new_zipfile_serialization=False,
                )
    # export if use_netaug: reload the best weights, run one dummy forward
    # pass (presumably to materialize the active sub-network's buffers —
    # confirm against model.export()), then save the exported target model.
    if use_netaug:
        checkpoint = load_state_dict_from_file(
            os.path.join(checkpoint_path, "best.pt"))
        model.module.load_state_dict(checkpoint)
        model.eval()
        model.module.set_active(mode="min")
        with torch.no_grad():
            model.module(
                torch.zeros(
                    1,
                    3,
                    exp_config["data_provider"]["image_size"],
                    exp_config["data_provider"]["image_size"],
                ).cuda())
        export_model = model.module.export()
        if dist.is_master():
            torch.save(
                {"state_dict": export_model.state_dict()},
                os.path.join(checkpoint_path, "target.pt"),
                _use_new_zipfile_serialization=False,
            )
def main() -> None:
    """Entry point for distributed image-classification training: load config,
    build per-split distributed dataloaders, wrap the model in DDP, and run
    the trainer with save/restore and top-1/top-5 inference callbacks."""
    dist.init()
    torch.backends.cudnn.benchmark = True
    torch.cuda.set_device(dist.local_rank())
    parser = argparse.ArgumentParser()
    parser.add_argument('config', metavar='FILE', help='config file')
    parser.add_argument('--run-dir', metavar='DIR', help='run directory')
    args, opts = parser.parse_known_args()
    # unknown CLI args are treated as config overrides
    configs.load(args.config, recursive=True)
    configs.update(opts)
    if args.run_dir is None:
        args.run_dir = auto_set_run_dir()
    else:
        set_run_dir(args.run_dir)
    logger.info(' '.join([sys.executable] + sys.argv))
    logger.info(f'Experiment started: "{args.run_dir}".' + '\n' + f'{configs}')
    dataset = builder.make_dataset()
    dataflow = {}
    for split in dataset:
        # distributed sampler: shuffle only the training split
        sampler = torch.utils.data.DistributedSampler(
            dataset[split],
            num_replicas=dist.size(),
            rank=dist.rank(),
            shuffle=(split == 'train'),
        )
        # configs.batch_size is the global batch; divide by world size per GPU
        dataflow[split] = torch.utils.data.DataLoader(
            dataset[split],
            batch_size=configs.batch_size // dist.size(),
            sampler=sampler,
            num_workers=configs.workers_per_gpu,
            pin_memory=True,
        )
    model = builder.make_model()
    model = torch.nn.parallel.DistributedDataParallel(
        model.cuda(),
        device_ids=[dist.local_rank()],
    )
    criterion = builder.make_criterion()
    optimizer = builder.make_optimizer(model)
    scheduler = builder.make_scheduler(optimizer)
    trainer = ClassificationTrainer(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        amp_enabled=configs.amp.enabled,
    )
    # callbacks: resume from last checkpoint, evaluate top-1/top-5 on the test
    # split each epoch, keep the best-accuracy checkpoint plus the latest one
    trainer.train_with_defaults(
        dataflow['train'],
        num_epochs=configs.num_epochs,
        callbacks=[
            SaverRestore(),
            InferenceRunner(
                dataflow['test'],
                callbacks=[
                    TopKCategoricalAccuracy(k=1, name='acc/top1'),
                    TopKCategoricalAccuracy(k=5, name='acc/top5'),
                ],
            ),
            MaxSaver('acc/top1'),
            Saver(),
        ],
    )
def main() -> None:
    """Entry point for SemanticKITTI evaluation: build a pretrained model
    selected by --name, then run a manual eval loop that maps sparse-tensor
    predictions back to full point clouds and feeds them to a MeanIoU meter
    via trainer callbacks."""
    dist.init()
    torch.backends.cudnn.benchmark = True
    torch.cuda.set_device(dist.local_rank())
    parser = argparse.ArgumentParser()
    parser.add_argument('config', metavar='FILE', help='config file')
    parser.add_argument('--run-dir', metavar='DIR', help='run directory')
    parser.add_argument('--name', type=str, help='model name')
    args, opts = parser.parse_known_args()
    configs.load(args.config, recursive=True)
    configs.update(opts)
    if args.run_dir is None:
        args.run_dir = auto_set_run_dir()
    else:
        set_run_dir(args.run_dir)
    logger.info(' '.join([sys.executable] + sys.argv))
    logger.info(f'Experiment started: "{args.run_dir}".' + '\n' + f'{configs}')
    dataset = builder.make_dataset()
    dataflow = dict()
    for split in dataset:
        sampler = torch.utils.data.distributed.DistributedSampler(
            dataset[split],
            num_replicas=dist.size(),
            rank=dist.rank(),
            shuffle=(split == 'train'))
        # batch size 1 for non-train splits so scene indices stay per-sample
        dataflow[split] = torch.utils.data.DataLoader(
            dataset[split],
            batch_size=configs.batch_size if split == 'train' else 1,
            sampler=sampler,
            num_workers=configs.workers_per_gpu,
            pin_memory=True,
            collate_fn=dataset[split].collate_fn)
    # select a pretrained architecture family from the --name substring
    if 'spvnas' in args.name.lower():
        model = spvnas_specialized(args.name)
    elif 'spvcnn' in args.name.lower():
        model = spvcnn(args.name)
    elif 'mink' in args.name.lower():
        model = minkunet(args.name)
    else:
        raise NotImplementedError
    #model = builder.make_model()
    model = torch.nn.parallel.DistributedDataParallel(
        model.cuda(),
        device_ids=[dist.local_rank()],
        find_unused_parameters=True)
    model.eval()
    criterion = builder.make_criterion()
    optimizer = builder.make_optimizer(model)
    scheduler = builder.make_scheduler(optimizer)
    # NOTE(review): `meter` is constructed but never used below — the MeanIoU
    # inside `callbacks` is what accumulates results; confirm and consider
    # removing.
    meter = MeanIoU(configs.data.num_classes, configs.data.ignore_label)
    trainer = SemanticKITTITrainer(model=model,
                                   criterion=criterion,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   num_workers=configs.workers_per_gpu,
                                   seed=configs.train.seed)
    # drive the callbacks manually instead of trainer.train_with_defaults
    callbacks = Callbacks([
        SaverRestore(),
        MeanIoU(configs.data.num_classes, configs.data.ignore_label)
    ])
    callbacks._set_trainer(trainer)
    trainer.callbacks = callbacks
    trainer.dataflow = dataflow['test']
    trainer.before_train()
    trainer.before_epoch()
    # important
    model.eval()
    for feed_dict in tqdm(dataflow['test'], desc='eval'):
        # move every tensor field to GPU; 'name' fields are metadata strings
        _inputs = dict()
        for key, value in feed_dict.items():
            if not 'name' in key:
                _inputs[key] = value.cuda()
        inputs = _inputs['lidar']
        # NOTE(review): this `targets` is overwritten by the torch.cat below
        # before being used — looks dead; confirm before removing.
        targets = feed_dict['targets'].F.long().cuda(non_blocking=True)
        outputs = model(inputs)
        invs = feed_dict['inverse_map']
        all_labels = feed_dict['targets_mapped']
        _outputs = []
        _targets = []
        # un-voxelize: per scene in the batch (last coord column is presumably
        # the batch index — confirm), map voxel predictions back to the
        # original points through the inverse map
        for idx in range(invs.C[:, -1].max() + 1):
            cur_scene_pts = (inputs.C[:, -1] == idx).cpu().numpy()
            cur_inv = invs.F[invs.C[:, -1] == idx].cpu().numpy()
            cur_label = (all_labels.C[:, -1] == idx).cpu().numpy()
            outputs_mapped = outputs[cur_scene_pts][cur_inv].argmax(1)
            targets_mapped = all_labels.F[cur_label]
            _outputs.append(outputs_mapped)
            _targets.append(targets_mapped)
        outputs = torch.cat(_outputs, 0)
        targets = torch.cat(_targets, 0)
        output_dict = {'outputs': outputs, 'targets': targets}
        # feeds the MeanIoU callback
        trainer.after_step(output_dict)
    trainer.after_epoch()
def main() -> None:
    """Entry point for distributed SemanticKITTI training: load config, seed
    all RNGs with a rank-dependent seed, build dataloaders and model, and run
    the trainer with a test-split MeanIoU inference callback."""
    dist.init()
    torch.backends.cudnn.benchmark = True
    torch.cuda.set_device(dist.local_rank())
    parser = argparse.ArgumentParser()
    parser.add_argument('config', metavar='FILE', help='config file')
    parser.add_argument('--run-dir', metavar='DIR', help='run directory')
    args, opts = parser.parse_known_args()
    configs.load(args.config, recursive=True)
    configs.update(opts)
    if args.run_dir is None:
        args.run_dir = auto_set_run_dir()
    else:
        set_run_dir(args.run_dir)
    logger.info(' '.join([sys.executable] + sys.argv))
    logger.info(f'Experiment started: "{args.run_dir}".' + '\n' + f'{configs}')
    # seed: fall back to torch's initial seed, then derive a per-rank seed so
    # workers on different ranks draw different random streams
    if ('seed' not in configs.train) or (configs.train.seed is None):
        configs.train.seed = torch.initial_seed() % (2**32 - 1)
    seed = configs.train.seed + dist.rank(
    ) * configs.workers_per_gpu * configs.num_epochs
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = builder.make_dataset()
    dataflow = dict()
    for split in dataset:
        sampler = torch.utils.data.distributed.DistributedSampler(
            dataset[split],
            num_replicas=dist.size(),
            rank=dist.rank(),
            shuffle=(split == 'train'))
        dataflow[split] = torch.utils.data.DataLoader(
            dataset[split],
            batch_size=configs.batch_size,
            sampler=sampler,
            num_workers=configs.workers_per_gpu,
            pin_memory=True,
            collate_fn=dataset[split].collate_fn)
    model = builder.make_model()
    model = torch.nn.parallel.DistributedDataParallel(
        model.cuda(),
        device_ids=[dist.local_rank()],
        find_unused_parameters=True)
    criterion = builder.make_criterion()
    optimizer = builder.make_optimizer(model)
    scheduler = builder.make_scheduler(optimizer)
    trainer = SemanticKITTITrainer(model=model,
                                   criterion=criterion,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   num_workers=configs.workers_per_gpu,
                                   seed=seed)
    # one InferenceRunner per eval split (currently just 'test'), plus savers
    trainer.train_with_defaults(
        dataflow['train'],
        num_epochs=configs.num_epochs,
        callbacks=[
            InferenceRunner(dataflow[split],
                            callbacks=[
                                MeanIoU(name=f'iou/{split}',
                                        num_classes=configs.data.num_classes,
                                        ignore_label=configs.data.ignore_label)
                            ]) for split in ['test']
        ] + [
            MaxSaver('iou/test'),
            Saver(),
        ])
args = parser.parse_args() # setup gpu and distributed training if args.gpu is not None: os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu dist.init() torch.backends.cudnn.benchmark = True torch.cuda.set_device(dist.local_rank()) # build data loader data_loader_dict, n_classes = build_data_loader( args.dataset, args.image_size, args.batch_size, args.n_worker, args.data_path, dist.size(), dist.rank(), ) # build model model = build_model(args.model, n_classes, 0).cuda() # load checkpoint checkpoint = load_state_dict_from_file(args.init_from) model.load_state_dict(checkpoint) model = nn.parallel.DistributedDataParallel(model, device_ids=[dist.local_rank()]) val_results = eval(model, data_loader_dict, args.reset_bn) for key, val in val_results.items():