def main(args):
    """Run ``trainer.test`` for a checkpoint on the configured val dataset.

    Args:
        args: Parsed command-line namespace. This function reads
            ``args.config`` (passed to ``load_config``), ``args.task``
            (must be ``"val"`` or ``"test"``) and ``args.model`` (path of
            the checkpoint to load).

    Raises:
        AssertionError: If ``args.task`` is neither ``"val"`` nor ``"test"``.
    """
    # Validate before any side effect so a bad invocation does not leave an
    # empty timestamped output directory behind.
    assert args.task in ["val", "test"], (
        "args.task must be 'val' or 'test', got {!r}".format(args.task)
    )

    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    # Every run writes into its own timestamped sub-directory of save_dir.
    cfg.defrost()
    timestr = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    mkdir(local_rank, cfg.save_dir)
    logger = NanoDetLightningLogger(cfg.save_dir)

    cfg.update({"test_mode": args.task})

    logger.info("Setting up data...")
    val_dataset = build_dataset(cfg.data.val, args.task)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
        drop_last=False,
    )
    evaluator = build_evaluator(cfg.evaluator, val_dataset)

    logger.info("Creating model...")
    task = TrainingTask(cfg, evaluator)

    # Deserialize onto the CPU: without map_location a checkpoint saved from
    # a GPU cannot be loaded on a CPU-only host, which the gpu_ids == -1
    # branch below explicitly supports.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    ckpt = torch.load(args.model, map_location="cpu")
    if "pytorch-lightning_version" not in ckpt:
        warnings.warn(
            "Warning! Old .pth checkpoint is deprecated. "
            "Convert the checkpoint with tools/convert_old_checkpoint.py "
        )
        ckpt = convert_old_model(ckpt)
    task.load_state_dict(ckpt["state_dict"])

    if cfg.device.gpu_ids == -1:
        logger.info("Using CPU training")
        accelerator, devices = "cpu", None
    else:
        accelerator, devices = "gpu", cfg.device.gpu_ids

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        accelerator=accelerator,
        devices=devices,
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        logger=logger,
    )
    logger.info("Starting testing...")
    trainer.test(task, val_dataloader)
def main(args):
    """Build datasets, model and trainer from the config and run training.

    Args:
        args: Parsed command-line namespace. This function reads
            ``args.config`` (passed to ``load_config``), ``args.local_rank``
            (converted with ``int``) and ``args.seed`` (seeding is skipped
            when it is ``None``).

    Raises:
        ValueError: If ``cfg.model.arch.head.num_classes`` differs from
            ``len(cfg.class_names)``.
    """
    load_config(cfg, args.config)
    if cfg.model.arch.head.num_classes != len(cfg.class_names):
        raise ValueError(
            'cfg.model.arch.head.num_classes must equal len(cfg.class_names),'
            'but got {} and {}'.format(
                cfg.model.arch.head.num_classes, len(cfg.class_names)
            )
        )
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    if args.seed is not None:
        logger.log('Set random seed to {}'.format(args.seed))
        pl.seed_everything(args.seed)

    logger.log('Setting up data...')
    train_dataset = build_dataset(cfg.data.train, 'train')
    val_dataset = build_dataset(cfg.data.val, 'test')
    evaluator = build_evaluator(cfg, val_dataset)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=True,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=True,
    )
    # TODO: batch eval
    # drop_last is False so that no validation sample can be discarded; with
    # batch_size=1 this is equivalent to the previous drop_last=True, but it
    # stays correct once batched evaluation is enabled.
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=False,
    )

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator)

    if 'load_model' in cfg.schedule:
        # Deserialize onto the CPU so the tensors are not materialized on the
        # device the checkpoint was saved from (which every DDP process would
        # otherwise do, and which fails on hosts without that device).
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        ckpt = torch.load(cfg.schedule.load_model, map_location='cpu')
        if 'pytorch-lightning_version' not in ckpt:
            warnings.warn(
                'Warning! Old .pth checkpoint is deprecated. \n'
                'Convert the checkpoint with tools/convert_old_checkpoint.py '
            )
            ckpt = convert_old_model(ckpt)
        # strict=False: keys missing from or unexpected in the checkpoint
        # are tolerated rather than raising.
        task.load_state_dict(ckpt['state_dict'], strict=False)

    # Resume from the last checkpoint in save_dir only when the schedule
    # section contains a 'resume' key.
    if 'resume' in cfg.schedule:
        model_resume_path = os.path.join(cfg.save_dir, 'model_last.ckpt')
    else:
        model_resume_path = None

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        resume_from_checkpoint=model_resume_path,
        callbacks=[ProgressBar(refresh_rate=0)],  # disable tqdm bar
    )

    trainer.fit(task, train_dataloader, val_dataloader)
def main(args):
    """Run ``trainer.test`` for a checkpoint on the configured val dataset.

    Args:
        args: Parsed command-line namespace. This function reads
            ``args.config`` (passed to ``load_config``), ``args.task``
            (must be ``'val'`` or ``'test'``) and ``args.model`` (path of
            the checkpoint to load).

    Raises:
        AssertionError: If ``args.task`` is neither ``'val'`` nor ``'test'``.
    """
    # Validate before any side effect so a bad invocation does not leave an
    # empty timestamped output directory behind.
    assert args.task in ['val', 'test'], (
        "args.task must be 'val' or 'test', got {!r}".format(args.task)
    )

    load_config(cfg, args.config)
    local_rank = -1
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    # Every run writes into its own timestamped sub-directory of save_dir.
    cfg.defrost()
    timestr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    cfg.save_dir = os.path.join(cfg.save_dir, timestr)
    mkdir(local_rank, cfg.save_dir)
    logger = Logger(local_rank, cfg.save_dir)

    cfg.update({'test_mode': args.task})

    logger.log('Setting up data...')
    val_dataset = build_dataset(cfg.data.val, args.task)
    # drop_last must be False for evaluation: with drop_last=True the final
    # incomplete batch is discarded, so part of the dataset would never be
    # evaluated whenever its size is not a multiple of the batch size.
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=collate_function,
        drop_last=False,
    )
    evaluator = build_evaluator(cfg, val_dataset)

    logger.log('Creating model...')
    task = TrainingTask(cfg, evaluator)

    # Deserialize onto the CPU so the tensors are not materialized on the
    # device the checkpoint was saved from (which every DDP process would
    # otherwise do, and which fails on hosts without that device).
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    ckpt = torch.load(args.model, map_location='cpu')
    if 'pytorch-lightning_version' not in ckpt:
        warnings.warn(
            'Warning! Old .pth checkpoint is deprecated. '
            'Convert the checkpoint with tools/convert_old_checkpoint.py '
        )
        ckpt = convert_old_model(ckpt)
    task.load_state_dict(ckpt['state_dict'])

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        gpus=cfg.device.gpu_ids,
        accelerator='ddp',
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
    )
    logger.log('Starting testing...')
    trainer.test(task, val_dataloader)
def main(args):
    """Build datasets, model and trainer from the config and run training.

    Args:
        args: Parsed command-line namespace. This function reads
            ``args.config`` (passed to ``load_config``), ``args.local_rank``
            (converted with ``int``) and ``args.seed`` (seeding is skipped
            when it is ``None``).

    Raises:
        ValueError: If ``cfg.model.arch.head.num_classes`` differs from
            ``len(cfg.class_names)``.
    """
    load_config(cfg, args.config)
    if cfg.model.arch.head.num_classes != len(cfg.class_names):
        raise ValueError(
            "cfg.model.arch.head.num_classes must equal len(cfg.class_names), "
            "but got {} and {}".format(
                cfg.model.arch.head.num_classes, len(cfg.class_names)
            )
        )
    local_rank = int(args.local_rank)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    mkdir(local_rank, cfg.save_dir)

    logger = NanoDetLightningLogger(cfg.save_dir)
    logger.dump_cfg(cfg)

    if args.seed is not None:
        logger.info("Set random seed to {}".format(args.seed))
        pl.seed_everything(args.seed)

    logger.info("Setting up data...")
    train_dataset = build_dataset(cfg.data.train, "train")
    val_dataset = build_dataset(cfg.data.val, "test")
    evaluator = build_evaluator(cfg.evaluator, val_dataset)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=True,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
        drop_last=True,
    )
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=cfg.device.batchsize_per_gpu,
        shuffle=False,
        num_workers=cfg.device.workers_per_gpu,
        pin_memory=True,
        collate_fn=naive_collate,
        drop_last=False,
    )

    logger.info("Creating model...")
    task = TrainingTask(cfg, evaluator)

    if "load_model" in cfg.schedule:
        # Deserialize onto the CPU so the tensors are not materialized on the
        # device the checkpoint was saved from (which every DDP process would
        # otherwise do, and which fails on hosts without that device).
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        ckpt = torch.load(cfg.schedule.load_model, map_location="cpu")
        if "pytorch-lightning_version" not in ckpt:
            warnings.warn(
                "Warning! Old .pth checkpoint is deprecated. \n"
                "Convert the checkpoint with tools/convert_old_checkpoint.py "
            )
            ckpt = convert_old_model(ckpt)
        load_model_weight(task.model, ckpt, logger)
        logger.info(
            "Loaded model weight from {}".format(cfg.schedule.load_model)
        )

    # Resume from the last checkpoint in save_dir only when the schedule
    # section contains a "resume" key.
    model_resume_path = (
        os.path.join(cfg.save_dir, "model_last.ckpt")
        if "resume" in cfg.schedule
        else None
    )

    # DDP is only requested when more than one GPU id is configured.
    accelerator = None if len(cfg.device.gpu_ids) <= 1 else "ddp"

    trainer = pl.Trainer(
        default_root_dir=cfg.save_dir,
        max_epochs=cfg.schedule.total_epochs,
        gpus=cfg.device.gpu_ids,
        check_val_every_n_epoch=cfg.schedule.val_intervals,
        accelerator=accelerator,
        log_every_n_steps=cfg.log.interval,
        num_sanity_val_steps=0,
        resume_from_checkpoint=model_resume_path,
        callbacks=[ProgressBar(refresh_rate=0)],  # disable tqdm bar
        logger=logger,
        benchmark=True,
        gradient_clip_val=cfg.get("grad_clip", 0.0),
    )

    trainer.fit(task, train_dataloader, val_dataloader)
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import torch

from nanodet.util import convert_old_model


def parse_args():
    """Parse the command-line options of the checkpoint conversion script.

    Returns:
        argparse.Namespace: Namespace with ``file_path`` (input .pth
        checkpoint) and ``out_path`` (output .ckpt checkpoint).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        # The script writes a converted checkpoint, not an ONNX model.
        description="Convert old .pth checkpoint to .ckpt checkpoint.",
    )
    # Both paths are mandatory: without them torch.load / torch.save would
    # be called with None and fail with an unrelated-looking error.
    parser.add_argument(
        "--file_path", type=str, required=True, help="Path to .pth checkpoint."
    )
    parser.add_argument(
        "--out_path", type=str, required=True, help="Path to .ckpt checkpoint."
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    file_path = args.file_path
    out_path = args.out_path
    # Load onto the CPU so the conversion also works on hosts without the
    # device the checkpoint was saved from.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    old_check_point = torch.load(file_path, map_location="cpu")
    new_check_point = convert_old_model(old_check_point)
    torch.save(new_check_point, out_path)
    print("Checkpoint saved to:", out_path)