def train(cfg, args):
    """Assemble model, optimiser, scheduler and data loader, then run training.

    Args:
        cfg: Project configuration node; ``MODEL.DEVICE``, ``SOLVER.LR``,
            ``SOLVER.LR_STEPS``, ``SOLVER.MAX_ITER`` and ``OUTPUT_DIR`` are
            read here.
        args: Parsed command-line namespace; ``finetune``, ``distributed``,
            ``local_rank`` and ``num_gpus`` are read here, and the whole
            namespace is forwarded to ``do_train``.

    Returns:
        Whatever ``do_train`` returns (the trained model).
    """
    logger = logging.getLogger('SSD.trainer')

    # Fine-tuning starts from the network returned by pruned_load();
    # otherwise the detector is built from the configuration.
    model = pruned_load() if args.finetune else build_detection_model(cfg)

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
        )

    gpu_count = args.num_gpus
    # The learning rate is multiplied by the GPU count, while the step-based
    # quantities (milestones, max iterations) are divided by it.
    optimizer = make_optimizer(cfg, model, cfg.SOLVER.LR * gpu_count)
    scheduler = make_lr_scheduler(
        cfg,
        optimizer,
        [step // gpu_count for step in cfg.SOLVER.LR_STEPS],
    )

    arguments = {"iteration": 0}
    is_main_process = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                is_main_process, logger)
    if not args.finetune:
        # Merge whatever extra data the checkpoint load returns into the
        # training state; skipped entirely when fine-tuning.
        arguments.update(checkpointer.load())

    train_loader = make_data_loader(
        cfg,
        is_train=True,
        distributed=args.distributed,
        max_iter=cfg.SOLVER.MAX_ITER // gpu_count,
        start_iter=arguments['iteration'],
    )

    return do_train(cfg, model, train_loader, optimizer, scheduler,
                    checkpointer, device, arguments, args)
def main():
    """Command-line entry point: evaluate an SSD checkpoint.

    Parses the CLI, optionally initialises NCCL distributed mode (when the
    ``WORLD_SIZE`` environment variable is greater than 1), merges the config
    file and overrides into the global ``cfg``, logs the setup, and calls
    ``evaluation``.
    """
    parser = argparse.ArgumentParser(
        description='SSD Evaluation on VOC and COCO dataset.')
    parser.add_argument("--config-file", default="", metavar="FILE",
                        help="path to config file", type=str)
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--ckpt",
        help="The path to the checkpoint for test, default is the latest checkpoint.",
        default=None,
        type=str,
    )
    parser.add_argument("--output_dir", default="eval_results", type=str,
                        help="The directory to store evaluation results.")
    parser.add_argument("opts",
                        help="Modify config options using the command-line",
                        default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # WORLD_SIZE is read from the environment; absent means single process.
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    is_distributed = world_size > 1

    if torch.cuda.is_available():
        # Let cuDNN auto-tune and pick the best algorithm for this hardware.
        torch.backends.cudnn.benchmark = True
    if is_distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR)
    logger.info("Using {} GPUs".format(world_size))
    logger.info(args)

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as config_file:
        logger.info("\n" + config_file.read())
    logger.info("Running with config:\n{}".format(cfg))

    evaluation(cfg, ckpt=args.ckpt, distributed=is_distributed)
def main():
    """Command-line entry point: train (or fine-tune) an SSD detector.

    Parses the CLI, derives ``args.distributed`` / ``args.num_gpus`` from the
    ``WORLD_SIZE`` environment variable, optionally initialises NCCL
    distributed mode, merges the config file and overrides into the global
    ``cfg``, logs the setup, runs ``train`` and, unless ``--skip-test`` is
    given, evaluates the returned model with ``do_evaluation``.
    """
    parser = argparse.ArgumentParser(
        description='Single Shot MultiBox Detector Training With PyTorch')
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--log_step', default=10, type=int,
                        help='Print logs every log_step')
    parser.add_argument('--save_step', default=2500, type=int,
                        help='Save checkpoint every save_step')
    parser.add_argument(
        '--eval_step',
        default=2500,
        type=int,
        help='Evaluate dataset every eval_step, disabled when eval_step < 0')
    parser.add_argument('--use_tensorboard', default=True, type=str2bool)
    parser.add_argument('--sr', dest='sr', action='store_true',
                        help='train with channel sparsity regularization')
    # The help text used to be a copy of the --sr help; describe the flag's
    # real purpose so that `--help` is not misleading.
    parser.add_argument('--finetune', dest='finetune', action='store_true',
                        help='fine-tune a previously pruned model instead of '
                             'building the model from the config')
    parser.add_argument('--s', type=float, default=0.0001,
                        help='scale sparse rate (default: 0.0001)')
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    # WORLD_SIZE is read from the environment; absent means single process.
    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = num_gpus > 1
    args.num_gpus = num_gpus

    if torch.cuda.is_available():
        # This flag allows you to enable the inbuilt cudnn auto-tuner to
        # find the best algorithm to use for your hardware.
        torch.backends.cudnn.benchmark = True
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    if cfg.OUTPUT_DIR:
        mkdir(cfg.OUTPUT_DIR)

    logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args)

    if not args.skip_test:
        logger.info('Start evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        do_evaluation(cfg, model, distributed=args.distributed)
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer,
             device, arguments, args):
    """Run the training loop with logging, checkpointing and periodic eval.

    Args:
        cfg: Project configuration node; ``OUTPUT_DIR`` and ``DATASETS.TEST``
            are read here.
        model: Network to train; called as ``model(images, targets=targets)``
            and expected to return a dict of losses.
        data_loader: Iterable yielding ``(images, targets, _)``; its length is
            used as the iteration budget for ETA and timing.
        optimizer: Optimiser stepped once per iteration.
        scheduler: LR scheduler stepped once per iteration.
        checkpointer: Object whose ``save(name, **arguments)`` is called for
            sparsity-training snapshots and for the final model.
        device: ``torch.device`` (or device string) the batches are moved to.
        arguments: Mutable training state; ``arguments["iteration"]`` supplies
            the starting iteration and is updated in place every step.
        args: Parsed CLI namespace; ``use_tensorboard``, ``sr``, ``log_step``,
            ``save_step``, ``eval_step`` and ``distributed`` are read here.

    Returns:
        The trained ``model``.
    """
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        try:
            from torch.utils.tensorboard import SummaryWriter
        except ImportError:
            from tensorboardX import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    BnPrune = UpBatchNorm(model, args)

    # A torch.device never compares equal to a plain string, so the former
    # `device == "cuda"` test was always False and GPU memory was never
    # logged. Normalise to a torch.device and inspect its type instead.
    on_cuda = torch.device(device).type == "cuda"

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        loss = sum(loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        loss.backward()
        if args.sr:
            # Applied between backward() and step() so that it acts on the
            # freshly computed gradients.
            BnPrune.upBN()
        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)

        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            current_lr = optimizer.param_groups[0]['lr']

            log_fields = [
                "iter: {iter:06d}",
                "lr: {lr:.5f}",
                '{meters}',
                "eta: {eta}",
            ]
            log_values = {
                'iter': iteration,
                'lr': current_lr,
                'meters': str(meters),
                'eta': eta_string,
            }
            if on_cuda:
                log_fields.append('mem: {mem}M')
                log_values['mem'] = round(
                    torch.cuda.max_memory_allocated() / 1024.0 / 1024.0)
            logger.info(meters.delimiter.join(log_fields).format(**log_values))

            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss', losses_reduced,
                                          global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name),
                                              loss_item,
                                              global_step=global_step)
                summary_writer.add_scalar('lr', current_lr,
                                          global_step=global_step)

        if iteration % args.save_step == 0:
            if args.sr:
                checkpointer.save("sr_model_{:06d}".format(iteration),
                                  **arguments)
            elif save_to_disk:
                # Only the rank-0 process writes the file; otherwise every
                # distributed worker would race on the same path.
                torch.save(
                    model,
                    os.path.join(
                        cfg.OUTPUT_DIR,
                        "fine_pruned_model_{:06d}.pth".format(iteration)))

        # The final iteration is skipped here; evaluation of the finished
        # model is left to the caller.
        if (args.eval_step > 0 and iteration % args.eval_step == 0
                and iteration != max_iter):
            eval_results = do_evaluation(cfg, model,
                                         distributed=args.distributed,
                                         iteration=iteration)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results,
                                                cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset,
                                 summary_writer, iteration)
            model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)

    if summary_writer:
        # Flush pending events and release the event file handle.
        summary_writer.close()

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    # Guard against an empty loader so the summary line cannot divide by zero.
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max(max_iter, 1)))
    return model
def main():
    """Command-line entry point: prune a trained SSD model and save the result.

    Parses the CLI, optionally initialises NCCL distributed mode, merges the
    config file and overrides into the global ``cfg``, loads the checkpoint
    into a freshly built model, runs the selected pruner from the ``prune``
    module to fill a second model, logs FLOPs/parameter counts of both via
    ``profile`` / ``clever_format``, and writes the pruned model object to
    ``<cfg.OUTPUT_DIR>/pruned_model.pth`` with ``torch.save``.
    """
    parser = argparse.ArgumentParser(
        description='Single Shot MultiBox Detector Training With PyTorch')
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--log_step', default=10, type=int,
                        help='Print logs every log_step')
    parser.add_argument('--save_step', default=1, type=int,
                        help='Save checkpoint every save_step')
    parser.add_argument(
        '--eval_step',
        default=1,
        type=int,
        help='Evaluate dataset every eval_step, disabled when eval_step < 0')
    parser.add_argument('--use_tensorboard', default=True, type=str2bool)
    # Both help strings below used to read "architecture to use", which was
    # copy-pasted and did not describe either option.
    parser.add_argument(
        '--pruner',
        default='SlimmingPruner',
        type=str,
        choices=['AutoSlimPruner', 'SlimmingPruner', 'l1normPruner'],
        help='pruning algorithm to use')
    parser.add_argument('--pruneratio', default=0.4, type=float,
                        help='pruning ratio passed to the pruner '
                             '(default: 0.4)')
    parser.add_argument('--sr', dest='sr', action='store_true',
                        help='train with channel sparsity regularization')
    parser.add_argument('--s', type=float, default=0.0001,
                        help='scale sparse rate (default: 0.0001)')
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()

    # WORLD_SIZE is read from the environment; absent means single process.
    num_gpus = int(os.environ.get("WORLD_SIZE", 1))
    args.distributed = num_gpus > 1
    args.num_gpus = num_gpus

    if torch.cuda.is_available():
        # This flag allows you to enable the inbuilt cudnn auto-tuner to
        # find the best algorithm to use for your hardware.
        torch.backends.cudnn.benchmark = True
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # ---- prune ----
    # `model` receives the checkpoint weights; `newmodel` is the second
    # instance handed to the pruner alongside it.
    model = build_detection_model(cfg)
    newmodel = build_detection_model(cfg)
    checkpointer = CheckPointer(model, save_dir=cfg.OUTPUT_DIR)
    checkpointer.load()
    model.eval()
    newmodel.eval()

    # argparse `choices` restricts args.pruner to the three known names, so
    # anything that is not AutoSlimPruner takes the ratio-based keywords.
    if args.pruner == 'AutoSlimPruner':
        kwargs = {'prunestep': 16, 'constrain': 200e6}
    else:
        kwargs = {'pruneratio': args.pruneratio}
    pruner_cls = getattr(prune, args.pruner)
    pruner = pruner_cls(model=model, newmodel=newmodel, args=args, **kwargs)
    pruner.prune()

    # ---- count ops ----
    # Named `dummy_input` to avoid shadowing the builtin `input`.
    dummy_input = torch.randn(1, 3, 320, 320)
    flops, params = profile(model, inputs=(dummy_input, ), verbose=False)
    flops, params = clever_format([flops, params], "%.3f")
    flopsnew, paramsnew = profile(newmodel, inputs=(dummy_input, ),
                                  verbose=False)
    flopsnew, paramsnew = clever_format([flopsnew, paramsnew], "%.3f")
    logger.info("flops:{}->{}, params: {}->{}".format(flops, flopsnew, params,
                                                      paramsnew))

    save_path = os.path.join(cfg.OUTPUT_DIR, "pruned_model.pth")
    torch.save(newmodel, save_path)