def prune_and_eval(model, sorted_bn, prune_idx, percent, cfg):  # not the final, accurate result
    print(f'mAP of the original model is:')
    with torch.no_grad():
        eval = do_evaluation(cfg, model, distributed=False)
        print(eval[0]['metrics'])
    model_copy = deepcopy(model)
    thre_index = int(len(sorted_bn) * percent)
    # Threshold on the alpha (BN gamma) parameters: every channel whose alpha falls below it is pruned
    thre = sorted_bn[thre_index]
    thre = thre.cuda()
    print(f'Channels with Gamma value less than {thre:.4f} are pruned!')
    remain_num = 0
    for idx in prune_idx:
        bn_module = model_copy.backbone.module_list[idx][1]
        mask = bn_module.weight.data.abs().ge(thre).float()
        remain_num += int(mask.sum())
        bn_module.weight.data.mul_(mask)
    print(f'Number of channels has been reduced from {len(sorted_bn)} to {remain_num}')
    print(f'Prune ratio: {1 - remain_num / len(sorted_bn):.3f}')
    print('Quick look at the pruning effect ---->')
    print(f'mAP of the pruned model is:')
    with torch.no_grad():
        eval = do_evaluation(cfg, model_copy, distributed=False)
        print(eval[0]['metrics'])
    return thre
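A hypothetical driver (not part of the original listing) showing how the prune_and_eval above might be invoked; it assumes the parse_module_defs and gather_bn_weights helpers that appear later in this listing are importable and behave as they are used there.

# Hypothetical driver sketch (an assumption, not the project's code): builds the sorted
# BN-gamma tensor that prune_and_eval expects, using helpers shown elsewhere in this listing.
import torch

def run_threshold_search(cfg, model, percent=0.5):
    # Collect the BN gamma weights of the prunable layers and sort them ascending.
    _, _, prune_idx = normal_prune.parse_module_defs(model.backbone.module_defs)
    bn_weights = gather_bn_weights(model.backbone.module_list, prune_idx)
    sorted_bn = torch.sort(bn_weights)[0]
    # Mask every channel whose gamma falls below the `percent` quantile and report the quick mAP.
    return prune_and_eval(model, sorted_bn, prune_idx, percent, cfg)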
def evaluation(cfg, ckpt):
    logger = logging.getLogger("SSD.inference")
    model = SSDDetector(cfg)
    checkpointer = CheckPointer(model, save_dir=cfg.OUTPUT_DIR, logger=logger)
    model = torch_utils.to_cuda(model)
    checkpointer.load(ckpt, use_latest=ckpt is None)
    do_evaluation(cfg, model)
def evaluation(cfg, ckpt, distributed):
    logger = logging.getLogger("SSD.inference")
    model = build_detection_model(cfg)
    checkpointer = CheckPointer(model, save_dir=cfg.OUTPUT_DIR, logger=logger)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    checkpointer.load(ckpt, use_latest=ckpt is None)
    do_evaluation(cfg, model, distributed)
def main(): parser = argparse.ArgumentParser(description='Single Shot MultiBox Detector Training With PyTorch') parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument('--vgg', help='Pre-trained vgg model path, download from https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth') parser.add_argument('--resume', default=None, type=str, help='Checkpoint state_dict file to resume training from') parser.add_argument('--log_step', default=50, type=int, help='Print logs every log_step') parser.add_argument('--save_step', default=5000, type=int, help='Save checkpoint every save_step') parser.add_argument('--use_tensorboard', default=True, type=str2bool) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 args.num_gpus = num_gpus if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") logger = setup_logger("SSD", distributed_util.get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args) if not args.skip_test: logger.info('Start evaluating...') torch.cuda.empty_cache() # speed up evaluating after training finished do_evaluation(cfg, model, cfg.OUTPUT_DIR, distributed=args.distributed)
def evaluation(cfg, ckpt, distributed):
    logger = logging.getLogger("SSD.inference")
    model = build_detection_model(cfg)
    checkpointer = CheckPointer(model, save_dir=cfg.OUTPUT_DIR, logger=logger)
    device = torch.device(cfg.MODEL.DEVICE)
    # model.load_state_dict(torch.load('outputs/vgg_ssd300_voc0712.pth'), strict=False)
    model.to(device)
    checkpointer.load(ckpt, use_latest=ckpt is None)
    do_evaluation(cfg, model, distributed)
def evaluation(cfg, weights_file, output_dir, distributed):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    device = torch.device(cfg.MODEL.DEVICE)
    model = build_ssd_model(cfg)
    model.load(weights_file)
    logger = logging.getLogger("SSD.inference")
    logger.info('Loaded weights from {}.'.format(weights_file))
    model.to(device)
    do_evaluation(cfg, model, output_dir, distributed)
def prune_and_eval(model, CBL_idx, CBLidx2mask, cfg):
    print(f'mAP of the original model is:')
    with torch.no_grad():
        eval = do_evaluation(cfg, model, distributed=False)
        print(eval[0]['metrics'])
    model_copy = deepcopy(model)
    for idx in CBL_idx:
        bn_module = model_copy.backbone.module_list[idx][1]
        mask = CBLidx2mask[idx].cuda()
        bn_module.weight.data.mul_(mask)
    print('Quick look at the pruning effect ---->')
    print(f'mAP of the pruned model is:')
    with torch.no_grad():
        eval = do_evaluation(cfg, model_copy, distributed=False)
        print(eval[0]['metrics'])
def score(self):
    torch.cuda.empty_cache()
    eval_results = do_evaluation(self.cfg, self.model, distributed=self.args.distributed)
    mAP = eval_results[0]['metrics']['mAP']
    return mAP
def evaluation(cfg, ckpt, distributed, model_path=None):
    logger = logging.getLogger("SSD.inference")
    model = build_detection_model(cfg)
    logger.info("Model :\n{}".format(model))  # use the logger instead of print; with multiple GPUs print would output this twice
    checkpointer = CheckPointer(model, save_dir=cfg.OUTPUT_DIR, logger=logger)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if model_path is None:
        checkpointer.load(ckpt, use_latest=ckpt is None)
    else:
        model.load_state_dict(torch.load(model_path))
    if cfg.TEST.BN_FUSE is True:
        print('BN_FUSE.')
        model.backbone.bn_fuse()
        model.to(device)
    do_evaluation(cfg, model, distributed)
def evaluation(cfg, ckpt, distributed):
    logger: logging.RootLogger = logging.getLogger("SSD.inference")
    model = build_detection_model(cfg)
    checkpointer = CheckPointer(model, save_dir=cfg.OUTPUT_DIR, logger=logger)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    checkpointer.load(ckpt, use_latest=ckpt is None)
    for scale in np.linspace(0.5, 1.0, 5):
        logger.info(f"Running eval with rescale factor: {scale}")
        eval_result = do_evaluation(cfg, model, distributed, rescale=scale)
def evaluation(cfg, args, weights_file, output_dir, distributed): if not os.path.exists(output_dir): os.makedirs(output_dir) device = torch.device(cfg.MODEL.DEVICE) model = build_ssd_model(cfg) model.load(open(weights_file, 'rb')) logger = logging.getLogger("SSD.inference") logger.info('Loaded weights from {}.'.format(weights_file)) model.to(device) if args.eval_mode == "test": do_evaluation(cfg, model, output_dir, distributed) else: dataset_metrics = do_evaluation(cfg, model, cfg.OUTPUT_DIR, distributed, datasets_dict=_create_val_datasets(args, cfg, logger)) count = len(dataset_metrics) map_sum = 0 for k,v in dataset_metrics.items(): #logger.info("mAP on {}: {:.3f}".format(k, v.info["mAP"])) map_sum += v.info["mAP"] avg_map = map_sum/count print("'Model': '{}', 'Avg_mAP': {}".format(weights_file, avg_map))
def main():
    args = get_parser().parse_args()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    output_dir = pathlib.Path(cfg.OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True, parents=True)
    logger = setup_logger("SSD", output_dir)
    logger.info(args)
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))
    model = start_train(cfg)
    logger.info('Start evaluating...')
    torch.cuda.empty_cache()  # speed up evaluating after training finished
    do_evaluation(cfg, model)
def main(): parser = argparse.ArgumentParser( description='Single Shot MultiBox Detector Training With PyTorch') parser.add_argument( "config_file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() output_dir = pathlib.Path(cfg.OUTPUT_DIR) output_dir.mkdir(exist_ok=True, parents=True) logger = setup_logger("SSD", output_dir) logger.info(args) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = start_train(cfg) logger.info('Start evaluating...') torch.cuda.empty_cache() # speed up evaluating after training finished do_evaluation(cfg, model)
def do_train(cfg, model, data_loader, optimizer, scheduler, device, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training")
    model.train()
    save_to_disk = distributed_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(log_dir=cfg.OUTPUT_DIR)
    else:
        summary_writer = None
    max_iter = len(data_loader)
    start_training_time = time.time()
    trained_time = 0
    tic = time.time()
    end = time.time()
    for iteration, (images, boxes, labels) in enumerate(data_loader):
        iteration = iteration + 1
        scheduler.step()
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)
        # print(images.shape)
        # print(labels.shape)
        # print(boxes.shape)
        optimizer.zero_grad()
        loss_dict = model(images, targets=(boxes, labels))
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss = sum(loss for loss in loss_dict.values())
        loss.backward()
        optimizer.step()
        trained_time += time.time() - end
        end = time.time()
        if iteration % args.log_step == 0:
            eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
            log_str = [
                "Iter: {:06d}, Lr: {:.5f}, Cost: {:.2f}s, Eta: {}".format(
                    iteration, optimizer.param_groups[0]['lr'], time.time() - tic,
                    str(datetime.timedelta(seconds=eta_seconds))),
                "total_loss: {:.3f}".format(losses_reduced.item())
            ]
            for loss_name, loss_item in loss_dict_reduced.items():
                log_str.append("{}: {:.3f}".format(loss_name, loss_item.item()))
            log_str = ', '.join(log_str)
            logger.info(log_str)
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step)
            tic = time.time()
        if save_to_disk and iteration % args.save_step == 0:
            model_path = os.path.join(cfg.OUTPUT_DIR,
                                      "ssd{}_vgg_iteration_{:06d}.pth".format(cfg.INPUT.IMAGE_SIZE, iteration))
            _save_model(logger, model, model_path)
        # Do eval when training, to trace the mAP changes and see whether or not performance improved
        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            do_evaluation(cfg, model, cfg.OUTPUT_DIR, distributed=args.distributed)
            model.train()
    if save_to_disk:
        model_path = os.path.join(cfg.OUTPUT_DIR, "ssd{}_vgg_final.pth".format(cfg.INPUT.IMAGE_SIZE))
        _save_model(logger, model, model_path)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter))
    return model
def main():
    # Parse the command line and read the config file
    '''
    The config specifies the basic model parameters: the number of training classes (20 classes plus
    background, so 21), the model input size (images are generally padded/resized to 300*300 so the
    originals are not distorted), the VOC 2007/2012 training folders and the VOC 2007 test folder,
    the maximum number of iterations (120000), the learning rate, gamma and the other hyper-parameters,
    and the output directory.
    MODEL:
        NUM_CLASSES: 21
    INPUT:
        IMAGE_SIZE: 300
    DATASETS:
        TRAIN: ("voc_2007_trainval", "voc_2012_trainval")
        TEST: ("voc_2007_test", )
    SOLVER:
        MAX_ITER: 120000
        LR_STEPS: [80000, 100000]
        GAMMA: 0.1
        BATCH_SIZE: 32
        LR: 1e-3
    OUTPUT_DIR: 'outputs/vgg_ssd300_voc0712'
    Returns:
    '''
    parser = argparse.ArgumentParser(description='Single Shot MultiBox Detector Training With PyTorch')
    parser.add_argument(
        "--config-file",
        default="configs/vgg_ssd300_voc0712.yaml",
        # default="configs/vgg_ssd300_visdrone0413.yaml",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    # Save a checkpoint and run evaluation every 2500 steps and log every 10 steps; the tensorboardX
    # logging can be switched off if those records are not needed.
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--log_step', default=10, type=int, help='Print logs every log_step')
    parser.add_argument('--save_step', default=2500, type=int, help='Save checkpoint every save_step')
    parser.add_argument('--eval_step', default=2500, type=int,
                        help='Evaluate dataset every eval_step, disabled when eval_step < 0')
    parser.add_argument('--use_tensorboard', default=True, type=str2bool)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    # Parse the arguments; multi-GPU training is supported
    args = parser.parse_args()
    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1
    args.num_gpus = num_gpus
    # Perform the necessary checks before starting
    if torch.cuda.is_available():
        # This flag allows you to enable the inbuilt cudnn auto-tuner to
        # find the best algorithm to use for your hardware.
        torch.backends.cudnn.benchmark = True
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    # Create the model output folder
    if cfg.OUTPUT_DIR:
        mkdir(cfg.OUTPUT_DIR)
    # Use a logger for all records
    logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR)
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)
    # Load the configuration file
    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))
    # Train the model
    # model = train(cfg, args)
    model = train(cfg, args)
    # Start evaluation
    if not args.skip_test:
        logger.info('Start evaluating...')
        torch.cuda.empty_cache()  # speed up evaluating after training finished
        do_evaluation(cfg, model, distributed=args.distributed)
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer, arguments): logger = logging.getLogger("SSD.trainer") logger.info("Start training ...") meters = MetricLogger() model.train() summary_writer = torch.utils.tensorboard.SummaryWriter( log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs')) max_iter = len(data_loader) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() for iteration, (images, targets, _) in enumerate(data_loader, start_iter): iteration = iteration + 1 arguments["iteration"] = iteration images = torch_utils.to_cuda(images) targets = torch_utils.to_cuda(targets) loss_dict = model(images, targets=targets) loss = sum(loss for loss in loss_dict.values()) meters.update(total_loss=loss, **loss_dict) optimizer.zero_grad() loss.backward() optimizer.step() scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time) if iteration % cfg.LOG_STEP == 0: eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) logger.info( meters.delimiter.join([ "iter: {iter:06d}", "lr: {lr:.5f}", '{meters}', "eta: {eta}", 'mem: {mem}M', ]).format(iter=iteration, lr=optimizer.param_groups[0]['lr'], meters=str(meters), eta=eta_string, mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0))) global_step = iteration summary_writer.add_scalar('losses/total_loss', loss, global_step=global_step) for loss_name, loss_item in loss_dict.items(): summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step) summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step) if iteration % cfg.MODEL_SAVE_STEP == 0: checkpointer.save("model_{:06d}".format(iteration), **arguments) if cfg.EVAL_STEP > 0 and iteration % cfg.EVAL_STEP == 0: eval_results = do_evaluation(cfg, model, iteration=iteration) for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST): write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration) model.train() # *IMPORTANT*: change to train mode after eval. checkpointer.save("model_final", **arguments) # compute training time total_training_time = int(time.time() - start_training_time) total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / max_iter)) return model
def main(): parser = argparse.ArgumentParser( description='Single Shot MultiBox Detector Training With PyTorch') parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( '--vgg', help= 'Pre-trained vgg model path, download from https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth' ) parser.add_argument( '--resume', default=None, type=str, help='Checkpoint state_dict file to resume training from') parser.add_argument('--log_step', default=50, type=int, help='Print logs every log_step') parser.add_argument('--save_step', default=5000, type=int, help='Save checkpoint every save_step') parser.add_argument( '--eval_step', default=0, type=int, help= 'Evaluate dataset every eval_step, disabled when eval_step <= 0. Default: disabled' ) parser.add_argument('--use_tensorboard', default=True, type=str2bool) parser.add_argument("--num_workers", default=4, type=int, help="Number of workers to use for data loaders") parser.add_argument( "--eval_mode", default="test", type=str, help= 'Use defined test datasets for periodic evaluation or use a validation split. Default: "test", alternative "val"' ) parser.add_argument( "--return_best", default=False, type=str2bool, help= "If false (default) tests on the target the last model. If true tests on the target the model with the best performance on the validation set" ) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 args.num_gpus = num_gpus if torch.cuda.is_available(): # This flag allows you to enable the inbuilt cudnn auto-tuner to # find the best algorithm to use for your hardware. torch.backends.cudnn.benchmark = True if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") logger = setup_logger("SSD", distributed_util.get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) if not os.path.exists(cfg.OUTPUT_DIR): if not args.distributed or (args.distributed and distributed_util.is_main_process()): os.makedirs(cfg.OUTPUT_DIR) model = train(cfg, args) if not args.skip_test: logger.info('Start evaluating...') torch.cuda.empty_cache() # speed up evaluating after training finished do_evaluation(cfg, model, cfg.OUTPUT_DIR, distributed=args.distributed)
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer, device, arguments, args): logger = logging.getLogger("SSD.trainer") logger.info("Start training ...") meters = MetricLogger() model.train() save_to_disk = dist_util.get_rank() == 0 if args.use_tensorboard and save_to_disk: import tensorboardX summary_writer = tensorboardX.SummaryWriter( log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs')) else: summary_writer = None max_iter = len(data_loader) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() max_epoch = 10 for epoch in range(max_epoch): logger.info('epoch: {}'.format(epoch)) for iteration, (images, targets, _) in enumerate(data_loader, start_iter): # print("imgs shape: ",images.shape,iteration) # continue # iteration = iteration + 1 arguments["iteration"] = iteration scheduler.step() images = images.to(device) targets = targets.to(device) loss_dict = model(images, targets=targets) loss = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() loss.backward() optimizer.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time) # log step if iteration % args.log_step == 0: eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) logger.info( meters.delimiter.join([ "iter: {iter:06d}", "lr: {lr:.5f}", '{meters}', "eta: {eta}", 'mem: {mem}M', ]).format( iter=iteration, lr=optimizer.param_groups[0]['lr'], meters=str(meters), eta=eta_string, mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0), )) if summary_writer: global_step = iteration summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step) for loss_name, loss_item in loss_dict_reduced.items(): summary_writer.add_scalar( 'losses/{}'.format(loss_name), loss_item, global_step=global_step) summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step) # save step if iteration % args.save_step == 0: checkpointer.save("model_{:06d}".format(iteration), **arguments) # eval step if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter: # if True: eval_results = do_evaluation(cfg, model, distributed=args.distributed, iteration=iteration) if dist_util.get_rank() == 0 and summary_writer: for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST): write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration) model.train() # *IMPORTANT*: change to train mode after eval. checkpointer.save("model_final", **arguments) # compute training time total_training_time = int(time.time() - start_training_time) total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / max_iter)) return model
def shortcut_prune(cfg, model, pruned_cfg, file, max, percent, quick, weight_path):
    obtain_num_parameters = lambda model: sum([param.nelement() for param in model.parameters()])
    origin_nparameters = obtain_num_parameters(model)
    origin_size = model_size(model)
    # The shortcut pruning used here follows
    # https://github.com/SpursLipu/YOLOv3-ModelCompression-MultidatasetTraining-Multibackbone/blob/4516d76ba89b561983babd679543135484e7e9ac/slim_prune.py
    CBL_idx, Conv_idx, prune_idx, _, _ = parse_module_defs(model.backbone.module_defs)
    # Copy the alpha (BN gamma) parameters of every BN layer to be pruned into the bn_weights list
    bn_weights = gather_bn_weights(model.backbone.module_list, prune_idx)
    # torch.sort returns a pair: the sorted values and the indices they came from
    sorted_bn = torch.sort(bn_weights)[0]
    thresh_index = int(len(bn_weights) * percent)
    thresh = sorted_bn[thresh_index].cuda()
    print(f'Global Threshold should be less than {thresh:.9f}.')
    predictor_channels = list(cfg.MODEL.BACKBONE.OUT_CHANNELS)
    # Get the number of kernels kept per layer, the corresponding masks, and the resulting head channel counts
    num_filters, filters_mask, predictor_channels = obtain_filters_mask(
        model.backbone, thresh, CBL_idx, prune_idx, predictor_channels, max)
    # CBLidx2mask stores, for every layer in CBL_idx, the mask of its BN layer
    CBLidx2mask = {idx: mask for idx, mask in zip(CBL_idx, filters_mask)}
    CBLidx2filters = {idx: filters for idx, filters in zip(CBL_idx, num_filters)}
    for i in model.backbone.module_defs:
        if i['type'] == 'shortcut':
            i['is_access'] = False
    print('merge the mask of layers connected to shortcut!')
    merge_mask(model.backbone, CBLidx2mask, CBLidx2filters)
    prune_and_eval(model, CBL_idx, CBLidx2mask, cfg)
    for i in CBLidx2mask:
        CBLidx2mask[i] = CBLidx2mask[i].clone().cpu().numpy()
    if quick == 0:
        print('Actual pruning ---->')
        pruned_model = prune_model_keep_size(cfg, model, prune_idx, CBL_idx, CBLidx2mask)
        if max == 0:
            with torch.no_grad():
                eval = do_evaluation(cfg, pruned_model, distributed=False)
            # For max pruning this figure is not accurate, since the trailing layers have not been cut off yet
            print('after prune_model_keep_size mAP is {}'.format(eval[0]['metrics']))
        # Take the original model's module_defs and modify the kernel counts in those defs
        compact_module_defs = deepcopy(model.backbone.module_defs)
        # CBL_idx index after which every layer is dropped (in practice, everything past the prediction
        # layer closest to the first all-zero BN-bias layer is cut); only used for max pruning
        prune_after = -1
        if max == 1:
            new_predictor_channels = []
            for idx in CBL_idx:
                if model.backbone.module_defs[idx]['feature'] == 'linear' or model.backbone.module_defs[idx]['feature'] == 'l2_norm':
                    i = int(model.backbone.module_defs[idx]['feature_idx'])
                    if predictor_channels[i] != -1:
                        new_predictor_channels.append(predictor_channels[i])
                        if i + 1 < len(predictor_channels):
                            if predictor_channels[i + 1] == -1:
                                prune_after = idx
                                break
                        if i + 1 == len(predictor_channels):
                            break
                elif model.backbone.module_defs[idx + 1]['type'] == 'shortcut' and model.backbone.module_defs[idx + 1]['feature'] == 'linear':
                    i = int(model.backbone.module_defs[idx + 1]['feature_idx'])
                    # The first shortcut connected to the head is never pruned away
                    new_predictor_channels.append(predictor_channels[i])
            predictor_channels = new_predictor_channels
        for idx, num in zip(CBL_idx, num_filters):
            assert compact_module_defs[idx]['type'] == 'convolutional'
            if idx == prune_after + 1 and prune_after != -1:
                compact_module_defs[idx]['filters'] = '-1'  # drop this layer and everything after it
                break
            else:
                compact_module_defs[idx]['filters'] = str(num)
        write_cfg(pruned_cfg, compact_module_defs)
        print(f'Config file has been saved: {pruned_cfg}')
        cfg.MODEL.BACKBONE.OUT_CHANNELS = tuple(predictor_channels)
        print(f'PRUNED_MODEL.BACKBONE.OUT_CHANNELS:{cfg.MODEL.BACKBONE.OUT_CHANNELS}')
        cfg.MODEL.BACKBONE.CFG = pruned_cfg
        # Building the model would normally load pretrained weights; they are not needed here because
        # the old weights no longer match the pruned channel counts
        cfg.MODEL.BACKBONE.PRETRAINED = False
        compact_model = build_detection_model(cfg)
        # print(compact_model)
        device = torch.device(cfg.MODEL.DEVICE)
        compact_model.to(device)
        init_weights_from_loose_model(compact_model, pruned_model, CBL_idx, Conv_idx, CBLidx2mask, prune_after)
        compact_nparameters = obtain_num_parameters(compact_model)
        compact_size = model_size(compact_model)
        random_input = torch.rand((16, 3, cfg.INPUT.IMAGE_SIZE, cfg.INPUT.IMAGE_SIZE)).to(device)
        pruned_forward_time = obtain_avg_forward_time(random_input, pruned_model)
        compact_forward_time = obtain_avg_forward_time(random_input, compact_model)
        # print(compact_model)
        with torch.no_grad():
            eval = do_evaluation(cfg, compact_model, distributed=False)
        print('Final pruned model mAP is {}'.format(eval[0]['metrics']))
        metric_table = [
            ["Metric", "Before", "After"],
            ["Parameters(M)", f"{origin_nparameters/(1024*1024)}", f"{compact_nparameters/(1024*1024)}"],
            ["Model size (MB)", f"{origin_size}", f"{compact_size}"],
            ["Inference(ms)", f'{pruned_forward_time*1000/16:.4f}', f'{compact_forward_time*1000/16:.4f}'],  # bs=16
        ]
        print(AsciiTable(metric_table).table)
        print(f'Compression ratio: {(origin_nparameters-compact_nparameters)/origin_nparameters}')
        file.write(f'PRUNED_MODEL.BACKBONE.OUT_CHANNELS:{cfg.MODEL.BACKBONE.OUT_CHANNELS}' + '\n')
        file.write(AsciiTable(metric_table).table + '\n')
        file.write(f'Compression ratio: {(origin_nparameters-compact_nparameters)/origin_nparameters}' + '\n')
        file.close()
        torch.save(compact_model.state_dict(), weight_path)
        print(f'Compact model has been saved.')
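obtain_avg_forward_time is referenced in shortcut_prune above but not defined in this listing; the sketch below is only an assumption of what such a timing helper might look like, not the project's actual implementation.

import time
import torch

def obtain_avg_forward_time(x, model, repeat=16):
    # Hypothetical helper: average forward-pass latency (seconds per batch) over `repeat` runs.
    model.eval()
    with torch.no_grad():
        torch.cuda.synchronize()
        start = time.time()
        for _ in range(repeat):
            _ = model(x)
        torch.cuda.synchronize()
    return (time.time() - start) / repeat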
def do_train(
    cfg: CfgNode,
    model: SSDDetector,
    data_loader: DataLoader,
    optimizer: SGD,
    scheduler: MultiStepLR,
    checkpointer,
    device: device,
    arguments,
    args: Namespace,
    output_dir: Path,
    model_manager: Dict[str, Any],
) -> SSDDetector:
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()
    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(logdir=output_dir / "logs")
    else:
        summary_writer = None
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    logger.info("MAX_ITER: {}".format(max_iter))
    # GB: 2019-09-08:
    # For rescaling tests, do an eval before fine-tuning-training, so we know what
    # the eval results are before any weights are updated:
    # do_evaluation(
    #     cfg,
    #     model,
    #     distributed=args.distributed,
    #     iteration=0,
    # )
    # model.train()  # *IMPORTANT*: change to train mode after eval.
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        # TODO: Print learning rate:
        iteration = iteration + 1
        arguments["iteration"] = iteration
        scheduler.step()
        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss = sum(loss for loss in loss_dict.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    "{meters}",
                    "eta: {eta}",
                    "mem: {mem}M",
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]["lr"],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                ))
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar("losses/total_loss", losses_reduced, global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar(
                        "losses/{}".format(loss_name),
                        loss_item,
                        global_step=global_step,
                    )
                summary_writer.add_scalar("lr", optimizer.param_groups[0]["lr"], global_step=global_step)
        # This project doesn't use epochs, it does something with batch samplers
        # instead, so there is only a concept of "iteration". For now hardcode epoch as
        # zero to put into file name:
        epoch = 0
        save_name = f"ssd{cfg.INPUT.IMAGE_SIZE}-vgg_{cfg.DATASETS.TRAIN[0]}_0_{epoch}_{iteration:06d}"
        model_path = Path(output_dir) / f"{save_name}.pth"
        # Above if block would be replaced by this:
        if iteration % args.save_step == 0:
            checkpointer.save(save_name, **arguments)
        # Do eval when training, to trace the mAP changes and see whether or not
        # performance improved
        if (args.eval_step > 0 and iteration % args.eval_step == 0
                and not iteration == max_iter):
            eval_results = do_evaluation(
                cfg,
                model,
                distributed=args.distributed,
                iteration=iteration,
            )
            do_best_model_checkpointing(cfg, model_path, eval_results, model_manager, logger)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST):
                    write_metric(
                        eval_result["metrics"],
                        "metrics/" + dataset,
                        summary_writer,
                        iteration,
                    )
            model.train()  # *IMPORTANT*: change to train mode after eval.
        if iteration % args.save_step == 0:
            remove_extra_checkpoints(output_dir, [model_path], logger)
    checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
def do_train(cfg, model, data_loader, optimizer, scheduler, device, args, val_sets_dict=None):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training")
    model.train()
    save_to_disk = distributed_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(log_dir=cfg.OUTPUT_DIR)
        tf_writer = tf.compat.v1.summary.FileWriter(cfg.OUTPUT_DIR)
    else:
        summary_writer = None
    if cfg.DATASETS.DG:
        dataloaders = data_loader
        max_iter = len(data_loader[0])
        dataiters = [iter(dataloader) for dataloader in dataloaders]
    else:
        max_iter = len(data_loader)
        data_loader = iter(data_loader)
    start_training_time = time.time()
    trained_time = 0
    tic = time.time()
    end = time.time()
    if args.return_best:
        best_map = 0
    for iteration in range(scheduler.last_epoch, max_iter):
        if cfg.DATASETS.DG:
            # domain generalization settings
            # we need to read images from different sources
            images = torch.ones(cfg.SOLVER.BATCH_SIZE * len(dataloaders), 3, cfg.INPUT.IMAGE_SIZE, cfg.INPUT.IMAGE_SIZE)
            for j in range(len(dataloaders)):
                if cfg.MODEL.SELF_SUPERVISED:
                    d_images, d_boxes, d_labels, d_j_images, d_j_index, d_orig_boxes, d_orig_labels = next(dataiters[j])
                else:
                    d_images, d_boxes, d_labels, d_orig_boxes, d_orig_labels = next(dataiters[j])
                start_bs = cfg.SOLVER.BATCH_SIZE * j
                end_bs = start_bs + cfg.SOLVER.BATCH_SIZE
                images[start_bs:end_bs, :, :, :] = d_images
                if j == 0:
                    boxes = d_boxes
                    labels = d_labels
                    orig_boxes = d_orig_boxes
                    orig_labels = d_orig_labels
                    if cfg.MODEL.SELF_SUPERVISED:
                        j_images = d_j_images
                        j_index = d_j_index
                else:
                    boxes = torch.cat((boxes, d_boxes))
                    labels = torch.cat((labels, d_labels))
                    orig_boxes = torch.cat((orig_boxes, d_orig_boxes))
                    orig_labels = torch.cat((orig_labels, d_orig_labels))
                    if cfg.MODEL.SELF_SUPERVISED:
                        j_images = torch.cat((j_images, d_j_images))
                        j_index = torch.cat((j_index, d_j_index))
        else:
            if cfg.MODEL.SELF_SUPERVISED:
                images, boxes, labels, j_images, j_index, orig_boxes, orig_labels = next(data_loader)
            else:
                images, boxes, labels, orig_boxes, orig_labels = next(data_loader)
        # it is not a problem if we increment iteration because it will be reset in the loop
        iteration = iteration + 1
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        loss_dict = model(images, targets=(boxes, labels))
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss = sum(loss for loss in loss_dict.values())
        # loss.backward() becomes:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if cfg.MODEL.SELF_SUPERVISED:
            j_images = j_images.to(device)
            j_index = j_index.to(device)
            loss_dict_j = model(j_images, targets=j_index, auxiliary_task=True)
            loss_dict_reduced_j = reduce_loss_dict(loss_dict_j)
            losses_reduced_j = sum(loss for loss in loss_dict_reduced_j.values())
            loss_j = sum(loss for loss in loss_dict_j.values())
            # apply reduction factor for auxiliary loss
            loss_j = loss_j * cfg.MODEL.SELF_SUPERVISOR.WEIGHT
            # loss.backward() becomes:
            with amp.scale_loss(loss_j, optimizer) as scaled_loss:
                scaled_loss.backward()
            # append this loss to the dictionary of losses
            loss_dict.update(loss_dict_j)
            losses_reduced += losses_reduced_j
        optimizer.step()
        scheduler.step()
        trained_time += time.time() - end
        end = time.time()
        if iteration % args.log_step == 0:
            eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
            log_str = [
                "Iter: {:06d}, Lr: {:.5f}, Cost: {:.2f}s, Eta: {}".format(
                    iteration, optimizer.param_groups[0]['lr'], time.time() - tic,
                    str(datetime.timedelta(seconds=eta_seconds))),
                "total_loss: {:.3f}".format(losses_reduced.item())
            ]
            for loss_name, loss_item in loss_dict_reduced.items():
                log_str.append("{}: {:.3f}".format(loss_name, loss_item.item()))
            log_str = ', '.join(log_str)
            logger.info(log_str)
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step)
                if cfg.MODEL.SELF_SUPERVISED:
                    _log_images_tensorboard(cfg, global_step, images, orig_boxes, orig_labels, summary_writer, j_images=j_images)
                else:
                    _log_images_tensorboard(cfg, global_step, images, orig_boxes, orig_labels, summary_writer)
                # for tag, value in model.named_parameters():
                #     tag = tag.replace('.', '/')
                #     if 'ss_classifier' in tag:
                #         print(tag, value)
                # _log_network_params(tf_writer, model, global_step)
            tic = time.time()
        if save_to_disk and iteration % args.save_step == 0:
            model_path = os.path.join(cfg.OUTPUT_DIR,
                                      "ssd{}_vgg_iteration_{:06d}.pth".format(cfg.INPUT.IMAGE_SIZE, iteration))
            save_training_checkpoint(logger, model, scheduler, optimizer, model_path)
        # Do eval when training, to trace the mAP changes and see whether or not performance improved
        # if args.return_best = True the model returned should be the one that gave best performances on the val set
        if args.eval_step > 0 and iteration % args.eval_step == 0 and (not iteration == max_iter or args.return_best):
            dataset_metrics = do_evaluation(cfg, model, cfg.OUTPUT_DIR, distributed=args.distributed, datasets_dict=val_sets_dict)
            model.train()
            if args.distributed and not distributed_util.is_main_process():
                continue
            avg_map = _compute_avg_map(dataset_metrics)
            if args.return_best:
                if avg_map > best_map:
                    best_map = avg_map
                    logger.info("With iteration {} passed the best! New best avg map: {:4f}".format(iteration, best_map))
                    model_path = os.path.join(cfg.OUTPUT_DIR, "ssd{}_vgg_best.pth".format(cfg.INPUT.IMAGE_SIZE))
                    _save_model(logger, model, model_path)
                else:
                    logger.info("With iteration {} the best has not been reached. Best avg map: {:4f}, Current avg mAP: {:4f}".format(iteration, best_map, avg_map))
            # logging
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar("val_avg_map", avg_map, global_step=global_step)
                for dataset_name, metrics in dataset_metrics.items():
                    for metric_name, metric_value in metrics.get_printable_metrics().items():
                        summary_writer.add_scalar('/'.join(['val', dataset_name, metric_name]), metric_value, global_step=global_step)
    if save_to_disk:
        model_path = os.path.join(cfg.OUTPUT_DIR, "ssd{}_vgg_final.pth".format(cfg.INPUT.IMAGE_SIZE))
        _save_model(logger, model, model_path)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter))
    if args.return_best:
        model.load(os.path.join(cfg.OUTPUT_DIR, "ssd{}_vgg_best.pth".format(cfg.INPUT.IMAGE_SIZE)))
    return model
def main(): parser = ArgumentParser( description="Single Shot MultiBox Detector Training With PyTorch") parser.add_argument( "--config-file", default="", metavar="FILE", help="config file name or path (relative to the configs/ folder) ", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--log_step", default=50, type=int, help="Print logs every log_step") parser.add_argument("--save_step", default=5000, type=int, help="Save checkpoint every save_step") parser.add_argument( "--eval_step", default=5000, type=int, help="Evaluate dataset every eval_step, disabled when eval_step < 0", ) parser.add_argument("--use_tensorboard", default=True, type=str2bool) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=REMAINDER, ) parser.add_argument( "--resume_experiment", default="None", dest="resume", type=str, help="Checkpoint state_dict file to resume training from", ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 args.num_gpus = num_gpus if torch.cuda.is_available(): # This flag allows you to enable the inbuilt cudnn auto-tuner to # find the best algorithm to use for your hardware. torch.backends.cudnn.benchmark = True else: cfg.MODEL.DEVICE = "cpu" if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() eman = ExperimentManager("ssd") output_dir = eman.get_output_dir() args.config_file = str( Path(__file__).parent / "configs" / args.config_file) cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.OUTPUT_DIR = str(output_dir) cfg.freeze() eman.start({"cfg": cfg, "args": vars(args)}) # We use our own output dir, set by ExperimentManager: # if cfg.OUTPUT_DIR: # mkdir(cfg.OUTPUT_DIR) logger = setup_logger("SSD", dist_util.get_rank(), output_dir / "logs") logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) logger.info(f"Output dir: {output_dir}") model_manager = {"best": None, "new": None} model = train(cfg, args, output_dir, model_manager) if not args.skip_test: logger.info("Start evaluating...") torch.cuda.empty_cache() # speed up evaluating after training finished eval_results = do_evaluation( cfg, model, distributed=args.distributed, ) do_best_model_checkpointing( cfg, output_dir / "model_final.pth", eval_results, model_manager, logger, is_final=True, ) eman.mark_dir_if_complete()
def main(): parser = argparse.ArgumentParser( description='Single Shot MultiBox Detector Training With PyTorch') parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument('--log_step', default=10, type=int, help='Print logs every log_step') parser.add_argument('--save_step', default=2500, type=int, help='Save checkpoint every save_step') parser.add_argument( '--eval_step', default=2500, type=int, help='Evaluate dataset every eval_step, disabled when eval_step < 0') parser.add_argument('--use_tensorboard', default=True, type=str2bool) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 args.num_gpus = num_gpus if torch.cuda.is_available(): # This flag allows you to enable the inbuilt cudnn auto-tuner to # find the best algorithm to use for your hardware. torch.backends.cudnn.benchmark = True if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() # Train distance regression network train_distance_regr() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() if cfg.OUTPUT_DIR: mkdir(cfg.OUTPUT_DIR) logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args) if not args.skip_test: logger.info('Start evaluating...') torch.cuda.empty_cache() # speed up evaluating after training finished do_evaluation(cfg, model, distributed=args.distributed)
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer, device, arguments, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()
    # Get the layers that are to be pruned
    if cfg.PRUNE.TYPE != 'no':
        if hasattr(model, 'module'):
            backbone = model.module.backbone
        else:
            backbone = model.backbone
        if cfg.PRUNE.TYPE == 'normal':
            logger.info('normal sparse training')
            _, _, prune_idx = normal_prune.parse_module_defs(backbone.module_defs)
        elif cfg.PRUNE.TYPE == 'shortcut':
            logger.info('shortcut sparse training')
            _, _, prune_idx, _, _ = shortcut_prune.parse_module_defs(backbone.module_defs)
    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        try:
            from torch.utils.tensorboard import SummaryWriter
        except ImportError:
            from tensorboardX import SummaryWriter
        summary_writer = SummaryWriter(log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration
        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)
        optimizer.zero_grad()
        loss.backward()
        # Apply sparsity to the gamma parameters of the layers to be pruned
        if cfg.PRUNE.TYPE != 'no':
            if hasattr(model, 'module'):
                bn_sparse.updateBN(model.module.backbone.module_list, cfg.PRUNE.SR, prune_idx)
            else:
                # print(model.backbone.module_list)
                bn_sparse.updateBN(model.backbone.module_list, cfg.PRUNE.SR, prune_idx)
        optimizer.step()
        scheduler.step()
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                ))
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step)
        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)
        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg, model, distributed=False, iteration=iteration)  # single-GPU evaluation
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration)
            model.train()  # *IMPORTANT*: change to train mode after eval.
    checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter))
    return model
def do_train(cfg, model, data_loader, optimizer, checkpointer, arguments, scheduler): logger = logging.getLogger("SSD.trainer") logger.info("Start training ...") meters = MetricLogger() model.train() summary_writer = torch.utils.tensorboard.SummaryWriter( log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs')) max_iter = len(data_loader) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() scaler = torch.cuda.amp.GradScaler() print(model) for iteration, (images, targets, _) in enumerate(data_loader, start_iter): iteration = iteration + 1 arguments["iteration"] = iteration images = torch_utils.to_cuda(images) targets = torch_utils.to_cuda(targets) # Casts operations to mixed precision with torch.cuda.amp.autocast(): loss_dict = model(images.half(), targets=targets) loss = sum(loss for loss in loss_dict.values()) meters.update(total_loss=loss, **loss_dict) optimizer.zero_grad() # Scales the loss, and calls backward() # to create scaled gradients scaler.scale(loss).backward() # loss.backward() # Unscales gradients and calls # or skips optimizer.step() scaler.step(optimizer) # optimizer.step(iteration) # Updates the scale for next iteration scaler.update() if iteration > 5000: scheduler.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time) if iteration % cfg.LOG_STEP == 0: eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) lr = optimizer.param_groups[0]['lr'] to_log = [ f"iter: {iteration:06d}", f"lr: {lr:.5f}", f'{meters}', f"eta: {eta_string}", ] if torch.cuda.is_available(): mem = round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0) to_log.append(f'mem: {mem}M') logger.info(meters.delimiter.join(to_log)) global_step = iteration summary_writer.add_scalar('losses/total_loss', loss, global_step=global_step) for loss_name, loss_item in loss_dict.items(): summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step) summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step) if iteration % cfg.MODEL_SAVE_STEP == 0: checkpointer.save("model_{:06d}".format(iteration), **arguments) if cfg.EVAL_STEP > 0 and iteration % cfg.EVAL_STEP == 0: eval_results = do_evaluation(cfg, model, iteration=iteration) for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST): write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration) model.train() # *IMPORTANT*: change to train mode after eval. if iteration >= cfg.SOLVER.MAX_ITER: break checkpointer.save("model_final", **arguments) # compute training time total_training_time = int(time.time() - start_training_time) total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / max_iter)) return model
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer, device, arguments, args): logger = logging.getLogger("SSD.trainer") logger.info("Start training ...") meters = MetricLogger() model.train() save_to_disk = dist_util.get_rank() == 0 if args.use_tensorboard and save_to_disk: import tensorboardX summary_writer = tensorboardX.SummaryWriter( log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs')) else: summary_writer = None max_iter = len(data_loader) start_iter = arguments["iteration"] start_training_time = time.time() end = time.time() for iteration, (images, targets, _, boxes_norm, labels_norm) in enumerate(data_loader, start_iter): iteration = iteration + 1 arguments["iteration"] = iteration scheduler.step() images = images.to(device) targets = targets.to(device) #+++++++++++++++++++++++++++++++++++++++++++++++ Mask GT ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mask_t = np.zeros((images.shape[0], 81, 64, 64)) mask_t[:, 0, :, :] = np.ones((1, 1, 64, 64)) for i in range(images.shape[0]): for L, B_norm in zip(labels_norm[i], boxes_norm[i]): xmin = int(B_norm[0] * 64) ymin = int(B_norm[1] * 64) xmax = int(B_norm[2] * 64) ymax = int(B_norm[3] * 64) lab = int(L) mask_t[i, 0, ymin:ymax, xmin:xmax] = 0.0 mask_t[i, lab, ymin:ymax, xmin:xmax] = 1.0 mask_t = Variable(torch.from_numpy((mask_t).astype(np.float32))).cuda() #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ loss_dict = model(images, targets=(targets, mask_t)) loss = sum(loss for loss in loss_dict.values()) # reduce losses over all GPUs for logging purposes loss_dict_reduced = reduce_loss_dict(loss_dict) losses_reduced = sum(loss for loss in loss_dict_reduced.values()) meters.update(total_loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() loss.backward() optimizer.step() batch_time = time.time() - end end = time.time() meters.update(time=batch_time) if iteration % args.log_step == 0: eta_seconds = meters.time.global_avg * (max_iter - iteration) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) logger.info( meters.delimiter.join([ "iter: {iter:06d}", "lr: {lr:.5f}", '{meters}', "eta: {eta}", 'mem: {mem}M', ]).format( iter=iteration, lr=optimizer.param_groups[0]['lr'], meters=str(meters), eta=eta_string, mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0), )) if summary_writer: global_step = iteration summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step) for loss_name, loss_item in loss_dict_reduced.items(): summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step) summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step) if iteration % args.save_step == 0: checkpointer.save("model_{:06d}".format(iteration), **arguments) if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter: eval_results = do_evaluation(cfg, model, distributed=args.distributed, iteration=iteration) if dist_util.get_rank() == 0 and summary_writer: for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST): write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration) model.train() # *IMPORTANT*: change to train mode after eval. 
checkpointer.save("model_final", **arguments) # compute training time total_training_time = int(time.time() - start_training_time) total_time_str = str(datetime.timedelta(seconds=total_training_time)) logger.info("Total training time: {} ({:.4f} s / it)".format( total_time_str, total_training_time / max_iter)) return model
def do_train(cfg, model, data_loader, optimizer, scheduler, checkpointer, device, arguments, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()
    # Put the model in train() mode so that its parameters can be updated
    model.train()
    save_to_disk = dist_util.get_rank() == 0
    # Tensorboard logging of the training process
    if args.use_tensorboard and save_to_disk:
        import tensorboardX
        summary_writer = tensorboardX.SummaryWriter(log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None
    # Size of the dataloader; training runs for the number of iterations given in the config file.
    # arguments = {"iteration": 0}: as currently understood, this supports resuming from a checkpoint
    # and records the current iteration count
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    # Start timing
    start_training_time = time.time()
    end = time.time()
    # Within one pass, the data length is the size of the dataloader, i.e. the dataset split by batch size.
    # The dataset returns the images and their targets, i.e. (c+4)k values: k prior boxes, c classes,
    # plus the box coordinates
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        # print(iteration)
        # print(targets)
        iteration = iteration + 1
        arguments["iteration"] = iteration
        images = images.to(device)
        targets = targets.to(device)
        # Feed the inputs and targets to the model; the model returns the losses
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        # This is a multi-GPU operation and can be ignored for now
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)
        # Standard back-propagation step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Record timing, write logs, save the model and other training records; this part is essentially
        # boilerplate, the main thing is obtaining the trained model
        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            logger.info(
                meters.delimiter.join([
                    "iter: {iter:06d}",
                    "lr: {lr:.5f}",
                    '{meters}',
                    "eta: {eta}",
                    'mem: {mem}M',
                ]).format(
                    iter=iteration,
                    lr=optimizer.param_groups[0]['lr'],
                    meters=str(meters),
                    eta=eta_string,
                    mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                ))
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step)
        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)
        # The main remaining issue is this part: running validation with the model raises errors,
        # the evaluation files have bugs
        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg, model, distributed=args.distributed, iteration=iteration)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration)
            model.train()  # *IMPORTANT*: change to train mode after eval.
    checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter))
    return model
def do_train_with_style(cfg, model, data_loader, style_loader, optimizer, scheduler, checkpointer, device, arguments, args):
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()

    model.train()
    save_to_disk = dist_util.get_rank() == 0
    if args.use_tensorboard and save_to_disk:
        try:
            from torch.utils.tensorboard import SummaryWriter
        except ImportError:
            from tensorboardX import SummaryWriter
        summary_writer = SummaryWriter(
            log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs'))
    else:
        summary_writer = None

    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    start_training_time = time.time()
    end = time.time()

    # prepare AdaIN models
    default_path = '/content/drive/MyDrive/DA_detection/models/'
    vgg_path = default_path + 'vgg_normalized.pth'
    if 'VGG_PATH' in os.environ:
        vgg_path = os.environ['VGG_PATH']
    decoder_path = default_path + 'decoder.pth'
    if 'DECODER_PATH' in os.environ:
        decoder_path = os.environ['DECODER_PATH']
    # DEBUG: print('AdaIN > models loaded')

    for iteration, (images, targets, ids) in enumerate(data_loader, start_iter):
        iteration = iteration + 1
        arguments["iteration"] = iteration

        # AdaIN routine: restyle the batch with a random style image (triggered when random() > args.p).
        random.seed()
        styles = next(iter(style_loader))
        # DEBUG: print('AdaIN > begin new batch')
        if random.random() > args.p:
            apply_style_transfer(vgg_path, decoder_path, images, styles[0], args.p)
        # DEBUG: print('AdaIN > end batch')

        images = images.to(device)
        targets = targets.to(device)
        loss_dict = model(images, targets=targets)
        loss = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(total_loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time)
        if iteration % args.log_step == 0:
            eta_seconds = meters.time.global_avg * (max_iter - iteration)
            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
            if device.type == "cuda":  # only report GPU memory when actually running on CUDA
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                        'mem: {mem}M',
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                        mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0),
                    ))
            else:
                logger.info(
                    meters.delimiter.join([
                        "iter: {iter:06d}",
                        "lr: {lr:.5f}",
                        '{meters}',
                        "eta: {eta}",
                    ]).format(
                        iter=iteration,
                        lr=optimizer.param_groups[0]['lr'],
                        meters=str(meters),
                        eta=eta_string,
                    ))
            if summary_writer:
                global_step = iteration
                summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step)
                for loss_name, loss_item in loss_dict_reduced.items():
                    summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step)
                summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step)

        if iteration % args.save_step == 0:
            checkpointer.save("model_{:06d}".format(iteration), **arguments)

        if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter:
            eval_results = do_evaluation(cfg, model, distributed=args.distributed, iteration=iteration)
            if dist_util.get_rank() == 0 and summary_writer:
                for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST):
                    write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration)
            model.train()  # *IMPORTANT*: change to train mode after eval.

    checkpointer.save("model_final", **arguments)
    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
    return model
def normal_prune(cfg, model, pruned_cfg, file, regular, max, percent, quick, weight_path):
    obtain_num_parameters = lambda model: sum([param.nelement() for param in model.parameters()])
    origin_nparameters = obtain_num_parameters(model)
    origin_size = model_size(model)
    CBL_idx, Conv_idx, prune_idx = parse_module_defs(model.backbone.module_defs)

    # Copy the gamma (scale) parameters of every BN layer selected for pruning into bn_weights.
    bn_weights = gather_bn_weights(model.backbone.module_list, prune_idx)
    # torch.sort returns (sorted values, indices of the sorted values); keep only the values.
    sorted_bn = torch.sort(bn_weights)[0]

    # Upper bound of the threshold that avoids pruning away all channels of a layer:
    # the smallest of the per-layer maxima of |gamma|.
    # highest_thre = []
    # for idx in prune_idx:
    #     # .item() extracts the scalar value from a tensor
    #     highest_thre.append(model.backbone.module_list[idx][1].weight.data.abs().max().item())
    # highest_thre = min(highest_thre)
    # # Find the prune ratio corresponding to highest_thre
    # percent_limit = (sorted_bn == highest_thre).nonzero().item() / len(bn_weights)
    # print(f'Threshold should be less than {highest_thre:.4f}.')
    # print(f'The corresponding prune ratio should less than {percent_limit:.3f}.')
    # This limit only prevents normal pruning from emptying a layer; if at least one channel per layer is kept, it is unnecessary.

    thre = prune_and_eval(model, sorted_bn, prune_idx, percent, cfg)

    if quick == 0:
        print('Performing the actual pruning --->')
        predictor_channels = list(cfg.MODEL.BACKBONE.OUT_CHANNELS)
        # Number of kept filters and per-layer masks, plus the resulting head (predictor) channel counts.
        num_filters, filters_mask, predictor_channels = obtain_filters_mask(
            model.backbone, thre, CBL_idx, prune_idx, predictor_channels, regular, max)
        # CBLidx2mask maps each BN layer in CBL_idx to its channel mask.
        CBLidx2mask = {idx: mask for idx, mask in zip(CBL_idx, filters_mask)}
        pruned_model = prune_model_keep_size(cfg, model, prune_idx, CBL_idx, CBLidx2mask, Conv_idx)

        if max == 0:
            with torch.no_grad():
                eval = do_evaluation(cfg, pruned_model, distributed=False)
            print('after prune_model_keep_size mAP is {}'.format(eval[0]['metrics']))
            # For max pruning this number is not reliable, since the trailing layers have not been cut yet.

        # Take the original module_defs and update the filter count of each pruned layer.
        compact_module_defs = deepcopy(model.backbone.module_defs)
        # CBL index after which all layers are dropped (max pruning only): everything behind the prediction
        # layer closest to the first BN layer whose gammas are all zero is removed.
        prune_after = -1
        if max == 1:
            new_predictor_channels = []
            for idx in CBL_idx:
                if model.backbone.module_defs[idx]['feature'] == 'linear' or model.backbone.module_defs[idx]['feature'] == 'l2_norm':
                    i = int(model.backbone.module_defs[idx]['feature_idx'])
                    if predictor_channels[i] != -1:
                        new_predictor_channels.append(predictor_channels[i])
                        if i + 1 < len(predictor_channels):
                            if predictor_channels[i + 1] == -1:
                                prune_after = idx
                                break
                        if i + 1 == len(predictor_channels):
                            break
                elif model.backbone.module_defs[idx + 1]['type'] == 'shortcut' and model.backbone.module_defs[idx + 1]['feature'] == 'linear':
                    i = int(model.backbone.module_defs[idx + 1]['feature_idx'])
                    new_predictor_channels.append(predictor_channels[i])  # the first shortcut connected to the head is never pruned away
            predictor_channels = new_predictor_channels

        for idx, num in zip(CBL_idx, num_filters):
            assert compact_module_defs[idx]['type'] == 'convolutional'
            if idx == prune_after + 1 and prune_after != -1:
                compact_module_defs[idx]['filters'] = '-1'  # drop this layer and everything after it
                break
            else:
                compact_module_defs[idx]['filters'] = str(num)

        write_cfg(pruned_cfg, compact_module_defs)
        print(f'Config file has been saved: {pruned_cfg}')

        cfg.MODEL.BACKBONE.OUT_CHANNELS = tuple(predictor_channels)
        print(f'PRUNED_MODEL.BACKBONE.OUT_CHANNELS:{cfg.MODEL.BACKBONE.OUT_CHANNELS}')
        cfg.MODEL.BACKBONE.CFG = pruned_cfg
        # Building the model would normally load pretrained weights; skip that here because the old
        # weights no longer match the pruned channel counts.
        cfg.MODEL.BACKBONE.PRETRAINED = False
        compact_model = build_detection_model(cfg)
        device = torch.device(cfg.MODEL.DEVICE)
        compact_model.to(device)
        init_weights_from_loose_model(compact_model, pruned_model, CBL_idx, Conv_idx, CBLidx2mask, prune_after)

        compact_nparameters = obtain_num_parameters(compact_model)
        compact_size = model_size(compact_model)

        random_input = torch.rand((16, 3, cfg.INPUT.IMAGE_SIZE, cfg.INPUT.IMAGE_SIZE)).to(device)
        pruned_forward_time = obtain_avg_forward_time(random_input, pruned_model)
        compact_forward_time = obtain_avg_forward_time(random_input, compact_model)

        with torch.no_grad():
            eval = do_evaluation(cfg, compact_model, distributed=False)
        print('Final pruned model mAP is {}'.format(eval[0]['metrics']))

        metric_table = [
            ["Metric", "Before", "After"],
            ["Parameters(M)", f"{origin_nparameters / (1024 * 1024)}", f"{compact_nparameters / (1024 * 1024)}"],
            ["Model size(MB)", f"{origin_size}", f"{compact_size}"],
            ["Inference(ms)", f'{pruned_forward_time * 1000 / 16:.4f}', f'{compact_forward_time * 1000 / 16:.4f}'],  # bs=16
        ]
        print(AsciiTable(metric_table).table)
        print(f'Compression ratio: {(origin_nparameters - compact_nparameters) / origin_nparameters}')

        file.write(f'PRUNED_MODEL.BACKBONE.OUT_CHANNELS:{cfg.MODEL.BACKBONE.OUT_CHANNELS}' + '\n')
        file.write(AsciiTable(metric_table).table + '\n')
        file.write(f'Compression ratio: {(origin_nparameters - compact_nparameters) / origin_nparameters}' + '\n')
        file.close()
        torch.save(compact_model.state_dict(), weight_path)
        print('Compact model has been saved.')
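# The pruning threshold used above is just a percentile of the sorted |gamma| values collected from every
# prunable BN layer. A minimal sketch of that gather step, assuming module_list[idx][1] is the BatchNorm2d
# of a Conv-BN-activation block (an illustrative helper, not necessarily the repo's exact gather_bn_weights):
import torch

def gather_bn_weights_sketch(module_list, prune_idx):
    """Concatenate |gamma| of every BN layer selected for pruning into one 1-D tensor."""
    bn_weights = [module_list[idx][1].weight.data.abs().clone() for idx in prune_idx]
    return torch.cat(bn_weights)

# Usage: the global threshold for a given prune ratio is then a single indexed lookup, e.g.
#   sorted_bn = torch.sort(gather_bn_weights_sketch(model.backbone.module_list, prune_idx))[0]
#   thre = sorted_bn[int(len(sorted_bn) * percent)]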