def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group( backend="nccl", init_method="env://" ) synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() output_dir = cfg.OUTPUT_DIR if output_dir: mkdir(output_dir) logger = setup_logger("fcos_core", output_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args.local_rank, args.distributed) if not args.skip_test: run_test(cfg, model, args.distributed)
def validation(self, epoch): # import pdb; pdb.set_trace() print('Validation of epoch {}:'.format(epoch)) # if self.distributed: # model = model.module torch.cuda.empty_cache() # TODO check if it helps iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) # output_folders = [None] * len(cfg.DATASETS.TEST) # dataset_names = cfg.DATASETS.TEST # if cfg.OUTPUT_DIR: # for idx, dataset_name in enumerate(dataset_names): # output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) # mkdir(output_folder) # output_folders[idx] = output_folder dataset_name = cfg.DATASETS.TEST[0] output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) self.val_loader = make_val_loader(cfg) inference( self.model, self.val_loader, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def run_test(cfg, model, distributed): if distributed: model = model.module torch.cuda.empty_cache() # TODO check if it helps iou_types = ("bbox", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def run_test(cfg, model, distributed): model_test = {} if distributed: model_test["backbone"] = model["backbone"].module model_test["fcos"] = model["fcos"].module torch.cuda.empty_cache() # TODO check if it helps iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) dataset_name = cfg.DATASETS.TEST[0] if cfg.OUTPUT_DIR: output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) results = inference( model_test, data_loaders_val[0], dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize() results = all_gather(results) # import pdb; pdb.set_trace() return results
def run_test(cfg, model, distributed, test_epoch=None): if distributed: model = model.module torch.cuda.empty_cache() # TODO check if it helps iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference_result = inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize() # import pdb; pdb.set_trace() summaryStrs = get_neat_inference_result(inference_result[2][0]) # print('\n'.join(summaryStrs)) summaryStrFinal = '\n'.join(summaryStrs) summaryStrFinal = '\n\nEpoch: ' + str(test_epoch) + '\n' + summaryStrFinal # with open(output_folder+'/summaryStrs.txt', 'w') as f_summaryStrs: with open(output_folder+'/summaryStrs.txt', 'a') as f_summaryStrs: f_summaryStrs.write(summaryStrFinal)
def main(): parser = argparse.ArgumentParser(description="Test onnx models of FCOS") parser.add_argument( "--config-file", default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument( "--onnx-model", default="fcos_imprv_R_50_FPN_1x.onnx", metavar="FILE", help="path to the onnx model", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) # The onnx model can only be used with DATALOADER.NUM_WORKERS = 0 cfg.DATALOADER.NUM_WORKERS = 0 cfg.freeze() save_dir = "" logger = setup_logger("fcos_core", save_dir, get_rank()) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = ONNX_FCOS(args.onnx_model, cfg) model.to(cfg.MODEL.DEVICE) iou_types = ("bbox",) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm",) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints",) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=False) for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Inference") parser.add_argument( "--config-file", default= "/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", metavar="FILE", help="path to config file", ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 distributed = num_gpus > 1 if distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() save_dir = "" logger = setup_logger("fcos_core", save_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(cfg) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) iou_types = ("bbox", ) + ("segm", ) if cfg.MODEL.MASK_ON: iou_types = iou_types + ("segm", ) if cfg.MODEL.KEYPOINT_ON: iou_types = iou_types + ("keypoints", ) output_folders = [None] * len(cfg.DATASETS.TEST) dataset_names = cfg.DATASETS.TEST if cfg.OUTPUT_DIR: for idx, dataset_name in enumerate(dataset_names): output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) mkdir(output_folder) output_folders[idx] = output_folder data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) for output_folder, dataset_name, data_loader_val in zip( output_folders, dataset_names, data_loaders_val): inference( model, data_loader_val, dataset_name=dataset_name, iou_types=iou_types, box_only=False if cfg.MODEL.FCOS_ON or cfg.MODEL.SIPMASK_ON or cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, device=cfg.MODEL.DEVICE, expected_results=cfg.TEST.EXPECTED_RESULTS, expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, output_folder=output_folder, ) synchronize()
def main(): # 这个就是解析命令行参数,如上面的--config-file configs/fcos/fcos_imprv_R_50_FPN_1x.yaml parser = argparse.ArgumentParser( description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) # 这个参数是torch.distributed.launch传递过来的,我们设置位置参数来接受 # local_rank代表当前程序进程使用的GPU标号 parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() # 判断机器上GPU的数量,大于1时自动使用分布式训练 # WORLD_SIZE 由torch.distributed.launch.py产生 # 具体数值为 nproc_per_node*node(node就是主机数) num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() # 参数默认是在fcos_core/config/defaults.py中,其余由config_file,opts覆盖 cfg.merge_from_file(args.config_file) # 从yaml文件中读取参数 cfg.merge_from_list(args.opts) # 也可以从命令行参数重写 cfg.freeze() # 冻住参数,为了防止之后被不小心更改,cfg被传入train() # 可以在这里打印cfg看看,我以fcos_R_50_FPN_1x.yaml为例 output_dir = cfg.OUTPUT_DIR # 创建输出文件夹,存放一些日志信息 if output_dir: mkdir(output_dir) # 写入日志文件,包括GPU数量,系统环境,配置文件参数等 logger = setup_logger("fcos_core", output_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) # 这句话是下一个入口,关注train()方法,里面第一步就是构建模型 model = train(cfg, args.local_rank, args.distributed) if not args.skip_test: run_test(cfg, model, args.distributed)
def main(): # 解析命令行参数,例如--config-file parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") parser.add_argument( "--config-file", #配置文件 default="", metavar="FILE", help="path to config file", type=str, ) #此参数是通过torch.distributed.launch传递过来的,我们设置位置参数来接受 # local_rank代表当前程序进程使用的GPU标号 parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, #所有剩余的命令行参数都被收集到一个列表中 ) args = parser.parse_args() #判断机器上gpu的数量,大于1时自动使用分布式训练 #world_size是由torch.distributed.launch.py产生 # 具体数值为 nproc_per_node*node(node就是主机数) num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 #判断当前系统环境变量中是否有"WORLD_SIZE" 如果没有num_gpus=1 args.distributed = num_gpus > 1 #False if args.distributed: #False torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group\ ( backend="nccl", init_method="env://" ) synchronize() #yacs的具体用法 可以参考印象笔记 #参数默认在fcos_core/config_defaults.py中 其余参数由config_file opts覆盖 cfg.merge_from_file(args.config_file) #从yaml文件中读取参数 即configs/fcos/fcos_R_50_FPN_1x.yaml cfg.merge_from_list(args.opts) #也可以从命令行进行参数重写 cfg.freeze() #冻结参数 防止不小心被更改 cfg被传入train() output_dir = cfg.OUTPUT_DIR #输出模型路径 存放一些日志信息 if output_dir: mkdir(output_dir) #创建对应的输出路径 #写入日志文件 包括gpu数量,系统环境,配置文件参数等 logger = setup_logger("fcos_core", output_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args.local_rank, args.distributed) #local_rank=0 distributed=False if not args.skip_test: run_test(cfg, model, args.distributed)
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--device_ids", type=list, default=[0]) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "--use-tensorboard", dest="use_tensorboard", help="Use tensorboardX logger (Requires tensorboardX installed)", action="store_true", default=False) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() # set devices_ids according to num gpus num_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) args.device_ids = list(map(str, range(num_gpus))) # do not use torch.distributed args.distributed = False if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() output_dir = cfg.OUTPUT_DIR if output_dir: mkdir(output_dir) logger = setup_logger("fad_core", output_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args.local_rank, args.distributed, args.device_ids, use_tensorboard=args.use_tensorboard) if not args.skip_test: run_test(cfg, model, args.distributed)
def main(): parser = argparse.ArgumentParser( description="PyTorch Object Detection Training") parser.add_argument( "--config-file", default="", metavar="FILE", help="path to config file", type=str, ) parser.add_argument("--local_rank", type=int, default=0) parser.add_argument( "--skip-test", dest="skip_test", help="Do not test the final model", action="store_true", ) parser.add_argument( "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER, ) args = parser.parse_args() num_gpus = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 args.distributed = num_gpus > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) # add distance loss warmup iters cfg.SOLVER.MAX_ITER += cfg.MODEL.LABELENC.DISTANCE_LOSS_WARMUP_ITERS cfg.SOLVER.STEPS = tuple([ i + cfg.MODEL.LABELENC.DISTANCE_LOSS_WARMUP_ITERS for i in cfg.SOLVER.STEPS ]) cfg.freeze() output_dir = cfg.OUTPUT_DIR if output_dir: mkdir(output_dir) logger = setup_logger("fcos_core", output_dir, get_rank()) logger.info("Using {} GPUs".format(num_gpus)) logger.info(args) logger.info("Collecting env info (might take some time)") logger.info("\n" + collect_env_info()) logger.info("Loaded configuration file {}".format(args.config_file)) with open(args.config_file, "r") as cf: config_str = "\n" + cf.read() logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) model = train(cfg, args.local_rank, args.distributed) if not args.skip_test: run_test(cfg, model, args.distributed) if args.distributed: model = model.module if not args.distributed or dist.get_rank() == 0: label_encoding_function = model.label_encoding_function.state_dict() rpn = model.rpn.state_dict() saved_weights = { 'label_encoding_function': label_encoding_function, 'rpn': rpn } if model.roi_heads: roi_heads = model.roi_heads.state_dict() saved_weights.update({'roi_heads': roi_heads}) torch.save(saved_weights, os.path.join(cfg.OUTPUT_DIR, "label_encoding_function.pth")) logger.info("Successfully save label encoding function weights to " + \ os.path.join(cfg.OUTPUT_DIR, "label_encoding_function.pth")) synchronize()