Example #1
    def evaluate_single_aug(self, cand, local_rank):
        # Encode the augmentation candidate as a directory name, e.g. (1, 0, 2) -> "102"
        file_dir = ''.join(str(i) for i in cand)
        cfg.OUTPUT_DIR = os.path.join(self.log_dir, file_dir)

        mkdir(cfg.OUTPUT_DIR)

        output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
        self.logger.info("Saving config into: {}".format(output_config_path))
        # save overloaded model config in the output directory
        save_config(cfg, output_config_path)

        model, loss_scale_hist = train(cfg,
                                       local_rank,
                                       self.distributed,
                                       search=self.logger)

        results = run_test(cfg, model, self.distributed)
        results_scales = []
        if results is not None:
            results_bbox = results[0].results['bbox']
            results_scales = [
                results_bbox['APs'], results_bbox['APm'], results_bbox['APl']
            ]

        if self.distributed:
            loss_scale_hist = reduce_loss_scale(loss_scale_hist)

        return loss_scale_hist, results_scales
Example #2
def setup_env_and_logger(args, cfg):
    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    model_name = get_model_name(cfg, args.model_suffix)
    train_dir = os.path.join(args.train_dir, model_name)
    if train_dir:
        mkdir(train_dir)

    logger = setup_logger("siammot", train_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(train_dir, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    save_config(cfg, output_config_path)

    return train_dir, logger
Example #3
def main():
    parser = argparse.ArgumentParser(description="PyTorch Relation Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("image_retrieval_using_sg", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    model, test_result = train(cfg, args.local_rank, args.distributed, logger)
    evaluator(logger, test_result)
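
These launcher scripts are normally executed directly (or via torch.distributed.launch); the module-level entry-point guard is not shown in the snippets, and would typically be the standard Python idiom:

if __name__ == "__main__":
    main()
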
Example #4
def main(args):
    seed_torch()
    info = ulti.load_json()

    num_gpus = get_num_gpus()
    args.config_file = os.path.join(
        info['training_dir'], 'e2e_faster_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml')

    cfg.merge_from_file(args.config_file)
    cfg.defrost()
    cfg.OUTPUT_DIR = os.path.join(info['training_dir'], args.sub_dataset)
    cfg.MODEL.WEIGHT = os.path.join(info['dataset_dir'], info['experiment'],
                                    'Detector',
                                    'Iter{}.pth'.format(info['iter']))
    cfg.SOLVER.IMS_PER_BATCH = num_gpus * 4
    cfg.TEST.IMS_PER_BATCH = num_gpus * 16
    cfg.SOLVER.BASE_LR = 0.002
    cfg.freeze()

    mkdir(cfg.OUTPUT_DIR)

    if args.sub_dataset is None:
        args.sub_dataset = ""

    if args.vis_title is None:
        args.vis_title = os.path.basename(cfg.OUTPUT_DIR)

    logger = setup_logger("maskrcnn_benchmark", cfg.OUTPUT_DIR, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    # logger.info("\n" + collect_env_info())

    DatasetCatalog = None
    train_dataset = cfg.DATASETS.TRAIN[0]
    test_dataset = cfg.DATASETS.TEST[0]
    paths_catalog = import_file("maskrcnn_benchmark.config.paths_catalog",
                                cfg.PATHS_CATALOG, True)

    if args.sub_dataset != "":
        DatasetCatalog = paths_catalog.DatasetCatalog

        DatasetCatalog.DATASETS[train_dataset]['img_dir'] = os.path.join(
            info['dataset_dir'], 'Images')
        DatasetCatalog.DATASETS[train_dataset]['ann_file'] = os.path.join(
            info['dataset_dir'], 'RCNN_data', 'train.json')

        DatasetCatalog.DATASETS[test_dataset]['img_dir'] = os.path.join(
            info['dataset_dir'], 'Images')
        DatasetCatalog.DATASETS[test_dataset]['ann_file'] = os.path.join(
            info['dataset_dir'], 'RCNN_data', 'test.json')

        data = json.load(
            open(DatasetCatalog.DATASETS[train_dataset]['ann_file']))
    else:
        data = json.load(
            open(paths_catalog.DatasetCatalog.DATASETS[train_dataset]
                 ['ann_file']))

    iters_per_epoch = len(data['images'])
    iters_per_epoch = math.ceil(iters_per_epoch / cfg.SOLVER.IMS_PER_BATCH)
    args.iters_per_epoch = iters_per_epoch

    cfg.defrost()
    cfg.SOLVER.MAX_ITER = round(args.epochs * args.scale * iters_per_epoch)
    cfg.SOLVER.STEPS = (round(8 * args.scale * iters_per_epoch),
                        round(11 * args.scale * iters_per_epoch),
                        round(16 * args.scale * iters_per_epoch))
    cfg.freeze()

    # logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        # logger.info(config_str)
    # logger.info("Running with config:\n{}".format(cfg))

    # logger.info(DatasetCatalog)

    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    if args.train:
        args.skip_train = False
        logger.info(args)
        model = network.train(cfg, args, DatasetCatalog)

    if args.test:
        network.test(cfg, args, model=None, DatasetCatalog=DatasetCatalog)
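
To make the epoch-to-iteration conversion above concrete, here is a small self-contained sketch with assumed numbers (10,000 training images and 4 GPUs, so IMS_PER_BATCH = 16); the 8/11/16 epoch marks for SOLVER.STEPS mirror the code above:

import math

# Assumed values, for illustration only.
num_images = 10000            # stands in for len(data['images'])
ims_per_batch = 4 * 4         # num_gpus * 4 with 4 GPUs
epochs, scale = 18, 1.0       # hypothetical args.epochs / args.scale

iters_per_epoch = math.ceil(num_images / ims_per_batch)                 # 625
max_iter = round(epochs * scale * iters_per_epoch)                      # 11250
steps = tuple(round(e * scale * iters_per_epoch) for e in (8, 11, 16))
print(iters_per_epoch, max_iter, steps)   # 625 11250 (5000, 6875, 10000)
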
Example #5
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--use_tensorboard",
        default=True,
        type=bool,
        help="Enable/disable tensorboard logging (enabled by default)")
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument("--log_step",
                        default=50,
                        type=int,
                        help='Number of iteration for each log')
    parser.add_argument(
        "--eval_mode",
        default="test",
        type=str,
        help=('Use the defined test datasets for periodic evaluation, or use '
              'a validation split. Default: "test", alternative: "val"'),
    )
    parser.add_argument("--eval_step",
                        type=int,
                        default=15000,
                        help="Number of iterations for periodic evaluation")
    parser.add_argument(
        "--return_best",
        type=bool,
        default=False,
        help=("If false (default), test the last model on the target; if true, "
              "test the model with the best validation performance on the target."),
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    model = train(cfg, args)

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
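
One caveat about the --use_tensorboard and --return_best options above: argparse's type=bool does not parse booleans the way the help text suggests, because bool() of any non-empty string (including "False") is True. A small self-contained sketch of the pitfall and one common workaround (the paired flags are an illustration, not part of the original script):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use_tensorboard", default=True, type=bool)
# bool("False") is True, so this still enables tensorboard.
print(parser.parse_args(["--use_tensorboard", "False"]).use_tensorboard)  # True

# A paired on/off flag avoids the problem.
parser2 = argparse.ArgumentParser()
parser2.add_argument("--use-tensorboard", dest="use_tensorboard",
                     action="store_true", default=True)
parser2.add_argument("--no-tensorboard", dest="use_tensorboard",
                     action="store_false")
print(parser2.parse_args(["--no-tensorboard"]).use_tensorboard)  # False
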
Example #6
def main():
    torch.cuda.set_device(7)  # pin the default CUDA device to GPU 7 (overridden by --local_rank when distributed)

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="/home/SelfDriving/maskrcnn/maskrcnn-benchmark/configs/e2e_faster_rcnn_R_50_C4_1x.yaml",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    model = train(cfg, args.local_rank, args.distributed)

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
Example #7
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    parser.add_argument(
        "--build-model",
        default="",
        metavar="FILE",
        help="path to NAS model build file",
        type=str,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    if cfg.NAS.TRAIN_SINGLE_MODEL:
        assert len(args.build_model) != 0, 'args.build_model should be provided'
        model_config = json.load(open(args.build_model, 'r'))
        if isinstance(model_config, list):
            assert len(model_config) == 1
            model_config = model_config[0]
        print('Training single model:', model_config)
        model = train(cfg, args.local_rank, args.distributed, model_config)
    else:
        model = train(cfg, args.local_rank, args.distributed)

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
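
The --build-model file is read as JSON and may hold either a single model description or a one-element list of them. A tiny self-contained sketch of that unwrapping; the JSON content is a made-up placeholder, since the actual NAS model format is project-specific:

import json

raw = '[{"depth": 50, "use_fpn": true}]'   # hypothetical build-file contents
model_config = json.loads(raw)
if isinstance(model_config, list):
    assert len(model_config) == 1
    model_config = model_config[0]
print(model_config)                        # {'depth': 50, 'use_fpn': True}
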
Example #8
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    # Config file argument
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",  # 用于help 信息输出
        help="path to config file",
        type=str,
    )
    # GPU index used by the current process
    parser.add_argument("--local_rank", type=int, default=0)
    # Skip the final evaluation when this flag is given
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        # with action='store_true' (or 'store_false'), passing the bare flag is
        # enough; no explicit True/False value follows it on the command line
        action="store_true",
    )
    # Override config options from the command line,
    # e.g. opts=['SOLVER.IMS_PER_BATCH', '2', 'SOLVER.BASE_LR', '0.0025']
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    # Multi-GPU training
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    # Merge the config file into cfg
    cfg.merge_from_file(args.config_file)
    # Merge the opts list into cfg
    cfg.merge_from_list(args.opts)
    # Make cfg and all of its child nodes immutable
    cfg.freeze()

    # Directory where training results are stored
    # (default in the config schema: _C.OUTPUT_DIR = ".")
    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    # Log some basic run information
    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # Path where the merged config will be written
    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    model = train(cfg, args.local_rank, args.distributed)

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
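
The merge_from_file / merge_from_list / freeze calls in these scripts come from yacs, which maskrcnn-benchmark uses for its configuration. A minimal self-contained sketch of the same override pattern (the node names are illustrative, not the full maskrcnn-benchmark schema):

from yacs.config import CfgNode as CN

cfg = CN()
cfg.OUTPUT_DIR = "."
cfg.SOLVER = CN()
cfg.SOLVER.IMS_PER_BATCH = 16
cfg.SOLVER.BASE_LR = 0.02

# Command-line overrides arrive as a flat key/value list (argparse.REMAINDER).
opts = ["SOLVER.IMS_PER_BATCH", "2", "SOLVER.BASE_LR", "0.0025"]
cfg.merge_from_list(opts)
cfg.freeze()  # cfg and its children become immutable; later assignment raises
print(cfg.SOLVER.IMS_PER_BATCH, cfg.SOLVER.BASE_LR)  # 2 0.0025
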
Example #9
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    trainer = JointTrainer(args)
    trainer.train_joint()
Example #10
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    # cfg.freeze()

    iter_count = 60000

    for learn_rate in np.arange(0.0001, 0.0015, 0.0001):
        print(f"lr: {float(learn_rate)}  iter_count: {iter_count}")

        outdir = os.path.join("hypertune", f"lr{float(learn_rate)}")

        print(outdir)
        cfg.OUTPUT_DIR = outdir  # set the output folder specific to learning rate
        cfg['SOLVER']['BASE_LR'] = float(learn_rate)  # set the learning rate
        cfg['SOLVER']['MAX_ITER'] = iter_count

        output_dir = cfg.OUTPUT_DIR
        if output_dir:
            mkdir(output_dir)

        logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
        logger.info("Using {} GPUs".format(num_gpus))
        logger.info(args)

        logger.info("Collecting env info (might take some time)")
        logger.info("\n" + collect_env_info())

        logger.info("Loaded configuration file {}".format(args.config_file))
        with open(args.config_file, "r") as cf:
            config_str = "\n" + cf.read()
            logger.info(config_str)
        logger.info("Running with config:\n{}".format(cfg))

        output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
        logger.info("Saving config into: {}".format(output_config_path))
        # save overloaded model config in the output directory
        save_config(cfg, output_config_path)

        model = train(cfg, args.local_rank, args.distributed)

        if not args.skip_test:
            run_test(cfg, model, args.distributed)
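
A detail worth noting in the sweep above: np.arange with a float step accumulates rounding error, so the generated learning rates (and therefore the output directory names) can carry floating-point noise in their last digits. A small sketch of one way to keep the names clean (the rounding is an assumption about intent, not part of the original):

import os
import numpy as np

for learn_rate in np.arange(0.0001, 0.0015, 0.0001):
    lr = round(float(learn_rate), 6)   # strip floating-point noise from the value
    outdir = os.path.join("hypertune", "lr{}".format(lr))
    print(outdir)
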
Example #11
File: train_net.py  Project: IntelAI/models
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    parser.add_argument('--bf16',
                        action='store_true',
                        default=False,
                        help='enable BF16 by IPEX autocast')
    parser.add_argument('-i',
                        '--iterations',
                        default=-1,
                        type=int,
                        metavar='N',
                        help='number of total iterations to run')
    parser.add_argument('--iter-warmup',
                        default=-1,
                        type=int,
                        metavar='N',
                        help='number of warm-up iterations to run')
    parser.add_argument("--world-size", default=1, type=int, help='world size')
    parser.add_argument("--master-addr",
                        default='127.0.0.1',
                        type=str,
                        help='Master Addr')
    parser.add_argument("--port", default='29500', type=str, help='Port')
    parser.add_argument("--rank", default=0, type=int, help='rank')
    parser.add_argument('--backend',
                        default='gloo',
                        type=str,
                        help='DDP backend, default to gloo')

    args = parser.parse_args()

    args.distributed = False
    if torch.distributed.is_available() and int(os.environ.get(
            'PMI_SIZE', '0')) > 1:
        print('Distributed training with DDP')
        os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
        os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
        if 'MASTER_ADDR' not in os.environ:
            os.environ['MASTER_ADDR'] = args.master_addr
        if 'MASTER_PORT' not in os.environ:
            os.environ['MASTER_PORT'] = args.port

        # torch_ccl must be imported first so that the ccl backend is registered
        if args.backend == 'ccl':
            import torch_ccl
        # Initialize the process group with the selected backend (gloo by default)
        torch.distributed.init_process_group(backend=args.backend)
        args.distributed = True
        if torch.distributed.is_initialized():
            print("Torch distributed is initialized.")
            args.rank = torch.distributed.get_rank()
            args.world_size = torch.distributed.get_world_size()
        else:
            print("Torch distributed is not initialized.")
            args.rank = 0
            args.world_size = 1

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    model = train(cfg,
                  args.local_rank,
                  args.distributed,
                  bf16=args.bf16,
                  iterations=args.iterations,
                  iter_warmup=args.iter_warmup)

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
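
The PMI_* variables above come from an MPI-style launcher; init_process_group with the default env:// init method then reads RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT from the environment. A minimal single-process sketch of that handshake with the gloo backend (the hard-coded values stand in for what a launcher would normally set):

import os
import torch.distributed as dist

# Normally set by the launcher (PMI/MPI, torchrun, ...); hard-coded for a one-process demo.
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

dist.init_process_group(backend="gloo", init_method="env://")
print("rank {} / world size {}".format(dist.get_rank(), dist.get_world_size()))
dist.destroy_process_group()
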
Example #12
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument(
        '--reduce-batch',
        type=int,
        help=('Divide IMS_PER_BATCH by this amount. This appropriately '
              'updates the learning rate, number of iterations, and so on.'))
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument(
        "--use-tensorboard",
        dest="use_tensorboard",
        help="Use tensorboardX logger (Requires tensorboardX installed)",
        action="store_true",
        default=False)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    # Generate a unique experiment ID for this run.
    # Note: uuid generation relies on os.urandom, so it is not affected by,
    # e.g., random.seed.
    experiment_id = uuid.uuid4()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)

    # We want to get the OUTPUT_DIR from the args immediately, if it exists, so
    # we can setup logging. We will merge the rest of the config in a few
    # lines.
    merge_keys(cfg, args.opts, ['OUTPUT_DIR'])

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    if get_rank() == 0:
        file_logger = common_setup(__file__, output_dir, args)
    else:
        file_logger = common_setup(__file__ + '-worker%s' % get_rank(),
                                   output_dir,
                                   args,
                                   log_console_level=logging.CRITICAL,
                                   save_git_state=False)

    # Automatically handle config changes as required by
    # https://github.com/facebookresearch/maskrcnn-benchmark/tree/327bc29bcc4924e35bd61c59877d5a1d25bb75af#single-gpu-training
    if args.reduce_batch:
        # Update using --opts first, then override.
        merge_keys(cfg, args.opts, [
            'SOLVER.IMS_PER_BATCH', 'SOLVER.BASE_LR', 'SOLVER.MAX_ITER',
            'SOLVER.STEPS', 'SOLVER.CHECKPOINT_PERIOD'
        ])
        assert num_gpus in (1, 2, 4)
        scale = args.reduce_batch
        logging.info('Updating config for # GPUs = %s', num_gpus)

        def update_config(key, new_value):
            key_list = key.split('.')
            d = cfg
            # walk down to the parent node of the final key
            for subkey in key_list[:-1]:
                d = d[subkey]
            subkey = key_list[-1]
            old_value = d[subkey]
            logging.info('Updating cfg.%s: %s -> %s', key, old_value,
                         new_value)
            d[subkey] = new_value

        update_config('SOLVER.IMS_PER_BATCH',
                      _safe_int(cfg.SOLVER.IMS_PER_BATCH / scale))
        update_config('SOLVER.BASE_LR', cfg.SOLVER.BASE_LR / scale)
        update_config('SOLVER.MAX_ITER',
                      _safe_int(cfg.SOLVER.MAX_ITER * scale))
        update_config('SOLVER.CHECKPOINT_PERIOD',
                      _safe_int(cfg.SOLVER.CHECKPOINT_PERIOD * scale))
        update_config('SOLVER.STEPS',
                      tuple(_safe_int(x * scale) for x in cfg.SOLVER.STEPS))

    logging.info('Updating config from arguments')
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    logger = logging.getLogger("maskrcnn_benchmark")
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        file_logger.info(config_str)
    file_logger.info("Running with config:\n{}".format(cfg))
    if get_rank() == 0:
        config_output = add_time_to_path(Path(output_dir) / 'config.yaml')
        with open(config_output, 'w') as f:
            f.write(cfg.dump())

    logging.info('Experiment id: %s', experiment_id)
    with open(os.path.join(output_dir, 'experiment_id.txt'), 'w') as f:
        f.write('%s\n' % experiment_id)

    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    model = train(cfg,
                  args.local_rank,
                  args.distributed,
                  use_tensorboard=args.use_tensorboard)

    if not args.skip_test:
        run_test(cfg, model, args.distributed)
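
The --reduce-batch handling above is the linear scaling rule from the maskrcnn-benchmark single-GPU notes: dividing the batch size by N divides the base learning rate by N and multiplies the iteration-based schedule by N. A small worked sketch with roughly the stock 1x schedule (16 images per batch, LR 0.02, 90k iterations) reduced by a factor of 8; _safe_int here is an assumed stand-in for the helper used above:

def _safe_int(x):
    # assumed helper: the scaled value is expected to stay integral
    assert float(x).is_integer(), x
    return int(x)

scale = 8                       # e.g. --reduce-batch 8: one GPU instead of eight
ims_per_batch, base_lr = 16, 0.02
max_iter, steps = 90000, (60000, 80000)

print(_safe_int(ims_per_batch / scale))             # 2
print(base_lr / scale)                              # 0.0025
print(_safe_int(max_iter * scale))                  # 720000
print(tuple(_safe_int(s * scale) for s in steps))   # (480000, 640000)
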