Example #1
def main():
    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if is_main_process():
        # Setting logging file parameters for compliance logging
        os.environ["COMPLIANCE_FILE"] = './MASKRCNN_complVv0.5.0_' + str(
            datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

        print_mlperf(key=mlperf_log.RUN_START)

        # setting seeds - needs to be timed, so after RUN_START
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))
        else:
            seed_tensor = torch.tensor(0,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))

        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        print_mlperf(key=mlperf_log.RUN_START)
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)

    print_mlperf(key=mlperf_log.RUN_FINAL)
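
Each example in this listing calls generate_seeds() and broadcast_seeds() without showing them. A minimal sketch of what they might look like, inferred only from how they are called here (the names come from the call sites above; the bodies are an assumption, not the benchmark's actual implementation):

import torch

def generate_seeds(rng, size):
    # Draw one integer seed per distributed worker from the seeded master RNG,
    # matching the call generate_seeds(random_number_generator, world_size) above.
    return [rng.randint(0, 2**32 - 1) for _ in range(size)]

def broadcast_seeds(seeds, device):
    # Ship rank 0's seed list to every worker so all ranks end up with the same
    # per-worker seeds; a no-op when torch.distributed is not initialized.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        seeds_tensor = torch.tensor(seeds, dtype=torch.int64, device=device)
        torch.distributed.broadcast(seeds_tensor, 0)
        seeds = seeds_tensor.tolist()
    return seeds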
Example #2
def main():

    configure_logger(constants.MASKRCNN)
    log_start(key=constants.INIT_START)

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=herring.get_local_rank())
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = herring.get_world_size()
    args.distributed = num_gpus > 1
    args.local_rank = herring.get_local_rank()

    # if is_main_process:
    #     # Setting logging file parameters for compliance logging
    #     os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(datetime.datetime.now())
    #     constants.LOG_FILE = os.getenv("COMPLIANCE_FILE")
    #     constants._FILE_HANDLER = logging.FileHandler(constants.LOG_FILE)
    #     constants._FILE_HANDLER.setLevel(logging.DEBUG)
    #     constants.LOGGER.addHandler(constants._FILE_HANDLER)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        # setting seeds - needs to be timed, so after RUN_START
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))
        else:
            seed_tensor = torch.tensor(0,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))

        herring.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    log_event(key=constants.SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(random_number_generator,
                                  herring.get_world_size())

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # Initialise async eval
    init()

    model, success = train(cfg, args.local_rank, args.distributed,
                           random_number_generator)

    if success is not None:
        if success:
            log_end(key=constants.RUN_STOP, metadata={"status": "success"})
        else:
            log_end(key=constants.RUN_STOP, metadata={"status": "aborted"})
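
Example #2 above (and example #4 below) logs compliance events through configure_logger, log_start, log_event, and log_end with keys from a constants module, instead of the older mlperf_log file handler used in examples #1 and #3. Those wrappers are not shown in this listing; a thin, assumed implementation on top of the mlperf_logging package could look like the following (the wrapper names mirror the calls above, the bodies are a sketch, not the submission's actual helper module):

from mlperf_logging import mllog
from mlperf_logging.mllog import constants  # MASKRCNN, INIT_START, SEED, RUN_STOP, ...

mllogger = mllog.get_mllogger()

def configure_logger(benchmark):
    # Route structured compliance records to a per-benchmark log file.
    mllog.config(filename="{}.log".format(benchmark))

def log_start(*args, **kwargs):
    mllogger.start(*args, **kwargs)

def log_event(*args, **kwargs):
    mllogger.event(*args, **kwargs)

def log_end(*args, **kwargs):
    mllogger.end(*args, **kwargs)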
Example #3
def main():

    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))
    # mlperf_log.LOGGER.propagate = False

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="/private/home/fmassa/github/detectron.pytorch/configs/rpn_r50.py",
        metavar="FILE",
        help="path to config file",
    )

    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument('--skip-test',
                        dest='skip_test',
                        help='Do not test the model',
                        action='store_true')
    parser.add_argument("--fp16",
                        action="store_true",
                        help="Enable multi-precision training")

    parser.add_argument("--min_bbox_map",
                        type=float,
                        default=0.377,
                        help="Target BBOX MAP")

    parser.add_argument("--min_mask_map",
                        type=float,
                        default=0.339,
                        help="Target SEGM/MASK MAP")

    parser.add_argument("--seed", type=int, default=1, help="Seed")

    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    args.distributed = (int(os.environ["WORLD_SIZE"]) > 1
                        if "WORLD_SIZE" in os.environ else False)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")

        # to synchronize start of time
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

        if torch.distributed.get_rank() == 0:
            # Setting logging file parameters for compliance logging
            os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(
                datetime.datetime.now())
            mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
            mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
            mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
            mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

        print_mlperf(key=mlperf_log.RUN_START)

        # Setting seed
        seed_tensor = torch.tensor(0,
                                   dtype=torch.float32,
                                   device=torch.device("cuda"))

        if torch.distributed.get_rank() == 0:
            # seed = int(time.time())
            # random master seed, random.SystemRandom() uses /dev/urandom on Unix
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)

            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))
        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:

        # Setting logging file parameters for compliance logging
        os.environ["COMPLIANCE_FILE"] = '/MASKRCNN_complVv0.5.0_' + str(
            datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

        print_mlperf(key=mlperf_log.RUN_START)
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    if args.skip_test:
        cfg.DO_ONLINE_MAP_EVAL = False

    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    #logger = setup_logger("maskrcnn_benchmark", output_dir, args.local_rank)
    logger = setup_logger("maskrcnn_benchmark", None, args.local_rank)
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, random_number_generator, args.local_rank,
                  args.distributed, args, args.fp16)
    print_mlperf(key=mlperf_log.RUN_FINAL)
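
The --min_bbox_map and --min_mask_map defaults above (0.377 box AP, 0.339 mask/segm AP) are the MLPerf Mask R-CNN quality targets; presumably train() compares the evaluation results against them to decide when the run has converged.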
Example #4
def main():
    configure_logger(constants.MASKRCNN)
    log_start(key=constants.INIT_START)

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

        # setting seeds - needs to be timed, so after RUN_START
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))
        else:
            seed_tensor = torch.tensor(0,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))

        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    log_event(key=constants.SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model, success = train(cfg, args.local_rank, args.distributed,
                           args.disable_allreduce_for_logging,
                           random_number_generator)

    if success is not None:
        if success:
            log_end(key=constants.RUN_STOP, metadata={"status": "success"})
        else:
            log_end(key=constants.RUN_STOP, metadata={"status": "aborted"})
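
Examples #1, #3, and #4 derive their distributed setup from the WORLD_SIZE environment variable and a --local_rank argument, which is the contract of PyTorch's torch.distributed.launch utility, while example #2 gets the same information from herring. All of them also lean on a few small communication helpers (get_rank, is_main_process, synchronize) that the listing does not show. A plausible sketch, assuming the usual torch.distributed-based definitions rather than the benchmark's exact utilities:

import torch

def get_rank():
    # Rank of this process in the job, or 0 when running single-process.
    if not torch.distributed.is_available() or not torch.distributed.is_initialized():
        return 0
    return torch.distributed.get_rank()

def is_main_process():
    return get_rank() == 0

def synchronize():
    # Barrier across all workers; a no-op outside multi-process runs.
    if (torch.distributed.is_available() and torch.distributed.is_initialized()
            and torch.distributed.get_world_size() > 1):
        torch.distributed.barrier()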