Example #1
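Multi-process (distributed) final training via torch.distributed with the NCCL backend, one process per GPU. The snippets in this section are shown without their module header; below is a minimal sketch of the imports they rely on. The underscore-prefixed helpers (_set_gpu, _init_component, _init_components_from_cfg, _logger, _onlycopy_py) together with utils, LOGGER, expect, and WrapWriter are aw_nas-internal names assumed to be defined in the surrounding module.

# Standard-library and third-party imports assumed by the snippets below.
import os
import sys
import random
import shutil

import numpy as np
import torch
import yaml
import setproctitle

# aw_nas-internal helpers (assumed, defined elsewhere in the module):
# utils, LOGGER, _logger, expect, WrapWriter, _onlycopy_py,
# _set_gpu, _init_component, _init_components_from_cfg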
def mptrain(seed, cfg_file, load, load_state_dict, save_every, train_dir):
    local_rank = int(os.environ["LOCAL_RANK"])
    # set gpu
    _set_gpu(local_rank)
    device = torch.cuda.current_device()
    torch.distributed.init_process_group(backend="nccl")

    if train_dir:
        # backup the config file (rank 0 only)
        if local_rank == 0:
            train_dir = utils.makedir(train_dir, remove=False)
            shutil.copyfile(cfg_file,
                            os.path.join(train_dir, "train_config.yaml"))

    torch.distributed.barrier()

    if train_dir:
        # add log file handler
        suffix = "" if local_rank == 0 else "_{}".format(local_rank)
        log_file = os.path.join(train_dir, "train{}.log".format(suffix))
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-train config: {}; train_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, os.getcwd()))

    LOGGER.info(
        ("Start distributed parallel training: (world size {}; MASTER {}:{})"
         " rank {} local_rank {} PID {}").format(int(os.environ["WORLD_SIZE"]),
                                                 os.environ["MASTER_ADDR"],
                                                 os.environ["MASTER_PORT"],
                                                 os.environ["RANK"],
                                                 local_rank, os.getpid()))

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    cfg["final_trainer_cfg"]["multiprocess"] = True

    # initialize components
    LOGGER.info("Initializing components.")
    search_space = _init_component(cfg, "search_space")
    whole_dataset = _init_component(cfg, "dataset")

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the num_tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        model = _init_component(cfg,
                                "final_model",
                                search_space=search_space,
                                device=device,
                                num_tokens=num_tokens)
    else:
        model = _init_component(cfg,
                                "final_model",
                                search_space=search_space,
                                device=device)
    # check model support for data type
    expect(_data_type in model.supported_data_types())
    objective = _init_component(cfg, "objective", search_space=search_space)
    trainer = _init_component(cfg,
                              "final_trainer",
                              dataset=whole_dataset,
                              model=model,
                              device=device,
                              gpus=[device],
                              objective=objective)
    # check trainer support for data type
    expect(_data_type in trainer.supported_data_types())

    # start training
    LOGGER.info("Start training.")
    if local_rank != 0:
        # only rank 0 saves checkpoints
        save_every = None
    trainer.setup(load, load_state_dict, save_every, train_dir)
    trainer.train()
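mptrain expects to be started by a distributed launcher such as torchrun, which sets LOCAL_RANK, RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT in the environment. A minimal sketch (illustrative, not aw_nas code) of the "rank 0 writes, all ranks wait at the barrier" pattern used above:

# Illustrative sketch, not aw_nas code. Launch with, e.g.:
#   torchrun --nproc_per_node=4 barrier_demo.py
import os

import torch
import torch.distributed as dist

def main():
    local_rank = int(os.environ["LOCAL_RANK"])  # set by the launcher
    torch.cuda.set_device(local_rank)           # one GPU per process (NCCL requirement)
    dist.init_process_group(backend="nccl")
    if local_rank == 0:
        os.makedirs("results", exist_ok=True)   # only one process creates shared state
    dist.barrier()                              # the other ranks proceed once it exists

if __name__ == "__main__":
    main()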
Example #2
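Single-process architecture search: takes an explicit gpu index, optionally backs up the config file and (in develop mode) the aw_nas source tree, and routes TensorBoard output through WrapWriter so visualization stays optional.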
def search(cfg_file, gpu, seed, load, save_every, interleave_report_every,
           train_dir, vis_dir, develop):
    # check dependency and initialize visualization writer
    if vis_dir:
        vis_dir = utils.makedir(vis_dir, remove=True)
        try:
            import tensorboardX
        except ImportError:
            LOGGER.error(
                "Cannot import module tensorboardX. Will IGNORE the `--vis-dir` option! "
                "Try installing the dependency manually, or `pip install aw_nas[vis]`"
            )
            _writer = None
        else:
            _writer = tensorboardX.SummaryWriter(log_dir=vis_dir)
    else:
        _writer = None
    writer = WrapWriter(_writer)

    if train_dir:
        # backup config file, and if in `develop` mode, also backup the aw_nas source code
        train_dir = utils.makedir(train_dir, remove=True)
        shutil.copyfile(cfg_file, os.path.join(train_dir, "config.yaml"))

        if develop:
            import pkg_resources
            src_path = pkg_resources.resource_filename("aw_nas", "")
            backup_code_path = os.path.join(train_dir, "aw_nas")
            if os.path.exists(backup_code_path):
                shutil.rmtree(backup_code_path)
            LOGGER.info("Copy `aw_nas` source code to %s", backup_code_path)
            shutil.copytree(src_path, backup_code_path, ignore=_onlycopy_py)

        # add log file handler
        log_file = os.path.join(train_dir, "search.log")
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-search config: {}; train_dir: {}; vis_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, vis_dir, os.getcwd()))

    # set gpu
    _set_gpu(gpu)
    device = torch.device(
        "cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    trainer = _init_components_from_cfg(cfg, device)[-1]

    # setup trainer and train
    trainer.setup(load,
                  save_every,
                  train_dir,
                  writer=writer,
                  interleave_report_every=interleave_report_every)
    trainer.train()
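WrapWriter wraps a possibly-None tensorboardX.SummaryWriter so the rest of the code never has to check whether visualization is enabled. A minimal sketch of such a null-tolerant wrapper (an assumption about its behavior, not aw_nas's actual implementation):

class NullTolerantWriter:
    """Forward attribute access to an underlying writer; no-op when it is None."""

    def __init__(self, writer=None):
        self._writer = writer

    def __getattr__(self, name):
        if self._writer is None:
            return lambda *args, **kwargs: None  # visualization disabled: swallow calls
        return getattr(self._writer, name)

# usage: writer = NullTolerantWriter(maybe_summary_writer)
# writer.add_scalar("loss", 0.5, step) works whether or not tensorboardX is available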
Example #3
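Multi-process (distributed) architecture search, the mpsearch counterpart of search above. Rank 0 owns the visualization writer, config backup, and checkpointing; every rank writes its own log file.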
def mpsearch(cfg_file, seed, load, save_every, interleave_report_every,
             train_dir, vis_dir, develop):
    # distributed setup: one process per GPU, indexed by LOCAL_RANK
    local_rank = int(os.environ["LOCAL_RANK"])
    # set gpu
    _set_gpu(local_rank)
    device = torch.cuda.current_device()
    torch.distributed.init_process_group(backend="nccl",
                                         rank=int(os.environ["RANK"]),
                                         world_size=int(
                                             os.environ["WORLD_SIZE"]))

    # check dependency and initialize the visualization writer (rank 0 only)
    if vis_dir and local_rank == 0:
        vis_dir = utils.makedir(vis_dir, remove=True)
        try:
            import tensorboardX
        except ImportError:
            LOGGER.error(
                "Cannot import module tensorboardX. Will IGNORE the `--vis-dir` option! "
                "Try installing the dependency manually, or `pip install aw_nas[vis]`"
            )
            _writer = None
        else:
            _writer = tensorboardX.SummaryWriter(log_dir=vis_dir)
    else:
        _writer = None
    writer = WrapWriter(_writer)

    if train_dir:
        if local_rank == 0:
            # backup config file, and if in `develop` mode, also backup the aw_nas source code
            train_dir = utils.makedir(train_dir, remove=True)
            shutil.copyfile(cfg_file, os.path.join(train_dir, "config.yaml"))

            if develop:
                import pkg_resources
                src_path = pkg_resources.resource_filename("aw_nas", "")
                backup_code_path = os.path.join(train_dir, "aw_nas")
                if os.path.exists(backup_code_path):
                    shutil.rmtree(backup_code_path)
                LOGGER.info("Copy `aw_nas` source code to %s",
                            backup_code_path)
                shutil.copytree(src_path,
                                backup_code_path,
                                ignore=_onlycopy_py)

    torch.distributed.barrier()

    if train_dir:
        # add log file handler
        suffix = "" if local_rank == 0 else "_{}".format(local_rank)
        log_file = os.path.join(train_dir, "search{}.log".format(suffix))
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-search config: {}; train_dir: {}; vis_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, vis_dir, os.getcwd()))

    LOGGER.info(
        ("Start distributed parallel searching: (world size {}; MASTER {}:{})"
         " rank {} local_rank {} PID {}").format(int(os.environ["WORLD_SIZE"]),
                                                 os.environ["MASTER_ADDR"],
                                                 os.environ["MASTER_PORT"],
                                                 os.environ["RANK"],
                                                 local_rank, os.getpid()))

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    cfg["weights_manager_cfg"]["multiprocess"] = True
    cfg["evaluator_cfg"]["multiprocess"] = True

    # initialize components
    LOGGER.info("Initializing components.")
    whole_dataset = _init_component(cfg, "dataset")
    rollout_type = cfg["rollout_type"]

    search_space = _init_component(cfg, "search_space")
    controller = _init_component(cfg,
                                 "controller",
                                 search_space=search_space,
                                 device=device,
                                 rollout_type=rollout_type)

    _data_type = whole_dataset.data_type()

    if _data_type == "sequence":
        # get the num_tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        weights_manager = _init_component(cfg,
                                          "weights_manager",
                                          search_space=search_space,
                                          device=device,
                                          gpus=[device],
                                          rollout_type=rollout_type,
                                          num_tokens=num_tokens)
    else:
        weights_manager = _init_component(cfg,
                                          "weights_manager",
                                          search_space=search_space,
                                          device=device,
                                          gpus=[device],
                                          rollout_type=rollout_type)
    # check model support for data type
    expect(_data_type in weights_manager.supported_data_types())

    objective = _init_component(cfg, "objective", search_space=search_space)
    # evaluator
    evaluator = _init_component(cfg,
                                "evaluator",
                                dataset=whole_dataset,
                                weights_manager=weights_manager,
                                objective=objective,
                                rollout_type=rollout_type)
    expect(_data_type in evaluator.supported_data_types())

    trainer = _init_component(cfg,
                              "trainer",
                              evaluator=evaluator,
                              controller=controller,
                              rollout_type=rollout_type)

    # setup trainer and train; only rank 0 saves checkpoints
    if local_rank != 0:
        save_every = None
    trainer.setup(load,
                  save_every,
                  train_dir,
                  writer=writer,
                  interleave_report_every=interleave_report_every)
    trainer.train()
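Both mptrain (Example #1) and mpsearch derive per-rank log file names: rank 0 writes to train.log / search.log, while every other rank gets a _<rank> suffix. A small helper capturing that pattern (hypothetical name, for illustration):

import os

def rank_log_path(train_dir, prefix, local_rank):
    # rank 0 -> "<prefix>.log"; rank k -> "<prefix>_<k>.log"
    suffix = "" if local_rank == 0 else "_{}".format(local_rank)
    return os.path.join(train_dir, "{}{}.log".format(prefix, suffix))

# e.g. rank_log_path("results", "search", 2) -> "results/search_2.log"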
Example #4
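Single-process final training, the train counterpart of mptrain from Example #1: it takes a comma-separated gpus string instead of reading LOCAL_RANK from the environment.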
def train(gpus, seed, cfg_file, load, load_state_dict, save_every, train_dir):
    if train_dir:
        # backup the config file
        train_dir = utils.makedir(train_dir, remove=True)
        shutil.copyfile(cfg_file, os.path.join(train_dir, "train_config.yaml"))

        # add log file handler
        log_file = os.path.join(train_dir, "train.log")
        _logger.addFile(log_file)

    LOGGER.info("CWD: %s", os.getcwd())
    LOGGER.info("CMD: %s", " ".join(sys.argv))

    setproctitle.setproctitle("awnas-train config: {}; train_dir: {}; cwd: {}"\
                              .format(cfg_file, train_dir, os.getcwd()))

    # set gpu; an empty `gpus` string means run on CPU
    # (the original `gpus.split(",")` would raise ValueError on "", since
    # int("") fails, so empty entries are filtered out here)
    gpu_list = [int(g) for g in gpus.split(",") if g]
    if not gpu_list:
        _set_gpu(None)
        device = "cpu"
    else:
        _set_gpu(gpu_list[0])
        device = torch.device("cuda:{}".format(gpu_list[0])
                              if torch.cuda.is_available() else "cpu")

    # set seed
    if seed is not None:
        LOGGER.info("Setting random seed: %d.", seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # load components config
    LOGGER.info("Loading configuration files.")
    with open(cfg_file, "r") as f:
        cfg = yaml.safe_load(f)

    # initialize components
    LOGGER.info("Initializing components.")
    search_space = _init_component(cfg, "search_space")
    whole_dataset = _init_component(cfg, "dataset")

    _data_type = whole_dataset.data_type()
    if _data_type == "sequence":
        # get the num_tokens
        num_tokens = whole_dataset.vocab_size
        LOGGER.info("Dataset %s: vocabulary size: %d", whole_dataset.NAME,
                    num_tokens)
        model = _init_component(cfg,
                                "final_model",
                                search_space=search_space,
                                device=device,
                                num_tokens=num_tokens)
    else:
        model = _init_component(cfg,
                                "final_model",
                                search_space=search_space,
                                device=device)
    # check model support for data type
    expect(_data_type in model.supported_data_types())
    objective = _init_component(cfg, "objective", search_space=search_space)
    trainer = _init_component(cfg,
                              "final_trainer",
                              dataset=whole_dataset,
                              model=model,
                              device=device,
                              gpus=gpu_list,
                              objective=objective)
    # check trainer support for data type
    expect(_data_type in trainer.supported_data_types())

    # start training
    LOGGER.info("Start training.")
    trainer.setup(load, load_state_dict, save_every, train_dir)
    trainer.train()
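The same seeding block appears in all four entry points. Factored out as a helper (hypothetical, for illustration); note that full GPU determinism would additionally need cuDNN settings, which these entry points do not touch:

import random

import numpy as np
import torch

def set_seed(seed):
    # Seed the Python, NumPy, and PyTorch RNGs, mirroring the entry points above.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)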