Example No. 1
def run_main(asset_path):
    loader = runsenabler_loader.RunsEnablerLoader("some_dir")
    plugins = default.get_plugins() + [
        paramplot_plugin.ParamPlotPlugin, loader
    ]
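    # Note (assumption): with tensorboard.program.TensorBoard, the second
    # constructor argument is a callable returning the zipped frontend assets
    # to serve, which is what the lambda over asset_path provides below.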
    gr_tensorboard = TensorBoard(plugins, lambda: open(asset_path, 'rb'))
    gr_tensorboard.configure(sys.argv)

    use_filesystem_controller = gr_tensorboard.flags.use_filesystem_controller
    original_logdir = pathlib.Path(gr_tensorboard.flags.logdir)
    loader.actual_logdir = str(original_logdir)
    if use_filesystem_controller:
        # Retrieve the actual log directory and replace it in the context with the new logdir
        parent_dir = original_logdir.parent
        print("logdir provided: " + str(original_logdir))
        new_logdir = parent_dir / "temp_dir"
        print("creating temporary workspace in " + str(new_logdir))

        # Create the temp dir
        new_logdir.mkdir(parents=True)

        # swap the original logdir for the new one
        gr_tensorboard.flags.logdir = str(new_logdir)

    try:
        sys.exit(gr_tensorboard.main())
    finally:
        if use_filesystem_controller:
            shutil.rmtree(str(new_logdir))
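
This snippet assumes the standard embedded-TensorBoard entry point plus two project-specific plugin modules; a minimal sketch of the imports it relies on (the plugin import locations are illustrative placeholders):

import sys
import shutil
import pathlib

from tensorboard import default
from tensorboard.program import TensorBoard

# Project-specific plugins -- these import locations are illustrative only.
import paramplot_plugin
import runsenabler_loader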
Example No. 2
def run_main(asset_path):
    plugins = default.get_plugins() + [
        paramplot_plugin.ParamPlotPlugin,
        runsenabler_loader.RunsEnablerLoader()
    ]
    gr_tensorboard = TensorBoard(plugins, lambda: open(asset_path, 'rb'))
    gr_tensorboard.configure(sys.argv)

    sys.exit(gr_tensorboard.main())
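
A hedged usage sketch for the function above (the asset path is illustrative; TensorBoard flags such as --logdir are read from sys.argv by configure()):

if __name__ == '__main__':
    # Path to the zipped frontend assets served by TensorBoard (illustrative).
    run_main('path/to/assets.zip')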
Example No. 3
def launch_tensorboard(log_path: str) -> str:
    """
    Launch tensorboard at given log path.
    :param log_path: log path
    :return: tensorboard url
    """
    tb = TensorBoard()
    tb.configure((None, "--logdir", log_path))
    url = tb.launch()
    return url
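
Assuming event files already exist under the given directory, usage reduces to (the path is illustrative):

url = launch_tensorboard("./logs")
print(f"TensorBoard available at {url}")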
Example No. 4
def initialize(log_root_path: Optional[str] = None,
               log_name: Optional[str] = None,
               verbose: bool = True) -> SummaryWriter:
    """Initialize Tensorboard daemon.

    .. note::
        It will be used later for monitoring the learning progress.

    :param log_root_path: Full path of the root log directory.
                          Optional: the `log` folder next to this file by
                          default.
    :param log_name: Name of the subdirectory where to save data.
                     Optional: full date _ hostname by default.
    :param verbose: Whether or not to print information about what is going on.
                    Optional: True by default.

    :returns: SummaryWriter to pass to the training agent to monitor the
              training progress.
    """
    # Configure Tensorboard
    if log_root_path is None:
        log_root_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "log")
    if 'tb' not in locals():
        tb = TensorBoard()
        tb.configure(host="0.0.0.0", logdir=log_root_path)
        url = tb.launch()
        if verbose:
            print(f"Started Tensorboard {url}. "
                  f"Root directory: {log_root_path}")

    # Create log directory
    if log_name is None:
        log_name = "_".join((datetime.now().strftime("%Y_%m_%d_%H_%M_%S"),
                            socket.gethostname().replace('-', '_')))
    log_path = os.path.join(log_root_path, log_name)
    pathlib.Path(log_path).mkdir(parents=True, exist_ok=True)
    if verbose:
        print(f"Tensorboard logfiles directory: {log_path}")

    return SummaryWriter(log_path)
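
A hedged usage sketch of the function above (paths, tag names and values are illustrative):

writer = initialize(log_root_path="/tmp/tb_logs", log_name="experiment_0")
for step in range(100):
    # Log a dummy scalar so something shows up in Tensorboard.
    writer.add_scalar("train/loss", 1.0 / (step + 1), step)
writer.close()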
Example No. 5
ray.init(
    address=None,  # The address of the Ray cluster to connect to, if any.
    num_cpus=8,  # Number of CPUs assigned to each raylet (None = no limit)
    num_gpus=1,  # Number of GPUs assigned to each raylet (None = no limit)
    webui_host="0.0.0.0",  # The host to bind the web UI server to.
    local_mode=False,  # If true, the code will be executed serially (for debugging purposes)
    logging_level=20  # Logging level (20 == logging.INFO)
)

# # Create tensorboard Jupyter cell
# %load_ext tensorboard
# %tensorboard --logdir logs
if 'tb' not in locals():
    tb = TensorBoard()
    tb.configure(host="0.0.0.0",
                 logdir=os.path.join(pathlib.Path.home(), 'ray_results'))
    url = tb.launch()
    print(f"Starting Tensorboard {url} ...")

# ================= Run hyperparameter search =================

# Register the custom model architecture (it implements 'vf_share_layers')
ModelCatalog.register_custom_model("my_model", FullyConnectedNetwork)

# Register the environment with custom default constructor arguments
env_creator = lambda env_config: gym.make(GYM_ENV_NAME, **GYM_ENV_KWARGS)
register_env("my_custom_env", env_creator)
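
The snippet stops just before the search itself is launched; a minimal sketch of what that next step typically looks like with Ray Tune and RLlib (the algorithm, hyperparameters and stopping criterion are illustrative):

from ray import tune

analysis = tune.run(
    "PPO",  # RLlib algorithm to tune (illustrative choice)
    config={
        "env": "my_custom_env",                 # environment registered above
        "model": {"custom_model": "my_model"},  # custom network registered above
        "num_workers": 4,                       # illustrative resource setting
    },
    stop={"episode_reward_mean": 200},          # illustrative stopping criterion
)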

Example No. 6
            best_reward = result['rew']
            best_epoch = epoch
            if save_fn:
                save_fn(policy)
        if verbose:
            print(f'Epoch #{epoch}: test_reward: {result["rew"]:.6f}, '
                  f'best_reward: {best_reward:.6f} in #{best_epoch}')
        if stop_fn and stop_fn(best_reward):
            break
    return gather_info(
        start_time, train_collector, test_collector, best_reward)

### Configure Tensorboard
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'log')
if 'tb' not in locals():
    tb = TensorBoard()
    tb.configure(host="0.0.0.0", logdir=data_path)
    url = tb.launch()
    print(f"Started Tensorboard {url} at {data_path}...")
writer = SummaryWriter(data_path)

### Configure export
def save_fn(policy):
    torch.save(policy.state_dict(), os.path.join(data_path, 'policy.pth'))
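
For completeness, the exported policy can later be restored with the usual PyTorch pattern (a hedged sketch; `policy` must first be rebuilt with the same architecture):

# Rebuild the policy with the same architecture, then restore its weights.
policy.load_state_dict(torch.load(os.path.join(data_path, 'policy.pth')))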

### Configure early stopping of training
def stop_fn(x):
    return x >= TARGET_EPISODE_STEPS

### Run the learning process
result = onpolicy_trainer(
Example No. 7
def initialize(num_cpus: int,
               num_gpus: int,
               log_root_path: str,
               log_name: Optional[str] = None,
               logger_cls: type = TBXLogger,
               launch_tensorboard: bool = True,
               debug: bool = False,
               verbose: bool = True) -> Callable[[Dict[str, Any]], Logger]:
    """Initialize Ray and Tensorboard daemons.

    It will be used later for almost everything, from the dashboard and
    remote/client management to multithreaded environments.

    .. note::
        The default Tensorboard port, namely 6006, will be used if available,
        binding to 0.0.0.0 (all IPv4 addresses on the local machine).
        Similarly, the Ray dashboard uses port 8265 if available. In both
        cases, the port is incremented iteratively until an available one is
        found.

    :param num_cpus: Maximum number of CPU threads that can be executed in
                     parallel. Note that it does not actually reserve part of
                     the CPU, so several processes may each reserve up to the
                     number of threads available on the system at the same
                     time.
    :param num_gpus: Maximum number of GPU units that can be used, which can
                     be fractional to only allocate part of the resource. Note
                     that contrary to CPU resources, the memory is likely to
                     actually be reserved and allocated by the process, in
                     particular when using the Tensorflow backend.
    :param log_root_path: Full path of the root log directory.
    :param log_name: Name of the subdirectory where to save data. `None` to
                     use default name, empty string '' to set it interactively
                     in command prompt. It must be a valid Python identifier.
                     Optional: full date _ hostname by default.
    :param logger_cls: Custom logger class type deriving from `TBXLogger`.
                       Optional: `TBXLogger` by default.
    :param launch_tensorboard: Whether or not to launch tensorboard
                               automatically.
                               Optional: Enabled by default.
    :param debug: Whether or not to display debugging trace.
                  Optional: Disabled by default.
    :param verbose: Whether or not to print information about what is going on.
                    Optional: True by default.

    :returns: Logger creator function to pass to a `ray.Trainer` so that the
              learning progress can be monitored in Tensorboard.
    """
    # Make sure provided logger class derives from ray.tune.logger.Logger
    assert issubclass(logger_cls, Logger), (
        "Logger class must derive from `ray.tune.logger.Logger`")

    # Check if cluster servers are already running, and if requested resources
    # are available.
    is_cluster_running = False
    redis_addresses = services.find_redis_address()
    if redis_addresses:
        for redis_address in redis_addresses:
            # Connect to redis global state accessor
            global_state_accessor = GlobalStateAccessor(
                redis_address, ray_constants.REDIS_DEFAULT_PASSWORD)
            global_state_accessor.connect()

            # Get available resources
            resources: Dict[str, int] = defaultdict(int)
            for info in global_state_accessor.get_all_available_resources():
                # pylint: disable=no-member
                message = ray.gcs_utils.AvailableResources.FromString(info)
                for field, capacity in message.resources_available.items():
                    resources[field] += capacity

            # Disconnect global state accessor
            time.sleep(0.1)
            global_state_accessor.disconnect()

            # Check if enough computation resources are available
            is_cluster_running = (resources["CPU"] >= num_cpus and
                                  resources["GPU"] >= num_gpus)

            # Stop looking as soon as a cluster with enough resources is found
            if is_cluster_running:
                break

    # Connect to Ray server if necessary, starting one if not already running
    if not ray.is_initialized():
        if not is_cluster_running:
            # Start new Ray server, if not already running
            ray.init(
                # Address of Ray cluster to connect to, if any
                address=None,
                # Number of CPUs assigned to each raylet
                num_cpus=num_cpus,
                # Number of GPUs assigned to each raylet
                num_gpus=num_gpus,
                # Enable object eviction in LRU order under memory pressure
                _lru_evict=False,
                # Whether or not to execute the code serially (for debugging)
                local_mode=debug,
                # Logging level
                logging_level=logging.DEBUG if debug else logging.ERROR,
                # Whether to redirect outputs from every worker to the driver
                log_to_driver=debug,
                # Whether to start Ray dashboard, to monitor cluster's status
                include_dashboard=True,
                # The host to bind the dashboard server to
                dashboard_host="0.0.0.0")
        else:
            # Connect to existing Ray cluster
            ray.init(
                address="auto",
                _lru_evict=False,
                local_mode=debug,
                logging_level=logging.DEBUG if debug else logging.ERROR,
                log_to_driver=debug,
                include_dashboard=False)

    # Configure Tensorboard
    if launch_tensorboard:
        tb = TensorBoard()
        tb.configure(host="0.0.0.0", logdir=os.path.abspath(log_root_path))
        url = tb.launch()
        if verbose:
            print(f"Started Tensorboard {url}.",
                  f"Root directory: {log_root_path}")

    # Define log filename interactively if requested
    if log_name == "":
        while True:
            log_name = input(
                "Enter desired log subdirectory name (empty for default)...")
            if not log_name or re.match(r'^[A-Za-z0-9_]+$', log_name):
                break
            print("Unvalid name. Only Python identifiers are supported.")

    # Handling of default log name and sanity checks
    if not log_name:
        log_name = "_".join((
            datetime.now().strftime("%Y_%m_%d_%H_%M_%S"),
            re.sub(r'[^A-Za-z0-9_]', "_", socket.gethostname())))
    else:
        assert re.match(r'^[A-Za-z0-9_]+$', log_name), (
            "Log name must be a valid Python identifier.")

    # Create log directory
    log_path = os.path.join(log_root_path, log_name)
    pathlib.Path(log_path).mkdir(parents=True, exist_ok=True)
    if verbose:
        print(f"Tensorboard logfiles directory: {log_path}")

    # Define Ray logger
    def logger_creator(config: Dict[str, Any]) -> Logger:
        return logger_cls(config, log_path)

    return logger_creator
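
A hedged usage sketch of the function above with an RLlib trainer (resource counts, paths and the algorithm choice are illustrative):

from ray.rllib.agents.ppo import PPOTrainer  # illustrative algorithm choice

logger_creator = initialize(num_cpus=8, num_gpus=1,
                            log_root_path="/tmp/ray_logs",
                            log_name="ppo_experiment")
trainer = PPOTrainer(env="my_custom_env",
                     config={"num_workers": 4},
                     logger_creator=logger_creator)
for _ in range(10):
    print(trainer.train()["episode_reward_mean"])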