Example #1
    def _get_cluster(gpu_cluster: bool = True, fill: bool = True, **kwargs):
        """Build a Cluster from keyword arguments.

        When ``fill`` is True, the cluster is also populated with an
        orchestrator instance and ``factories_num`` factory instances
        (GPU or CPU, depending on ``gpu_cluster``).

        """
        name = kwargs.get("name", "cluster")
        factories_num = kwargs.get("factories_num", 2)
        username = kwargs.get("username", "ubuntu")
        key = kwargs.get("key", "/path/to/key")
        setup_cmds = kwargs.get("setup_cmds", [])

        cluster = Cluster(name=name,
                          factories_num=factories_num,
                          username=username,
                          key=key,
                          setup_cmds=setup_cmds)

        if fill:
            cluster.orchestrator = get_instance(instance.OrchestratorInstance)
            if gpu_cluster:
                cluster.factories = [
                    get_instance(instance.GPUFactoryInstance)
                    for _ in range(cluster.factories_num)
                ]
            else:
                cluster.factories = [
                    get_instance(instance.CPUFactoryInstance)
                    for _ in range(cluster.factories_num)
                ]

        return cluster
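
A minimal usage sketch for this helper, written as a pytest-style check. The test name is hypothetical; the assertions only touch attributes that the helper itself sets:

    # Hypothetical test; assumes the same module-level imports as the
    # helper above (Cluster, instance, get_instance).
    def test_get_cluster_cpu_factories():
        cluster = _get_cluster(gpu_cluster=False, factories_num=3)
        assert cluster.factories_num == 3
        assert cluster.orchestrator is not None
        # fill=True creates one CPU factory instance per factories_num
        assert len(cluster.factories) == 3
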
Example #2
    def setup_inject_env(self, cluster: Cluster, extensions: Dict[str, str],
                         force: bool, **kwargs) -> None:
        """Call setup and inject the RemoteEnvironment

        Parameters
        ----------
        cluster: Cluster
            The cluster where this Runnable will be running
        extensions: Dict[str, str]
            The ClusterRunnable extensions
        force: bool
            The force value provided to Flambe

        """
        self.setup(cluster=cluster,
                   extensions=extensions,
                   force=force,
                   **kwargs)
        self.set_serializable_attr("env", cluster.get_remote_env())
Example #3
    def setup(self, cluster: Cluster, extensions: Dict[str, str], force: bool,
              **kwargs) -> None:
        """Prepare the cluster for the Experiment remote execution.

        This involves:

        1) [Optional] Kill previous flambe execution
        2) [Optional] Remove existing results
        3) Create supporting dirs (exp/synced_results, exp/resources)
        4) Install extensions in all factories
        5) Launch ray cluster
        6) Send resources
        7) Launch Tensorboard + Report site

        Parameters
        ----------
        cluster: Cluster
            The cluster where this Runnable will be running
        extensions: Dict[str, str]
            The ClusterRunnable extensions
        force: bool
            The force value provided to Flambe

        """

        if (cluster.existing_flambe_execution()
                or cluster.existing_ray_cluster()):
            if not force:
                raise man_errors.ClusterError(
                    "This cluster is currently used by another "
                    "experiment. Use the --force flag to reuse it. Aborting.")
            else:
                cluster.shutdown_flambe_execution()
                cluster.shutdown_ray_cluster()
                logger.info(cl.YE("Forced resource to become available..."))

        output_dir_remote = f"{self.name}/{self.output_folder_name}"
        if cluster.existing_dir(output_dir_remote):
            logger.debug("This cluster already ran an experiment "
                         "with the same name.")

            if self.resume:
                logger.info(cl.YE("Resuming previous experiment..."))
            elif force:
                cluster.remove_dir(output_dir_remote,
                                   content_only=True,
                                   all_hosts=True)
            else:
                raise man_errors.ClusterError(
                    "This cluster already has results for the same experiment "
                    "name. If you wish to reuse them, use resume: True; if you "
                    "want to override them, use --force. Aborting.")

        cluster.install_extensions_in_factories(extensions)
        logger.info(cl.YE("Extensions installed in all factories"))

        # Redundant runtime check so the type checker knows the orchestrator exists
        if not cluster.orchestrator:
            raise man_errors.ClusterError(
                "The orchestrator needs to exist at this point")

        cluster.create_dirs([
            self.name, f"{self.name}/{self.output_folder_name}",
            f"{self.name}/{self.output_folder_name}/_resources"
        ])
        logger.info(cl.YE("Created supporting directories"))

        cluster.launch_ray_cluster()

        if not cluster.check_ray_cluster():
            raise man_errors.ClusterError(
                "Ray cluster not launched correctly.")

        local_resources = {
            k: v
            for k, v in self.resources.items()
            if not isinstance(v, ClusterResource)
        }

        tmp_resources_dir = tempfile.TemporaryDirectory()

        # This will download remote resources.
        local_resources = self.process_resources(
            local_resources, tmp_resources_dir.name)  # type: ignore

        local_resources = cast(Dict[str, str], local_resources)

        if local_resources:
            new_resources = cluster.send_local_content(
                local_resources,
                os.path.join(cluster.orchestrator.get_home_path(), self.name,
                             self.output_folder_name, "_resources"),
                all_hosts=True)
        else:
            new_resources = dict()

        tmp_resources_dir.cleanup()

        # Add the cluster resources without the tag
        new_resources.update({
            k: v.location
            for k, v in self.resources.items()
            if isinstance(v, ClusterResource)
        })

        if cluster.orchestrator.is_tensorboard_running():
            if force:
                cluster.orchestrator.remove_tensorboard()
            else:
                raise man_errors.ClusterError(
                    "Tensorboard was running on the orchestrator.")

        cluster.orchestrator.launch_tensorboard(output_dir_remote,
                                                const.TENSORBOARD_PORT)

        if cluster.orchestrator.is_report_site_running():
            if force:
                cluster.orchestrator.remove_report_site()
            else:
                raise man_errors.ClusterError(
                    "Report site was running on the orchestrator")

        cluster.orchestrator.launch_report_site(
            f"{output_dir_remote}/state.pkl",
            port=const.REPORT_SITE_PORT,
            output_log="output.log",
            output_dir=output_dir_remote,
            tensorboard_port=const.TENSORBOARD_PORT)

        self.set_serializable_attr("resources", new_resources)
        self.set_serializable_attr("devices", cluster.get_max_resources())
        self.set_serializable_attr(
            "save_path", f"{cluster.orchestrator.get_home_path()}/{self.name}")