def _get_cluster(gpu_cluster: bool = True, fill: bool = True, **kwargs):
    """Build a Cluster, optionally filled with mocked instances.

    Keyword arguments override the default name, number of factories,
    username, key and setup commands.
    """
    name = kwargs.get('name', 'cluster')
    factories_num = kwargs.get('factories_num', 2)
    username = kwargs.get("username", "ubuntu")
    key = kwargs.get("key", "/path/to/key")
    setup_cmds = kwargs.get("setup_cmds", [])

    cluster = Cluster(name=name,
                      factories_num=factories_num,
                      username=username,
                      key=key,
                      setup_cmds=setup_cmds)

    if fill:
        cluster.orchestrator = get_instance(instance.OrchestratorInstance)
        if gpu_cluster:
            cluster.factories = [
                get_instance(instance.GPUFactoryInstance)
                for _ in range(cluster.factories_num)
            ]
        else:
            cluster.factories = [
                get_instance(instance.CPUFactoryInstance)
                for _ in range(cluster.factories_num)
            ]

    return cluster
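# Usage sketch (hypothetical test, assuming Cluster and the mocked instance
# factories above are importable in this module): build a filled GPU cluster
# with the defaults, and a bare CPU cluster skeleton with no instances.
def _example_get_cluster_usage():
    gpu = _get_cluster()  # gpu_cluster=True, fill=True, 2 GPU factories
    assert len(gpu.factories) == gpu.factories_num

    # fill=False returns the Cluster object only; no orchestrator or
    # factory instances are attached.
    cpu = _get_cluster(gpu_cluster=False, fill=False, name='cpu-cluster')
    return gpu, cpu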
def setup_inject_env(self,
                     cluster: Cluster,
                     extensions: Dict[str, str],
                     force: bool,
                     **kwargs) -> None:
    """Call setup and inject the RemoteEnvironment.

    Parameters
    ----------
    cluster: Cluster
        The cluster where this Runnable will be running
    extensions: Dict[str, str]
        The ClusterRunnable extensions
    force: bool
        The force value provided to Flambe

    """
    self.setup(cluster=cluster, extensions=extensions, force=force, **kwargs)
    self.set_serializable_attr("env", cluster.get_remote_env())
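# Usage sketch (hypothetical): callers would normally go through
# setup_inject_env rather than setup directly, so that the remote environment
# is serialized alongside the Runnable. `runnable` and `cluster` are
# illustrative placeholders, e.g. an Experiment and a Cluster built above.
def _example_setup_inject_env(runnable, cluster):
    runnable.setup_inject_env(cluster=cluster, extensions={}, force=False)
    # setup_inject_env first runs setup(...) and then stores the remote
    # environment, so the attribute is available after the call:
    return getattr(runnable, "env", None)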
def setup(self,
          cluster: Cluster,
          extensions: Dict[str, str],
          force: bool,
          **kwargs) -> None:
    """Prepare the cluster for the Experiment's remote execution.

    This involves:

    1) [Optional] Kill the previous flambe execution
    2) [Optional] Remove existing results
    3) Create supporting dirs (exp/synced_results, exp/resources)
    4) Install extensions in all factories
    5) Launch the ray cluster
    6) Send resources
    7) Launch Tensorboard + the report site

    Parameters
    ----------
    cluster: Cluster
        The cluster where this Runnable will be running
    extensions: Dict[str, str]
        The ClusterRunnable extensions
    force: bool
        The force value provided to Flambe

    """
    if cluster.existing_flambe_execution() or cluster.existing_ray_cluster():
        if not force:
            raise man_errors.ClusterError(
                "This cluster is currently being used by another "
                "experiment. Use the --force flag to reuse it. Aborting.")
        else:
            cluster.shutdown_flambe_execution()
            cluster.shutdown_ray_cluster()
            logger.info(cl.YE("Forced resource to become available..."))

    output_dir_remote = f"{self.name}/{self.output_folder_name}"
    if cluster.existing_dir(output_dir_remote):
        logger.debug("This cluster already ran an experiment "
                     "with the same name.")

        if self.resume:
            logger.info(cl.YE("Resuming previous experiment..."))
        elif force:
            cluster.remove_dir(output_dir_remote, content_only=True, all_hosts=True)
        else:
            raise man_errors.ClusterError(
                "This cluster already has results for the same experiment name. "
                "If you wish to reuse them, use resume: True, or if you want to "
                "override them, use --force. Aborting.")

    cluster.install_extensions_in_factories(extensions)
    logger.info(cl.YE("Extensions installed in all factories"))

    # Redundant check so mypy can narrow the Optional orchestrator type
    if not cluster.orchestrator:
        raise man_errors.ClusterError("The orchestrator needs to exist at this point")

    cluster.create_dirs([
        self.name,
        f"{self.name}/{self.output_folder_name}",
        f"{self.name}/{self.output_folder_name}/_resources"
    ])
    logger.info(cl.YE("Created supporting directories"))

    cluster.launch_ray_cluster()

    if not cluster.check_ray_cluster():
        raise man_errors.ClusterError("Ray cluster not launched correctly.")

    local_resources = {k: v for k, v in self.resources.items()
                       if not isinstance(v, ClusterResource)}

    tmp_resources_dir = tempfile.TemporaryDirectory()

    # This will download remote resources.
    local_resources = self.process_resources(
        local_resources, tmp_resources_dir.name)  # type: ignore

    local_resources = cast(Dict[str, str], local_resources)

    if local_resources:
        new_resources = cluster.send_local_content(
            local_resources,
            os.path.join(cluster.orchestrator.get_home_path(),
                         self.name,
                         self.output_folder_name,
                         "_resources"),
            all_hosts=True)
    else:
        new_resources = dict()

    tmp_resources_dir.cleanup()

    # Add the cluster resources without the tag
    new_resources.update({
        k: v.location for k, v in self.resources.items()
        if isinstance(v, ClusterResource)
    })

    if cluster.orchestrator.is_tensorboard_running():
        if force:
            cluster.orchestrator.remove_tensorboard()
        else:
            raise man_errors.ClusterError(
                "Tensorboard was running on the orchestrator.")

    cluster.orchestrator.launch_tensorboard(output_dir_remote, const.TENSORBOARD_PORT)

    if cluster.orchestrator.is_report_site_running():
        if force:
            cluster.orchestrator.remove_report_site()
        else:
            raise man_errors.ClusterError(
                "The report site was running on the orchestrator.")

    cluster.orchestrator.launch_report_site(
        f"{output_dir_remote}/state.pkl",
        port=const.REPORT_SITE_PORT,
        output_log="output.log",
        output_dir=output_dir_remote,
        tensorboard_port=const.TENSORBOARD_PORT)

    self.set_serializable_attr("resources", new_resources)
    self.set_serializable_attr("devices", cluster.get_max_resources())
    self.set_serializable_attr(
        "save_path", f"{cluster.orchestrator.get_home_path()}/{self.name}")
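# Usage sketch (hypothetical names): how the force/resume flags interact with
# a cluster that is busy or already holds results, per the checks in setup()
# above. `experiment` and `cluster` are illustrative placeholders.
def _example_setup_force(experiment, cluster):
    try:
        # Raises if the cluster is running another experiment or already
        # has results under this experiment's name.
        experiment.setup(cluster=cluster, extensions={}, force=False)
    except man_errors.ClusterError:
        # Either set resume: True in the config to reuse the existing
        # results, or wipe the remote output dir and retry with force=True.
        experiment.setup(cluster=cluster, extensions={}, force=True)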