Example #1
    def _create_factories(self,
                          number: int = 1) -> List[FactoryInsT]:
        """Creates new AWS EC2 instances to be the Factory instances.

        These new machines receive all tags defined in the *.ini file.
        Factory instances will be named using the factory basename plus
        an index. For example, "seq2seq_factory_0", "seq2seq_factory_1".

        Parameters
        ----------
        number : int
            The number of factories to be created.

        Returns
        -------
        List[FactoryInsT]
            The new factory instances (hosts with GPUs are rewrapped as
            GPUFactoryInstance after launch).

        """
        if not self.factory_ami:
            ami = utils._find_default_ami(_type="factory")
            if ami is None:
                raise errors.ClusterError("Could not find matching AMI for the factory.")
        else:
            ami = self.factory_ami

        factories = self._generic_launch_instances(instance.CPUFactoryInstance,
                                                   number, self.factories_type,
                                                   ami, role="Factory")

        for i, f in enumerate(factories):
            f.wait_until_accessible()
            if f.contains_gpu():
                factories[i] = instance.GPUFactoryInstance(f.host, f.private_host, f.username,
                                                           self.key, self.config, self.debug)

        return factories
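
The CPU-to-GPU rewrap loop above is worth noting: instances are launched generically and only promoted once they are reachable. Below is a minimal, standalone sketch of that pattern; the classes and the contains_gpu() probe are stand-ins for illustration, not flambe's API.

from typing import List


class CPUFactory:
    """Stand-in for a generic factory instance."""

    def __init__(self, host: str) -> None:
        self.host = host

    def contains_gpu(self) -> bool:
        # Real code would probe the live host (e.g. run nvidia-smi over
        # SSH); here we fake the check from the hostname.
        return self.host.endswith("-gpu")


class GPUFactory(CPUFactory):
    """Stand-in for a GPU-capable factory instance."""


def promote_gpu_factories(factories: List[CPUFactory]) -> List[CPUFactory]:
    # Rewrap in place so callers keep a single, ordered list of factories.
    for i, f in enumerate(factories):
        if f.contains_gpu():
            factories[i] = GPUFactory(f.host)
    return factories


hosts = [CPUFactory("host-a"), CPUFactory("host-b-gpu")]
print([type(f).__name__ for f in promote_gpu_factories(hosts)])
# ['CPUFactory', 'GPUFactory']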
Example #2
    def _existing_cluster(self) -> Tuple[Any, List[Any]]:
        """Whether there is an existing cluster that matches name.

        The cluster should also match all other tags, including Creator)

        Returns
        -------
        Tuple[Any, List[Any]]
            Returns the (boto_orchestrator, [boto_factories])
            that match the experiment's name.

        """
        candidates: List[Tuple[Any, str]] = []
        for ins, role, cluster_name in self.flambe_own_running_instances():
            if role and cluster_name:
                if cluster_name == self.name:
                    candidates.append((ins, role))
                    logger.debug(f"Found existing {role} host {ins.public_ip_address}")

        orchestrator = None
        factories = []

        for ins, role in candidates:
            if role == 'Orchestrator':
                if orchestrator:
                    raise errors.ClusterError(
                        "Found 2 Orchestrator instances with same experiment name. " +
                        "This should never happen. " +
                        "Please remove manually all instances with tag " +
                        f"'Cluster-Name': '{self.name}' and retry."
                    )

                orchestrator = ins
            elif role == 'Factory':
                factories.append(ins)

        return orchestrator, factories
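
For context, a scan like flambe_own_running_instances() can be backed by boto3 tag filters. A hedged sketch, assuming the 'Role' and 'Cluster-Name' tag keys referenced above; the helper itself is illustrative, not flambe's implementation.

import boto3


def running_cluster_instances(cluster_name: str):
    """Yield (instance, role, cluster_name) for matching running instances."""
    ec2 = boto3.resource("ec2")
    filters = [
        {"Name": "tag:Cluster-Name", "Values": [cluster_name]},
        {"Name": "instance-state-name", "Values": ["running"]},
    ]
    for ins in ec2.instances.filter(Filters=filters):
        tags = {t["Key"]: t["Value"] for t in (ins.tags or [])}
        yield ins, tags.get("Role"), tags.get("Cluster-Name")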
Example #3
    def _create_orchestrator(self) -> instance.OrchestratorInstance:
        """Create a new EC2 instance to be the Orchestrator instance.

        This new machine receives all tags defined in the *.ini file.

        Returns
        -------
        instance.OrchestratorInstance
            The new orchestrator instance.

        """
        if not self.orchestrator_ami:
            ami = self._find_default_ami(_type="orchestrator")
            if ami is None:
                raise errors.ClusterError(
                    "Could not find matching AMI for the orchestrator.")
        else:
            ami = self.orchestrator_ami

        return self._generic_launch_instances(instance.OrchestratorInstance,
                                              1,
                                              self.orchestrator_type,
                                              ami,
                                              role="Orchestrator")[0]
Example #4
    def remove_dir(self, _dir: str, content_only: bool = True, all_hosts: bool = True) -> None:
        """ Remove a directory in the ClusterError

        Parameters
        ----------
        _dir: str
            The directory to remove
        content_only: bool
            Whether to remove only the contents or also the folder itself.
            Defaults to True.
        all_hosts: bool
            Whether to remove it on all hosts or only on the Orchestrator.
            Defaults to True (all hosts).

        """
        if not self.orchestrator:
            raise man_errors.ClusterError("Orchestrator instance was not loaded.")

        if all_hosts:
            for ins in self._get_all_hosts():
                ins.remove_dir(_dir, content_only)
        else:
            self.orchestrator.remove_dir(_dir, content_only)
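
The content_only flag maps naturally onto two different shell commands on each host. A minimal sketch of that split; the SSH execution layer the instances use is assumed and omitted here.

import shlex


def remove_dir_cmd(path: str, content_only: bool = True) -> str:
    """Build the shell command a host would run to remove a directory."""
    quoted = shlex.quote(path.rstrip("/"))
    if content_only:
        # Empty the directory (the glob skips dotfiles) but keep the
        # directory itself.
        return f"rm -rf {quoted}/*"
    return f"rm -rf {quoted}"


print(remove_dir_cmd("/home/ubuntu/exp/synced_results"))
# rm -rf /home/ubuntu/exp/synced_results/*
print(remove_dir_cmd("/home/ubuntu/exp", content_only=False))
# rm -rf /home/ubuntu/exp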
Example #5
    def send_local_content(self,
                           content: Dict[str, str],
                           dest: str,
                           all_hosts: bool = False) -> Dict[str, str]:
        """Send local content to the cluster

        Parameters
        ----------
        content: Dict[str, str]
            The dict of resources key -> local path
        dest: str
            The orchestrator's destination folder
        all_hosts: bool
            If False, only send the content to the orchestrator.
            If True, send to all factories.

        Returns
        -------
        Dict[str, str]
            The new dict of content with orchestrator's paths.

        """
        ret = {}

        # The orchestrator needs to exist at this point
        if not self.orchestrator:
            raise man_errors.ClusterError(
                "Orchestrator instance was not loaded.")

        for k, c in content.items():
            c = os.path.expanduser(c)
            base: str = ""
            if os.path.exists(c):
                size = get_size_MB(c)
                if size > UPLOAD_WARN_LIMIT_MB:
                    logger.info(cl.YE(
                        f"Uploading '{c}' ({int(size)} MB) which may take a while. "
                        "Double check you want to be transferring this file "
                        "(note we automatically sync extensions, experiment resources "
                        "and potentially the flambe repo if installed in dev mode)"))

                if os.path.isdir(c):
                    if not c.endswith(os.sep):
                        c = f"{c}{os.sep}"
                    base = os.path.basename(os.path.dirname(c))
                elif os.path.isfile(c):
                    base = os.path.basename(c)

                new_c = os.path.join(dest, f"{k}__{base}")
                self.orchestrator.send_rsync(c, new_c)
                logger.debug(f"Content {k}: {c} sent to cluster")

                ret[k] = new_c
            else:
                ret[k] = c

        if all_hosts:
            self.rsync_orch(dest)

        return ret
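
The remote naming scheme above is "<key>__<basename>", which keeps resources with colliding local names distinct under one destination folder. A standalone sketch of the same remapping:

import os


def remote_path(key: str, local: str, dest: str) -> str:
    """Mirror the key__basename remapping used by send_local_content."""
    local = os.path.expanduser(local)
    # Directories contribute their own name; files contribute their basename.
    # Note os.path.isdir() consults the real filesystem.
    if os.path.isdir(local) and not local.endswith(os.sep):
        local += os.sep
    if local.endswith(os.sep):
        base = os.path.basename(os.path.dirname(local))
    else:
        base = os.path.basename(local)
    return os.path.join(dest, f"{key}__{base}")


print(remote_path("train", "/data/corpus/train.csv", "/home/ubuntu/_resources"))
# /home/ubuntu/_resources/train__train.csv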
Example #6
    def setup(self, cluster: Cluster, extensions: Dict[str, str], force: bool,
              **kwargs) -> None:
        """Prepare the cluster for the Experiment remote execution.

        This involves:

        1) [Optional] Kill previous flambe execution
        2) [Optional] Remove existing results
        3) Create supporting dirs (exp/synced_results, exp/resources)
        4) Install extensions in all factories
        5) Launch ray cluster
        6) Send resources
        7) Launch Tensorboard + Report site

        Parameters
        ----------
        cluster: Cluster
            The cluster where this Runnable will be running
        extensions: Dict[str, str]
            The ClusterRunnable extensions
        force: bool
            The force value provided to Flambe

        """

        if cluster.existing_flambe_execution() or cluster.existing_ray_cluster():
            if not force:
                raise man_errors.ClusterError(
                    "This cluster is currently used by another experiment. "
                    "Use --force flag to reuse it. Aborting.")
            else:
                cluster.shutdown_flambe_execution()
                cluster.shutdown_ray_cluster()
                logger.info(cl.YE("Forced resource to become available..."))

        output_dir_remote = f"{self.name}/{self.output_folder_name}"
        if cluster.existing_dir(output_dir_remote):
            logger.debug("This cluster already ran an experiment " +
                         "with the same name.")

            if self.resume:
                logger.info(cl.YE("Resuming previous experiment..."))
            elif force:
                cluster.remove_dir(output_dir_remote,
                                   content_only=True,
                                   all_hosts=True)
            else:
                raise man_errors.ClusterError(
                    "This cluster already has results for the same experiment name. "
                    "If you wish to reuse them, use resume: True or if you want "
                    "to override them use --force. Aborting.")

        cluster.install_extensions_in_factories(extensions)
        logger.info(cl.YE("Extensions installed in all factories"))

        # Add redundant check for typing
        if not cluster.orchestrator:
            raise man_errors.ClusterError(
                "The orchestrator needs to exist at this point")

        cluster.create_dirs([
            self.name, f"{self.name}/{self.output_folder_name}",
            f"{self.name}/{self.output_folder_name}/_resources"
        ])
        logger.info(cl.YE("Created supporting directories"))

        cluster.launch_ray_cluster()

        if not cluster.check_ray_cluster():
            raise man_errors.ClusterError(
                "Ray cluster not launched correctly.")

        local_resources = {
            k: v
            for k, v in self.resources.items()
            if not isinstance(v, ClusterResource)
        }

        tmp_resources_dir = tempfile.TemporaryDirectory()

        # This will download remote resources.
        local_resources = self.process_resources(
            local_resources, tmp_resources_dir.name)  # type: ignore

        local_resources = cast(Dict[str, str], local_resources)

        if local_resources:
            new_resources = cluster.send_local_content(
                local_resources,
                os.path.join(cluster.orchestrator.get_home_path(), self.name,
                             self.output_folder_name, "_resources"),
                all_hosts=True)
        else:
            new_resources = dict()

        tmp_resources_dir.cleanup()

        # Add the cluster resources without the tag
        new_resources.update({
            k: v.location
            for k, v in self.resources.items()
            if isinstance(v, ClusterResource)
        })

        if cluster.orchestrator.is_tensorboard_running():
            if force:
                cluster.orchestrator.remove_tensorboard()
            else:
                raise man_errors.ClusterError(
                    "Tensorboard was running on the orchestrator.")

        cluster.orchestrator.launch_tensorboard(output_dir_remote,
                                                const.TENSORBOARD_PORT)

        if cluster.orchestrator.is_report_site_running():
            if force:
                cluster.orchestrator.remove_report_site()
            else:
                raise man_errors.ClusterError(
                    "Report site was running on the orchestrator")

        cluster.orchestrator.launch_report_site(
            f"{output_dir_remote}/state.pkl",
            port=const.REPORT_SITE_PORT,
            output_log=f"output.log",
            output_dir=output_dir_remote,
            tensorboard_port=const.TENSORBOARD_PORT)

        self.set_serializable_attr("resources", new_resources)
        self.set_serializable_attr("devices", cluster.get_max_resources())
        self.set_serializable_attr(
            "save_path", f"{cluster.orchestrator.get_home_path()}/{self.name}")
Example #7
    def load_all_instances(self) -> None:
        """Launch all instances for the experiment.

        This method launches both  the orchestrator and the factories.

        """
        boto_orchestrator, boto_factories = self._existing_cluster()

        with ThreadPoolExecutor() as executor:
            future_orch, future_factories = None, None

            if boto_orchestrator:
                self.orchestrator = self.get_orchestrator(
                    self._get_boto_public_host(boto_orchestrator),
                    self._get_boto_private_host(boto_orchestrator))
                logger.info(cl.BL(
                    f"Found existing orchestrator ({boto_orchestrator.instance_type}) "
                    f"{self.orchestrator.host}"))

            else:
                future_orch = executor.submit(self._create_orchestrator)

            for f in boto_factories:
                factory = self.get_factory(self._get_boto_public_host(f),
                                           self._get_boto_private_host(f))
                if factory.contains_gpu():
                    factory = self.get_gpu_factory(
                        self._get_boto_public_host(f),
                        self._get_boto_private_host(f))
                self.factories.append(factory)

            if len(self.factories) > 0:
                logger.info(cl.BL(
                    f"Found {len(self.factories)} existing factories "
                    f"({str([f.host for f in self.factories])})."))

            pending_new_factories = self.factories_num - len(self.factories)

            logger.debug(f"Creating {pending_new_factories} factories")
            if pending_new_factories > 0:
                future_factories = executor.submit(
                    self._create_factories, number=pending_new_factories)
            elif pending_new_factories < 0:
                logger.info(cl.BL(
                    f"Reusing existing {len(boto_factories)} factories."))

            try:
                if future_orch:
                    self.orchestrator = future_orch.result()
                    logger.info(cl.BL(
                        f"New orchestrator created {self.orchestrator.host}"))

                if future_factories:
                    new_factories = future_factories.result()
                    self.factories.extend(new_factories)
                    logger.info(cl.BL(
                        f"{pending_new_factories} factories {self.factories_type} created "
                        f"({str([f.host for f in new_factories])})."))
            except botocore.exceptions.ClientError as e:
                raise errors.ClusterError(
                    "Error creating the instances. Check that the provided "
                    f"configuration is correct. Original error: {e}")

        self.name_hosts()
        self.update_tags()
        self.remove_existing_events()
        self.create_cloudwatch_events()
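
The executor pattern above launches only the missing pieces in parallel while existing instances are wrapped synchronously. A standalone sketch of the same reuse-then-fill strategy:

from concurrent.futures import ThreadPoolExecutor
import time


def create_factory(i: int) -> str:
    time.sleep(0.1)  # stand-in for the EC2 launch round-trip
    return f"factory-{i}"


existing = ["factory-0"]  # already-running instances found by the scan
wanted = 3

with ThreadPoolExecutor() as executor:
    # Submit only the shortfall; the futures run while we keep working.
    futures = [executor.submit(create_factory, i)
               for i in range(len(existing), wanted)]
    factories = existing + [f.result() for f in futures]

print(factories)  # ['factory-0', 'factory-1', 'factory-2']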