Example #1
File: cluster.py Project: yukw777/flambe
    def run_cmds(self, setup_cmds: List[str]) -> None:
        """Run setup commands in all hosts

        Parameters
        ----------
        setup_cmds: List[str]
            The list of commands

        Raises
        ------
        errors.RemoteCommandError
            If at least one command is not successful in at
            least one host.

        """
        with ThreadPoolExecutor() as executor:
            futures = []

            for ins in self._get_all_hosts():
                futures.append(executor.submit(ins.run_cmds, setup_cmds))

            for f in futures:
                try:
                    f.result()
                except errors.RemoteCommandError:
                    raise
                except Exception as exc:
                    logger.error(
                        'Generated an unknown exception: {}'.format(exc))
                    raise

        logger.info(cl.GR("Custom commands ran successfully in all hosts"))
Example #2
File: cluster.py Project: yukw777/flambe
    def prepare_all_instances(self) -> None:
        """Prepare all the instances (both orchestrator and factories).

        This method assumes that the hosts are running and accessible.
        It will call the 'prepare' method on all hosts.

        """
        with ThreadPoolExecutor() as executor:
            futures = {}

            for ins in self._get_all_hosts():
                futures[executor.submit(ins.prepare)] = ins

            for f in futures.keys():
                try:
                    f.result()
                except errors.RemoteCommandError:
                    raise
                except Exception as exc:
                    logger.error(f'Generated an exception: {exc}')
                    raise
                else:
                    logger.debug(f'{futures[f].host} ready')

        logger.info(cl.GR("All instances prepared"))
Example #3
File: cluster.py Project: yukw777/flambe
    def create_dirs(self, relative_dirs: List[str]) -> None:
        """Create folders in all hostss.

        If some of the already exist, it will do nothing.

        Parameters
        ----------
        relative_dirs: List[str]
            The directories to create. They should be relative paths
            and $HOME of each host will be used to add the prefix.

        """
        with ThreadPoolExecutor() as executor:
            futures = {}

            for ins in self._get_all_hosts():
                futures[executor.submit(ins.create_dirs, relative_dirs)] = ins

            for f in futures.keys():
                try:
                    f.result()
                except errors.RemoteCommandError:
                    raise
                except Exception as exc:
                    logger.error(f'Generated an exception: {exc}')
                    raise
                else:
                    logger.debug(f'{futures[f].host} ready')

        logger.info(cl.GR("All instances prepared"))
Example #4
    def launch_flambe(self, config_file: str, secrets_file: str,
                      force: bool) -> None:
        """Launch flambe execution in the remote host

        Parameters
        ----------
        config_file: str
            The config filename relative to the orchestrator
        secrets_file: str
            The filepath containing the secrets for the orchestrator
        force: bool
            The force parameter that was originally passed to flambe

        """
        force_params = "--force" if force else ""
        cmd = (
            f"tmux new-session -d -s 'flambe' " +
            f"'bash -lc \"flambe {config_file} -i --secrets {secrets_file} " +
            f"{force_params} &> output.log\"'")

        ret = self._run_cmd(cmd)

        if ret.success:
            logger.info(cl.GR("Running flambe in Orchestrator"))
        else:
            raise errors.RemoteCommandError(
                f"Not able to run flambe. {ret.msg}")
Example #5
def install_extensions(extensions: Dict[str, str],
                       user_flag: bool = False) -> None:
    """Install extensions.

    At this point, all extensions must be either local paths or
    valid pypi packages.

    Remote extensions hosted on GitHub must have been downloaded first.

    Parameters
    ----------
    extensions: Dict[str, str]
        Dictionary of extensions
    user_flag: bool
        Use --user flag when running pip install

    """
    cmd = ['python3', '-m', 'pip', 'install', '-U']
    if user_flag:
        cmd.append('--user')
    for ext, resource in extensions.items():
        curr_cmd = cmd[:]

        try:
            if os.path.exists(resource):
                # Package is local
                if os.sep not in resource:
                    resource = f"./{resource}"
            else:
                # Package follows pypi notation: "torch>=0.4.1,<1.1"
                resource = f"{resource}"

            curr_cmd.append(resource)

            output: Union[bytes, str]
            output = subprocess.check_output(curr_cmd,
                                             stderr=subprocess.DEVNULL)

            output = output.decode("utf-8")

            for l in output.splitlines():
                logger.debug(l)
                r = re.search(
                    r'Successfully uninstalled (?P<pkg_name>\D*)-(?P<version>.*)',
                    l)
                if r and 'pkg_name' in r.groupdict():
                    logger.info(
                        cl.RE(f"WARNING: While installing {ext}, " +
                              f"existing {r.groupdict()['pkg_name']}-" +
                              f"{r.groupdict()['version']} was uninstalled."))
        except subprocess.CalledProcessError:
            raise ImportError(f"Could not install package in {resource}")

        logger.info(cl.GR(f"Successfully installed {ext}"))
Example #6
File: cluster.py Project: yukw777/flambe
    def run(self, force: bool = False, **kwargs) -> None:
        """Run a cluster and load all the instances.

        After this method runs, the orchestrator and factories
        objects will be populated.

        If a runnable is provided, then the cluster will execute
        the runnable remotely in the cluster. Currently, only
        ClusterRunnable is supported.

        This method should be idempotent (i.e. if called N times with
        the same configuration, only one cluster will be created).

        Parameters
        ----------
        force: bool, defaults to False
            If true, current executions of the same runnable in the
            cluster will be overridden by a new execution.

        """
        self.load_all_instances()
        logger.info(cl.GR("Cluster loaded"))

        for ins in self._get_all_hosts():
            ins.wait_until_accessible()

        logger.debug("All instances accessible.")
        self.distribute_keys()

        self.create_dirs(["extensions"])
        logger.debug("Created flambe folder to store content")

        if self.setup_cmds is not None:
            self.run_cmds(self.setup_cmds)

        self.prepare_all_instances()
        logger.info(cl.GR("Flambe installed in all hosts"))
Example #7
File: cluster.py Project: yukw777/flambe
    def launch_ray_cluster(self) -> None:
        """Create a ray cluster.

        The main node is going to be located in the orchestrator machine
        and all other nodes in the factories.

        The main node is executed with --num-cpus=0 flag so that
        it doesn't do any work and all work is done by the factories.

        """
        for ins in self._get_all_hosts():
            if ins.is_node_running():
                raise man_errors.ClusterError(
                    f"Node {ins.host} is running in an existing cluster. Aborting."
                )

        port = const.RAY_REDIS_PORT

        # The orchestrator needs to exist at this point
        if not self.orchestrator:
            raise man_errors.ClusterError(
                "Orchestrator instance was not loaded.")

        self.orchestrator.launch_node(port)

        redis_address = f"{self.orchestrator.private_host}:{port}"

        with ThreadPoolExecutor(max_workers=self.factories_num) as executor:
            futures = {}

            for ins in self.factories:
                futures[executor.submit(ins.launch_node, redis_address)] = ins

            for f in futures.keys():
                try:
                    f.result()
                except errors.RemoteCommandError:
                    raise
                except Exception as exc:
                    logger.error('Generated an exception: {}'.format(exc))
                    raise
                else:
                    logger.debug('{} Ray worker ready'.format(futures[f].host))

        logger.info(cl.GR("Ray cluster launched"))
Example #8
File: cluster.py Project: yukw777/flambe
    def distribute_keys(self) -> None:
        """Create a new key pair and distributes it to all hosts.

        This ensures that the hosts can communicate securely.
        The name of the key is the cluster's name.

        """
        if self.cluster_has_key():
            logger.info(cl.GR("Cluster has already configured key pair"))
            return

        # generate private/public key pair
        key = rsa.generate_private_key(backend=default_backend(),
                                       public_exponent=65537,
                                       key_size=2048)

        # get public key in OpenSSH format
        public_key = key.public_key().public_bytes(
            serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH)

        # get private key in PEM container format
        pem = key.private_bytes(
            encoding=serialization.Encoding.PEM,
            format=serialization.PrivateFormat.TraditionalOpenSSL,
            encryption_algorithm=serialization.NoEncryption())

        # decode to printable strings
        private_key_str = pem.decode('utf-8')
        public_key_str = public_key.decode('utf-8')
        logger.debug("New key pair generated")

        def m(ins):
            ins._run_cmd(f"rm -rf {ins.get_home_path()}/{const.PUBLIC_KEY}")
            ins._run_cmd(f"rm -rf {ins.get_home_path()}/{const.PRIVATE_KEY}")

            ret = ins._run_cmd(
                f"echo '{public_key_str}' >> {ins.get_home_path()}/.ssh/authorized_keys",
                retries=3)
            if not ret.success:
                raise man_errors.ClusterError(
                    "Could not send key to authorized_keys")

            with tempfile.NamedTemporaryFile("w") as t:
                t.write(private_key_str)
                t.flush()
                ins.send_rsync(t.name,
                               f"{ins.get_home_path()}/{const.PRIVATE_KEY}")
                ins._run_cmd(
                    f"chmod 600 {ins.get_home_path()}/{const.PRIVATE_KEY}")

            with tempfile.NamedTemporaryFile("w") as t:
                t.write(public_key_str)
                t.flush()
                ins.send_rsync(t.name,
                               f"{ins.get_home_path()}/{const.PUBLIC_KEY}")
                logger.debug(f"New key pair sent to {ins.host}")

        with ThreadPoolExecutor() as executor:
            futures = {}

            for ins in self._get_all_hosts():
                futures[executor.submit(m, ins)] = ins

            for f in futures.keys():
                try:
                    f.result()
                except errors.RemoteCommandError:
                    raise
                except Exception as exc:
                    logger.error('Generated an exception: {}'.format(exc))
                    raise

        logger.info(cl.GR("Distributed keys"))
Example #9
    def run(self,
            force: bool = False,
            verbose: bool = False,
            debug: bool = False,
            **kwargs):
        """Run an Experiment"""

        logger.info(cl.BL("Launching local experiment"))

        # Check if save_path/name already exists + is not empty
        # + force and resume are False
        if (os.path.exists(self.full_save_path)
                and os.listdir(self.full_save_path) and not self.resume
                and not force):
            raise error.ParsingRunnableError(
                f"Results from an experiment with the same name were located in the save path "
                f"{self.full_save_path}. To override these results, please use '--force'. "
                "To use these results and resume the experiment, set 'resume: True'. "
                "If not, just pick another save_path/name.")

        full_save_path = self.full_save_path

        if not self.env:
            wording.print_useful_local_info(full_save_path)

        # If running remotely, all folders were already created
        # in the 'setup' method.
        if not self.env:
            if os.path.exists(full_save_path) and force:
                shutil.rmtree(full_save_path)  # This deletes the folder as well
                logger.info(
                    cl.RE(f"Removed previously existing results from {full_save_path} "
                          "as --force was specified"))

            if not os.path.exists(full_save_path):
                os.makedirs(full_save_path)
                logger.debug(f"{full_save_path} created to store output")

        self._dump_experiment_file()

        if any(
                map(lambda x: isinstance(x, ClusterResource),
                    self.resources.values())):
            raise ValueError(
                "Local experiments don't support resources with '!cluster' tags. "
                "The '!cluster' tag is used for resources that need to be handled "
                "in the cluster when running remote experiments.")

        if not self.env:
            self.tmp_resources_dir = tempfile.TemporaryDirectory()
            resources_folder = self.tmp_resources_dir.name
        else:
            resources_folder = f"{self.full_save_path}/_resources"

        resources = self.process_resources(self.resources, resources_folder)

        # rsync downloaded resources
        if self.env:
            run_utils.rsync_hosts(self.env.orchestrator_ip,
                                  self.env.factories_ips,
                                  self.env.user,
                                  self.full_save_path,
                                  self.env.key,
                                  exclude=["state.pkl"])

        # Check that links are in order (i.e. topologically in the pipeline)
        utils.check_links(self.pipeline, resources)

        # Check that search algorithms and schedulers are only
        # given for computable blocks
        utils.check_search(self.pipeline, self.search, self.schedulers)

        # Initialize ray cluster
        kwargs = {"logging_level": logging.ERROR, "include_webui": False}
        if debug:
            kwargs['local_mode'] = True

        if self.env:
            ray.init(redis_address=
                     f"{self.env.orchestrator_ip}:{const.RAY_REDIS_PORT}",
                     **kwargs)
        else:
            ray.init(**kwargs)
            logger.debug(f"Ray cluster up")

        # Initialize map from block to list of checkpoints
        # This is used when resolving links over other computable blocks
        # TODO: in python 3.7 we can replace these with dict() or {}
        checkpoints: OrderedDict = OrderedDict()
        schemas: OrderedDict = OrderedDict()
        success: OrderedDict = OrderedDict()

        # By default use all CPUs if no GPU is present
        devices = self.devices if self.devices else None
        if devices is None and utils.local_has_gpu():
            devices = {"cpu": 4, "gpu": 1}

        to_resume = None
        if isinstance(self.resume, str):
            index = list(self.pipeline.keys()).index(self.resume)
            to_resume = list(self.pipeline.keys())[:index + 1]
        elif isinstance(self.resume, Sequence):
            to_resume = list(self.resume)

        # Make experiment_tag easier to extract
        def trial_name_creator(trial):
            identifier = ""
            if "env" in trial.config:
                env = trial.config["env"]
                if isinstance(env, type):
                    env = env.__name__
                identifier += f"{env}"
            if trial.experiment_tag:
                hyper_params = {}
                if "_" in trial.experiment_tag:
                    num, tunable_params = trial.experiment_tag.split("_", 1)
                    identifier += tunable_params
                    param_list = [
                        p.split("=") for p in tunable_params.split(",")
                    ]
                    hyper_params = {p[0]: p[1] for p in param_list}
                else:
                    identifier += trial.experiment_tag
                trial.config['hyper_params'] = hyper_params
            return identifier.replace("/", "_")

        trial_name_creator = ray.tune.function(trial_name_creator)

        # Compute dependency DAG
        dependency_dag = {}
        schemas_dag: OrderedDict = OrderedDict()
        for block_id, schema_block in self.pipeline.items():
            schemas_dag[block_id] = schema_block
            relevant_ids = utils.extract_needed_blocks(schemas_dag, block_id,
                                                       resources)
            dependencies = deepcopy(relevant_ids)
            dependencies.discard(block_id)

            dependency_dag[block_id] = list(dependencies)

        if self.env:
            self.progress_state = ProgressState(self.name, full_save_path,
                                                dependency_dag, self.content,
                                                len(self.env.factories_ips))
        else:
            self.progress_state = ProgressState(self.name, full_save_path,
                                                dependency_dag, self.content)

        for block_id, schema_block in tqdm(self.pipeline.items()):
            schema_block.add_extensions_metadata(self.extensions)
            logger.debug(f"Starting {block_id}")

            # Add the block to the configuration so far
            schemas[block_id] = schema_block
            success[block_id] = True

            self.progress_state.checkpoint_start(block_id)
            relevant_ids = utils.extract_needed_blocks(schemas, block_id,
                                                       resources)
            relevant_schemas = {
                k: v
                for k, v in deepcopy(schemas).items() if k in relevant_ids
            }

            # Set resume
            resume = False if to_resume is None else (block_id in to_resume)

            # If computable, convert to tune.Trainable
            # Each Component block is an Experiment in ray.tune
            if not isinstance(schema_block, Schema):
                raise ValueError('schema block not of correct type Schema')
            if issubclass(schema_block.component_subclass, Component):

                # Returns a list of non-nested configurations
                divided_schemas = list(
                    utils.divide_nested_grid_search_options(relevant_schemas))
                divided_dict = [utils.extract_dict(x) for x in divided_schemas]
                # Convert options and links
                divided_dict_tune = [
                    utils.convert_tune(x) for x in divided_dict
                ]
                # Execute block
                tune_experiments = []
                for param_dict, schemas_dict in zip(divided_dict_tune,
                                                    divided_schemas):
                    config = {
                        'name': block_id,
                        'merge_plot': self.merge_plot,
                        'params': param_dict,
                        'schemas': Schema.serialize(schemas_dict),
                        'checkpoints': checkpoints,
                        'to_run': block_id,
                        'global_vars': resources,
                        'verbose': verbose,
                        'custom_modules': list(self.extensions.keys()),
                        'debug': debug
                    }
                    # Filter out the tensorboard logger as we handle
                    # general and tensorboard-specific logging ourselves
                    tune_loggers = list(
                        filter(
                            lambda l: l != tf2_compat_logger and  # noqa: E741
                            not issubclass(l, TFLogger),
                            DEFAULT_LOGGERS))
                    tune_experiment = ray.tune.Experiment(
                        name=block_id,
                        run=TuneAdapter,
                        trial_name_creator=trial_name_creator,
                        config=deepcopy(config),
                        local_dir=full_save_path,
                        checkpoint_freq=1,
                        checkpoint_at_end=True,
                        max_failures=self.max_failures,
                        resources_per_trial=devices,
                        loggers=tune_loggers)
                    logger.debug(f"Created tune.Experiment for {param_dict}")
                    tune_experiments.append(tune_experiment)

                trials = ray.tune.run_experiments(
                    tune_experiments,
                    search_alg=self.search.get(block_id, None),
                    scheduler=self.schedulers.get(block_id, None),
                    queue_trials=True,
                    verbose=False,
                    resume=resume,
                    raise_on_failed_trial=False)
                logger.debug(
                    f"Finish running all tune.Experiments for {block_id}")

                any_error = False
                for t in trials:
                    if t.status == t.ERROR:
                        logger.error(cl.RE(
                            f"Variant {t} of '{block_id}' ended with ERROR status."))
                        success[block_id] = False
                        any_error = True
                if any_error and self.stop_on_failure:
                    self.teardown()
                    self.progress_state.checkpoint_end(block_id,
                                                       success[block_id])
                    raise error.UnsuccessfulRunnableError(
                        f"Stopping experiment at block '{block_id}' "
                        "because there was an error and stop_on_failure == True."
                    )

                # Save checkpoint location
                # It should point from:
                # block_id -> hash(variant) -> checkpoint
                hashes = []
                for t in trials:
                    schema_with_params: Dict = OrderedDict()
                    for b in schemas_dict:
                        schema_copy = deepcopy(schemas_dict[b])
                        utils.update_schema_with_params(
                            schema_copy, t.config['params'][b])
                        schema_with_params[b] = schema_copy
                    hashes.append(repr(schema_with_params))

                paths = [t._checkpoint.value for t in trials]

                # Mask out error trials
                mask = [True] * len(trials)
                for i, trial in enumerate(trials):
                    if trial.status == ray.tune.trial.Trial.ERROR:
                        mask[i] = False

                # Mask out on reduce
                reduce_k = self.reduce.get(block_id, None)
                if reduce_k is not None and int(reduce_k) > 0:
                    # Get best
                    best_trials = utils.get_best_trials(trials,
                                                        topk=int(reduce_k))
                    best_trial_ids = set([t.trial_id for t in best_trials])
                    # Mask out
                    for i, trial in enumerate(trials):
                        if trial.trial_id not in best_trial_ids:
                            mask[i] = False

                trial_checkpoints = {
                    t_hash: path
                    for t_hash, path in zip(hashes, paths)
                }
                trial_mask = {
                    t_hash: mask_value
                    for t_hash, mask_value in zip(hashes, mask)
                }
                checkpoints[block_id] = {
                    'paths': trial_checkpoints,
                    'mask': trial_mask
                }

                # Rsync workers to main machine and back to all workers
                # TODO: specify callbacks; if not remote this will not work
                if self.env:
                    run_utils.rsync_hosts(self.env.orchestrator_ip,
                                          self.env.factories_ips,
                                          self.env.user,
                                          self.full_save_path,
                                          self.env.key,
                                          exclude=["state.pkl"])

            self.progress_state.checkpoint_end(block_id, success[block_id])
            logger.debug(f"Done running {block_id}")

        self.teardown()

        if all(success.values()):
            logger.info(cl.GR("Experiment ended successfully"))
        else:
            raise error.UnsuccessfulRunnableError(
                "Not all trials were successful. Check the logs for more information"
            )
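The trial_name_creator defined above parses ray.tune's experiment tag, which has the form "<num>_<param>=<value>,...". A standalone sketch of that parsing, with an illustrative tag value:

experiment_tag = "3_lr=0.001,batch_size=32"

num, tunable_params = experiment_tag.split("_", 1)
param_list = [p.split("=") for p in tunable_params.split(",")]
hyper_params = {p[0]: p[1] for p in param_list}

print(hyper_params)  # {'lr': '0.001', 'batch_size': '32'}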
Example #10
def main(args: argparse.Namespace) -> None:
    """Execute command based on given config"""
    if is_dev_mode():
        print(cl.RA(ASCII_LOGO_DEV))
        print(cl.BL(f"Location: {get_flambe_repo_location()}\n"))
    else:
        print(cl.RA(ASCII_LOGO))
        print(cl.BL(f"VERSION: {flambe.__version__}\n"))

    # Pass original module for ray / pickle
    make_component(torch.nn.Module, TORCH_TAG_PREFIX, only_module='torch.nn')
    # torch.optim.Optimizer exists, ignore mypy
    make_component(
        torch.optim.Optimizer,
        TORCH_TAG_PREFIX,  # type: ignore
        only_module='torch.optim')
    make_component(torch.optim.lr_scheduler._LRScheduler,
                   TORCH_TAG_PREFIX,
                   only_module='torch.optim.lr_scheduler')
    make_component(ray.tune.schedulers.TrialScheduler, TUNE_TAG_PREFIX)
    make_component(ray.tune.suggest.SearchAlgorithm, TUNE_TAG_PREFIX)

    # TODO: check first if there is a cluster, since in that case there
    # is no need to install extensions
    check_system_reqs()
    with SafeExecutionContext(args.config) as ex:
        if args.cluster is not None:
            with SafeExecutionContext(args.cluster) as ex_cluster:
                cluster, _ = ex_cluster.preprocess(
                    secrets=args.secrets, install_ext=args.install_extensions)
                runnable, extensions = ex.preprocess(import_ext=False,
                                                     secrets=args.secrets)
                cluster.run(force=args.force)
                if isinstance(runnable, ClusterRunnable):
                    cluster = cast(Cluster, cluster)

                    # This is independent of the type of ClusterRunnable
                    destiny = os.path.join(cluster.get_orch_home_path(),
                                           "extensions")

                    # Before sending the extensions, they need to be
                    # downloaded (locally).
                    t = os.path.join(FLAMBE_GLOBAL_FOLDER, "extensions")
                    extensions = download_extensions(extensions, t)

                    # At this point, all remote extensions
                    # (except pypi extensions)
                    # have local paths.
                    new_extensions = cluster.send_local_content(extensions,
                                                                destiny,
                                                                all_hosts=True)

                    new_secrets = cluster.send_secrets()

                    # Installing the extensions is crucial as flambe
                    # will execute without the '-i' flag and therefore
                    # will assume that the extensions are installed
                    # in the orchestrator.
                    cluster.install_extensions_in_orchestrator(new_extensions)
                    logger.info(cl.GR("Extensions installed in Orchestrator"))

                    runnable.setup_inject_env(cluster=cluster,
                                              extensions=new_extensions,
                                              force=args.force)
                    cluster.execute(runnable, new_extensions, new_secrets,
                                    args.force)
                else:
                    raise ValueError(
                        "Only ClusterRunnables can be executed in a cluster.")
        else:
            runnable, _ = ex.preprocess(secrets=args.secrets,
                                        install_ext=args.install_extensions)
            runnable.run(force=args.force, verbose=args.verbose)
Example #11
        '--cluster',
        type=str,
        default=None,
        help='Specify the cluster that will run the experiment. This option ' +
        'works if the main config is an Experiment')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='Override existing runnables. Be careful ' +
        'when using this flag as it could have undesired effects.')
    parser.add_argument('--secrets',
                        type=str,
                        default=os.path.join(FLAMBE_GLOBAL_FOLDER,
                                             "secrets.ini"))
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='Verbose console output')
    args = parser.parse_args()

    setup_global_logging(logging.INFO if not args.verbose else logging.DEBUG)
    logger = logging.getLogger(__name__)

    try:
        main(args)
        logger.info(cl.GR("------------------- Done -------------------"))
    except KeyboardInterrupt:
        logger.info(cl.RE("---- Exiting early (Keyboard Interrupt) ----"))