Example #1
    def teardown(self):
        # Disconnect process from ray cluster
        ray.shutdown()

        # Shut down the ray cluster.
        if self.env:
            ret = utils.shutdown_ray_node()
            logger.debug(f"Node shutdown {'successful' if ret == 0 else 'failed'} in Orchestrator")
            for f in self.env.factories_ips:
                ret = utils.shutdown_remote_ray_node(f, "ubuntu", self.env.key)
                logger.debug(f"Node shutdown {'successful' if ret == 0 else 'failed'} in {f}")

        self.progress_state.finish()

        if hasattr(self, 'tmp_resources_dir'):
            self.tmp_resources_dir.cleanup()
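
For reference, a minimal sketch of what the shutdown helpers used above might look like. The names utils.shutdown_ray_node and utils.shutdown_remote_ray_node come from the example; the implementation below is an assumption that they shell out to "ray stop", locally and over SSH for each remote factory, and return the command's exit code (which the example compares against 0).

import subprocess


def shutdown_ray_node() -> int:
    # Assumed implementation: stop the local ray runtime via the CLI
    return subprocess.call(["ray", "stop"])


def shutdown_remote_ray_node(host: str, user: str, key: str) -> int:
    # Assumed implementation: run "ray stop" on a remote factory over SSH,
    # authenticating with the given key file
    return subprocess.call(["ssh", "-i", key, f"{user}@{host}", "ray stop"])
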
Example #2
    def run(self, force: bool = False, verbose: bool = False, **kwargs):
        """Run an Experiment"""

        logger.info(cl.BL("Launching local experiment"))

        # Check if save_path/name already exists and is not empty,
        # and both force and resume are False
        if (
            os.path.exists(self.full_save_path) and
            os.listdir(self.full_save_path) and
            not self.resume and not force
        ):
            raise error.ParsingRunnableError(
                "Results from an experiment with the same name were located in the save path "
                f"{self.full_save_path}. To override these results, please use '--force'. "
                "To use these results and resume the experiment, set 'resume: True'. "
                "If not, just pick another save_path/name."
            )

        full_save_path = self.full_save_path
        if not self.env:
            wording.print_useful_local_info(full_save_path)

        # If running remotely, all folders were already created
        # in the 'setup' method.
        if not self.env:
            if os.path.exists(full_save_path) and force:
                shutil.rmtree(full_save_path)  # This deletes the folder itself as well
                logger.info(
                    cl.RE(f"Removed previously existing results from {full_save_path} " +
                          "as --force was specified"))

            if not os.path.exists(full_save_path):
                os.makedirs(full_save_path)
                logger.debug(f"{full_save_path} created to store output")

        local_vars = self.resources.get('local', {}) or {}
        local_vars = utils.rel_to_abs_paths(local_vars)
        remote_vars = self.resources.get('remote', {}) or {}

        global_vars = dict(local_vars, **remote_vars)

        # Check that links are in order (i.e. topologically ordered in the pipeline)
        utils.check_links(self.pipeline, global_vars)

        # Check that search algorithms and schedulers
        # are only given to computable blocks
        utils.check_search(self.pipeline, self.search, self.schedulers)

        # Initialize ray cluster
        kwargs = {"logging_level": logging.ERROR, "include_webui": False}
        if self.debug:
            logger.info(
                cl.BL("Debugger activated"))
            logger.info(
                cl.YE("Pipeline will begin executing all variants and all " +
                      "computables serially. " +
                      "Press 's' to step into the " +
                      "run method of the Component once the ipdb console " +
                      "shows up"))
            kwargs['local_mode'] = True

        if self.env:
            ray.init(redis_address=f"{self.env.orchestrator_ip}:{const.RAY_REDIS_PORT}", **kwargs)
        else:
            ray.init(**kwargs)
            logger.debug(f"Ray cluster up")

        # Initialize map from block to list of checkpoints
        # This is used when resolving links over other computable blocks
        # TODO: in python 3.7 we can replace these with dict() or {}
        checkpoints: OrderedDict = OrderedDict()
        schemas: OrderedDict = OrderedDict()
        success: OrderedDict = OrderedDict()

        # By default use all CPUs; if a GPU is present, request 4 CPUs and 1 GPU per trial
        devices = self.devices if self.devices else None
        if devices is None and utils.local_has_gpu():
            devices = {"cpu": 4, "gpu": 1}

        to_resume = None
        if isinstance(self.resume, str):
            index = list(self.pipeline.keys()).index(self.resume)
            to_resume = list(self.pipeline.keys())[:index + 1]
        elif isinstance(self.resume, Sequence):
            to_resume = list(self.resume)
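        # Illustrative: with pipeline keys ['dataset', 'model', 'eval'] and
        # resume='model', to_resume becomes ['dataset', 'model'] (block names
        # are hypothetical).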

        # Make experiment_tag easier to extract
        def trial_name_creator(trial):
            identifier = ""
            if "env" in trial.config:
                env = trial.config["env"]
                if isinstance(env, type):
                    env = env.__name__
                identifier += f"{env}"
            if trial.experiment_tag:
                hyper_params = {}
                if "_" in trial.experiment_tag:
                    num, tunable_params = trial.experiment_tag.split("_", 1)
                    identifier += tunable_params
                    param_list = [p.split("=") for p in tunable_params.split(",")]
                    hyper_params = {p[0]: p[1] for p in param_list}
                else:
                    identifier += trial.experiment_tag
                trial.config['hyper_params'] = hyper_params
            return identifier.replace("/", "_")

        trial_name_creator = ray.tune.function(trial_name_creator)
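        # Illustrative: an experiment_tag like "0_lr=0.001,momentum=0.9" yields
        # the trial name "lr=0.001,momentum=0.9" and sets
        # trial.config['hyper_params'] to {'lr': '0.001', 'momentum': '0.9'};
        # any "/" in the name is replaced with "_" to keep it filesystem-safe.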

        # Compute the dependency DAG
        dependency_dag = {}
        schemas_dag: OrderedDict = OrderedDict()
        for block_id, schema_block in self.pipeline.items():
            schemas_dag[block_id] = schema_block
            relevant_ids = utils.extract_needed_blocks(schemas_dag, block_id, global_vars)
            dependencies = deepcopy(relevant_ids)
            dependencies.discard(block_id)

            dependency_dag[block_id] = list(dependencies)
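        # Illustrative: for a pipeline dataset -> model -> eval where each block
        # links to the ones before it, dependency_dag ends up as
        # {'dataset': [], 'model': ['dataset'], 'eval': ['dataset', 'model']}
        # (block names are hypothetical).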

        if self.env:
            self.progress_state = ProgressState(
                self.name, full_save_path, dependency_dag, len(self.env.factories_ips))
        else:
            self.progress_state = ProgressState(self.name, full_save_path, dependency_dag)

        for block_id, schema_block in tqdm(self.pipeline.items()):
            schema_block.add_extensions_metadata(self.extensions)
            logger.debug(f"Starting {block_id}")

            # Add the block to the configuration so far
            schemas[block_id] = schema_block
            success[block_id] = True

            self.progress_state.checkpoint_start(block_id)
            relevant_ids = utils.extract_needed_blocks(schemas, block_id, global_vars)
            relevant_schemas = {k: v for k, v in deepcopy(schemas).items() if k in relevant_ids}

            # Set resume
            resume = False if to_resume is None else (block_id in to_resume)

            # If computable, convert to tune.Trainable
            # Each Component block is an Experiment in ray.tune
            if not isinstance(schema_block, Schema):
                raise ValueError("Schema block is not of the expected type Schema")
            if issubclass(schema_block.component_subclass, Component):

                # Returns a list of non-nested configurations
                divided_schemas = list(utils.divide_nested_grid_search_options(relevant_schemas))
                divided_dict = [utils.extract_dict(x) for x in divided_schemas]
                # Convert options and links
                divided_dict_tune = [utils.convert_tune(x) for x in divided_dict]
                # Execute block
                tune_experiments = []
                for param_dict, schemas_dict in zip(divided_dict_tune, divided_schemas):
                    config = {'name': block_id,
                              'merge_plot': self.merge_plot,
                              'params': param_dict,
                              'schemas': Schema.serialize(schemas_dict),
                              'checkpoints': checkpoints,
                              'to_run': block_id,
                              'global_vars': global_vars,
                              'verbose': verbose,
                              'custom_modules': list(self.extensions.keys()),
                              'debug': self.debug}
                    # Filter out the tensorboard logger as we handle
                    # general and tensorboard-specific logging ourselves
                    tune_loggers = list(filter(lambda l: not issubclass(l, TFLogger),
                                               DEFAULT_LOGGERS))
                    tune_experiment = ray.tune.Experiment(name=block_id,
                                                          run=TuneAdapter,
                                                          trial_name_creator=trial_name_creator,
                                                          config=deepcopy(config),
                                                          local_dir=full_save_path,
                                                          checkpoint_freq=1,
                                                          checkpoint_at_end=True,
                                                          max_failures=self.max_failures,
                                                          resources_per_trial=devices,
                                                          loggers=tune_loggers)
                    logger.debug(f"Created tune.Experiment for {param_dict}")
                    tune_experiments.append(tune_experiment)

                trials = ray.tune.run_experiments(tune_experiments,
                                                  search_alg=self.search.get(block_id, None),
                                                  scheduler=self.schedulers.get(block_id, None),
                                                  queue_trials=True,
                                                  verbose=False,
                                                  resume=resume,
                                                  raise_on_failed_trial=False)
                logger.debug(f"Finish running all tune.Experiments for {block_id}")

                for t in trials:
                    if t.status == t.ERROR:
                        logger.error(f"{t} ended with ERROR status.")
                        success[block_id] = False

                # Save checkpoint location
                # It should point from:
                # block_id -> hash(variant) -> checkpoint
                hashes = []
                for t in trials:
                    schema_with_params: Dict = OrderedDict()
                    for b in schemas_dict:
                        schema_copy = deepcopy(schemas_dict[b])
                        utils.update_schema_with_params(schema_copy, t.config['params'][b])
                        schema_with_params[b] = schema_copy
                    hashes.append(repr(schema_with_params))

                paths = [t._checkpoint.value for t in trials]

                # Mask out error trials
                mask = [True] * len(trials)
                for i, trial in enumerate(trials):
                    if trial.status == ray.tune.trial.Trial.ERROR:
                        mask[i] = False

                # Mask out on reduce
                reduce_k = self.reduce.get(block_id, None)
                if reduce_k is not None and int(reduce_k) > 0:
                    # Get best
                    best_trials = utils.get_best_trials(trials, topk=int(reduce_k))
                    best_trial_ids = set([t.trial_id for t in best_trials])
                    # Mask out
                    for i, trial in enumerate(trials):
                        if trial.trial_id not in best_trial_ids:
                            mask[i] = False

                trial_checkpoints = {t_hash: path for t_hash, path in zip(hashes, paths)}
                trial_mask = {t_hash: mask_value for t_hash, mask_value in zip(hashes, mask)}
                checkpoints[block_id] = {'paths': trial_checkpoints, 'mask': trial_mask}

                # Rsync workers to main machine and back to all workers
                # TODO: specify callbacks. If not remote, this will not work
                if self.env:
                    run_utils.rsync_hosts(self.env.orchestrator_ip,
                                          self.env.factories_ips,
                                          self.env.user,
                                          self.full_save_path,
                                          self.env.key,
                                          exclude=["state.pkl"])

            self.progress_state.checkpoint_end(block_id, checkpoints, success[block_id])
            logger.debug(f"Done running {block_id}")

        # Disconnect process from ray cluster
        ray.shutdown()

        # Shut down the ray cluster.
        if self.env:
            ret = utils.shutdown_ray_node()
            logger.debug(f"Node shutdown {'successful' if ret == 0 else 'failed'} in Orchestrator")
            for f in self.env.factories_ips:
                ret = utils.shutdown_remote_ray_node(f, "ubuntu", self.env.key)
                logger.debug(f"Node shutdown {'successful' if ret == 0 else 'failed'} in {f}")

        self.progress_state.finish()

        if all(success.values()):
            logger.info(cl.GR("Experiment ended successfully"))
        else:
            raise error.UnsuccessfulRunnableError(
                "Not all trials were successful. Check the logs for more information"
            )
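
For reference, a small sketch of the checkpoint bookkeeping built in the loop above: each block id maps a variant hash to its checkpoint path together with a boolean mask, and masked-out variants (errored trials or those dropped by 'reduce') should be ignored when resolving links downstream. Only the dictionary shape comes from the example; the resolve_checkpoints helper and the literal hashes and paths below are hypothetical.

from typing import Dict

# Shape produced per block: block_id -> {'paths': {hash: path}, 'mask': {hash: bool}}
checkpoints = {
    "train": {
        "paths": {"<variant-hash-1>": "/save/train/variant1",
                  "<variant-hash-2>": "/save/train/variant2"},
        "mask": {"<variant-hash-1>": True,
                 "<variant-hash-2>": False},  # errored or reduced out
    }
}


def resolve_checkpoints(block_id: str, checkpoints: Dict) -> Dict[str, str]:
    # Keep only the checkpoint paths whose variants were not masked out
    entry = checkpoints[block_id]
    return {h: p for h, p in entry["paths"].items() if entry["mask"].get(h, False)}


print(resolve_checkpoints("train", checkpoints))  # -> {'<variant-hash-1>': '/save/train/variant1'}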