def teardown(self):
    # Disconnect process from ray cluster
    ray.shutdown()

    # Shutdown ray cluster.
    if self.env:
        ret = utils.shutdown_ray_node()
        logger.debug(f"Node shutdown {'successful' if ret == 0 else 'failed'} in Orchestrator")
        for f in self.env.factories_ips:
            ret = utils.shutdown_remote_ray_node(f, "ubuntu", self.env.key)
            logger.debug(f"Node shutdown {'successful' if ret == 0 else 'failed'} in {f}")

    self.progress_state.finish()

    if hasattr(self, 'tmp_resources_dir'):
        self.tmp_resources_dir.cleanup()
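# Note: the trailing cleanup() call above implies `tmp_resources_dir` is a
# tempfile.TemporaryDirectory-style handle created earlier (e.g. during setup).
# A minimal sketch of that pattern; everything except the attribute name and the
# cleanup() call is an assumption, not taken from this file:
#
#     import tempfile
#     self.tmp_resources_dir = tempfile.TemporaryDirectory()   # during setup (assumed)
#     # ... resources staged under self.tmp_resources_dir.name ...
#     self.tmp_resources_dir.cleanup()                         # during teardown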
def run(self, force: bool = False, verbose: bool = False, **kwargs):
    """Run an Experiment"""
    logger.info(cl.BL("Launching local experiment"))

    # Check if save_path/name already exists and is not empty,
    # and force and resume are False
    if (
        os.path.exists(self.full_save_path)
        and os.listdir(self.full_save_path)
        and not self.resume
        and not force
    ):
        raise error.ParsingRunnableError(
            "Results from an experiment with the same name were located in the save path "
            + f"{self.full_save_path}. To override these results, please use '--force'. "
            + "To use these results and resume the experiment, set 'resume: True'. "
            + "Otherwise, just pick another save_path/name."
        )

    full_save_path = self.full_save_path

    if not self.env:
        wording.print_useful_local_info(full_save_path)

    # If running remotely then all folders were already created
    # in the 'setup' method.
    if not self.env:
        if os.path.exists(full_save_path) and force:
            shutil.rmtree(full_save_path)  # This deletes the folder itself too
            logger.info(cl.RE(f"Removed previous results from {full_save_path} "
                              + "as --force was specified"))

        if not os.path.exists(full_save_path):
            os.makedirs(full_save_path)
            logger.debug(f"{full_save_path} created to store output")

    local_vars = self.resources.get('local', {}) or {}
    local_vars = utils.rel_to_abs_paths(local_vars)
    remote_vars = self.resources.get('remote', {}) or {}
    global_vars = dict(local_vars, **remote_vars)

    # Check that links are in order (i.e. topologically ordered in the pipeline)
    utils.check_links(self.pipeline, global_vars)

    # Check that search algorithms and schedulers are only
    # given for computable blocks
    utils.check_search(self.pipeline, self.search, self.schedulers)

    # Initialize ray cluster
    kwargs = {"logging_level": logging.ERROR, "include_webui": False}
    if self.debug:
        logger.info(cl.BL("Debugger activated"))
        logger.info(cl.YE("Pipeline will begin executing all variants and all "
                          + "computables serially. "
                          + "Press 's' to step into the "
                          + "run method of the Component once the ipdb console "
                          + "shows up"))
        kwargs['local_mode'] = True

    if self.env:
        ray.init(redis_address=f"{self.env.orchestrator_ip}:{const.RAY_REDIS_PORT}", **kwargs)
    else:
        ray.init(**kwargs)
    logger.debug("Ray cluster up")

    # Initialize map from block to list of checkpoints.
    # This is used when resolving links over other computable blocks.
    # TODO: in python 3.7 we can replace these with dict() or {}
    checkpoints: OrderedDict = OrderedDict()
    schemas: OrderedDict = OrderedDict()
    success: OrderedDict = OrderedDict()

    # By default use all CPUs if no GPU is present
    devices = self.devices if self.devices else None
    if devices is None and utils.local_has_gpu():
        devices = {"cpu": 4, "gpu": 1}

    to_resume = None
    if isinstance(self.resume, str):
        index = list(self.pipeline.keys()).index(self.resume)
        to_resume = list(self.pipeline.keys())[:index + 1]
    elif isinstance(self.resume, Sequence):
        to_resume = list(self.resume)

    # Make experiment_tag easier to extract
    def trial_name_creator(trial):
        identifier = ""
        if "env" in trial.config:
            env = trial.config["env"]
            if isinstance(env, type):
                env = env.__name__
            identifier += f"{env}"
        if trial.experiment_tag:
            hyper_params = {}
            if "_" in trial.experiment_tag:
                num, tunable_params = trial.experiment_tag.split("_", 1)
                identifier += tunable_params
                param_list = [p.split("=") for p in tunable_params.split(",")]
                hyper_params = {p[0]: p[1] for p in param_list}
            else:
                identifier += trial.experiment_tag
            trial.config['hyper_params'] = hyper_params
        return identifier.replace("/", "_")

    trial_name_creator = ray.tune.function(trial_name_creator)

    # Compute dependencies DAG
    dependency_dag = {}
    schemas_dag: OrderedDict = OrderedDict()
    for block_id, schema_block in self.pipeline.items():
        schemas_dag[block_id] = schema_block
        relevant_ids = utils.extract_needed_blocks(schemas_dag, block_id, global_vars)
        dependencies = deepcopy(relevant_ids)
        dependencies.discard(block_id)
        dependency_dag[block_id] = list(dependencies)

    if self.env:
        self.progress_state = ProgressState(
            self.name, full_save_path, dependency_dag, len(self.env.factories_ips))
    else:
        self.progress_state = ProgressState(self.name, full_save_path, dependency_dag)

    for block_id, schema_block in tqdm(self.pipeline.items()):
        schema_block.add_extensions_metadata(self.extensions)
        logger.debug(f"Starting {block_id}")

        # Add the block to the configuration so far
        schemas[block_id] = schema_block
        success[block_id] = True

        self.progress_state.checkpoint_start(block_id)
        relevant_ids = utils.extract_needed_blocks(schemas, block_id, global_vars)
        relevant_schemas = {k: v for k, v in deepcopy(schemas).items() if k in relevant_ids}

        # Set resume
        resume = False if to_resume is None else (block_id in to_resume)

        # If computable, convert to tune.Trainable.
        # Each Component block is an Experiment in ray.tune
        if not isinstance(schema_block, Schema):
            raise ValueError('schema block not of correct type Schema')
        if issubclass(schema_block.component_subclass, Component):

            # Returns a list of non-nested configurations
            divided_schemas = list(utils.divide_nested_grid_search_options(relevant_schemas))
            divided_dict = [utils.extract_dict(x) for x in divided_schemas]
            # Convert options and links
            divided_dict_tune = [utils.convert_tune(x) for x in divided_dict]

            # Execute block
            tune_experiments = []
            for param_dict, schemas_dict in zip(divided_dict_tune, divided_schemas):
                config = {'name': block_id,
                          'merge_plot': self.merge_plot,
                          'params': param_dict,
                          'schemas': Schema.serialize(schemas_dict),
                          'checkpoints': checkpoints,
                          'to_run': block_id,
                          'global_vars': global_vars,
                          'verbose': verbose,
                          'custom_modules': list(self.extensions.keys()),
                          'debug': self.debug}

                # Filter out the tensorboard logger as we handle
                # general and tensorboard-specific logging ourselves
                tune_loggers = list(filter(lambda l: not issubclass(l, TFLogger), DEFAULT_LOGGERS))
                tune_experiment = ray.tune.Experiment(name=block_id,
                                                      run=TuneAdapter,
                                                      trial_name_creator=trial_name_creator,
                                                      config=deepcopy(config),
                                                      local_dir=full_save_path,
                                                      checkpoint_freq=1,
                                                      checkpoint_at_end=True,
                                                      max_failures=self.max_failures,
                                                      resources_per_trial=devices,
                                                      loggers=tune_loggers)
                logger.debug(f"Created tune.Experiment for {param_dict}")
                tune_experiments.append(tune_experiment)

            trials = ray.tune.run_experiments(tune_experiments,
                                              search_alg=self.search.get(block_id, None),
                                              scheduler=self.schedulers.get(block_id, None),
                                              queue_trials=True,
                                              verbose=False,
                                              resume=resume,
                                              raise_on_failed_trial=False)
            logger.debug(f"Finished running all tune.Experiments for {block_id}")

            for t in trials:
                if t.status == t.ERROR:
                    logger.error(f"{t} ended with ERROR status.")
                    success[block_id] = False

            # Save checkpoint locations.
            # The mapping goes from:
            # block_id -> hash(variant) -> checkpoint
            hashes = []
            for t in trials:
                schema_with_params: Dict = OrderedDict()
                for b in schemas_dict:
                    schema_copy = deepcopy(schemas_dict[b])
                    utils.update_schema_with_params(schema_copy, t.config['params'][b])
                    schema_with_params[b] = schema_copy
                hashes.append(repr(schema_with_params))

            paths = [t._checkpoint.value for t in trials]

            # Mask out errored trials
            mask = [True] * len(trials)
            for i, trial in enumerate(trials):
                if trial.status == ray.tune.trial.Trial.ERROR:
                    mask[i] = False

            # Mask out on reduce
            reduce_k = self.reduce.get(block_id, None)
            if reduce_k is not None and int(reduce_k) > 0:
                # Get the best trials
                best_trials = utils.get_best_trials(trials, topk=int(reduce_k))
                best_trial_ids = set([t.trial_id for t in best_trials])
                # Mask out the rest
                for i, trial in enumerate(trials):
                    if trial.trial_id not in best_trial_ids:
                        mask[i] = False

            trial_checkpoints = {t_hash: path for t_hash, path in zip(hashes, paths)}
            trial_mask = {t_hash: mask_value for t_hash, mask_value in zip(hashes, mask)}
            checkpoints[block_id] = {'paths': trial_checkpoints, 'mask': trial_mask}

            # Rsync workers to main machine and back to all workers
            # TODO specify callbacks. If not remote this will not work
            if self.env:
                run_utils.rsync_hosts(self.env.orchestrator_ip,
                                      self.env.factories_ips,
                                      self.env.user,
                                      self.full_save_path,
                                      self.env.key,
                                      exclude=["state.pkl"])

        self.progress_state.checkpoint_end(block_id, checkpoints, success[block_id])
        logger.debug(f"Done running {block_id}")

    # Disconnect process from ray cluster
    ray.shutdown()

    # Shutdown ray cluster.
    if self.env:
        ret = utils.shutdown_ray_node()
        logger.debug(f"Node shutdown {'successful' if ret == 0 else 'failed'} in Orchestrator")
        for f in self.env.factories_ips:
            ret = utils.shutdown_remote_ray_node(f, "ubuntu", self.env.key)
            logger.debug(f"Node shutdown {'successful' if ret == 0 else 'failed'} in {f}")

    self.progress_state.finish()

    if all(success.values()):
        logger.info(cl.GR("Experiment ended successfully"))
    else:
        raise error.UnsuccessfulRunnableError(
            "Not all trials were successful. Check the logs for more information"
        )
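# For reference, the `checkpoints` mapping built by run() and passed to downstream
# blocks has the following shape. The block name, variant hashes and paths below are
# made-up illustrations; only the structure
# (block_id -> {'paths': hash -> checkpoint path, 'mask': hash -> bool})
# comes from the code above:
#
#     checkpoints = {
#         "train": {                                  # one entry per computable block_id
#             "paths": {
#                 "<repr(variant_a_schema)>": "<full_save_path>/train/.../checkpoint_10",
#                 "<repr(variant_b_schema)>": "<full_save_path>/train/.../checkpoint_10",
#             },
#             "mask": {
#                 "<repr(variant_a_schema)>": True,   # kept (successful and within top-k reduce)
#                 "<repr(variant_b_schema)>": False,  # masked out (errored or reduced away)
#             },
#         },
#     }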