def load_all_instances(self) -> None:
    """Launch all instances for the experiment.

    This method launches both the orchestrator and the factories.

    """
    boto_orchestrator, boto_factories = self._existing_cluster()

    with ThreadPoolExecutor() as executor:
        future_orch, future_factories = None, None

        if boto_orchestrator:
            self.orchestrator = self.get_orchestrator(boto_orchestrator.public_ip_address,
                                                      boto_orchestrator.private_ip_address)
            logger.info(cl.BL(
                f"Found existing orchestrator ({boto_orchestrator.instance_type}) "
                f"{self.orchestrator.host}"))
        else:
            future_orch = executor.submit(self._create_orchestrator)

        for f in boto_factories:
            factory = self.get_factory(f.public_ip_address, f.private_ip_address)
            if factory.contains_gpu():
                factory = self.get_gpu_factory(f.public_ip_address, f.private_ip_address)

            self.factories.append(factory)

        if len(self.factories) > 0:
            logger.info(cl.BL(
                f"Found {len(self.factories)} existing factories "
                f"({str([f.host for f in self.factories])})."))

        pending_new_factories = self.factories_num - len(self.factories)
        logger.debug(f"Creating {pending_new_factories} factories")

        if pending_new_factories > 0:
            future_factories = executor.submit(self._create_factories,
                                               number=pending_new_factories)
        elif pending_new_factories < 0:
            logger.info(cl.BL(f"Reusing existing {len(boto_factories)} factories."))

        if future_orch:
            self.orchestrator = future_orch.result()
            logger.info(cl.BL(f"New orchestrator created {self.orchestrator.host}"))

        if future_factories:
            new_factories = future_factories.result()
            self.factories.extend(new_factories)
            logger.info(cl.BL(
                f"{pending_new_factories} factories {self.factories_type} created "
                f"({str([f.host for f in new_factories])})."))

    self.name_hosts()
    self.update_tags()
    self.remove_existing_events()
    self.create_cloudwatch_events()
def launch_report_site(self,
                       progress_file: str,
                       port: int,
                       output_log: str,
                       output_dir: str,
                       tensorboard_port: int) -> None:
    """Launch the report site.

    The report site is a Flask web app.

    Raises
    ------
    RemoteCommandError
        In case the launch process fails

    """
    tensorboard_url = f"http://{self.host}:{tensorboard_port}"
    cmd = (
        f"tmux new-session -d -s 'flambe-site' 'bash -lc \"flambe-site {progress_file} "
        f"--tensorboard_url {tensorboard_url} "
        f"--host 0.0.0.0 --port {port} "
        f"--output-dir {output_dir} "
        f"--output-log {output_log} &>> outputsite.log\"'")
    res = self._run_cmd(cmd)

    # Sometimes the tmux command returns failure (because of a
    # timeout) even though the website is running, so add an
    # extra check for that case.
    if res.success and self.is_report_site_running():
        logger.info(cl.BL(f"Report site at http://{self.host}:{port}"))
    else:
        raise errors.RemoteCommandError(
            f"Report site failed to run. {res.msg}")
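
# A minimal sketch of the detached-tmux pattern used above, for illustration
# only: start a named session that runs a command under 'bash -lc' so bash's
# '&>>' append-redirection is available, and leave it running in the
# background. 'run_in_tmux' and its arguments are hypothetical, not flambe API.
import subprocess


def run_in_tmux(session: str, command: str, log_file: str) -> int:
    # tmux executes the given shell-command via /bin/sh; wrapping it in
    # 'bash -lc' keeps the '&>>' redirection valid. Assumes 'command' and
    # 'log_file' contain no single quotes.
    shell_cmd = f"bash -lc '{command} &>> {log_file}'"
    return subprocess.call(["tmux", "new-session", "-d", "-s", session, shell_cmd])


# Example: run_in_tmux("demo-site", "python -m http.server 8000", "site.log")
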
def save_s3(self, force: bool) -> None:
    """Save an object to s3 using awscli

    Parameters
    ----------
    force: bool
        Whether to use a non-empty bucket folder or not

    """
    url = urlparse(self.destination)

    if url.scheme != 's3' or url.netloc == '':
        raise error.ParsingRunnableError(
            "When uploading to s3, destination should be: "
            "s3://<bucket-name>[/path/to/dir]")

    bucket_name = url.netloc
    s3 = self.get_boto_session().resource('s3')
    bucket = s3.Bucket(bucket_name)
    for content in bucket.objects.all():
        path = url.path[1:]  # Remove first '/'
        if content.key.startswith(path) and not force:
            raise error.ParsingRunnableError(
                f"Destination {self.destination} is not empty. "
                "Use --force to force the usage of this bucket folder or "
                "pick another destination.")

    with tempfile.TemporaryDirectory() as tmpdirname:
        flambe.save(self.compiled_component, tmpdirname, **self.serialization_args)
        try:
            subprocess.check_output(
                f"aws s3 cp --recursive {tmpdirname} {self.destination}".split(),
                stderr=subprocess.STDOUT,
                universal_newlines=True)
        except subprocess.CalledProcessError as exc:
            logger.debug(exc.output)
            raise ValueError("Error uploading artifacts to s3. "
                             "Check logs for more information")
        else:
            logger.info(cl.BL(f"Done uploading to {self.destination}"))
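
# A standalone sketch of the destination validation above: urlparse splits
# "s3://bucket/path/to/dir" into scheme="s3", netloc="bucket" and
# path="/path/to/dir". 'split_s3_destination' is a hypothetical helper,
# not part of flambe.
from typing import Tuple
from urllib.parse import urlparse


def split_s3_destination(destination: str) -> Tuple[str, str]:
    url = urlparse(destination)
    if url.scheme != 's3' or url.netloc == '':
        raise ValueError(
            "destination should look like s3://<bucket-name>[/path/to/dir]")
    # Drop the leading '/' so the path can be used as an S3 key prefix
    return url.netloc, url.path[1:]


# split_s3_destination("s3://my-bucket/experiments/run1")
# -> ("my-bucket", "experiments/run1")
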
def execute(self,
            cluster_runnable,
            extensions: Dict[str, str],
            new_secrets: str,
            force: bool) -> None:
    """Execute a ClusterRunnable in the cluster.

    It will first upload the runnable file + extensions to the
    orchestrator (under $HOME/flambe.yaml) and then it will
    execute it based on the provided secrets

    Parameters
    ----------
    cluster_runnable: ClusterRunnable
        The ClusterRunnable to run in the cluster
    extensions: Dict[str, str]
        The extensions for the ClusterRunnable
    new_secrets: str
        The path (relative to the orchestrator) where the secrets
        are located.
        IMPORTANT: prior to calling this method, the secrets
        should have been uploaded to the orchestrator
    force: bool
        The force parameter provided when running flambe locally

    """
    if not self.orchestrator:
        raise man_errors.ClusterError("Orchestrator instance was not loaded.")

    orch_exp = f"{self.orchestrator.get_home_path()}/flambe.yaml"

    with tempfile.NamedTemporaryFile("w") as t:
        with StringIO() as s:
            yaml.dump_all([extensions, cluster_runnable], s)
            t.write(s.getvalue())
        t.flush()
        self.orchestrator.send_rsync(t.name, orch_exp)
        logger.info(cl.BL("Remote runnable file sent to orchestrator"))
        self.orchestrator.launch_flambe(orch_exp, new_secrets, force)
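
# A minimal sketch of the multi-document YAML dump used above: yaml.dump_all
# writes each object as its own document separated by '---', so the
# extensions mapping and the runnable travel together in a single file.
# The payload below is made up for illustration.
from io import StringIO

import yaml

extensions = {"my_ext": "path/or/pypi-package"}
runnable = {"name": "example-runnable"}

with StringIO() as s:
    yaml.dump_all([extensions, runnable], s)
    print(s.getvalue())
    # my_ext: path/or/pypi-package
    # ---
    # name: example-runnable
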
def run(self, force: bool = False, verbose: bool = False,
        debug: bool = False, **kwargs):
    """Run an Experiment"""

    logger.info(cl.BL("Launching local experiment"))

    # Check if save_path/name already exists and is not empty,
    # and neither force nor resume was specified
    if (os.path.exists(self.full_save_path) and
            os.listdir(self.full_save_path) and
            not self.resume and not force):
        raise error.ParsingRunnableError(
            f"Results from an experiment with the same name were located in the save path "
            f"{self.full_save_path}. To override these results, please use '--force'. "
            "To use these results and resume the experiment, pick 'resume: True'. "
            "If not, just pick another save_path/name.")

    full_save_path = self.full_save_path

    if not self.env:
        wording.print_useful_local_info(full_save_path)

    # If running remotely then all folders were already created
    # in the 'setup' method.
    if not self.env:
        if os.path.exists(full_save_path) and force:
            shutil.rmtree(full_save_path)  # This also deletes the folder itself
            logger.info(cl.RE(
                f"Removed previously existing results from {full_save_path} "
                "as --force was specified"))

        if not os.path.exists(full_save_path):
            os.makedirs(full_save_path)
            logger.debug(f"{full_save_path} created to store output")

        self._dump_experiment_file()

    if any(map(lambda x: isinstance(x, ClusterResource), self.resources.values())):
        raise ValueError(
            "Local experiments don't support resources with '!cluster' tags. "
            "The '!cluster' tag is used for those resources that need to be handled "
            "in the cluster when running remote experiments.")

    if not self.env:
        self.tmp_resources_dir = tempfile.TemporaryDirectory()
        resources_folder = self.tmp_resources_dir.name
    else:
        resources_folder = f"{self.full_save_path}/_resources"

    resources = self.process_resources(self.resources, resources_folder)

    # rsync downloaded resources
    if self.env:
        run_utils.rsync_hosts(self.env.orchestrator_ip,
                              self.env.factories_ips,
                              self.env.user,
                              self.full_save_path,
                              self.env.key,
                              exclude=["state.pkl"])

    # Check that links are in order (i.e. topologically in pipeline)
    utils.check_links(self.pipeline, resources)

    # Check that only computable blocks are given
    # search algorithms and schedulers
    utils.check_search(self.pipeline, self.search, self.schedulers)

    # Initialize ray cluster
    kwargs = {"logging_level": logging.ERROR, "include_webui": False}
    if debug:
        kwargs['local_mode'] = True

    if self.env:
        ray.init(redis_address=f"{self.env.orchestrator_ip}:{const.RAY_REDIS_PORT}",
                 **kwargs)
    else:
        ray.init(**kwargs)
    logger.debug("Ray cluster up")

    # Initialize map from block to list of checkpoints.
    # This is used when resolving links over other computable blocks.
    # TODO: in python 3.7 we can replace these with dict() or {}
    checkpoints: OrderedDict = OrderedDict()
    schemas: OrderedDict = OrderedDict()
    success: OrderedDict = OrderedDict()

    # By default use all CPUs if no GPU is present
    devices = self.devices if self.devices else None
    if devices is None and utils.local_has_gpu():
        devices = {"cpu": 4, "gpu": 1}

    to_resume = None
    if isinstance(self.resume, str):
        index = list(self.pipeline.keys()).index(self.resume)
        to_resume = list(self.pipeline.keys())[:index + 1]
    elif isinstance(self.resume, Sequence):
        to_resume = list(self.resume)

    # Make experiment_tag easier to extract
    def trial_name_creator(trial):
        identifier = ""
        if "env" in trial.config:
            env = trial.config["env"]
            if isinstance(env, type):
                env = env.__name__
            identifier += f"{env}"
        if trial.experiment_tag:
            hyper_params = {}
            if "_" in trial.experiment_tag:
                num, tunable_params = trial.experiment_tag.split("_", 1)
                identifier += tunable_params
                param_list = [p.split("=") for p in tunable_params.split(",")]
                hyper_params = {p[0]: p[1] for p in param_list}
            else:
                identifier += trial.experiment_tag
            trial.config['hyper_params'] = hyper_params
        return identifier.replace("/", "_")

    trial_name_creator = ray.tune.function(trial_name_creator)

    # Compute dependencies DAG
    dependency_dag = {}
    schemas_dag: OrderedDict = OrderedDict()
    for block_id, schema_block in self.pipeline.items():
        schemas_dag[block_id] = schema_block
        relevant_ids = utils.extract_needed_blocks(schemas_dag, block_id, resources)
        dependencies = deepcopy(relevant_ids)
        dependencies.discard(block_id)
        dependency_dag[block_id] = list(dependencies)

    if self.env:
        self.progress_state = ProgressState(self.name, full_save_path,
                                            dependency_dag, self.content,
                                            len(self.env.factories_ips))
    else:
        self.progress_state = ProgressState(self.name, full_save_path,
                                            dependency_dag, self.content)

    for block_id, schema_block in tqdm(self.pipeline.items()):
        schema_block.add_extensions_metadata(self.extensions)
        logger.debug(f"Starting {block_id}")

        # Add the block to the configuration so far
        schemas[block_id] = schema_block
        success[block_id] = True

        self.progress_state.checkpoint_start(block_id)
        relevant_ids = utils.extract_needed_blocks(schemas, block_id, resources)
        relevant_schemas = {
            k: v for k, v in deepcopy(schemas).items() if k in relevant_ids
        }

        # Set resume
        resume = False if to_resume is None else (block_id in to_resume)

        # If computable, convert to tune.Trainable
        # Each Component block is an Experiment in ray.tune
        if not isinstance(schema_block, Schema):
            raise ValueError('schema block not of correct type Schema')
        if issubclass(schema_block.component_subclass, Component):

            # Returns a list of non-nested configurations
            divided_schemas = list(
                utils.divide_nested_grid_search_options(relevant_schemas))
            divided_dict = [utils.extract_dict(x) for x in divided_schemas]
            # Convert options and links
            divided_dict_tune = [utils.convert_tune(x) for x in divided_dict]

            # Execute block
            tune_experiments = []
            for param_dict, schemas_dict in zip(divided_dict_tune, divided_schemas):
                config = {'name': block_id,
                          'merge_plot': self.merge_plot,
                          'params': param_dict,
                          'schemas': Schema.serialize(schemas_dict),
                          'checkpoints': checkpoints,
                          'to_run': block_id,
                          'global_vars': resources,
                          'verbose': verbose,
                          'custom_modules': list(self.extensions.keys()),
                          'debug': debug}

                # Filter out the tensorboard logger as we handle
                # general and tensorboard-specific logging ourselves
                tune_loggers = list(filter(
                    lambda l: l != tf2_compat_logger and  # noqa: E741
                    not issubclass(l, TFLogger),
                    DEFAULT_LOGGERS))

                tune_experiment = ray.tune.Experiment(
                    name=block_id,
                    run=TuneAdapter,
                    trial_name_creator=trial_name_creator,
                    config=deepcopy(config),
                    local_dir=full_save_path,
                    checkpoint_freq=1,
                    checkpoint_at_end=True,
                    max_failures=self.max_failures,
                    resources_per_trial=devices,
                    loggers=tune_loggers)
                logger.debug(f"Created tune.Experiment for {param_dict}")
                tune_experiments.append(tune_experiment)

            trials = ray.tune.run_experiments(
                tune_experiments,
                search_alg=self.search.get(block_id, None),
                scheduler=self.schedulers.get(block_id, None),
                queue_trials=True,
                verbose=False,
                resume=resume,
                raise_on_failed_trial=False)
            logger.debug(f"Finished running all tune.Experiments for {block_id}")

            any_error = False
            for t in trials:
                if t.status == t.ERROR:
                    logger.error(cl.RE(
                        f"Variant {t} of '{block_id}' ended with ERROR status."))
                    success[block_id] = False
                    any_error = True

            if any_error and self.stop_on_failure:
                self.teardown()
                self.progress_state.checkpoint_end(block_id, success[block_id])
                raise error.UnsuccessfulRunnableError(
                    f"Stopping experiment at block '{block_id}' "
                    "because there was an error and stop_on_failure == True.")

            # Save checkpoint location.
            # It should point from:
            # block_id -> hash(variant) -> checkpoint
            hashes = []
            for t in trials:
                schema_with_params: Dict = OrderedDict()
                for b in schemas_dict:
                    schema_copy = deepcopy(schemas_dict[b])
                    utils.update_schema_with_params(schema_copy,
                                                    t.config['params'][b])
                    schema_with_params[b] = schema_copy
                hashes.append(repr(schema_with_params))

            paths = [t._checkpoint.value for t in trials]

            # Mask out error trials
            mask = [True] * len(trials)
            for i, trial in enumerate(trials):
                if trial.status == ray.tune.trial.Trial.ERROR:
                    mask[i] = False

            # Mask out on reduce
            reduce_k = self.reduce.get(block_id, None)
            if reduce_k is not None and int(reduce_k) > 0:
                # Get the best trials and mask out the rest
                best_trials = utils.get_best_trials(trials, topk=int(reduce_k))
                best_trial_ids = set([t.trial_id for t in best_trials])
                for i, trial in enumerate(trials):
                    if trial.trial_id not in best_trial_ids:
                        mask[i] = False

            trial_checkpoints = {t_hash: path for t_hash, path in zip(hashes, paths)}
            trial_mask = {t_hash: mask_value for t_hash, mask_value in zip(hashes, mask)}
            checkpoints[block_id] = {'paths': trial_checkpoints,
                                     'mask': trial_mask}

        # Rsync workers to main machine and back to all workers
        # TODO specify callbacks. If not remote will not work
        if self.env:
            run_utils.rsync_hosts(self.env.orchestrator_ip,
                                  self.env.factories_ips,
                                  self.env.user,
                                  self.full_save_path,
                                  self.env.key,
                                  exclude=["state.pkl"])

        self.progress_state.checkpoint_end(block_id, success[block_id])
        logger.debug(f"Done running {block_id}")

    self.teardown()

    if all(success.values()):
        logger.info(cl.GR("Experiment ended successfully"))
    else:
        raise error.UnsuccessfulRunnableError(
            "Not all trials were successful. Check the logs for more information")
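
# A standalone illustration of the experiment_tag parsing inside
# trial_name_creator above: ray.tune tags look like
# "<num>_<param>=<value>,<param>=<value>", so splitting on the first '_'
# and then on ',' and '=' recovers the hyperparameters. The tag value and
# 'parse_experiment_tag' helper are made up for illustration.
from typing import Dict


def parse_experiment_tag(experiment_tag: str) -> Dict[str, str]:
    hyper_params: Dict[str, str] = {}
    if "_" in experiment_tag:
        _num, tunable_params = experiment_tag.split("_", 1)
        param_list = [p.split("=") for p in tunable_params.split(",")]
        hyper_params = {p[0]: p[1] for p in param_list}
    return hyper_params


# parse_experiment_tag("0_lr=0.01,dropout=0.5")
# -> {'lr': '0.01', 'dropout': '0.5'}
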
def load_all_instances(self) -> None:
    """Launch all instances for the experiment.

    This method launches both the orchestrator and the factories.

    """
    boto_orchestrator, boto_factories = self._existing_cluster()

    with ThreadPoolExecutor() as executor:
        future_orch, future_factories = None, None

        if boto_orchestrator:
            self.orchestrator = self.get_orchestrator(
                self._get_boto_public_host(boto_orchestrator),
                self._get_boto_private_host(boto_orchestrator))
            logger.info(cl.BL(
                f"Found existing orchestrator ({boto_orchestrator.instance_type}) "
                f"{self.orchestrator.host}"))
        else:
            future_orch = executor.submit(self._create_orchestrator)

        for f in boto_factories:
            factory = self.get_factory(self._get_boto_public_host(f),
                                       self._get_boto_private_host(f))
            if factory.contains_gpu():
                factory = self.get_gpu_factory(self._get_boto_public_host(f),
                                               self._get_boto_private_host(f))

            self.factories.append(factory)

        if len(self.factories) > 0:
            logger.info(cl.BL(
                f"Found {len(self.factories)} existing factories "
                f"({str([f.host for f in self.factories])})."))

        pending_new_factories = self.factories_num - len(self.factories)
        logger.debug(f"Creating {pending_new_factories} factories")

        if pending_new_factories > 0:
            future_factories = executor.submit(self._create_factories,
                                               number=pending_new_factories)
        elif pending_new_factories < 0:
            logger.info(cl.BL(f"Reusing existing {len(boto_factories)} factories."))

        try:
            if future_orch:
                self.orchestrator = future_orch.result()
                logger.info(cl.BL(f"New orchestrator created {self.orchestrator.host}"))

            if future_factories:
                new_factories = future_factories.result()
                self.factories.extend(new_factories)
                logger.info(cl.BL(
                    f"{pending_new_factories} factories {self.factories_type} created "
                    f"({str([f.host for f in new_factories])})."))
        except botocore.exceptions.ClientError as e:
            raise errors.ClusterError(
                "Error creating the instances. Check that the provided configuration "
                f"is correct. Original error: {e}")

    self.name_hosts()
    self.update_tags()
    self.remove_existing_events()
    self.create_cloudwatch_events()
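
# A minimal sketch of the executor pattern above, with placeholder create_*
# functions: submit the slow instance creations to worker threads, keep
# going, then block on .result(), which also re-raises any exception from
# the worker thread (which is what the botocore handler above relies on).
from concurrent.futures import ThreadPoolExecutor
import time


def create_orchestrator() -> str:
    time.sleep(1)  # stands in for a slow boto3 call
    return "orchestrator-host"


def create_factories(number: int) -> list:
    time.sleep(1)
    return [f"factory-{i}" for i in range(number)]


with ThreadPoolExecutor() as executor:
    future_orch = executor.submit(create_orchestrator)
    future_factories = executor.submit(create_factories, number=2)
    # .result() blocks until the future completes and re-raises worker errors
    print(future_orch.result(), future_factories.result())
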
def main(args: argparse.Namespace) -> None:
    """Execute command based on given config"""
    if is_dev_mode():
        print(cl.RA(ASCII_LOGO_DEV))
        print(cl.BL(f"Location: {get_flambe_repo_location()}\n"))
    else:
        print(cl.RA(ASCII_LOGO))
        print(cl.BL(f"VERSION: {flambe.__version__}\n"))

    # Pass original module for ray / pickle
    make_component(torch.nn.Module, TORCH_TAG_PREFIX, only_module='torch.nn')
    # torch.optim.Optimizer exists, ignore mypy
    make_component(torch.optim.Optimizer, TORCH_TAG_PREFIX,  # type: ignore
                   only_module='torch.optim')
    make_component(torch.optim.lr_scheduler._LRScheduler,
                   TORCH_TAG_PREFIX, only_module='torch.optim.lr_scheduler')
    make_component(ray.tune.schedulers.TrialScheduler, TUNE_TAG_PREFIX)
    make_component(ray.tune.suggest.SearchAlgorithm, TUNE_TAG_PREFIX)

    # TODO: check first if there is a cluster; if there is, there
    # is no need to install extensions
    check_system_reqs()
    with SafeExecutionContext(args.config) as ex:
        if args.cluster is not None:
            with SafeExecutionContext(args.cluster) as ex_cluster:
                cluster, _ = ex_cluster.preprocess(secrets=args.secrets,
                                                   install_ext=args.install_extensions)
                runnable, extensions = ex.preprocess(import_ext=False,
                                                     secrets=args.secrets)
                cluster.run(force=args.force)
                if isinstance(runnable, ClusterRunnable):
                    cluster = cast(Cluster, cluster)

                    # This is independent of the type of ClusterRunnable
                    destiny = os.path.join(cluster.get_orch_home_path(), "extensions")

                    # Before sending the extensions, they need to be
                    # downloaded (locally).
                    t = os.path.join(FLAMBE_GLOBAL_FOLDER, "extensions")
                    extensions = download_extensions(extensions, t)

                    # At this point, all remote extensions
                    # (except pypi extensions)
                    # have local paths.
                    new_extensions = cluster.send_local_content(extensions,
                                                                destiny,
                                                                all_hosts=True)

                    new_secrets = cluster.send_secrets()

                    # Installing the extensions is crucial as flambe
                    # will execute without the '-i' flag and therefore
                    # will assume that the extensions are installed
                    # in the orchestrator.
                    cluster.install_extensions_in_orchestrator(new_extensions)
                    logger.info(cl.GR("Extensions installed in Orchestrator"))

                    runnable.setup_inject_env(cluster=cluster,
                                              extensions=new_extensions,
                                              force=args.force)
                    cluster.execute(runnable, new_extensions,
                                    new_secrets, args.force)
                else:
                    raise ValueError(
                        "Only ClusterRunnables can be executed in a cluster.")
        else:
            runnable, _ = ex.preprocess(secrets=args.secrets,
                                        install_ext=args.install_extensions)
            runnable.run(force=args.force, verbose=args.verbose)