def run_cmds(self, setup_cmds: List[str]) -> None:
    """Execute the setup commands on every host of the cluster.

    Each host returned by `_get_all_hosts` gets its own task in a
    thread pool; the method waits for all of them to finish.

    Parameters
    ----------
    setup_cmds: List[str]
        The list of commands to run in each host

    Raises
    ------
    errors.RemoteCommandError
        If at least one command is not successful in at least one
        host.

    """
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(host.run_cmds, setup_cmds)
            for host in self._get_all_hosts()
        ]

        for task in pending:
            try:
                # Re-raises whatever the host task raised
                task.result()
            except errors.RemoteCommandError:
                raise
            except Exception as exc:
                # Unexpected failures are logged before propagating
                logger.error(f'Generated an unknown exception: {exc}')
                raise

    logger.info(cl.GR("Custom commands ran successfully in all hosts"))
def prepare_all_instances(self) -> None:
    """Call `prepare` on every host (orchestrator and factories).

    The hosts are assumed to be running and accessible. One task per
    host is submitted to a thread pool and the method blocks until
    all of them finish.

    Raises
    ------
    errors.RemoteCommandError
        Propagated unchanged if raised while preparing a host.

    """
    with ThreadPoolExecutor() as pool:
        # Remember which host each task belongs to for the debug log
        host_by_task = {
            pool.submit(host.prepare): host
            for host in self._get_all_hosts()
        }

        for task, host in host_by_task.items():
            try:
                task.result()
            except errors.RemoteCommandError:
                raise
            except Exception as exc:
                logger.error(f'Generated an exception: {exc}')
                raise
            else:
                logger.debug(f'{host.host} ready')

    logger.info(cl.GR("All instances prepared"))
def create_dirs(self, relative_dirs: List[str]) -> None:
    """Create folders in all hosts.

    If some of them already exist, nothing is done for those.

    Parameters
    ----------
    relative_dirs: List[str]
        The directories to create. They should be relative paths
        and $HOME of each host will be used to add the prefix.

    Raises
    ------
    errors.RemoteCommandError
        Propagated unchanged if raised while creating the folders
        in a host.

    """
    with ThreadPoolExecutor() as pool:
        # Remember which host each task belongs to for the debug log
        host_by_task = {
            pool.submit(host.create_dirs, relative_dirs): host
            for host in self._get_all_hosts()
        }

        for task, host in host_by_task.items():
            try:
                task.result()
            except errors.RemoteCommandError:
                raise
            except Exception as exc:
                logger.error(f'Generated an exception: {exc}')
                raise
            else:
                # Previously logged "<host> ready", a leftover from
                # prepare_all_instances that hid what actually happened.
                logger.debug(f'Directories created in {host.host}')

    # Previously reported "All instances prepared" (copy-paste error).
    logger.info(cl.GR("Directories created in all hosts"))
def launch_flambe(self,
                  config_file: str,
                  secrets_file: str,
                  force: bool) -> None:
    """Start a flambe execution in the remote host inside tmux.

    The command is run through `_run_cmd` in a detached tmux session
    named 'flambe', with its output redirected to `output.log`.

    Parameters
    ----------
    config_file: str
        The config filename relative to the orchestrator
    secrets_file: str
        The filepath containing the secrets for the orchestrator
    force: bool
        The force parameter that was originally passed to flambe

    Raises
    ------
    errors.RemoteCommandError
        If the remote command does not report success.

    """
    force_flag = "--force" if force else ""

    # What bash will execute inside the tmux session
    flambe_invocation = (
        f"flambe {config_file} -i --secrets {secrets_file} "
        f"{force_flag} &> output.log"
    )
    tmux_cmd = (
        "tmux new-session -d -s 'flambe' "
        f"'bash -lc \"{flambe_invocation}\"'"
    )

    result = self._run_cmd(tmux_cmd)
    if not result.success:
        raise errors.RemoteCommandError(
            f"Not able to run flambe. {result.msg}")

    logger.info(cl.GR("Running flambe in Orchestrator"))
def install_extensions(extensions: Dict[str, str],
                       user_flag: bool = False) -> None:
    """Install extensions with pip.

    At this point, all extensions must be either local paths or valid
    pypi packages. Remote extensions hosted in Github must have been
    downloaded first.

    Parameters
    ----------
    extensions: Dict[str, str]
        Dictionary of extensions (name -> local path or pypi
        requirement)
    user_flag: bool
        Use --user flag when running pip install

    Raises
    ------
    ImportError
        If pip exits with a non-zero status for any extension. The
        original subprocess.CalledProcessError is chained as the cause.

    """
    base_cmd = ['python3', '-m', 'pip', 'install', '-U']
    if user_flag:
        base_cmd.append('--user')

    # pip reports replaced distributions as
    # "Successfully uninstalled <name>-<version>". The greedy name group
    # backtracks to the last hyphen, so names containing digits or
    # hyphens (e.g. "h5py", "scikit-learn") are matched as well; the
    # previous "\D*" name group silently missed those.
    uninstalled_re = re.compile(
        r'Successfully uninstalled (?P<pkg_name>\S+)-(?P<version>\S+)')

    for ext, resource in extensions.items():
        if os.path.exists(resource) and os.sep not in resource:
            # Local package given as a bare name: prefix with "./"
            # (presumably so pip reads it as a path, not a pypi name)
            resource = f"./{resource}"
        # Otherwise the resource is used as is: either a local path or
        # pypi notation such as "torch>=0.4.1,<1.1"

        try:
            raw_output = subprocess.check_output(base_cmd + [resource],
                                                 stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError as err:
            raise ImportError(
                f"Could not install package in {resource}") from err

        for line in raw_output.decode("utf-8").splitlines():
            logger.debug(line)
            found = uninstalled_re.search(line)
            if found:
                logger.info(
                    cl.RE(f"WARNING: While installing {ext}, " +
                          f"existing {found.group('pkg_name')}-" +
                          f"{found.group('version')} was uninstalled."))

        logger.info(cl.GR(f"Successfully installed {ext}"))
def run(self, force: bool = False, **kwargs) -> None:
    """Load the cluster and get every host ready to run flambe.

    The steps are, in order: load all instances, wait until each
    host is accessible, distribute the key pair, create the
    "extensions" folder, run the custom setup commands (when
    `setup_cmds` is not None) and finally prepare all instances.

    Parameters
    ----------
    force: bool, defaults to False
        Accepted as part of the signature; it is not read anywhere
        in this method's body.
    **kwargs
        Accepted as part of the signature; not read in this method's
        body.

    """
    self.load_all_instances()
    logger.info(cl.GR("Cluster loaded"))

    # Block until every host answers before touching any of them
    for host in self._get_all_hosts():
        host.wait_until_accessible()
    logger.debug("All instances accessible.")

    self.distribute_keys()

    self.create_dirs(["extensions"])
    logger.debug("Created flambe folder to store content")

    # Custom commands are optional
    if self.setup_cmds is not None:
        self.run_cmds(self.setup_cmds)

    self.prepare_all_instances()
    logger.info(cl.GR("Flambe installed in all hosts"))
def launch_ray_cluster(self) -> None:
    """Create a ray cluster.

    The main node is launched in the orchestrator machine and one
    worker node is launched in each factory, pointing at the
    orchestrator's private host and `const.RAY_REDIS_PORT`.

    The original notes state that the main node runs with the
    --num-cpus=0 flag so that all work is done by the factories; that
    flag is not set in this method.

    Raises
    ------
    man_errors.ClusterError
        If any host already runs a node of an existing cluster, or if
        the orchestrator instance was not loaded.
    errors.RemoteCommandError
        Propagated unchanged if raised while launching a worker node.

    """
    for ins in self._get_all_hosts():
        if ins.is_node_running():
            raise man_errors.ClusterError(
                f"Node {ins.host} is running in an existing cluster. Aborting."
            )

    port = const.RAY_REDIS_PORT

    # The orchestrator needs to exist at this point
    if not self.orchestrator:
        raise man_errors.ClusterError(
            "Orchestrator instance was not loaded.")

    self.orchestrator.launch_node(port)

    redis_address = f"{self.orchestrator.private_host}:{port}"

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, which
    # made a cluster without factories fail here. Falling back to None
    # (the executor's default size) keeps that configuration working.
    pool_size = self.factories_num or None

    with ThreadPoolExecutor(max_workers=pool_size) as executor:
        # Remember which factory each task belongs to for the debug log
        futures = {
            executor.submit(ins.launch_node, redis_address): ins
            for ins in self.factories
        }

        for future, ins in futures.items():
            try:
                future.result()
            except errors.RemoteCommandError:
                raise
            except Exception as exc:
                logger.error('Generated an exception: {}'.format(exc))
                raise
            else:
                logger.debug('{} Ray worker ready'.format(ins.host))

    logger.info(cl.GR("Ray cluster launched"))
def distribute_keys(self) -> None:
    """Create a new RSA key pair and send it to every host.

    Nothing is done when `cluster_has_key` reports that the cluster
    already has a key pair. Otherwise a 2048-bit key is generated,
    the public key is appended to each host's authorized_keys and
    both keys are copied to each host's home folder.

    Raises
    ------
    man_errors.ClusterError
        If the public key could not be appended to authorized_keys
        in some host.
    errors.RemoteCommandError
        Propagated unchanged if raised while handling a host.

    """
    if self.cluster_has_key():
        logger.info(cl.GR("Cluster has already configured key pair"))
        return

    # Generate the private/public key pair
    rsa_key = rsa.generate_private_key(backend=default_backend(),
                                       public_exponent=65537,
                                       key_size=2048)

    # Public key in OpenSSH format
    public_bytes = rsa_key.public_key().public_bytes(
        serialization.Encoding.OpenSSH,
        serialization.PublicFormat.OpenSSH)

    # Private key in an unencrypted PEM container
    private_bytes = rsa_key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption())

    # Decode to printable strings
    private_key_str = private_bytes.decode('utf-8')
    public_key_str = public_bytes.decode('utf-8')

    logger.debug("New key pair generated")

    def install_keys(host):
        # Drop any key files left over in the host
        host._run_cmd(f"rm -rf {host.get_home_path()}/{const.PUBLIC_KEY}")
        host._run_cmd(f"rm -rf {host.get_home_path()}/{const.PRIVATE_KEY}")

        # Authorize the new public key in this host
        authorized = host._run_cmd(
            f"echo '{public_key_str}' >> {host.get_home_path()}/.ssh/authorized_keys",
            retries=3)
        if not authorized.success:
            raise man_errors.ClusterError(
                "Could not send key to authorized_keys")

        # The keys go through local temporary files because they are
        # transferred with rsync
        with tempfile.NamedTemporaryFile("w") as private_tmp:
            private_tmp.write(private_key_str)
            private_tmp.flush()
            host.send_rsync(private_tmp.name,
                            f"{host.get_home_path()}/{const.PRIVATE_KEY}")
            host._run_cmd(
                f"chmod 600 {host.get_home_path()}/{const.PRIVATE_KEY}")

        with tempfile.NamedTemporaryFile("w") as public_tmp:
            public_tmp.write(public_key_str)
            public_tmp.flush()
            host.send_rsync(public_tmp.name,
                            f"{host.get_home_path()}/{const.PUBLIC_KEY}")
            logger.debug(f"New key pair sent to {host.host}")

    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(install_keys, host)
            for host in self._get_all_hosts()
        ]

        for task in pending:
            try:
                task.result()
            except errors.RemoteCommandError:
                raise
            except Exception as exc:
                logger.error(f'Generated an exception: {exc}')
                raise

    logger.info(cl.GR("Distributed keys"))
def run(self,
        force: bool = False,
        verbose: bool = False,
        debug: bool = False,
        **kwargs):
    """Run an Experiment.

    Prepares the save path and the resources, initializes ray and
    then runs every block of the pipeline, in order, as a set of
    ray.tune experiments. The checkpoints of each block are recorded
    so they are available in the config of the following blocks.

    Parameters
    ----------
    force: bool
        If True and there is no `self.env`, an existing save path is
        removed before running. It also disables the check that
        rejects a non-empty save path.
    verbose: bool
        Forwarded in the config of every tune experiment.
    debug: bool
        If True, ray is initialized with `local_mode=True`. Also
        forwarded in the config of every tune experiment.
    **kwargs
        Not read: the name is rebound to the ray.init options below.

    Raises
    ------
    error.ParsingRunnableError
        If the save path exists, is not empty, and neither `resume`
        nor `force` is set.
    ValueError
        If any resource is a ClusterResource, or if a pipeline block
        is not a Schema.
    error.UnsuccessfulRunnableError
        If a block had a trial with ERROR status and
        `stop_on_failure` is set, or at the end if any block was not
        successful.

    """
    logger.info(cl.BL("Launching local experiment"))

    # Check if save_path/name already exists + is not empty
    # + force and resume are False
    if (os.path.exists(self.full_save_path)
            and os.listdir(self.full_save_path) and not self.resume
            and not force):
        raise error.ParsingRunnableError(
            f"Results from an experiment with the same name were located in the save path "
            + f"{self.full_save_path}. To overide this results, please use '--force' "
            + "To use these results and resume the experiment, pick 'resume: True' "
            + "If not, just pick another save_path/name.")

    full_save_path = self.full_save_path

    if not self.env:
        wording.print_useful_local_info(full_save_path)

    # If running remotely then all folders were already created.
    # in the 'setup' method.
    if not self.env:
        if os.path.exists(full_save_path) and force:
            shutil.rmtree(full_save_path)  # This deleted the folder also
            logger.info(
                cl.RE(f"Removed previous existing from {full_save_path} " +
                      "results as --force was specified"))

        if not os.path.exists(full_save_path):
            os.makedirs(full_save_path)
            logger.debug(f"{full_save_path} created to store output")

    self._dump_experiment_file()

    if any(
            map(lambda x: isinstance(x, ClusterResource),
                self.resources.values())):
        raise ValueError(
            f"Local experiments doesn't support resources with '!cluster' tags. "
            +
            "The '!cluster' tag is used for those resources that need to be handled "
            + "in the cluster when running remote experiments.")

    # Without an env the resources go to a temporary directory that is
    # kept alive through self.tmp_resources_dir; with an env they go
    # under the save path.
    if not self.env:
        self.tmp_resources_dir = tempfile.TemporaryDirectory()
        resources_folder = self.tmp_resources_dir.name
    else:
        resources_folder = f"{self.full_save_path}/_resources"

    resources = self.process_resources(self.resources, resources_folder)

    # rsync downloaded resources
    if self.env:
        run_utils.rsync_hosts(self.env.orchestrator_ip,
                              self.env.factories_ips,
                              self.env.user,
                              self.full_save_path,
                              self.env.key,
                              exclude=["state.pkl"])

    # Check that links are in order (i.e topologically in pipeline)
    utils.check_links(self.pipeline, resources)

    # Check that only computable blocks are given
    # search algorithms and schedulers
    utils.check_search(self.pipeline, self.search, self.schedulers)

    # Initialize ray cluster
    # NOTE: this rebinds `kwargs`, so the keyword arguments received by
    # this method are discarded from here on.
    kwargs = {"logging_level": logging.ERROR, "include_webui": False}
    if debug:
        kwargs['local_mode'] = True

    if self.env:
        ray.init(redis_address=
                 f"{self.env.orchestrator_ip}:{const.RAY_REDIS_PORT}",
                 **kwargs)
    else:
        ray.init(**kwargs)

    logger.debug(f"Ray cluster up")

    # Initialize map from block to list of checkpoints
    # This is used when resolving links over other computable blocks
    # TODO: in python 3.7 we can replace these with dict() or {}
    checkpoints: OrderedDict = OrderedDict()
    schemas: OrderedDict = OrderedDict()
    success: OrderedDict = OrderedDict()

    # Use the configured devices if any; otherwise, when a local GPU is
    # detected, request 4 CPUs and 1 GPU per trial. If neither applies
    # `devices` stays None.
    devices = self.devices if self.devices else None
    if devices is None and utils.local_has_gpu():
        devices = {"cpu": 4, "gpu": 1}

    # Blocks to resume: a string means every block up to and including
    # that block (in pipeline order); a sequence is used as given.
    to_resume = None
    if isinstance(self.resume, str):
        index = list(self.pipeline.keys()).index(self.resume)
        to_resume = list(self.pipeline.keys())[:index + 1]
    elif isinstance(self.resume, Sequence):
        to_resume = list(self.resume)

    # Make experiment_tag easier to extract
    def trial_name_creator(trial):
        identifier = ""
        if "env" in trial.config:
            env = trial.config["env"]
            if isinstance(env, type):
                env = env.__name__
            identifier += f"{env}"
        if trial.experiment_tag:
            hyper_params = {}
            if "_" in trial.experiment_tag:
                # Everything after the first "_" is parsed as a
                # comma-separated list of "name=value" pairs
                num, tunable_params = trial.experiment_tag.split("_", 1)
                identifier += tunable_params
                param_list = [
                    p.split("=") for p in tunable_params.split(",")
                ]
                hyper_params = {p[0]: p[1] for p in param_list}
            else:
                identifier += trial.experiment_tag
            # Side effect: the parsed values are stored in the trial
            trial.config['hyper_params'] = hyper_params
        return identifier.replace("/", "_")

    trial_name_creator = ray.tune.function(trial_name_creator)

    # Compute dependencies DAG: for each block, the other blocks it
    # needs given the blocks seen so far
    dependency_dag = {}
    schemas_dag: OrderedDict = OrderedDict()
    for block_id, schema_block in self.pipeline.items():
        schemas_dag[block_id] = schema_block
        relevant_ids = utils.extract_needed_blocks(schemas_dag, block_id,
                                                   resources)
        dependencies = deepcopy(relevant_ids)
        dependencies.discard(block_id)

        dependency_dag[block_id] = list(dependencies)

    if self.env:
        self.progress_state = ProgressState(self.name, full_save_path,
                                            dependency_dag, self.content,
                                            len(self.env.factories_ips))
    else:
        self.progress_state = ProgressState(self.name, full_save_path,
                                            dependency_dag, self.content)

    for block_id, schema_block in tqdm(self.pipeline.items()):
        schema_block.add_extensions_metadata(self.extensions)
        logger.debug(f"Starting {block_id}")

        # Add the block to the configuration so far
        schemas[block_id] = schema_block
        success[block_id] = True

        self.progress_state.checkpoint_start(block_id)
        relevant_ids = utils.extract_needed_blocks(schemas, block_id,
                                                   resources)
        relevant_schemas = {
            k: v
            for k, v in deepcopy(schemas).items() if k in relevant_ids
        }

        # Set resume
        resume = False if to_resume is None else (block_id in to_resume)

        # If computable, convert to tune.Trainable
        # Each Component block is an Experiment in ray.tune
        if not isinstance(schema_block, Schema):
            raise ValueError('schema block not of correct type Schema')
        if issubclass(schema_block.component_subclass, Component):

            # Returns is a list non-nested configuration
            divided_schemas = list(
                utils.divide_nested_grid_search_options(relevant_schemas))
            divided_dict = [utils.extract_dict(x) for x in divided_schemas]
            # Convert options and links
            divided_dict_tune = [
                utils.convert_tune(x) for x in divided_dict
            ]
            # Execute block
            tune_experiments = []
            for param_dict, schemas_dict in zip(divided_dict_tune,
                                                divided_schemas):
                config = {
                    'name': block_id,
                    'merge_plot': self.merge_plot,
                    'params': param_dict,
                    'schemas': Schema.serialize(schemas_dict),
                    'checkpoints': checkpoints,
                    'to_run': block_id,
                    'global_vars': resources,
                    'verbose': verbose,
                    'custom_modules': list(self.extensions.keys()),
                    'debug': debug
                }
                # Filter out the tensorboard logger as we handle
                # general and tensorboard-specific logging ourselves
                tune_loggers = list(
                    filter(
                        lambda l: l != tf2_compat_logger and  # noqa: E741
                        not issubclass(l, TFLogger),
                        DEFAULT_LOGGERS))
                tune_experiment = ray.tune.Experiment(
                    name=block_id,
                    run=TuneAdapter,
                    trial_name_creator=trial_name_creator,
                    config=deepcopy(config),
                    local_dir=full_save_path,
                    checkpoint_freq=1,
                    checkpoint_at_end=True,
                    max_failures=self.max_failures,
                    resources_per_trial=devices,
                    loggers=tune_loggers)
                logger.debug(f"Created tune.Experiment for {param_dict}")
                tune_experiments.append(tune_experiment)

            # Failed trials do not raise here; they are inspected below
            trials = ray.tune.run_experiments(
                tune_experiments,
                search_alg=self.search.get(block_id, None),
                scheduler=self.schedulers.get(block_id, None),
                queue_trials=True,
                verbose=False,
                resume=resume,
                raise_on_failed_trial=False)
            logger.debug(
                f"Finish running all tune.Experiments for {block_id}")

            any_error = False
            for t in trials:
                if t.status == t.ERROR:
                    logger.error(
                        cl.
                        RE(f"Variant {t} of '{block_id}' ended with ERROR status."
                           ))
                    success[block_id] = False
                    any_error = True
            if any_error and self.stop_on_failure:
                self.teardown()
                self.progress_state.checkpoint_end(block_id,
                                                   success[block_id])
                raise error.UnsuccessfulRunnableError(
                    f"Stopping experiment at block '{block_id}' "
                    "because there was an error and stop_on_failure == True."
                )

            # Save checkpoint location
            # It should point from:
            # block_id -> hash(variant) -> checkpoint
            # NOTE(review): `schemas_dict` here is the value left over
            # from the last iteration of the experiment-building loop
            # above, so every trial is hashed against that one schema
            # set (and the name is undefined if that loop never ran).
            # Confirm this is intended.
            hashes = []
            for t in trials:
                schema_with_params: Dict = OrderedDict()
                for b in schemas_dict:
                    schema_copy = deepcopy(schemas_dict[b])
                    utils.update_schema_with_params(
                        schema_copy, t.config['params'][b])
                    schema_with_params[b] = schema_copy
                hashes.append(repr(schema_with_params))

            paths = [t._checkpoint.value for t in trials]

            # Mask out error trials
            mask = [True] * len(trials)
            for i, trial in enumerate(trials):
                if trial.status == ray.tune.trial.Trial.ERROR:
                    mask[i] = False

            # Mask out on reduce
            reduce_k = self.reduce.get(block_id, None)
            if reduce_k is not None and int(reduce_k) > 0:
                # Get best
                best_trials = utils.get_best_trials(trials,
                                                    topk=int(reduce_k))
                best_trial_ids = set([t.trial_id for t in best_trials])
                # Mask out
                for i, trial in enumerate(trials):
                    if trial.trial_id not in best_trial_ids:
                        mask[i] = False

            # Both maps are keyed by the variant hash computed above
            trial_checkpoints = {
                t_hash: path
                for t_hash, path in zip(hashes, paths)
            }
            trial_mask = {
                t_hash: mask_value
                for t_hash, mask_value in zip(hashes, mask)
            }
            checkpoints[block_id] = {
                'paths': trial_checkpoints,
                'mask': trial_mask
            }

            # Rsync workers to main machine and back to all workers
            # TODO specify callbacks. If not remote will not work
            if self.env:
                run_utils.rsync_hosts(self.env.orchestrator_ip,
                                      self.env.factories_ips,
                                      self.env.user,
                                      self.full_save_path,
                                      self.env.key,
                                      exclude=["state.pkl"])

        self.progress_state.checkpoint_end(block_id, success[block_id])
        logger.debug(f"Done running {block_id}")

    self.teardown()

    if all(success.values()):
        logger.info(cl.GR("Experiment ended successfully"))
    else:
        raise error.UnsuccessfulRunnableError(
            "Not all trials were successful. Check the logs for more information"
        )
def main(args: argparse.Namespace) -> None:
    """Execute command based on given config.

    Prints the logo, registers the torch and tune components, checks
    the system requirements and then runs the runnable described by
    `args.config`, either locally or, when `args.cluster` is given,
    remotely in that cluster.

    Parameters
    ----------
    args: argparse.Namespace
        The parsed command line arguments. The attributes read here
        are `config`, `cluster`, `secrets`, `install_extensions`,
        `force` and `verbose`.

    Raises
    ------
    ValueError
        If a cluster was given but the runnable is not a
        ClusterRunnable. This is checked before the cluster is run,
        so no cluster is started for an invalid runnable.

    """
    if is_dev_mode():
        print(cl.RA(ASCII_LOGO_DEV))
        print(cl.BL(f"Location: {get_flambe_repo_location()}\n"))
    else:
        print(cl.RA(ASCII_LOGO))
        print(cl.BL(f"VERSION: {flambe.__version__}\n"))

    # Pass original module for ray / pickle
    make_component(torch.nn.Module, TORCH_TAG_PREFIX, only_module='torch.nn')
    # torch.optim.Optimizer exists, ignore mypy
    make_component(
        torch.optim.Optimizer,
        TORCH_TAG_PREFIX,  # type: ignore
        only_module='torch.optim')
    make_component(torch.optim.lr_scheduler._LRScheduler,
                   TORCH_TAG_PREFIX,
                   only_module='torch.optim.lr_scheduler')
    make_component(ray.tune.schedulers.TrialScheduler, TUNE_TAG_PREFIX)
    make_component(ray.tune.suggest.SearchAlgorithm, TUNE_TAG_PREFIX)

    # TODO check first if there is cluster as if there is there
    # is no need to install extensions
    check_system_reqs()

    with SafeExecutionContext(args.config) as ex:
        if args.cluster is None:
            # Local execution
            runnable, _ = ex.preprocess(secrets=args.secrets,
                                        install_ext=args.install_extensions)
            runnable.run(force=args.force, verbose=args.verbose)
            return

        with SafeExecutionContext(args.cluster) as ex_cluster:
            cluster, _ = ex_cluster.preprocess(
                secrets=args.secrets,
                install_ext=args.install_extensions)
            runnable, extensions = ex.preprocess(import_ext=False,
                                                 secrets=args.secrets)

            # Fail fast: this check used to happen only after
            # cluster.run(), so an unsupported runnable still brought
            # the whole cluster up before being rejected.
            if not isinstance(runnable, ClusterRunnable):
                raise ValueError(
                    "Only ClusterRunnables can be executed in a cluster.")

            cluster.run(force=args.force)
            cluster = cast(Cluster, cluster)

            # This is independent of the type of ClusterRunnable
            remote_ext_dir = os.path.join(cluster.get_orch_home_path(),
                                          "extensions")

            # Before sending the extensions, they need to be
            # downloaded (locally).
            local_ext_dir = os.path.join(FLAMBE_GLOBAL_FOLDER, "extensions")
            extensions = download_extensions(extensions, local_ext_dir)

            # At this point, all remote extensions
            # (except pypi extensions)
            # have local paths.
            new_extensions = cluster.send_local_content(extensions,
                                                        remote_ext_dir,
                                                        all_hosts=True)

            new_secrets = cluster.send_secrets()

            # Installing the extensions is crucial as flambe
            # will execute without '-i' flag and therefore
            # will assume that the extensions are installed
            # in the orchestrator.
            cluster.install_extensions_in_orchestrator(new_extensions)
            logger.info(cl.GR("Extensions installed in Orchestrator"))

            runnable.setup_inject_env(cluster=cluster,
                                      extensions=new_extensions,
                                      force=args.force)
            cluster.execute(runnable, new_extensions, new_secrets,
                            args.force)
'--cluster', type=str, default=None, help='Specify the cluster that will run the experiment. This option ' + 'works if the main config is an Experiment') parser.add_argument( '-f', '--force', action='store_true', default=False, help='Override existing runnables. Be careful ' + 'when using this flag as it could have undesired effects.') parser.add_argument('--secrets', type=str, default=os.path.join(FLAMBE_GLOBAL_FOLDER, "secrets.ini")) parser.add_argument('-v', '--verbose', action='store_true', help='Verbose console output') args = parser.parse_args() setup_global_logging(logging.INFO if not args.verbose else logging.DEBUG) logger = logging.getLogger(__name__) try: main(args) logger.info(cl.GR("------------------- Done -------------------")) except KeyboardInterrupt: logger.info(cl.RE("---- Exiting early (Keyboard Interrupt) ----"))