def _select_optimizers(self) -> None:
    """
    Selects the optimizers that are going to be used. This is done for backwards
    compatibility: optimizers were previously passed in as part of the compile()
    call and are now passed in as part of `self.context.wrap_optimizers()`.
    """
    check.check_len(
        self._optimizers,
        0,
        "context._select_optimizers() called multiple times. Should only be called "
        "once by TFKerasTrialController.",
    )

    if len(self._wrapped_optimizers) > 0:
        logging.debug(f"Using wrapped optimizers: {self._wrapped_optimizers}.")
        self._optimizers = self._wrapped_optimizers
        return

    check.is_not_none(
        self._compiled_optimizer,
        "Please use `optimizer = self.context.wrap_optimizer(optimizer)` to wrap your "
        "optimizer. If using multiple optimizers, you should wrap each optimizer "
        "separately (calling wrap_optimizer() once per optimizer).",
    )
    if self._compiled_optimizer:
        logging.info(
            "Please switch over to using `optimizer = self.context.wrap_optimizer()`."
        )
        logging.debug(f"Using compiled optimizer: {self._compiled_optimizer}.")
        self._optimizers = [self._compiled_optimizer]
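
# Illustrative sketch (not part of this module) of the calling convention the
# messages above point users toward. The trial class, model builder, and loss
# are hypothetical; only `self.context.wrap_optimizer()` is quoted from the
# check message in _select_optimizers():
#
#     class MyTrial(TFKerasTrial):
#         def build_model(self):
#             model = make_model()  # hypothetical helper
#             optimizer = tf.keras.optimizers.Adam()
#             # Wrapping registers the optimizer with the context, so
#             # _select_optimizers() finds it in self._wrapped_optimizers
#             # instead of falling back to the compiled optimizer.
#             optimizer = self.context.wrap_optimizer(optimizer)
#             model.compile(optimizer=optimizer, loss="mse")
#             return model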
def _respond(in_response: workload.Response) -> None:
    # Only the chief container should actually respond to TRAIN_FOR_STEP.
    if self.rendezvous_info.get_rank() != 0:
        respond(workload.Skipped())
        return

    check_not_isinstance(in_response, workload.Skipped, "Chief skipped a workload.")
    in_response = cast(workload.Metrics, in_response)
    metrics = cast(workload.Metrics, in_response["metrics"])

    if in_response.get("invalid_hp", False):
        out_response = {
            "type": "WORKLOAD_COMPLETED",
            "workload": wkld,
            "start_time": start_time,
            "end_time": _current_timestamp(),
            "metrics": metrics,
        }
        out_response["exited_reason"] = "INVALID_HP"
        respond(out_response)
        return

    batch_metrics = metrics["batch_metrics"]

    # Sanity-check training metrics.
    det.util.validate_batch_metrics(batch_metrics)
    check_len(batch_metrics, wkld.num_batches)

    for callback in self.callbacks:
        callback.on_train_step_end(
            wkld.step_id, wkld.num_batches, wkld.total_batches_processed, metrics
        )

    self.tensorboard_mgr.sync()

    out_response = {
        "type": "WORKLOAD_COMPLETED",
        "workload": wkld,
        "start_time": start_time,
        "end_time": _current_timestamp(),
        "metrics": metrics,
    }

    if in_response.get("stop_requested", False):
        out_response["exited_reason"] = "USER_CANCELED"

    # Send the response up.
    respond(out_response)
def _open_shell(master: str, shell: Command, additional_opts: List[str]) -> None:
    LOOPBACK_ADDRESS = "[::1]"
    with tempfile.NamedTemporaryFile("w") as fp:
        fp.write(shell.misc["privateKey"])
        fp.flush()

        check_len(shell.addresses, 1, "Cannot find address for shell")
        host, port = shell.addresses[0]["host_ip"], shell.addresses[0]["host_port"]
        if host == LOOPBACK_ADDRESS:
            host = "localhost"

        # Use determined.cli.tunnel as a portable script for using the HTTP CONNECT
        # mechanism, similar to `nc -X CONNECT -x ...` but without any dependency on
        # external binaries.
        python = sys.executable
        proxy_cmd = "{} -m determined.cli.tunnel {} %h".format(python, master)

        if request.get_master_cert_bundle() is not None:
            proxy_cmd += ' --cert-file "{}"'.format(request.get_master_cert_bundle())
        if request.get_master_cert_name():
            proxy_cmd += ' --cert-name "{}"'.format(request.get_master_cert_name())

        username = shell.agent_user_group["user"] or "root"

        cmd = [
            "ssh",
            "-o",
            "ProxyCommand={}".format(proxy_cmd),
            "-o",
            "StrictHostKeyChecking=no",
            "-tt",
            "-o",
            "IdentitiesOnly=yes",
            "-i",
            str(fp.name),
            "-p",
            str(port),
            "{}@{}".format(username, shell.id),
            *additional_opts,
        ]

        subprocess.run(cmd)

    print(colored("To reconnect, run: det shell open {}".format(shell.id), "green"))
def _dict_to_list(dict_of_lists: Dict[str, List]) -> List[Dict[str, Any]]:
    """Transpose a dict of lists to a list of dicts.

    dict_to_list({"a": [1, 2], "b": [3, 4]}) -> [{"a": 1, "b": 3}, {"a": 2, "b": 4}]

    In some cases _dict_to_list is the inverse of _list_to_dict. This function
    assumes that all lists have the same length.
    """
    list_len = len(list(dict_of_lists.values())[0])
    for lst in dict_of_lists.values():
        check.check_len(lst, list_len, "All lists in the dict must be the same length.")

    output_list = [{} for _ in range(list_len)]  # type: List[Dict[str, Any]]
    for i in range(list_len):
        for k in dict_of_lists.keys():
            output_list[i][k] = dict_of_lists[k][i]

    return output_list
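
# Illustrative usage sketch (not part of the original module); the metric
# names below are hypothetical.
def _example_dict_to_list() -> None:
    # Transpose per-metric series into per-batch records.
    history = {"loss": [0.92, 0.71], "accuracy": [0.41, 0.58]}
    records = _dict_to_list(history)
    assert records == [
        {"loss": 0.92, "accuracy": 0.41},
        {"loss": 0.71, "accuracy": 0.58},
    ]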
def _open_shell(
    master: str,
    shell: Dict[str, Any],
    additional_opts: List[str],
    retain_keys_and_print: bool,
    print_only: bool,
) -> None:
    cache_dir = None
    if retain_keys_and_print:
        cache_dir = Path(appdirs.user_cache_dir("determined")) / "shell" / shell["id"]
        if not cache_dir.exists():
            cache_dir.mkdir(parents=True)

    with _prepare_key(cache_dir) as keyfile:
        keyfile.write(shell["privateKey"])
        keyfile.flush()

        check_len(shell["addresses"], 1, "Cannot find address for shell")
        _, port = shell["addresses"][0]["host_ip"], shell["addresses"][0]["host_port"]

        # Use determined.cli.tunnel as a portable script for using the HTTP CONNECT
        # mechanism, similar to `nc -X CONNECT -x ...` but without any dependency on
        # external binaries.
        python = sys.executable
        proxy_cmd = "{} -m determined.cli.tunnel {} %h".format(python, master)

        cert_bundle_path = _prepare_cert_bundle(cache_dir)
        if cert_bundle_path is not None:
            proxy_cmd += ' --cert-file "{}"'.format(cert_bundle_path)

        cert = certs.cli_cert
        assert cert is not None, "cli_cert was not configured"
        if cert.name:
            proxy_cmd += ' --cert-name "{}"'.format(cert.name)

        username = shell["agentUserGroup"]["user"] or "root"

        cmd = [
            "ssh",
            "-o",
            "ProxyCommand={}".format(proxy_cmd),
            "-o",
            "StrictHostKeyChecking=no",
            "-tt",
            "-o",
            "IdentitiesOnly=yes",
            "-i",
            str(keyfile.name),
            "-p",
            str(port),
            "{}@{}".format(username, shell["id"]),
            *additional_opts,
        ]

        if retain_keys_and_print:
            print(colored(subprocess.list2cmdline(cmd), "yellow"))
            if print_only:
                return

        subprocess.run(cmd)

    print(colored("To reconnect, run: det shell open {}".format(shell["id"]), "green"))
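
# Illustrative usage sketch (not part of the original module); the master URL
# and shell record are placeholders that mirror the fields _open_shell() reads.
# A real record would come from the Determined master's API.
def _example_open_shell() -> None:
    shell = {
        "id": "0a1b2c3d",  # placeholder shell ID
        "privateKey": "-----BEGIN OPENSSH PRIVATE KEY-----\n...",  # elided
        "addresses": [{"host_ip": "10.0.0.5", "host_port": 2222}],
        "agentUserGroup": {"user": "det-user"},
    }
    _open_shell(
        master="https://master.example.com:8443",
        shell=shell,
        additional_opts=["-L", "8888:localhost:8888"],  # e.g. forward a local port
        retain_keys_and_print=True,  # cache the key and echo the ssh command
        print_only=True,  # print the ssh command without executing it
    )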