예제 #1
0
    def _select_optimizers(self) -> None:
        """
        Decide which optimizers are going to be used and store them in
        `self._optimizers`.

        Wrapped optimizers take precedence. For backwards compatibility, an optimizer
        that was passed in as part of the compile() call is still accepted when no
        wrapped optimizers are present.
        """
        called_twice_msg = (
            "context._select_optimizers() called multiple times. Should be only called "
            "once by TFKerasTrialController."
        )
        # The selection must start from an empty optimizer list.
        check.check_len(self._optimizers, 0, called_twice_msg)

        wrapped = self._wrapped_optimizers
        if len(wrapped) > 0:
            logging.debug(f"Using wrapped optimizers: {wrapped}.")
            self._optimizers = wrapped
            return

        # Nothing was wrapped, so fall back to the optimizer captured from compile().
        compiled = self._compiled_optimizer
        check.is_not_none(
            compiled,
            "Please use `optimizer = self.context.wrap_optimizer(optimizer)` to wrap your "
            "optimizer. If using multiple optimizer, you should wrap your optimizer "
            "separately (calling wrap_optimizer() once for each optimizer).",
        )

        # A falsy compiled optimizer leaves `self._optimizers` untouched.
        if not compiled:
            return

        logging.info("Please switch over to using `optimizer = self.context.wrap_optimizer()`.")
        logging.debug(f"Using compiled optimizer: {compiled}.")
        self._optimizers = [compiled]
예제 #2
0
        def _respond(in_response: workload.Response) -> None:
            """
            Convert the result of a training workload into a WORKLOAD_COMPLETED
            message and pass it to `respond`.

            Only rank 0 (the chief) reports results; every other rank replies with
            `workload.Skipped()`.
            """

            def _completed_message(step_metrics: workload.Metrics) -> dict:
                # `end_time` is sampled at the moment the message is built, so callers
                # build the message only right before sending it.
                return {
                    "type": "WORKLOAD_COMPLETED",
                    "workload": wkld,
                    "start_time": start_time,
                    "end_time": _current_timestamp(),
                    "metrics": step_metrics,
                }

            is_chief = self.rendezvous_info.get_rank() == 0
            if not is_chief:
                respond(workload.Skipped())
                return

            check_not_isinstance(in_response, workload.Skipped, "Chief skipped a workload.")

            in_response = cast(workload.Metrics, in_response)
            metrics = cast(workload.Metrics, in_response["metrics"])

            # An invalid-hyperparameter result is reported immediately, without
            # validating metrics or notifying callbacks.
            if in_response.get("invalid_hp", False):
                invalid_hp_message = _completed_message(metrics)
                invalid_hp_message["exited_reason"] = "INVALID_HP"
                respond(invalid_hp_message)
                return

            # Sanity-check training metrics.
            batch_metrics = metrics["batch_metrics"]
            det.util.validate_batch_metrics(batch_metrics)
            check_len(batch_metrics, wkld.num_batches)

            for cb in self.callbacks:
                cb.on_train_step_end(
                    wkld.step_id,
                    wkld.num_batches,
                    wkld.total_batches_processed,
                    metrics,
                )

            self.tensorboard_mgr.sync()

            completed_message = _completed_message(metrics)
            if in_response.get("stop_requested", False):
                completed_message["exited_reason"] = "USER_CANCELED"

            # Send the response up.
            respond(completed_message)
예제 #3
0
def _open_shell(master: str, shell: Command,
                additional_opts: List[str]) -> None:
    """Open an interactive SSH session to a shell, tunnelled through the master.

    The shell's private key is written to a temporary file that exists only for
    the duration of the ssh process. The connection goes through
    `determined.cli.tunnel`, which is used as the ssh ProxyCommand.

    Args:
        master: Address of the master, passed to `determined.cli.tunnel`.
        shell: The shell command to connect to; its `misc["privateKey"]`,
            `addresses`, `agent_user_group` and `id` are read.
        additional_opts: Extra arguments appended to the ssh command line.
    """
    with tempfile.NamedTemporaryFile("w") as fp:
        fp.write(shell.misc["privateKey"])
        # Make sure the key is on disk before ssh reads it via `-i`.
        fp.flush()

        check_len(shell.addresses, 1, "Cannot find address for shell")
        # Only the port is needed: ssh hands the target (`shell.id`) to the proxy
        # command through the `%h` token, so the host IP is never used directly.
        port = shell.addresses[0]["host_port"]

        # Use determined.cli.tunnel as a portable script for using the HTTP CONNECT mechanism,
        # similar to `nc -X CONNECT -x ...` but without any dependency on external binaries.
        python = sys.executable
        proxy_cmd = "{} -m determined.cli.tunnel {} %h".format(python, master)

        cert_bundle = request.get_master_cert_bundle()
        if cert_bundle is not None:
            proxy_cmd += ' --cert-file "{}"'.format(cert_bundle)

        cert_name = request.get_master_cert_name()
        if cert_name:
            proxy_cmd += ' --cert-name "{}"'.format(cert_name)

        # Fall back to root when the shell has no configured user.
        username = shell.agent_user_group["user"] or "root"

        cmd = [
            "ssh",
            "-o",
            "ProxyCommand={}".format(proxy_cmd),
            "-o",
            "StrictHostKeyChecking=no",
            "-tt",
            "-o",
            "IdentitiesOnly=yes",
            "-i",
            str(fp.name),
            "-p",
            str(port),
            "{}@{}".format(username, shell.id),
            *additional_opts,
        ]

        subprocess.run(cmd)

        print(
            colored("To reconnect, run: det shell open {}".format(shell.id),
                    "green"))
예제 #4
0
def _dict_to_list(dict_of_lists: Dict[str, List]) -> List[Dict[str, Any]]:
    """Transpose a dict of lists to a list of dicts.

        dict_to_list({"a": [1, 2], "b": [3, 4]})) -> [{"a": 1, "b": 3}, {"a": 2, "b": 4}]

    In some cases _dict_to_list is the inverse of _list_to_dict. This function assumes that
    all lists have the same length.
    """

    list_len = len(list(dict_of_lists.values())[0])
    for lst in dict_of_lists.values():
        check.check_len(lst, list_len, "All lists in the dict must be the same length.")

    output_list = [{} for _ in range(list_len)]  # type: List[Dict[str, Any]]
    for i in range(list_len):
        for k in dict_of_lists.keys():
            output_list[i][k] = dict_of_lists[k][i]

    return output_list
예제 #5
0
def _open_shell(
    master: str,
    shell: Dict[str, Any],
    additional_opts: List[str],
    retain_keys_and_print: bool,
    print_only: bool,
) -> None:
    """Open an interactive SSH session to a shell, tunnelled through the master.

    The connection goes through `determined.cli.tunnel`, which is used as the ssh
    ProxyCommand.

    Args:
        master: Address of the master, passed to `determined.cli.tunnel`.
        shell: Shell description; the keys "id", "privateKey", "addresses" and
            "agentUserGroup" are read.
        additional_opts: Extra arguments appended to the ssh command line.
        retain_keys_and_print: When true, a per-shell directory under the user cache
            dir is created and handed to `_prepare_key` / `_prepare_cert_bundle`, and
            the resulting ssh command line is printed.
        print_only: When true together with `retain_keys_and_print`, return after
            printing the command instead of running ssh. It has no effect when
            `retain_keys_and_print` is false.
    """
    cache_dir = None
    if retain_keys_and_print:
        cache_dir = Path(appdirs.user_cache_dir("determined")) / "shell" / shell["id"]
        # exist_ok avoids a check-then-create race between concurrent invocations.
        cache_dir.mkdir(parents=True, exist_ok=True)

    with _prepare_key(cache_dir) as keyfile:
        keyfile.write(shell["privateKey"])
        # Make sure the key is on disk before ssh reads it via `-i`.
        keyfile.flush()

        check_len(shell["addresses"], 1, "Cannot find address for shell")
        # Only the port is needed: ssh hands the target (the shell id) to the proxy
        # command through the `%h` token.
        port = shell["addresses"][0]["host_port"]

        # Use determined.cli.tunnel as a portable script for using the HTTP CONNECT mechanism,
        # similar to `nc -X CONNECT -x ...` but without any dependency on external binaries.
        python = sys.executable
        proxy_cmd = "{} -m determined.cli.tunnel {} %h".format(python, master)

        cert_bundle_path = _prepare_cert_bundle(cache_dir)
        if cert_bundle_path is not None:
            proxy_cmd += ' --cert-file "{}"'.format(cert_bundle_path)

        cert = certs.cli_cert
        assert cert is not None, "cli_cert was not configured"
        if cert.name:
            proxy_cmd += ' --cert-name "{}"'.format(cert.name)

        # Fall back to root when the shell has no configured user.
        username = shell["agentUserGroup"]["user"] or "root"

        cmd = [
            "ssh",
            "-o",
            "ProxyCommand={}".format(proxy_cmd),
            "-o",
            "StrictHostKeyChecking=no",
            "-tt",
            "-o",
            "IdentitiesOnly=yes",
            "-i",
            str(keyfile.name),
            "-p",
            str(port),
            "{}@{}".format(username, shell["id"]),
            *additional_opts,
        ]

        if retain_keys_and_print:
            print(colored(subprocess.list2cmdline(cmd), "yellow"))
            if print_only:
                return

        subprocess.run(cmd)

        print(colored("To reconnect, run: det shell open {}".format(shell["id"]), "green"))