Exemplo n.º 1
0
    def __init__(self, scheduler: torch.optim.lr_scheduler._LRScheduler, step_mode: StepMode):
        """Construct the Determined LRScheduler wrapper.

        Args:
            scheduler (:py:class:`torch.optim.lr_scheduler._LRScheduler`):
                The PyTorch learning rate scheduler that Determined should manage.
            step_mode (:py:class:`det.pytorch.LRSchedulerStepMode`):
                How Determined invokes (or does not invoke) ``scheduler.step()``:

                1. ``STEP_EVERY_EPOCH``: ``scheduler.step()`` is called, with no
                   arguments, after every training epoch.

                2. ``STEP_EVERY_BATCH``: ``scheduler.step()`` is called, with no
                   arguments, after every training batch.

                3. ``MANUAL_STEP``: Determined never calls ``scheduler.step()``;
                   the user decides when to call it and with what arguments.
        """
        # Validate the arguments before storing them on the instance.
        check.check_not_none(scheduler)
        check.check_isinstance(step_mode, LRScheduler.StepMode)

        self._step_mode = step_mode
        self._scheduler = scheduler
Exemplo n.º 2
0
    def __init__(
        self,
        scheduler: torch.optim.lr_scheduler._LRScheduler,
        step_mode: StepMode,
    ):
        """Wrap a PyTorch LRScheduler for use with Determined.

        This wrapper is required to schedule the optimizer's learning rate
        correctly.  It serves two purposes:

            1. Saving and restoring the learning rate when a trial is paused,
               preempted, etc.
            2. Stepping the learning rate scheduler at the configured
               frequency (e.g., every batch or every epoch).

        Args:
            scheduler (:py:class:`torch.optim.lr_scheduler._LRScheduler`):
                Learning rate scheduler to be used by Determined.
            step_mode (:py:class:`det.pytorch.LRSchedulerStepMode`):
                How Determined invokes (or does not invoke) ``scheduler.step()``:

                1. ``STEP_EVERY_EPOCH``: ``scheduler.step()`` is called, with no
                   arguments, after every training epoch.

                2. ``STEP_EVERY_BATCH``: ``scheduler.step()`` is called, with no
                   arguments, after every training batch.

                3. ``MANUAL_STEP``: Determined never calls ``scheduler.step()``;
                   the user decides when to call it and with what arguments.
        """
        # Validate the arguments before storing them on the instance.
        check.check_not_none(scheduler)
        check.check_isinstance(step_mode, LRScheduler.StepMode)

        self.step_mode = step_mode
        self.scheduler = scheduler
Exemplo n.º 3
0
    def get_optimizer(self) -> torch.optim.Optimizer:  # type: ignore
        """Return the optimizer associated with the trial.

        This function should not be called from:

            * ``__init__``
            * ``build_model()``
            * ``optimizer()``
        """
        optimizer = self.optimizer
        check.check_not_none(optimizer)
        return optimizer
Exemplo n.º 4
0
    def get_model(self) -> torch.nn.Module:
        """Return the model associated with the trial.

        This function should not be called from:

            * ``__init__``
            * ``build_model()``
        """
        model = self.model
        check.check_not_none(model)
        return cast(torch.nn.Module, model)
Exemplo n.º 5
0
    def yield_checkpoint_model(
            self, wkld: workload.Workload,
            respond: workload.ResponseFunc) -> workload.Stream:
        """Yield a checkpoint workload to the layer below and report completion.

        Only the chief (rank 0) container checkpoints; every other rank
        responds with ``workload.Skipped()`` immediately.  The layer below is
        expected to call the ``_respond`` callback synchronously while the
        checkpoint directory is still open, before this generator resumes.
        """
        start_time = _current_timestamp()

        # Only the chief container should checkpoint.
        if self.rendezvous_info.get_rank() != 0:
            respond(workload.Skipped())
            return

        # Save the workload completed message for after checkpoint upload completes.
        message = None  # type: Optional[workload.Response]

        def _respond(checkpoint_info: workload.Response) -> None:
            # Called by the layer below once the checkpoint files are written
            # to `path`; builds the storage metadata and stashes the
            # WORKLOAD_COMPLETED message for delivery after upload.
            checkpoint_info = cast(Dict[str, Any], checkpoint_info)
            metadata = storage.StorageMetadata(
                storage_id,
                storage.StorageManager._list_directory(path),
                checkpoint_info.get("framework", ""),
                checkpoint_info.get("format", ""),
            )

            logging.info("Saved trial to checkpoint {}".format(
                metadata.storage_id))
            self.tensorboard_mgr.sync()

            nonlocal message
            message = {
                "type": "WORKLOAD_COMPLETED",
                "workload": wkld,
                "start_time": start_time,
                "end_time": _current_timestamp(),
                "metrics": metadata,
            }

        # store_path() supplies a local directory to checkpoint into;
        # presumably the upload happens when the `with` block exits — note the
        # completion message is only sent after that point.
        with self.storage_mgr.store_path() as (storage_id, path):
            yield wkld, [pathlib.Path(path)], _respond

        # Because the messaging is synchronous, the layer below us must have called _respond.
        check_not_none(message, "response function did not get called")
        message = cast(workload.Response, message)

        respond(message)
Exemplo n.º 6
0
def main(args: "Optional[List[str]]" = None) -> None:
    """Entry point for the CLI: parse arguments and dispatch to a subcommand.

    Args:
        args: Command-line arguments to parse.  Defaults to ``sys.argv[1:]``
            when ``None``.

    Exits with status 1 on a command failure, 2 when no subcommand was
    given, and 3 on keyboard interrupt.
    """
    # BUG FIX: the old default (`args: List[str] = sys.argv[1:]`) was
    # evaluated once at import time, so changes to sys.argv made after this
    # module was imported were silently ignored.  Resolve argv at call time.
    if args is None:
        args = sys.argv[1:]

    # TODO(#1690): Refactor admin command(s) to a separate CLI tool.
    if "DET_ADMIN" in os.environ:
        experiment_args_description.subs.append(
            Cmd(
                "delete",
                experiment.delete_experiment,
                "delete experiment",
                [
                    Arg("experiment_id", help="delete experiment"),
                    Arg(
                        "--yes",
                        action="store_true",
                        default=False,
                        help="automatically answer yes to prompts",
                    ),
                ],
            ))

    try:
        parser = make_parser()
        argcomplete.autocomplete(parser)

        parsed_args = parser.parse_args(args)

        def die(message: str, always_print_traceback: bool = False) -> None:
            # Print a traceback only when debugging is requested (or forced),
            # then exit via the parser so the message is colored and final.
            if always_print_traceback or os.getenv(
                    "DET_DEBUG", "").lower() in ("true", "1", "yes"):
                import traceback

                traceback.print_exc()

            parser.exit(1, colored(message + "\n", "red"))

        v = vars(parsed_args)
        if not v.get("func"):
            parser.print_usage()
            parser.exit(2, "{}: no subcommand specified\n".format(parser.prog))

        # A previously-trusted master certificate, if any, is used for TLS
        # verification by the requests library.
        cert_fn = str(auth.get_config_path().joinpath("master.crt"))
        if os.path.exists(cert_fn):
            os.environ["REQUESTS_CA_BUNDLE"] = cert_fn

        try:
            try:
                check_version(parsed_args)
            except requests.exceptions.SSLError:
                # An SSLError usually means that we queried a master over HTTPS and got an untrusted
                # cert, so allow the user to store and trust the current cert. (It could also mean
                # that we tried to talk HTTPS on the HTTP port, but distinguishing that based on the
                # exception is annoying, and we'll figure that out in the next step anyway.)
                addr = api.parse_master_address(parsed_args.master)
                check_not_none(addr.hostname)
                check_not_none(addr.port)
                try:
                    cert_pem_data = ssl.get_server_certificate(
                        (cast(str, addr.hostname), cast(int, addr.port)))
                except ssl.SSLError:
                    die("Tried to connect over HTTPS but couldn't get a certificate from the "
                        "master; consider using HTTP")

                # Show the user a SHA256 fingerprint so they can verify the
                # cert out-of-band before trusting it.
                cert_hash = hashlib.sha256(
                    ssl.PEM_cert_to_DER_cert(cert_pem_data)).hexdigest()
                cert_fingerprint = ":".join(chunks(cert_hash, 2))

                if not render.yes_or_no(
                        "The master sent an untrusted certificate with this SHA256 fingerprint:\n"
                        "{}\nDo you want to trust this certificate from now on?"
                        .format(cert_fingerprint)):
                    die("Unable to verify master certificate")

                # Persist the trusted cert and retry the version check with it.
                with open(cert_fn, "w") as out:
                    out.write(cert_pem_data)
                os.environ["REQUESTS_CA_BUNDLE"] = cert_fn

                check_version(parsed_args)

            parsed_args.func(parsed_args)
        except KeyboardInterrupt as e:
            raise e
        except (api.errors.BadRequestException,
                api.errors.BadResponseException) as e:
            die("Failed to {}: {}".format(parsed_args.func.__name__, e))
        except api.errors.CorruptTokenCacheException:
            die("Failed to login: Attempted to read a corrupted token cache. "
                "The store has been deleted; please try again.")
        except Exception:
            die("Failed to {}".format(parsed_args.func.__name__),
                always_print_traceback=True)
    except KeyboardInterrupt:
        parser.exit(3, colored("Interrupting...\n", "red"))
Exemplo n.º 7
0
 def from_json(record: Dict[str, Any]) -> "StorageMetadata":
     """Build a StorageMetadata instance from its dict (JSON) representation."""
     # Both fields are required; fail loudly if either is missing/None.
     check_not_none(record["uuid"], "Storage ID is undefined")
     check_not_none(record["resources"], "Resources are undefined")
     storage_id = record["uuid"]
     resources = record["resources"]
     labels = record.get("labels")
     return StorageMetadata(storage_id, resources, labels)