Пример #1
0
def configure_helm(args: argparse.Namespace) -> None:
    """Rewrite ``Chart.yaml`` and ``values.yaml`` under ``args.helm_dir`` in place.

    Chart.yaml: ``appVersion`` is set from ``args.det_version`` when given;
    otherwise it is replaced with ``determined.__version__`` only if the
    existing value contains ``"dev"``. ``defaultScheduler`` is set when
    ``args.gpu_coscheduler`` or ``args.preemption`` is truthy (the former wins).

    values.yaml: ``checkpointStorage`` is rebuilt as a GCS configuration that
    keeps the existing ``save*`` settings (or their defaults), ``maxSlotsPerPod``
    is set from ``args.gpus_per_node``, and, when ``args.multiple_node_pools``
    is truthy, GPU and CPU pod specs receive tolerations and node selectors.

    Arguments:
        args: parsed command-line namespace providing the attributes above
            plus ``gcs_bucket_name``.
    """
    helm_dir = Path(args.helm_dir)
    chart_path = helm_dir / "Chart.yaml"
    values_path = helm_dir / "values.yaml"

    # ---- Chart.yaml ----
    with chart_path.open() as chart_in:
        chart = safe_load_yaml_with_exceptions(chart_in)

    if args.det_version:
        chart["appVersion"] = args.det_version
    elif "dev" in chart["appVersion"]:
        # A user-overridden appVersion is kept unless it is a dev version.
        chart["appVersion"] = determined.__version__

    if args.gpu_coscheduler:
        chart["defaultScheduler"] = "coscheduler"
    elif args.preemption:
        chart["defaultScheduler"] = "preemption"

    with chart_path.open("w") as chart_out:
        yaml.round_trip_dump(chart, chart_out)

    # ---- values.yaml ----
    with values_path.open() as values_in:
        values = safe_load_yaml_with_exceptions(values_in)

    previous_storage = values["checkpointStorage"]
    values["checkpointStorage"] = {
        "saveExperimentBest": previous_storage.get("saveExperimentBest", 0),
        "saveTrialBest": previous_storage.get("saveTrialBest", 1),
        "saveTrialLatest": previous_storage.get("saveTrialLatest", 1),
        "type": "gcs",
        "bucket": args.gcs_bucket_name,
    }
    values["maxSlotsPerPod"] = args.gpus_per_node

    if args.multiple_node_pools:
        task_defaults = values["taskContainerDefaults"]
        # GPU first, then CPU: each entry is (spec key, toleration value,
        # accelerator_type selector).
        pool_settings = (
            ("gpuPodSpec", "True", "gpu"),
            ("cpuPodSpec", "False", "cpu"),
        )
        for spec_key, gpu_available, accelerator in pool_settings:
            pod_spec = make_spec(task_defaults, spec_key)
            inner_spec = cast(Dict, pod_spec["spec"])
            inner_spec["tolerations"] = [{
                "key": "gpuAvailable",
                "operator": "Equal",
                "value": gpu_available,
                "effect": "NoSchedule",
            }]
            inner_spec["nodeSelector"] = {"accelerator_type": accelerator}
            task_defaults[spec_key] = pod_spec

    with values_path.open("w") as values_out:
        yaml.round_trip_dump(values, values_out)
Пример #2
0
def parse_config(
    config_file: Optional[IO],
    entrypoint: Optional[List[str]],
    overrides: Iterable[str],
    volumes: Iterable[str],
) -> Dict[str, Any]:
    """Assemble a configuration dict from a file plus command-line options.

    Arguments:
        config_file: open YAML file to load (it is closed here), or a falsy
            value to start from an empty config.
        entrypoint: used as ``config["entrypoint"]`` only when the config does
            not already hold a truthy entrypoint.
        overrides: override strings, applied via ``parse_config_overrides``.
        volumes: ``host_path:container_path`` strings appended to
            ``config["bind_mounts"]``.

    Raises:
        ValueError: if a volume string contains no ``":"``.
    """
    config = {}  # type: Dict[str, Any]
    if config_file:
        with config_file:
            config = util.safe_load_yaml_with_exceptions(config_file)

    parse_config_overrides(config, overrides)

    for volume_arg in volumes:
        # Split on the first ":" only; the container path may contain more.
        host_path, colon, container_path = volume_arg.partition(":")
        if not colon:
            raise ValueError("Could not read volume option '{}'\n\n"
                             "Expecting:\n{}".format(volume_arg, VOLUME_DESC))

        mount = {"host_path": host_path, "container_path": container_path}
        config.setdefault("bind_mounts", []).append(mount)

    # The command-line entrypoint is a fallback: it never replaces one that
    # the file or the overrides already provided.
    if entrypoint and not config.get("entrypoint"):
        config["entrypoint"] = entrypoint

    return config
Пример #3
0
def context(mmdet_config_dir: str) -> det_torch.PyTorchTrialContext:
    """Build a ``PyTorchTrialContext`` from the Mask R-CNN fixture config.

    Arguments:
        mmdet_config_dir: not read by this function.
            NOTE(review): presumably requested only so another fixture runs
            first -- confirm against the test setup.
    """
    with open("./tests/fixtures/maskrcnn.yaml", "rb") as fixture:
        trial_config = util.safe_load_yaml_with_exceptions(fixture)
    return cast(
        det_torch.PyTorchTrialContext,
        det_torch.PyTorchTrialContext.from_config(trial_config),
    )
Пример #4
0
    def create_experiment(
        self,
        config: Union[str, pathlib.Path, Dict],
        model_dir: Union[str, pathlib.Path],
    ) -> experiment.ExperimentReference:
        """
        Create and activate an experiment from a configuration and a model directory,
        returning the :class:`~determined.experimental.ExperimentReference` for it.

        Arguments:
            config(string, pathlib.Path, dictionary): experiment config filename (.yaml)
                or a dict.
            model_dir(string, pathlib.Path): directory containing model definition.
        """
        check.is_instance(
            config, (str, pathlib.Path, dict), "config parameter must be dictionary or path"
        )
        # Paths (given as str or pathlib.Path) are loaded from disk as YAML;
        # a dict is used as-is.
        if isinstance(config, str):
            with open(config) as config_fp:
                experiment_config = util.safe_load_yaml_with_exceptions(config_fp)
        elif isinstance(config, pathlib.Path):
            with config.open() as config_fp:
                experiment_config = util.safe_load_yaml_with_exceptions(config_fp)
        elif isinstance(config, Dict):
            experiment_config = config

        model_path = pathlib.Path(model_dir) if isinstance(model_dir, str) else model_dir
        model_context, _ = context.read_context(model_path)

        request_body = {
            "config": yaml.safe_dump(experiment_config),
            "model_definition": model_context,
        }
        resp = self._session.post("/api/v1/experiments", body=request_body)

        created = experiment.ExperimentReference(
            _CreateExperimentResponse(resp.json()).id, self._session
        )
        created.activate()
        return created
Пример #5
0
def preview_search(args: Namespace) -> None:
    """Print a preview of the trials a searcher configuration would create.

    Loads the experiment configuration from ``args.config_file``, posts it to
    the master's ``searcher/preview`` endpoint, then prints the searcher
    section, the total number of trials, and a per-trial-count breakdown table.

    Arguments:
        args: parsed command-line namespace providing ``config_file`` (an open
            file, closed by this function) and ``master``.

    Exits with status 1 if the configuration has no ``searcher`` section.
    """
    # The context manager closes the file even when YAML loading raises.
    with args.config_file:
        experiment_config = safe_load_yaml_with_exceptions(args.config_file)

    # A falsy config (e.g. an empty document) is reported the same way as a
    # missing section rather than failing on the membership test.
    if not experiment_config or "searcher" not in experiment_config:
        print("Experiment configuration must have 'searcher' section")
        sys.exit(1)
    r = api.post(args.master, "searcher/preview", json=experiment_config)
    j = r.json()

    def to_full_name(kind: str) -> str:
        """Translate one operation token into a human-readable phrase."""
        try:
            # The unitless searcher case, for masters newer than 0.17.6.
            length = int(kind)
            return f"train for {length}"
        except ValueError:
            pass
        # Otherwise the trailing letter is the unit and the prefix is the amount.
        if kind[-1] == "R":
            return "train {} records".format(kind[:-1])
        if kind[-1] == "B":
            return "train {} batch(es)".format(kind[:-1])
        if kind[-1] == "E":
            return "train {} epoch(s)".format(kind[:-1])
        if kind == "V":
            return "validation"
        raise ValueError("unexpected kind: {}".format(kind))

    def render_sequence(sequence: List[str]) -> str:
        """Collapse consecutive identical tokens into "<count> x <name>" items."""
        if not sequence:
            return "N/A"
        instructions = []
        current = sequence[0]
        # Starts at 0 because the loop below also visits the first element.
        count = 0
        for k in sequence:
            if k != current:
                instructions.append("{} x {}".format(count,
                                                     to_full_name(current)))
                current = k
                count = 1
            else:
                count += 1
        # Flush the final run.
        instructions.append("{} x {}".format(count, to_full_name(current)))
        return ", ".join(instructions)

    headers = ["Trials", "Breakdown"]
    # j["results"] maps a whitespace-separated operation string to the number
    # of trials that follow that sequence.
    values = [(count, render_sequence(operations.split()))
              for operations, count in j["results"].items()]

    print(colored("Using search configuration:", "green"))
    yml = yaml.YAML()
    yml.indent(mapping=2, sequence=4, offset=2)
    yml.dump(experiment_config["searcher"], sys.stdout)
    print()
    print("This search will create a total of {} trial(s).".format(
        sum(j["results"].values())))
    print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=False)
Пример #6
0
def check_quotas(det_config: Dict[str, Any],
                 deployment_object: DeterminedDeployment) -> None:
    """Preflight check: compare required GPU-agent vCPUs with the AWS quota.

    The required vCPU count is the per-instance vCPU count (looked up in the
    packaged ``vcpu_mapping.yaml``) multiplied by the maximum number of dynamic
    agents. Any failure while gathering these numbers is printed as a warning
    and the check is skipped. If the requirement exceeds the quota, guidance is
    printed and the process exits with status 1.

    Arguments:
        det_config: deployment configuration; must hold the boto3 session.
        deployment_object: deployment whose CloudFormation parameters are read.
    """
    try:
        cf = constants.cloudformation
        boto_session: boto3.session.Session = det_config[cf.BOTO3_SESSION]
        gpu_instance_type = get_cf_parameter(
            det_config, deployment_object, cf.GPU_AGENT_INSTANCE_TYPE)
        max_agents = get_cf_parameter(
            det_config, deployment_object, cf.MAX_DYNAMIC_AGENTS)
        spot_enabled = get_cf_parameter(
            det_config, deployment_object, cf.SPOT_ENABLED)

        quota_code = get_instance_type_quota_code(gpu_instance_type,
                                                  spot=spot_enabled)
        vcpu_quota = fetch_instance_type_quota(boto_session,
                                               quota_code=quota_code)

        mapping_fn = pkg_resources.resource_filename("determined.deploy.aws",
                                                     "vcpu_mapping.yaml")
        with open(mapping_fn) as fin:
            entries = util.safe_load_yaml_with_exceptions(fin)
            # Index the mapping entries by instance type for direct lookup.
            vcpu_mapping = {entry["instanceType"]: entry for entry in entries}

        if gpu_instance_type not in vcpu_mapping:
            raise PreflightFailure("unknown vCPU count for instance type")

        per_instance_vcpus = vcpu_mapping[gpu_instance_type]["vcpu"]
        vcpus_required = int(per_instance_vcpus * max_agents)
    except PreflightFailure as ex:
        print(colored("Failed to check AWS instance quota: %s" % ex, "yellow"))
        return
    except Exception as ex:
        # Best effort: an unexpected error only skips the check.
        print(
            colored("Error while checking AWS instance quota: %s" % ex,
                    "yellow"))
        return

    if not (vcpus_required > vcpu_quota):
        return

    shortfall_message = (
        "Insufficient AWS GPU agent instance quota (available: %s, required: %s)"
        % (vcpu_quota, vcpus_required)
    )
    print(colored(shortfall_message, "red"))
    print(
        "You can request a quota increase at "
        "https://%s.console.aws.amazon.com/servicequotas/home/services/ec2/quotas"
        % boto_session.region_name)
    print("Required quota code: %s" % quota_code)
    print(
        "This check can be skipped via `det deploy --no-preflight-checks ...`"
    )
    sys.exit(1)
Пример #7
0
def parse_config(
    config_file: Optional[IO],
    entrypoint: Optional[List[str]],
    overrides: Iterable[str],
    volumes: Iterable[str],
) -> Dict[str, Any]:
    """Assemble a configuration dict from a file plus command-line options.

    Arguments:
        config_file: open YAML file to load (it is closed here), or a falsy
            value to start from an empty config.
        entrypoint: used as ``config["entrypoint"]`` only when the config does
            not already hold a truthy entrypoint.
        overrides: ``dotted.key=value`` strings; a value containing commas
            becomes a list, and each piece is parsed as YAML.
        volumes: ``host_path:container_path`` strings appended to
            ``config["bind_mounts"]``.

    Raises:
        ValueError: if an override has no ``"="`` or a volume has no ``":"``.
    """
    config = {}  # type: Dict[str, Any]
    if config_file:
        with config_file:
            config = util.safe_load_yaml_with_exceptions(config_file)

    for config_arg in overrides:
        # Split on the first "=" only; the value may itself contain "=".
        key, equals, raw_value = config_arg.partition("=")
        if not equals:
            raise ValueError(
                "Could not read configuration option '{}'\n\n"
                "Expecting:\n{}".format(config_arg, CONFIG_DESC)
            )

        # yaml.safe_load() casts each piece to the type YAML would use,
        # e.g., "4" -> 4. A comma means the value is a list.
        pieces = raw_value.split(",")
        if len(pieces) > 1:
            value = [yaml.safe_load(piece) for piece in pieces]  # type: Any
        else:
            value = yaml.safe_load(raw_value)

            # Certain configuration keys are expected to have list values;
            # wrap a single value in a singleton list for those.
            if key in _CONFIG_PATHS_COERCE_TO_LIST:
                value = [value]

        # TODO(#2703): Consider using full JSONPath spec instead of dot
        # notation.
        config = _set_nested_config(config, key.split("."), value)

    for volume_arg in volumes:
        host_path, colon, container_path = volume_arg.partition(":")
        if not colon:
            raise ValueError(
                "Could not read volume option '{}'\n\n"
                "Expecting:\n{}".format(volume_arg, VOLUME_DESC)
            )

        mount = {"host_path": host_path, "container_path": container_path}
        config.setdefault("bind_mounts", []).append(mount)

    # The command-line entrypoint is a fallback: it never replaces one that
    # the file or the overrides already provided.
    if entrypoint and not config.get("entrypoint"):
        config["entrypoint"] = entrypoint

    return config
Пример #8
0
    def create_experiment(
        self,
        config: Union[str, pathlib.Path, Dict],
        model_dir: str,
    ) -> experiment.ExperimentReference:
        """
        Create an experiment from a configuration and a model directory, returning
        the :class:`~determined.experimental.ExperimentReference` for it.

        Arguments:
            config(string, pathlib.Path, dictionary): experiment config filename (.yaml)
                or a dict.
            model_dir(string): directory containing model definition.
        """
        check.is_instance(config, (str, pathlib.Path, dict),
                          "config parameter must be dictionary or path")
        # Paths (given as str or pathlib.Path) are loaded from disk as YAML;
        # a dict is used as-is.
        if isinstance(config, str):
            with open(config) as config_fp:
                experiment_config = util.safe_load_yaml_with_exceptions(config_fp)
        elif isinstance(config, pathlib.Path):
            with config.open() as config_fp:
                experiment_config = util.safe_load_yaml_with_exceptions(config_fp)
        elif isinstance(config, Dict):
            experiment_config = config

        model_root = pathlib.Path(model_dir)
        model_context = _path_to_files(model_root)

        request = V1CreateExperimentRequest(
            model_definition=model_context,
            config=yaml.safe_dump(experiment_config),
        )
        response = self._internal.determined_create_experiment(request)

        new_experiment_id = response.experiment.id
        return experiment.ExperimentReference(
            new_experiment_id,
            self._session._master,
            self._experiments,
        )
def test_experimental_experiment_api_determined_disabled() -> None:
    """Run a no_op experiment as a fresh user while the "determined" user is deactivated.

    The "determined" user is reactivated in a ``finally`` block regardless of
    the outcome.
    """
    fixture_dir = pathlib.Path(conf.fixtures_path("no_op"))
    config_path = pathlib.Path(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"))

    model_context = context.Context.from_local(fixture_dir)

    with config_path.open("r") as config_fp:
        experiment_config = util.safe_load_yaml_with_exceptions(config_fp)

    master_url = conf.make_master_url()
    new_user, new_password = create_test_user(ADMIN_CREDENTIALS,
                                              add_password=True)
    admin_name = ADMIN_CREDENTIALS[0]

    try:
        det_spawn(["-u", admin_name, "user", "deactivate", "determined"])

        # Install the certificate and credentials of the new user as the
        # module-level CLI state before creating the experiment.
        certs.cli_cert = certs.default_load(master_url=master_url)
        auth = determined_api.authentication.Authentication(
            master_url,
            requested_user=new_user,
            password=new_password,
            try_reauth=True,
            cert=certs.cli_cert,
        )
        determined_api.authentication.cli_auth = auth

        exp_id = determined_api.experiment.create_experiment_and_follow_logs(
            master_url=master_url,
            config=experiment_config,
            model_context=model_context,
            template=None,
            additional_body_fields={},
            activate=True,
            follow_first_trial_logs=False,
        )

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
    finally:
        det_spawn(["-u", admin_name, "user", "activate", "determined"])
Пример #10
0
def set_template(args: Namespace) -> None:
    """Upload the YAML in ``args.template_file`` as template ``args.template_name``.

    The parsed file content is sent with ``api.put`` to ``args.master`` and a
    confirmation line is printed. The template file is closed on exit.
    """
    with args.template_file as template_fp:
        template_body = util.safe_load_yaml_with_exceptions(args.template_file)
        endpoint = "templates/" + args.template_name
        api.put(args.master, path=endpoint, json=template_body)
        confirmation = "Set template {}".format(args.template_name)
        print(colored(confirmation, "green"))
Пример #11
0
def load_config(config_path: str) -> Any:
    """Load the YAML file at ``config_path`` and return its parsed content."""
    with open(config_path) as config_fp:
        return util.safe_load_yaml_with_exceptions(config_fp)