def configure_helm(args: argparse.Namespace) -> None:
    """Rewrite the Helm chart's ``Chart.yaml`` and ``values.yaml`` from CLI arguments.

    Chart.yaml:
      * ``appVersion`` becomes ``args.det_version`` when that is set; otherwise it is
        replaced with ``determined.__version__`` only if the current value contains "dev".
      * ``defaultScheduler`` is set to "coscheduler" or "preemption" when
        ``args.gpu_coscheduler`` or ``args.preemption`` (in that priority) is truthy.

    values.yaml:
      * ``checkpointStorage`` is replaced with a GCS configuration pointing at
        ``args.gcs_bucket_name``; the three retention counts are carried over from the
        existing section (defaults 0 / 1 / 1).
      * ``maxSlotsPerPod`` is set to ``args.gpus_per_node``.
      * With ``args.multiple_node_pools``, the GPU and CPU pod specs under
        ``taskContainerDefaults`` receive tolerations and node selectors.

    Both files are read from and written back to ``args.helm_dir``.
    """
    chart_dir = Path(args.helm_dir)
    chart_path = chart_dir / "Chart.yaml"
    values_path = chart_dir / "values.yaml"

    with chart_path.open() as chart_in:
        chart = safe_load_yaml_with_exceptions(chart_in)

    if args.det_version:
        chart["appVersion"] = args.det_version
    elif "dev" in chart["appVersion"]:
        # A user-overridden appVersion is preserved unless it is a dev version.
        chart["appVersion"] = determined.__version__

    # NOTE(review): the scheduler choice is stored on the Chart.yaml mapping here;
    # confirm that this (rather than values.yaml) is the intended destination.
    if args.gpu_coscheduler:
        chart["defaultScheduler"] = "coscheduler"
    elif args.preemption:
        chart["defaultScheduler"] = "preemption"

    with chart_path.open("w") as chart_out:
        yaml.round_trip_dump(chart, chart_out)

    with values_path.open() as values_in:
        values = safe_load_yaml_with_exceptions(values_in)

    previous_storage = values["checkpointStorage"]
    values["checkpointStorage"] = {
        "saveExperimentBest": previous_storage.get("saveExperimentBest", 0),
        "saveTrialBest": previous_storage.get("saveTrialBest", 1),
        "saveTrialLatest": previous_storage.get("saveTrialLatest", 1),
        "type": "gcs",
        "bucket": args.gcs_bucket_name,
    }
    values["maxSlotsPerPod"] = args.gpus_per_node

    if args.multiple_node_pools:
        task_defaults = values["taskContainerDefaults"]
        # (pod spec key, value of the "gpuAvailable" toleration, accelerator_type selector)
        pool_settings = (
            ("gpuPodSpec", "True", "gpu"),
            ("cpuPodSpec", "False", "cpu"),
        )
        for spec_key, gpu_available, accelerator in pool_settings:
            pod_spec = make_spec(task_defaults, spec_key)
            inner_spec = cast(Dict, pod_spec["spec"])
            inner_spec["tolerations"] = [
                {
                    "key": "gpuAvailable",
                    "operator": "Equal",
                    "value": gpu_available,
                    "effect": "NoSchedule",
                }
            ]
            inner_spec["nodeSelector"] = {"accelerator_type": accelerator}
            task_defaults[spec_key] = pod_spec

    with values_path.open("w") as values_out:
        yaml.round_trip_dump(values, values_out)
def parse_config(
    config_file: Optional[IO],
    entrypoint: Optional[List[str]],
    overrides: Iterable[str],
    volumes: Iterable[str],
) -> Dict[str, Any]:
    """Assemble a configuration dict from a YAML file and command-line options.

    Arguments:
        config_file: open YAML file to load (closed after reading), or a falsy value
            to start from an empty config.
        entrypoint: entrypoint to use when the config does not already define a
            truthy "entrypoint".
        overrides: override strings handed to ``parse_config_overrides`` together
            with the config; its return value is not used here.
        volumes: "host_path:container_path" strings, each appended to the config's
            "bind_mounts" list. Only the first ":" separates the two paths.

    Raises:
        ValueError: if a volume string contains no ":".
    """
    config = {}  # type: Dict[str, Any]
    if config_file:
        with config_file:
            config = util.safe_load_yaml_with_exceptions(config_file)

    parse_config_overrides(config, overrides)

    for mount_spec in volumes:
        host_path, separator, container_path = mount_spec.partition(":")
        if not separator:
            raise ValueError(
                "Could not read volume option '{}'\n\n"
                "Expecting:\n{}".format(mount_spec, VOLUME_DESC)
            )
        mounts = config.setdefault("bind_mounts", [])
        mounts.append({"host_path": host_path, "container_path": container_path})

    # The command-line entrypoint only applies when earlier settings did not
    # already provide one.
    if not config.get("entrypoint") and entrypoint:
        config["entrypoint"] = entrypoint

    return config
def context(mmdet_config_dir: str) -> det_torch.PyTorchTrialContext:
    """Build a ``PyTorchTrialContext`` from the Mask R-CNN fixture config.

    The config is read from ``./tests/fixtures/maskrcnn.yaml`` (relative to the
    current working directory).

    ``mmdet_config_dir`` is not referenced in the body; presumably it is requested
    for its setup side effects (e.g. as a fixture dependency) -- confirm with callers.
    """
    fixture_path = "./tests/fixtures/maskrcnn.yaml"
    with open(fixture_path, "rb") as fixture:
        experiment_config = util.safe_load_yaml_with_exceptions(fixture)

    trial_context = det_torch.PyTorchTrialContext.from_config(experiment_config)
    # The cast only informs the type checker; the object is returned unchanged.
    return cast(det_torch.PyTorchTrialContext, trial_context)
def create_experiment(
    self,
    config: Union[str, pathlib.Path, Dict],
    model_dir: Union[str, pathlib.Path],
) -> experiment.ExperimentReference:
    """
    Create an experiment from a configuration and a model directory, then activate it.

    The function returns :class:`~determined.experimental.ExperimentReference` of the
    experiment.

    Arguments:
        config(string, pathlib.Path, dictionary): experiment config filename (.yaml)
            or a dict.
        model_dir(string, pathlib.Path): directory containing model definition.
    """
    check.is_instance(
        config, (str, pathlib.Path, dict), "config parameter must be dictionary or path"
    )

    # A path-like config (str or Path) is loaded from disk; a dict is used as given.
    if isinstance(config, (str, pathlib.Path)):
        with open(config) as config_file:
            experiment_config = util.safe_load_yaml_with_exceptions(config_file)
    elif isinstance(config, dict):
        experiment_config = config

    model_path = pathlib.Path(model_dir) if isinstance(model_dir, str) else model_dir
    model_context, _ = context.read_context(model_path)

    request_body = {
        "config": yaml.safe_dump(experiment_config),
        "model_definition": model_context,
    }
    response = self._session.post("/api/v1/experiments", body=request_body)

    experiment_id = _CreateExperimentResponse(response.json()).id
    reference = experiment.ExperimentReference(experiment_id, self._session)
    reference.activate()
    return reference
def preview_search(args: Namespace) -> None:
    """Preview the trials that an experiment's searcher configuration would create.

    The experiment config is loaded from ``args.config_file`` and posted to the
    ``searcher/preview`` endpoint of ``args.master``. The "searcher" section, the total
    number of trials, and a per-group breakdown of operations are then printed.

    The response's "results" mapping is read as
    ``{"<space-separated operation codes>": <trial count>}``.

    Exits the process with status 1 when the config has no "searcher" section.
    """
    # Use the file as a context manager so it is closed even when parsing raises;
    # an explicit close() after the load would be skipped on error.
    with args.config_file:
        experiment_config = safe_load_yaml_with_exceptions(args.config_file)

    if "searcher" not in experiment_config:
        print("Experiment configuration must have 'searcher' section")
        sys.exit(1)
    r = api.post(args.master, "searcher/preview", json=experiment_config)
    j = r.json()

    def to_full_name(kind: str) -> str:
        """Translate one operation code into a human-readable description."""
        try:
            # The unitless searcher case, for masters newer than 0.17.6.
            length = int(kind)
            return f"train for {length}"
        except ValueError:
            pass
        # Otherwise the trailing letter selects the unit: R(ecords), B(atches), E(pochs).
        if kind[-1] == "R":
            return "train {} records".format(kind[:-1])
        if kind[-1] == "B":
            return "train {} batch(es)".format(kind[:-1])
        if kind[-1] == "E":
            return "train {} epoch(s)".format(kind[:-1])
        if kind == "V":
            return "validation"
        raise ValueError("unexpected kind: {}".format(kind))

    def render_sequence(sequence: List[str]) -> str:
        """Run-length encode consecutive identical operations as "<count> x <name>"."""
        if not sequence:
            return "N/A"
        instructions = []
        current = sequence[0]
        count = 0
        for k in sequence:
            if k != current:
                instructions.append("{} x {}".format(count, to_full_name(current)))
                current = k
                count = 1
            else:
                count += 1
        # Flush the final run.
        instructions.append("{} x {}".format(count, to_full_name(current)))
        return ", ".join(instructions)

    headers = ["Trials", "Breakdown"]
    values = [
        (count, render_sequence(operations.split()))
        for operations, count in j["results"].items()
    ]

    print(colored("Using search configuration:", "green"))
    yml = yaml.YAML()
    yml.indent(mapping=2, sequence=4, offset=2)
    yml.dump(experiment_config["searcher"], sys.stdout)
    print()
    print("This search will create a total of {} trial(s).".format(sum(j["results"].values())))
    print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=False)
def check_quotas(det_config: Dict[str, Any], deployment_object: DeterminedDeployment) -> None:
    """Preflight check: compare the vCPUs needed by GPU agents with the AWS quota.

    The required vCPU count is the per-instance "vcpu" value for the GPU agent
    instance type (looked up in the packaged ``vcpu_mapping.yaml``) multiplied by the
    maximum number of dynamic agents.

    Any failure while gathering this information is printed as a yellow warning and
    the check is skipped. When the requirement exceeds the quota, an explanation is
    printed and the process exits with status 1.
    """
    try:
        cf_keys = constants.cloudformation
        boto_session: boto3.session.Session = det_config[cf_keys.BOTO3_SESSION]
        gpu_instance_type = get_cf_parameter(
            det_config, deployment_object, cf_keys.GPU_AGENT_INSTANCE_TYPE
        )
        max_agents = get_cf_parameter(
            det_config, deployment_object, cf_keys.MAX_DYNAMIC_AGENTS
        )
        spot_enabled = get_cf_parameter(det_config, deployment_object, cf_keys.SPOT_ENABLED)

        quota_code = get_instance_type_quota_code(gpu_instance_type, spot=spot_enabled)
        vcpu_quota = fetch_instance_type_quota(boto_session, quota_code=quota_code)

        mapping_path = pkg_resources.resource_filename(
            "determined.deploy.aws", "vcpu_mapping.yaml"
        )
        with open(mapping_path) as mapping_file:
            mapping_entries = util.safe_load_yaml_with_exceptions(mapping_file)

        # Index the mapping entries by instance type for direct lookup.
        entry_by_type = {}
        for entry in mapping_entries:
            entry_by_type[entry["instanceType"]] = entry

        if gpu_instance_type not in entry_by_type:
            raise PreflightFailure("unknown vCPU count for instance type")

        vcpus_required = int(entry_by_type[gpu_instance_type]["vcpu"] * max_agents)
    except PreflightFailure as failure:
        print(colored("Failed to check AWS instance quota: %s" % failure, "yellow"))
        return
    except Exception as error:
        # Best effort: an unexpected error must not block the deployment.
        print(colored("Error while checking AWS instance quota: %s" % error, "yellow"))
        return

    if vcpus_required > vcpu_quota:
        shortage = "Insufficient AWS GPU agent instance quota (available: %s, required: %s)" % (
            vcpu_quota,
            vcpus_required,
        )
        print(colored(shortage, "red"))
        print(
            "You can request a quota increase at "
            "https://%s.console.aws.amazon.com/servicequotas/home/services/ec2/quotas"
            % boto_session.region_name
        )
        print("Required quota code: %s" % quota_code)
        print("This check can be skipped via `det deploy --no-preflight-checks ...`")
        sys.exit(1)
def parse_config(
    config_file: Optional[IO],
    entrypoint: Optional[List[str]],
    overrides: Iterable[str],
    volumes: Iterable[str],
) -> Dict[str, Any]:
    """Assemble a configuration dict from a YAML file and command-line options.

    Arguments:
        config_file: open YAML file to load (closed after reading), or a falsy value
            to start from an empty config.
        entrypoint: entrypoint to use when the config does not already define a
            truthy "entrypoint".
        overrides: "dotted.key=value" strings applied on top of the loaded config.
            A value containing commas becomes a list; every value is parsed with
            ``yaml.safe_load`` so it gets the type YAML would assign.
        volumes: "host_path:container_path" strings, each appended to the config's
            "bind_mounts" list. Only the first ":" separates the two paths.

    Raises:
        ValueError: if an override contains no "=" or a volume contains no ":".
    """
    config = {}  # type: Dict[str, Any]
    if config_file:
        with config_file:
            config = util.safe_load_yaml_with_exceptions(config_file)

    for override in overrides:
        key, separator, raw_value = override.partition("=")
        if not separator:
            raise ValueError(
                "Could not read configuration option '{}'\n\n"
                "Expecting:\n{}".format(override, CONFIG_DESC)
            )

        # Separate values if a comma exists. yaml.safe_load() casts each value to
        # the type YAML would use, e.g., "4" -> 4.
        if "," in raw_value:
            value = [yaml.safe_load(part) for part in raw_value.split(",")]  # type: Any
        else:
            value = yaml.safe_load(raw_value)
            # Certain configuration keys are expected to hold lists; a single
            # value for such a key is wrapped in a singleton list.
            if key in _CONFIG_PATHS_COERCE_TO_LIST:
                value = [value]

        # TODO(#2703): Consider using full JSONPath spec instead of dot
        # notation.
        config = _set_nested_config(config, key.split("."), value)

    for mount_spec in volumes:
        host_path, separator, container_path = mount_spec.partition(":")
        if not separator:
            raise ValueError(
                "Could not read volume option '{}'\n\n"
                "Expecting:\n{}".format(mount_spec, VOLUME_DESC)
            )
        mounts = config.setdefault("bind_mounts", [])
        mounts.append({"host_path": host_path, "container_path": container_path})

    # The command-line entrypoint only applies when earlier settings did not
    # already provide one.
    if not config.get("entrypoint") and entrypoint:
        config["entrypoint"] = entrypoint

    return config
def create_experiment(
    self,
    config: Union[str, pathlib.Path, Dict],
    model_dir: str,
) -> experiment.ExperimentReference:
    """
    Create an experiment with config parameters and model directory.

    The function returns :class:`~determined.experimental.ExperimentReference` of the
    experiment.

    Arguments:
        config(string, pathlib.Path, dictionary): experiment config filename (.yaml)
            or a dict.
        model_dir(string): directory containing model definition.
    """
    check.is_instance(
        config, (str, pathlib.Path, dict), "config parameter must be dictionary or path"
    )

    # A path-like config (str or Path) is loaded from disk; a dict is used as given.
    if isinstance(config, (str, pathlib.Path)):
        with open(config) as config_file:
            experiment_config = util.safe_load_yaml_with_exceptions(config_file)
    elif isinstance(config, dict):
        experiment_config = config

    request = V1CreateExperimentRequest(
        model_definition=_path_to_files(pathlib.Path(model_dir)),
        config=yaml.safe_dump(experiment_config),
    )
    response = self._internal.determined_create_experiment(request)

    return experiment.ExperimentReference(
        response.experiment.id,
        self._session._master,
        self._experiments,
    )
def test_experimental_experiment_api_determined_disabled() -> None:
    """Create and complete an experiment while the "determined" user is deactivated.

    A fresh test user (with a password) authenticates against the master and submits
    the ``no_op`` fixture experiment; the test then waits for it to reach the
    completed state. The "determined" user is re-activated in ``finally`` so later
    tests are unaffected.
    """
    fixture_dir = pathlib.Path(conf.fixtures_path("no_op"))
    config_path = pathlib.Path(conf.fixtures_path("no_op/single-medium-train-step.yaml"))

    model_context = context.Context.from_local(fixture_dir)
    with config_path.open("r") as config_file:
        experiment_config = util.safe_load_yaml_with_exceptions(config_file)

    master_url = conf.make_master_url()
    requested_user, password = create_test_user(ADMIN_CREDENTIALS, add_password=True)
    admin_username, _ = ADMIN_CREDENTIALS

    try:
        det_spawn(["-u", admin_username, "user", "deactivate", "determined"])

        # Install the certificate and credentials of the test user for the API calls below.
        certs.cli_cert = certs.default_load(master_url=master_url)
        determined_api.authentication.cli_auth = determined_api.authentication.Authentication(
            master_url,
            requested_user=requested_user,
            password=password,
            try_reauth=True,
            cert=certs.cli_cert,
        )

        experiment_id = determined_api.experiment.create_experiment_and_follow_logs(
            master_url=master_url,
            config=experiment_config,
            model_context=model_context,
            template=None,
            additional_body_fields={},
            activate=True,
            follow_first_trial_logs=False,
        )
        exp.wait_for_experiment_state(experiment_id, EXP_STATE.STATE_COMPLETED)
    finally:
        det_spawn(["-u", admin_username, "user", "activate", "determined"])
def set_template(args: Namespace) -> None:
    """Upload the YAML in ``args.template_file`` as the template ``args.template_name``.

    The file is closed after parsing. The parsed body is sent with a PUT to
    ``templates/<template_name>`` on ``args.master``, and a green confirmation
    message is printed afterwards.
    """
    template_file = args.template_file
    with template_file:
        template_body = util.safe_load_yaml_with_exceptions(template_file)

    api.put(args.master, path="templates/" + args.template_name, json=template_body)

    confirmation = "Set template {}".format(args.template_name)
    print(colored(confirmation, "green"))
def load_config(config_path: str) -> Any:
    """Open ``config_path`` and return its contents parsed as YAML.

    Parsing is delegated to ``util.safe_load_yaml_with_exceptions``; whatever that
    helper returns is passed back unchanged.
    """
    with open(config_path) as config_file:
        return util.safe_load_yaml_with_exceptions(config_file)