def make_default_env_context(
    hparams: Dict[str, Any], experiment_config: Optional[Dict] = None, trial_seed: int = 0
) -> det.EnvContext:
    """Construct a det.EnvContext populated with test-friendly defaults.

    Arguments:
        hparams: hyperparameters to embed in the context.
        experiment_config: optional experiment config; when omitted, a
            default one is generated from ``hparams``.
        trial_seed: seed recorded in the resulting context.
    """
    if experiment_config is None:
        experiment_config = make_default_exp_config(hparams, 1)

    # TODO(ryan): Fix the parameter passing so that this doesn't read from
    # environment variables, and we can get rid of the @expose_gpus fixture.
    wants_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    detected_gpu_uuids = gpu.get_gpu_uuids_and_validate(wants_gpu)

    # A single RUN_STEP workload with all identifiers fixed to 1.
    first_workload = workload.Workload(
        workload.Workload.Kind.RUN_STEP, ExperimentID(1), TrialID(1), StepID(1)
    )

    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=first_workload,
        master_addr="",
        master_port=0,
        container_id="",
        hparams=hparams,
        latest_checkpoint=None,
        use_gpu=wants_gpu,
        container_gpus=detected_gpu_uuids,
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=trial_seed,
    )
def main() -> None:
    """Trial-runner entry point.

    Reads all configuration from environment variables, builds a
    det.EnvContext, validates the checkpoint storage configuration, and
    runs the training pipeline. Exits the process with a non-zero status
    when a required environment variable is missing or the checkpoint
    storage config fails validation.
    """
    # Fail fast with a clear message if any required variable is absent.
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    det._set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])

    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"])
    )
    latest_checkpoint = simplejson.loads(os.environ["DET_LATEST_CHECKPOINT"])

    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    # Use simplejson here for consistency; the original mixed json.loads in
    # among simplejson.loads calls for no apparent reason.
    slot_ids = simplejson.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_ports = os.environ["DET_RENDEZVOUS_PORTS"]
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_ports,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_cluster_id,
        trial_seed,
    )

    # Lazy %-formatting defers rendering until the record is actually emitted.
    logging.info(
        "New trial runner in (container %s) on agent %s: %s.",
        container_id,
        agent_id,
        env.__dict__,
    )

    # Validate storage up front so a misconfiguration fails the trial early,
    # before any training work is done.
    try:
        storage.validate_config(env.experiment_config["checkpoint_storage"])
    except Exception as e:
        logging.error("Checkpoint storage validation failed: %s", e)
        sys.exit(1)

    build_and_run_training_pipeline(env)
def main() -> None:
    """Trial-runner entry point.

    Reads all configuration from environment variables, builds a
    det.EnvContext, validates the checkpoint storage configuration, and
    runs the training pipeline. Exits the process with a non-zero status
    when a required environment variable is missing or the checkpoint
    storage config fails validation; an InvalidHP raised by the pipeline
    is treated as a graceful exit.
    """
    # Fail fast with a clear message if any required variable is absent.
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    determined.common.set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))
    master_cert_file = os.environ.get("DET_MASTER_CERT_FILE")
    master_cert_name = os.environ.get("DET_MASTER_CERT_NAME")
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])

    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"])
    )

    # Unlike the other values, the latest checkpoint is passed via a file
    # whose path is in the environment, presumably because it can be large.
    # simplejson.load is used for consistency with the rest of this module.
    with open(os.environ["DET_LATEST_CHECKPOINT"], "r") as f:
        latest_checkpoint = simplejson.load(f)

    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    # simplejson for consistency; the original mixed json.loads in here.
    slot_ids = simplejson.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_port = os.environ["DET_RENDEZVOUS_PORT"]
    det_trial_unique_port_offset = int(os.environ["DET_TRIAL_UNIQUE_PORT_OFFSET"])
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_agent_id = os.environ["DET_AGENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    det_task_token = os.environ["DET_TASK_TOKEN"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        use_tls,
        master_cert_file,
        master_cert_name,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_port,
        det_trial_unique_port_offset,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_agent_id,
        det_cluster_id,
        det_task_token,
        trial_seed,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    # Lazy %-formatting defers rendering until the record is actually emitted.
    logging.info(
        "New trial runner in (container %s) on agent %s: %s.",
        container_id,
        agent_id,
        env.__dict__,
    )

    # Validate storage up front so a misconfiguration fails the trial early,
    # before any training work is done.
    try:
        storage.validate_config(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
        logging.error("Checkpoint storage validation failed: %s", e)
        sys.exit(1)

    try:
        build_and_run_training_pipeline(env)
    except det.InvalidHP:
        # Searcher-driven early stop; exit cleanly rather than reporting failure.
        logging.info("InvalidHP detected, gracefully exiting trial")