def _wait_hostname_resolution():
    """Wait for the hostname resolution of the container.

    This is known behavior as the cluster boots up and has been documented here:
    https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-dist-training
    """
    for host in environment.Environment().hosts:
        _dns_lookup(host)
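
# A minimal sketch of the _dns_lookup helper assumed above, using the retrying
# package: retry the lookup until the peer host resolves. The retry parameters
# (1 s between attempts, 15 min total) are illustrative, not authoritative.
import socket

from retrying import retry


@retry(wait_fixed=1000, stop_max_delay=1000 * 60 * 15)
def _dns_lookup(host):
    """Retry a DNS lookup on host until it resolves or the delay expires."""
    socket.gethostbyname(host)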
def parse_args():
    """Parse arguments."""
    env = environment.Environment()

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--max-depth", type=int, default=10)
    parser.add_argument("--n-jobs", type=int, default=env.num_cpus)
    parser.add_argument("--n-estimators", type=int, default=120)

    # data directories
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))

    # model directory: we will use the default set by SageMaker, /opt/ml/model
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))

    return parser.parse_known_args()
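
# Usage sketch (illustrative): parse_known_args() returns a (namespace, unknown)
# tuple, so unpack both and ignore any extra arguments SageMaker may inject.
args, _ = parse_args()
print(args.max_depth, args.n_estimators, args.train, args.model_dir)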
def _get_by_runner_type(
    identifier, user_entry_point=None, args=None, env_vars=None, extra_opts=None
):
    env = environment.Environment()
    user_entry_point = user_entry_point or env.user_entry_point
    args = args or env.to_cmd_args()
    env_vars = env_vars or env.to_env_vars()
    mpi_args = extra_opts or {}

    # Default to single process for CPU
    default_processes_per_host = int(env.num_gpus) if int(env.num_gpus) > 0 else 1
    processes_per_host = _mpi_param_value(
        mpi_args, env, params.MPI_PROCESSES_PER_HOST, default_processes_per_host
    )

    if identifier is RunnerType.SMDataParallel and env.is_master:
        custom_mpi_options = _mpi_param_value(
            mpi_args, env, params.SMDATAPARALLEL_CUSTOM_MPI_OPTIONS, ""
        )
        return smdataparallel.SMDataParallelRunner(
            user_entry_point,
            args,
            env_vars,
            processes_per_host,
            env.master_hostname,
            env.distribution_hosts,
            custom_mpi_options,
            env.network_interface_name,
        )
    elif identifier is RunnerType.SMDataParallel:
        return mpi.WorkerRunner(
            user_entry_point, args, env_vars, processes_per_host, env.master_hostname
        )
    elif identifier is RunnerType.MPI and env.is_master:
        num_processes = _mpi_param_value(mpi_args, env, params.MPI_NUM_PROCESSES)
        custom_mpi_options = _mpi_param_value(mpi_args, env, params.MPI_CUSTOM_OPTIONS, "")
        return mpi.MasterRunner(
            user_entry_point,
            args,
            env_vars,
            processes_per_host,
            env.master_hostname,
            env.distribution_hosts,
            custom_mpi_options,
            env.network_interface_name,
            num_processes=num_processes,
        )
    elif identifier is RunnerType.MPI:
        return mpi.WorkerRunner(
            user_entry_point, args, env_vars, processes_per_host, env.master_hostname
        )
    elif identifier is RunnerType.Process:
        return process.ProcessRunner(user_entry_point, args, env_vars, processes_per_host)
    else:
        raise ValueError("Invalid identifier %s" % identifier)
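
# Illustrative call of the factory above (names taken from the surrounding
# runner module): resolve a plain process runner from the current environment
# and launch it synchronously.
process_runner = _get_by_runner_type(RunnerType.Process)
process_runner.run(wait=True)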
def test_env_module_name(sagemaker_program):
    session_mock = Mock()
    session_mock.region_name = "us-west-2"
    os.environ[params.USER_PROGRAM_ENV] = sagemaker_program
    module_name = environment.Environment().module_name
    del os.environ[params.USER_PROGRAM_ENV]

    assert module_name == "program"
def framework_training_with_script_mode_fn(capture_error):
    training_env = environment.Environment()

    entry_point.run(
        training_env.module_dir,
        training_env.user_entry_point,
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        capture_error=capture_error,
    )
def test_env_dictionary():
    session_mock = Mock()
    session_mock.region_name = "us-west-2"
    os.environ[params.USER_PROGRAM_ENV] = "my_app.py"
    test_env = environment.Environment()

    assert len(test_env) == len(test_env.properties())
    assert test_env["module_name"] == "my_app"
    assert test_env["log_level"] == logging.INFO
def main():
    env = environment.Environment()
    entry_point_module = env.hyperparameters["entry_point"]
    src_package_dir = "/opt/ml/input/data/code"
    src_package_path = os.path.join(src_package_dir, os.listdir(src_package_dir)[0])
    os.system(sys.executable + " -m pip install " + src_package_path)
    importlib.import_module(entry_point_module).run(env)
def train():
    training_env = environment.Environment()
    script = modules.import_module(training_env.module_dir, training_env.module_name)
    model = script.train(**functions.matching_args(script.train, training_env))

    if model:
        if hasattr(script, "save"):
            script.save(model, training_env.model_dir)
        else:
            model_file = os.path.join(training_env.model_dir, "saved_model")
            model.save(model_file)
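
# Hypothetical user module for the loader above: functions.matching_args
# inspects train()'s signature and passes only the Environment properties it
# names; save() is the optional hook tried before falling back to model.save().
import json
import os


def train(hyperparameters, channel_input_dirs):
    print("training on", channel_input_dirs.get("training"), "with", hyperparameters)
    return {"weights": [0.0]}  # toy stand-in for a trained model


def save(model, model_dir):
    with open(os.path.join(model_dir, "model.json"), "w") as f:
        json.dump(model, f)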
def framework_training_fn():
    training_env = environment.Environment()
    mod = modules.import_module(training_env.module_dir, training_env.module_name)
    model = mod.train(**functions.matching_args(mod.train, training_env))

    if model:
        if hasattr(mod, "save"):
            mod.save(model, training_env.model_dir)
        else:
            model_file = os.path.join(training_env.model_dir, "saved_model")
            model.save(model_file)
def main():
    """Training entry point."""
    hyperparameters = environment.read_hyperparameters()
    env = environment.Environment(hyperparameters=hyperparameters)

    user_hyperparameters = env.hyperparameters

    # If the training job is part of the multiple training jobs for tuning, we need to append
    # the training job name to model_dir in case they read from/write to the same object
    if "_tuning_objective_metric" in hyperparameters:
        model_dir = _model_dir_with_training_job(hyperparameters.get("model_dir"), env.job_name)
        logger.info("Appending the training job name to model_dir: {}".format(model_dir))
        user_hyperparameters["model_dir"] = model_dir

    s3_utils.configure(user_hyperparameters.get("model_dir"), os.environ.get("SAGEMAKER_REGION"))
    train(env, mapping.to_cmd_args(user_hyperparameters))
    _log_model_missing_warning(MODEL_DIR)
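
# A plausible sketch of the _model_dir_with_training_job helper assumed above:
# a local container path is left alone, while a shared (e.g. S3) model_dir is
# suffixed with the job name so concurrent tuning jobs do not overwrite each
# other. The exact suffix layout is an assumption for illustration.
def _model_dir_with_training_job(model_dir, job_name):
    if model_dir and model_dir.startswith("/opt/ml"):
        return model_dir  # local path inside the container, no collision risk
    return "{}/{}/model".format(model_dir, job_name)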
def run(self, wait=True, capture_error=False):
    """Run the process.

    Args:
        wait (bool): A boolean indicating whether to wait and check for errors.
            Defaults to True.
        capture_error (bool): A boolean indicating whether to direct stderr to
            a stream that can later be read. Defaults to False.

    Returns:
        process (subprocess.Popen): The spawned process.
    """
    self._setup()

    cmd = self._create_command()

    logging_config.log_script_invocation(cmd, self._env_vars)

    training_env = environment.Environment()
    exception_classes = get_modelparallel_exception_classes()
    if wait:
        process_spawned = process.check_error(
            cmd,
            exception_classes
            if training_env.is_modelparallel_enabled
            else errors.ExecuteUserScriptError,
            self._processes_per_host,
            capture_error=capture_error,
            cwd=environment.code_dir,
        )
    else:
        _, _, process_spawned = process.create(
            cmd,
            exception_classes
            if training_env.is_modelparallel_enabled
            else errors.ExecuteUserScriptError,
            self._processes_per_host,
            capture_error=capture_error,
            cwd=environment.code_dir,
        )

    self._tear_down()
    return process_spawned
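
# Illustrative non-blocking use of run() above: spawn the user script, then
# poll the returned subprocess.Popen handle (runner is a hypothetical instance).
proc = runner.run(wait=False, capture_error=True)
if proc.poll() is None:
    proc.wait()  # block until the user script finishes
print("exit code:", proc.returncode)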
def create_training_env():
    with patch(
        "sagemaker_training.environment.read_resource_config", lambda: RESOURCE_CONFIG
    ), patch(
        "sagemaker_training.environment.read_input_data_config", lambda: INPUT_DATA_CONFIG
    ), patch(
        "sagemaker_training.environment.read_hyperparameters", lambda: ALL_HYPERPARAMETERS
    ), patch(
        "sagemaker_training.environment.num_cpus", lambda: 8
    ), patch(
        "sagemaker_training.environment.num_gpus", lambda: 4
    ):
        session_mock = Mock()
        session_mock.region_name = "us-west-2"
        old_environ = os.environ.copy()
        os.environ[params.TRAINING_JOB_ENV] = "training-job-42"
        yield environment.Environment()
        os.environ = old_environ
def test_parameter_server():
    module = test.UserModule(test.File(name="user_script.py", data=PARAMETER_SERVER_SCRIPT))

    hyperparameters = dict(sagemaker_program="user_script.py")

    test.prepare(
        user_module=module,
        hyperparameters=hyperparameters,
        channels=[test.Channel.create(name="training")],
    )

    training_env = environment.Environment()

    process = entry_point.run(
        training_env.module_dir,
        training_env.user_entry_point,
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        wait=False,
    )

    # confirm the ps process is still hanging
    assert process.poll() is None

    process.kill()
def main():
    training_env = environment.Environment()
    train(training_env)
def main():
    train(environment.Environment())
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                    help='how many batches to wait before logging training status')
parser.add_argument('--backend', type=str, default=None,
                    help='backend for distributed training')
parser.add_argument('--processor', type=str, default='cpu',
                    help='processor type: cpu or gpu')

# Container environment
env = environment.Environment()
parser.add_argument('--hosts', type=list, default=env.hosts)
parser.add_argument('--current-host', type=str, default=env.current_host)
parser.add_argument('--model-dir', type=str, default=env.model_dir)
parser.add_argument('--data-dir', type=str, default=env.channel_input_dirs['training'])
parser.add_argument('--num-gpus', type=int, default=env.num_gpus)

train(parser.parse_args())
def train():
    """The main function responsible for running training in the container."""
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        env = environment.Environment()

        region = os.environ.get("AWS_REGION", os.environ.get(params.REGION_NAME_ENV))
        s3_endpoint_url = os.environ.get(params.S3_ENDPOINT_URL, None)
        intermediate_sync = intermediate_output.start_sync(
            env.sagemaker_s3_output(), region, endpoint_url=s3_endpoint_url
        )

        if env.framework_module:
            framework_name, entry_point_name = env.framework_module.split(":")

            framework = importlib.import_module(framework_name)

            # the logger is configured after importing the framework library, allowing
            # the framework to configure logging at import time.
            logging_config.configure_logger(env.log_level)
            logger.info("Imported framework %s", framework_name)

            entrypoint = getattr(framework, entry_point_name)
            entrypoint()
        else:
            logging_config.configure_logger(env.log_level)

            mpi_enabled = env.additional_framework_parameters.get(params.MPI_ENABLED)
            runner_type = (
                runner.RunnerType.MPI
                if mpi_enabled
                and (env.current_instance_group in env.distribution_instance_groups)
                else runner.RunnerType.Process
            )

            entry_point.run(
                env.module_dir,
                env.user_entry_point,
                env.to_cmd_args(),
                env.to_env_vars(),
                runner_type=runner_type,
            )
        logger.info("Reporting training SUCCESS")

        files.write_success_file()
    except errors.ClientError as e:
        failure_msg = str(e)
        files.write_failure_file(failure_msg)

        logger.error("Reporting training FAILURE")
        logger.error(failure_msg)

        if intermediate_sync:
            intermediate_sync.join()

        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:  # pylint: disable=broad-except
        failure_msg = "Framework Error: \n%s\n%s" % (traceback.format_exc(), str(e))
        files.write_failure_file(failure_msg)

        logger.error("Reporting training FAILURE")
        logger.error(failure_msg)

        error_number = getattr(e, "errno", DEFAULT_FAILURE_CODE)
        exit_code = _get_valid_failure_exit_code(error_number)
    finally:
        if intermediate_sync:
            intermediate_sync.join()

        _exit_processes(exit_code)
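
# A minimal sketch of the _get_valid_failure_exit_code helper used above:
# coerce the error number to an int, falling back to DEFAULT_FAILURE_CODE when
# it is missing (None) or not numeric.
def _get_valid_failure_exit_code(exit_code):
    try:
        return int(exit_code)
    except (TypeError, ValueError):
        return DEFAULT_FAILURE_CODE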
def test_env_vars_round_trip():
    hyperparameters = {
        "loss": "SGD",
        "sagemaker_program": "user_script.py",
        "epochs": 10,
        "batch_size": 64,
        "precision": 5.434322,
        "sagemaker_region": "us-west-2",
        "sagemaker_job_name": "horovod-training-job",
        "sagemaker_submit_directory": "s3/something",
    }

    resource_config = {"current_host": "algo-1", "hosts": ["algo-1", "algo-2", "algo-3"]}

    input_data_config = {
        "train": {
            "ContentType": "trainingContentType",
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None",
        },
        "validation": {
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None",
        },
    }

    os.environ[params.FRAMEWORK_TRAINING_MODULE_ENV] = "test.functional.simple_framework:train"
    training_env = environment.Environment(
        resource_config=resource_config,
        input_data_config=input_data_config,
        hyperparameters=hyperparameters,
    )
    os.environ[params.FRAMEWORK_TRAINING_MODULE_ENV] = ""

    args = mapping.to_cmd_args(training_env.hyperparameters)

    env_vars = training_env.to_env_vars()

    env_vars["SM_USER_ARGS"] = " ".join(args)

    assert env_vars["SM_OUTPUT_DATA_DIR"] == training_env.output_data_dir
    assert (
        env_vars["SM_INPUT_DATA_CONFIG"]
        == '{"train":{"ContentType":"trainingContentType",'
        '"RecordWrapperType":"None","S3DistributionType":"FullyReplicated",'
        '"TrainingInputMode":"File"},"validation":{"RecordWrapperType":"None",'
        '"S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}'
    )
    assert env_vars["SM_NETWORK_INTERFACE_NAME"] == "eth0"
    assert env_vars["SM_LOG_LEVEL"] == "20"
    assert env_vars["SM_INPUT_DIR"].endswith("/opt/ml/input")
    assert env_vars["SM_NUM_CPUS"] == str(training_env.num_cpus)
    assert env_vars["SM_HP_BATCH_SIZE"] == "64"
    assert env_vars["SM_CHANNEL_TRAIN"].endswith("/opt/ml/input/data/train")
    assert env_vars["SM_CHANNEL_VALIDATION"].endswith("/opt/ml/input/data/validation")
    assert env_vars["SM_HP_EPOCHS"] == "10"
    assert env_vars["SM_HPS"] == '{"batch_size":64,"epochs":10,"loss":"SGD","precision":5.434322}'
    assert env_vars["SM_HP_PRECISION"] == "5.434322"
    assert (
        env_vars["SM_RESOURCE_CONFIG"]
        == '{"current_host":"algo-1","hosts":["algo-1","algo-2","algo-3"]}'
    )
    assert env_vars["SM_MODULE_NAME"] == "user_script"
    assert env_vars["SM_INPUT_CONFIG_DIR"].endswith("/opt/ml/input/config")
    assert env_vars["SM_USER_ARGS"] == "--batch_size 64 --epochs 10 --loss SGD --precision 5.434322"
    assert env_vars["SM_OUTPUT_DIR"].endswith("/opt/ml/output")
    assert env_vars["SM_MODEL_DIR"].endswith("/opt/ml/model")
    assert env_vars["SM_HOSTS"] == '["algo-1","algo-2","algo-3"]'
    assert env_vars["SM_NUM_GPUS"] == str(training_env.num_gpus)
    assert env_vars["SM_MODULE_DIR"] == "s3/something"
    assert env_vars["SM_CURRENT_HOST"] == "algo-1"
    assert env_vars["SM_CHANNELS"] == '["train","validation"]'
    assert env_vars["SM_HP_LOSS"] == "SGD"
    assert env_vars["SM_FRAMEWORK_MODULE"] == "test.functional.simple_framework:train"
    assert all(x in env_vars["SM_TRAINING_ENV"] for x in training_env.properties())
def test_create_training_env_without_training_files_and_directories_should_not_fail():
    training_env = environment.Environment()
    hostname = socket.gethostname()

    assert training_env.current_host == hostname
    assert training_env.hosts == [hostname]