def _wait_hostname_resolution():
    """Wait for the hostname resolution of the container. This is known behavior as the cluster
    boots up and has been documented here:
     https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-dist-training
    """
    for host in environment.Environment().hosts:
        _dns_lookup(host)
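
The _dns_lookup helper used above is not part of this listing. A minimal sketch, assuming a plain socket-based lookup that retries until the cluster's DNS entries appear (the retry count and delay are illustrative):

import socket
import time

def _dns_lookup(host, retries=60, delay_seconds=1):
    """Hypothetical helper: retry DNS resolution until the host name resolves."""
    for attempt in range(retries):
        try:
            return socket.gethostbyname(host)
        except socket.gaierror:
            if attempt == retries - 1:
                raise
            time.sleep(delay_seconds)
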
def parse_args():
    """
    Parse arguments.
    """
    env = environment.Environment()

    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--max-depth", type=int, default=10)
    parser.add_argument("--n-jobs", type=int, default=env.num_cpus)
    parser.add_argument("--n-estimators", type=int, default=120)

    # data directories
    parser.add_argument("--train",
                        type=str,
                        default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test",
                        type=str,
                        default=os.environ.get("SM_CHANNEL_TEST"))

    # model directory: we will use the default set by SageMaker, /opt/ml/model
    parser.add_argument("--model_dir",
                        type=str,
                        default=os.environ.get("SM_MODEL_DIR"))

    return parser.parse_known_args()
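
Because parse_known_args() is used, extra arguments injected by SageMaker are ignored rather than treated as errors. A hypothetical caller would unpack the (namespace, unknown) tuple:

if __name__ == "__main__":
    args, _ = parse_args()  # unrecognized arguments are discarded, not treated as errors
    print("train channel:", args.train)
    print("model dir:", args.model_dir)
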
Example #3
def _get_by_runner_type(identifier,
                        user_entry_point=None,
                        args=None,
                        env_vars=None,
                        extra_opts=None):
    env = environment.Environment()
    user_entry_point = user_entry_point or env.user_entry_point
    args = args or env.to_cmd_args()
    env_vars = env_vars or env.to_env_vars()
    mpi_args = extra_opts or {}

    # Default to single process for CPU
    default_processes_per_host = int(
        env.num_gpus) if int(env.num_gpus) > 0 else 1
    processes_per_host = _mpi_param_value(mpi_args, env,
                                          params.MPI_PROCESSES_PER_HOST,
                                          default_processes_per_host)

    if identifier is RunnerType.SMDataParallel and env.is_master:
        custom_mpi_options = _mpi_param_value(
            mpi_args, env, params.SMDATAPARALLEL_CUSTOM_MPI_OPTIONS, "")
        return smdataparallel.SMDataParallelRunner(
            user_entry_point,
            args,
            env_vars,
            processes_per_host,
            env.master_hostname,
            env.distribution_hosts,
            custom_mpi_options,
            env.network_interface_name,
        )
    elif identifier is RunnerType.SMDataParallel:
        return mpi.WorkerRunner(user_entry_point, args, env_vars,
                                processes_per_host, env.master_hostname)
    elif identifier is RunnerType.MPI and env.is_master:
        num_processes = _mpi_param_value(mpi_args, env,
                                         params.MPI_NUM_PROCESSES)
        custom_mpi_options = _mpi_param_value(mpi_args, env,
                                              params.MPI_CUSTOM_OPTIONS, "")
        return mpi.MasterRunner(
            user_entry_point,
            args,
            env_vars,
            processes_per_host,
            env.master_hostname,
            env.distribution_hosts,
            custom_mpi_options,
            env.network_interface_name,
            num_processes=num_processes,
        )
    elif identifier is RunnerType.MPI:
        return mpi.WorkerRunner(user_entry_point, args, env_vars,
                                processes_per_host, env.master_hostname)
    elif identifier is RunnerType.Process:
        return process.ProcessRunner(user_entry_point, args, env_vars,
                                     processes_per_host)
    else:
        raise ValueError("Invalid identifier %s" % identifier)
def test_env_module_name(sagemaker_program):
    session_mock = Mock()
    session_mock.region_name = "us-west-2"
    os.environ[params.USER_PROGRAM_ENV] = sagemaker_program
    module_name = environment.Environment().module_name

    del os.environ[params.USER_PROGRAM_ENV]

    assert module_name == "program"
def framework_training_with_script_mode_fn(capture_error):
    training_env = environment.Environment()

    entry_point.run(
        training_env.module_dir,
        training_env.user_entry_point,
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        capture_error=capture_error,
    )
def test_env_dictionary():
    session_mock = Mock()
    session_mock.region_name = "us-west-2"
    os.environ[params.USER_PROGRAM_ENV] = "my_app.py"
    test_env = environment.Environment()

    assert len(test_env) == len(test_env.properties())

    assert test_env["module_name"] == "my_app"
    assert test_env["log_level"] == logging.INFO
Example #7
def main():
    env = environment.Environment()

    entry_point_module = env.hyperparameters["entry_point"]

    src_package_dir = "/opt/ml/input/data/code"
    src_package_path = os.path.join(src_package_dir,
                                    os.listdir(src_package_dir)[0])
    os.system(sys.executable + " -m pip install " + src_package_path)

    importlib.import_module(entry_point_module).run(env)
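
The imported module only needs to expose a run(env) callable that receives the Environment object. A minimal hypothetical entry-point module:

# entry_point_module.py (hypothetical)
def run(env):
    print("channels:", env.channel_input_dirs)
    print("hyperparameters:", env.hyperparameters)
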
Example #8
def train():
    training_env = environment.Environment()

    script = modules.import_module(training_env.module_dir, training_env.module_name)

    model = script.train(**functions.matching_args(script.train, training_env))

    if model:
        if hasattr(script, "save"):
            script.save(model, training_env.model_dir)
        else:
            model_file = os.path.join(training_env.model_dir, "saved_model")
            model.save(model_file)
def framework_training_fn():
    training_env = environment.Environment()

    mod = modules.import_module(training_env.module_dir, training_env.module_name)

    model = mod.train(**functions.matching_args(mod.train, training_env))

    if model:
        if hasattr(mod, "save"):
            mod.save(model, training_env.model_dir)
        else:
            model_file = os.path.join(training_env.model_dir, "saved_model")
            model.save(model_file)
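
Both functions expect the user module to define train(...), whose parameter names are matched against Environment properties by functions.matching_args, and optionally save(model, model_dir). A minimal hypothetical user script:

# user_script.py (hypothetical): parameter names must match Environment properties
import json
import os

def train(channel_input_dirs, hyperparameters, model_dir):
    model = {"weights": [0.0]}  # stand-in for a real model object
    return model

def save(model, model_dir):
    with open(os.path.join(model_dir, "model.json"), "w") as f:
        json.dump(model, f)
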
def main():
    """Training entry point
    """
    hyperparameters = environment.read_hyperparameters()
    env = environment.Environment(hyperparameters=hyperparameters)

    user_hyperparameters = env.hyperparameters

    # If this training job is one of several launched by a hyperparameter tuning job, append the
    # training job name to model_dir so that the jobs do not read from/write to the same object.
    if "_tuning_objective_metric" in hyperparameters:
        model_dir = _model_dir_with_training_job(hyperparameters.get("model_dir"), env.job_name)
        logger.info("Appending the training job name to model_dir: {}".format(model_dir))
        user_hyperparameters["model_dir"] = model_dir

    s3_utils.configure(user_hyperparameters.get("model_dir"), os.environ.get("SAGEMAKER_REGION"))
    train(env, mapping.to_cmd_args(user_hyperparameters))
    _log_model_missing_warning(MODEL_DIR)
Example #11
    def run(self, wait=True, capture_error=False):
        """Run the process.

        Args:
            wait (bool): A boolean indicating whether to wait and check for errors.
                Defaults to True.
            capture_error (bool): A boolean indicating whether to direct stderr to a stream
                that can later be read. Defaults to False.

        Returns:
            process (subprocess.Popen): The spawned process.
        """
        self._setup()

        cmd = self._create_command()

        logging_config.log_script_invocation(cmd, self._env_vars)

        training_env = environment.Environment()
        exception_classes = get_modelparallel_exception_classes()
        if wait:
            process_spawned = process.check_error(
                cmd,
                exception_classes
                if training_env.is_modelparallel_enabled
                else errors.ExecuteUserScriptError,
                self._processes_per_host,
                capture_error=capture_error,
                cwd=environment.code_dir,
            )
        else:
            _, _, process_spawned = process.create(
                cmd,
                exception_classes
                if training_env.is_modelparallel_enabled
                else errors.ExecuteUserScriptError,
                self._processes_per_host,
                capture_error=capture_error,
                cwd=environment.code_dir,
            )

        self._tear_down()
        return process_spawned
def create_training_env():
    with patch("sagemaker_training.environment.read_resource_config",
               lambda: RESOURCE_CONFIG), patch(
                   "sagemaker_training.environment.read_input_data_config",
                   lambda: INPUT_DATA_CONFIG), patch(
                       "sagemaker_training.environment.read_hyperparameters",
                       lambda: ALL_HYPERPARAMETERS), patch(
                           "sagemaker_training.environment.num_cpus",
                           lambda: 8), patch(
                               "sagemaker_training.environment.num_gpus",
                               lambda: 4):
        session_mock = Mock()
        session_mock.region_name = "us-west-2"
        old_environ = os.environ.copy()
        os.environ[params.TRAINING_JOB_ENV] = "training-job-42"

        yield environment.Environment()

        os.environ = old_environ
def test_parameter_server():
    module = test.UserModule(test.File(name="user_script.py", data=PARAMETER_SERVER_SCRIPT))
    hyperparameters = dict(sagemaker_program="user_script.py")

    test.prepare(
        user_module=module,
        hyperparameters=hyperparameters,
        channels=[test.Channel.create(name="training")],
    )
    training_env = environment.Environment()
    process = entry_point.run(
        training_env.module_dir,
        training_env.user_entry_point,
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        wait=False,
    )
    # confirm that the parameter server process is still running (it should not exit on its own)
    assert process.poll() is None
    process.kill()
Example #14
def main():
    training_env = environment.Environment()
    train(training_env)
Example #15
def main():
    train(environment.Environment())
Example #16
def main():
    parser = argparse.ArgumentParser()

    # NOTE: the '--seed' argument name below is an inference from its metavar and help text
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=100,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--backend',
                        type=str,
                        default=None,
                        help='backend for distributed training')
    parser.add_argument('--processor',
                        type=str,
                        default='cpu',
                        help='processor type (cpu or gpu)')

    # Container environment
    env = environment.Environment()
    parser.add_argument('--hosts', type=list, default=env.hosts)
    parser.add_argument('--current-host', type=str, default=env.current_host)
    parser.add_argument('--model-dir', type=str, default=env.model_dir)
    parser.add_argument('--data-dir',
                        type=str,
                        default=env.channel_input_dirs['training'])
    parser.add_argument('--num-gpus', type=int, default=env.num_gpus)

    train(parser.parse_args())
Example #17
def train():
    """The main function responsible for running training in the container."""
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        env = environment.Environment()

        region = os.environ.get("AWS_REGION",
                                os.environ.get(params.REGION_NAME_ENV))
        s3_endpoint_url = os.environ.get(params.S3_ENDPOINT_URL, None)
        intermediate_sync = intermediate_output.start_sync(
            env.sagemaker_s3_output(), region, endpoint_url=s3_endpoint_url)

        if env.framework_module:
            framework_name, entry_point_name = env.framework_module.split(":")

            framework = importlib.import_module(framework_name)

            # the logger is configured after importing the framework library, allowing
            # the framework to configure logging at import time.
            logging_config.configure_logger(env.log_level)
            logger.info("Imported framework %s", framework_name)
            entrypoint = getattr(framework, entry_point_name)
            entrypoint()
        else:
            logging_config.configure_logger(env.log_level)

            mpi_enabled = env.additional_framework_parameters.get(params.MPI_ENABLED)
            runner_type = (
                runner.RunnerType.MPI
                if mpi_enabled
                and env.current_instance_group in env.distribution_instance_groups
                else runner.RunnerType.Process
            )

            entry_point.run(
                env.module_dir,
                env.user_entry_point,
                env.to_cmd_args(),
                env.to_env_vars(),
                runner_type=runner_type,
            )
        logger.info("Reporting training SUCCESS")

        files.write_success_file()
    except errors.ClientError as e:

        failure_msg = str(e)
        files.write_failure_file(failure_msg)
        logger.error("Reporting training FAILURE")

        logger.error(failure_msg)

        if intermediate_sync:
            intermediate_sync.join()

        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:  # pylint: disable=broad-except
        failure_msg = "Framework Error: \n%s\n%s" % (traceback.format_exc(),
                                                     str(e))

        files.write_failure_file(failure_msg)
        logger.error("Reporting training FAILURE")

        logger.error(failure_msg)

        error_number = getattr(e, "errno", DEFAULT_FAILURE_CODE)
        exit_code = _get_valid_failure_exit_code(error_number)
    finally:
        if intermediate_sync:
            intermediate_sync.join()
        _exit_processes(exit_code)
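
_get_valid_failure_exit_code is not shown in this listing. A minimal sketch, assuming it only needs to fall back to the default code when errno is missing or not an integer:

def _get_valid_failure_exit_code(exit_code):
    """Hypothetical fallback: use the default failure code when errno is not an integer."""
    try:
        return int(exit_code)
    except (ValueError, TypeError):
        return DEFAULT_FAILURE_CODE
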
Example #18
def test_env_vars_round_trip():
    hyperparameters = {
        "loss": "SGD",
        "sagemaker_program": "user_script.py",
        "epochs": 10,
        "batch_size": 64,
        "precision": 5.434322,
        "sagemaker_region": "us-west-2",
        "sagemaker_job_name": "horovod-training-job",
        "sagemaker_submit_directory": "s3/something",
    }

    resource_config = {
        "current_host": "algo-1",
        "hosts": ["algo-1", "algo-2", "algo-3"]
    }

    input_data_config = {
        "train": {
            "ContentType": "trainingContentType",
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None",
        },
        "validation": {
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None",
        },
    }

    os.environ[params.FRAMEWORK_TRAINING_MODULE_ENV] = (
        "test.functional.simple_framework:train"
    )

    training_env = environment.Environment(
        resource_config=resource_config,
        input_data_config=input_data_config,
        hyperparameters=hyperparameters,
    )

    os.environ[params.FRAMEWORK_TRAINING_MODULE_ENV] = ""

    args = mapping.to_cmd_args(training_env.hyperparameters)

    env_vars = training_env.to_env_vars()
    env_vars["SM_USER_ARGS"] = " ".join(args)

    assert env_vars["SM_OUTPUT_DATA_DIR"] == training_env.output_data_dir
    assert (
        env_vars["SM_INPUT_DATA_CONFIG"] ==
        '{"train":{"ContentType":"trainingContentType",'
        '"RecordWrapperType":"None","S3DistributionType":"FullyReplicated",'
        '"TrainingInputMode":"File"},"validation":{"RecordWrapperType":"None",'
        '"S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}')
    assert env_vars["SM_NETWORK_INTERFACE_NAME"] == "eth0"
    assert env_vars["SM_LOG_LEVEL"] == "20"
    assert env_vars["SM_INPUT_DIR"].endswith("/opt/ml/input")
    assert env_vars["SM_NUM_CPUS"] == str(training_env.num_cpus)
    assert env_vars["SM_HP_BATCH_SIZE"] == "64"
    assert env_vars["SM_CHANNEL_TRAIN"].endswith("/opt/ml/input/data/train")
    assert env_vars["SM_CHANNEL_VALIDATION"].endswith(
        "/opt/ml/input/data/validation")
    assert env_vars["SM_HP_EPOCHS"] == "10"
    assert env_vars[
        "SM_HPS"] == '{"batch_size":64,"epochs":10,"loss":"SGD","precision":5.434322}'
    assert env_vars["SM_HP_PRECISION"] == "5.434322"
    assert (env_vars["SM_RESOURCE_CONFIG"] ==
            '{"current_host":"algo-1","hosts":["algo-1","algo-2","algo-3"]}')
    assert env_vars["SM_MODULE_NAME"] == "user_script"
    assert env_vars["SM_INPUT_CONFIG_DIR"].endswith("/opt/ml/input/config")
    assert env_vars[
        "SM_USER_ARGS"] == "--batch_size 64 --epochs 10 --loss SGD --precision 5.434322"
    assert env_vars["SM_OUTPUT_DIR"].endswith("/opt/ml/output")
    assert env_vars["SM_MODEL_DIR"].endswith("/opt/ml/model")
    assert env_vars["SM_HOSTS"] == '["algo-1","algo-2","algo-3"]'
    assert env_vars["SM_NUM_GPUS"] == str(training_env.num_gpus)
    assert env_vars["SM_MODULE_DIR"] == "s3/something"
    assert env_vars["SM_CURRENT_HOST"] == "algo-1"
    assert env_vars["SM_CHANNELS"] == '["train","validation"]'
    assert env_vars["SM_HP_LOSS"] == "SGD"
    assert env_vars[
        "SM_FRAMEWORK_MODULE"] == "test.functional.simple_framework:train"

    assert all(x in env_vars["SM_TRAINING_ENV"]
               for x in (training_env.properties()))
def test_create_training_env_without_training_files_and_directories_should_not_fail():
    training_env = environment.Environment()
    hostname = socket.gethostname()
    assert training_env.current_host == hostname
    assert training_env.hosts == [hostname]