예제 #1
0
def test_large_files():
    """Verify the background intermediate-output sync uploads large .npy files
    to S3, including a file that is modified (regenerated) after its first write.
    """
    # Unique job name per run so the S3 key prefix does not collide.
    os.environ["TRAINING_JOB_NAME"] = _timestamp()
    # Start the background process that syncs intermediate_path to S3.
    p = _intermediate_output.start_sync(bucket_uri, region)

    # presumably an element count: 1024*256*17 elements x 4-byte dtype ~= 17MB
    # — TODO confirm against _generate_large_npy_file
    file_size = 1024 * 256 * 17  # 17MB

    file = os.path.join(intermediate_path, "file.npy")
    _generate_large_npy_file(file_size, file)

    file_to_modify = os.path.join(intermediate_path, "file_to_modify.npy")
    _generate_large_npy_file(file_size, file_to_modify)
    # Regenerate the same path so the file changes after its initial write;
    # the returned array is the content the final sync should upload.
    content_to_assert = _generate_large_npy_file(file_size, file_to_modify)

    # Writing the failure file ends the training job, which stops the sync
    # process; join() waits for its final upload pass to complete.
    _files.write_failure_file("Failure!!")
    p.join()

    # Local files must still exist after the sync finishes.
    assert os.path.exists(file)
    assert os.path.exists(file_to_modify)

    # Both files must have been uploaded under the job's intermediate prefix.
    key_prefix = os.path.join(os.environ.get("TRAINING_JOB_NAME"), "output", "intermediate")
    client = boto3.client("s3", region)
    assert _file_exists_in_s3(
        client, os.path.join(key_prefix, os.path.relpath(file, intermediate_path))
    )
    assert _file_exists_in_s3(
        client, os.path.join(key_prefix, os.path.relpath(file_to_modify, intermediate_path))
    )

    # Check that the S3 copy of the modified file matches the regenerated
    # (post-modification) content, not the original version.
    s3 = boto3.resource("s3", region_name=region)
    key = os.path.join(key_prefix, os.path.relpath(file_to_modify, intermediate_path))
    modified_file = os.path.join(_env.output_dir, "modified_file.npy")
    s3.Bucket(bucket).download_file(key, modified_file)
    assert np.array_equal(np.load(modified_file), content_to_assert)
예제 #2
0
def train():
    """Run the framework's training entry point and report the outcome.

    Imports the framework module named by the training environment, invokes
    its entry point, then writes the SageMaker success/failure marker file
    and exits all processes with the appropriate code.

    Exit codes:
        SUCCESS_CODE on success; DEFAULT_FAILURE_CODE on a client error;
        the exception's ``errno`` (or DEFAULT_FAILURE_CODE) on any other error.
    """
    try:
        # TODO: iquintero - add error handling for ImportError to let the user know
        # if the framework module is not defined.
        env = sagemaker_containers.training_env()

        # framework_module is "<module>:<entry point name>".
        framework_name, entry_point_name = env.framework_module.split(':')

        try:
            framework = importlib.import_module(framework_name)
        except Exception:
            # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt
            # are not intercepted. Log sys.path to make a misconfigured
            # PYTHONPATH easy to diagnose, then re-raise.
            logger.error("Import failure loading %s. sys.path=%s",
                         framework_name, sys.path)
            raise

        # the logger is configured after importing the framework library, allowing the framework to
        # configure logging at import time.
        _logging.configure_logger(env.log_level)

        logger.info('Imported framework %s', framework_name)

        entry_point = getattr(framework, entry_point_name)

        # Expose the environment to the user script before running it.
        _modules.write_env_vars(env.to_env_vars())

        entry_point()

        logger.info('Reporting training SUCCESS')
        _files.write_success_file()
        _exit_processes(SUCCESS_CODE)

    except _errors.ClientError as e:
        # Client (user-code/configuration) error: report the message verbatim.
        failure_message = str(e)
        _files.write_failure_file(failure_message)

        logger.error(failure_message)
        _exit_processes(DEFAULT_FAILURE_CODE)
    except Exception as e:
        # Framework/platform error: include the traceback in the failure file.
        failure_msg = 'framework error: \n%s\n%s' % (traceback.format_exc(),
                                                     str(e))

        _files.write_failure_file(failure_msg)
        logger.error('Reporting training FAILURE')

        logger.error(failure_msg)

        # Propagate the OS-level error number when the exception carries one.
        exit_code = getattr(e, 'errno', DEFAULT_FAILURE_CODE)
        _exit_processes(exit_code)
예제 #3
0
def train():
    """Run training (framework entry point or user entry point), syncing
    intermediate output to S3, and report the outcome.

    Starts a background sync of intermediate output, runs either the
    framework-provided entry point or the user's entry point, writes the
    SageMaker success/failure marker file, and finally joins the sync
    process and exits every process with the accumulated exit code.
    """
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        # TODO: iquintero - add error handling for ImportError to let the user know
        # if the framework module is not defined.
        env = sagemaker_containers.training_env()

        # TODO: [issue#144] There is a bug in the logic -
        # we need os.environ.get(_params.REGION_NAME_ENV)
        # in certain regions, but it is not going to be available unless
        # TrainingEnvironment has been initialized. It shouldn't be environment variable.
        region = os.environ.get('AWS_REGION', os.environ.get(_params.REGION_NAME_ENV))
        s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL")
        intermediate_sync = _intermediate_output.start_sync(
            env.sagemaker_s3_output(), region, endpoint_url=s3_endpoint_url)

        if env.framework_module:
            # framework_module is "<module>:<entry point name>".
            framework_name, entry_point_name = env.framework_module.split(':')

            framework = importlib.import_module(framework_name)

            # the logger is configured after importing the framework library, allowing the framework to
            # configure logging at import time.
            _logging.configure_logger(env.log_level)
            logger.info('Imported framework %s', framework_name)

            entrypoint = getattr(framework, entry_point_name)
            entrypoint()
        else:
            _logging.configure_logger(env.log_level)

            # Run the user's entry point directly, via MPI when enabled.
            mpi_enabled = env.additional_framework_parameters.get(_params.MPI_ENABLED)
            runner_type = _runner.RunnerType.MPI if mpi_enabled else _runner.RunnerType.Process

            entry_point.run(env.module_dir, env.user_entry_point, env.to_cmd_args(),
                            env.to_env_vars(), runner=runner_type)

        logger.info('Reporting training SUCCESS')

        _files.write_success_file()
    except _errors.ClientError as e:
        # Client (user-code/configuration) error: report the message verbatim.
        failure_message = str(e)
        _files.write_failure_file(failure_message)

        logger.error(failure_message)

        # NOTE: the redundant intermediate_sync.join() that used to live here
        # was removed — the finally block below always joins the sync process.
        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:
        # Framework/platform error: include the traceback in the failure file.
        failure_msg = 'framework error: \n%s\n%s' % (traceback.format_exc(), str(e))

        _files.write_failure_file(failure_msg)
        logger.error('Reporting training FAILURE')

        logger.error(failure_msg)

        # Propagate the OS-level error number when the exception carries one.
        exit_code = getattr(e, 'errno', DEFAULT_FAILURE_CODE)
    finally:
        # Always flush the remaining intermediate output, then exit all
        # processes with whatever exit code the try/except settled on.
        if intermediate_sync:
            intermediate_sync.join()

        _exit_processes(exit_code)
예제 #4
0
def test_write_failure_file():
    """write_failure_file must write the message to <output_dir>/failure."""
    message = 'This is a failure'
    expected_path = os.path.join(_env.output_dir, 'failure')
    _files.write_failure_file(message)
    # `open` is mocked at module level; verify target path, mode, and payload.
    open.assert_called_with(expected_path, 'w')
    open().write.assert_called_with(message)
예제 #5
0
def test_write_failure_file():
    """Writing a failure file must put the message at <output_dir>/failure."""
    msg = "This is a failure"
    path = os.path.join(_env.output_dir, "failure")
    _files.write_failure_file(msg)
    # `open` is mocked at module level; verify target path, mode, and payload.
    open.assert_called_with(path, "w")
    open().write.assert_called_with(msg)
예제 #6
0
def train():
    """Run training (framework entry point or user entry point), syncing
    intermediate output to S3, and report the outcome.

    Starts a background sync of intermediate output, runs either the
    framework-provided entry point or the user's entry point, writes the
    SageMaker success/failure marker file, and finally joins the sync
    process and exits every process with the accumulated exit code.
    """
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        env = sagemaker_containers.training_env()

        region = os.environ.get("AWS_REGION",
                                os.environ.get(_params.REGION_NAME_ENV))
        s3_endpoint_url = os.environ.get(_params.S3_ENDPOINT_URL, None)
        intermediate_sync = _intermediate_output.start_sync(
            env.sagemaker_s3_output(), region, endpoint_url=s3_endpoint_url)

        if env.framework_module:
            # framework_module is "<module>:<entry point name>".
            framework_name, entry_point_name = env.framework_module.split(":")

            framework = importlib.import_module(framework_name)

            # the logger is configured after importing the framework library, allowing
            # the framework to configure logging at import time.
            _logging.configure_logger(env.log_level)
            logger.info("Imported framework %s", framework_name)

            entrypoint = getattr(framework, entry_point_name)
            entrypoint()
        else:
            _logging.configure_logger(env.log_level)

            # Run the user's entry point directly, via MPI when enabled.
            mpi_enabled = env.additional_framework_parameters.get(
                _params.MPI_ENABLED)
            runner_type = _runner.RunnerType.MPI if mpi_enabled else _runner.RunnerType.Process

            entry_point.run(
                env.module_dir,
                env.user_entry_point,
                env.to_cmd_args(),
                env.to_env_vars(),
                runner=runner_type,
            )

        logger.info("Reporting training SUCCESS")

        _files.write_success_file()
    except _errors.ClientError as e:
        # Client (user-code/configuration) error: report the message verbatim.
        failure_message = str(e)
        _files.write_failure_file(failure_message)

        logger.error(failure_message)

        # NOTE: the redundant intermediate_sync.join() that used to live here
        # was removed — the finally block below always joins the sync process.
        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:  # pylint: disable=broad-except
        # Framework/platform error: include the traceback in the failure file.
        failure_msg = "framework error: \n%s\n%s" % (traceback.format_exc(),
                                                     str(e))

        _files.write_failure_file(failure_msg)
        logger.error("Reporting training FAILURE")

        logger.error(failure_msg)

        # Clamp the exception's errno to a valid process exit code.
        error_number = getattr(e, "errno", DEFAULT_FAILURE_CODE)
        exit_code = _get_valid_failure_exit_code(error_number)
    finally:
        # Always flush the remaining intermediate output, then exit all
        # processes with whatever exit code the try/except settled on.
        if intermediate_sync:
            intermediate_sync.join()

        _exit_processes(exit_code)