def test_large_files():
    os.environ["TRAINING_JOB_NAME"] = _timestamp()
    p = _intermediate_output.start_sync(bucket_uri, region)

    file_size = 1024 * 256 * 17  # 17MB
    file = os.path.join(intermediate_path, "file.npy")
    _generate_large_npy_file(file_size, file)

    # generate the second file twice: the first version is picked up by the sync,
    # then it is regenerated so the sync must re-upload the new content
    file_to_modify = os.path.join(intermediate_path, "file_to_modify.npy")
    _generate_large_npy_file(file_size, file_to_modify)
    content_to_assert = _generate_large_npy_file(file_size, file_to_modify)

    # writing the failure file signals the end of training so the sync can finish
    _files.write_failure_file("Failure!!")
    p.join()

    assert os.path.exists(file)
    assert os.path.exists(file_to_modify)

    key_prefix = os.path.join(os.environ.get("TRAINING_JOB_NAME"), "output", "intermediate")
    client = boto3.client("s3", region)
    assert _file_exists_in_s3(
        client, os.path.join(key_prefix, os.path.relpath(file, intermediate_path))
    )
    assert _file_exists_in_s3(
        client, os.path.join(key_prefix, os.path.relpath(file_to_modify, intermediate_path))
    )

    # check that the modified file was uploaded to S3 with its latest content
    s3 = boto3.resource("s3", region_name=region)
    key = os.path.join(key_prefix, os.path.relpath(file_to_modify, intermediate_path))
    modified_file = os.path.join(_env.output_dir, "modified_file.npy")
    s3.Bucket(bucket).download_file(key, modified_file)
    assert np.array_equal(np.load(modified_file), content_to_assert)
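# The test above relies on helpers and module-level fixtures (`bucket`,
# `intermediate_path`, `bucket_uri`, `region`) that are not shown here. A
# minimal sketch of what the two helpers might look like follows; the float32
# dtype, the random content, and the head_object-based existence check are
# assumptions for illustration, not the confirmed implementations (note that
# 1024 * 256 * 17 float32 values is ~17MB on disk, matching the comment above).
import botocore.exceptions
import numpy as np


def _generate_large_npy_file(size, path):
    # create `size` random float32 values, save them, and return the array
    # so the test can compare it against the copy downloaded from S3
    content = np.random.rand(size).astype(np.float32)
    np.save(path, content)
    return content


def _file_exists_in_s3(client, key):
    # head_object raises a ClientError with a 404 code when the key is absent;
    # `bucket` is assumed to be the module-level bucket name used by the test
    try:
        client.head_object(Bucket=bucket, Key=key)
        return True
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            return False
        raise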
def train():
    try:
        # TODO: iquintero - add error handling for ImportError to let the user know
        # if the framework module is not defined.
        env = sagemaker_containers.training_env()

        framework_name, entry_point_name = env.framework_module.split(':')

        try:
            framework = importlib.import_module(framework_name)
        except Exception:
            logger.error('Import failure loading %s. sys.path=%s', framework_name, sys.path)
            raise

        # the logger is configured after importing the framework library, allowing the
        # framework to configure logging at import time.
        _logging.configure_logger(env.log_level)
        logger.info('Imported framework %s', framework_name)

        entry_point = getattr(framework, entry_point_name)
        _modules.write_env_vars(env.to_env_vars())

        entry_point()

        logger.info('Reporting training SUCCESS')
        _files.write_success_file()
        _exit_processes(SUCCESS_CODE)
    except _errors.ClientError as e:
        failure_message = str(e)
        _files.write_failure_file(failure_message)
        logger.error(failure_message)

        _exit_processes(DEFAULT_FAILURE_CODE)
    except Exception as e:
        failure_msg = 'framework error: \n%s\n%s' % (traceback.format_exc(), str(e))

        _files.write_failure_file(failure_msg)
        logger.error('Reporting training FAILURE')
        logger.error(failure_msg)

        exit_code = getattr(e, 'errno', DEFAULT_FAILURE_CODE)
        _exit_processes(exit_code)
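# _exit_processes, SUCCESS_CODE, and DEFAULT_FAILURE_CODE are referenced in
# every version of train() but not defined in these snippets. A minimal sketch,
# assuming the conventional exit codes and that _exit_processes simply
# terminates the current process with the given status (the real implementation
# may also need to reap child processes spawned by the entry point):
import sys

SUCCESS_CODE = 0
DEFAULT_FAILURE_CODE = 1


def _exit_processes(exit_code):
    sys.exit(exit_code)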
def train():
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        # TODO: iquintero - add error handling for ImportError to let the user know
        # if the framework module is not defined.
        env = sagemaker_containers.training_env()

        # TODO: [issue#144] There is a bug in the logic -
        # we need os.environ.get(_params.REGION_NAME_ENV) in certain regions,
        # but it is not going to be available unless TrainingEnvironment
        # has been initialized. It shouldn't be an environment variable.
        region = os.environ.get('AWS_REGION', os.environ.get(_params.REGION_NAME_ENV))
        s3_endpoint_url = os.environ.get('S3_ENDPOINT_URL')
        intermediate_sync = _intermediate_output.start_sync(env.sagemaker_s3_output(), region,
                                                            endpoint_url=s3_endpoint_url)

        if env.framework_module:
            framework_name, entry_point_name = env.framework_module.split(':')

            framework = importlib.import_module(framework_name)

            # the logger is configured after importing the framework library, allowing the
            # framework to configure logging at import time.
            _logging.configure_logger(env.log_level)
            logger.info('Imported framework %s', framework_name)

            entrypoint = getattr(framework, entry_point_name)
            entrypoint()
        else:
            _logging.configure_logger(env.log_level)

            mpi_enabled = env.additional_framework_parameters.get(_params.MPI_ENABLED)
            runner_type = _runner.RunnerType.MPI if mpi_enabled else _runner.RunnerType.Process

            entry_point.run(env.module_dir, env.user_entry_point, env.to_cmd_args(),
                            env.to_env_vars(), runner=runner_type)

        logger.info('Reporting training SUCCESS')
        _files.write_success_file()
    except _errors.ClientError as e:
        failure_message = str(e)
        _files.write_failure_file(failure_message)
        logger.error(failure_message)

        if intermediate_sync:
            intermediate_sync.join()

        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:
        failure_msg = 'framework error: \n%s\n%s' % (traceback.format_exc(), str(e))
        _files.write_failure_file(failure_msg)
        logger.error('Reporting training FAILURE')
        logger.error(failure_msg)
        exit_code = getattr(e, 'errno', DEFAULT_FAILURE_CODE)
    finally:
        if intermediate_sync:
            intermediate_sync.join()
        _exit_processes(exit_code)
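# start_sync returns a handle that train() joins before exiting, both on
# ClientError and in the finally block, so no intermediate files are lost.
# A minimal sketch of the shape this assumes: a background process that
# watches the intermediate output directory and uploads changed files until
# training ends. `_sync_files` is a hypothetical worker name, not the
# library's confirmed internals.
import multiprocessing


def start_sync(s3_output_location, region, endpoint_url=None):
    p = multiprocessing.Process(
        target=_sync_files,  # hypothetical worker that polls and uploads
        args=(s3_output_location, region, endpoint_url),
    )
    p.start()
    return p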
def test_write_failure_file():
    file_path = os.path.join(_env.output_dir, 'failure')
    failure_msg = 'This is a failure'

    _files.write_failure_file(failure_msg)

    open.assert_called_with(file_path, 'w')
    open().write.assert_called_with(failure_msg)
def test_write_failure_file():
    file_path = os.path.join(_env.output_dir, "failure")
    failure_msg = "This is a failure"

    _files.write_failure_file(failure_msg)

    open.assert_called_with(file_path, "w")
    open().write.assert_called_with(failure_msg)
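# In both versions of test_write_failure_file, `open` resolves to a mock, not
# the builtin: the assert_called_with assertions only work if builtins.open has
# been patched for the duration of the test. A minimal sketch of the patching
# this assumes (the actual test module may apply it via a decorator instead):
import pytest
from unittest.mock import mock_open, patch


@pytest.fixture(autouse=True)
def patched_open():
    # patch the builtin so the bare name `open` inside the test is a MagicMock
    with patch("builtins.open", mock_open()) as m:
        yield m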
def train(): """Placeholder docstring""" intermediate_sync = None exit_code = SUCCESS_CODE try: env = sagemaker_containers.training_env() region = os.environ.get("AWS_REGION", os.environ.get(_params.REGION_NAME_ENV)) s3_endpoint_url = os.environ.get(_params.S3_ENDPOINT_URL, None) intermediate_sync = _intermediate_output.start_sync( env.sagemaker_s3_output(), region, endpoint_url=s3_endpoint_url) if env.framework_module: framework_name, entry_point_name = env.framework_module.split(":") framework = importlib.import_module(framework_name) # the logger is configured after importing the framework library, allowing # the framework to configure logging at import time. _logging.configure_logger(env.log_level) logger.info("Imported framework %s", framework_name) entrypoint = getattr(framework, entry_point_name) entrypoint() else: _logging.configure_logger(env.log_level) mpi_enabled = env.additional_framework_parameters.get( _params.MPI_ENABLED) runner_type = _runner.RunnerType.MPI if mpi_enabled else _runner.RunnerType.Process entry_point.run( env.module_dir, env.user_entry_point, env.to_cmd_args(), env.to_env_vars(), runner=runner_type, ) logger.info("Reporting training SUCCESS") _files.write_success_file() except _errors.ClientError as e: failure_message = str(e) _files.write_failure_file(failure_message) logger.error(failure_message) if intermediate_sync: intermediate_sync.join() exit_code = DEFAULT_FAILURE_CODE except Exception as e: # pylint: disable=broad-except failure_msg = "framework error: \n%s\n%s" % (traceback.format_exc(), str(e)) _files.write_failure_file(failure_msg) logger.error("Reporting training FAILURE") logger.error(failure_msg) error_number = getattr(e, "errno", DEFAULT_FAILURE_CODE) exit_code = _get_valid_failure_exit_code(error_number) finally: if intermediate_sync: intermediate_sync.join() _exit_processes(exit_code)