def move_to_destination(source, destination, job_name, sagemaker_session):
    """Move source to destination. Can handle uploading to S3.

    Args:
        source (str): root directory to move
        destination (str): file:// or s3:// URI that source will be moved to.
        job_name (str): SageMaker job name.
        sagemaker_session (sagemaker.Session): a sagemaker_session to interact with S3 if needed

    Returns:
        (str): destination URI
    """
    parsed_uri = urlparse(destination)
    if parsed_uri.scheme == "file":
        recursive_copy(source, parsed_uri.path)
        final_uri = destination
    elif parsed_uri.scheme == "s3":
        bucket = parsed_uri.netloc
        path = s3.s3_path_join(parsed_uri.path, job_name)
        final_uri = s3.s3_path_join("s3://", bucket, path)
        sagemaker_session.upload_data(source, bucket, path)
    else:
        raise ValueError("Invalid destination URI, must be s3:// or file://, got: %s" % destination)

    shutil.rmtree(source)
    return final_uri
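# A hedged usage sketch of move_to_destination: the bucket, local path, and job
# name below are placeholders, and the call assumes an initialized sagemaker.Session.
# Note that the local source directory is deleted after the move.
final_uri = move_to_destination(
    source="/tmp/artifacts",
    destination="s3://my-bucket/jobs",
    job_name="my-training-job",
    sagemaker_session=sagemaker.Session(),
)
# final_uri == "s3://my-bucket/jobs/my-training-job"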
def add_model(self, model_data_source, model_data_path=None):
    """Adds a model to the ``MultiDataModel``.

    It is done by uploading or copying the model_data_source artifact to the given
    S3 path model_data_path relative to model_data_prefix.

    Args:
        model_data_source: Valid local file path or S3 path of the trained model artifact
        model_data_path: S3 path where the trained model artifact
            should be uploaded relative to ``self.model_data_prefix`` path (default: None).
            If None, the artifact's original S3 key (for S3 sources) or base file name
            (for local sources) is used, relative to ``model_data_prefix``.

    Returns:
        str: S3 uri to the uploaded model artifact
    """
    parse_result = urlparse(model_data_source)

    # If the model source is an S3 path, copy the model artifact to the destination S3 path
    if parse_result.scheme == "s3":
        source_bucket, source_model_data_path = s3.parse_s3_url(model_data_source)
        copy_source = {"Bucket": source_bucket, "Key": source_model_data_path}

        if not model_data_path:
            model_data_path = source_model_data_path

        # Construct the destination path
        dst_url = s3.s3_path_join(self.model_data_prefix, model_data_path)
        destination_bucket, destination_model_data_path = s3.parse_s3_url(dst_url)

        # Copy the model artifact
        self.s3_client.copy(copy_source, destination_bucket, destination_model_data_path)
        return s3.s3_path_join("s3://", destination_bucket, destination_model_data_path)

    # If the model source is a local path, upload the local model artifact to the
    # destination S3 path
    if os.path.exists(model_data_source):
        destination_bucket, dst_prefix = s3.parse_s3_url(self.model_data_prefix)
        if model_data_path:
            dst_s3_uri = s3.s3_path_join(dst_prefix, model_data_path)
        else:
            dst_s3_uri = s3.s3_path_join(dst_prefix, os.path.basename(model_data_source))
        self.s3_client.upload_file(model_data_source, destination_bucket, dst_s3_uri)
        return s3.s3_path_join("s3://", destination_bucket, dst_s3_uri)

    # Raise error if the model source is of an unexpected type
    raise ValueError(
        "model_data_source must either be a valid local file path or S3 URI. "
        'Received: "{}"'.format(model_data_source)
    )
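# A hedged usage sketch of add_model, assuming mdm is an existing MultiDataModel
# whose model_data_prefix is "s3://my-bucket/models/"; all names are placeholders.
mdm.add_model("s3://my-bucket/training-output/model.tar.gz", "model-a.tar.gz")
# -> "s3://my-bucket/models/model-a.tar.gz"
mdm.add_model("/tmp/model-b.tar.gz")  # local artifact, uploaded under its base name
# -> "s3://my-bucket/models/model-b.tar.gz"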
def from_string(
    cls, constraints_file_string, kms_key=None, file_name=None, sagemaker_session=None
):
    """Generates a Constraints object from a string representation of the constraints JSON.

    Args:
        constraints_file_string (str): The contents of the constraints JSON file,
            as a string.
        kms_key (str): The kms key to be used to encrypt the file in S3.
        file_name (str): The file name to use when uploading to S3.
        sagemaker_session (sagemaker.session.Session): A SageMaker Session
            object, used for SageMaker interactions (default: None). If not
            specified, one is created using the default AWS configuration chain.

    Returns:
        sagemaker.model_monitor.Constraints: The instance of Constraints generated
            from the string.
    """
    sagemaker_session = sagemaker_session or Session()
    file_name = file_name or "constraints.json"
    desired_s3_uri = s3.s3_path_join(
        "s3://", sagemaker_session.default_bucket(), "monitoring", str(uuid.uuid4()), file_name
    )
    s3_uri = s3.S3Uploader.upload_string_as_file_body(
        body=constraints_file_string,
        desired_s3_uri=desired_s3_uri,
        kms_key=kms_key,
        sagemaker_session=sagemaker_session,
    )

    return Constraints.from_s3_uri(
        constraints_file_s3_uri=s3_uri, kms_key=kms_key, sagemaker_session=sagemaker_session
    )
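# A hedged usage sketch of from_string; the JSON body is a minimal placeholder,
# not a complete constraints document.
constraints = Constraints.from_string(
    constraints_file_string='{"version": 0.0, "features": []}',
    file_name="constraints.json",
)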
def prepare_container_def(self, instance_type=None, accelerator_type=None):
    """Prepare the container definition.

    Args:
        instance_type: Instance type of the container, used to choose between
            CPU and GPU images when no image URI is set.
        accelerator_type: Accelerator type, if applicable.

    Returns:
        A container definition for deploying a ``Model`` to an ``Endpoint``.
    """
    if self.image_uri is None and instance_type is None:
        raise ValueError(
            "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
        )

    image_uri = self._get_image_uri(instance_type, accelerator_type)
    env = self._get_container_env()

    if self.entry_point:
        key_prefix = sagemaker.fw_utils.model_code_key_prefix(
            self.key_prefix, self.name, image_uri
        )

        bucket = self.bucket or self.sagemaker_session.default_bucket()
        model_data = s3.s3_path_join("s3://", bucket, key_prefix, "model.tar.gz")

        sagemaker.utils.repack_model(
            self.entry_point,
            self.source_dir,
            self.dependencies,
            self.model_data,
            model_data,
            self.sagemaker_session,
            kms_key=self.model_kms_key,
        )
    else:
        model_data = self.model_data

    return sagemaker.container_def(image_uri, model_data, env)
def _upload_code(self, code, kms_key=None):
    """Uploads a code file or directory specified as a string and returns the S3 URI.

    Args:
        code (str): A file or directory to be uploaded to S3.
        kms_key (str): The ARN of the KMS key that is used to encrypt the
            user code file (default: None).

    Returns:
        str: The S3 URI of the uploaded file or directory.
    """
    desired_s3_uri = s3.s3_path_join(
        "s3://",
        self.sagemaker_session.default_bucket(),
        self._current_job_name,
        "input",
        self._CODE_CONTAINER_INPUT_NAME,
    )
    return s3.S3Uploader.upload(
        local_path=code,
        desired_s3_uri=desired_s3_uri,
        sagemaker_session=self.sagemaker_session,
        kms_key=kms_key,
    )
def _default_s3_path(self, directory, mpi=False):
    """Returns the default S3 or local path for the given directory.

    In local mode with local code, a shared local path is returned; for MPI jobs,
    the model directory is returned; otherwise, a path under ``self.output_path``
    scoped to the current job name is built.
    """
    local_code = utils.get_config_value("local.local_code", self.sagemaker_session.config)
    if self.sagemaker_session.local_mode and local_code:
        return "/opt/ml/shared/{}".format(directory)
    if mpi:
        return "/opt/ml/model"
    if self._current_job_name:
        return s3.s3_path_join(self.output_path, self._current_job_name, directory)
    return None
def _normalize_inputs(self, inputs=None, kms_key=None):
    """Ensures that all the ``ProcessingInput`` objects have names and S3 URIs.

    Args:
        inputs (list[sagemaker.processing.ProcessingInput]): A list of ``ProcessingInput``
            objects to be normalized (default: None). If not specified,
            an empty list is returned.
        kms_key (str): The ARN of the KMS key that is used to encrypt the
            user code file (default: None).

    Returns:
        list[sagemaker.processing.ProcessingInput]: The list of normalized
            ``ProcessingInput`` objects.

    Raises:
        TypeError: if the inputs are not ``ProcessingInput`` objects.
    """
    # Initialize a list of normalized ProcessingInput objects.
    normalized_inputs = []
    if inputs is not None:
        # Iterate through the provided list of inputs.
        for count, file_input in enumerate(inputs, 1):
            if not isinstance(file_input, ProcessingInput):
                raise TypeError("Your inputs must be provided as ProcessingInput objects.")
            # Generate a name for the ProcessingInput if it doesn't have one.
            if file_input.input_name is None:
                file_input.input_name = "input-{}".format(count)

            if isinstance(file_input.source, Properties) or file_input.dataset_definition:
                normalized_inputs.append(file_input)
                continue

            # If the source is a local path, upload it to S3
            # and save the S3 uri in the ProcessingInput source.
            parse_result = urlparse(file_input.s3_input.s3_uri)
            if parse_result.scheme != "s3":
                desired_s3_uri = s3.s3_path_join(
                    "s3://",
                    self.sagemaker_session.default_bucket(),
                    self._current_job_name,
                    "input",
                    file_input.input_name,
                )
                s3_uri = s3.S3Uploader.upload(
                    local_path=file_input.s3_input.s3_uri,
                    desired_s3_uri=desired_s3_uri,
                    sagemaker_session=self.sagemaker_session,
                    kms_key=kms_key,
                )
                file_input.s3_input.s3_uri = s3_uri
            normalized_inputs.append(file_input)
    return normalized_inputs
def test_path_join():
    test_cases = (
        ("foo/bar", ("foo", "bar")),
        ("foo/bar", ("foo/", "bar")),
        ("foo/bar", ("/foo/", "bar")),
        ("s3://foo/bar", ("s3://", "foo", "bar")),
        ("s3://foo/bar", ("s3://", "/foo", "bar")),
        ("s3://foo/bar", ("s3://foo", "bar")),
    )

    for expected, args in test_cases:
        assert expected == s3.s3_path_join(*args)
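# A minimal sketch of a join function consistent with the test cases above;
# the SDK's actual s3_path_join implementation may differ in edge cases.
def s3_path_join(*args):
    if args[0].startswith("s3://"):
        # Preserve the scheme and treat the remainder as the first fragment.
        prefix, parts = "s3://", [args[0][len("s3://"):], *args[1:]]
    else:
        prefix, parts = "", list(args)
    # Trim stray slashes, drop empty fragments, and join with single slashes.
    return prefix + "/".join(
        str(part).strip("/") for part in parts if str(part).strip("/")
    )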
def prepare_container_def(
    self, instance_type=None, accelerator_type=None, serverless_inference_config=None
):
    """Prepare the container definition.

    Args:
        instance_type: Instance type of the container.
        accelerator_type: Accelerator type, if applicable.
        serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
            Specifies configuration related to serverless endpoint. Instance type is
            not provided in serverless inference. So this is used to find image URIs.

    Returns:
        A container definition for deploying a ``Model`` to an ``Endpoint``.
    """
    if not self.image_uri:
        if instance_type is None and serverless_inference_config is None:
            raise ValueError(
                "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
            )

    image_uri = self._get_image_uri(
        instance_type, accelerator_type, serverless_inference_config=serverless_inference_config
    )
    env = self._get_container_env()

    # If self.model_data is a pipeline variable, the model is not yet there,
    # so defer repacking to later, during pipeline execution.
    if self.entry_point and not is_pipeline_variable(self.model_data):
        key_prefix = sagemaker.fw_utils.model_code_key_prefix(
            self.key_prefix, self.name, image_uri
        )

        bucket = self.bucket or self.sagemaker_session.default_bucket()
        model_data = s3.s3_path_join("s3://", bucket, key_prefix, "model.tar.gz")

        sagemaker.utils.repack_model(
            self.entry_point,
            self.source_dir,
            self.dependencies,
            self.model_data,
            model_data,
            self.sagemaker_session,
            kms_key=self.model_kms_key,
        )
    else:
        model_data = self.model_data

    return sagemaker.container_def(image_uri, model_data, env)
def _get_s3_base_uri_for_monitoring_analysis_config(self) -> str:
    """Generate the S3 base uri for the monitoring schedule analysis config.

    Returns:
        str: The S3 base uri of the monitoring schedule analysis config
    """
    s3_analysis_config_output_path = (
        self.clarify_check_config.data_config.s3_analysis_config_output_path
    )
    monitoring_cfg_base_name = f"{_BIAS_MONITORING_CFG_BASE_NAME}-configuration"
    if isinstance(self.clarify_check_config, ModelExplainabilityCheckConfig):
        monitoring_cfg_base_name = f"{_EXPLAINABILITY_MONITORING_CFG_BASE_NAME}-configuration"

    if s3_analysis_config_output_path:
        return s3.s3_path_join(
            s3_analysis_config_output_path,
            monitoring_cfg_base_name,
        )
    return s3.s3_path_join(
        "s3://",
        self._model_monitor.sagemaker_session.default_bucket(),
        _MODEL_MONITOR_S3_PATH,
        monitoring_cfg_base_name,
    )
def __init__(
    self,
    enable_capture,
    sampling_percentage=20,
    destination_s3_uri=None,
    kms_key_id=None,
    capture_options=None,
    csv_content_types=None,
    json_content_types=None,
    sagemaker_session=None,
):
    """Initialize a DataCaptureConfig object for capturing data from Amazon SageMaker Endpoints.

    Args:
        enable_capture (bool): Required. Whether data capture should be enabled or not.
        sampling_percentage (int): Optional. Default=20. The percentage of data to sample.
            Must be between 0 and 100.
        destination_s3_uri (str): Optional. Defaults to
            "s3://<default-session-bucket>/model-monitor/data-capture".
        kms_key_id (str): Optional. Default=None. The kms key to use when writing to S3.
        capture_options ([str]): Optional. Must be a list containing any combination of the
            following values: "REQUEST", "RESPONSE". Default=["REQUEST", "RESPONSE"].
            Denotes which data to capture between request and response.
        csv_content_types ([str]): Optional. Default=["text/csv"].
        json_content_types ([str]): Optional. Default=["application/json"].
        sagemaker_session (sagemaker.session.Session): A SageMaker Session
            object, used for SageMaker interactions (default: None). If not
            specified, one is created using the default AWS configuration chain.
    """
    self.enable_capture = enable_capture
    self.sampling_percentage = sampling_percentage
    self.destination_s3_uri = destination_s3_uri
    if self.destination_s3_uri is None:
        sagemaker_session = sagemaker_session or Session()
        self.destination_s3_uri = s3.s3_path_join(
            "s3://",
            sagemaker_session.default_bucket(),
            _MODEL_MONITOR_S3_PATH,
            _DATA_CAPTURE_S3_PATH,
        )

    self.kms_key_id = kms_key_id
    self.capture_options = capture_options or ["REQUEST", "RESPONSE"]
    self.csv_content_types = csv_content_types or ["text/csv"]
    self.json_content_types = json_content_types or ["application/json"]
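# A hedged construction example; the bucket name is a placeholder and the
# remaining arguments fall back to the defaults documented above.
data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=50,  # sample half of all requests/responses
    destination_s3_uri="s3://my-bucket/endpoint-data-capture",
)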
def _normalize_outputs(self, outputs=None):
    """Ensures that all the outputs are ``ProcessingOutput`` objects with names and S3 URIs.

    Args:
        outputs (list[sagemaker.processing.ProcessingOutput]): A list of
            ``ProcessingOutput`` objects to be normalized (default: None).
            If not specified, an empty list is returned.

    Returns:
        list[sagemaker.processing.ProcessingOutput]: The list of normalized
            ``ProcessingOutput`` objects.

    Raises:
        TypeError: if the outputs are not ``ProcessingOutput`` objects.
    """
    # Initialize a list of normalized ProcessingOutput objects.
    normalized_outputs = []
    if outputs is not None:
        # Iterate through the provided list of outputs.
        for count, output in enumerate(outputs, 1):
            if not isinstance(output, ProcessingOutput):
                raise TypeError("Your outputs must be provided as ProcessingOutput objects.")
            # Generate a name for the ProcessingOutput if it doesn't have one.
            if output.output_name is None:
                output.output_name = "output-{}".format(count)
            # If the output's destination is a workflow expression, do no normalization.
            if isinstance(output.destination, Expression):
                normalized_outputs.append(output)
                continue
            # If the output's destination is not an s3_uri, create one.
            parse_result = urlparse(output.destination)
            if parse_result.scheme != "s3":
                s3_uri = s3.s3_path_join(
                    "s3://",
                    self.sagemaker_session.default_bucket(),
                    self._current_job_name,
                    "output",
                    output.output_name,
                )
                output.destination = s3_uri
            normalized_outputs.append(output)
    return normalized_outputs
def _generate_baseline_output(self):
    """Generates a ProcessingOutput object.

    Returns:
        sagemaker.processing.ProcessingOutput: The normalized ProcessingOutput object.
    """
    s3_uri = self.quality_check_config.output_s3_uri or s3.s3_path_join(
        "s3://",
        self._model_monitor.sagemaker_session.default_bucket(),
        _MODEL_MONITOR_S3_PATH,
        _BASELINING_S3_PATH,
        self._model_monitor.latest_baselining_job_name,
        _RESULTS_S3_PATH,
    )
    return ProcessingOutput(
        source=str(pathlib.PurePosixPath(_CONTAINER_BASE_PATH, _CONTAINER_OUTPUT_PATH)),
        destination=s3_uri,
        output_name=_DEFAULT_OUTPUT_NAME,
    )
def _create_args(
    self, role_arn: str, description: str, parallelism_config: ParallelismConfiguration
):
    """Constructs the keyword argument dict for a create_pipeline call.

    Args:
        role_arn (str): The role arn that is assumed by pipelines to create step artifacts.
        description (str): A description of the pipeline.
        parallelism_config (Optional[ParallelismConfiguration]): Parallelism configuration
            that is applied to each of the executions of the pipeline. It takes precedence
            over the parallelism configuration of the parent pipeline.

    Returns:
        A keyword argument dict for calling create_pipeline.
    """
    pipeline_definition = self.definition()
    kwargs = dict(
        PipelineName=self.name,
        RoleArn=role_arn,
    )

    # If the pipeline definition is large (100 KB or more), upload it to the
    # default S3 bucket and pass a PipelineDefinitionS3Location in the request
    # instead of the inline definition.
    if len(pipeline_definition.encode("utf-8")) < 1024 * 100:
        kwargs["PipelineDefinition"] = pipeline_definition
    else:
        desired_s3_uri = s3.s3_path_join(
            "s3://", self.sagemaker_session.default_bucket(), self.name
        )
        s3.S3Uploader.upload_string_as_file_body(
            body=pipeline_definition,
            desired_s3_uri=desired_s3_uri,
            sagemaker_session=self.sagemaker_session,
        )
        kwargs["PipelineDefinitionS3Location"] = {
            "Bucket": self.sagemaker_session.default_bucket(),
            "ObjectKey": self.name,
        }

    update_args(
        kwargs, PipelineDescription=description, ParallelismConfiguration=parallelism_config
    )
    return kwargs
def prepare_container_def(
    self, instance_type=None, accelerator_type=None, serverless_inference_config=None
):
    """Prepare the container definition.

    Args:
        instance_type: Instance type of the container.
        accelerator_type: Accelerator type, if applicable.
        serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
            Specifies configuration related to serverless endpoint. Instance type is
            not provided in serverless inference. So this is used to find image URIs.

    Returns:
        A container definition for deploying a ``Model`` to an ``Endpoint``.
    """
    if not self.image_uri:
        if instance_type is None and serverless_inference_config is None:
            raise ValueError(
                "Must supply either an instance type (for choosing CPU vs GPU) or an image URI."
            )

    image_uri = self._get_image_uri(
        instance_type, accelerator_type, serverless_inference_config=serverless_inference_config
    )
    env = self._get_container_env()

    if self.entry_point and not is_pipeline_variable(self.model_data):
        key_prefix = sagemaker.fw_utils.model_code_key_prefix(
            self.key_prefix, self.name, image_uri
        )

        bucket = self.bucket or self.sagemaker_session.default_bucket()
        model_data = s3.s3_path_join("s3://", bucket, key_prefix, "model.tar.gz")

        sagemaker.utils.repack_model(
            self.entry_point,
            self.source_dir,
            self.dependencies,
            self.model_data,
            model_data,
            self.sagemaker_session,
            kms_key=self.model_kms_key,
        )
    elif self.entry_point and is_pipeline_variable(self.model_data):
        # The model artifact is not yet available, so defer repacking
        # to later, during pipeline execution.
        if isinstance(self.sagemaker_session, PipelineSession):
            self.sagemaker_session.context.need_runtime_repack.add(id(self))
        else:
            # TODO: link the doc in the warning once ready
            logging.warning(
                "The model_data is a Pipeline variable of type %s, "
                "which should be used under `PipelineSession` and "
                "leverage `ModelStep` to create or register model. "
                "Otherwise some functionalities e.g. "
                "runtime repack may be missing",
                type(self.model_data),
            )
        model_data = self.model_data
    else:
        model_data = self.model_data

    return sagemaker.container_def(image_uri, model_data, env)