Example #1
    def add_model(self, model_data_source, model_data_path=None):
        """Adds a model to the ``MultiDataModel``.

        The ``model_data_source`` artifact is uploaded (for a local path) or copied
        (for an S3 path) to the S3 path ``model_data_path``, relative to ``model_data_prefix``.

        Args:
            model_data_source: Valid local file path or S3 path of the trained model artifact
            model_data_path: S3 path, relative to ``self.model_data_prefix``, where the trained
                model artifact should be stored (default: None). If None, the artifact is stored
                under its source S3 key (for an S3 source) or its file name (for a local source).

        Returns:
            str: S3 URI of the uploaded or copied model artifact
        """
        parse_result = urlparse(model_data_source)

        # If the model source is an S3 path, copy the model artifact to the destination S3 path
        if parse_result.scheme == "s3":
            source_bucket, source_model_data_path = s3.parse_s3_url(
                model_data_source)
            copy_source = {
                "Bucket": source_bucket,
                "Key": source_model_data_path
            }

            if not model_data_path:
                model_data_path = source_model_data_path

            # Construct the destination path
            dst_url = s3.s3_path_join(self.model_data_prefix, model_data_path)
            destination_bucket, destination_model_data_path = s3.parse_s3_url(
                dst_url)

            # Copy the model artifact
            self.s3_client.copy(copy_source, destination_bucket,
                                destination_model_data_path)
            return s3.s3_path_join("s3://", destination_bucket,
                                   destination_model_data_path)

        # If the model source is a local path, upload the local model artifact to the destination
        # S3 path
        if os.path.exists(model_data_source):
            destination_bucket, dst_prefix = s3.parse_s3_url(
                self.model_data_prefix)
            if model_data_path:
                dst_s3_uri = s3.s3_path_join(dst_prefix, model_data_path)
            else:
                dst_s3_uri = s3.s3_path_join(
                    dst_prefix, os.path.basename(model_data_source))
            self.s3_client.upload_file(model_data_source, destination_bucket,
                                       dst_s3_uri)
            # return upload_path
            return s3.s3_path_join("s3://", destination_bucket, dst_s3_uri)

        # Raise error if the model source is of an unexpected type
        raise ValueError(
            "model_data_source must either be a valid local file path or an S3 URI. "
            'Received: "{}"'.format(model_data_source))
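For context, a minimal usage sketch of ``add_model`` (the bucket, paths, and image URI below are hypothetical placeholders, not values from this example):

# Hypothetical usage sketch of MultiDataModel.add_model; bucket, paths, and
# image URI are placeholders.
from sagemaker.multidatamodel import MultiDataModel

mdm = MultiDataModel(
    name="my-multi-model",
    model_data_prefix="s3://my-bucket/models/",
    image_uri="<inference-image-uri>",
    role="SageMakerRole",
)

# Copy an artifact that already lives in S3 to the shared prefix.
mdm.add_model("s3://my-bucket/staging/model-a.tar.gz", model_data_path="model-a.tar.gz")

# Upload a local artifact; its file name is used because model_data_path is omitted.
mdm.add_model("./model-b.tar.gz")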
Example #2
    def _wait_for_output(
        self,
        output_path,
        waiter_config,
    ):
        """Check the Amazon S3 output path for the output.

        Periodically check the Amazon S3 output path for the async inference result.
        Time out automatically once the maximum number of waiter attempts is reached.
        """
        bucket, key = parse_s3_url(output_path)
        s3_waiter = self.s3_client.get_waiter("object_exists")
        try:
            s3_waiter.wait(Bucket=bucket,
                           Key=key,
                           WaiterConfig=waiter_config._to_request_dict())
        except WaiterError:
            raise PollingTimeoutError(
                message="Inference could still be running",
                output_path=output_path,
                seconds=waiter_config.delay * waiter_config.max_attempts,
            )

        s3_object = self.s3_client.get_object(Bucket=bucket, Key=key)
        result = self.predictor._handle_response(response=s3_object)
        return result
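The polling above is driven by an S3 "object_exists" waiter. A small illustrative sketch, assuming ``WaiterConfig`` from ``sagemaker.async_inference.waiter_config`` (bucket and key are placeholders):

# Illustrative only: poll S3 until the async output object appears.
import boto3
from sagemaker.async_inference.waiter_config import WaiterConfig

s3_client = boto3.client("s3")
waiter_config = WaiterConfig(max_attempts=15, delay=10)  # poll every 10s, up to 15 times

waiter = s3_client.get_waiter("object_exists")
waiter.wait(
    Bucket="my-bucket",
    Key="async-results/output.out",
    WaiterConfig=waiter_config._to_request_dict(),
)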
Example #3
    def _upload_data_to_s3(
        self,
        data,
        input_path=None,
    ):
        """Upload request data to Amazon S3 for users"""
        if input_path:
            bucket, key = parse_s3_url(input_path)
        else:
            my_uuid = str(uuid.uuid4())
            timestamp = sagemaker_timestamp()
            bucket = self.sagemaker_session.default_bucket()
            key = "async-endpoint-inputs/{}/{}-{}".format(
                name_from_base(self.name, short=True),
                timestamp,
                my_uuid,
            )

        data = self.serializer.serialize(data)
        self.s3_client.put_object(Body=data,
                                  Bucket=bucket,
                                  Key=key,
                                  ContentType=self.serializer.CONTENT_TYPE)
        input_path = input_path or "s3://{}/{}".format(
            self.sagemaker_session.default_bucket(), key)

        return input_path
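When ``input_path`` is omitted, the upload key follows the pattern used above. A sketch of how that key is composed, assuming the ``sagemaker.utils`` helpers and a placeholder endpoint name:

# Illustrative: shape of the auto-generated S3 key for async inference inputs.
import uuid

from sagemaker.utils import name_from_base, sagemaker_timestamp

key = "async-endpoint-inputs/{}/{}-{}".format(
    name_from_base("my-endpoint", short=True),  # placeholder endpoint name
    sagemaker_timestamp(),
    str(uuid.uuid4()),
)
print(key)  # async-endpoint-inputs/my-endpoint-<suffix>/<timestamp>-<uuid>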
Example #4
def test_transform_pytorch_vpc_custom_model_bucket(
    sagemaker_session,
    pytorch_inference_latest_version,
    pytorch_inference_latest_py_version,
    cpu_instance_type,
    custom_bucket_name,
):
    data_dir = os.path.join(DATA_DIR, "pytorch_mnist")

    ec2_client = sagemaker_session.boto_session.client("ec2")
    subnet_ids, security_group_id = get_or_create_vpc_resources(ec2_client)

    model_data = sagemaker_session.upload_data(
        path=os.path.join(data_dir, "model.tar.gz"),
        bucket=custom_bucket_name,
        key_prefix="integ-test-data/pytorch_mnist/model",
    )

    model = PyTorchModel(
        model_data=model_data,
        entry_point=os.path.join(data_dir, "mnist.py"),
        role="SageMakerRole",
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        sagemaker_session=sagemaker_session,
        vpc_config={
            "Subnets": subnet_ids,
            "SecurityGroupIds": [security_group_id]
        },
        code_location="s3://{}".format(custom_bucket_name),
    )

    transform_input = sagemaker_session.upload_data(
        path=os.path.join(data_dir, "transform", "data.npy"),
        key_prefix="integ-test-data/pytorch_mnist/transform",
    )

    transformer = model.transformer(1, cpu_instance_type)
    transformer.transform(
        transform_input,
        content_type="application/x-npy",
        job_name=unique_name_from_base("test-transform-vpc"),
    )

    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        assert set(subnet_ids) == set(model_desc["VpcConfig"]["Subnets"])
        assert [security_group_id] == model_desc["VpcConfig"]["SecurityGroupIds"]

        model_bucket, _ = s3.parse_s3_url(
            model_desc["PrimaryContainer"]["ModelDataUrl"])
        assert custom_bucket_name == model_bucket
Example #5
def get_full_hyperparameters(base_hyperparameters: dict, job_name: str,
                             model_artifacts_uri: str) -> dict:

    bucket, key = parse_s3_url(model_artifacts_uri)
    return {
        **base_hyperparameters,
        "sagemaker_job_name": job_name,
        "model-artifact-bucket": bucket,
        "model-artifact-key": key,
    }
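An illustrative call, with made-up values, showing how the artifact URI is split into bucket and key:

# Example input/output (values are hypothetical).
hyperparameters = get_full_hyperparameters(
    base_hyperparameters={"epochs": "10"},
    job_name="my-training-job",
    model_artifacts_uri="s3://my-bucket/artifacts/model.tar.gz",
)
# {
#     "epochs": "10",
#     "sagemaker_job_name": "my-training-job",
#     "model-artifact-bucket": "my-bucket",
#     "model-artifact-key": "artifacts/model.tar.gz",
# }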
Example #6
def parse_s3_url(url):
    """Calls the method with the same name in the s3 module.

    :func:~sagemaker.s3.parse_s3_url

    Args:
        url: A URL, expected with an s3 scheme.

    Returns: The return value of s3.parse_s3_url, which is a tuple containing:
        str: S3 bucket name str: S3 key
    """
    return s3.parse_s3_url(url)
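The behaviour of the underlying ``sagemaker.s3.parse_s3_url`` call, for reference (bucket and key are placeholders):

>>> from sagemaker import s3
>>> s3.parse_s3_url("s3://my-bucket/prefix/model.tar.gz")
('my-bucket', 'prefix/model.tar.gz')

URLs without the s3 scheme raise a ValueError, as exercised by the test in Example #12 below.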
Example #7
def is_jumpstart_model_uri(uri: Optional[str]) -> bool:
    """Returns True if URI corresponds to a JumpStart-hosted model.

    Args:
        uri (Optional[str]): URI for the inference or training job artifact.
    """

    bucket = None
    if urlparse(uri).scheme == "s3":
        bucket, _ = parse_s3_url(uri)

    return bucket in constants.JUMPSTART_BUCKET_NAME_SET
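A hedged sketch of the resulting behaviour (bucket names are placeholders; only s3:// URIs whose bucket is in ``constants.JUMPSTART_BUCKET_NAME_SET`` return True):

# Illustrative: non-S3 URIs and S3 URIs in other buckets return False.
is_jumpstart_model_uri("https://example.com/model.tar.gz")     # False (not an s3 URI)
is_jumpstart_model_uri("s3://my-private-bucket/model.tar.gz")  # False (bucket not in the JumpStart set)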
Example #8
    def list_models(self):
        """Generates and returns relative paths to model archives stored at model_data_prefix
        S3 location.

        Yields: Paths to model archives relative to model_data_prefix path.
        """
        bucket, url_prefix = s3.parse_s3_url(self.model_data_prefix)
        file_keys = self.sagemaker_session.list_s3_files(bucket=bucket, key_prefix=url_prefix)
        for file_key in file_keys:
            # Return the model paths relative to the model_data_prefix
            # Ex: "a/b/c.tar.gz" -> "b/c.tar.gz" where url_prefix = "a/"
            yield file_key.replace(url_prefix, "")
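A hypothetical iteration over the generator ("mdm" is the MultiDataModel instance from the sketch under Example #1; file names are placeholders):

# Hypothetical: iterate over model archives stored under model_data_prefix.
for relative_path in mdm.list_models():
    print(relative_path)  # e.g. "model-a.tar.gz", "team-x/model-b.tar.gz"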
Example #9
    def _get_result_from_s3(
        self,
        output_path,
    ):
        """Get inference result from the output Amazon S3 path"""
        bucket, key = parse_s3_url(output_path)
        try:
            response = self.predictor_async.s3_client.get_object(Bucket=bucket,
                                                                 Key=key)
            return self.predictor_async.predictor._handle_response(response)
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchKey":
                raise ObjectNotExistedError(
                    message="Inference could still be running",
                    output_path=output_path,
                )
            raise UnexpectedClientError(
                message=ex.response["Error"]["Message"])
Example #10
    def _initialize_job(
        self, monitored_metrics, dataset, num_samples, quantiles, job_name
    ):
        if self.sagemaker_session.local_mode:
            # TODO implement local mode support
            raise NotImplementedError(
                "Local mode has not yet been implemented."
            )

        # set metrics to be monitored
        self.metric_definitions = make_metrics(monitored_metrics)

        self._hyperparameters.update(
            DATASET=dataset,  # pass dataset as hyper-parameter
            NUM_SAMPLES=num_samples,
            QUANTILES=str(quantiles),
        )

        # needed to set default output and code location properly
        if self.output_path is None:
            default_bucket = self.sagemaker_session.default_bucket()
            self.output_path = f"s3://{default_bucket}"

        if self.code_location is None:
            code_bucket, _ = parse_s3_url(self.output_path)
            self.code_location = (
                f"s3://{code_bucket}"  # for consistency with sagemaker API
            )

        locations = Locations(
            job_name=job_name,
            output_path=self.output_path,
            code_location=self.code_location,
        )

        logger.info(f"OUTPUT_PATH: {locations.job_output_path}")
        logger.info(f"CODE_LOCATION: {locations.job_code_location}")

        return locations
Example #11
    def __init__(self,
                 model_data,
                 image_uri,
                 role,
                 entry_point,
                 source_dir=None,
                 predictor_cls=None,
                 env=None,
                 name=None,
                 container_log_level=logging.INFO,
                 code_location=None,
                 sagemaker_session=None,
                 dependencies=None,
                 git_config=None,
                 **kwargs):
        """Initialize a ``FrameworkModel``.

        Args:
            model_data (str): The S3 location of a SageMaker model data
                ``.tar.gz`` file.
            image_uri (str): A Docker image URI.
            role (str): An IAM role name or ARN for SageMaker to access AWS
                resources on your behalf.
            entry_point (str): Path (absolute or relative) to the Python source
                file which should be executed as the entry point to model
                hosting. If ``source_dir`` is specified, then ``entry_point``
                must point to a file located at the root of ``source_dir``.
                If 'git_config' is provided, 'entry_point' should be
                a relative location to the Python source file in the Git repo.

                Example:
                    With the following GitHub repo directory structure:

                    >>> |----- README.md
                    >>> |----- src
                    >>>         |----- inference.py
                    >>>         |----- test.py

                    You can assign entry_point='src/inference.py'.
            source_dir (str): Path (absolute, relative or an S3 URI) to a directory
                with any other training source code dependencies aside from the entry
                point file (default: None). If ``source_dir`` is an S3 URI, it must
                point to a tar.gz file. Structure within this directory is preserved
                when training on Amazon SageMaker. If 'git_config' is provided,
                'source_dir' should be a relative location to a directory in the Git repo.
                If the directory points to S3, no code will be uploaded and the S3 location
                will be used instead.

                .. admonition:: Example

                    With the following GitHub repo directory structure:

                    >>> |----- README.md
                    >>> |----- src
                    >>>         |----- inference.py
                    >>>         |----- test.py

                    You can assign entry_point='inference.py', source_dir='src'.
            predictor_cls (callable[string, sagemaker.session.Session]): A
                function to call to create a predictor (default: None). If not
                None, ``deploy`` will return the result of invoking this
                function on the created endpoint name.
            env (dict[str, str]): Environment variables to run with ``image_uri``
                when hosted in SageMaker (default: None).
            name (str): The model name. If None, a default model name will be
                selected on each ``deploy``.
            container_log_level (int): Log level to use within the container
                (default: logging.INFO). Valid values are defined in the Python
                logging module.
            code_location (str): S3 location (URI) where custom code will be
                uploaded (default: None). If not specified, the default bucket
                created by ``sagemaker.session.Session`` is used.
            sagemaker_session (sagemaker.session.Session): A SageMaker Session
                object, used for SageMaker interactions (default: None). If not
                specified, one is created using the default AWS configuration
                chain.
            dependencies (list[str]): A list of paths to directories (absolute
                or relative) with any additional libraries that will be exported
                to the container (default: []). The library folders will be
                copied to SageMaker in the same folder where the entrypoint is
                copied. If 'git_config' is provided, 'dependencies' should be a
                list of relative locations to directories with any additional
                libraries needed in the Git repo. If the ``source_dir`` points
                to S3, code will be uploaded and the S3 location will be used
                instead.

                .. admonition:: Example

                    The following call

                    >>> Model(entry_point='inference.py',
                    ...       dependencies=['my/libs/common', 'virtual-env'])

                    results in the following inside the container:

                    >>> $ ls

                    >>> opt/ml/code
                    >>>     |------ inference.py
                    >>>     |------ common
                    >>>     |------ virtual-env

                This is not supported with "local code" in Local Mode.
            git_config (dict[str, str]): Git configurations used for cloning
                files, including ``repo``, ``branch``, ``commit``,
                ``2FA_enabled``, ``username``, ``password`` and ``token``. The
                ``repo`` field is required. All other fields are optional.
                ``repo`` specifies the Git repository where your training script
                is stored. If you don't provide ``branch``, the default value
                'master' is used. If you don't provide ``commit``, the latest
                commit in the specified branch is used.

                .. admonition:: Example

                    The following config:

                    >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git',
                    >>>               'branch': 'test-branch-git-config',
                    >>>               'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'}

                    results in cloning the repo specified in 'repo', then
                    checking out the 'test-branch-git-config' branch, and then
                    checking out the specified commit.

                ``2FA_enabled``, ``username``, ``password`` and ``token`` are
                used for authentication. For GitHub (or other Git) accounts, set
                ``2FA_enabled`` to 'True' if two-factor authentication is
                enabled for the account, otherwise set it to 'False'. If you do
                not provide a value for ``2FA_enabled``, a default value of
                'False' is used. CodeCommit does not support two-factor
                authentication, so do not provide "2FA_enabled" with CodeCommit
                repositories.

                For GitHub and other Git repos, when SSH URLs are provided, it
                doesn't matter whether 2FA is enabled or disabled; you should
                either have no passphrase for the SSH key pairs, or have the
                ssh-agent configured so that you will not be prompted for the SSH
                passphrase when you run 'git clone' with SSH URLs. When
                HTTPS URLs are provided: if 2FA is disabled, then either the token
                or username+password will be used for authentication if provided
                (token prioritized); if 2FA is enabled, only the token will be used
                for authentication if provided. If the required authentication info
                is not provided, the Python SDK will try to use local credential
                storage to authenticate. If that also fails, an error is raised.

                For CodeCommit repos, 2FA is not supported, so '2FA_enabled'
                should not be provided. There is no token in CodeCommit, so
                'token' should not be provided either. When 'repo' is an SSH URL,
                the requirements are the same as for GitHub-like repos. When 'repo'
                is an HTTPS URL, username+password will be used for
                authentication if they are provided; otherwise, the Python SDK will
                try to use either the CodeCommit credential helper or local
                credential storage for authentication.
            **kwargs: Keyword arguments passed to the ``Model`` initializer.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.model.Model`.
        """
        super(FrameworkModel,
              self).__init__(image_uri,
                             model_data,
                             role,
                             predictor_cls=predictor_cls,
                             env=env,
                             name=name,
                             sagemaker_session=sagemaker_session,
                             **kwargs)
        self.entry_point = entry_point
        self.source_dir = source_dir
        self.dependencies = dependencies or []
        self.git_config = git_config
        self.container_log_level = container_log_level
        if code_location:
            self.bucket, self.key_prefix = s3.parse_s3_url(code_location)
        else:
            self.bucket, self.key_prefix = None, None
        if self.git_config:
            updates = git_utils.git_clone_repo(self.git_config,
                                               self.entry_point,
                                               self.source_dir,
                                               self.dependencies)
            self.entry_point = updates["entry_point"]
            self.source_dir = updates["source_dir"]
            self.dependencies = updates["dependencies"]
        self.uploaded_code = None
        self.repacked_model_data = None
Example #12
def test_parse_s3_url_fail():
    with pytest.raises(ValueError) as error:
        s3.parse_s3_url("t3://code_location")
    assert "Expecting 's3' scheme" in str(error)
Example #13
def test_parse_s3_url():
    bucket, key_prefix = s3.parse_s3_url("s3://bucket/code_location")
    assert "bucket" == bucket
    assert "code_location" == key_prefix