예제 #1
0
    def transformer(self, instance_count, instance_type, strategy=None, assemble_with=None, output_path=None,
                    output_kms_key=None, accept=None, env=None, max_concurrent_transforms=None,
                    max_payload=None, tags=None, volume_kms_key=None):
        """Build a ``Transformer`` for batch transform jobs backed by this Model.

        ``_create_sagemaker_model`` is invoked first; afterwards ``self.name`` is used both as
        the transformer's model name and as its base transform job name.

        Args:
            instance_count (int): Number of EC2 instances for the transform job.
            instance_type (str): EC2 instance type, for example 'ml.c4.xlarge'. It is also
                handed to ``_create_sagemaker_model``.
            strategy (str): How records are batched into a single request (default: None).
                Valid values: 'MultiRecord' and 'SingleRecord'.
            assemble_with (str): How the output is assembled (default: None). Valid values: 'Line' or 'None'.
            output_path (str): S3 location for saving the transform result. If not specified, results
                are stored to a default bucket.
            output_kms_key (str): Optional. KMS key ID for encrypting the transform output (default: None).
            accept (str): Accept value forwarded unchanged to the ``Transformer`` (default: None).
            env (dict): Environment variables for the transform job (default: None). Discarded
                (replaced by None) when ``self.enable_network_isolation()`` returns a truthy value.
            max_concurrent_transforms (int): Maximum number of HTTP requests made to each
                individual transform container at one time.
            max_payload (int): Maximum size, in MB, of the payload in a single HTTP request to the container.
            tags (list[dict]): Tags for labeling the transform job; forwarded unchanged (default: None).
            volume_kms_key (str): Optional. KMS key ID for encrypting the volume attached to the ML
                compute instance (default: None).

        Returns:
            Transformer: A transformer bound to this Model's name and SageMaker session.
        """
        self._create_sagemaker_model(instance_type)

        # With network isolation enabled, caller-supplied environment variables are not forwarded.
        transform_env = None if self.enable_network_isolation() else env

        transformer_options = dict(
            strategy=strategy,
            assemble_with=assemble_with,
            output_path=output_path,
            output_kms_key=output_kms_key,
            accept=accept,
            max_concurrent_transforms=max_concurrent_transforms,
            max_payload=max_payload,
            env=transform_env,
            tags=tags,
            base_transform_job_name=self.name,
            volume_kms_key=volume_kms_key,
            sagemaker_session=self.sagemaker_session,
        )
        return Transformer(self.name, instance_count, instance_type, **transformer_options)
def pca_transformer(pca_model):
    """Build a single-instance ``Transformer`` for the model named 'pca-model'.

    ``pca_model`` is accepted but never read in the body.
    NOTE(review): presumably it is listed only so the model exists before the
    transformer is built (fixture dependency) -- confirm against the caller.
    """
    transformer_settings = {
        'model_name': 'pca-model',
        'instance_count': 1,
        'instance_type': 'ml.c4.xlarge',
        'output_path': 's3://sagemaker/transform-output',
    }
    return Transformer(**transformer_settings)
예제 #3
0
def test_transformer_fails_without_model():
    """``transform`` on a "local" instance type must raise ``ValueError`` with the expected message.

    No model named "remote-model" is created in this test before ``transform`` is called.
    """
    expected_message = (
        "Failed to fetch model information for remote-model. "
        "Please ensure that the model exists. "
        "Local instance types require locally created models."
    )
    local_transformer = Transformer(
        model_name="remote-model",
        sagemaker_session=test_local_mode.LocalNoS3Session(),
        instance_type="local",
        instance_count=1,
    )

    with pytest.raises(ValueError) as raised:
        local_transformer.transform("empty-data")

    assert str(raised.value) == expected_message
def transformer(sagemaker_session):
    """Create a ``Transformer`` from the module-level constants and the given session."""
    keyword_settings = {
        "output_path": OUTPUT_PATH,
        "sagemaker_session": sagemaker_session,
        "volume_kms_key": KMS_KEY_ID,
    }
    return Transformer(MODEL_NAME, INSTANCE_COUNT, INSTANCE_TYPE, **keyword_settings)
예제 #5
0
    def transformer(self, instance_count, instance_type, strategy=None, assemble_with=None, output_path=None,
                    output_kms_key=None, accept=None, env=None, max_concurrent_transforms=None,
                    max_payload=None, tags=None, role=None):
        """Build a ``Transformer`` for a SageMaker Model created from the latest training job.

        The Estimator's SageMaker session and base job name are reused for the transformer.

        Args:
            instance_count (int): Number of EC2 instances for the transform job.
            instance_type (str): EC2 instance type, for example 'ml.c4.xlarge'.
            strategy (str): How records are batched into a single request (default: None).
                Valid values: 'MultiRecord' and 'SingleRecord'.
            assemble_with (str): How the output is assembled (default: None). Valid values: 'Line' or 'None'.
            output_path (str): S3 location for saving the transform result. If not specified, results
                are stored to a default bucket.
            output_kms_key (str): Optional. KMS key ID for encrypting the transform output (default: None).
            accept (str): Accept value forwarded unchanged to the ``Transformer`` (default: None).
            env (dict): Environment variables for the transform job (default: None).
            max_concurrent_transforms (int): Maximum number of HTTP requests made to each
                individual transform container at one time.
            max_payload (int): Maximum size, in MB, of the payload in a single HTTP request to the container.
            tags (list[dict]): Tags for labeling the transform job. When falsy, ``self.tags`` is used instead.
            role (str): The ``ExecutionRoleArn`` IAM Role ARN for the ``Model``; passed as-is to
                ``create_model_from_job`` (default: None).

        Returns:
            Transformer: A transformer for the model created from the latest training job.
        """
        self._ensure_latest_training_job()

        training_job_name = self.latest_training_job.name
        model_name = self.sagemaker_session.create_model_from_job(training_job_name, role=role)

        # Fall back to the Estimator's own tags when the caller supplied none.
        if not tags:
            tags = self.tags

        transformer_options = dict(
            strategy=strategy,
            assemble_with=assemble_with,
            output_path=output_path,
            output_kms_key=output_kms_key,
            accept=accept,
            max_concurrent_transforms=max_concurrent_transforms,
            max_payload=max_payload,
            env=env,
            tags=tags,
            base_transform_job_name=self.base_job_name,
            sagemaker_session=self.sagemaker_session,
        )
        return Transformer(model_name, instance_count, instance_type, **transformer_options)
def test_attach(prepare_init_params, transformer, sagemaker_session):
    """``Transformer.attach`` should rebuild a transformer from an existing transform job.

    ``describe_transform_job`` on the session's client is replaced by a ``Mock`` before
    ``attach`` is called, and ``prepare_init_params`` is asserted on as a mock.
    """
    sagemaker_session.sagemaker_client.describe_transform_job = Mock(name="describe_transform_job")
    attached = Transformer.attach(JOB_NAME, sagemaker_session)

    # ``called_once`` is not part of the Mock API: reading it auto-creates a truthy child
    # mock (so the old assertion could never fail) and raises AttributeError on
    # Python 3.12+. ``assert_called_once()`` performs the intended check.
    prepare_init_params.assert_called_once()
    assert attached.latest_transform_job.job_name == JOB_NAME
    assert attached.model_name == MODEL_NAME
    assert attached.instance_count == INSTANCE_COUNT
    assert attached.instance_type == INSTANCE_TYPE
def test_attach(prepare_init_params, transformer, sagemaker_session):
    """``Transformer.attach`` should rebuild a transformer from an existing transform job.

    ``describe_transform_job`` on the session's client is replaced by a ``Mock`` before
    ``attach`` is called, and ``prepare_init_params`` is asserted on as a mock.
    """
    sagemaker_session.sagemaker_client.describe_transform_job = Mock(name='describe_transform_job')
    attached = Transformer.attach(JOB_NAME, sagemaker_session)

    # ``called_once`` is not part of the Mock API: reading it auto-creates a truthy child
    # mock (so the old assertion could never fail) and raises AttributeError on
    # Python 3.12+. ``assert_called_once()`` performs the intended check.
    prepare_init_params.assert_called_once()
    assert attached.latest_transform_job.job_name == JOB_NAME
    assert attached.model_name == MODEL_NAME
    assert attached.instance_count == INSTANCE_COUNT
    assert attached.instance_type == INSTANCE_TYPE
def test_transformer_init(sagemaker_session):
    """A ``Transformer`` built with only required arguments stores them and starts with no job state."""
    subject = Transformer(MODEL_NAME, INSTANCE_COUNT, INSTANCE_TYPE, sagemaker_session=sagemaker_session)

    # Constructor arguments are exposed unchanged.
    assert subject.model_name == MODEL_NAME
    assert subject.instance_count == INSTANCE_COUNT
    assert subject.instance_type == INSTANCE_TYPE
    assert subject.sagemaker_session == sagemaker_session

    # No job has been started yet, so the job-tracking attributes hold their initial values.
    assert subject._current_job_name is None
    assert subject.latest_transform_job is None
    assert subject._reset_output_path is False
예제 #9
0
def test_transform_step(sagemaker_session):
    """A fully configured ``TransformStep`` serializes to the expected request dictionary.

    Covers display name, description, caching, and a dependency appended after
    construction through ``add_depends_on``.
    """
    step = TransformStep(
        name="MyTransformStep",
        depends_on=["TestStep"],
        transformer=Transformer(
            model_name=MODEL_NAME,
            instance_count=1,
            instance_type="c4.4xlarge",
            sagemaker_session=sagemaker_session,
        ),
        display_name="TransformStep",
        description="TestDescription",
        inputs=TransformInput(data=f"s3://{BUCKET}/transform_manifest"),
        cache_config=CacheConfig(enable_caching=True, expire_after="PT1H"),
    )
    step.add_depends_on(["SecondTestStep"])

    expected_arguments = {
        "ModelName": "gisele",
        "TransformInput": {
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://my-bucket/transform_manifest",
                }
            }
        },
        "TransformOutput": {"S3OutputPath": None},
        "TransformResources": {
            "InstanceCount": 1,
            "InstanceType": "c4.4xlarge",
        },
    }
    expected_request = {
        "Name": "MyTransformStep",
        "Type": "Transform",
        "Description": "TestDescription",
        "DisplayName": "TransformStep",
        # The dependency added after construction is appended to the original one.
        "DependsOn": ["TestStep", "SecondTestStep"],
        "Arguments": expected_arguments,
        "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
    }

    assert step.to_request() == expected_request
    assert step.properties.TransformJobName.expr == {
        "Get": "Steps.MyTransformStep.TransformJobName"
    }
예제 #10
0
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    """Train KMeans, start a transform job, then wait on it through ``Transformer.attach``.

    Steps: load the first 100 records of the pickled one_p_mnist training set, fit a
    KMeans estimator on them, upload ``transform_input.csv``, start a transform job via
    ``_create_transformer_and_transform_job``, re-attach to that job by name and wait
    for it inside ``timeout_and_delete_model_with_transformer``.
    """
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    # ``encoding`` is only passed on Python 3; Python 2's ``pickle.load`` does not accept it.
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    # Was ``max_iterators``: that name is not a KMeans hyperparameter, so the assignment
    # only created an unused attribute and the intended setting was never applied.
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input, cpu_instance_type)

    # Re-create a Transformer from the job name alone and wait on the running job through it.
    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
예제 #11
0
    def test_sagemaker_transform_step_successfully(self, m_default_bucket):
        """Chain a ``TransformStep`` ahead of a ``TuningStep`` in a "sequential" workflow.

        The test contains no explicit assertions; it passes when building the stack,
        both steps and the workflow raises nothing.
        """
        m_default_bucket.return_value = "sagemaker-bucket-name"
        input_data = "s3://some-bucket/some-data.csv"

        with DataJobStack(scope=self.app, id="some-stack", stage="stg") as djs:
            transform_step = TransformStep(
                datajob_stack=djs,
                name="transform-job",
                transformer=Transformer(
                    model_name="some-model",
                    instance_count=1,
                    instance_type="ml.t2.medium",
                    sagemaker_session=self.sagemaker_session,
                ),
                data=input_data,
            )

            training_script = str(pathlib.Path(current_dir, "resources", "train.py"))
            sklearn_estimator = SKLearn(
                entry_point=training_script,
                train_instance_type="ml.m5.xlarge",
                role=self.role,
                framework_version="0.20.0",
                py_version="py3",
                sagemaker_session=self.sagemaker_session,
            )
            alpha_tuner = HyperparameterTuner(
                estimator=sklearn_estimator,
                hyperparameter_ranges={"alpha": ContinuousParameter(0.0001, 0.05)},
                objective_metric_name="rmse",
            )
            tuner_step = TuningStep(
                datajob_stack=djs,
                name="tuning-step",
                tuner=alpha_tuner,
                data=input_data,
            )

            with StepfunctionsWorkflow(djs, "sequential") as sfn_workflow:
                # ``>>`` is evaluated for its side effect inside the workflow context.
                transform_step >> tuner_step
예제 #12
0
def test_attach_transform_kmeans(sagemaker_session):
    """Train KMeans, start a transform job, then wait on it through ``Transformer.attach``.

    Steps: load the first 100 records of the pickled one_p_mnist training set, fit a
    KMeans estimator on them, upload ``transform_input.csv``, start a transform job via
    ``_create_transformer_and_transform_job``, re-attach to that job by name and wait
    for it inside ``timeout_and_delete_model_with_transformer``.
    """
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    # ``encoding`` is only passed on Python 3; Python 2's ``pickle.load`` does not accept it.
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(
                        sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    # Was ``max_iterators``: that name is not a KMeans hyperparameter, so the assignment
    # only created an unused attribute and the intended setting was never applied.
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input)

    # Re-create a Transformer from the job name alone and wait on the running job through it.
    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    """Train KMeans, start a transform job, then wait on it through ``Transformer.attach``.

    Steps: fit a KMeans estimator on the first 100 records returned by
    ``datasets.one_p_mnist()``, upload ``transform_input.csv``, start a transform job via
    ``_create_transformer_and_transform_job``, re-attach to that job by name and wait
    for it inside ``timeout_and_delete_model_with_transformer``.
    """
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    # Was ``max_iterators``: that name is not a KMeans hyperparameter, so the assignment
    # only created an unused attribute and the intended setting was never applied.
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(DATA_DIR, "one_p_mnist",
                                        "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input, cpu_instance_type)

    # Re-create a Transformer from the job name alone and wait on the running job through it.
    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
예제 #14
0
def test_transformer_init_optional_params(sagemaker_session):
    """Each optional constructor argument is exposed unchanged as a same-named attribute."""
    # Keys are both the constructor keyword names and the attribute names checked below.
    optional_params = {
        "strategy": "MultiRecord",
        "assemble_with": "Line",
        "output_path": OUTPUT_PATH,
        "output_kms_key": KMS_KEY_ID,
        "accept": "text/csv",
        "max_concurrent_transforms": 100,
        "max_payload": 100,
        "tags": {"Key": "foo", "Value": "bar"},
        "env": {"FOO": "BAR"},
        "base_transform_job_name": JOB_NAME,
        "volume_kms_key": KMS_KEY_ID,
    }

    transformer = Transformer(
        MODEL_NAME,
        INSTANCE_COUNT,
        INSTANCE_TYPE,
        sagemaker_session=sagemaker_session,
        **optional_params
    )

    assert transformer.model_name == MODEL_NAME
    assert transformer.instance_count == INSTANCE_COUNT
    assert transformer.instance_type == INSTANCE_TYPE
    for attribute, expected in optional_params.items():
        assert getattr(transformer, attribute) == expected
예제 #15
0
def test_transform_step(sagemaker_session):
    """A minimally configured ``TransformStep`` serializes to the expected request dictionary."""
    step = TransformStep(
        name="MyTransformStep",
        transformer=Transformer(
            model_name=MODEL_NAME,
            instance_count=1,
            instance_type="c4.4xlarge",
            sagemaker_session=sagemaker_session,
        ),
        inputs=TransformInput(data=f"s3://{BUCKET}/transform_manifest"),
    )

    expected_arguments = {
        "ModelName": "gisele",
        "TransformInput": {
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://my-bucket/transform_manifest",
                }
            }
        },
        "TransformOutput": {"S3OutputPath": None},
        "TransformResources": {
            "InstanceCount": 1,
            "InstanceType": "c4.4xlarge",
        },
    }
    expected_request = {
        "Name": "MyTransformStep",
        "Type": "Transform",
        "Arguments": expected_arguments,
    }

    assert step.to_request() == expected_request
    assert step.properties.TransformJobName.expr == {
        "Get": "Steps.MyTransformStep.TransformJobName"
    }
def test_attach_transform_kmeans(sagemaker_session):
    """Train KMeans, start a transform job, then wait on it through ``Transformer.attach``.

    Steps: load the first 100 records of the pickled one_p_mnist training set, fit a
    KMeans estimator on them, upload ``transform_input.csv``, start a transform job via
    ``_create_transformer_and_transform_job``, re-attach to that job by name and wait
    for it to finish.
    """
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    # ``encoding`` is only passed on Python 3; Python 2's ``pickle.load`` does not accept it.
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge', k=10, sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    # Was ``max_iterators``: that name is not a KMeans hyperparameter, so the assignment
    # only created an unused attribute and the intended setting was never applied.
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(path=transform_input_path,
                                                           key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input)

    # Re-create a Transformer from the job name alone and wait on the running job through it.
    attached_transformer = Transformer.attach(transformer.latest_transform_job.name,
                                              sagemaker_session=sagemaker_session)
    attached_transformer.wait()
예제 #17
0
    def transformer(
        self,
        instance_count,
        instance_type,
        strategy=None,
        assemble_with=None,
        output_path=None,
        output_kms_key=None,
        accept=None,
        env=None,
        max_concurrent_transforms=None,
        max_payload=None,
        tags=None,
        role=None,
        volume_kms_key=None,
    ):
        """Build a ``Transformer`` for a SageMaker Model created from the training job.

        The Estimator's SageMaker session and base job name are reused for the
        transformer.

        Args:
            instance_count (int): Number of EC2 instances to use.
            instance_type (str): Type of EC2 instance to use, for example,
                'ml.c4.xlarge'.
            strategy (str): How records are batched into a single request
                (default: None). Valid values: 'MultiRecord' and
                'SingleRecord'.
            assemble_with (str): How the output is assembled (default: None).
                Valid values: 'Line' or 'None'.
            output_path (str): S3 location for saving the transform result. If
                not specified, results are stored to a default bucket.
            output_kms_key (str): Optional. KMS key ID for encrypting the
                transform output (default: None).
            accept (str): The accept header passed by the client to the
                inference endpoint. If it is supported by the endpoint, it
                will be the format of the batch transform output.
            env (dict): Environment variables for the transform job
                (default: None). When given, they are layered on top of a copy
                of the created model's ``env``; when None, an empty dict is
                used. For marketplace estimators (``self._is_marketplace()``)
                the transformer receives None regardless.
            max_concurrent_transforms (int): Maximum number of HTTP requests
                made to each individual transform container at one time.
            max_payload (int): Maximum size, in MB, of the payload in a single
                HTTP request to the container.
            tags (list[dict]): Tags for labeling the transform job. When
                falsy, ``self.tags`` is used instead.
            role (str): The ``ExecutionRoleArn`` IAM Role ARN for the
                ``Model``. When falsy, ``self.role`` is used instead.
            volume_kms_key (str): Optional. KMS key ID for encrypting the
                volume attached to the ML compute instance (default: None).

        Returns:
            Transformer: A transformer bound to the newly created model.

        Raises:
            RuntimeError: If ``self.latest_training_job`` is None.
        """
        role = role or self.role

        # Guard clause: nothing can be built without a training job.
        if self.latest_training_job is None:
            raise RuntimeError(
                "No finished training job found associated with this estimator"
            )

        model = self.create_model(role=role)
        model._create_sagemaker_model()
        model_name = model.name

        if env is None:
            transform_env = {}
        else:
            # Caller-supplied variables override the model's own entries.
            transform_env = model.env.copy()
            transform_env.update(env)

        # Marketplace estimators pass no environment at all to the transformer.
        if self._is_marketplace():
            transform_env = None

        if not tags:
            tags = self.tags

        transformer_options = dict(
            strategy=strategy,
            assemble_with=assemble_with,
            output_path=output_path,
            output_kms_key=output_kms_key,
            accept=accept,
            max_concurrent_transforms=max_concurrent_transforms,
            max_payload=max_payload,
            env=transform_env,
            tags=tags,
            base_transform_job_name=self.base_job_name,
            volume_kms_key=volume_kms_key,
            sagemaker_session=self.sagemaker_session,
        )
        return Transformer(model_name, instance_count, instance_type, **transformer_options)
예제 #18
0
    def transformer(
        self,
        instance_count,
        instance_type,
        strategy=None,
        assemble_with=None,
        output_path=None,
        output_kms_key=None,
        accept=None,
        env=None,
        max_concurrent_transforms=None,
        max_payload=None,
        tags=None,
        role=None,
        model_server_workers=None,
        volume_kms_key=None,
        endpoint_type=None,
        entry_point=None,
        vpc_config_override=VPC_CONFIG_DEFAULT,
    ):
        """Build a ``Transformer`` for a SageMaker Model created from the training job.

        With a training job available, a model is created via ``create_model`` and the
        result of its own ``transformer`` method is returned. Without one, a warning is
        logged and a ``Transformer`` is built directly from ``self._current_job_name``,
        reusing the Estimator's SageMaker session and base job name.

        Args:
            instance_count (int): Number of EC2 instances to use.
            instance_type (str): Type of EC2 instance to use, for example, 'ml.c4.xlarge'.
            strategy (str): How records are batched into a single request (default: None).
                Valid values: 'MultiRecord' and 'SingleRecord'.
            assemble_with (str): How the output is assembled (default: None). Valid values:
                'Line' or 'None'.
            output_path (str): S3 location for saving the transform result. If not specified,
                results are stored to a default bucket.
            output_kms_key (str): Optional. KMS key ID for encrypting the transform output
                (default: None).
            accept (str): The accept header passed by the client to the inference endpoint.
                If it is supported by the endpoint, it will be the format of the batch
                transform output.
            env (dict): Environment variables for the transform job (default: None). On the
                no-training-job path a falsy value is replaced by an empty dict.
            max_concurrent_transforms (int): Maximum number of HTTP requests made to each
                individual transform container at one time.
            max_payload (int): Maximum size, in MB, of the payload in a single HTTP request
                to the container.
            tags (list[dict]): Tags for labeling the transform job; forwarded unchanged on
                both paths (default: None).
            role (str): The ``ExecutionRoleArn`` IAM Role ARN for the ``Model``. When falsy,
                ``self.role`` is used instead.
            model_server_workers (int): Optional. The number of worker processes used by the
                inference server; forwarded to ``create_model``.
            volume_kms_key (str): Optional. KMS key ID for encrypting the volume attached to
                the ML compute instance (default: None).
            endpoint_type (str): Optional. Selects the software stack used by the inference
                server; forwarded to ``create_model``. 'tensorflow-serving' selects the
                SageMaker Tensorflow Serving container.
            entry_point (str): Path (absolute or relative) to a local Python source file;
                forwarded to ``create_model``.
            vpc_config_override (dict[str, list[str]]): Optional override for the VpcConfig
                set on the model; forwarded to ``create_model``.
                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.

        Returns:
            Transformer: The transformer produced by whichever of the two paths applied.
        """
        role = role or self.role

        # Settings that are forwarded unchanged on both code paths below.
        shared_settings = dict(
            strategy=strategy,
            assemble_with=assemble_with,
            output_path=output_path,
            output_kms_key=output_kms_key,
            accept=accept,
            max_concurrent_transforms=max_concurrent_transforms,
            max_payload=max_payload,
            tags=tags,
            volume_kms_key=volume_kms_key,
        )

        if self.latest_training_job is not None:
            model = self.create_model(
                model_server_workers=model_server_workers,
                role=role,
                vpc_config_override=vpc_config_override,
                endpoint_type=endpoint_type,
                entry_point=entry_point,
            )
            return model.transformer(instance_count, instance_type, env=env, **shared_settings)

        # No training job yet: warn and build the transformer directly, without a model.
        logging.warning(
            "No finished training job found associated with this estimator. Please make sure "
            "this estimator is only used for building workflow config")
        return Transformer(
            self._current_job_name,
            instance_count,
            instance_type,
            env=env or {},
            base_transform_job_name=self.base_job_name,
            sagemaker_session=self.sagemaker_session,
            **shared_settings
        )
    def __init__(
        self,
        name: str,
        estimator: EstimatorBase,
        model_data,
        model_inputs,
        instance_count,
        instance_type,
        transform_inputs,
        # model arguments
        image_uri=None,
        predictor_cls=None,
        env=None,
        # transformer arguments
        strategy=None,
        assemble_with=None,
        output_path=None,
        output_kms_key=None,
        accept=None,
        max_concurrent_transforms=None,
        max_payload=None,
        tags=None,
        volume_kms_key=None,
        depends_on: List[str] = None,
        **kwargs,
    ):
        """Assemble the steps of an estimator-based batch transform collection.

        The collection mirrors what happens when ``transform()`` is invoked on
        an estimator instance:

        1. A ``_RepackModelStep`` -- only when ``entry_point`` is supplied
           through ``kwargs``, i.e. when custom model artifacts are required.
        2. A ``CreateModelStep`` built from the (possibly repacked) model data.
        3. A ``TransformStep`` running a ``Transformer`` against the model
           created by the previous step.

        The resulting list is stored on ``self.steps``.

        Args:
            name (str): Base name of the collection. Step names are derived
                from it by appending ``RepackModel``, ``CreateModelStep`` and
                ``TransformStep``; it is also the base transform job name.
            estimator: The estimator instance. Supplies the role, the
                SageMaker session and, when ``image_uri`` is not given, the
                training image URI.
            model_data: Model artifacts for the ``Model``. Replaced by the
                repack step's output artifacts when repacking takes place.
            model_inputs: Inputs handed to the ``CreateModelStep``.
            instance_count (int): The number of EC2 instances to use.
            instance_type (str): The type of EC2 instance to use.
            transform_inputs: Inputs handed to the ``TransformStep``.
            image_uri (str): Image for the ``Model``. Falls back to
                ``estimator.training_image_uri()`` when falsy.
            predictor_cls: Callable taking ``(endpoint, session)``. Defaults
                to a wrapper that builds a ``Predictor``.
            env (dict): The environment variables to be set for use during
                the transform job (default: None).
            strategy (str): The strategy used to decide how to batch records
                in a single request (default: None). Valid values:
                'MultiRecord' and 'SingleRecord'.
            assemble_with (str): How the output is assembled (default: None).
                Valid values: 'Line' or 'None'.
            output_path (str): The S3 location for saving the transform
                result. If not specified, results are stored to a default
                bucket.
            output_kms_key (str): Optional. A KMS key ID for encrypting the
                transform output (default: None).
            accept (str): The accept header passed by the client to the
                inference endpoint. If it is supported by the endpoint, it
                will be the format of the batch transform output.
            max_concurrent_transforms (int): Forwarded to the ``Transformer``
                (default: None).
            max_payload (int): Forwarded to the ``Transformer``
                (default: None).
            tags: Forwarded to the ``Transformer`` (default: None).
            volume_kms_key (str): Forwarded to the ``Transformer``
                (default: None).
            depends_on (List[str]): The list of step names the first step in
                the collection depends on.
            **kwargs: Extra keyword arguments forwarded to ``Model``. The
                presence of ``entry_point`` triggers the repack step, which
                also receives ``source_dir`` and ``dependencies`` if given.
        """
        needs_repack = "entry_point" in kwargs
        collected = []

        if needs_repack:
            repack = _RepackModelStep(
                name=f"{name}RepackModel",
                depends_on=depends_on,
                estimator=estimator,
                model_data=model_data,
                entry_point=kwargs["entry_point"],
                source_dir=kwargs.get("source_dir"),
                dependencies=kwargs.get("dependencies"),
            )
            collected.append(repack)
            # Downstream steps consume the repacked artifacts, not the input ones.
            model_data = repack.properties.ModelArtifacts.S3ModelArtifacts

        if not predictor_cls:

            def predict_wrapper(endpoint, session):
                return Predictor(endpoint, session)

            predictor_cls = predict_wrapper

        create_model = CreateModelStep(
            name=f"{name}CreateModelStep",
            model=Model(
                image_uri=image_uri or estimator.training_image_uri(),
                model_data=model_data,
                predictor_cls=predictor_cls,
                vpc_config=None,
                sagemaker_session=estimator.sagemaker_session,
                role=estimator.role,
                **kwargs,
            ),
            inputs=model_inputs,
        )
        if not needs_repack and depends_on:
            # Without a repack step the CreateModelStep is the first step in
            # the collection, so it carries the external dependencies.
            create_model.add_depends_on(depends_on)
        collected.append(create_model)

        batch_transformer = Transformer(
            model_name=create_model.properties.ModelName,
            instance_count=instance_count,
            instance_type=instance_type,
            strategy=strategy,
            assemble_with=assemble_with,
            output_path=output_path,
            output_kms_key=output_kms_key,
            accept=accept,
            max_concurrent_transforms=max_concurrent_transforms,
            max_payload=max_payload,
            env=env,
            tags=tags,
            base_transform_job_name=name,
            volume_kms_key=volume_kms_key,
            sagemaker_session=estimator.sagemaker_session,
        )
        collected.append(
            TransformStep(
                name=f"{name}TransformStep",
                transformer=batch_transformer,
                inputs=transform_inputs,
            )
        )

        self.steps = collected
예제 #20
0
    def transformer(
        self,
        instance_count,
        instance_type,
        strategy=None,
        assemble_with=None,
        output_path=None,
        output_kms_key=None,
        accept=None,
        env=None,
        max_concurrent_transforms=None,
        max_payload=None,
        tags=None,
        role=None,
        volume_kms_key=None,
        entry_point=None,
        vpc_config_override=VPC_CONFIG_DEFAULT,
        enable_network_isolation=None,
        model_name=None,
    ):
        """Build a ``Transformer`` backed by a model derived from this estimator.

        When a training job is attached to the estimator, a model is created
        via ``create_model`` and its ``transformer`` method produces the
        result. Otherwise a warning is logged and a ``Transformer`` is
        constructed directly from the resolved model name, reusing the
        estimator's session and base job name.

        Args:
            instance_count (int): Number of EC2 instances to use.
            instance_type (str): Type of EC2 instance to use, for example,
                'ml.c4.xlarge'.
            strategy (str): The strategy used to decide how to batch records
                in a single request (default: None). Valid values:
                'MultiRecord' and 'SingleRecord'.
            assemble_with (str): How the output is assembled (default: None).
                Valid values: 'Line' or 'None'.
            output_path (str): S3 location for saving the transform result.
                If not specified, results are stored to a default bucket.
            output_kms_key (str): Optional. KMS key ID for encrypting the
                transform output (default: None).
            accept (str): The accept header passed by the client to the
                inference endpoint. If it is supported by the endpoint, it
                will be the format of the batch transform output.
            env (dict): Environment variables to be set for use during the
                transform job (default: None). On the no-training-job path a
                falsy value is replaced by an empty dict.
            max_concurrent_transforms (int): The maximum number of HTTP
                requests to be made to each individual transform container at
                one time.
            max_payload (int): Maximum size of the payload in a single HTTP
                request to the container in MB.
            tags (list[dict]): List of tags for labeling a transform job. If
                none specified, then the tags used for the training job are
                used for the transform job.
            role (str): The IAM Role ARN for the model, which is also used
                during transform jobs. If not specified, the role from the
                Estimator is used.
            volume_kms_key (str): Optional. KMS key ID for encrypting the
                volume attached to the ML compute instance (default: None).
            entry_point (str): Path (absolute or relative) to the local Python
                source file to use as the entry point; forwarded to
                ``create_model``.
            vpc_config_override (dict[str, list[str]]): Optional override for
                the VpcConfig set on the model.
                Default: use subnets and security groups from this Estimator.

                * 'Subnets' (list[str]): List of subnet ids.
                * 'SecurityGroupIds' (list[str]): List of security group ids.

            enable_network_isolation (bool): Specifies whether container will
                run in network isolation mode. If not specified, this setting
                is taken from the estimator's current configuration.
            model_name (str): Name to use for creating an Amazon SageMaker
                model. If not specified, a name is resolved through
                ``_get_or_create_name``.

        Returns:
            The ``Transformer`` produced by one of the two paths above.
        """
        role = role or self.role
        model_name = self._get_or_create_name(model_name)

        # Keyword options forwarded unchanged on both code paths.
        shared_options = dict(
            strategy=strategy,
            assemble_with=assemble_with,
            output_path=output_path,
            output_kms_key=output_kms_key,
            accept=accept,
            max_concurrent_transforms=max_concurrent_transforms,
            max_payload=max_payload,
            tags=tags,
            volume_kms_key=volume_kms_key,
        )

        if self.latest_training_job is not None:
            if enable_network_isolation is None:
                enable_network_isolation = self.enable_network_isolation()

            trained_model = self.create_model(
                role=role,
                vpc_config_override=vpc_config_override,
                entry_point=entry_point,
                enable_network_isolation=enable_network_isolation,
                name=model_name,
            )
            return trained_model.transformer(
                instance_count,
                instance_type,
                env=env,
                **shared_options,
            )

        # No training job: only suitable for building workflow configuration.
        logger.warning(
            "No finished training job found associated with this estimator. Please make sure "
            "this estimator is only used for building workflow config"
        )
        return Transformer(
            model_name,
            instance_count,
            instance_type,
            env=env or {},
            base_transform_job_name=self.base_job_name,
            sagemaker_session=self.sagemaker_session,
            **shared_options,
        )
예제 #21
0
def test_start_new(prepare_data_processing, load_config, sagemaker_session):
    """Verify ``_TransformJob.start_new`` hands the expected arguments to the session.

    ``load_config`` is primed with stand-in input/output/resource configs, a
    ``Transformer`` is created with explicit settings, and the job is started.
    The test then checks the returned job, the calls made to the two helper
    mocks, and the keyword arguments passed to ``sagemaker_session.transform``.
    """
    loaded_input = "input"
    loaded_output = "output"
    loaded_resource = "resource"
    load_config.return_value = dict(
        input_config=loaded_input,
        output_config=loaded_output,
        resource_config=loaded_resource,
    )

    batch_strategy = "MultiRecord"
    concurrency_limit = 100
    payload_limit = 100
    job_tags = {"Key": "foo", "Value": "bar"}
    job_env = {"FOO": "BAR"}

    transformer = Transformer(
        MODEL_NAME,
        INSTANCE_COUNT,
        INSTANCE_TYPE,
        strategy=batch_strategy,
        output_path=OUTPUT_PATH,
        max_concurrent_transforms=concurrency_limit,
        max_payload=payload_limit,
        tags=job_tags,
        env=job_env,
        sagemaker_session=sagemaker_session,
    )
    # Pin the job name so the assertions below can compare against JOB_NAME.
    transformer._current_job_name = JOB_NAME

    mime_type = "text/csv"
    compression = "Gzip"
    splitter = "Line"
    json_filter = "$"
    join_mode = "Input"
    client_config = dict(InvocationsTimeoutInSeconds=60, InvocationsMaxRetries=2)

    started = _TransformJob.start_new(
        transformer=transformer,
        data=DATA,
        data_type=S3_DATA_TYPE,
        content_type=mime_type,
        compression_type=compression,
        split_type=splitter,
        input_filter=json_filter,
        output_filter=json_filter,
        join_source=join_mode,
        experiment_config={"ExperimentName": "exp"},
        model_client_config=client_config,
    )

    assert started.sagemaker_session == sagemaker_session
    assert started.job_name == JOB_NAME

    load_config.assert_called_with(
        DATA, S3_DATA_TYPE, mime_type, compression, splitter, transformer
    )
    prepare_data_processing.assert_called_with(json_filter, json_filter, join_mode)

    sagemaker_session.transform.assert_called_with(
        job_name=JOB_NAME,
        model_name=MODEL_NAME,
        strategy=batch_strategy,
        max_concurrent_transforms=concurrency_limit,
        max_payload=payload_limit,
        env=job_env,
        input_config=loaded_input,
        output_config=loaded_output,
        resource_config=loaded_resource,
        experiment_config={"ExperimentName": "exp"},
        model_client_config=client_config,
        tags=job_tags,
        data_processing=prepare_data_processing.return_value,
    )
예제 #22
0
# Estimator for the XGBoost container. Every constructor argument not spelled
# out here is taken from config["train_model"]["estimator_config"].
xgb_estimator = Estimator(image_name=xgb_container,
                          role=role,
                          sagemaker_session=sagemaker.session.Session(sess),
                          **config["train_model"]["estimator_config"])

# Wrap the configured train/validation inputs and key them by channel name.
train_data = create_s3_input(config['train_model']['inputs']['train'])
validation_data = create_s3_input(
    config['train_model']['inputs']['validation'])
data_channels = {'train': train_data, 'validation': validation_data}

# train_config specifies SageMaker training configuration
train_config = training_config(estimator=xgb_estimator, inputs=data_channels)

# Batch inference: the transformer targets the model named in the config and
# gets its own Session built from `sess`; remaining arguments come from
# config['batch_transform']['transformer_config'].
xgb_transformer = Transformer(
    model_name=config['batch_transform']['model_name'],
    sagemaker_session=sagemaker.session.Session(sess),
    **config['batch_transform']['transformer_config'])

# NOTE(review): this rebinds the name `transform_config` from the callable to
# the value it returns, so the callable is no longer reachable under that
# name after this statement.
transform_config = transform_config(
    transformer=xgb_transformer,
    **config['batch_transform']['transform_config'])

# =============================================================================
# define airflow DAG and tasks
# =============================================================================
# define airflow DAG
# Default arguments handed to the DAG constructed below.
args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2)}

dag = DAG(
    'sagemaker-ml-pipeline',
    default_args=args,
예제 #23
0
def test_end_to_end_pipeline_successful_execution(
    sagemaker_session, region_name, role, pipeline_name, wait=False
):
    """Create and start the abalone end-to-end pipeline and return its execution ARN.

    Downloads the abalone training and batch datasets from the regional
    ``sagemaker-servicecatalog-seedcode-{region_name}`` bucket, uploads them under
    ``s3://{default_bucket}/{pipeline_name}``, then defines processing, training,
    evaluation, create-model, transform, register-model and condition steps and
    starts the pipeline.

    Args:
        sagemaker_session: Session used for S3 access and passed to every
            processor, estimator, model, transformer and the pipeline itself.
        region_name (str): Region used to select the seed-code bucket and to
            retrieve the XGBoost image URI.
        role: Role passed to the processors, estimator, model and
            ``pipeline.create``.
        pipeline_name (str): Pipeline name; also used as a prefix for S3
            locations, base job names and the model package group name.
        wait (bool): When true, block on ``execution.wait()`` before returning.

    Returns:
        The ``arn`` attribute of the started pipeline execution.
    """
    model_package_group_name = f"{pipeline_name}ModelPackageGroup"
    data_path = os.path.join(DATA_DIR, "workflow")
    default_bucket = sagemaker_session.default_bucket()

    # download the input data
    local_input_path = os.path.join(data_path, "abalone-dataset.csv")
    s3 = sagemaker_session.boto_session.resource("s3")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset.csv", local_input_path
    )

    # upload the input data to our bucket
    base_uri = f"s3://{default_bucket}/{pipeline_name}"
    with open(local_input_path) as data:
        body = data.read()
        input_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset.csv",
            sagemaker_session=sagemaker_session,
        )

    # download batch transform data
    local_batch_path = os.path.join(data_path, "abalone-dataset-batch")
    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
        "dataset/abalone-dataset-batch", local_batch_path
    )

    # upload the batch transform data
    with open(local_batch_path) as data:
        body = data.read()
        batch_data_uri = S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=f"{base_uri}/abalone-dataset-batch",
            sagemaker_session=sagemaker_session,
        )

    # define parameters
    # The uploaded S3 URIs become the default values of the data parameters.
    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
    )
    training_instance_type = ParameterString(
        name="TrainingInstanceType", default_value="ml.m5.xlarge"
    )
    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
    input_data = ParameterString(
        name="InputData",
        default_value=input_data_uri,
    )
    batch_data = ParameterString(
        name="BatchData",
        default_value=batch_data_uri,
    )

    # define processing step
    framework_version = "0.23-1"
    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        base_job_name=f"{pipeline_name}-process",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    step_process = ProcessingStep(
        name="AbaloneProcess",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
        ],
        outputs=[
            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
        ],
        code=os.path.join(data_path, "abalone/preprocessing.py"),
    )

    # define training step
    model_path = f"s3://{default_bucket}/{pipeline_name}Train"
    image_uri = image_uris.retrieve(
        framework="xgboost",
        region=region_name,
        version="1.0-1",
        py_version="py3",
        instance_type=training_instance_type,
    )
    xgb_train = Estimator(
        image_uri=image_uri,
        instance_type=training_instance_type,
        instance_count=1,
        output_path=model_path,
        role=role,
        sagemaker_session=sagemaker_session,
    )
    xgb_train.set_hyperparameters(
        objective="reg:linear",
        num_round=50,
        max_depth=5,
        eta=0.2,
        gamma=4,
        min_child_weight=6,
        subsample=0.7,
        silent=0,
    )
    # Training channels are wired to the "train" and "validation" outputs of
    # the processing step.
    step_train = TrainingStep(
        name="AbaloneTrain",
        estimator=xgb_train,
        inputs={
            "train": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "train"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                    "validation"
                ].S3Output.S3Uri,
                content_type="text/csv",
            ),
        },
    )

    # define evaluation step
    # The evaluation script runs on the same image that is used for training.
    script_eval = ScriptProcessor(
        image_uri=image_uri,
        command=["python3"],
        instance_type=processing_instance_type,
        instance_count=1,
        base_job_name=f"{pipeline_name}-eval",
        role=role,
        sagemaker_session=sagemaker_session,
    )
    # Property file over the "evaluation" output; read by the condition below.
    evaluation_report = PropertyFile(
        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
    )
    step_eval = ProcessingStep(
        name="AbaloneEval",
        processor=script_eval,
        inputs=[
            ProcessingInput(
                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
                destination="/opt/ml/processing/model",
            ),
            ProcessingInput(
                source=step_process.properties.ProcessingOutputConfig.Outputs[
                    "test"
                ].S3Output.S3Uri,
                destination="/opt/ml/processing/test",
            ),
        ],
        outputs=[
            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
        ],
        code=os.path.join(data_path, "abalone/evaluation.py"),
        property_files=[evaluation_report],
    )

    # define create model step
    model = Model(
        image_uri=image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_create_model = CreateModelStep(
        name="AbaloneCreateModel",
        model=model,
        inputs=inputs,
    )

    # define transform step
    # The transformer uses the model name produced by the create model step
    # and reads its input from the BatchData parameter.
    transformer = Transformer(
        model_name=step_create_model.properties.ModelName,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
        sagemaker_session=sagemaker_session,
    )
    step_transform = TransformStep(
        name="AbaloneTransform",
        transformer=transformer,
        inputs=TransformInput(data=batch_data),
    )

    # define register model step
    # The metrics source is evaluation.json under the S3 URI of the first
    # output listed in the evaluation step's arguments.
    model_metrics = ModelMetrics(
        model_statistics=MetricsSource(
            s3_uri="{}/evaluation.json".format(
                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
            ),
            content_type="application/json",
        )
    )
    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        estimator=xgb_train,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["text/csv"],
        response_types=["text/csv"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name=model_package_group_name,
        approval_status=model_approval_status,
        model_metrics=model_metrics,
    )

    # define condition step
    # Compares regression_metrics.mse.value from the evaluation report
    # against 20.0 using a less-than-or-equal condition.
    cond_lte = ConditionLessThanOrEqualTo(
        left=JsonGet(
            step_name=step_eval.name,
            property_file=evaluation_report,
            json_path="regression_metrics.mse.value",
        ),
        right=20.0,
    )

    # Register, create-model and transform are the "if" branch of the
    # condition; the "else" branch is empty.
    step_cond = ConditionStep(
        name="AbaloneMSECond",
        conditions=[cond_lte],
        if_steps=[step_register, step_create_model, step_transform],
        else_steps=[],
    )

    # define pipeline
    # step_register, step_create_model and step_transform are not listed at
    # the top level; they are attached through step_cond's if_steps.
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            processing_instance_type,
            processing_instance_count,
            training_instance_type,
            model_approval_status,
            input_data,
            batch_data,
        ],
        steps=[step_process, step_train, step_eval, step_cond],
        sagemaker_session=sagemaker_session,
    )

    pipeline.create(role)
    execution = pipeline.start()
    execution_arn = execution.arn

    # Waiting is opt-in; by default the ARN is returned right after start().
    if wait:
        execution.wait()

    return execution_arn
def test_delete_model(sagemaker_session):
    """Check that ``Transformer.delete_model`` deletes ``MODEL_NAME`` via the session."""
    subject = Transformer(
        MODEL_NAME,
        INSTANCE_COUNT,
        INSTANCE_TYPE,
        sagemaker_session=sagemaker_session,
    )

    subject.delete_model()

    sagemaker_session.delete_model.assert_called_with(MODEL_NAME)
예제 #25
0
def transformer(sagemaker_session):
    """Return a ``Transformer`` for ``MODEL_NAME`` that writes to ``OUTPUT_PATH``."""
    built = Transformer(
        MODEL_NAME,
        INSTANCE_COUNT,
        INSTANCE_TYPE,
        output_path=OUTPUT_PATH,
        sagemaker_session=sagemaker_session,
    )
    return built