def __init__(
        self,
        name: str,
        sagemaker_session,
        role,
        model_data: str,
        entry_point: str,
        source_dir: str = None,
        dependencies: List = None,
        depends_on: List[str] = None,
        **kwargs,
    ):
        """Constructs a TrainingStep, given an `EstimatorBase` instance.

        In addition to the estimator instance, the other arguments are those that are supplied to
        the `fit` method of the `sagemaker.estimator.Estimator`.

        Args:
            name (str): The name of the training step.
            estimator (EstimatorBase): A `sagemaker.estimator.EstimatorBase` instance.
            inputs (TrainingInput): A `sagemaker.inputs.TrainingInput` instance. Defaults to `None`.
        """
        # yeah, go ahead and save the originals for now
        self._model_data = model_data
        self.sagemaker_session = sagemaker_session
        self.role = role
        if isinstance(model_data, Properties):
            self._model_prefix = model_data
            self._model_archive = "model.tar.gz"
        else:
            self._model_prefix = "/".join(self._model_data.split("/")[:-1])
            self._model_archive = self._model_data.split("/")[-1]
        self._entry_point = entry_point
        self._entry_point_basename = os.path.basename(self._entry_point)
        self._source_dir = source_dir
        self._dependencies = dependencies

        # the real estimator and inputs
        repacker = SKLearn(
            framework_version=FRAMEWORK_VERSION,
            instance_type=INSTANCE_TYPE,
            entry_point=REPACK_SCRIPT,
            source_dir=self._source_dir,
            dependencies=self._dependencies,
            sagemaker_session=self.sagemaker_session,
            role=self.role,
            hyperparameters={
                "inference_script": self._entry_point_basename,
                "model_archive": self._model_archive,
            },
            **kwargs,
        )
        repacker.disable_profiler = True
        inputs = TrainingInput(self._model_prefix)

        # super!
        super(_RepackModelStep, self).__init__(name=name,
                                               depends_on=depends_on,
                                               estimator=repacker,
                                               inputs=inputs)
Example #2
def train_model_sagemaker(X_train_path: str,
                          sklearn_estimator_kwargs: Dict[str, Any]) -> str:
    """Train the linear regression model on SageMaker.

    Args:
        X_train_path: Full S3 path to `X_train` dataset.
        sklearn_estimator_kwargs: Keyword arguments that will be used
            to instantiate SKLearn estimator.

    Returns:
        Full S3 path to `model.tar.gz` file containing the model artifact.

    """
    sklearn_estimator = SKLearn(**sklearn_estimator_kwargs)

    # we need a path to the directory containing both
    # X_train (feature table) and y_train (target variable)
    inputs_dir = X_train_path.rsplit("/", 1)[0]
    inputs = {"train": inputs_dir}

    # wait=True ensures that the execution is blocked
    # until the job finishes on SageMaker
    sklearn_estimator.fit(inputs=inputs, wait=True)

    training_job = sklearn_estimator.latest_training_job
    job_description = training_job.describe()
    model_path = job_description["ModelArtifacts"]["S3ModelArtifacts"]
    return model_path
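
# A hypothetical invocation of the helper above (bucket layout and estimator
# arguments are placeholders, not values from the original source):
model_s3_path = train_model_sagemaker(
    X_train_path="s3://my-bucket/data/X_train.csv",
    sklearn_estimator_kwargs={
        "entry_point": "train.py",      # assumed training script
        "framework_version": "0.23-1",
        "instance_type": "ml.m5.large",
        "instance_count": 1,
        "role": "SageMakerRole",        # assumed IAM role
    },
)
print(model_s3_path)  # s3://.../output/model.tar.gz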
Example #3
    def create_sklearn_estimator(self):
        self.estimator = SKLearn(
            self.script_path,
            instance_type="ml.m4.xlarge",
            framework_version="0.20.0",
            sagemaker_session=self.sagemaker_session,
            role=aws_role,
            metric_definitions=[
                {"Name": "train:accuracy", "Regex": "Train_accuracy=(.*?);"},
                {"Name": "test:accuracy", "Regex": "Test_accuracy=(.*?);"},
                {"Name": "train:f1", "Regex": "Train_f1=(.*?);"},
                {"Name": "test:f1", "Regex": "Test_f1=(.*?);"},
            ],
            hyperparameters=self.hyperparameters
        )
class SagemakerRegression:
    def __init__(self, script_path, data_path, hyperparameters):
        self.script_path = script_path
        self.data_path = data_path
        self.hyperparameters = hyperparameters

        self.create_sagemaker_session()
        self.create_sklearn_estimator()

    def create_sagemaker_session(self):
        self.sagemaker_session = get_sagemaker_session()

    def create_sklearn_estimator(self):
        self.estimator = SKLearn(self.script_path,
                                 instance_type="ml.m4.xlarge",
                                 framework_version="0.20.0",
                                 sagemaker_session=self.sagemaker_session,
                                 role=aws_role,
                                 metric_definitions=[
                                     {"Name": "train:mse", "Regex": "Train_mse=(.*?);"},
                                     {"Name": "test:mse", "Regex": "Test_mse=(.*?);"},
                                 ],
                                 hyperparameters=self.hyperparameters)

    def fit(self):
        self.estimator.fit({"train": self.data_path}, wait=False)

    def get_training_name(self):
        return self.estimator.latest_training_job.job_name
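
# A minimal usage sketch for the class above (script path, data path, and
# hyperparameters are hypothetical placeholders):
regression = SagemakerRegression(script_path="train_script.py",
                                 data_path="s3://my-bucket/train/",
                                 hyperparameters={"alpha": 0.1})
regression.fit()                        # launches the training job (wait=False)
print(regression.get_training_name())  # name of the SageMaker training job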
Example #5
def test_github_with_ssh_passphrase_not_configured(sagemaker_local_session,
                                                   sklearn_latest_version,
                                                   sklearn_latest_py_version):
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "sklearn_mnist")
    git_config = {
        "repo": PRIVATE_GIT_REPO_2FA_SSH,
        "branch": PRIVATE_BRANCH_2FA,
        "commit": PRIVATE_COMMIT_2FA,
    }
    source_dir = "sklearn"

    sklearn = SKLearn(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        framework_version=sklearn_latest_version,
        py_version=sklearn_latest_py_version,
        hyperparameters={"epochs": 1},
        git_config=git_config,
    )
    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    with pytest.raises(subprocess.CalledProcessError) as error:
        sklearn.fit({"train": train_input, "test": test_input})
    assert "returned non-zero exit status" in str(error)
def cloud():
    sklearn = SKLearn(entry_point='train.py',
                      framework_version="0.23-1",
                      instance_count=1,
                      instance_type='ml.c4.xlarge',
                      py_version="py3",
                      role=role,
                      sagemaker_session=sagemaker_session,
                      source_dir='./src/')

    sklearn.fit({'train': train_path})
Example #7
def test_private_github_with_2fa(sagemaker_local_session,
                                 sklearn_latest_version,
                                 sklearn_latest_py_version):
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "sklearn_mnist")
    git_config = {
        "repo": PRIVATE_GIT_REPO_2FA,
        "branch": PRIVATE_BRANCH_2FA,
        "commit": PRIVATE_COMMIT_2FA,
        "2FA_enabled": True,
        "token": "",  # TODO: find a secure approach
    }
    source_dir = "sklearn"

    sklearn = SKLearn(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir=source_dir,
        py_version=sklearn_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        framework_version=sklearn_latest_version,
        hyperparameters={"epochs": 1},
        git_config=git_config,
    )
    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")
    sklearn.fit({"train": train_input, "test": test_input})

    assert os.path.isdir(sklearn.source_dir)

    with lock.lock(LOCK_PATH):
        try:
            client = sagemaker_local_session.sagemaker_client
            desc = client.describe_training_job(
                TrainingJobName=sklearn.latest_training_job.name)
            model_data = desc["ModelArtifacts"]["S3ModelArtifacts"]
            model = SKLearnModel(
                model_data,
                "SageMakerRole",
                entry_point=script_path,
                framework_version=sklearn_latest_version,
                source_dir=source_dir,
                sagemaker_session=sagemaker_local_session,
                git_config=git_config,
            )
            predictor = model.deploy(1, "local")

            data = numpy.zeros((100, 784), dtype="float32")
            result = predictor.predict(data)
            assert result is not None
        finally:
            predictor.delete_endpoint()
def cloud():
    sklearn = SKLearn(entry_point='train.py',
                      source_dir='./src/',
                      role=role,
                      train_instance_count=1,
                      train_instance_type='ml.c4.xlarge',
                      sagemaker_session=sagemaker_session,
                      hyperparameters={
                          'max_depth': 5,
                          'n_estimators': 10
                      })

    sklearn.fit({'train': train_path})
Example #9
def sklearn_preprocessor():
    script_path = 'sklearn_abalone_featurizer.py'
    source_dir = 's3://sagemaker/source'
    sagemaker_session = MagicMock()
    sagemaker_session.boto_region_name = 'us-east-1'

    sklearn_preprocessor = SKLearn(entry_point=script_path,
                                   role=SAGEMAKER_EXECUTION_ROLE,
                                   train_instance_type="ml.c4.xlarge",
                                   source_dir=source_dir,
                                   sagemaker_session=sagemaker_session)

    sklearn_preprocessor.debugger_hook_config = DebuggerHookConfig(
        s3_output_path='s3://sagemaker/source/debug')

    return sklearn_preprocessor
def local():
    sklearn = SKLearn(entry_point='train.py',
                      source_dir='./src/',
                      role=role,
                      train_instance_count=1,
                      train_instance_type='local',
                      hyperparameters={
                          'max_depth': 5,
                          'n_estimators': 10
                      })

    sklearn.fit({'train': 'file://models/train.csv'})
    predictor = sklearn.deploy(initial_instance_count=1, instance_type='local')
    test_data = pd.read_csv('./models/test.csv', header=None, names=None)
    test_y = test_data.iloc[:, 0]
    test_x = test_data.iloc[:, 1:]
    test_y_preds = predictor.predict(test_x)
    accuracy = accuracy_score(test_y, test_y_preds)
    print('The current accuracy score for the prediction', accuracy)
Example #11
def sklearn_estimator(sagemaker_role_arn, sagemaker_session):
    script_path = os.path.join(DATA_DIR, 'one_p_mnist',
                               'sklearn_mnist_estimator.py')
    sklearn_estimator = SKLearn(entry_point=script_path,
                                role=sagemaker_role_arn,
                                train_instance_type="ml.m5.large",
                                sagemaker_session=sagemaker_session,
                                hyperparameters={"epochs": 1},
                                input_mode='File')
    return sklearn_estimator
Example #12
def test_training_script_in_local_container(inspectlocal):
    code_path = "../../src/mlmax/train.py"
    train_data_path = "opt/ml/processing/train/"
    test_data_path = "opt/ml/processing/test/"

    sklearn = SKLearn(
        entry_point=code_path,
        role=role,
        py_version="py3",
        framework_version="0.20.0",
        instance_type="local",
        hyperparameters={"inspect": True if inspectlocal else None},
    )
    sklearn.fit(
        {
            "train": "file://" + train_data_path,
            "test": "file://" + test_data_path
        },
        wait=True,
    )
Example #13
def sklearn_preprocessor(sagemaker_role_arn, sagemaker_session):
    script_path = os.path.join(DATA_DIR, 'one_p_mnist',
                               'sklearn_mnist_preprocessor.py')
    sklearn_preprocessor = SKLearn(
        framework_version='0.20.0',
        py_version='py3',
        entry_point=script_path,
        role=sagemaker_role_arn,
        instance_type="ml.m5.large",
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
    )
    return sklearn_preprocessor
Example #14
def train():
    try:
        # Create a sagemaker.sklearn.SKLearn estimator
        aws_sklearn = SKLearn(entry_point=TRAIN_SCRIPT,
                              source_dir=SOURCE,
                              train_instance_type='ml.m4.xlarge',
                              role=ROLE)

        # Call fit on the SKLearn estimator, which uses our Python script to train the model
        aws_sklearn.fit({'train': TRAIN_DATA})

        # Deploy the model created in the previous step and create an endpoint
        aws_sklearn_predictor = aws_sklearn.deploy(instance_type='ml.m4.xlarge',
                                                   initial_instance_count=1)

    except Exception as e:
        return e
    else:
        return 'success'
Example #15
    def test_sagemaker_transform_step_successfully(self, m_default_bucket):

        m_default_bucket.return_value = "sagemaker-bucket-name"

        with DataJobStack(scope=self.app, id="some-stack", stage="stg") as djs:
            transformer = Transformer(
                model_name="some-model",
                instance_count=1,
                instance_type="ml.t2.medium",
                sagemaker_session=self.sagemaker_session,
            )

            transform_step = TransformStep(
                datajob_stack=djs,
                name="transform-job",
                transformer=transformer,
                data="s3://some-bucket/some-data.csv",
            )

            estimator = SKLearn(
                entry_point=str(
                    pathlib.Path(current_dir, "resources", "train.py")),
                train_instance_type="ml.m5.xlarge",
                role=self.role,
                framework_version="0.20.0",
                py_version="py3",
                sagemaker_session=self.sagemaker_session,
            )

            tuner = HyperparameterTuner(
                estimator=estimator,
                hyperparameter_ranges={
                    "alpha": ContinuousParameter(0.0001, 0.05)
                },
                objective_metric_name="rmse",
            )

            tuner_step = TuningStep(
                datajob_stack=djs,
                name="tuning-step",
                tuner=tuner,
                data="s3://some-bucket/some-data.csv",
            )

            with StepfunctionsWorkflow(djs, "sequential") as sfn_workflow:
                transform_step >> tuner_step
Example #16
def define_training_pipeline(
    sm_role,
    workflow_execution_role,
    training_pipeline_name,
    return_yaml=True,
    dump_yaml_file="templates/sagemaker_training_pipeline.yaml",
    kms_key_id=None,
):
    """
    Return YAML definition of the training pipeline, which consists of multiple
    Amazon StepFunction steps

    sm_role:                    ARN of the SageMaker execution role
    workflow_execution_role:    ARN of the StepFunction execution role
    return_yaml:                Return YAML representation or not, if False,
                                it returns an instance of
                                    `stepfunctions.workflow.WorkflowObject`
    dump_yaml_file:             If not None, a YAML file will be generated at
                                    this file location

    """

    # Pass required parameters dynamically for each execution using placeholders.
    execution_input = ExecutionInput(
        schema={
            "InputDataURL": str,
            "PreprocessingJobName": str,
            "PreprocessingCodeURL": str,
            "TrainingJobName": str,
            # Prevent sagemaker config hardcode sagemaker_submit_directory in
            # workflow definition
            "SMSubmitDirURL": str,
            # Prevent sagemaker config hardcode sagemaker_region in workflow definition
            "SMRegion": str,
            "EvaluationProcessingJobName": str,
            "EvaluationCodeURL": str,
            "EvaluationResultURL": str,
            "PreprocessedTrainDataURL": str,
            "PreprocessedTestDataURL": str,
            "PreprocessedModelURL": str,
            "SMOutputDataURL": str,
            "SMDebugOutputURL": str,
        })
    """
    Data pre-processing and feature engineering
    """
    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )

    # Create ProcessingInputs and ProcessingOutputs objects for Inputs and
    # Outputs respectively for the SageMaker Processing Job
    inputs = [
        ProcessingInput(
            source=execution_input["InputDataURL"],
            destination="/opt/ml/processing/input",
            input_name="input-1",
        ),
        ProcessingInput(
            source=execution_input["PreprocessingCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs = [
        ProcessingOutput(
            source="/opt/ml/processing/train",
            destination=execution_input["PreprocessedTrainDataURL"],
            output_name="train_data",
        ),
        ProcessingOutput(
            source="/opt/ml/processing/test",
            destination=execution_input["PreprocessedTestDataURL"],
            output_name="test_data",
        ),
        ProcessingOutput(
            source="/opt/ml/processing/model",
            destination=execution_input["PreprocessedModelURL"],
            output_name="proc_model",
        ),
    ]

    processing_step = ProcessingStep(
        "SageMaker pre-processing step",
        processor=sklearn_processor,
        job_name=execution_input["PreprocessingJobName"],
        inputs=inputs,
        outputs=outputs,
        container_arguments=[
            "--train-test-split-ratio", "0.2", "--mode", "train"
        ],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocessing.py",
        ],
        kms_key_id=kms_key_id,
    )
    """
    Training using the pre-processed data
    """
    sklearn = SKLearn(
        entry_point="../../src/mlmax/train.py",
        train_instance_type="ml.m5.xlarge",
        role=sm_role,
        py_version="py3",
        framework_version="0.20.0",
        output_kms_key=kms_key_id,
    )

    training_step = MLMaxTrainingStep(
        "SageMaker Training Step",
        estimator=sklearn,
        job_name=execution_input["TrainingJobName"],
        train_data=execution_input["PreprocessedTrainDataURL"],
        test_data=execution_input["PreprocessedTestDataURL"],
        sm_submit_url=execution_input["SMSubmitDirURL"],
        sm_region=execution_input["SMRegion"],
        sm_output_data=execution_input["SMOutputDataURL"],
        sm_debug_output_data=execution_input["SMDebugOutputURL"],
        wait_for_completion=True,
    )
    """
    Model evaluation
    """
    # Create input and output objects for Model Evaluation ProcessingStep.
    inputs_evaluation = [
        ProcessingInput(
            source=execution_input["PreprocessedTestDataURL"],
            destination="/opt/ml/processing/test",
            input_name="input-1",
        ),
        ProcessingInput(
            source=training_step.get_expected_model().model_data,
            destination="/opt/ml/processing/model",
            input_name="input-2",
        ),
        ProcessingInput(
            source=execution_input["EvaluationCodeURL"],
            destination="/opt/ml/processing/input/code",
            input_name="code",
        ),
    ]

    outputs_evaluation = [
        ProcessingOutput(
            source="/opt/ml/processing/evaluation",
            destination=execution_input["EvaluationResultURL"],
            output_name="evaluation",
        ),
    ]

    model_evaluation_processor = SKLearnProcessor(
        framework_version="0.20.0",
        role=sm_role,
        instance_type="ml.m5.xlarge",
        instance_count=1,
        max_runtime_in_seconds=1200,
    )

    processing_evaluation_step = ProcessingStep(
        "SageMaker Processing Model Evaluation step",
        processor=model_evaluation_processor,
        job_name=execution_input["EvaluationProcessingJobName"],
        inputs=inputs_evaluation,
        outputs=outputs_evaluation,
        container_entrypoint=[
            "python3", "/opt/ml/processing/input/code/evaluation.py"
        ],
    )

    # Create Fail state to mark the workflow failed in case any of the steps fail.
    failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
        "ML Workflow failed", cause="SageMakerProcessingJobFailed")

    # Add the Error handling in the workflow
    catch_state_processing = stepfunctions.steps.states.Catch(
        error_equals=["States.TaskFailed"],
        next_step=failed_state_sagemaker_processing_failure,
    )
    processing_step.add_catch(catch_state_processing)
    processing_evaluation_step.add_catch(catch_state_processing)
    training_step.add_catch(catch_state_processing)

    # Create the Workflow
    workflow_graph = Chain(
        [processing_step, training_step, processing_evaluation_step])
    training_pipeline = Workflow(
        name=training_pipeline_name,
        definition=workflow_graph,
        role=workflow_execution_role,
    )
    return training_pipeline
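
# A hedged usage sketch for the pipeline factory above (the role ARNs are
# placeholders; a real execution must supply a value for every key declared in
# the ExecutionInput schema):
pipeline = define_training_pipeline(
    sm_role="arn:aws:iam::111122223333:role/SageMakerExecutionRole",
    workflow_execution_role="arn:aws:iam::111122223333:role/StepFunctionsRole",
    training_pipeline_name="mlmax-training-pipeline",
    return_yaml=False,
)
pipeline.create()                 # registers the Step Functions state machine
# pipeline.execute(inputs={...})  # then start an execution with concrete URLs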
# You can import an estimator with one of the following commands:
# ```
# from sagemaker.sklearn.estimator import SKLearn
# ```
# ```
# from sagemaker.pytorch import PyTorch
# ```

# In[37]:

# your import and estimator code, here
from sagemaker.sklearn.estimator import SKLearn

estimator = SKLearn(entry_point="train.py",
                    source_dir="source_sklearn",
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge')

# ## Exercise: Train the estimator
#
# Train the estimator on the training data stored in S3. This code should create a training job that you can monitor in the SageMaker console.

# In[38]:

get_ipython().run_cell_magic(
    'time', '',
    "\n# Train your estimator on S3 training data\n\nestimator.fit({'train': input_data})"
)

# ## Exercise: Deploy the trained model
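
# A minimal deployment sketch for the exercise above (the instance type is an
# assumption; deploy() creates a real-time endpoint and returns a predictor):
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.t2.medium')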
Example #18
# TESTING: Confirm that data is in S3 bucket
# empty_check = []
# for obj in boto3.resource('s3').Bucket(bucket).objects.all():
#     empty_check.append(obj.key)
#     print(obj.key)

# assert len(empty_check) !=0, 'S3 bucket is empty.'
# print('Test passed!')

# Specify an output path
output_path = 's3://{}/{}'.format(bucket, prefix)

estimator = SKLearn(
    entry_point='train.py',
    source_dir='src',
    role=role,
    framework_version="0.23-1",
    py_version="py3",
    instance_count=1,
    instance_type='ml.c4.xlarge',
    sagemaker_session=sagemaker_session,
    output_path=output_path,
)

# Train your estimator on S3 training data
estimator.fit({'train': input_data})

# deploy your model to create a predictor
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.t2.medium')
def test_jumpstart_sklearn_image_uri(patched_get_model_specs, session):

    patched_get_model_specs.side_effect = get_prototype_model_spec

    model_id, model_version = "sklearn-classification-linear", "*"
    instance_type = "ml.m2.xlarge"
    region = "us-west-2"

    model_specs = accessors.JumpStartModelsAccessor.get_model_specs(
        region, model_id, model_version)

    # inference
    uri = image_uris.retrieve(
        framework=None,
        region=region,
        image_scope="inference",
        model_id=model_id,
        model_version=model_version,
        instance_type=instance_type,
    )

    framework_class_uri = SKLearnModel(
        role="mock_role",
        model_data="mock_data",
        entry_point="mock_entry_point",
        framework_version=model_specs.hosting_ecr_specs.framework_version,
        py_version=model_specs.hosting_ecr_specs.py_version,
        sagemaker_session=session,
    ).serving_image_uri(region, instance_type)

    assert uri == framework_class_uri
    assert (
        uri ==
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3"
    )

    # training
    uri = image_uris.retrieve(
        framework=None,
        region=region,
        image_scope="training",
        model_id=model_id,
        model_version=model_version,
        instance_type=instance_type,
    )

    framework_class_uri = SKLearn(
        role="mock_role",
        entry_point="mock_entry_point",
        framework_version=model_specs.training_ecr_specs.framework_version,
        py_version=model_specs.training_ecr_specs.py_version,
        instance_type=instance_type,
        instance_count=1,
        image_uri_region=region,
        sagemaker_session=session,
    ).training_image_uri(region=region)

    assert uri == framework_class_uri
    assert (
        uri ==
        "246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3"
    )

    with pytest.raises(ValueError):
        image_uris.retrieve(
            framework=None,
            region="us-west-2",
            image_scope="training",
            model_id=model_id,
            model_version=model_version,
            instance_type="ml.p2.xlarge",
        )
"""
NOTE: You can not execute this file as it required AWS creds
"""
import json
import boto3
from sagemaker.sklearn.estimator import SKLearn

if __name__ == '__main__':
    role = '<Enter role>'
    aws_sklearn = SKLearn(entry_point='aws_main.py',
                          train_instance_type='ml.m4.xlarge',
                          role=role,
                          framework_version="0.23-1",
                          py_version="py3")

    aws_sklearn.fit({'train': 's3://mymlflowbucket/testdata.csv'})

    aws_sklearn_predictor = aws_sklearn.deploy(instance_type='ml.m4.xlarge',
                                               initial_instance_count=1)

    print(aws_sklearn_predictor.endpoint)

    # Testing
    runtime = boto3.client('sagemaker-runtime')

    input = {
        'features': [{
            'product': 1704,
            'amount': 1.0,
            'price': 50.748000000000005,
            'unit': -1,
import config

# Get the working path of script
p = abspath(getsourcefile(lambda: 0))
p = p.rsplit('/', 1)[0]
os.chdir(p)
print('Working Directory is: %s' % os.getcwd())

model_name = 'rf'

FRAMEWORK_VERSION = '0.23-1'  # framework version
role = config.aws_role  # get execution role
aws_sklearn = SKLearn(
    entry_point=p + '/model_scripts_aws/' + model_name +
    '.py',  # change script name for different model
    train_instance_type='ml.m4.2xlarge',
    framework_version=FRAMEWORK_VERSION,
    base_job_name=config.job_name + model_name,  # change for any name
    role=role
    # source_dir='./',
    # requirements_file='requirements.txt'
)

# Send model to train
aws_sklearn.fit({'train': config.train_path,
                 'test': config.test_path},
                wait=False)
Example #22
    def test_sagemaker_services_successfully(self, m_default_bucket):

        m_default_bucket.return_value = "sagemaker-bucket-name"

        with DataJobStack(scope=self.app, id="some-stack", stage="stg") as djs:

            processor = SKLearnProcessor(
                framework_version="0.23-1",
                role=self.role,
                instance_type="local",
                instance_count=1,
                sagemaker_session=self.sagemaker_session,
            )

            processing_step = ProcessingStep(
                datajob_stack=djs,
                name="processing-job",
                processor=processor,
            )

            estimator = SKLearn(
                entry_point=str(
                    pathlib.Path(current_dir, "resources", "train.py")),
                train_instance_type="ml.m5.xlarge",
                role=self.role,
                framework_version="0.20.0",
                py_version="py3",
                sagemaker_session=self.sagemaker_session,
            )

            training_step = TrainingStep(
                datajob_stack=djs,
                name="training-job",
                estimator=estimator,
            )

            model_step = ModelStep(
                datajob_stack=djs,
                name="model-step",
                model=training_step.sfn_task.get_expected_model(),
            )

            endpoint_config_step = EndpointConfigStep(
                datajob_stack=djs,
                name="endpoint-config-step",
                model_name=model_step.model_name,
            )

            endpoint_step = EndpointStep(
                datajob_stack=djs,
                name="endpoint-step",
                endpoint_config_name=endpoint_config_step.name,
            )

            with StepfunctionsWorkflow(
                    djs, "sequential") as sfn_workflow_sequential:
                (processing_step >> training_step >> model_step >>
                 endpoint_config_step >> endpoint_step)

            with StepfunctionsWorkflow(djs,
                                       "parallel") as sfn_workflow_parallel:
                processing_step >> processing_step
                training_step >> training_step

        # check if we have the expected value for the execution input
        self.assertDictEqual(
            djs.execution_input.execution_input_schema,
            {
                "some-stack-stg-processing-job": str,
                "some-stack-stg-training-job": str,
                "some-stack-stg-model-step": str,
                "some-stack-stg-endpoint-config-step": str,
                "some-stack-stg-endpoint-step": str,
            },
        )
        # execution input is added to cloudformation output
        self.assertDictEqual(
            djs.outputs,
            {
                "DatajobExecutionInput":
                json.dumps([
                    "some-stack-stg-processing-job",
                    "some-stack-stg-training-job",
                    "some-stack-stg-model-step",
                    "some-stack-stg-endpoint-config-step",
                    "some-stack-stg-endpoint-step",
                ])
            },
        )
Example #23
# The code below does not work for uploading a Bunch object to S3 directly
# s3client.put_object(Body=sklearn.datasets.load_breast_cancer(),
#                     Bucket='sklearn-sagemaker-data',
#                     key='sklearn-sagemaker-data/breast-cancer/bc-data.Bunch')

# Create SageMaker estimators for the iris, breast cancer, and random-model scripts
from sagemaker.sklearn.estimator import SKLearn
FRAMEWORK_VERSION = "0.23-1"
script_path_iris = 'AWS_Sagemaker/sklearn_sagemaker_deploy/sklearn_script_iris.py'
script_path_breast_cancer = 'AWS_Sagemaker/sklearn_sagemaker_deploy/sklearn_script_breast_cancer.py'
sklearn_path_random_model = 'AWS_Sagemaker/sklearn_sagemaker_deploy/sklearn_script_random_model.py'

sklearn_estimator_iris = SKLearn(entry_point=script_path_iris,
                                 framework_version=FRAMEWORK_VERSION,
                                 instance_type="ml.c4.xlarge",
                                 role=SageMakerRole,
                                 sagemaker_session=sagemaker_session,
                                 hyperparameters={'max_leaf_nodes': 30})

sklearn_estimator_breast_cancer = SKLearn(
    entry_point=script_path_breast_cancer,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.c4.xlarge",
    role=SageMakerRole,
    sagemaker_session=sagemaker_session,
    hyperparameters={'max_leaf_nodes': 30})

sklearn_estimator_random_model = SKLearn(entry_point=sklearn_path_random_model,
                                         framework_version=FRAMEWORK_VERSION,
                                         instance_type="ml.c4.xlarge",
                                         role=SageMakerRole,
Example #24
    def __init__(
        self,
        name: str,
        sagemaker_session,
        role,
        model_data: str,
        entry_point: str,
        display_name: str = None,
        description: str = None,
        source_dir: str = None,
        dependencies: List = None,
        depends_on: Optional[List[Union[str, Step, "StepCollection"]]] = None,
        retry_policies: List[RetryPolicy] = None,
        subnets=None,
        security_group_ids=None,
        **kwargs,
    ):
        """Base class initializer.

        Args:
            name (str): The name of the training step.
            sagemaker_session (sagemaker.session.Session): Session object which manages
                    interactions with Amazon SageMaker APIs and any other AWS services needed. If
                    not specified, the estimator creates one using the default
                    AWS configuration chain.
            role (str): An AWS IAM role (either name or full ARN). The Amazon
                    SageMaker training jobs and APIs that create Amazon SageMaker
                    endpoints use this role to access training data and model
                    artifacts. After the endpoint is created, the inference code
                    might use the IAM role, if it needs to access an AWS resource.
            model_data (str): The S3 location of a SageMaker model data `.tar.gz` file.
            entry_point (str): Path (absolute or relative) to the local Python
                    source file which should be executed as the entry point to
                    inference. If ``source_dir`` is specified, then ``entry_point``
                    must point to a file located at the root of ``source_dir``.
                    If 'git_config' is provided, 'entry_point' should be
                    a relative location to the Python source file in the Git repo.

                    Example:
                        With the following GitHub repo directory structure:

                        >>> |----- README.md
                        >>> |----- src
                        >>>         |----- train.py
                        >>>         |----- test.py

                        You can assign entry_point='src/train.py'.
            display_name (str): The display name of this `_RepackModelStep` step (default: None).
            description (str): The description of this `_RepackModelStep` (default: None).
            source_dir (str): A relative location to a directory with other training
                or model hosting source code dependencies aside from the entry point
                file in the Git repo (default: None). Structure within this
                directory are preserved when training on Amazon SageMaker.
            dependencies (list[str]): A list of paths to directories (absolute
                    or relative) with any additional libraries that will be exported
                    to the container (default: []). The library folders will be
                    copied to SageMaker in the same folder where the entrypoint is
                    copied. If 'git_config' is provided, 'dependencies' should be a
                    list of relative locations to directories with any additional
                    libraries needed in the Git repo.

                    .. admonition:: Example

                        The following call

                        >>> Estimator(entry_point='train.py',
                        ...           dependencies=['my/libs/common', 'virtual-env'])

                        results in the following inside the container:

                        >>> $ ls

                        >>> opt/ml/code
                        >>>     |------ train.py
                        >>>     |------ common
                        >>>     |------ virtual-env

                    This is not supported with "local code" in Local Mode.
            depends_on (List[Union[str, Step, StepCollection]]): The list of `Step`/`StepCollection`
                names or `Step` instances or `StepCollection` instances that the current `Step`
                depends on (default: None).
            retry_policies (List[RetryPolicy]): The list of retry policies for the current step
                (default: None).
            subnets (list[str]): List of subnet ids. If not specified, the re-packing
                    job will be created without VPC config (default: None).
            security_group_ids (list[str]): List of security group ids. If not
                specified, the re-packing job will be created without VPC config (default: None).
            **kwargs: additional arguments for the repacking job.
        """
        self._model_data = model_data
        self.sagemaker_session = sagemaker_session
        self.role = role
        self._entry_point = entry_point
        self._entry_point_basename = os.path.basename(self._entry_point)
        self._source_dir = source_dir
        self._dependencies = dependencies

        # convert dependencies array into space-delimited string
        dependencies_hyperparameter = None
        if self._dependencies:
            dependencies_hyperparameter = " ".join(self._dependencies)

        # the real estimator and inputs
        repacker = SKLearn(
            framework_version=FRAMEWORK_VERSION,
            instance_type=INSTANCE_TYPE,
            entry_point=REPACK_SCRIPT,
            source_dir=self._source_dir,
            dependencies=self._dependencies,
            sagemaker_session=self.sagemaker_session,
            role=self.role,
            hyperparameters={
                "inference_script": self._entry_point_basename,
                "model_archive": self._model_data,
                "dependencies": dependencies_hyperparameter,
                "source_dir": self._source_dir,
            },
            subnets=subnets,
            security_group_ids=security_group_ids,
            **kwargs,
        )
        repacker.disable_profiler = True
        inputs = TrainingInput(self._model_data)

        # super!
        super(_RepackModelStep, self).__init__(
            name=name,
            display_name=display_name,
            description=description,
            depends_on=depends_on,
            retry_policies=retry_policies,
            estimator=repacker,
            inputs=inputs,
        )
Example #25
def train_deploy_model(
        keys,
        instance='ml.m4.xlarge',  # Don't change this!
        instance_count=1,  # Don't change this!
        model_path='tmp/model/model.py',
        key_bucket='tmp/train/embeddings',  # Was tmp/data/data.pickle; data.pickle is hardcoded inside the function
        update=True,  # This should always be True if there is an open endpoint
        hyperparms=None):
    """Train a SageMaker model and deploy it.

    Args:
        keys (str): Path to a JSON file with credential keys.
        instance (str): Instance type used to train and deploy the model.
        instance_count (int): Initial instance count for deploying the model.
        model_path (str): Directory path where the model is located.
        key_bucket (str): S3 key prefix under which the training data is uploaded.
        update (bool): Whether to update an existing endpoint on deploy.
        hyperparms (dict): Hyperparameters for the SVM.

    Returns:
        None. Prints the name of the updated endpoint.
    """
    with open(keys) as k:
        keys = json.load(k)

    session = boto3.session.Session(
        aws_access_key_id=keys["AWS_ACCESS_KEY_ID"],
        aws_secret_access_key=keys["AWS_SECRET_ACCESS_KEY"],
        region_name=keys["REGION_NAME"])

    #sagemaker_session = sagemaker.local.LocalSession(boto_session = session)
    sagemaker_session = sagemaker.Session(boto_session=session)
    if not hyperparms:
        print(model_path)
        sklearn = SKLearn(entry_point=model_path,
                          train_instance_type=instance,
                          role=keys["ROLE"],
                          sagemaker_session=sagemaker_session)
    else:
        print(model_path)
        sklearn = SKLearn(entry_point=model_path,
                          train_instance_type=instance,
                          role=keys["ROLE"],
                          sagemaker_session=sagemaker_session,
                          hyperparameters=hyperparms)

    ## Data for training
    inputs = sagemaker_session.upload_data(path='tmp/train/embeddings',
                                           key_prefix=key_bucket,
                                           bucket=keys["BUCKET_NAME"])
    ## Training the model
    sklearn.fit({'train': inputs})
    ## Deploying the model
    try:
        predictor = sklearn.deploy(initial_instance_count=instance_count,
                                   instance_type=instance,
                                   endpoint_name=keys["ENDPOINT_NAME"],
                                   update_endpoint=update)
    except Exception:
        print("The model was not deployed")

    return print("Endpoint updated: {}".format(keys["ENDPOINT_NAME"]))
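
# A hypothetical call to the helper above (the credentials file name and SVM
# hyperparameters are placeholders):
train_deploy_model(keys="credentials.json",
                   hyperparms={"C": 1.0, "kernel": "linear"})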
Example #26
def test_three_step_definition(
    sagemaker_session,
    region_name,
    role,
    script_dir,
    pipeline_name,
    athena_dataset_definition,
):
    framework_version = "0.20.0"
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.xlarge")
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    output_prefix = ParameterString(name="OutputPrefix",
                                    default_value="output")

    input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"

    sklearn_processor = SKLearnProcessor(
        framework_version=framework_version,
        instance_type=instance_type,
        instance_count=instance_count,
        base_job_name="test-sklearn",
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_process = ProcessingStep(
        name="my-process",
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(source=input_data,
                            destination="/opt/ml/processing/input"),
            ProcessingInput(dataset_definition=athena_dataset_definition),
        ],
        outputs=[
            ProcessingOutput(output_name="train_data",
                             source="/opt/ml/processing/train"),
            ProcessingOutput(
                output_name="test_data",
                source="/opt/ml/processing/test",
                destination=Join(
                    on="/",
                    values=[
                        "s3:/",
                        sagemaker_session.default_bucket(),
                        "test-sklearn",
                        output_prefix,
                        ExecutionVariables.PIPELINE_EXECUTION_ID,
                    ],
                ),
            ),
        ],
        code=os.path.join(script_dir, "preprocessing.py"),
    )

    sklearn_train = SKLearn(
        framework_version=framework_version,
        entry_point=os.path.join(script_dir, "train.py"),
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_train = TrainingStep(
        name="my-train",
        estimator=sklearn_train,
        inputs=TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.
            Outputs["train_data"].S3Output.S3Uri),
    )

    model = Model(
        image_uri=sklearn_train.image_uri,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_model = CreateModelStep(
        name="my-model",
        model=model,
        inputs=model_inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_type, instance_count, output_prefix],
        steps=[step_process, step_train, step_model],
        sagemaker_session=sagemaker_session,
    )

    definition = json.loads(pipeline.definition())
    assert definition["Version"] == "2020-12-01"

    assert set(tuple(param.items())
               for param in definition["Parameters"]) == set([
                   tuple({
                       "Name": "InstanceType",
                       "Type": "String",
                       "DefaultValue": "ml.m5.xlarge"
                   }.items()),
                   tuple({
                       "Name": "InstanceCount",
                       "Type": "Integer",
                       "DefaultValue": 1
                   }.items()),
                   tuple({
                       "Name": "OutputPrefix",
                       "Type": "String",
                       "DefaultValue": "output"
                   }.items()),
               ])

    steps = definition["Steps"]
    assert len(steps) == 3

    names_and_types = []
    processing_args = {}
    training_args = {}
    for step in steps:
        names_and_types.append((step["Name"], step["Type"]))
        if step["Type"] == "Processing":
            processing_args = step["Arguments"]
        if step["Type"] == "Training":
            training_args = step["Arguments"]
        if step["Type"] == "Model":
            model_args = step["Arguments"]

    assert set(names_and_types) == set([
        ("my-process", "Processing"),
        ("my-train", "Training"),
        ("my-model", "Model"),
    ])

    assert processing_args["ProcessingResources"]["ClusterConfig"] == {
        "InstanceType": {
            "Get": "Parameters.InstanceType"
        },
        "InstanceCount": {
            "Get": "Parameters.InstanceCount"
        },
        "VolumeSizeInGB": 30,
    }

    assert training_args["ResourceConfig"] == {
        "InstanceCount": 1,
        "InstanceType": {
            "Get": "Parameters.InstanceType"
        },
        "VolumeSizeInGB": 30,
    }
    assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] == {
            "Get":
            "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri"
        }
    assert model_args["PrimaryContainer"]["ModelDataUrl"] == {
        "Get": "Steps.my-train.ModelArtifacts.S3ModelArtifacts"
    }
    try:
        response = pipeline.create(role)
        create_arn = response["PipelineArn"]
        assert re.match(
            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
            create_arn,
        )
    finally:
        try:
            pipeline.delete()
        except Exception:
            pass
Example #27
# -*- coding: utf-8 -*-

# Deploy the model
from sagemaker.sklearn.estimator import SKLearn

role = 'SageMakerFullAccess_sklearn_api_test'

# Create the SKLearn Object by directing it to the aws_sklearn_main.py script
aws_sklearn = SKLearn(entry_point='aws_sklearn_main.py',
                      train_instance_type='ml.m4.xlarge',
                      role=role)

# Train the model by passing the path to the S3 bucket with the training data
aws_sklearn.fit({'train': 's3://replace-with-your-bucket-name/'})

# Deploy model
aws_sklearn_predictor = aws_sklearn.deploy(instance_type='ml.t2.medium',
                                           initial_instance_count=1)

# Print the endpoint to test in the next step
print(aws_sklearn_predictor.endpoint)
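
# A hedged sketch of that test (the CSV payload format is an assumption that
# depends on the inference script):
import boto3

runtime = boto3.client('sagemaker-runtime')
response = runtime.invoke_endpoint(
    EndpointName=aws_sklearn_predictor.endpoint,  # endpoint printed above
    ContentType='text/csv',
    Body='1.0,2.0,3.0',  # assumed single feature row
)
print(response['Body'].read())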

# Uncomment and run to terminate the endpoint after you are finished
# aws_sklearn_predictor.delete_endpoint()
Example #28
# Send data to S3; SageMaker will take the training data from S3
trainpath = sess.upload_data(path='train.csv',
                             bucket=bucket,
                             key_prefix=prefix)

testpath = sess.upload_data(path='test.csv', bucket=bucket, key_prefix=prefix)

sklearn_estimator = SKLearn(entry_point='script.py',
                            role=get_execution_role(),
                            train_instance_count=1,
                            train_instance_type='ml.m4.xlarge',
                            framework_version='0.20.0',
                            metric_definitions=[{
                                'Name': 'median-AE',
                                'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"
                            }],
                            hyperparameters={
                                'n-estimators': 100,
                                'min-samples-leaf': 2,
                                'target': 'churn'
                            })
sklearn_estimator.fit({'train': trainpath, 'test': testpath}, wait=True)

# And now we are ready to host the model

# In[ ]:

sm_boto3 = boto3.client('sagemaker')
artifact = sm_boto3.describe_training_job(
Example #29
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) != 0, 'S3 bucket is empty.'
print('Test passed!')

#!pygmentize plagarism_detection/plagarism_train.py

# Define SKlearn estimator
from sagemaker.sklearn.estimator import SKLearn

estimator = SKLearn(entry_point="train.py",
                    source_dir="source_sklearn",
                    role=role,
                    instance_count=1,
                    instance_type='ml.c4.xlarge',
                    py_version='py3',
                    framework_version='0.23-1')

#%%time

# Train your estimator on S3 training data
estimator.fit({'train': input_data})

# uncomment, if needed
# from sagemaker.pytorch import PyTorchModel

# deploy your model to create a predictor
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.t2.medium')
Example #30
                                 key_prefix='sagemaker/sklearncontainer')

    testpath = sess.upload_data(path='boston_test.csv',
                                bucket=bucket,
                                key_prefix='sagemaker/sklearncontainer')

    sklearn_estimator = SKLearn(
        entry_point='train.py',
        source_dir=os.path.abspath(os.path.dirname(__file__)),
        role=sm_role,
        train_instance_count=1,
        train_instance_type='ml.c5.xlarge',
        framework_version='0.20.0',
        base_job_name='rf-scikit',
        metric_definitions=[{
            'Name': 'median-AE',
            'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"
        }],
        hyperparameters={
            'n-estimators': 100,
            'min-samples-leaf': 3,
            'features':
            'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
            'target': 'target'
        })

    if args.tune:
        tune_job(trainpath, testpath, sklearn_estimator)
    else:
        # launch training job, with asynchronous call
        sklearn_estimator.fit({