def test_attach_wrong_framework(sagemaker_session):
    rjd = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py3-cpu:1.0.4",
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "checkpoint_path": '"s3://other/1508872349"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=rjd
    )

    with pytest.raises(ValueError) as error:
        SKLearn.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
def test_attach_custom_image(sagemaker_session):
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/my_custom_sklearn_image:latest"
    returned_job_description = {
        "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image},
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training": '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
        "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description
    )

    estimator = SKLearn.attach(training_job_name="neo", sagemaker_session=sagemaker_session)
    assert estimator.image_uri == training_image
    assert estimator.training_image_uri() == training_image

def test_attach_deploy(sklearn_training_job, sagemaker_session):
    endpoint_name = "test-sklearn-attach-deploy-{}".format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = SKLearn.attach(sklearn_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
Example #4
def test_async_fit(
    sagemaker_session,
    cpu_instance_type,
    sklearn_latest_version,
    sklearn_latest_py_version,
):
    endpoint_name = "test-sklearn-attach-deploy-{}".format(
        sagemaker_timestamp())

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(
            sagemaker_session,
            cpu_instance_type,
            sklearn_version=sklearn_latest_version,
            wait=False,
        )

        print("Waiting to re-attach to the training job: %s" %
              training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = SKLearn.attach(training_job_name=training_job_name,
                                   sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1,
                                     cpu_instance_type,
                                     endpoint_name=endpoint_name)
        _predict_and_assert(predictor)

def test_attach(sagemaker_session, sklearn_version):
    training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-{}".format(
        sklearn_version, PYTHON_VERSION)
    returned_job_description = {
        "AlgorithmSpecification": {
            "TrainingInputMode": "File",
            "TrainingImage": training_image
        },
        "HyperParameters": {
            "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
            "sagemaker_program": '"iris-dnn-classifier.py"',
            "sagemaker_s3_uri_training":
            '"sagemaker-3/integ-test-data/tf_iris"',
            "sagemaker_enable_cloudwatch_metrics": "false",
            "sagemaker_container_log_level": '"logging.INFO"',
            "sagemaker_job_name": '"neo"',
            "training_steps": "100",
            "sagemaker_region": '"us-west-2"',
        },
        "RoleArn": "arn:aws:iam::366:role/SageMakerRole",
        "ResourceConfig": {
            "VolumeSizeInGB": 30,
            "InstanceCount": 1,
            "InstanceType": "ml.c4.xlarge",
        },
        "StoppingCondition": {
            "MaxRuntimeInSeconds": 24 * 60 * 60
        },
        "TrainingJobName": "neo",
        "TrainingJobStatus": "Completed",
        "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
        "OutputDataConfig": {
            "KmsKeyId": "",
            "S3OutputPath": "s3://place/output/neo"
        },
        "TrainingJobOutput": {
            "S3TrainingJobOutput": "s3://here/output.tar.gz"
        },
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name="describe_training_job", return_value=returned_job_description)

    estimator = SKLearn.attach(training_job_name="neo",
                               sagemaker_session=sagemaker_session)
    assert estimator._current_job_name == "neo"
    assert estimator.latest_training_job.job_name == "neo"
    assert estimator.py_version == PYTHON_VERSION
    assert estimator.framework_version == sklearn_version
    assert estimator.role == "arn:aws:iam::366:role/SageMakerRole"
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == "File"
    assert estimator.base_job_name == "neo"
    assert estimator.output_path == "s3://place/output/neo"
    assert estimator.output_kms_key == ""
    assert estimator.hyperparameters()["training_steps"] == "100"
    assert estimator.source_dir == "s3://some/sourcedir.tar.gz"
    assert estimator.entry_point == "iris-dnn-classifier.py"

def test_attach(sagemaker_session, sklearn_version):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:{}-cpu-{}'.format(
        sklearn_version, PYTHON_VERSION)
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training':
            '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    estimator = SKLearn.attach(training_job_name='neo',
                               sagemaker_session=sagemaker_session)
    assert estimator._current_job_name == 'neo'
    assert estimator.latest_training_job.job_name == 'neo'
    assert estimator.py_version == PYTHON_VERSION
    assert estimator.framework_version == sklearn_version
    assert estimator.role == 'arn:aws:iam::366:role/SageMakerRole'
    assert estimator.train_instance_count == 1
    assert estimator.train_max_run == 24 * 60 * 60
    assert estimator.input_mode == 'File'
    assert estimator.base_job_name == 'neo'
    assert estimator.output_path == 's3://place/output/neo'
    assert estimator.output_kms_key == ''
    assert estimator.hyperparameters()['training_steps'] == '100'
    assert estimator.source_dir == 's3://some/sourcedir.tar.gz'
    assert estimator.entry_point == 'iris-dnn-classifier.py'
Example #7
def test_attach_wrong_framework(sagemaker_session):
    rjd = {
        'AlgorithmSpecification': {
            'TrainingInputMode':
            'File',
            'TrainingImage':
            '1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py3-cpu:1.0.4'
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'checkpoint_path': '"s3://other/1508872349"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=rjd)

    with pytest.raises(ValueError) as error:
        SKLearn.attach(training_job_name='neo',
                       sagemaker_session=sagemaker_session)
    assert "didn't use image for requested framework" in str(error)
Example #8
def build_and_train_estimator(data_desc: str,
                              classifier: str,
                              count: int = 1,
                              wait: bool = False,
                              **hyperparams: object) -> Tuple[SKLearn, str]:
    """
    Creates a SageMaker training job, or attaches to an existing job with the
    same generated name.

    :param data_desc: name of the data to use (must be unique)
    :param classifier: name of the sklearn classifier
    :param count: cache-busting counter used to force a new job name
    :param wait: block until the job finishes; useful for debugging
    :param hyperparams: hyperparameters for the model
    :return: tuple of (estimator, training job name)
    """
    model_name = build_model_name(data_desc, classifier, hyperparams, count)
    print('model_name', model_name)

    # Check whether a model has already been built on this data;
    # if it has and the job is finished, attach to the existing job.
    try:
        import boto3
        client = boto3.client('sagemaker')
        response = client.describe_training_job(TrainingJobName=model_name)
        if wait or response['TrainingJobStatus'] in ['Completed', 'Failed']:
            return SKLearn.attach(model_name), model_name
        else:
            raise Warning(f'{model_name} isn\'t finished training yet')
    except ClientError:
        pass

    output_location = f's3://{bucket}/{S3_MODEL_DIR / data_desc}'
    estimator = SKLearn('train_and_deploy.py',
                        source_dir='sagemaker_container',
                        code_location=output_location,
                        output_path=output_location,
                        train_instance_type=TRAIN_INSTANCE,
                        framework_version='0.23-1',
                        role=role,
                        hyperparameters={
                            'classifier': classifier,
                            **hyperparams
                        })

    estimator.fit(f's3://{bucket}/{S3_FEATURE_DIR / data_desc}',
                  wait=wait,
                  job_name=model_name)

    return estimator, model_name
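
A minimal usage sketch for the helper above, assuming the module-level names it references (bucket, role, build_model_name, S3_MODEL_DIR, S3_FEATURE_DIR, TRAIN_INSTANCE) are already defined and the usual imports (boto3, botocore.exceptions.ClientError, typing.Tuple, sagemaker.sklearn.SKLearn) are in place; the dataset name and hyperparameters below are illustrative, not taken from the original module:

# Hypothetical call: train (or re-attach to) a random forest job on the
# "iris-features" dataset; "n_estimators" is forwarded to the training script.
estimator, model_name = build_and_train_estimator(
    "iris-features",
    "RandomForestClassifier",
    count=1,          # bump to force a brand-new training job
    wait=True,        # block until the job finishes (handy for debugging)
    n_estimators=100,
)
print("Trained or attached to job:", model_name)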
Example #9
def test_async_fit(sagemaker_session):
    endpoint_name = 'test-sklearn-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge",
                                                    sklearn_full_version=SKLEARN_VERSION, wait=False)

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = SKLearn.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
Example #10
def test_attach_custom_image(sagemaker_session):
    training_image = '1.dkr.ecr.us-west-2.amazonaws.com/my_custom_sklearn_image:latest'
    returned_job_description = {
        'AlgorithmSpecification': {
            'TrainingInputMode': 'File',
            'TrainingImage': training_image
        },
        'HyperParameters': {
            'sagemaker_submit_directory': '"s3://some/sourcedir.tar.gz"',
            'sagemaker_program': '"iris-dnn-classifier.py"',
            'sagemaker_s3_uri_training':
            '"sagemaker-3/integ-test-data/tf_iris"',
            'sagemaker_enable_cloudwatch_metrics': 'false',
            'sagemaker_container_log_level': '"logging.INFO"',
            'sagemaker_job_name': '"neo"',
            'training_steps': '100',
            'sagemaker_region': '"us-west-2"'
        },
        'RoleArn': 'arn:aws:iam::366:role/SageMakerRole',
        'ResourceConfig': {
            'VolumeSizeInGB': 30,
            'InstanceCount': 1,
            'InstanceType': 'ml.c4.xlarge'
        },
        'StoppingCondition': {
            'MaxRuntimeInSeconds': 24 * 60 * 60
        },
        'TrainingJobName': 'neo',
        'TrainingJobStatus': 'Completed',
        'TrainingJobArn': 'arn:aws:sagemaker:us-west-2:336:training-job/neo',
        'OutputDataConfig': {
            'KmsKeyId': '',
            'S3OutputPath': 's3://place/output/neo'
        },
        'TrainingJobOutput': {
            'S3TrainingJobOutput': 's3://here/output.tar.gz'
        }
    }
    sagemaker_session.sagemaker_client.describe_training_job = Mock(
        name='describe_training_job', return_value=returned_job_description)

    estimator = SKLearn.attach(training_job_name='neo',
                               sagemaker_session=sagemaker_session)
    assert estimator.image_name == training_image
    assert estimator.train_image() == training_image