def test_kmeans_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        kmeans = KMeans(
            role=ROLE,
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records)

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]
            ["S3Uri"],
        )
예제 #2
0
def test_kmeans(sagemaker_session):
    job_name = unique_name_from_base("kmeans")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

    predictor.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
        assert "Could not find model" in str(exception.value)
def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False, job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
예제 #4
0
def test_kmeans_serverless_inference(sagemaker_session, cpu_instance_type,
                                     training_set):
    job_name = unique_name_from_base("kmeans-serverless")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(
            serverless_inference_config=ServerlessInferenceConfig(),
            endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(
                ModelName=model.name)
            assert "Could not find model" in str(exception.value)
예제 #5
0
def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]),
                   wait=False,
                   job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
예제 #6
0
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense='True',
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
예제 #7
0
def test_kmeans(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense='True',
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('kmeans')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_kmeans_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(train_set[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records)

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]
            ["S3Uri"],
        )