Example No. 1
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges, objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name: ' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
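
The snippets on this page omit their imports and the integration-test helpers they depend on (module constants such as DATA_DIR and TUNING_DEFAULT_TIMEOUT_MINUTES, plus context managers like timeout and timeout_and_delete_endpoint_by_name). A minimal sketch of what the tuning example above assumes; the timeout helper here is a hypothetical stand-in for the test suite's own, while the KMeans and tuner classes are the SageMaker Python SDK imports:

import gzip
import os
import pickle
import signal
import sys
import time
from contextlib import contextmanager

from sagemaker import KMeans
from sagemaker.tuner import CategoricalParameter, HyperparameterTuner, IntegerParameter


@contextmanager
def timeout(minutes=0, seconds=0):
    # Hypothetical stand-in for the test suite's timeout helper:
    # abort the enclosed block once the limit expires (Unix-only, uses SIGALRM).
    limit = int(minutes * 60 + seconds)

    def _handler(signum, frame):
        raise TimeoutError("block did not finish within {} seconds".format(limit))

    previous = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(limit)
    try:
        yield
    finally:
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous)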
Example No. 2
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=20):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges, objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name: ' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
Example No. 3
def test_kmeans(sagemaker_session):
    job_name = unique_name_from_base("kmeans")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

    predictor.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
    assert "Could not find model" in str(exception.value)
def test_kmeans_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        kmeans = KMeans(
            role=ROLE,
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records)

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]
            ["S3Uri"],
        )
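
The _build_airflow_workflow and _assert_that_s3_url_contains_data helpers are not shown in this snippet (nor are the ROLE, SINGLE_INSTANCE_COUNT, and AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS constants). A rough sketch of what the two helpers might look like, assuming the workflow config comes from the SDK's sagemaker.workflow.airflow.training_config and that "contains data" simply means at least one object exists under the prefix:

from urllib.parse import urlparse

from sagemaker.workflow import airflow as sm_airflow


def _build_airflow_workflow(estimator, instance_type, inputs=None):
    # instance_type is accepted for signature compatibility; training_config
    # reads the instance settings from the estimator itself.
    return sm_airflow.training_config(estimator=estimator, inputs=inputs)


def _assert_that_s3_url_contains_data(sagemaker_session, s3_url):
    # Assert that at least one object exists under the given S3 prefix.
    parsed = urlparse(s3_url)
    s3 = sagemaker_session.boto_session.client("s3")
    response = s3.list_objects_v2(Bucket=parsed.netloc, Prefix=parsed.path.lstrip("/"))
    assert response["KeyCount"] > 0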
Example No. 5
def test_kmeans(sagemaker_session):
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('kmeans')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    tags = [{"Key": "some-tag", "Value": "value-for-tag"}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
    estimator._enable_network_isolation = True

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix
    )

    transformer = estimator.transformer(1, cpu_instance_type, tags=tags)
    transformer.transform(transform_input, content_type="text/csv")

    with timeout_and_delete_model_with_transformer(
        transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES
    ):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name
        )
        assert model_desc["EnableNetworkIsolation"]

        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc["ModelArn"]
        )["Tags"]
        assert tags == model_tags
Example No. 7
def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False, job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_async_kmeans():

    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(
            region_name=REGION))
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=35):
        estimator = KMeans.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1,
                                 'ml.c4.xlarge',
                                 endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example No. 9
def test_kmeans_serverless_inference(sagemaker_session, cpu_instance_type,
                                     training_set):
    job_name = unique_name_from_base("kmeans-serverless")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(
            serverless_inference_config=ServerlessInferenceConfig(),
            endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(
                ModelName=model.name)
        assert "Could not find model" in str(exception.value)
Example No. 10
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense='True',
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example No. 11
def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]),
                   wait=False,
                   job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example No. 12
def test_transform_byo_estimator(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
    tags = [{'Key': 'some-tag', 'Value': 'value-for-tag'}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(
                        sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base('test-kmeans-attach')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    estimator = Estimator.attach(training_job_name=job_name,
                                 sagemaker_session=sagemaker_session)

    transformer = estimator.transformer(1, 'ml.m4.xlarge', tags=tags)
    transformer.transform(transform_input, content_type='text/csv')

    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc['ModelArn'])['Tags']
        assert tags == model_tags
Example No. 13
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input, cpu_instance_type)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
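
_create_transformer_and_transform_job is defined elsewhere in the test module. A plausible minimal sketch, assuming it simply builds a single-instance transformer from the trained estimator and starts a CSV transform job (the default instance type is an assumption so that the two-argument call in the next example also works):

def _create_transformer_and_transform_job(estimator, transform_input, instance_type="ml.c4.xlarge"):
    # Build a transformer from the trained estimator and kick off the batch transform.
    transformer = estimator.transformer(1, instance_type)
    transformer.transform(transform_input, content_type="text/csv")
    return transformer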
Example No. 14
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(
                        sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
Example No. 15
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(DATA_DIR, "one_p_mnist",
                                        "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input, cpu_instance_type)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
Example No. 16
def test_record_set(sagemaker_session, cpu_instance_type):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
    )
    record_set = kmeans.record_set(datasets.one_p_mnist()[0][:100],
                                   encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client("s3")
    head = s3_client.head_object(Bucket=parsed_url.netloc,
                                 Key=parsed_url.path.lstrip("/"))
    assert head["ServerSideEncryption"] == "AES256"
Example No. 17
def test_record_set(sagemaker_session):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
    with gzip.open(data_path, 'rb') as file_object:
        train_set, _, _ = pickle.load(file_object, **pickle_args)
    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session)
    record_set = kmeans.record_set(train_set[0][:100], encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client('s3')
    head = s3_client.head_object(Bucket=parsed_url.netloc,
                                 Key=parsed_url.path.lstrip('/'))
    assert head['ServerSideEncryption'] == 'AES256'
Example No. 18
def test_record_set(sagemaker_session):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    with gzip.open(data_path, "rb") as file_object:
        train_set, _, _ = pickle.load(file_object, **pickle_args)
    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="ml.c4.xlarge",
        k=10,
        sagemaker_session=sagemaker_session,
    )
    record_set = kmeans.record_set(train_set[0][:100], encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client("s3")
    head = s3_client.head_object(Bucket=parsed_url.netloc, Key=parsed_url.path.lstrip("/"))
    assert head["ServerSideEncryption"] == "AES256"
def test_kmeans_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(train_set[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records)

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]
            ["S3Uri"],
        )
Example No. 20
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge', k=10, sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(path=transform_input_path,
                                                           key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input)

    attached_transformer = Transformer.attach(transformer.latest_transform_job.name,
                                              sagemaker_session=sagemaker_session)
    attached_transformer.wait()
Example No. 21

if __name__ == "__main__":
    # get MNIST dataset
    train_set, valid_set, test_set = get_mnist_dataset()

    # create model using built-in k-means algorithm
    kmeans = KMeans(
        role=ROLE,
        train_instance_count=1,
        #train_instance_type='local',
        train_instance_type='ml.c4.4xlarge',
        output_path=OUTPUT_PATH,
        k=10)
    # train model
    kmeans.fit(kmeans.record_set(train_set[0]))

    # deploy model to endpoint
    kmeans_predictor = kmeans.deploy(initial_instance_count=2,
                                     instance_type='ml.m4.xlarge',
                                     endpoint_name=ENDPOINT_NAME)
    # test model
    input_set = test_set

    clustered_data = [[] for i in range(0, 10)]
    for i in range(0, len(input_set[0])):
        result = kmeans_predictor.predict(input_set[0][i].reshape(1, 784))[0]
        predicted_cluster = int(
            result.label['closest_cluster'].float32_tensor.values[0])
        clustered_data[predicted_cluster].append(i)
Example No. 22
NUM_CLUSTER = 8

kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path=output_path,
    k=NUM_CLUSTER
)

# ### EXERCISE: Create formatted, k-means training data
# In[50]:
# convert the transformed dataframe into record_set data
data4kmeans = counties_transformed.values.astype('float32')
data4kmeans = kmeans.record_set(data4kmeans)


# ### EXERCISE: Train the k-means model
# In[51]:
get_ipython().run_cell_magic('time', '', '# train kmeans\nkmeans.fit(data4kmeans)')

# ### EXERCISE: Deploy the k-means model
# In[52]:
get_ipython().run_cell_magic('time', '', "# deploy the model to create a predictor\nkmeans_predictor = kmeans.deploy(initial_instance_count=1,\n                                 instance_type='ml.t2.medium')")


# ### EXERCISE: Pass in the training data and assign predicted cluster labels
# In[54]:
# get the predicted clusters for all the kmeans training data
cluster_info = kmeans_predictor.predict(counties_transformed.values.astype('float32'))
Example No. 23
# NOTE: the opening of this KMeans(...) call is assumed; the snippet begins mid-call
kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type='ml.c4.xlarge',
                output_path=output_path, # using the same output path as was defined, earlier
                k=NUM_CLUSTERS)


# ### EXERCISE: Create formatted, k-means training data
# 
# Just as before, you should convert the `counties_transformed` df into a numpy array and then into a RecordSet. This is the required format for passing training data into a `KMeans` model.

# In[41]:


# convert the transformed dataframe into record_set data
#Solution
kmeans_train_data_np = counties_transformed.values.astype('float32')
kmeans_formatted_data = kmeans.record_set(kmeans_train_data_np)


# ### EXERCISE: Train the k-means model
# 
# Pass in the formatted training data and train the k-means model.

# In[42]:


get_ipython().run_cell_magic('time', '', '# train kmeans\n#Solution\nkmeans.fit(kmeans_formatted_data)')


# ### EXERCISE: Deploy the k-means model
# 
# Deploy the trained model to create a `kmeans_predictor`.
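
The deployment cell itself is not included in this snippet; a minimal sketch of what it would contain, mirroring the earlier deployment cell (the instance type is an assumption):

# deploy the trained k-means model to create a predictor (instance type assumed)
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.t2.medium')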
Example No. 24
def cluster_helper(role, sagemaker_session, bucket, local_data_folder, prefix, ticker):
  A_df = pd.read_pickle(local_data_folder + ticker + '.pkl')
  A_df.dropna(inplace=True)
  A_df.drop(columns=["Date"], inplace=True)

  # Normalize
  scaler = MinMaxScaler()

  Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
  X_df = A_df.drop(columns=["Label"]).astype('float64')

  X = scaler.fit_transform(X_df)
  Y = scaler.fit_transform(Y_df)

  # split data
  print("Splitting data")
  x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.33, random_state=1, shuffle=True)

  # clustering
  s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
  print("Clustering")
  kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type="ml.m4.xlarge",
                output_path=s3_output_folder,
                k=3)

  kmeans.fit(kmeans.record_set(pd.DataFrame(x_train).astype('float32').values))

  # deploy
  print("Deploying model", kmeans.model_data)
  kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")


  create_dir('{}s3/{}'.format(local_data_folder, ticker))

  # upload train and test data to S3
  dataset_with_cluster = pd.concat([pd.DataFrame(y_train, columns=["label"]).astype("float32"), \
            pd.DataFrame(x_train).astype("float32"),\
            clustering(x_train, kmeans_predictor)
            ], axis=1)
  dataset_with_cluster.to_csv('{}s3/{}/all-train.csv'.format(local_data_folder, ticker), header=False, index=False)
  # prepare cluster data sets    
  create_dir('{}s3/{}/train'.format(local_data_folder, ticker))
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/train/cluster-0".format(ticker), True, local_data_folder)
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/train/cluster-1".format(ticker), True, local_data_folder)
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/train/cluster-2".format(ticker), True, local_data_folder)

  # We have to predict the clusters for each of the test data sets so that we could use it for testing out next model
  dataset_with_cluster = pd.concat([pd.DataFrame(y_test, columns=["label"]).astype("float32"), \
            pd.DataFrame(x_test).astype("float32"),\
            clustering(x_test, kmeans_predictor)
            ], axis=1)
  dataset_with_cluster.to_csv(local_data_folder + 's3/{}/all-test.csv'.format(ticker), header=False, index=False)
  # # prepare cluster data sets    
#   create_dir('{}s3/{}/test'.format(local_data_folder, ticker))
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/test/cluster-0".format(ticker), False, local_data_folder)
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/test/cluster-1".format(ticker), False, local_data_folder)
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/test/cluster-2".format(ticker), False, local_data_folder)

  # delete endpoint
  kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)

  print('Completed clustering for', ticker)
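
Both this example and the next rely on an undefined clustering(...) helper that labels each row with its predicted cluster (the column is named "cat" here and "Cluster" in the next example). A minimal sketch, assuming it wraps the k-means predictor and returns a one-column DataFrame of integer cluster ids:

import pandas as pd


def clustering(features, predictor, column_name="cat"):
    # Predict the closest cluster for every row and return the ids as a one-column DataFrame.
    records = predictor.predict(features.astype("float32"))
    labels = [int(r.label["closest_cluster"].float32_tensor.values[0]) for r in records]
    return pd.DataFrame(labels, columns=[column_name])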
Example No. 25
def process(ticker, local_data_folder, bucket, role, prefix,
            sagemaker_session):
    df = pd.read_pickle('{}/{}.{}'.format(local_data_folder, ticker, 'pkl'))
    df.dropna(inplace=True)
    df.drop(columns=["Date"], inplace=True)
    df.loc[df.Label >= threshold, 'direction'] = BUY
    df.loc[df.Label <= -threshold, 'direction'] = SELL
    df.loc[(df.Label < threshold) & (df.Label > -threshold),
           'direction'] = NONE

    # Normalize
    scaler = MinMaxScaler()

    Y_df = pd.DataFrame(df["Label"]).astype('float64')
    X_df = df.drop(columns=["Label"]).astype('float64')

    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)

    X[:, X.shape[1] - 1] = X_df["direction"].to_numpy()

    #### split data
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.33,
                                                        random_state=1,
                                                        shuffle=True)

    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)

    # Remove direction column and train
    kmeans.fit(
        kmeans.record_set(x_train[:,
                                  0:x_train.shape[1] - 1].astype('float32')))

    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                     instance_type="ml.m4.xlarge")

    create_dir('{}/s3/{}'.format(local_data_folder, ticker))
    '''
        Label = Change in price(+ve, -ve, none)
        Direction = BUY, SELL, NONE
        Cluster = cluster_0, cluster_1, cluster_2
    '''
    # train data
    y_train_df = pd.DataFrame(y_train, columns=["Label"])
    x_train_df = pd.DataFrame(
        x_train,
        columns=['col-{}'.format(i)
                 for i in range(x_train.shape[1] - 1)] + ["direction"])
    dataset_with_cluster = pd.concat([y_train_df.astype("float32"), x_train_df.astype("float32"),\
            clustering(x_train_df.drop(columns=["direction"]).astype('float32').values, kmeans_predictor)
        ], axis=1)
    dataset_with_cluster.to_csv('{}/s3/{}/all-train.csv'.format(
        local_data_folder, ticker),
                                header=True,
                                index=False)

    # test data
    y_test_df = pd.DataFrame(y_test, columns=["Label"])
    x_test_df = pd.DataFrame(
        x_test,
        columns=['col-{}'.format(i)
                 for i in range(x_test.shape[1] - 1)] + ['direction'])
    pd.concat([y_test_df.astype("float32"), x_test_df.astype("float32")], axis=1)\
        .to_csv('{}/s3/{}/all-test.csv'.format(local_data_folder, ticker), header=True, index=False)

    # clean clustering end point
    kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)

    all_test_pred = pd.read_csv("{}/s3/{}/all-test.csv".format(
        local_data_folder, ticker)).dropna()
    all_train_pred = pd.read_csv("{}/s3/{}/all-train.csv".format(
        local_data_folder, ticker)).dropna()

    cluster0_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       0].drop(columns=["Cluster"])
    save_data(cluster0_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster0_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction"]).astype('float32').values)
    all_train_pred["cluster0_pred"] = estimator.predict(
        all_train_pred.drop(columns=["Label", "direction", "Cluster"]).astype(
            'float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    cluster1_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       1].drop(columns=["Cluster"])
    save_data(cluster1_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster1_pred"] = estimator.predict(
        all_test_pred.drop(columns=["Label", "direction", "cluster0_pred"
                                    ]).astype('float32').values)
    all_train_pred["cluster1_pred"] = estimator.predict(
        all_train_pred.drop(
            columns=["Label", "direction", "Cluster", "cluster0_pred"]).astype(
                'float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    cluster2_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       2].drop(columns=["Cluster"])
    save_data(cluster2_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster2_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction", "cluster0_pred", "cluster1_pred"
                     ]).astype('float32').values)
    all_train_pred["cluster2_pred"] = estimator.predict(
        all_train_pred.drop(columns=[
            "Label", "direction", "Cluster", "cluster0_pred", "cluster1_pred"
        ]).astype('float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    os.remove(local_data_folder + '/s3/' + ticker + '/train.csv')
    os.remove(local_data_folder + '/s3/' + ticker + '/validation.csv')

    all_buys = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == BUY].shape[0],
            cluster1_df[cluster1_df['direction'] == BUY].shape[0],
            cluster2_df[cluster2_df['direction'] == BUY].shape[0]
        ],
        columns=["BUY"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    all_sells = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == SELL].shape[0],
            cluster1_df[cluster1_df['direction'] == SELL].shape[0],
            cluster2_df[cluster2_df['direction'] == SELL].shape[0]
        ],
        columns=["SELL"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    all_nones = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == NONE].shape[0],
            cluster1_df[cluster1_df['direction'] == NONE].shape[0],
            cluster2_df[cluster2_df['direction'] == NONE].shape[0]
        ],
        columns=["NONE"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    cluster_selection_df = pd.concat([all_buys, all_sells, all_nones], axis=1)

    cluster_selection_index = cluster_selection_df.index
    buy_cluster_name = cluster_selection_index[
        cluster_selection_df['BUY'].values.argmax()]
    sell_cluster_name = cluster_selection_index[cluster_selection_df.drop(
        index=[buy_cluster_name])['SELL'].values.argmax()]
    none_cluster_name = cluster_selection_index[cluster_selection_df.drop(
        index=[buy_cluster_name, sell_cluster_name])['NONE'].values.argmax()]

    # Generate selected-cluster column based on max(cluster0, cluster1, cluster2)
    all_test_pred["selected-cluster"] = all_test_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)
    all_train_pred["selected-cluster"] = all_train_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)

    # convert selected-cluster to BUY, SELL, NONE
    all_test_pred.loc[all_test_pred["selected-cluster"] == buy_cluster_name,
                      "prediction"] = BUY
    all_test_pred.loc[all_test_pred["selected-cluster"] == sell_cluster_name,
                      "prediction"] = SELL
    all_test_pred.loc[all_test_pred["selected-cluster"] == none_cluster_name,
                      "prediction"] = NONE

    all_train_pred.loc[all_train_pred["selected-cluster"] == buy_cluster_name,
                       "prediction"] = BUY
    all_train_pred.loc[all_train_pred["selected-cluster"] == sell_cluster_name,
                       "prediction"] = SELL
    all_train_pred.loc[all_train_pred["selected-cluster"] == none_cluster_name,
                       "prediction"] = NONE

    # Bench mark results
    all_test_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_test_pred.shape[0])
    ]
    all_train_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_train_pred.shape[0])
    ]

    all_test_pred.to_csv('{}/s3/{}/all-test-pred.csv'.format(
        local_data_folder, ticker),
                         index=None)
    all_train_pred.to_csv('{}/s3/{}/all-train-pred.csv'.format(
        local_data_folder, ticker),
                          index=None)
    cluster_selection_df.to_csv('{}/s3/{}/cluster-selection.csv'.format(
        local_data_folder, ticker),
                                index=None)

    # remove NA
    all_test_pred = all_test_pred.dropna()
    all_train_pred = all_train_pred.dropna()

    # test accuracy
    test_accuracy = accuracy_score(all_test_pred["direction"],
                                   all_test_pred["prediction"],
                                   normalize=True)
    benchmark_test_accuracy = accuracy_score(
        all_test_pred["direction"],
        all_test_pred["random-prediction"],
        normalize=True)
    print('Test accuracy:', test_accuracy, ", Benchmark:",
          benchmark_test_accuracy)

    # train accuracy
    train_accuracy = accuracy_score(all_train_pred["direction"],
                                    all_train_pred["prediction"],
                                    normalize=True)
    benchmark_train_accuracy = accuracy_score(
        all_train_pred["direction"],
        all_train_pred["random-prediction"],
        normalize=True)
    print('Train accuracy:', train_accuracy, ", Benchmark:",
          benchmark_train_accuracy)

    accuracy_df = pd.DataFrame([
        ticker, test_accuracy, benchmark_test_accuracy, train_accuracy,
        benchmark_train_accuracy
    ]).T
    accuracy_df.columns = [
        "ticker", "test_accuracy", "benchmark_test_accuracy", "train_accuracy",
        "benchmark_train_accuracy"
    ]

    accuracy_file = "{}/accuracy.csv".format(local_data_folder)
    header = not os.path.exists(accuracy_file)
    accuracy_df.to_csv(accuracy_file, mode="a", header=header, index=False)
Example No. 26
news_df['vectors'] = news_df.words.progress_apply(gtext2vec)

## Clustering and generating scatter
X = np.concatenate(news_df['vectors'].values)

## run sagemaker kmeans
role = get_execution_role()
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)
kmeans.fit(kmeans.record_set(X))

## deploy sagemaker kmeans endpoint
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type="ml.t2.medium")
results = kmeans_predictor.predict(X)
news_df['cluster'] = [int(r.label['closest_cluster'].float32_tensor.values[0]) for r in results]

## Save News
news_df = news_df.drop(["ori_text", "words"], axis=1)
news_df.to_pickle('news_df.pkl')

## Save Model
import pickle
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(kmeans, file)
Example No. 27
data_path = "s3://ressonance/data/model_data/"
output_path = "s3://ressonance/models/"

# portfolio clustering

port_kmeans = KMeans(role=role,
                     train_instance_count=2,
                     train_instance_type="ml.c4.xlarge",
                     output_path=output_path + "portfolio",
                     k=5,
                     data_location=data_path + "portfolios.csv")

port_training = pd.read_csv("data/training_data/portfolios.csv")

port_kmeans.fit(port_kmeans.record_set(port_training.values.astype('float32')))
port_predictor = port_kmeans.deploy(initial_instance_count=1,
                                    instance_type="ml.m4.xlarge")

## Step 2: people
# Substituting portfolios


def sub_port(port):
    return port_predictor.predict(portfolio_processing(list(port)))


clis = None
clis_df = client_processing(clis)

clis_df.portfolio = sub_port(clis_df.portfolio)