def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        role = efs_fsx_setup["role_name"]
        subnets = [efs_fsx_setup["subnet_id"]]
        security_group_ids = efs_fsx_setup["security_group_ids"]
        kmeans = KMeans(
            role=role,
            instance_count=INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=K,
            sagemaker_session=sagemaker_session,
            subnets=subnets,
            security_group_ids=security_group_ids,
        )

        file_system_fsx_id = efs_fsx_setup["file_system_fsx_id"]
        records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        job_name = unique_name_from_base("kmeans-fsx")
        kmeans.fit(records, job_name=job_name)
        model_path, _ = kmeans.model_data.rsplit("/", 1)
        assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
Example 2
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges, objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
Example 3
def kmeans_estimator(sagemaker_session):
    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))
    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    return kmeans
def test_record_set(sagemaker_session, cpu_instance_type):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
    )
    record_set = kmeans.record_set(datasets.one_p_mnist()[0][:100],
                                   encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client("s3")
    head = s3_client.head_object(Bucket=parsed_url.netloc,
                                 Key=parsed_url.path.lstrip("/"))
    assert head["ServerSideEncryption"] == "AES256"
Example 5
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session,
                           cpu_instance_type):
    subnets = [efs_fsx_setup.subnet_id]
    security_group_ids = efs_fsx_setup.security_group_ids
    role = efs_fsx_setup.role_name
    kmeans = KMeans(
        role=role,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=cpu_instance_type,
        k=K,
        sagemaker_session=sagemaker_session,
        subnets=subnets,
        security_group_ids=security_group_ids,
    )

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner = HyperparameterTuner(
            estimator=kmeans,
            objective_metric_name=OBJECTIVE_METRIC_NAME,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type="Minimize",
            max_jobs=MAX_JOBS,
            max_parallel_jobs=MAX_PARALLEL_JOBS,
        )

        file_system_fsx_id = efs_fsx_setup.file_system_fsx_id
        train_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        test_records = FileSystemRecordSet(
            file_system_id=file_system_fsx_id,
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
            channel="test",
        )

        job_name = unique_name_from_base("tune-kmeans-fsx")
        tuner.fit([train_records, test_records], job_name=job_name)
        tuner.wait()
        best_training_job = tuner.best_training_job()
        assert best_training_job
Example 6
def test_record_set(sagemaker_session):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
    with gzip.open(data_path, 'rb') as file_object:
        train_set, _, _ = pickle.load(file_object, **pickle_args)
    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session)
    record_set = kmeans.record_set(train_set[0][:100], encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client('s3')
    head = s3_client.head_object(Bucket=parsed_url.netloc,
                                 Key=parsed_url.path.lstrip('/'))
    assert head['ServerSideEncryption'] == 'AES256'
Example 7
def test_record_set(sagemaker_session):
    """Test the method ``AmazonAlgorithmEstimatorBase.record_set``.

    In particular, test that the objects uploaded to the S3 bucket are encrypted.
    """
    data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    with gzip.open(data_path, "rb") as file_object:
        train_set, _, _ = pickle.load(file_object, **pickle_args)
    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type="ml.c4.xlarge",
        k=10,
        sagemaker_session=sagemaker_session,
    )
    record_set = kmeans.record_set(train_set[0][:100], encrypt=True)
    parsed_url = urlparse(record_set.s3_data)
    s3_client = sagemaker_session.boto_session.client("s3")
    head = s3_client.head_object(Bucket=parsed_url.netloc, Key=parsed_url.path.lstrip("/"))
    assert head["ServerSideEncryption"] == "AES256"
def test_tuning_step(sfn_client, record_set_for_hyperparameter_tuning,
                     sagemaker_role_arn, sfn_role_arn):
    job_name = generate_job_name()

    kmeans = KMeans(role=sagemaker_role_arn,
                    instance_count=1,
                    instance_type=INSTANCE_TYPE,
                    k=10)

    hyperparameter_ranges = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    tuner = HyperparameterTuner(
        estimator=kmeans,
        objective_metric_name="test:msd",
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type="Minimize",
        max_jobs=2,
        max_parallel_jobs=2,
    )

    # Build workflow definition
    tuning_step = TuningStep('Tuning',
                             tuner=tuner,
                             job_name=job_name,
                             data=record_set_for_hyperparameter_tuning)
    tuning_step.add_retry(SAGEMAKER_RETRY_STRATEGY)
    workflow_graph = Chain([tuning_step])

    with timeout(minutes=DEFAULT_TIMEOUT_MINUTES):
        # Create workflow and check definition
        workflow = create_workflow_and_check_definition(
            workflow_graph=workflow_graph,
            workflow_name=unique_name_from_base(
                "integ-test-tuning-step-workflow"),
            sfn_client=sfn_client,
            sfn_role_arn=sfn_role_arn)

        # Execute workflow
        execution = workflow.execute()
        execution_output = execution.get_output(wait=True)

        # Check workflow output
        assert execution_output.get(
            "HyperParameterTuningJobStatus") == "Completed"

        # Cleanup
        state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
Example 9
def kmeans_estimator(sagemaker_session, cpu_instance_type):
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )
    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterators = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    return kmeans
def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(train_set[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"],
        )
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                    train_instance_type='ml.c4.xlarge', k=10, sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(path=transform_input_path,
                                                           key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(kmeans, transform_input)

    attached_transformer = Transformer.attach(transformer.latest_transform_job.name,
                                              sagemaker_session=sagemaker_session)
    attached_transformer.wait()
Example 12
        "http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = pickle.load(f, encoding="latin1")
    f.close()
    return train_set, valid_set, test_set


if __name__ == "__main__":
    # get MNIST dataset
    train_set, valid_set, test_set = get_mnist_dataset()

    # create model using built-in k-means algorithm
    kmeans = KMeans(
        role=ROLE,
        train_instance_count=1,
        #train_instance_type='local',
        train_instance_type='ml.c4.4xlarge',
        output_path=OUTPUT_PATH,
        k=10)
    # train model
    kmeans.fit(kmeans.record_set(train_set[0]))

    # deploy model to endpoint
    kmeans_predictor = kmeans.deploy(initial_instance_count=2,
                                     instance_type='ml.m4.xlarge',
                                     endpoint_name=ENDPOINT_NAME)
    # test model
    input_set = test_set

    clustered_data = [[] for i in range(0, 10)]
    for i in range(0, len(input_set[0])):
Example 13
def test_async_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]),
                   wait=False,
                   job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
def test_kmeans():

    with timeout(minutes=15):
        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(
            region_name=REGION))
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]))

    endpoint_name = name_from_base('kmeans')
    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=20):
        model = KMeansModel(kmeans.model_data,
                            role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1,
                                 'ml.c4.xlarge',
                                 endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example 15
def test_attach_transform_kmeans(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(
                        sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base('test-kmeans-attach')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
Example 16
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=35):
        estimator = KMeans.attach(training_job_name=training_job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role='SageMakerRole',
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1,
                                 'ml.c4.xlarge',
                                 endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example 17
# In[ ]:


# from time import gmtime, strftime
# job_name = 'KMeans-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) 
# print("Training job", job_name)


# In[ ]:


from sagemaker import KMeans

kmeans = KMeans(role=role,
                train_instance_count=2,
                train_instance_type='ml.c4.8xlarge',
                output_path="s3://2018-10-08-batch-test",
                k=10,
                data_location=trainURL)


# Use the high-level SDK

# In[ ]:


get_ipython().run_cell_magic('time', '', '\nkmeans.fit(kmeans.record_set(train_set[0]))')


# In[ ]:

Example 18
def process(ticker, local_data_folder, bucket, role, prefix,
            sagemaker_session):
    df = pd.read_pickle('{}/{}.{}'.format(local_data_folder, ticker, 'pkl'))
    df.dropna(inplace=True)
    df.drop(columns=["Date"], inplace=True)
    df.loc[df.Label >= threshold, 'direction'] = BUY
    df.loc[df.Label <= -threshold, 'direction'] = SELL
    df.loc[(df.Label < threshold) & (df.Label > -threshold),
           'direction'] = NONE

    # Normalize
    scaler = MinMaxScaler()

    Y_df = pd.DataFrame(df["Label"]).astype('float64')
    X_df = df.drop(columns=["Label"]).astype('float64')

    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)

    X[:, X.shape[1] - 1] = X_df["direction"].to_numpy()

    #### split data
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.33,
                                                        random_state=1,
                                                        shuffle=True)

    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)

    # Remove direction column and train
    kmeans.fit(
        kmeans.record_set(x_train[:,
                                  0:x_train.shape[1] - 1].astype('float32')))

    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                     instance_type="ml.m4.xlarge")

    create_dir('{}/s3/{}'.format(local_data_folder, ticker))
    '''
        Label = Change in price(+ve, -ve, none)
        Direction = BUY, SELL, NONE
        Cluster = cluster_0, cluster_1, cluster_2
    '''
    # train data
    y_train_df = pd.DataFrame(y_train, columns=["Label"])
    x_train_df = pd.DataFrame(
        x_train,
        columns=['col-{}'.format(i)
                 for i in range(x_train.shape[1] - 1)] + ["direction"])
    dataset_with_cluster = pd.concat([y_train_df.astype("float32"), x_train_df.astype("float32"),\
            clustering(x_train_df.drop(columns=["direction"]).astype('float32').values, kmeans_predictor)
        ], axis=1)
    dataset_with_cluster.to_csv('{}/s3/{}/all-train.csv'.format(
        local_data_folder, ticker),
                                header=True,
                                index=False)

    # test data
    y_test_df = pd.DataFrame(y_test, columns=["Label"])
    x_test_df = pd.DataFrame(
        x_test,
        columns=['col-{}'.format(i)
                 for i in range(x_test.shape[1] - 1)] + ['direction'])
    pd.concat([y_test_df.astype("float32"), x_test_df.astype("float32")], axis=1)\
        .to_csv('{}/s3/{}/all-test.csv'.format(local_data_folder, ticker), header=True, index=False)

    # clean clustering end point
    kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)

    all_test_pred = pd.read_csv("{}/s3/{}/all-test.csv".format(
        local_data_folder, ticker)).dropna()
    all_train_pred = pd.read_csv("{}/s3/{}/all-train.csv".format(
        local_data_folder, ticker)).dropna()

    cluster0_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       0].drop(columns=["Cluster"])
    save_data(cluster0_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster0_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction"]).astype('float32').values)
    all_train_pred["cluster0_pred"] = estimator.predict(
        all_train_pred.drop(columns=["Label", "direction", "Cluster"]).astype(
            'float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    cluster1_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       1].drop(columns=["Cluster"])
    save_data(cluster1_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster1_pred"] = estimator.predict(
        all_test_pred.drop(columns=["Label", "direction", "cluster0_pred"
                                    ]).astype('float32').values)
    all_train_pred["cluster1_pred"] = estimator.predict(
        all_train_pred.drop(
            columns=["Label", "direction", "Cluster", "cluster0_pred"]).astype(
                'float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    cluster2_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       2].drop(columns=["Cluster"])
    save_data(cluster2_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster2_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction", "cluster0_pred", "cluster1_pred"
                     ]).astype('float32').values)
    all_train_pred["cluster2_pred"] = estimator.predict(
        all_train_pred.drop(columns=[
            "Label", "direction", "Cluster", "cluster0_pred", "cluster1_pred"
        ]).astype('float32').values)
    estimator.delete_endpoint(estimator.endpoint)

    os.remove(local_data_folder + '/s3/' + ticker + '/train.csv')
    os.remove(local_data_folder + '/s3/' + ticker + '/validation.csv')

    all_buys = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == BUY].shape[0],
            cluster1_df[cluster1_df['direction'] == BUY].shape[0],
            cluster2_df[cluster2_df['direction'] == BUY].shape[0]
        ],
        columns=["BUY"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    all_sells = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == SELL].shape[0],
            cluster1_df[cluster1_df['direction'] == SELL].shape[0],
            cluster2_df[cluster2_df['direction'] == SELL].shape[0]
        ],
        columns=["SELL"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    all_nones = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == NONE].shape[0],
            cluster1_df[cluster1_df['direction'] == NONE].shape[0],
            cluster2_df[cluster2_df['direction'] == NONE].shape[0]
        ],
        columns=["NONE"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    cluster_selection_df = pd.concat([all_buys, all_sells, all_nones], axis=1)

    cluster_selection_index = cluster_selection_df.index
    buy_cluster_name = cluster_selection_index[
        cluster_selection_df['BUY'].values.argmax()]
    sell_cluster_name = cluster_selection_index[cluster_selection_df.drop(
        index=[buy_cluster_name])['SELL'].values.argmax()]
    none_cluster_name = cluster_selection_index[cluster_selection_df.drop(
        index=[buy_cluster_name, sell_cluster_name])['NONE'].values.argmax()]

    # Generate selected-cluster column based on max(cluster0, cluster1, cluster2)
    all_test_pred["selected-cluster"] = all_test_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)
    all_train_pred["selected-cluster"] = all_train_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)

    # convert selected-cluster to BUY, SELL, NONE
    all_test_pred.loc[all_test_pred["selected-cluster"] == buy_cluster_name,
                      "prediction"] = BUY
    all_test_pred.loc[all_test_pred["selected-cluster"] == sell_cluster_name,
                      "prediction"] = SELL
    all_test_pred.loc[all_test_pred["selected-cluster"] == none_cluster_name,
                      "prediction"] = NONE

    all_train_pred.loc[all_train_pred["selected-cluster"] == buy_cluster_name,
                       "prediction"] = BUY
    all_train_pred.loc[all_train_pred["selected-cluster"] == sell_cluster_name,
                       "prediction"] = SELL
    all_train_pred.loc[all_train_pred["selected-cluster"] == none_cluster_name,
                       "prediction"] = NONE

    # Benchmark results
    all_test_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_test_pred.shape[0])
    ]
    all_train_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_train_pred.shape[0])
    ]

    all_test_pred.to_csv('{}/s3/{}/all-test-pred.csv'.format(
        local_data_folder, ticker),
                         index=None)
    all_train_pred.to_csv('{}/s3/{}/all-train-pred.csv'.format(
        local_data_folder, ticker),
                          index=None)
    cluster_selection_df.to_csv('{}/s3/{}/cluster-selection.csv'.format(
        local_data_folder, ticker),
                                index=None)

    # remove NA
    all_test_pred = all_test_pred.dropna()
    all_train_pred = all_train_pred.dropna()

    # test accuracy
    test_accuracy = accuracy_score(all_test_pred["direction"],
                                   all_test_pred["prediction"],
                                   normalize=True)
    benchmark_test_accuracy = accuracy_score(
        all_test_pred["direction"],
        all_test_pred["random-prediction"],
        normalize=True)
    print('Test accuracy:', test_accuracy, ", Benchmark:",
          benchmark_test_accuracy)

    # train accuracy
    train_accuracy = accuracy_score(all_train_pred["direction"],
                                    all_train_pred["prediction"],
                                    normalize=True)
    benchmark_train_accuracy = accuracy_score(
        all_train_pred["direction"],
        all_train_pred["random-prediction"],
        normalize=True)
    print('Train accuracy:', train_accuracy, ", Benchmark:",
          benchmark_train_accuracy)

    accuracy_df = pd.DataFrame([
        ticker, test_accuracy, benchmark_test_accuracy, train_accuracy,
        benchmark_train_accuracy
    ]).T
    accuracy_df.columns = [
        "ticker", "test_accuracy", "benchmark_test_accuracy", "train_accuracy",
        "benchmark_train_accuracy"
    ]

    accuracy_file = "{}/accuracy.csv".format(local_data_folder)
    header = not os.path.exists(accuracy_file)
    accuracy_df.to_csv(accuracy_file, mode="a", header=header, index=False)
Example 19
def cluster_helper(role, sagemaker_session, bucket, local_data_folder, prefix, ticker):
  A_df = pd.read_pickle(local_data_folder + ticker + '.pkl')
  A_df.dropna(inplace=True)
  A_df.drop(columns=["Date"], inplace=True)

  # Normalize
  scaler = MinMaxScaler()

  Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
  X_df = A_df.drop(columns=["Label"]).astype('float64')

  X = scaler.fit_transform(X_df)
  Y = scaler.fit_transform(Y_df)

  # split data
  print("Splitting data")
  x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.33, random_state=1, shuffle=True)

  # clustering
  s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
  print("Clustering")
  kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type="ml.m4.xlarge",
                output_path=s3_output_folder,
                k=3)

  kmeans.fit(kmeans.record_set(pd.DataFrame(x_train).astype('float32').values))

  # deploy
  print("Deploying model", kmeans.model_data)
  kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")


  create_dir('{}s3/{}'.format(local_data_folder, ticker))

  # upload train and test data to S3
  dataset_with_cluster = pd.concat([pd.DataFrame(y_train, columns=["label"]).astype("float32"), \
            pd.DataFrame(x_train).astype("float32"),\
            clustering(x_train, kmeans_predictor)
            ], axis=1)
  dataset_with_cluster.to_csv('{}s3/{}/all-train.csv'.format(local_data_folder, ticker), header=False, index=False)
  # prepare cluster data sets    
  create_dir('{}s3/{}/train'.format(local_data_folder, ticker))
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/train/cluster-0".format(ticker), True, local_data_folder)
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/train/cluster-1".format(ticker), True, local_data_folder)
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/train/cluster-2".format(ticker), True, local_data_folder)

  # We have to predict the clusters for each of the test data sets so that we can use them for testing our next model
  dataset_with_cluster = pd.concat([pd.DataFrame(y_test, columns=["label"]).astype("float32"), \
            pd.DataFrame(x_test).astype("float32"),\
            clustering(x_test, kmeans_predictor)
            ], axis=1)
  dataset_with_cluster.to_csv(local_data_folder + 's3/{}/all-test.csv'.format(ticker), header=False, index=False)
  # # prepare cluster data sets    
#   create_dir('{}s3/{}/test'.format(local_data_folder, ticker))
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/test/cluster-0".format(ticker), False, local_data_folder)
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/test/cluster-1".format(ticker), False, local_data_folder)
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/test/cluster-2".format(ticker), False, local_data_folder)

  # delete endpoint
  kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)

  print('Completed clustering for', ticker)
Example 20
buf.seek(0)

boto3.resource('s3').Bucket(bucket).Object(data_key).upload_fileobj(buf)


## 3.3.2
from sagemaker import KMeans

data_location = 's3://{}/kmeans_highlevel_example/data'.format(bucket)
output_location = 's3://{}/kmeans_highlevel_example/output'.format(bucket)

print('training data will be uploaded to: {}'.format(data_location))
print('training artifacts will be uploaded to: {}'.format(output_location))

kmeans = KMeans(role=role, train_instance_count=2,
                train_instance_type='ml.c4.8xlarge',
                output_path=output_location,
                k=10,data_location=data_location)


%%time
kmeans.fit(kmeans.record_set(train_set[0]))

3.4.1

Deploying the model involves 3 steps:

High level - takes care of all of these via the `deploy` method
Low level - provides a corresponding method for each step
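
As a rough sketch of the two paths (not part of the original notes), using the `kmeans` estimator and `role` from the cells above; it assumes SageMaker Python SDK v2 for `sagemaker.image_uris.retrieve`, and the model, endpoint-config, and endpoint names are placeholders:

import boto3
import sagemaker

# High level: a single call creates the model, the endpoint configuration, and the endpoint.
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.t2.medium')

# Low level: the same three steps spelled out with the boto3 SageMaker client.
sm = boto3.client('sagemaker')
region = boto3.Session().region_name
image_uri = sagemaker.image_uris.retrieve('kmeans', region)  # built-in k-means container image

sm.create_model(ModelName='kmeans-model',                    # placeholder names throughout
                ExecutionRoleArn=role,
                PrimaryContainer={'Image': image_uri,
                                  'ModelDataUrl': kmeans.model_data})  # model.tar.gz from fit()
sm.create_endpoint_config(EndpointConfigName='kmeans-endpoint-config',
                          ProductionVariants=[{'VariantName': 'AllTraffic',
                                               'ModelName': 'kmeans-model',
                                               'InitialInstanceCount': 1,
                                               'InstanceType': 'ml.t2.medium'}])
sm.create_endpoint(EndpointName='kmeans-endpoint',
                   EndpointConfigName='kmeans-endpoint-config')
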
def test_kmeans_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        kmeans = KMeans(
            role=ROLE,
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

        training_config = _build_airflow_workflow(
            estimator=kmeans, instance_type=cpu_instance_type, inputs=records)

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]
            ["S3Uri"],
        )
Example 22
def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans")

    with timeout(minutes=5):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]),
                   wait=False,
                   job_name=job_name)

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=job_name,
                                  sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
Example 23
def test_kmeans(sagemaker_session, cpu_instance_type, training_set):
    job_name = unique_name_from_base("kmeans")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans = KMeans(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(training_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(training_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None
        predictor.delete_model()
        with pytest.raises(Exception) as exception:
            sagemaker_session.sagemaker_client.describe_model(
                ModelName=model.name)
        assert "Could not find model" in str(exception.value)
Example 24
def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
    data_path = os.path.join(DATA_DIR, "one_p_mnist")
    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    tags = [{"Key": "some-tag", "Value": "value-for-tag"}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
    with gzip.open(train_set_path, "rb") as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    estimator = Estimator.attach(training_job_name=job_name,
                                 sagemaker_session=sagemaker_session)
    estimator._enable_network_isolation = True

    transform_input_path = os.path.join(data_path, "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = estimator.transformer(1, cpu_instance_type, tags=tags)
    transformer.transform(transform_input, content_type="text/csv")

    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        assert model_desc["EnableNetworkIsolation"]

        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc["ModelArn"])["Tags"]
        assert tags == model_tags
Example 25
def test_tuning_kmeans(sagemaker_session):
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {
            'encoding': 'latin1'
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='tk',
                        output_path='s3://{}/'.format(
                            sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {
            'extra_center_factor': IntegerParameter(1, 10),
            'mini_batch_size': IntegerParameter(10, 100),
            'epochs': IntegerParameter(1, 2),
            'init_method': CategoricalParameter(['kmeans++', 'random'])
        }
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(
            estimator=kmeans,
            objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type='Minimize',
            max_jobs=2,
            max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' +
              tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job,
                                             sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
Example 26
def test_transform_byo_estimator(sagemaker_session):
    data_path = os.path.join(DATA_DIR, 'one_p_mnist')
    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
    tags = [{'Key': 'some-tag', 'Value': 'value-for-tag'}]

    # Load the data into memory as numpy arrays
    train_set_path = os.path.join(data_path, 'mnist.pkl.gz')
    with gzip.open(train_set_path, 'rb') as f:
        train_set, _, _ = pickle.load(f, **pickle_args)

    kmeans = KMeans(role='SageMakerRole',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    k=10,
                    sagemaker_session=sagemaker_session,
                    output_path='s3://{}/'.format(
                        sagemaker_session.default_bucket()))

    # set kmeans specific hp
    kmeans.init_method = 'random'
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = 'kmeans++'
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(train_set[0][:100])

    job_name = unique_name_from_base('test-kmeans-attach')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(data_path, 'transform_input.csv')
    transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    estimator = Estimator.attach(training_job_name=job_name,
                                 sagemaker_session=sagemaker_session)

    transformer = estimator.transformer(1, 'ml.m4.xlarge', tags=tags)
    transformer.transform(transform_input, content_type='text/csv')

    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        transformer.wait()
        model_desc = sagemaker_session.sagemaker_client.describe_model(
            ModelName=transformer.model_name)
        model_tags = sagemaker_session.sagemaker_client.list_tags(
            ResourceArn=model_desc['ModelArn'])['Tags']
        assert tags == model_tags
def show_digit(img, caption=''):
    # Function header and reshape are truncated in this snippet; reconstructed here (28x28 MNIST images assumed).
    imgr = img.reshape((28, 28))
    _, subplot = plt.subplots(1, 1)
    subplot.axis('off')
    subplot.imshow(imgr, cmap='gray')
    plt.title(caption)
    
show_digit(train_set[0][30], 'This is a {}'.format(train_set[1][30])) 


#4
from sagemaker import KMeans 
data_location = 's3://{}/kmeans_highlevel_example/data'.format(bucket) 
output_location = 's3://{}/kmeans_highlevel_example/output'.format(bucket) 
print('training data will be uploaded to: {}'.format(data_location)) 
print('training artifacts will be uploaded to: {}'.format(output_location)) 
kmeans = KMeans(role=role, # the IAM role used to read and write the training results
                train_instance_count=2, # number of instances to use for model training
                train_instance_type='ml.c4.8xlarge', # instance type to use for model training
                output_path=output_location, # location where the training results are stored
                k=10, # number of clusters to create; set to 10 since this is a 0-9 digit classification problem
                data_location=data_location) # Amazon S3 location where the converted training data is uploaded


#5
%%time 
kmeans.fit(kmeans.record_set(train_set[0]))  


#6
%%time 
kmeans_predictor = kmeans.deploy(initial_instance_count=1, 
                                instance_type='ml.t2.medium') 

#7
Example 28
def gtext2vec(text):
    return text2vec(model, text)


news_df['vectors'] = news_df.words.progress_apply(gtext2vec)

## Clustering and generating scatter
X = np.concatenate(news_df['vectors'].values)

## run sagemaker kmeans
role = get_execution_role()
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)
kmeans.fit(kmeans.record_set(X))

## deploy sagemaker kmeans endpoint
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type="ml.t2.medium")
news_df['cluster'] = kmeans_predictor.predict(X)

## Save News
news_df = news_df.drop(["ori_text", "words"], axis=1)
news_df.to_pickle('news_df.pkl')

## Save Model
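# A minimal sketch of one way to do this (not from the original snippet): download the
# model artifact that SageMaker wrote to S3 during fit(); the local filename is a placeholder.
from urllib.parse import urlparse
import boto3

parsed = urlparse(kmeans.model_data)  # e.g. s3://<bucket>/news_kmeans/.../model.tar.gz
boto3.client('s3').download_file(parsed.netloc, parsed.path.lstrip('/'), 'kmeans_model.tar.gz')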
# <img src='notebook_ims/elbow_graph.png' width=50% />
# 
# A distance elbow can be seen around 8, where the within-cluster distance starts to decrease at a slower rate. This indicates that there is enough separation to distinguish the data points in each cluster, but also that you included enough clusters so that the data points aren’t *extremely* far away from each cluster.
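
# As a rough local illustration of how such an elbow curve could be produced (this sketch is
# not from the original notebook; it assumes scikit-learn is installed and uses the
# counties_transformed dataframe from earlier cells):
from sklearn.cluster import KMeans as SKKMeans  # aliased to avoid clashing with sagemaker.KMeans
import matplotlib.pyplot as plt

elbow_features = counties_transformed.values.astype('float32')
k_values = range(2, 13)
# inertia_ is the within-cluster sum of squared distances for each fitted model
inertias = [SKKMeans(n_clusters=k, random_state=0).fit(elbow_features).inertia_ for k in k_values]

plt.plot(list(k_values), inertias, marker='o')
plt.xlabel('k (number of clusters)')
plt.ylabel('within-cluster sum of squared distances')
plt.show()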

# In[40]:


# define a KMeans estimator
# Solution
from sagemaker import KMeans

NUM_CLUSTERS = 8

kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type='ml.c4.xlarge',
                output_path=output_path, # using the same output path as was defined, earlier              
                k=NUM_CLUSTERS)


# ### EXERCISE: Create formatted, k-means training data
# 
# Just as before, you should convert the `counties_transformed` df into a numpy array and then into a RecordSet. This is the required format for passing training data into a `KMeans` model.

# In[41]:


# convert the transformed dataframe into record_set data
#Solution
kmeans_train_data_np = counties_transformed.values.astype('float32')
kmeans_formatted_data = kmeans.record_set(kmeans_train_data_np)
Example 30
def test_kmeans(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("kmeans")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {
            "encoding": "latin1"
        }

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        kmeans.init_method = "random"
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = "kmeans++"
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1
        kmeans.eval_metrics = ["ssd", "msd"]

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            eval_metrics=json.dumps(kmeans.eval_metrics),
            force_dense="True",
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = KMeansModel(kmeans.model_data,
                            role="SageMakerRole",
                            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None

    predictor.delete_model()
    with pytest.raises(Exception) as exception:
        sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
    assert "Could not find model" in str(exception.value)
def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
    kmeans = KMeans(
        role="SageMakerRole",
        instance_count=1,
        instance_type=cpu_instance_type,
        k=10,
        sagemaker_session=sagemaker_session,
        output_path="s3://{}/".format(sagemaker_session.default_bucket()),
    )

    # set kmeans specific hp
    kmeans.init_method = "random"
    kmeans.max_iterations = 1
    kmeans.tol = 1
    kmeans.num_trials = 1
    kmeans.local_init_method = "kmeans++"
    kmeans.half_life_time_size = 1
    kmeans.epochs = 1

    records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

    job_name = unique_name_from_base("test-kmeans-attach")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        kmeans.fit(records, job_name=job_name)

    transform_input_path = os.path.join(DATA_DIR, "one_p_mnist",
                                        "transform_input.csv")
    transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
    transform_input = kmeans.sagemaker_session.upload_data(
        path=transform_input_path, key_prefix=transform_input_key_prefix)

    transformer = _create_transformer_and_transform_job(
        kmeans, transform_input, cpu_instance_type)

    attached_transformer = Transformer.attach(
        transformer.latest_transform_job.name,
        sagemaker_session=sagemaker_session)
    with timeout_and_delete_model_with_transformer(
            transformer,
            sagemaker_session,
            minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES):
        attached_transformer.wait()
Example 32

train_data = counties_transformed.values.astype('float32')


# First, we call and define the hyperparameters of our KMeans model as we have done with our PCA model. The Kmeans algorithm allows the user to specify how many clusters to identify. In this instance, let's try to find the top 7 clusters from our dataset.

# In[33]:


from sagemaker import KMeans

num_clusters = 7
kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type='ml.c4.xlarge',
                output_path='s3://'+ bucket_name +'/counties/',              
                k=num_clusters)


# Then we train the model on our training data.

# In[34]:


get_ipython().run_cell_magic('time', '', 'kmeans.fit(kmeans.record_set(train_data))')


# Now we deploy the model and we can pass in the original training set to get the labels for each entry. This will give us which cluster each county belongs to.

# In[35]:
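
# A minimal sketch of the step described above (the original cell body is not included in
# this snippet); the instance type and the clean-up call are assumptions:
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.t2.medium')

# Pass the original training set through the endpoint; each record's label holds the
# closest cluster for that county.
cluster_info = kmeans_predictor.predict(train_data)
cluster_labels = [r.label['closest_cluster'].float32_tensor.values[0] for r in cluster_info]

# Delete the endpoint once the labels have been collected.
kmeans_predictor.delete_endpoint()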
Example 33
def test_async_kmeans(sagemaker_session):
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        assert kmeans.hyperparameters() == dict(
            init_method=kmeans.init_method,
            local_lloyd_max_iter=str(kmeans.max_iterations),
            local_lloyd_tol=str(kmeans.tol),
            local_lloyd_num_trials=str(kmeans.num_trials),
            local_lloyd_init_method=kmeans.local_init_method,
            half_life_time_size=str(kmeans.half_life_time_size),
            epochs=str(kmeans.epochs),
            extra_center_factor=str(kmeans.center_factor),
            k=str(kmeans.k),
            force_dense='True',
        )

        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = KMeansModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["closest_cluster"] is not None
            assert record.label["distance_to_cluster"] is not None