Example #1
import numpy as np
from sagemaker import KMeans, get_execution_role

## Clustering and generating scatter
X = np.concatenate(news_df['vectors'].values)

## run sagemaker kmeans
role = get_execution_role()
num_clusters = 10
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type="ml.m5.4xlarge",
    output_path="s3://" + bucket + "/news_kmeans/",
    k=num_clusters,
)
kmeans.fit(kmeans.record_set(X))

## deploy sagemaker kmeans endpoint
kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type="ml.t2.medium")
# predict() returns protobuf Records; extract the closest-cluster id from each
results = kmeans_predictor.predict(X)
news_df['cluster'] = [
    int(r.label['closest_cluster'].float32_tensor.values[0]) for r in results
]

## Save News
news_df = news_df.drop(["ori_text", "words"], axis=1)
news_df.to_pickle('news_df.pkl')

## Save Model
import pickle
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(kmeans, file)
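
# Note: pickling the estimator stores only its configuration; the trained model
# artifact itself lives under the S3 output_path (see kmeans.model_data). Once
# the predictions have been collected, the endpoint can be deleted to stop
# incurring charges. A minimal follow-up sketch:
kmeans_predictor.delete_endpoint()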
Example #2
    train_set, valid_set, test_set = get_mnist_dataset()

    # create model using built-in k-means algorithm
    kmeans = KMeans(
        role=ROLE,
        train_instance_count=1,
        #train_instance_type='local',
        train_instance_type='ml.c4.4xlarge',
        output_path=OUTPUT_PATH,
        k=10)
    # train model
    kmeans.fit(kmeans.record_set(train_set[0]))

    # deploy model to endpoint
    kmeans_predictor = kmeans.deploy(initial_instance_count=2,
                                     instance_type='ml.m4.xlarge',
                                     endpoint_name=ENDPOINT_NAME)
    # test model
    input_set = test_set

    clustered_data = [[] for i in range(0, 10)]
    for i in range(0, len(input_set[0])):
        result = kmeans_predictor.predict(input_set[0][i].reshape(1, 784))[0]
        predicted_cluster = int(
            result.label['closest_cluster'].float32_tensor.values[0])
        clustered_data[predicted_cluster].append(i)

    for i in range(0, 10):
        print("Cluster " + str(i) + "\n" + "=" * 80)
        cnt = [0 for i in range(0, 10)]
        for data in clustered_data[i]:
            # tally the true digit label of each sample assigned to this cluster
            # (completion of the truncated snippet, assuming test labels in input_set[1])
            cnt[int(input_set[1][data])] += 1
        print(cnt)
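
# For context, get_mnist_dataset() is not shown above; it presumably loads the
# classic pickled MNIST split. A minimal sketch (hypothetical helper, assuming
# the standard mnist.pkl.gz layout of (train, valid, test) tuples):
import gzip
import pickle

def get_mnist_dataset(path='mnist.pkl.gz'):
    with gzip.open(path, 'rb') as f:
        return pickle.load(f, encoding='latin1')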
Example #3
import pandas as pd
from sagemaker import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def cluster_helper(role, sagemaker_session, bucket, local_data_folder, prefix, ticker):
  A_df = pd.read_pickle(local_data_folder + ticker + '.pkl')
  A_df.dropna(inplace=True)
  A_df.drop(columns=["Date"], inplace=True)

  # Normalize
  scaler = MinMaxScaler()

  Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
  X_df = A_df.drop(columns=["Label"]).astype('float64')

  X = scaler.fit_transform(X_df)
  Y = scaler.fit_transform(Y_df)

  # split data
  print("Splitting data")
  x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.33, random_state=1, shuffle=True)

  # clustering
  s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
  print("Clustering")
  kmeans = KMeans(role=role,
                  train_instance_count=1,
                  train_instance_type="ml.m4.xlarge",
                  output_path=s3_output_folder,
                  k=3)

  kmeans.fit(kmeans.record_set(pd.DataFrame(x_train).astype('float32').values))

  # deploy
  print("Deploying model", kmeans.model_data)
  kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")


  create_dir('{}s3/{}'.format(local_data_folder, ticker))

  # upload train and test data to S3
  dataset_with_cluster = pd.concat([
      pd.DataFrame(y_train, columns=["label"]).astype("float32"),
      pd.DataFrame(x_train).astype("float32"),
      clustering(x_train, kmeans_predictor)
  ], axis=1)
  dataset_with_cluster.to_csv('{}s3/{}/all-train.csv'.format(local_data_folder, ticker), header=False, index=False)
  # prepare cluster data sets    
  create_dir('{}s3/{}/train'.format(local_data_folder, ticker))
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/train/cluster-0".format(ticker), True, local_data_folder)
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/train/cluster-1".format(ticker), True, local_data_folder)
  save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/train/cluster-2".format(ticker), True, local_data_folder)

  # Predict clusters for the test set as well, so they can be used when
  # evaluating the next model
  dataset_with_cluster = pd.concat([
      pd.DataFrame(y_test, columns=["label"]).astype("float32"),
      pd.DataFrame(x_test).astype("float32"),
      clustering(x_test, kmeans_predictor)
  ], axis=1)
  dataset_with_cluster.to_csv('{}s3/{}/all-test.csv'.format(local_data_folder, ticker), header=False, index=False)
  # # prepare cluster data sets    
#   create_dir('{}s3/{}/test'.format(local_data_folder, ticker))
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/test/cluster-0".format(ticker), False, local_data_folder)
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/test/cluster-1".format(ticker), False, local_data_folder)
#   save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/test/cluster-2".format(ticker), False, local_data_folder)

  # delete endpoint
  kmeans_predictor.delete_endpoint()

  print('Completed clustering for', ticker)
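
# The clustering() helper called above is not shown. A plausible sketch, assuming
# it sends each row to the deployed endpoint and returns the closest-cluster ids
# as a one-column DataFrame named "cat" (matching the filtering done above):
def clustering(x, predictor):
    records = predictor.predict(pd.DataFrame(x).astype('float32').values)
    return pd.DataFrame(
        [int(r.label['closest_cluster'].float32_tensor.values[0]) for r in records],
        columns=["cat"])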
Example #4
import os

import pandas as pd
from sagemaker import KMeans
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def process(ticker, local_data_folder, bucket, role, prefix,
            sagemaker_session):
    df = pd.read_pickle('{}/{}.{}'.format(local_data_folder, ticker, 'pkl'))
    df.dropna(inplace=True)
    df.drop(columns=["Date"], inplace=True)
    df.loc[df.Label >= threshold, 'direction'] = BUY
    df.loc[df.Label <= -threshold, 'direction'] = SELL
    df.loc[(df.Label < threshold) & (df.Label > -threshold),
           'direction'] = NONE

    # Normalize
    scaler = MinMaxScaler()

    Y_df = pd.DataFrame(df["Label"]).astype('float64')
    X_df = df.drop(columns=["Label"]).astype('float64')

    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)

    X[:, X.shape[1] - 1] = X_df["direction"].to_numpy()

    #### split data
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.33,
                                                        random_state=1,
                                                        shuffle=True)

    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)

    # Remove direction column and train
    kmeans.fit(
        kmeans.record_set(x_train[:, 0:x_train.shape[1] - 1].astype('float32')))

    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                     instance_type="ml.m4.xlarge")

    create_dir('{}/s3/{}'.format(local_data_folder, ticker))
    '''
        Label = Change in price(+ve, -ve, none)
        Direction = BUY, SELL, NONE
        Cluster = cluster_0, cluster_1, cluster_2
    '''
    # train data
    y_train_df = pd.DataFrame(y_train, columns=["Label"])
    x_train_df = pd.DataFrame(
        x_train,
        columns=['col-{}'.format(i)
                 for i in range(x_train.shape[1] - 1)] + ["direction"])
    dataset_with_cluster = pd.concat([
        y_train_df.astype("float32"),
        x_train_df.astype("float32"),
        clustering(
            x_train_df.drop(columns=["direction"]).astype('float32').values,
            kmeans_predictor)
    ], axis=1)
    dataset_with_cluster.to_csv('{}/s3/{}/all-train.csv'.format(
        local_data_folder, ticker),
                                header=True,
                                index=False)

    # test data
    y_test_df = pd.DataFrame(y_test, columns=["Label"])
    x_test_df = pd.DataFrame(
        x_test,
        columns=['col-{}'.format(i)
                 for i in range(x_test.shape[1] - 1)] + ['direction'])
    pd.concat([y_test_df.astype("float32"), x_test_df.astype("float32")], axis=1)\
        .to_csv('{}/s3/{}/all-test.csv'.format(local_data_folder, ticker), header=True, index=False)

    # clean clustering end point
    kmeans_predictor.delete_endpoint()

    all_test_pred = pd.read_csv("{}/s3/{}/all-test.csv".format(
        local_data_folder, ticker)).dropna()
    all_train_pred = pd.read_csv("{}/s3/{}/all-train.csv".format(
        local_data_folder, ticker)).dropna()

    cluster0_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       0].drop(columns=["Cluster"])
    save_data(cluster0_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster0_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction"]).astype('float32').values)
    all_train_pred["cluster0_pred"] = estimator.predict(
        all_train_pred.drop(columns=["Label", "direction", "Cluster"]).astype(
            'float32').values)
    estimator.delete_endpoint()

    cluster1_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       1].drop(columns=["Cluster"])
    save_data(cluster1_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster1_pred"] = estimator.predict(
        all_test_pred.drop(columns=["Label", "direction", "cluster0_pred"
                                    ]).astype('float32').values)
    all_train_pred["cluster1_pred"] = estimator.predict(
        all_train_pred.drop(
            columns=["Label", "direction", "Cluster", "cluster0_pred"]).astype(
                'float32').values)
    estimator.delete_endpoint()

    cluster2_df = dataset_with_cluster[dataset_with_cluster["Cluster"] ==
                                       2].drop(columns=["Cluster"])
    save_data(cluster2_df.drop(columns=["direction"]), ticker,
              local_data_folder)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker,
                                  bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    estimator = generate_NN_predictor(ticker, bucket, prefix, role,
                                      sagemaker_session)
    all_test_pred["cluster2_pred"] = estimator.predict(
        all_test_pred.drop(
            columns=["Label", "direction", "cluster0_pred", "cluster1_pred"
                     ]).astype('float32').values)
    all_train_pred["cluster2_pred"] = estimator.predict(
        all_train_pred.drop(columns=[
            "Label", "direction", "Cluster", "cluster0_pred", "cluster1_pred"
        ]).astype('float32').values)
    estimator.delete_endpoint()

    os.remove(local_data_folder + '/s3/' + ticker + '/train.csv')
    os.remove(local_data_folder + '/s3/' + ticker + '/validation.csv')

    all_buys = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == BUY].shape[0],
            cluster1_df[cluster1_df['direction'] == BUY].shape[0],
            cluster2_df[cluster2_df['direction'] == BUY].shape[0]
        ],
        columns=["BUY"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    all_sells = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == SELL].shape[0],
            cluster1_df[cluster1_df['direction'] == SELL].shape[0],
            cluster2_df[cluster2_df['direction'] == SELL].shape[0]
        ],
        columns=["SELL"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    all_nones = pd.DataFrame(
        [
            cluster0_df[cluster0_df['direction'] == NONE].shape[0],
            cluster1_df[cluster1_df['direction'] == NONE].shape[0],
            cluster2_df[cluster2_df['direction'] == NONE].shape[0]
        ],
        columns=["NONE"],
        index=["cluster0_pred", "cluster1_pred", "cluster2_pred"])

    cluster_selection_df = pd.concat([all_buys, all_sells, all_nones], axis=1)

    # Assign each direction to the cluster holding the most examples of it,
    # without reusing a cluster (select by row label so positions can't shift
    # after a drop)
    buy_cluster_name = cluster_selection_df['BUY'].idxmax()
    sell_cluster_name = cluster_selection_df.drop(
        index=[buy_cluster_name])['SELL'].idxmax()
    none_cluster_name = cluster_selection_df.drop(
        index=[buy_cluster_name, sell_cluster_name])['NONE'].idxmax()

    # Generate selected-cluster column based on max(cluster0, cluster1, cluster2)
    all_test_pred["selected-cluster"] = all_test_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)
    all_train_pred["selected-cluster"] = all_train_pred[[
        "cluster0_pred", "cluster1_pred", "cluster2_pred"
    ]].idxmax(axis=1)

    # convert selected-cluster to BUY, SELL, NONE
    all_test_pred.loc[all_test_pred["selected-cluster"] == buy_cluster_name,
                      "prediction"] = BUY
    all_test_pred.loc[all_test_pred["selected-cluster"] == sell_cluster_name,
                      "prediction"] = SELL
    all_test_pred.loc[all_test_pred["selected-cluster"] == none_cluster_name,
                      "prediction"] = NONE

    all_train_pred.loc[all_train_pred["selected-cluster"] == buy_cluster_name,
                       "prediction"] = BUY
    all_train_pred.loc[all_train_pred["selected-cluster"] == sell_cluster_name,
                       "prediction"] = SELL
    all_train_pred.loc[all_train_pred["selected-cluster"] == none_cluster_name,
                       "prediction"] = NONE

    # Benchmark: random predictions as a baseline
    all_test_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_test_pred.shape[0])
    ]
    all_train_pred["random-prediction"] = [
        generate_random_direction() for _ in range(all_train_pred.shape[0])
    ]

    all_test_pred.to_csv('{}/s3/{}/all-test-pred.csv'.format(
        local_data_folder, ticker), index=False)
    all_train_pred.to_csv('{}/s3/{}/all-train-pred.csv'.format(
        local_data_folder, ticker), index=False)
    cluster_selection_df.to_csv('{}/s3/{}/cluster-selection.csv'.format(
        local_data_folder, ticker), index=False)

    # remove NA
    all_test_pred = all_test_pred.dropna()
    all_train_pred = all_train_pred.dropna()

    # test accuracy
    test_accuracy = accuracy_score(all_test_pred["direction"],
                                   all_test_pred["prediction"],
                                   normalize=True)
    benchmark_test_accuracy = accuracy_score(
        all_test_pred["direction"],
        all_test_pred["random-prediction"],
        normalize=True)
    print('Test accuracy:', test_accuracy, ", Benchmark:",
          benchmark_test_accuracy)

    # train accuracy
    train_accuracy = accuracy_score(all_train_pred["direction"],
                                    all_train_pred["prediction"],
                                    normalize=True)
    benchmark_train_accuracy = accuracy_score(
        all_train_pred["direction"],
        all_train_pred["random-prediction"],
        normalize=True)
    print('Train accuracy:', train_accuracy, ", Benchmark:",
          benchmark_train_accuracy)

    accuracy_df = pd.DataFrame([
        ticker, test_accuracy, benchmark_test_accuracy, train_accuracy,
        benchmark_train_accuracy
    ]).T
    accuracy_df.columns = [
        "ticker", "test_accuracy", "benchmark_test_accuracy", "train_accuracy",
        "benchmark_train_accuracy"
    ]

    accuracy_file = "{}/accuracy.csv".format(local_data_folder)
    header = not os.path.exists(accuracy_file)
    accuracy_df.to_csv(accuracy_file, mode="a", header=header, index=False)
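
# generate_random_direction() is not shown above; presumably it draws one of the
# three direction constants uniformly at random to serve as a naive benchmark.
# A minimal sketch (hypothetical helper):
import random

def generate_random_direction():
    return random.choice([BUY, SELL, NONE])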
Example #5
                k=10,
                data_location=data_location)


#CODE-5---------------------------------------------------------------------------------------------

%%time

kmeans.fit(kmeans.record_set(train_set[0]))


#CODE-6---------------------------------------------------------------------------------------------

%%time

kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')


#CODE-7---------------------------------------------------------------------------------------------

import sagemaker
from time import gmtime, strftime

job_name = 'Batch-Transform-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
prefix = 'sagemaker/project_name'

# Initialize the transformer object
transformer = sagemaker.transformer.Transformer(
    model_name=model_name,
    instance_count=1,
    instance_type='ml.c4.xlarge')
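
# A typical continuation (a sketch; the input location below is an assumed
# placeholder, not from the original snippet): run the batch job against an
# S3 prefix and wait for it to finish.
input_location = 's3://my-bucket/' + prefix + '/batch-input'  # hypothetical path
transformer.transform(data=input_location, content_type='text/csv',
                      split_type='Line')
transformer.wait()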
Example #6
import pandas as pd
from sagemaker import KMeans

data_path = "s3://ressonance/data/model_data/"
output_path = "s3://ressonance/models/"

# portfolio clustering

# data_location must be an S3 prefix for record_set uploads, not a csv file
port_kmeans = KMeans(role=role,
                     train_instance_count=2,
                     train_instance_type="ml.c4.xlarge",
                     output_path=output_path + "portfolio",
                     k=5,
                     data_location=data_path)

port_training = pd.read_csv("data/training_data/portfolios.csv")

# record_set expects a float32 numpy array, not a DataFrame
port_kmeans.fit(port_kmeans.record_set(port_training.values.astype('float32')))
port_predictor = port_kmeans.deploy(initial_instance_count=1,
                                    instance_type="ml.m4.xlarge")

## Step 2: people
# Substituting portfolios


def sub_port(port):
    # predictors are invoked via .predict(), not called directly
    return port_predictor.predict(portfolio_processing(list(port)))


clis = None
clis_df = client_processing(clis)

clis_df.portfolio = sub_port(clis_df.portfolio)

clis_df.to_csv(key + "clients.csv")
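
# Note: predict() returns protobuf Records, not plain cluster ids. If the
# substituted column should hold the ids themselves, a sketch like this
# (same Record layout as the earlier examples) would be needed:
def sub_port_ids(port):
    records = port_predictor.predict(portfolio_processing(list(port)))
    return [int(r.label['closest_cluster'].float32_tensor.values[0])
            for r in records]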