Example #1
import matplotlib.pyplot as plt
from sklearn import cross_validation, ensemble

import script.predict.preprocess as pp

# plot_next_place and get_max_move_size are helper functions assumed to be
# defined elsewhere in this module.


def main():
    # df = data.read_visited_key_points('Fri', grouped=True, extra=['category'])
    # categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # df = df[df['category'].isin(categories)].sort_values('Timestamp')
    #
    # prev = df[df['Timestamp'] <= '2014-06-06 12'].groupby('group_id').last()
    # next = df[df['Timestamp'] > '2014-06-06 12'].groupby('group_id').first()

    categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    x, y, prev, ids = pp.get_bag_data(['Fri'], 11, categories, return_prev=True, return_ids=True)
    # discard the day data because we only have 1 day
    ids = ids['group_id'].values
    # clamp x values to 1 or 0
    x = (x > 0).astype('int64')

    x_train, x_test, y_train, y_test, prev_train, prev_test, ids_train, ids_test = (
        cross_validation.train_test_split(x, y, prev, ids, train_size=0.25, random_state=2294967295)
    )

    print('Predicting')
    predictor = ensemble.RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    predictor.fit(x_train, y_train)
    y_pred = predictor.predict(x_test)
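    # Added sketch (hypothetical addition, not in the original): report held-out
    # accuracy before plotting, as a quick sanity check on the classifier.
    from sklearn import metrics
    print('Accuracy: {:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))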

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    axs = [ax1, ax2]

    sizes = [get_max_move_size(prev_test, y, ids_test) for y in [y_test, y_pred]]
    max_size = max(sizes)

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])

    axs[1].set_title('Predicted')
    plot_next_place(prev_test, y_pred, ids_test, ax=axs[1])

    fig1.savefig('actual.png', bbox_inches='tight')
    fig2.savefig('predicted.png', bbox_inches='tight')

    plt.show()
Example #2
import matplotlib
matplotlib.use('Qt4Agg')
import data
import pandas as pd
import matplotlib.pyplot as plt
import script.predict.preprocess as pp
import script.predict.predictors as pdt
from sklearn.cross_validation import train_test_split
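# Note (added): the sklearn.cross_validation module was deprecated in
# scikit-learn 0.18 and removed in 0.20; on a current install the equivalent is
# from sklearn.model_selection import train_test_split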

training_size = 0.25
x, y = pp.get_bag_data(['Sat'], 12, pp.common_categories)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=training_size, random_state=2294967295)

im = data.read_image(size=1000)
for predictor_name in pdt.all_predictors:
    predictor = pdt.all_predictors[predictor_name]
    predictor.fit(x_train, y_train)
    y_pred = predictor.predict(x_test)
    y_pred_probs = predictor.predict_proba(x_test)

    kp = data.read_key_points().set_index('place_id')
    # Adjust the size of this so that it's proportional to the testing data
    kp['Training Counts'] = ((1 - training_size) / training_size) * pd.Series(y_train).value_counts()
    kp['Test Counts'] = pd.Series(y_test).value_counts()
    kp['Prediction Counts'] = pd.Series(y_pred).value_counts()
    kp['Prediction Probability Sum'] = pd.DataFrame(y_pred_probs, columns=predictor.classes_).sum()
    kp.fillna(0, inplace=True)

    # fig, axs = plt.subplots(2, 2)
    fig, axs = plt.subplots(1, 3)
    fig.suptitle(predictor_name)
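    # Added sketch (hypothetical; the original loop body is truncated here): one
    # plausible way to fill the three axes created above, drawing each count
    # column as a bar chart per place. The original most likely drew the counts
    # over the park map image `im` instead.
    for ax, col in zip(axs, ['Test Counts', 'Prediction Counts',
                             'Prediction Probability Sum']):
        kp[col].plot.bar(ax=ax)
        ax.set_title(col)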
Example #3
def main():
    # df = data.read_visited_key_points('Fri', grouped=True, extra=['category'])
    # categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # df = df[df['category'].isin(categories)].sort_values('Timestamp')
    #
    # prev = df[df['Timestamp'] <= '2014-06-06 12'].groupby('group_id').last()
    # next = df[df['Timestamp'] > '2014-06-06 12'].groupby('group_id').first()

    categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    x, y, prev, ids = pp.get_bag_data(['Fri'], 14, categories, return_prev=True, return_ids=True)
    # discard the day data because we only have 1 day
    ids = ids['group_id'].values
    # clamp x values to 1 or 0
    x = (x > 0).astype('int64')

    x_train, x_test, y_train, y_test, prev_train, prev_test, ids_train, ids_test = (
        cross_validation.train_test_split(x, y, prev, ids, train_size=0.25, random_state=2294967295)
    )

    print('Predicting')

    all_predictors = {
        # 'Decision Tree':
        #     tree.DecisionTreeClassifier(),
        # 'Gradient Boosting':
        #     ensemble.GradientBoostingClassifier(n_estimators=33, learning_rate=1.0, random_state=0),
        'Random Forest':
            ensemble.RandomForestClassifier(max_depth=2),
        'Adaboost':
            ensemble.AdaBoostClassifier(random_state=0),
        'MultinomialNB':
            naive_bayes.MultinomialNB(),
        # 'GaussianNB': gnb_predict,
        'BernoulliNB':
            naive_bayes.BernoulliNB(),
        # 'KNN':
        #     neighbors.KNeighborsClassifier(n_neighbors=10),
        # 'Random':
        #     dummy.DummyClassifier(strategy='stratified'),
        'Most Frequent':
            dummy.DummyClassifier(strategy='most_frequent'),
        'Uniform':
            dummy.DummyClassifier(strategy='uniform'),
    }
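    # Added sketch (hypothetical, not part of the original script): compare every
    # classifier in all_predictors on the held-out split before settling on a
    # single model below.
    from sklearn import metrics
    for name, clf in all_predictors.items():
        clf.fit(x_train, y_train)
        print('{}: accuracy {:.3f}'.format(
            name, metrics.accuracy_score(y_test, clf.predict(x_test))))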

    # predictor = ensemble.RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    # predictor = ensemble.AdaBoostClassifier(random_state=0)
    # predictor = naive_bayes.MultinomialNB()
    # predictor = naive_bayes.BernoulliNB()
    # predictor = dummy.DummyClassifier(strategy='most_frequent')
    predictor = neighbors.KNeighborsClassifier(n_neighbors=10)

    predictor.fit(x_train, y_train)
    y_pred = predictor.predict(x_test)

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    axs = [ax1, ax2]

    sizes = [get_max_move_size(prev_test, y, ids_test) for y in [y_test, y_pred]]
    max_size = max(sizes)

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])

    axs[1].set_title('RF Predicted')
    plot_next_place(prev_test, y_pred, ids_test, ax=axs[1])

    # fig1.savefig('actual.png', tight=True)
    # fig2.savefig('RF predicted.png', tight=True)

    plt.show()
Example #4
all_predictors = {
    # ... earlier entries truncated in this excerpt ...
    'Random':
    dummy.DummyClassifier(strategy='stratified'),
    'Most Frequent':
    dummy.DummyClassifier(strategy='most_frequent'),
    'Uniform':
    dummy.DummyClassifier(strategy='uniform'),
    'RNN':
    RNNclassifier,
    # 'DNN':
    #     DNNclassifier
}

categories = [
    'Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
    'Shows & Entertainment', 'Shopping'
]
x, y = pp.get_bag_data(['Sat'], 16, categories=categories)
x = (x > 0).astype('int64')
kp = data.read_key_points().set_index('place_id')
kp = kp[kp['category'].isin(categories)]
kp['category'] = kp['category'].astype('category')
y = kp.loc[y, 'category'].cat.codes.values
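# Note (added): the lines above remap each place_id target in y to its category
# code, so the classifiers predict one of the five categories rather than an
# individual attraction.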

x_train, x_validate, y_train, y_validate = cross_validation.train_test_split(
    x, y, train_size=0.90, random_state=294967295)

# y_train_cats = kp.loc[y_train, 'category'].cat.codes.values

scorings = ['accuracy', 'log_loss']
names = []
scores = {}
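# Added sketch (hypothetical continuation): score every predictor with
# cross-validation using the scoring names above. Assumes the dict defined
# earlier in this example is named all_predictors and that each entry follows
# the scikit-learn estimator interface ('log_loss' additionally requires
# predict_proba).
for name, clf in all_predictors.items():
    names.append(name)
    scores[name] = {
        scoring: cross_validation.cross_val_score(
            clf, x_train, y_train, scoring=scoring, cv=5).mean()
        for scoring in scorings
    }
    print(name, scores[name])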
Example #5
def main():
    # df = data.read_visited_key_points('Fri', grouped=True, extra=['category'])
    # categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # df = df[df['category'].isin(categories)].sort_values('Timestamp')
    #
    # prev = df[df['Timestamp'] <= '2014-06-06 12'].groupby('group_id').last()
    # next = df[df['Timestamp'] > '2014-06-06 12'].groupby('group_id').first()

    categories = [
        'Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
        'Shows & Entertainment', 'Shopping'
    ]
    x, y, prev, ids = pp.get_bag_data(['Fri'],
                                      14,
                                      categories,
                                      return_prev=True,
                                      return_ids=True)
    # discard the day data because we only have 1 day
    ids = ids['group_id'].values
    # clamp x values to 1 or 0
    x = (x > 0).astype('int64')

    x_train, x_test, y_train, y_test, prev_train, prev_test, ids_train, ids_test = (
        cross_validation.train_test_split(x,
                                          y,
                                          prev,
                                          ids,
                                          train_size=0.25,
                                          random_state=2294967295))

    print('Predicting')

    all_predictors = {
        # 'Decision Tree':
        #     tree.DecisionTreeClassifier(),
        # 'Gradient Boosting':
        #     ensemble.GradientBoostingClassifier(n_estimators=33, learning_rate=1.0, random_state=0),
        'Random Forest': ensemble.RandomForestClassifier(max_depth=2),
        'Adaboost': ensemble.AdaBoostClassifier(random_state=0),
        'MultinomialNB': naive_bayes.MultinomialNB(),
        # 'GaussianNB': gnb_predict,
        'BernoulliNB': naive_bayes.BernoulliNB(),
        # 'KNN':
        #     neighbors.KNeighborsClassifier(n_neighbors=10),
        # 'Random':
        #     dummy.DummyClassifier(strategy='stratified'),
        'Most Frequent': dummy.DummyClassifier(strategy='most_frequent'),
        'Uniform': dummy.DummyClassifier(strategy='uniform'),
    }

    # predictor = ensemble.RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    # predictor = ensemble.AdaBoostClassifier(random_state=0)
    # predictor = naive_bayes.MultinomialNB()
    # predictor = naive_bayes.BernoulliNB()
    # predictor = dummy.DummyClassifier(strategy='most_frequent')
    predictor = neighbors.KNeighborsClassifier(n_neighbors=10)

    predictor.fit(x_train, y_train)
    y_pred = predictor.predict(x_test)

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    axs = [ax1, ax2]

    sizes = [
        get_max_move_size(prev_test, y, ids_test) for y in [y_test, y_pred]
    ]
    max_size = max(sizes)

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])

    axs[1].set_title('RF Predicted')
    plot_next_place(prev_test, y_pred, ids_test, ax=axs[1])

    # fig1.savefig('actual.png', tight=True)
    # fig2.savefig('RF predicted.png', tight=True)

    plt.show()
Example #6
def main():
    # df = data.read_visited_key_points('Fri', grouped=True, extra=['category'])
    # categories = ['Thrill Rides', 'Kiddie Rides', 'Rides for Everyone', 'Shows & Entertainment', 'Shopping']
    # df = df[df['category'].isin(categories)].sort_values('Timestamp')
    #
    # prev = df[df['Timestamp'] <= '2014-06-06 12'].groupby('group_id').last()
    # next = df[df['Timestamp'] > '2014-06-06 12'].groupby('group_id').first()

    categories = [
        'Thrill Rides', 'Kiddie Rides', 'Rides for Everyone',
        'Shows & Entertainment', 'Shopping'
    ]
    x, y, prev, ids = pp.get_bag_data(['Fri'],
                                      14,
                                      categories,
                                      return_prev=True,
                                      return_ids=True)
    # discard the day data because we only have 1 day
    ids = ids['group_id'].values
    # clamp x values to 1 or 0
    x = (x > 0).astype('int64')

    x_train, x_test, y_train, y_test, prev_train, prev_test, ids_train, ids_test = (
        cross_validation.train_test_split(x,
                                          y,
                                          prev,
                                          ids,
                                          train_size=0.25,
                                          random_state=2294967295))

    print('Predicting')

    #################################random forest##################################
    predictor = ensemble.RandomForestClassifier(n_estimators=100,
                                                max_depth=2,
                                                random_state=0)
    predictor.fit(x_train, y_train)
    y_pred1 = predictor.predict(x_test)

    ################################RNN##################################
    # Build model: a single direction GRU with a single layer
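    # Note (added): EMBEDDING_SIZE and input_op_fn are assumed to be defined
    # elsewhere in the original script; tf.contrib.learn's
    # TensorFlowRNNClassifier calls input_op_fn to turn each batch into the
    # list of per-step tensors the RNN cell expects.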
    classifier = learn.TensorFlowRNNClassifier(rnn_size=EMBEDDING_SIZE,
                                               n_classes=82,
                                               cell_type='gru',
                                               input_op_fn=input_op_fn,
                                               num_layers=1,
                                               bidirectional=False,
                                               sequence_length=None,
                                               steps=1000,
                                               optimizer='Adam',
                                               learning_rate=0.01,
                                               continue_training=True)

    # print(x_train)
    # print(y_train)
    # Train and predict
    classifier.fit(x_train, y_train, steps=1000)
    y_pred2 = classifier.predict(x_test)
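    # Added sketch (hypothetical addition): compare the two models numerically
    # before plotting; assumes both predict() calls return plain label arrays.
    from sklearn import metrics
    print('RF accuracy:  {:.3f}'.format(metrics.accuracy_score(y_test, y_pred1)))
    print('RNN accuracy: {:.3f}'.format(metrics.accuracy_score(y_test, y_pred2)))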

    print('Plotting')
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()
    fig3, ax3 = plt.subplots()
    axs = [ax1, ax2, ax3]

    # sizes = [get_max_move_size(prev_test, y, ids_test) for y in [y_test, y_pred]]
    # max_size = max(sizes)

    axs[0].set_title('Actual Data')
    plot_next_place(prev_test, y_test, ids_test, ax=axs[0])

    axs[1].set_title('RNN Predicted')
    plot_next_place(prev_test, y_pred2, ids_test, ax=axs[1])

    axs[2].set_title('RF Predicted')
    plot_next_place(prev_test, y_pred1, ids_test, ax=axs[2])

    fig1.savefig('actual.png', bbox_inches='tight')
    fig2.savefig('RNN predicted.png', bbox_inches='tight')
    fig3.savefig('RF predicted.png', bbox_inches='tight')

    plt.show()