Example #1
# Write the listings table back to the database, now including the predictions.

listings = pd.read_sql_query("select * from listings", con, index_col="id")

listings["preds"] = model.predict(X_full)
listings["diff"] = listings["preds"] - listings["price"]

listings.to_sql("listings", con=engine, if_exists="replace")

#~~~~~~~~~~~~
#NEURAL NETS:
#~~~~~~~~~~~~
from keras.models import Sequential
from keras.layers import Dense, Dropout

model = Sequential()
model.add(Dense(512, activation="relu", input_dim=X_tr.shape[1]))
model.add(Dropout(0.5))
#model.add(Dense(256, activation = "relu"))
#model.add(Dropout(0.5))
model.add(Dense(1))

model.compile(loss="mse", optimizer="adam")
hist = model.fit(X_tr,
                 y_tr,
                 validation_data=(X_val, y_val),
                 epochs=20,
                 batch_size=128,
                 verbose=0)

pd.DataFrame(hist.history).plot()
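
# A minimal sketch (not part of the original snippet) of how the fitted network
# might be checked on the held-out split; it assumes the `model`, `X_val` and
# `y_val` objects created earlier in this example.
import numpy as np

val_preds = model.predict(X_val).ravel()          # flatten the (n, 1) output
val_rmse = np.sqrt(np.mean((val_preds - np.asarray(y_val)) ** 2))
print("validation RMSE:", val_rmse)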
Example #2
def analyze_space_change(request):
    spacechange_metric_data = pandas.read_csv('spacechange_metric_' +
                                              time.strftime("%d_%m_%Y") +
                                              '.csv')
    #spacechange_metric_data = pandas.read_csv('spacechange_metric_18_04_2019.csv')
    # Keep rows with positive space growth (DIFF_KB > 0, which also excludes
    # zero) and non-zero session activity
    spacechange_metric_data = spacechange_metric_data[
        spacechange_metric_data['DIFF_KB'] > 0]
    spacechange_metric_data = spacechange_metric_data[
        spacechange_metric_data['Average Active Sessions'] != 0]

    #print(spacechange_metric_data['begin_time'])

    capicity_change = spacechange_metric_data['DIFF_KB']
    spacechange_metric_data.drop(['DIFF_KB', 'begin_time'],
                                 axis=1,
                                 inplace=True)

    # Principal component analysis with PCA (the transformed features are
    # computed here but not used further below)
    from sklearn.decomposition import PCA
    pca = PCA(n_components=3)
    new_features = pca.fit_transform(spacechange_metric_data)
    #print(new_features)
    #
    # print(spacechange_metric_data)
    # print(capicity_change)

    x = spacechange_metric_data
    x_train_lasso, x_test_lasso, y_train_lasso, y_test_lasso = train_test_split(
        x, capicity_change, train_size=0.7, random_state=1)
    ss = StandardScaler()
    x_train_lasso = ss.fit_transform(x_train_lasso)
    x_test_lasso = ss.transform(x_test_lasso)

    # Note: the same scaler object is re-fitted on the target below, which is
    # only safe because x_test_lasso has already been transformed above.
    y_train_lasso = ss.fit_transform(y_train_lasso.to_frame())
    #y_test_lasso = ss.transform(y_test_lasso.to_frame())

    # LASSO prediction
    # model = Lasso()
    # alpha = np.logspace(-3,2,10)
    # np.set_printoptions(suppress=True)
    # # print('alpha:{}'.format(alpha))
    # #print(x_train.T)
    # lasso_model = GridSearchCV(model,param_grid={'alpha': alpha}, cv=5)
    # lasso_model.fit(x_train_lasso,y_train_lasso)
    # y_hat_lasso = lasso_model.predict(x_test_lasso)

    # Features selected after FA and K-Means processing, top 20:
    # ['Current Logons Count', 'Cell Physical IO Interconnect Bytes', 'Total PGA Allocated',
    # 'Physical Read Bytes Per Sec', 'Total PGA Used by SQL Workareas', 'Temp Space Used',
    # 'Physical Write Bytes Per Sec', 'Physical Write Total Bytes Per Sec', 'Cursor Cache Hit Ratio',
    # 'Redo Generated Per Sec', 'Redo Generated Per Txn', 'Logical Reads Per Sec', 'Rows Per Sort',
    # 'Physical Read Total Bytes Per Sec', 'Consistent Read Gets Per Txn', 'Network Traffic Volume Per Sec',
    # 'Physical Reads Direct Per Sec', 'DB Block Changes Per Sec', 'Logical Reads Per User Call',
    # 'Response Time Per Txn']

    # Top 30:
    # ['Total Sorts Per User Call', 'Cell Physical IO Interconnect Bytes', 'Total PGA Allocated',
    #  'Physical Read Bytes Per Sec', 'Temp Space Used', 'Total PGA Used by SQL Workareas',
    #  'Physical Write Bytes Per Sec', 'Physical Write Total Bytes Per Sec', 'Cursor Cache Hit Ratio',
    #  'Redo Generated Per Sec', 'Redo Generated Per Txn', 'Consistent Read Gets Per Sec', 'Rows Per Sort',
    #  'Physical Read Total Bytes Per Sec', 'Logical Reads Per Txn', 'Network Traffic Volume Per Sec',
    #  'Physical Reads Direct Per Sec', 'Logical Reads Per User Call', 'DB Block Changes Per Sec',
    #  'Logical Reads Per Sec', 'Database Time Per Sec', 'Physical Reads Per Sec', 'Unnamed: 0',
    #  'Physical Read Total IO Requests Per Sec', 'DB Block Changes Per Txn', 'Open Cursors Per Sec',
    #  'Consistent Read Gets Per Txn', 'Response Time Per Txn', 'Physical Reads Per Txn', 'Host CPU Utilization (%)']

    # 35 features:
    # ['User Rollbacks Percentage', 'Cell Physical IO Interconnect Bytes', 'Total PGA Allocated',
    #  'Physical Read Total Bytes Per Sec', 'Temp Space Used', 'Total PGA Used by SQL Workareas',
    #  'Physical Write Total Bytes Per Sec', 'Physical Write Bytes Per Sec', 'Cursor Cache Hit Ratio',
    #  'Redo Generated Per Sec', 'Redo Generated Per Txn', 'Consistent Read Gets Per Sec', 'Rows Per Sort',
    #  'Logical Reads Per Txn', 'Physical Read Bytes Per Sec', 'Network Traffic Volume Per Sec',
    #  'Physical Reads Direct Per Sec', 'Logical Reads Per User Call', 'DB Block Gets Per Sec',
    #  'Logical Reads Per Sec', 'DB Block Changes Per Txn', 'Physical Reads Per Sec', 'Response Time Per Txn',
    #  'Physical Read Total IO Requests Per Sec', 'Unnamed: 0', 'Open Cursors Per Sec', 'Database Time Per Sec',
    #  'Physical Reads Per Txn', 'Consistent Read Gets Per Txn', 'Host CPU Utilization (%)',
    #  'Enqueue Requests Per Sec', 'DB Block Changes Per Sec', 'Total Index Scans Per Txn',
    #  'Executions Per User Call', 'Physical Writes Per Sec']

    # Top 60:
    # ['Active Serial Sessions', 'Cell Physical IO Interconnect Bytes', 'Total PGA Allocated',
    #  'Physical Read Total Bytes Per Sec', 'Total PGA Used by SQL Workareas', 'Temp Space Used',
    #  'Physical Write Total Bytes Per Sec', 'Physical Write Bytes Per Sec', 'Cursor Cache Hit Ratio',
    #  'Redo Generated Per Sec', 'Redo Generated Per Txn', 'Consistent Read Gets Per Sec', 'Rows Per Sort',
    #  'Physical Read Bytes Per Sec', 'Consistent Read Gets Per Txn', 'Network Traffic Volume Per Sec',
    #  'Physical Reads Per Sec', 'DB Block Gets Per Sec', 'Logical Reads Per User Call', 'Physical Reads Direct Per Sec',
    #  'DB Block Gets Per Txn', 'I/O Requests per Second', 'Logical Reads Per Sec', 'Response Time Per Txn',
    #  'Open Cursors Per Sec', 'Unnamed: 0', 'Database Time Per Sec', 'Physical Reads Per Txn', 'Logical Reads Per Txn',
    #  'Host CPU Utilization (%)', 'Recursive Calls Per Sec', 'Txns Per Logon', 'Executions Per Txn',
    #  'Physical Writes Per Sec', 'Physical Reads Direct Per Txn', 'Total Index Scans Per Sec',
    #  'Total Index Scans Per Txn', 'DB Block Gets Per User Call', 'Physical Read IO Requests Per Sec',
    #  'Enqueue Requests Per Sec', 'DB Block Changes Per Sec', 'Full Index Scans Per Txn', 'Current Open Cursors Count',
    #  'Total Table Scans Per Txn', 'Database Wait Time Ratio', 'DB Block Changes Per Txn', 'User Calls Ratio',
    #  'User Commits Per Sec']

    # 40 features

    from sklearn.metrics import explained_variance_score, \
        mean_absolute_error, mean_squared_error, \
        median_absolute_error, r2_score

    fa_k_spacechange_metric_data = spacechange_metric_data[[
        'Current Logons Count', 'Cell Physical IO Interconnect Bytes',
        'Total PGA Allocated', 'Physical Read Bytes Per Sec',
        'Total PGA Used by SQL Workareas', 'Temp Space Used',
        'Physical Write Bytes Per Sec', 'Physical Write Total Bytes Per Sec',
        'Cursor Cache Hit Ratio', 'Redo Generated Per Sec',
        'Redo Generated Per Txn', 'Logical Reads Per Sec', 'Rows Per Sort',
        'Physical Read Total Bytes Per Sec', 'Consistent Read Gets Per Txn',
        'Network Traffic Volume Per Sec', 'Physical Reads Direct Per Sec',
        'DB Block Changes Per Sec', 'Logical Reads Per User Call',
        'Response Time Per Txn'
    ]]
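
    # Illustrative sketch (an assumption added for clarity, not the project's own
    # selection code) of how a "FA + K-Means" shortlist like the hand-picked one
    # above could be produced: factor-analyse the metrics, cluster the absolute
    # factor loadings of each column, and keep one representative column per cluster.
    from sklearn.decomposition import FactorAnalysis
    from sklearn.cluster import KMeans
    fa = FactorAnalysis(n_components=10, random_state=1)
    fa.fit(spacechange_metric_data)
    loadings = np.abs(fa.components_).T  # shape: (n_features, n_components)
    km = KMeans(n_clusters=20, random_state=1, n_init=10).fit(loadings)
    fa_kmeans_candidates = []
    for cluster_id in range(km.n_clusters):
        members = np.where(km.labels_ == cluster_id)[0]
        # pick the column whose loadings lie closest to the cluster centre
        dists = np.linalg.norm(loadings[members] - km.cluster_centers_[cluster_id], axis=1)
        fa_kmeans_candidates.append(spacechange_metric_data.columns[members[dists.argmin()]])
    # fa_kmeans_candidates is only illustrative; the hand-picked list above is what
    # the rest of this view uses.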

    x = fa_k_spacechange_metric_data
    x_train_ridge, x_test_ridge, y_train_ridge, y_test_ridge = train_test_split(
        x, capicity_change, train_size=0.7, random_state=1)

    print('x_test_ridge', x_test_ridge)

    ss = MinMaxScaler(feature_range=(-1, 1))
    x_train_ridge = ss.fit_transform(x_train_ridge)
    x_test_ridge = ss.transform(x_test_ridge)

    y_train_ridge = ss.fit_transform(y_train_ridge.to_frame())
    #y_test = ss.transform(y_test.to_frame())

    model = Ridge()
    alpha = np.logspace(-2, 2, 10)
    np.set_printoptions(suppress=True)
    # print('alpha:{}'.format(alpha))
    # print(x_train.T)
    ridge_model = GridSearchCV(model, param_grid={'alpha': alpha}, cv=5)
    ridge_model.fit(x_train_ridge, y_train_ridge)
    y_hat_ridge = ridge_model.predict(x_test_ridge)
    rescaled_y_pred_ridge = ss.inverse_transform(y_hat_ridge.reshape(-1, 1))
    #print(y_hat_ridge)

    # Gaussian process prediction
    # from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared
    # from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
    # kernel = RBF(10, (1e-2, 1e2))
    # from sklearn.kernel_ridge import KernelRidge
    # # Use Gaussian regression to predict the data
    # # param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3],
    # #               "kernel": [ExpSineSquared(l, p)
    # #                          for l in np.logspace(-2, 2, 10)
    # #                          for p in np.logspace(0, 2, 10)]}
    # # kr = GridSearchCV(KernelRidge(), cv=5, param_grid=param_grid)
    # gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1)) \
    #             + WhiteKernel(1e-1)
    # gpr = GaussianProcessRegressor(kernel=kernel,random_state=0).fit(x_train,y_train)
    # y_hat_guassian = gpr.predict(x_test,return_std=False)

    # Prediction with an RNN (LSTM)
    from keras.models import Sequential
    from keras.layers import LSTM, Dense

    x_train_rnn, x_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
        spacechange_metric_data,
        capicity_change,
        train_size=0.7,
        random_state=1)
    ss = MinMaxScaler(feature_range=(-1, 1))
    x_train_rnn = ss.fit_transform(x_train_rnn)
    x_test_rnn = ss.transform(x_test_rnn)

    y_train_rnn = ss.fit_transform(y_train_rnn.to_frame())
    #y_test_rnn = ss.transform(y_test_rnn.to_frame())

    # Build the model: reshape the inputs to (samples, timesteps=1, features) for the LSTM
    x_train_rnn = x_train_rnn.reshape(x_train_rnn.shape[0], 1,
                                      x_train_rnn.shape[1])
    x_test_rnn = x_test_rnn.reshape(x_test_rnn.shape[0], 1,
                                    x_test_rnn.shape[1])
    model = Sequential()
    # Stateful LSTM trained one sample at a time: batch_input_shape is
    # (batch=1, timesteps=1, n_features)
    model.add(LSTM(4, batch_input_shape=(1, 1, x_train_rnn.shape[2]), stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    for i in range(50):
        print('Completed iteration {} of {}'.format(i + 1, 50))
        model.fit(x_train_rnn,
                  y_train_rnn,
                  epochs=1,
                  batch_size=1,
                  verbose=0,
                  shuffle=False)
        # With stateful=True the cell state carries over between epochs unless
        # reset_states() is called here.
        #model.reset_states()

    # Run once over all the training samples to build up the cell state
    y_pred_rnn = model.predict(x_test_rnn, batch_size=1)
    rescaled_y_pred = ss.inverse_transform(y_pred_rnn.reshape(-1, 1))

    # Regression with AdaBoost

    x_train_ada, x_test_ada, y_train_ada, y_test_ada = train_test_split(
        spacechange_metric_data,
        capicity_change,
        train_size=0.7,
        random_state=1)
    # ss = StandardScaler()
    # x_train_ada = ss.fit_transform(x_train_ada)
    # x_test_ada = ss.transform(x_test_ada)

    n_estimators = 1000
    # tuned_parameters = {"base_estimator__criterion": ["mse","friedman_mse"],
    #                     "base_estimator__min_samples_split": [2, 10,20,25],
    #                     "base_estimator__max_depth": [None, 2,10,20,25,30],
    #                     "base_estimator__min_samples_leaf": [1, 5, 10,20,25],
    #                     "base_estimator__max_leaf_nodes": [None, 5, 10,20,25],
    #                     }
    tuned_parameters = {
        "base_estimator__criterion": ["mse", "friedman_mse"],
        "base_estimator__max_depth": [None, 2, 10],
        "base_estimator__min_samples_leaf": [1, 5]
    }

    # Weak base regressor (decision tree)
    dt_stump = DecisionTreeRegressor(max_depth=10)
    # AdaBoost regressor
    ada = AdaBoostRegressor(base_estimator=dt_stump,
                            n_estimators=n_estimators,
                            random_state=1,
                            learning_rate=0.001)
    # grid_search_ada = GridSearchCV(ada, param_grid=tuned_parameters, cv=10)
    # grid_search_ada.fit(x_train_ada, y_train_ada)
    # print(grid_search_ada.best_params_)

    # for params, mean_score, scores in grid_search_ada.grid_scores_:
    #     print("%0.3f (+/-%0.03f) for %r"
    #           % (mean_score, scores.std() * 2, params))

    ada.fit(x_train_ada, y_train_ada)
    y_hat_adaboost = ada.predict(x_test_ada)
    # print(y_hat_adaboost)
    # print(y_test_ada)
    # final_model = pk.dumps(lasso_model)
    # f = open('lasso.txt','wb')
    # f.write(final_model)
    # f.close()
    # #print(x_train)
    # print('Hyperparameters:\n', lasso_model.best_params_)
    # LASSO prediction error
    # print(lasso_model.score(x_test_lasso, y_test_lasso))
    # mse = np.average((y_hat_lasso - np.array(y_test_lasso)) ** 2)  # Mean Squared Error
    # rmse = np.sqrt(mse)  # Root Mean Squared Error
    # print('Lasso with original features', mse, rmse)

    # Ridge + FA + K-Means prediction error
    # (the Ridge model was fitted on the scaled target, so score() compares its
    # scaled predictions against the unscaled y_test_ridge)
    print(ridge_model.score(x_test_ridge, y_test_ridge))
    print('R^2 of the Ridge regression model:', r2_score(y_test_ridge, rescaled_y_pred_ridge))
    #print('Mean absolute error of the Ridge regression model:', mean_absolute_error(y_test_ridge, rescaled_y_pred_ridge))
    mse = np.average((rescaled_y_pred_ridge.ravel() -
                      np.array(y_test_ridge)) ** 2)  # Mean Squared Error
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    print('Ridge with FA and K-means features', mse, rmse)

    # Gaussian model performance
    # print(gpr.score(x_test,y_test))
    # mse  = np.average((y_hat_guassian-np.array(y_test)) ** 2)
    # rmse = np.sqrt(mse)  # Root Mean Squared Error
    # print('gaussian', mse, rmse)

    # AdaBoost model performance
    print('R^2 of the AdaBoost regression model:', r2_score(y_test_ada, y_hat_adaboost))
    #print('Mean absolute error of the AdaBoost regression model:', mean_absolute_error(y_test_ada, y_hat_adaboost))
    mse = np.average((y_hat_adaboost - np.array(y_test_ada))**2)
    rmse = np.sqrt(mse)  # Root Mean Squared Error
    #print('AdaBoost:', mse, rmse)

    # RNN prediction error
    print('R^2 of the RNN (LSTM) model:', r2_score(y_test_rnn, rescaled_y_pred))

    #print('Mean absolute error of the RNN (LSTM) model:', mean_absolute_error(y_test_rnn, rescaled_y_pred))

    x_ind = np.arange(len(x_test_ridge))
    #print('y_test:{}'.format(y_test.shape()))
    #print('y_hat: {}'.format(y_hat.shape()))
    # import matplotlib as mpl
    # t = np.arange(len(x_test))
    # mpl.rcParams['font.sans-serif'] = [u'simHei']
    # mpl.rcParams['axes.unicode_minus'] = False
    # plt.figure(facecolor='w')
    # plt.plot(t, y_test, 'r-', linewidth=2, label=u'Actual data')
    # plt.plot(t, y_hat, 'g-', linewidth=2, label=u'Predicted data')
    # plt.title(u'Linear regression prediction', fontsize=18)
    # plt.legend(loc='upper left')
    # plt.grid(b=True, ls=':')
    # plt.show()

    template = loader.get_template(
        './node_modules/gentelella/production/spacechange_trend.html')
    REMOTE_HOST = '/static/assets/js'

    # Note: despite their names, line_lasso holds the LSTM chart and
    # line_gaussian the AdaBoost chart; the names match the template's context keys.
    line_lasso = Line("Space capacity change prediction - LSTM")
    line_lasso.add("Actual data", x_ind, y_test_rnn, is_smooth=True)
    line_lasso.add("Predicted data", x_ind, rescaled_y_pred, is_smooth=True)

    line_ridge = Line("Space capacity change prediction - Ridge")
    line_ridge.add("Actual data", x_ind, y_test_ridge, is_smooth=True)
    line_ridge.add("Predicted data", x_ind, rescaled_y_pred_ridge, is_smooth=True)

    line_gaussian = Line("Space capacity change prediction - AdaBoost")
    line_gaussian.add("Actual data", x_ind, y_test_ada, is_smooth=True)
    line_gaussian.add("Predicted data", x_ind, y_hat_adaboost, is_smooth=True)

    context = dict(y_predict=y_hat_adaboost,
                   y_test=y_test_ada,
                   trend_line_lasso=line_lasso.render_embed(),
                   trend_line_ridge=line_ridge.render_embed(),
                   trend_line_gaussian=line_gaussian.render_embed(),
                   host=REMOTE_HOST,
                   script_list=line_lasso.get_js_dependencies())
    return HttpResponse(template.render(context, request))