Example #1
0
def main():
    # read data for train
    train = pd.read_csv('data_for_model/new_with_price_per_sqm/training_data.csv')
    LB = LabelBinarizer()
    train['town'] = LB.fit_transform(train['town']) 
    train['flat_model'] = LB.fit_transform(train['flat_model'])
    labels = train.iloc[:,20:].values
    total_price = train.iloc[:,19:20].values
    features = train.iloc[:,:19].values
    floor_area = np.asarray(train['floor_area_sqm'].values).reshape(len(labels),1)

    # preprocess training data
    X_train = preprocessing_X(features)
    scaler_y_train, y_train = preprocessing_Y(labels)

    # read in test data
    test = pd.read_csv('data_for_model/new_with_price_per_sqm/test_data.csv')
    test['town'] = LB.fit_transform(test['town']) 
    test['flat_model'] = LB.fit_transform(test['flat_model'])
    labels_test = test.iloc[:,20:].values
    total_price_test = test.iloc[:,19:20].values
    features_test = test.iloc[:,:19].values
    floor_area_test = np.asarray(test['floor_area_sqm'].values).reshape(len(labels_test),1)

    # preprocess test data
    X_test = preprocessing_X(features_test)
    scaler_y_test, y_test = preprocessing_Y(labels_test)

    # fine_tune
    # fine_tune(X_train, y_train, scaler_y, floor_area, total_price)

    # train on all training data with best hyper-params
    #model = build_model()
    #result = model.fit(X_train, y_train, epochs=300, batch_size = int(len(X_train)/256), verbose=1, shuffle=False)

    # train on all training data with best hyper-params with pdp
    model = KerasRegressor(build_model, epochs=300, batch_size = int(len(X_train)/256), verbose=1, shuffle=False)
    model._estimator_type = "regressor" # Cheap workaround for keras NN to work with plot_partial_dependence
    model.dummy_ = "dummy" # Cheap workaround for keras NN to work with plot_partial_dependence

    model.fit(X_train, y_train)

    print('Computing partial dependence plots...')
    tic = time()
    pdp_features = [10] # remaining_lease, dist_nearest_mrt, dist_nearest_supermarkets, dist_nearest_sports_facilities
    display = plot_partial_dependence(estimator=model, X=X_train, features=pdp_features,
        kind='both', subsample=500, random_state=0, verbose=10)
    print(f"done in {time() - tic:.3f}s")

    display.figure_.suptitle(
    'Plot'
    )
    display.figure_.subplots_adjust(hspace=0.3)
    plt.show()

    # get score for validation
    #score = get_score(scaler_y_train.inverse_transform(model.predict(X_train)) * floor_area, total_price)
    #print('score on validation = {}'.format(score))

    # predict y values for test data
    #val_res = scaler_y_test.inverse_transform(model.predict(X_test))

    # get performance score on test data
    #score = get_score(val_res * floor_area_test, total_price_test)
    # score = get_score(val_res, total_price_test)
    #print('score on test = {}'.format(score))

    '''