示例#1
0
# plt.xlabel("Fare")
# plt.title("Histogram of Fares")
# plt.show()

# df = df[(df['fare_amount'] >= 0) & (df['fare_amount'] <= 100)]

# df['passenger_count'].hist(bins=6, ec='black')
# plt.xlabel("Passenger Count")
# plt.title("Histogram of Passenger Count")
# plt.show()

# df.loc[df['passenger_count']==0, 'passenger_count'] = 1

# df.plot.scatter('pickup_longitude', 'pickup_latitude')
# plt.show()

# for long in ['pickup_longitude', 'dropoff_longitude']:
#     df = df[(df[long] > nyc_min_longitude) & (df[long] < nyc_max_longitude)]

# for lat in ['pickup_latitude', 'dropoff_latitude']:
#     df = df[(df[lat] > nyc_min_latitude) & (df[lat] < nyc_max_latitude)]



# NOTE: this call is disabled because it sat ahead of the import of
# `preprocess` / `feature_engineer` and ahead of the `pd.read_csv` call that
# first creates `df`, so executing it raised NameError. The same two steps are
# applied to the freshly loaded frame right after the CSV is read below.
# df = feature_engineer(preprocess(df))

# df['distance'] = euc_distance(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude'])

# df.plot.scatter('fare_amount', 'distance')
# plt.show()
from utils import preprocess, feature_engineer
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization
from sklearn.metrics import mean_squared_error

# Load the first 500,000 rows of the taxi data (parsing the pickup timestamp
# column as datetimes) and run them through the project's preprocessing and
# feature-engineering helpers.
df = feature_engineer(
    preprocess(
        pd.read_csv('NYC_taxi.csv', parse_dates=['pickup_datetime'], nrows=500000)
    )
)

# Keep an unscaled copy of the engineered frame before the features are scaled.
df_prescaled = df.copy()

# Scale every column except the target. The scaled values are wrapped back
# into a DataFrame that reuses the original feature names and row index, and
# the untouched target column is appended as the last column.
df_scaled = scale(df.drop(columns=['fare_amount']))
cols = df.columns.tolist()
cols.remove('fare_amount')
df_scaled = pd.concat(
    [pd.DataFrame(df_scaled, columns=cols, index=df.index), df['fare_amount']],
    axis=1,
)
df = df_scaled.copy()

# Separate the feature matrix from the target and hold out 20% of the rows
# as the test set.
feature_mask = df.columns != 'fare_amount'
X = df.loc[:, feature_mask]
y = df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)