# plt.xlabel("Fare")
# plt.title("Histogram of Fares")
# plt.show()
# df = df[(df['fare_amount'] >= 0) & (df['fare_amount'] <= 100)]
# df['passenger_count'].hist(bins=6, ec='black')
# plt.xlabel("Passenger Count")
# plt.title("Histogram of Passenger Count")
# plt.show()
# df.loc[df['passenger_count']==0, 'passenger_count'] = 1
# df.plot.scatter('pickup_longitude', 'pickup_latitude')
# plt.show()
# for long in ['pickup_longitude', 'dropoff_longitude']:
#     df = df[(df2[long] > nyc_min_longitude) & (df2[long] < nyc_max_longitude)]
# for lat in ['pickup_latitude', 'dropoff_latitude']:
#     df = df[(df2[lat] > nyc_min_latitude) & (df2[lat] < nyc_max_latitude)]
# df = feature_engineer(preprocess(df))
# df['distance'] = euc_distance(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude'])
# df.plot.scatter('fare_amount', 'distance')
# plt.show()
import numpy as np
import pandas as pd
from keras.layers import BatchNormalization, Dense
from keras.models import Sequential
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

from utils import feature_engineer, preprocess

# Load the first 500,000 rows of the taxi CSV, parsing 'pickup_datetime'
# as datetimes while reading.
df = pd.read_csv('NYC_taxi.csv', parse_dates=['pickup_datetime'], nrows=500000)

# Run the project's preprocessing helper, then its feature-engineering
# helper, on the loaded frame (both are imported from utils).
df = feature_engineer(preprocess(df))

# Keep a copy of the frame as it was before any scaling is applied.
df_prescaled = df.copy()

# Scale every column except the target 'fare_amount'. The output of
# scale() is wrapped back into a DataFrame so the feature column names
# and the original index are retained.
feature_frame = df.drop(columns=['fare_amount'])
cols = feature_frame.columns.tolist()
scaled_values = scale(feature_frame)
scaled_features = pd.DataFrame(scaled_values, columns=cols, index=df.index)

# Re-attach the unscaled target as the last column and continue with a
# copy of the combined frame.
df_scaled = pd.concat([scaled_features, df['fare_amount']], axis=1)
df = df_scaled.copy()

# Separate the features from the target, then hold out 20% of the rows
# for testing. No random_state is passed, so the split differs between
# runs.
X = df.drop(columns=['fare_amount'])
y = df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)