def plot_scatter_plots():
    '''Plot appraised_value against bathrooms, bedrooms and square_feet.

    Acquires and cleans the zillow data, takes the train split and draws
    three stacked scatter plots (one per feature), each coloured by the
    feature itself with the legend placed outside the axes.
    '''
    # Take in the dataframe
    df = acquire.acquire_zillow()
    # Prepare the data
    df = prepare.clean_zillow(df)
    # Split the data set; only train is plotted
    train, _, _ = prepare.split_focused_zillow(df)

    # (feature column, panel title) for each stacked panel, top to bottom
    panels = (
        ('bathrooms',
         "Relationship between Appraised Values, and Bathrooms"),
        ('bedrooms',
         "Relationship between Appraised Values, Bedrooms"),
        ('square_feet',
         "Relationship between Appraised Values, Square Footage of Homes"),
    )

    # The style must be set before the axes are created, otherwise the
    # axes of this figure keep the previous style.
    sns.set(style="darkgrid")

    # The y-axes are deliberately NOT shared: the three features live on very
    # different scales, and a shared axis would flatten the smaller ones.
    _, axes = plt.subplots(len(panels), 1, figsize=(12, 40))

    for ax, (feature, title) in zip(axes, panels):
        ax.set_title(title, size=20, color='black')
        sns.scatterplot(data=train, x='appraised_value', y=feature,
                        hue=feature, palette='viridis', ax=ax)
        ax.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
def test_final_model():
    '''Fit the OLS model on train and print its RMSE on the test split.

    Acquires, cleans and focuses the zillow data, splits it, fits a
    LinearRegression on the train features and evaluates that same fitted
    model on the held-out test split. The model is never refit on test data,
    so the printed RMSE is an out-of-sample estimate.

    Returns nothing; the RMSE is printed.
    '''
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    # validate is not needed for the final test-set evaluation
    train, _, test = prepare.split_focused_zillow(df)

    X_train = train.drop(columns=['appraised_value'])
    y_train = train['appraised_value']
    X_test = test.drop(columns=['appraised_value'])
    y_test = test['appraised_value']

    # set up the model
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2; drop or replace it when the project upgrades.
    lm = LinearRegression(normalize=True)
    # fit the model on train only
    lm.fit(X_train, y_train)

    # evaluate the train-fitted model on the untouched test split
    test_predictions = lm.predict(X_test)
    rmse_test_lm = mean_squared_error(y_test, test_predictions)**(1 / 2)

    print("RMSE for OLS using LinearRegression \nTest Data: ", rmse_test_lm)
def train_pairplot():
    '''Draws a seaborn pairplot of the train split, coloured by appraised_value'''
    # Acquire the raw data and clean it in one step
    cleaned = prepare.clean_zillow(acquire.acquire_zillow())
    # Only the train portion of the split is plotted
    train_split, _, _ = prepare.split_focused_zillow(cleaned)
    # Pairwise plots of every column in train
    sns.pairplot(data=train_split, hue='appraised_value', palette='viridis')
def plot_train_heatmap():
    '''Draws an annotated correlation heatmap of the train split of the cleaned zillow data'''
    # Acquire the raw data and clean it in one step
    cleaned = prepare.clean_zillow(acquire.acquire_zillow())
    # Only the train portion of the split is used
    train_split, _, _ = prepare.split_focused_zillow(cleaned)

    # Appearance of the heatmap: colour scale fixed to [-1, 1] and centred on 0
    heatmap_options = {
        'cmap': "viridis",
        'vmin': -1,
        'vmax': 1,
        'annot': True,
        'center': 0,
        'linewidths': 4,
        'linecolor': 'silver',
    }

    plt.figure(figsize=(16, 6))
    sns.heatmap(train_split.corr(), **heatmap_options)
    plt.title('Zillow Correlation Heatmap of Trained Data without Scaling',
              fontsize=18, pad=12)
def xtrain_xval_xtest():
    '''Build the feature/target pairs for the train, validate and test splits.

    Returns X_train, y_train, X_validate, y_validate, X_test, y_test, where
    each X is the split without the appraised_value column. y_train and
    y_validate are wrapped in a DataFrame; y_test is returned as the plain
    appraised_value column.
    '''
    target = 'appraised_value'

    # acquire -> clean -> focus, then split
    focused = prepare.focused_zillow(
        prepare.clean_zillow(acquire.acquire_zillow()))
    train, validate, test = prepare.split_focused_zillow(focused)

    # features: everything except the target column
    X_train, X_validate, X_test = (
        part.drop(columns=[target]) for part in (train, validate, test)
    )

    # targets: train/validate as single-column DataFrames, test left as-is
    y_train = pd.DataFrame(train[target])
    y_validate = pd.DataFrame(validate[target])
    y_test = test[target]

    return X_train, y_train, X_validate, y_validate, X_test, y_test
def SSE_MSE_RMSE():
    '''Compute SSE, MSE and RMSE for the model and the baseline on train.

    Uses the frame returned by add_to_train(), reading its 'residual_sqr'
    and 'baseline_residual_sqr' columns.

    Returns:
        SSE, SSE_baseline, MSE, MSE_baseline, RMSE, RMSE_baseline
    '''
    # pull from add_to_train, which already acquires, prepares and splits
    train = add_to_train()

    # The squared residuals are summed over the train rows, so the mean must
    # be taken over the same rows (not over the full, unsplit dataset).
    n_obs = len(train)

    # set up for SSE
    SSE = train['residual_sqr'].sum()
    SSE_baseline = train['baseline_residual_sqr'].sum()

    # set up for MSE
    MSE = SSE / n_obs
    MSE_baseline = SSE_baseline / n_obs

    # set up for RMSE
    RMSE = sqrt(MSE)
    RMSE_baseline = sqrt(MSE_baseline)

    return SSE, SSE_baseline, MSE, MSE_baseline, RMSE, RMSE_baseline
def bathroom_corr():
    '''Run a Pearson correlation test between bathrooms and appraised value.

    Prints the correlation coefficient, the p-value and the outcome of the
    hypothesis test on the train split, then draws a distribution plot of
    bathrooms and a box plot of appraised_value by bathrooms.
    '''
    # Take in the dataframe
    df = acquire.acquire_zillow()
    # Prepare the data
    df = prepare.clean_zillow(df)
    # Split the data set; only train is analysed
    train, _, _ = prepare.split_focused_zillow(df)

    # Set null and alternative hypothesis, confidence level, and alpha
    null_hypothesis = "There is no correlation between number of bathrooms and appraised value."
    alt_hypothesis = "There is a correlation between number of bathrooms and appraised value."
    confidence_level = .95
    alpha = 1 - confidence_level

    # run the test
    corr, p = stats.pearsonr(train.bathrooms, train.appraised_value)
    print(' The correlation between Bathrooms and the Appraised value is: ', corr)
    print(' The P value between Bathrooms and Appraised Value is: ', p)
    print(' ')

    if p < alpha:
        print(f"Reject null hypothesis: '{null_hypothesis}'")
        print(' ')
        print(f"We now move forward with our alternative hypothesis: '{alt_hypothesis}'")
        print(' ')
        # Inclusive bounds so a coefficient of exactly +/-0.6 or +/-1 is
        # still classified instead of printing nothing.
        if corr >= .6:
            print("That is a strong positive correlation.")
        elif corr > 0:
            print("This is a weak positive correlation.")
        elif corr <= -.6:
            print("That is a strong negative correlation.")
        elif corr < 0:
            print("This is a weak negative correlation.")
    else:
        print("Fail to reject the null hypothesis.")

    # distplot
    # NOTE(review): seaborn has deprecated distplot; consider histplot/displot.
    sns.distplot(train.bathrooms, kde=True, color='teal')

    # boxplot on its own figure, so it is not drawn over the distribution plot
    plt.figure()
    sns.boxplot(y='appraised_value', x='bathrooms', data=train, palette='viridis')
def square_feet_corr():
    '''Run a Pearson correlation test between square feet and appraised value.

    Prints the correlation coefficient, the p-value and the outcome of the
    hypothesis test on the train split, then draws a distribution plot of
    square_feet and a joint plot of square_feet against appraised_value.
    '''
    # Take in the dataframe
    df = acquire.acquire_zillow()
    # Prepare the data
    df = prepare.clean_zillow(df)
    # Split the data set; only train is analysed
    train, _, _ = prepare.split_focused_zillow(df)

    # Set null and alternative hypothesis, confidence level, and alpha
    null_hypothesis = "There is no correlation between a homes square footage and appraised value."
    alt_hypothesis = "There is a correlation between square feet and appraised value."
    confidence_level = .95
    alpha = 1 - confidence_level

    # run the test
    corr, p = stats.pearsonr(train.square_feet, train.appraised_value)
    # The labels name the feature actually being tested (square feet).
    print(' The correlation between Square Feet and the Appraised value is: ', corr)
    print(' The P value between Square Feet and Appraised Value is: ', p)
    print(' ')

    if p < alpha:
        print(f"Reject null hypothesis: '{null_hypothesis}'")
        print(' ')
        print(f"We now move forward with our alternative hypothesis: '{alt_hypothesis}'")
        print(' ')
        # Inclusive bounds so a coefficient of exactly +/-0.6 or +/-1 is
        # still classified instead of printing nothing.
        if corr >= .6:
            print("That is a strong positive correlation.")
        elif corr > 0:
            print("This is a weak positive correlation.")
        elif corr <= -.6:
            print("That is a strong negative correlation.")
        elif corr < 0:
            print("This is a weak negative correlation.")
    else:
        print("Fail to reject the null hypothesis.")

    # distplot
    # NOTE(review): seaborn has deprecated distplot; consider histplot/displot.
    sns.distplot(train.square_feet, kde=True, color='teal')

    # jointplot
    sns.jointplot(data=train, x='square_feet', y='appraised_value', color='teal')
def add_to_train():
    '''Return the train split with OLS prediction and residual columns added.

    Fits the formula 'appraised_value ~ bedrooms' on the train split and adds
    these columns: yhat (prediction rounded to 2 decimals), baseline (mean of
    appraised_value), residual, baseline_residual, residual_sqr and
    baseline_residual_sqr.

    NOTE(review): this file defines add_to_train a second time further down;
    the later definition is the one bound to the name after import.
    '''
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    train, _, _ = prepare.split_focused_zillow(df)

    # Work on an explicit copy so the new columns are never written onto a
    # slice of the original frame.
    train = train.copy()

    ols_model = ols('appraised_value ~ bedrooms', data=train).fit()

    train['yhat'] = ols_model.predict(train).round(2)
    train['baseline'] = train.appraised_value.mean()
    train['residual'] = train.appraised_value - train.yhat
    train['baseline_residual'] = train.appraised_value - train.baseline
    train['residual_sqr'] = train.residual**2
    train['baseline_residual_sqr'] = train.baseline_residual**2

    return train
def add_to_train():
    '''Prepare train for the next steps by adding prediction and residual columns.

    Fits the formula 'appraised_value ~ bedrooms' on the train split and
    returns train with these extra columns:

    - yhat: model prediction, rounded to 2 decimals
    - baseline: mean of appraised_value on train
    - residual / baseline_residual: actual minus yhat / baseline
    - residual_sqr / baseline_residual_sqr: the squared residuals
    '''
    # get the dataframe
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    train, _, _ = prepare.split_focused_zillow(df)

    # Copy first so the feature columns below are added to an independent
    # frame rather than to a slice of the prepared data.
    train = train.copy()

    # create the ols model
    ols_model = ols('appraised_value ~ bedrooms', data=train).fit()

    # make new features
    actual = train.appraised_value
    train['yhat'] = ols_model.predict(train).round(2)
    train['baseline'] = actual.mean()
    train['residual'] = actual - train.yhat
    train['baseline_residual'] = actual - train.baseline
    train['residual_sqr'] = train.residual**2
    train['baseline_residual_sqr'] = train.baseline_residual**2

    return train