示例#1
0
def plot_scatter_plots():
    '''Plots scatter plots to show the relationships between features and appraised_value

    Acquires and cleans the zillow data, splits it, and draws three stacked
    scatter plots from the train split only: appraised_value on the x-axis
    against bathrooms, bedrooms and square_feet on the y-axis. Points are
    colored by the feature being plotted. Returns nothing.
    '''
    # Take in the dataframe
    df = acquire.acquire_zillow()
    # Prepare the data
    df = prepare.clean_zillow(df)
    # Split the data set
    train, validate, test = prepare.split_focused_zillow(df)

    # Set the theme before the figure is built so every axes is created with it
    sns.set(style="darkgrid")
    # Each panel plots a different feature on its y-axis, so the y-axes are
    # deliberately NOT shared: a shared scale sized for square_feet would
    # flatten the bathrooms and bedrooms panels.
    fig, axes = plt.subplots(3, 1, figsize=(12, 40))

    panels = [
        ('bathrooms',
         "Relationship between Appraised Values, and Bathrooms"),
        ('bedrooms',
         "Relationship between Appraised Values, Bedrooms"),
        ('square_feet',
         "Relationship between Appraised Values, Square Footage of Homes"),
    ]

    # Draw on the explicit axes objects instead of relying on pyplot's
    # "current axes" state.
    for ax, (feature, title) in zip(axes, panels):
        ax.set_title(title, size=20, color='black')
        sns.scatterplot(data=train, x='appraised_value', y=feature,
                        hue=feature, palette='viridis', ax=ax)
        # Park the legend outside the plotting area, to the right
        ax.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
示例#2
0
def test_final_model():
    '''Fits OLS on the train split and prints its RMSE on the test split

    Acquires, cleans and focuses the zillow data, splits it, fits a
    LinearRegression on every train column except appraised_value, and
    evaluates that same fitted model on the held-out test split. Prints the
    test RMSE and returns nothing.
    '''
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    train, validate, test = prepare.split_focused_zillow(df)

    X_train = train.drop(columns=['appraised_value'])
    y_train = train['appraised_value']

    X_test = test.drop(columns=['appraised_value'])
    y_test = test['appraised_value']

    # set up the model
    # `normalize` is not passed: it was removed from LinearRegression in
    # scikit-learn 1.2, and rescaling columns should not alter the
    # predictions of an unregularised least-squares fit.
    lm = LinearRegression()
    # fit the model on train ONLY; re-fitting on test would turn the
    # reported number into a training error rather than an out-of-sample one
    lm.fit(X_train, y_train)

    # test the model: predict the unseen rows with the train-fitted model.
    # Predictions stay in a local array so no column is written onto a
    # slice of the split dataframe.
    test_predictions = lm.predict(X_test)
    # evaluate: rmse
    rmse_test_lm = mean_squared_error(y_test, test_predictions)**(1 / 2)
    print("RMSE for OLS using LinearRegression Test Data: ", rmse_test_lm)
示例#3
0
def train_pairplot():
    '''Plots a seaborn pairplot of the train split, colored by appraised_value'''
    # Load the raw data, then clean it
    raw = acquire.acquire_zillow()
    cleaned = prepare.clean_zillow(raw)
    # Only the train portion of the split is plotted
    train, _validate, _test = prepare.split_focused_zillow(cleaned)
    # Pairwise grid over the columns of train
    sns.pairplot(train, hue='appraised_value', palette='viridis')
示例#4
0
def plot_train_heatmap():
    '''Plots an annotated correlation heatmap of the train split of the cleaned zillow data'''
    # Acquire, clean and split; only the train portion is used
    cleaned = prepare.clean_zillow(acquire.acquire_zillow())
    train, _validate, _test = prepare.split_focused_zillow(cleaned)

    # Heatmap appearance: color scale pinned to the [-1, 1] correlation range
    # and centered on zero, with silver lines separating the cells
    heatmap_style = dict(
        cmap="viridis",
        vmin=-1,
        vmax=1,
        annot=True,
        center=0,
        linewidths=4,
        linecolor='silver',
    )

    plt.figure(figsize=(16, 6))
    sns.heatmap(train.corr(), **heatmap_style)
    plt.title('Zillow Correlation Heatmap of Trained Data without Scaling',
              fontsize=18, pad=12)
def xtrain_xval_xtest():
    '''Builds the feature/target pairs for the train, validate and test splits

    Returns X_train, y_train, X_validate, y_validate, X_test, y_test, where
    each X is the split without the appraised_value column and each y is
    that column.
    '''
    target = 'appraised_value'

    def _features_and_target(split):
        # Everything except the target is a feature
        return split.drop(columns=[target]), split[target]

    df = prepare.focused_zillow(prepare.clean_zillow(acquire.acquire_zillow()))
    train, validate, test = prepare.split_focused_zillow(df)

    X_train, y_train = _features_and_target(train)
    X_validate, y_validate = _features_and_target(validate)
    X_test, y_test = _features_and_target(test)

    # NOTE(review): only the train and validate targets are wrapped in a
    # DataFrame; y_test is returned as a Series. Kept as-is so callers see
    # the same return types - confirm whether the asymmetry is intended.
    y_train = pd.DataFrame(y_train)
    y_validate = pd.DataFrame(y_validate)

    return X_train, y_train, X_validate, y_validate, X_test, y_test
def SSE_MSE_RMSE():
    '''Finds the SSE, MSE and RMSE of the model and of the baseline from add_to_train

    Reads the residual_sqr and baseline_residual_sqr columns of the
    dataframe returned by add_to_train() and returns, in order:
    SSE, SSE_baseline, MSE, MSE_baseline, RMSE, RMSE_baseline.
    '''
    # add_to_train() acquires, cleans and splits the data itself, so there
    # is no need to repeat that pipeline here
    train = add_to_train()
    # The squared residuals are summed over the train rows, so the mean must
    # be taken over the number of train rows (not over the full dataset)
    n_obs = len(train)
    # set up for SSE
    SSE = train['residual_sqr'].sum()
    SSE_baseline = train['baseline_residual_sqr'].sum()
    # set up for MSE
    MSE = SSE / n_obs
    MSE_baseline = SSE_baseline / n_obs
    # set up for RMSE
    RMSE = sqrt(MSE)
    RMSE_baseline = sqrt(MSE_baseline)
    return SSE, SSE_baseline, MSE, MSE_baseline, RMSE, RMSE_baseline
示例#7
0
def bathroom_corr():
    '''Runs correlation test between bathrooms and appraised value
    plot distribution plot
    plot box plot

    A Pearson correlation test is run on the train split. The coefficient,
    the p-value and the decision at a 95% confidence level are printed; when
    the null hypothesis is rejected the correlation is also labelled weak or
    strong (|r| >= 0.6) and positive or negative. Returns nothing.
    '''
    # Take in the dataframe
    df = acquire.acquire_zillow()
    # Prepare the data
    df = prepare.clean_zillow(df)
    # Split the data set
    train, validate, test = prepare.split_focused_zillow(df)
    # Set null and alternative hypothesis, confidence level, and alpha
    null_hypothesis = "There is no correlation between number of bathrooms and appraised value."
    alt_hypothesis = "There is a correlation between number of bathrooms and appraised value."
    confidence_level = .95
    a = 1 - confidence_level
    # set x and y
    x = train.bathrooms
    y = train.appraised_value
    # run it
    corr, p = stats.pearsonr(x, y)
    print(' The correlation between Bathrooms and the Appraised value is: ', corr)
    print(' The P value between Bathrooms and Appraised Value is: ', p)
    print(' ')
    if p < a:
        print(f"Reject null hypothesis: '{null_hypothesis}'")
        print(' ')
        print(f"We now move forward with our alternative hypothesis: '{alt_hypothesis}'")
        print(' ')
        # Inclusive bounds so that a coefficient of exactly +/-0.6 or +/-1
        # is still classified instead of printing nothing
        if corr >= .6:
            print("That is a strong positive correlation.")
        elif corr > 0:
            print("This is a weak positive correlation.")
        elif corr <= -.6:
            print("That is a strong negative correlation.")
        elif corr < 0:
            print("This is a weak negative correlation.")
    else:
        print("Fail to reject the null hypothesis.")
    # Both plots below draw on the current axes, so each gets a fresh figure
    # to keep the box plot from being painted over the distribution plot
    # distplot
    plt.figure()
    sns.distplot(train.bathrooms, kde=True, color='teal')
    # boxplot
    plt.figure()
    sns.boxplot(y='appraised_value', x='bathrooms', data=train, palette='viridis')
示例#8
0
def square_feet_corr():
    '''Runs correlation test between square feet and appraised value
    plot a distribution plot
    plot a jointplot

    A Pearson correlation test is run on the train split. The coefficient,
    the p-value and the decision at a 95% confidence level are printed; when
    the null hypothesis is rejected the correlation is also labelled weak or
    strong (|r| >= 0.6) and positive or negative. Returns nothing.
    '''
    # Take in the dataframe
    df = acquire.acquire_zillow()
    # Prepare the data
    df = prepare.clean_zillow(df)
    # Split the data set
    train, validate, test = prepare.split_focused_zillow(df)
    # Set null and alternative hypothesis, confidence level, and alpha
    null_hypothesis = "There is no correlation between a homes square footage and appraised value."
    alt_hypothesis = "There is a correlation between square feet and appraised value."
    confidence_level = .95
    a = 1 - confidence_level
    # set x and y
    x = train.square_feet
    y = train.appraised_value

    # run it
    corr, p = stats.pearsonr(x, y)
    # Labels name the feature actually under test (they previously said
    # "Bathrooms", carried over from bathroom_corr)
    print(' The correlation between Square Feet and the Appraised value is: ', corr)
    print(' The P value between Square Feet and Appraised Value is: ', p)
    print(' ')
    if p < a:
        print(f"Reject null hypothesis: '{null_hypothesis}'")
        print(' ')
        print(f"We now move forward with our alternative hypothesis: '{alt_hypothesis}'")
        print(' ')
        # Inclusive bounds so that a coefficient of exactly +/-0.6 or +/-1
        # is still classified instead of printing nothing
        if corr >= .6:
            print("That is a strong positive correlation.")
        elif corr > 0:
            print("This is a weak positive correlation.")
        elif corr <= -.6:
            print("That is a strong negative correlation.")
        elif corr < 0:
            print("This is a weak negative correlation.")
    else:
        print("Fail to reject the null hypothesis.")
    # distplot draws on the current axes, so give it a fresh figure rather
    # than whatever plot happened to be active before this call
    plt.figure()
    sns.distplot(train.square_feet, kde=True, color='teal')
    # jointplot
    sns.jointplot(data=train, x='square_feet', y='appraised_value', color='teal')
def add_to_train():
    '''prepare train for the next steps

    Fits the OLS model 'appraised_value ~ bedrooms' on the train split and
    returns train with these columns added:
      yhat                   model prediction, rounded to 2 decimals
      baseline               mean appraised_value of train
      residual               appraised_value - yhat
      baseline_residual      appraised_value - baseline
      residual_sqr           residual squared
      baseline_residual_sqr  baseline_residual squared
    '''
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    train, validate, test = prepare.split_focused_zillow(df)
    # Work on a copy so the new columns are never written onto a slice of df
    train = train.copy()
    ols_model = ols('appraised_value ~ bedrooms', data=train).fit()
    train['yhat'] = ols_model.predict(train).round(2)
    train['baseline'] = train.appraised_value.mean()
    train['residual'] = train.appraised_value - train.yhat
    train['baseline_residual'] = train.appraised_value - train.baseline
    train['residual_sqr'] = train.residual**2
    train['baseline_residual_sqr'] = train.baseline_residual**2
    # Summary statistics (SSE/MSE/RMSE) are not computed here: they were
    # never returned, and they are what SSE_MSE_RMSE() derives from the
    # columns above.
    return train
示例#10
0
def add_to_train():
    '''prepare train for the next steps

    Fits the OLS model 'appraised_value ~ bedrooms' on the train split and
    returns train extended with the prediction (yhat), the mean baseline,
    both residuals, and both squared residuals.
    '''
    # get the dataframe
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    train, validate, test = prepare.split_focused_zillow(df)
    # copy first, so the columns added below do not land on a slice of df
    train = train.copy()
    # create the ols model
    ols_model = ols('appraised_value ~ bedrooms', data=train).fit()
    # make new features
    # prediction of the fitted model, rounded to 2 decimals
    train['yhat'] = ols_model.predict(train).round(2)
    # baseline prediction: the mean appraised_value of train
    train['baseline'] = train.appraised_value.mean()
    # errors of the model and of the baseline
    train['residual'] = train.appraised_value - train.yhat
    train['baseline_residual'] = train.appraised_value - train.baseline
    # squared errors
    train['residual_sqr'] = train.residual**2
    train['baseline_residual_sqr'] = train.baseline_residual**2
    # The SSE, MSE and RMSE values formerly computed here were discarded
    # locals (only train is returned), so they have been dropped.
    return train