# Example #1
# 0
def plot_actual_and_pred():
    '''Plot actual vs. predicted appraised_value for the OLS model.

    Fits a LinearRegression on the train split returned by
    evaluate.xtrain_xval_xtest() and, for the validate split, draws on a
    new figure:

    * a scatter of actual vs. predicted appraised_value,
    * a first-degree least-squares line through that scatter,
    * the baseline (mean of the train appraised_value),
    * the identity line where predicted == actual.

    Returns nothing and does not call plt.show().
    '''
    # evaluate.xtrain_xval_xtest() takes no arguments, so the frames this
    # function used to acquire/prepare beforehand were never consumed.
    X_train, y_train, X_validate, y_validate, _, _ = (
        evaluate.xtrain_xval_xtest())

    actual = y_validate['appraised_value']

    # Baseline: predict the train mean for every validate row.
    y_validate['appraised_value_pred_mean'] = (
        y_train['appraised_value'].mean())

    # OLS model. `normalize=` was removed from LinearRegression in
    # scikit-learn 1.2; for unpenalised least squares it does not change the
    # fitted predictions, so it is simply dropped.
    lm = LinearRegression()
    lm.fit(X_train, y_train['appraised_value'])
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    predicted = y_validate['appraised_value_pred_lm']

    # Make the plot
    plt.figure(figsize=(20, 10))
    sns.set(style="darkgrid")

    plt.scatter(actual,
                predicted,
                alpha=.5,
                color="mediumblue",
                s=100,
                label="Model: LinearRegression")

    # Straight-line fit of predicted on actual.
    slope, intercept = np.polyfit(actual, predicted, 1)
    plt.plot(actual,
             slope * actual + intercept,
             color='limegreen',
             label='Line of Regression',
             linewidth=5)
    plt.plot(actual,
             y_validate['appraised_value_pred_mean'],
             alpha=.5,
             color="yellow",
             label='Baseline',
             linewidth=5)
    plt.plot(actual,
             actual,
             alpha=.5,
             color="cyan",
             label='The Ideal Line: Predicted = Actual',
             linewidth=5)
    plt.title('Plotting Actual vs. Predicted Values')
    plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
def SSE_MSE_RMSE_info():
    '''Print SSE, MSE and RMSE for the model and the baseline, then plot.

    The residual columns come from the frame returned by add_to_train().
    MSE is the SSE divided by the number of rows in that same frame (the
    rows the residuals were computed on). After printing, a histogram of the
    train appraised_value is shown with vertical lines at the rounded mean
    and median baselines.

    Returns nothing; prints to stdout and calls plt.show().
    '''
    train = add_to_train()
    _, y_train, *_ = xtrain_xval_xtest()

    # Baseline predictions, rounded to cents, stored next to the target.
    home_value_baseline_median = y_train['appraised_value'].median()
    y_train['appraised_value_pred_median'] = round(home_value_baseline_median,
                                                   2)
    home_value_baseline_mean = y_train['appraised_value'].mean()
    y_train['appraised_value_pred_mean'] = round(home_value_baseline_mean, 2)

    # set up for SSE
    train['residual_sqr'] = train.residual**2
    train['baseline_residual_sqr'] = train.baseline_residual**2
    SSE = train['residual_sqr'].sum()
    SSE_baseline = train['baseline_residual_sqr'].sum()
    print("SSE = ", round(SSE, 3))
    print("SSE Baseline = ", round(SSE_baseline, 3))
    print('------------------------------------------')

    # set up for MSE: average over the rows that produced the residuals.
    # Dividing by the size of the full, unsplit dataset understated the error.
    n_obs = len(train)
    MSE = SSE / n_obs
    MSE_baseline = SSE_baseline / n_obs
    print("MSE = ", round(MSE, 3))
    print("MSE baseline = ", round(MSE_baseline, 3))
    print('------------------------------------------')

    # set up for RMSE
    RMSE = sqrt(MSE)
    RMSE_baseline = sqrt(MSE_baseline)
    print("RMSE = ", round(RMSE, 3))
    print("RMSE baseline = ", round(RMSE_baseline, 3))
    print('------------------------------------------')

    # plot to visualize actual vs predicted.
    sns.set(style="darkgrid")
    plt.hist(y_train.appraised_value,
             color='teal',
             alpha=.5,
             label="Actual Home Value")
    # NOTE(review): passing the whole constant column draws one line per row
    # at the same x; a scalar would be enough, but the stacked alpha would
    # then render fainter, so the call is left unchanged.
    plt.vlines(y_train.appraised_value_pred_mean,
               0,
               5000,
               color='yellow',
               alpha=.3,
               label="Predicted Mean Home Value")
    plt.vlines(y_train.appraised_value_pred_median,
               0,
               5000,
               color='lawngreen',
               alpha=.2,
               label="Predicted Median Home Value")
    plt.xlabel("Home Value")
    plt.ylabel("Number Homes")
    plt.legend()
    plt.show()
# Example #3
# 0
def plot_scatter_plots():
    '''Draw three stacked scatter plots of appraised_value against a feature.

    The panels, top to bottom, use bathrooms, bedrooms and square_feet from
    the train split; each panel is coloured by the same feature it plots.
    '''
    cleaned = prepare.clean_zillow(acquire.acquire_zillow())
    train, _validate, _test = prepare.split_focused_zillow(cleaned)

    plt.subplots(3, 1, figsize=(12, 40), sharey=True)
    sns.set(style="darkgrid")

    # (panel title, feature column) in drawing order.
    panels = (
        ("Relationship between Appraised Values, and Bathrooms",
         'bathrooms'),
        ("Relationship between Appraised Values, Bedrooms",
         'bedrooms'),
        ("Relationship between Appraised Values, Square Footage of Homes",
         'square_feet'),
    )
    for position, (heading, feature) in enumerate(panels, start=1):
        plt.subplot(3, 1, position)
        plt.title(heading, size=20, color='black')
        sns.scatterplot(data=train,
                        x='appraised_value',
                        y=feature,
                        hue=feature,
                        palette='viridis')
        plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
def wrangle_zillow_data():
    '''Acquire the Zillow data and return it with no missing values.

    Runs the frame through prepare.zillow_single_unit, removes a fixed list
    of columns, applies prepare.handle_missing_values and finally drops every
    row that still contains a null.
    '''
    unwanted_columns = [
        'calculatedbathnbr',
        'finishedsquarefeet12',
        'fullbathcnt',
        'propertycountylandusecode',
        'unitcnt',
        'structuretaxvaluedollarcnt',
        'landtaxvaluedollarcnt',
        'assessmentyear',
        'propertyzoningdesc',
    ]
    frame = prepare.zillow_single_unit(acquire.acquire_zillow())
    frame = prepare.remove_columns(frame, unwanted_columns)
    frame = prepare.handle_missing_values(frame)
    frame.dropna(inplace=True)
    return frame
# Example #5
# 0
def test_final_model():
    '''Fit OLS on the train split and print its RMSE on the test split.

    The model is fitted on train only and then used to predict the held-out
    test rows, so the printed RMSE is an out-of-sample figure.

    Returns nothing; prints one line to stdout.
    '''
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    train, validate, test = prepare.split_focused_zillow(df)

    X_train = train.drop(columns=['appraised_value'])
    y_train = train['appraised_value']

    X_test = test.drop(columns=['appraised_value'])
    # Copy so the prediction column is added to an independent frame.
    y_test = test[['appraised_value']].copy()

    # `normalize=` was removed from LinearRegression in scikit-learn 1.2; for
    # unpenalised least squares it does not change the fitted predictions.
    lm = LinearRegression()
    lm.fit(X_train, y_train)

    # Score the train-fitted model on test. Re-fitting on the test rows, as
    # was done before, reports an in-sample error and hides overfitting.
    y_test['appraised_value_pred_lm'] = lm.predict(X_test)
    rmse_test_lm = mean_squared_error(y_test.appraised_value,
                                      y_test.appraised_value_pred_lm)**(1 / 2)
    print("RMSE for OLS using LinearRegression Test Data: ", rmse_test_lm)
# Example #6
# 0
def train_pairplot():
    '''Draw a seaborn pairplot of the train split, hued by appraised_value.'''
    cleaned = prepare.clean_zillow(acquire.acquire_zillow())
    train, _validate, _test = prepare.split_focused_zillow(cleaned)
    sns.pairplot(train, hue='appraised_value', palette='viridis')
# Example #7
# 0
def plot_zillow_heatmap():
    '''Draw an annotated correlation heatmap of the cleaned Zillow frame.

    The colour scale is pinned to [-1, 1] and the plot goes on a new
    16x6 figure.
    '''
    cleaned = prepare.clean_zillow(acquire.acquire_zillow())

    plt.figure(figsize=(16, 6))
    correlations = cleaned.corr()
    heatmap_axes = sns.heatmap(correlations,
                               cmap="viridis",
                               vmin=-1,
                               vmax=1,
                               annot=True)
    heatmap_axes.set_title('Zillow Correlation Heatmap of Zilllow Data',
                           fontdict={'fontsize': 18},
                           pad=12)
# Example #8
# 0
def plot_ols_errors():
    '''Plot the prediction errors of the OLS model on the validate split.

    Fits a LinearRegression on the train split returned by
    evaluate.xtrain_xval_xtest() and scatters (predicted - actual) against
    the actual appraised_value, with a horizontal line at zero error.

    Returns nothing and does not call plt.show().
    '''
    # evaluate.xtrain_xval_xtest() takes no arguments, so the frames this
    # function used to acquire/prepare beforehand were never consumed.
    X_train, y_train, X_validate, y_validate, _, _ = (
        evaluate.xtrain_xval_xtest())

    # OLS model. `normalize=` was removed from LinearRegression in
    # scikit-learn 1.2; for unpenalised least squares it does not change the
    # fitted predictions, so it is simply dropped.
    lm = LinearRegression()
    lm.fit(X_train, y_train['appraised_value'])

    actual = y_validate['appraised_value']
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    errors = y_validate['appraised_value_pred_lm'] - actual

    # Make the plot
    plt.figure(figsize=(20, 10))
    sns.set(style="darkgrid")

    plt.scatter(actual,
                errors,
                alpha=.5,
                color="mediumblue",
                s=100,
                label="Model: LinearRegression")
    # axhline defaults to y=0, i.e. predicted == actual.
    plt.axhline(label="No Error", color='black', linewidth=7)
    plt.title('Plotting the Errors in Predictions')
    plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
# Example #9
# 0
def plot_train_heatmap():
    '''Draw an annotated correlation heatmap of the train split.

    The colour scale is pinned to [-1, 1] and centred on 0; cells are
    separated by silver grid lines. The plot goes on a new 16x6 figure.
    '''
    cleaned = prepare.clean_zillow(acquire.acquire_zillow())
    train, _validate, _test = prepare.split_focused_zillow(cleaned)

    plt.figure(figsize=(16, 6))
    heatmap_options = dict(cmap="viridis",
                           vmin=-1,
                           vmax=1,
                           annot=True,
                           center=0,
                           linewidths=4,
                           linecolor='silver')
    sns.heatmap(train.corr(), **heatmap_options)
    plt.title('Zillow Correlation Heatmap of Trained Data without Scaling',
              fontsize=18,
              pad=12)
def xtrain_xval_xtest():
    '''Build feature/target pairs for the train, validate and test splits.

    Returns, in this order:
    X_train, y_train, X_validate, y_validate, X_test, y_test.

    Each X_* is the split without the appraised_value column. y_train and
    y_validate are one-column DataFrames; y_test is left as a Series.
    '''
    target = 'appraised_value'

    frame = prepare.focused_zillow(
        prepare.clean_zillow(acquire.acquire_zillow()))
    train, validate, test = prepare.split_focused_zillow(frame)

    X_train, X_validate, X_test = (part.drop(columns=[target])
                                   for part in (train, validate, test))

    y_train = pd.DataFrame(train[target])
    y_validate = pd.DataFrame(validate[target])
    y_test = test[target]

    return X_train, y_train, X_validate, y_validate, X_test, y_test
# Example #11
# 0
def choose_best_model():
    '''Print the RMSE of the mean baseline and of OLS on train and validate.

    The "Model Selected" header is fixed text: the function reports the two
    sets of RMSE values side by side, it does not pick a model
    programmatically.

    Returns nothing; prints to stdout.
    '''
    # evaluate.xtrain_xval_xtest() takes no arguments, so the frames this
    # function used to acquire/prepare beforehand were never consumed.
    X_train, y_train, X_validate, y_validate, _, _ = (
        evaluate.xtrain_xval_xtest())
    # make into data frames so prediction columns can be added
    y_train = pd.DataFrame(y_train)
    y_validate = pd.DataFrame(y_validate)

    # Baseline: predict the train mean everywhere.
    appraised_value_pred_mean = y_train['appraised_value'].mean()
    y_train['appraised_value_pred_mean'] = appraised_value_pred_mean
    y_validate['appraised_value_pred_mean'] = appraised_value_pred_mean

    # RMSE of appraised_value_pred_mean
    rmse_train_mean = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_mean)**(1 / 2)
    rmse_validate_mean = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_mean)**(1 / 2)

    # OLS model. `normalize=` was removed from LinearRegression in
    # scikit-learn 1.2; for unpenalised least squares it does not change the
    # fitted predictions, so it is simply dropped.
    lm = LinearRegression()
    lm.fit(X_train, y_train.appraised_value)
    y_train['appraised_value_pred_lm'] = lm.predict(X_train)
    rmse_train_lm = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_lm)**(1 / 2)
    # Validate predictions must come from X_validate, not X_train.
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    rmse_validate_lm = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_lm)**(1 / 2)

    # Report
    print("Model Selected: RMSE for OLS using Linear Regression")
    print("--------------------------------------------------------------")
    print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train_mean, 2),
          "\nValidate/Out-of-Sample: ", round(rmse_validate_mean, 2))
    print("--------------------------------------------------------------")
    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ",
          rmse_train_lm, "\nValidation/Out-of-Sample: ", rmse_validate_lm)
def SSE_MSE_RMSE():
    '''Return SSE, MSE and RMSE for the model and for the baseline.

    Reads the residual_sqr and baseline_residual_sqr columns of the frame
    returned by add_to_train(). MSE is the SSE divided by the number of rows
    in that same frame.

    Returns:
        (SSE, SSE_baseline, MSE, MSE_baseline, RMSE, RMSE_baseline)
    '''
    # add_to_train() is called with no arguments and its result replaced the
    # locally built split, so no separate acquire/prepare/split is needed.
    train = add_to_train()

    # set up for SSE
    SSE = train['residual_sqr'].sum()
    SSE_baseline = train['baseline_residual_sqr'].sum()

    # set up for MSE: average over the rows that produced the residuals.
    # Dividing by the size of the full, unsplit dataset understated the error.
    n_obs = len(train)
    MSE = SSE / n_obs
    MSE_baseline = SSE_baseline / n_obs

    # set up for RMSE
    RMSE = sqrt(MSE)
    RMSE_baseline = sqrt(MSE_baseline)
    return SSE, SSE_baseline, MSE, MSE_baseline, RMSE, RMSE_baseline
# Example #13
# 0
def get_baseline():
    '''Print the RMSE of the mean and median baselines on train and validate.

    Both baselines are computed from the train appraised_value only and then
    applied unchanged to the validate split.

    Returns nothing; prints to stdout.
    '''
    # evaluate.xtrain_xval_xtest() takes no arguments, so the frames this
    # function used to acquire/prepare beforehand were never consumed.
    _, y_train, _, y_validate, _, _ = evaluate.xtrain_xval_xtest()
    # make into data frames so prediction columns can be added
    y_train = pd.DataFrame(y_train)
    y_validate = pd.DataFrame(y_validate)

    # Two different baselines: mean and median of the train target.
    appraised_value_pred_mean = y_train['appraised_value'].mean()
    y_train['appraised_value_pred_mean'] = appraised_value_pred_mean
    y_validate['appraised_value_pred_mean'] = appraised_value_pred_mean

    appraised_value_pred_median = y_train['appraised_value'].median()
    y_train['appraised_value_pred_median'] = appraised_value_pred_median
    y_validate['appraised_value_pred_median'] = appraised_value_pred_median

    # RMSE of the mean baseline
    rmse_train_mean = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_mean)**(1 / 2)
    rmse_validate_mean = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_mean)**(1 / 2)

    # RMSE of the median baseline
    rmse_train_median = mean_squared_error(
        y_train.appraised_value, y_train.appraised_value_pred_median)**(1 / 2)
    rmse_validate_median = mean_squared_error(
        y_validate.appraised_value,
        y_validate.appraised_value_pred_median)**(1 / 2)

    print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train_mean, 2),
          "\nValidate/Out-of-Sample: ", round(rmse_validate_mean, 2))
    print(' ')

    # The validate figure here must be the median one; it previously
    # repeated the mean baseline's validate RMSE.
    print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train_median, 2),
          "\nValidate/Out-of-Sample: ", round(rmse_validate_median, 2))
# Example #14
# 0
def bathroom_corr():
    '''Test the correlation between bathrooms and appraised_value.

    Runs a Pearson test on the train split at a 95% confidence level, prints
    the coefficient, the p-value and the conclusion, then draws a
    distribution plot of bathrooms and a box plot of appraised_value per
    bathroom count, each on its own figure.

    Returns nothing; prints to stdout and draws two figures.
    '''
    # Take in the dataframe
    df = acquire.acquire_zillow()
    # Prepare the data
    df = prepare.clean_zillow(df)
    # Split the data set
    train, validate, test = prepare.split_focused_zillow(df)

    # Set null and alternative hypothesis, confidence level, and alpha
    null_hypothesis = "There is no correlation between number of bathrooms and appraised value."
    alt_hypothesis = "There is a correlation between number of bathrooms and appraised value."
    confidence_level = .95
    a = 1 - confidence_level

    corr, p = stats.pearsonr(train.bathrooms, train.appraised_value)
    print(' The correlation between Bathrooms and the Appraised value is: ', corr)
    print(' The P value between Bathrooms and Appraised Value is: ', p)
    print(' ')

    if p < a:
        print(f"Reject null hypothesis: '{null_hypothesis}'")
        print(' ')
        print(f"We now move forward with our alternative hypothesis: '{alt_hypothesis}'")
        print(' ')
        # |corr| >= .6 counts as strong. The bounds are inclusive so that
        # exactly +/-0.6 and +/-1 are classified instead of printing nothing.
        if corr >= .6:
            print("That is a strong positive correlation.")
        elif corr > 0:
            print("This is a weak positive correlation.")
        elif corr <= -.6:
            print("That is a strong negative correlation.")
        elif corr < 0:
            print("This is a weak negative correlation.")
    else:
        print("Fail to reject the null hypothesis.")

    # Each plot gets its own figure; without this the box plot is drawn on
    # top of the distribution plot on the same axes.
    # NOTE(review): sns.distplot is deprecated in seaborn; it is kept because
    # the replacements use a different y-scale by default.
    plt.figure()
    sns.distplot(train.bathrooms, kde=True, color='teal')
    plt.figure()
    sns.boxplot(y='appraised_value', x='bathrooms', data=train, palette='viridis')
# Example #15
# 0
def square_feet_corr():
    '''Test the correlation between square_feet and appraised_value.

    Runs a Pearson test on the train split at a 95% confidence level, prints
    the coefficient, the p-value and the conclusion, then draws a
    distribution plot of square_feet and a joint plot of square_feet against
    appraised_value.

    Returns nothing; prints to stdout and draws the plots.
    '''
    # Take in the dataframe
    df = acquire.acquire_zillow()
    # Prepare the data
    df = prepare.clean_zillow(df)
    # Split the data set
    train, validate, test = prepare.split_focused_zillow(df)

    # Set null and alternative hypothesis, confidence level, and alpha
    null_hypothesis = "There is no correlation between a homes square footage and appraised value."
    alt_hypothesis = "There is a correlation between square feet and appraised value."
    confidence_level = .95
    a = 1 - confidence_level

    corr, p = stats.pearsonr(train.square_feet, train.appraised_value)
    # The messages name the feature actually tested; they previously said
    # "Bathrooms".
    print(' The correlation between Square Feet and the Appraised value is: ', corr)
    print(' The P value between Square Feet and Appraised Value is: ', p)
    print(' ')

    if p < a:
        print(f"Reject null hypothesis: '{null_hypothesis}'")
        print(' ')
        print(f"We now move forward with our alternative hypothesis: '{alt_hypothesis}'")
        print(' ')
        # |corr| >= .6 counts as strong. The bounds are inclusive so that
        # exactly +/-0.6 and +/-1 are classified instead of printing nothing.
        if corr >= .6:
            print("That is a strong positive correlation.")
        elif corr > 0:
            print("This is a weak positive correlation.")
        elif corr <= -.6:
            print("That is a strong negative correlation.")
        elif corr < 0:
            print("This is a weak negative correlation.")
    else:
        print("Fail to reject the null hypothesis.")

    # NOTE(review): sns.distplot is deprecated in seaborn; it is kept because
    # the replacements use a different y-scale by default.
    sns.distplot(train.square_feet, kde=True, color='teal')
    # jointplot
    sns.jointplot(data=train, x='square_feet', y='appraised_value', color='teal')
def add_to_train():
    '''Return the train split with OLS predictions and residual columns.

    Fits the formula 'appraised_value ~ bedrooms' on the train split and adds
    these columns:

    * yhat                  - model prediction, rounded to 2 decimals
    * baseline              - mean of appraised_value
    * residual              - appraised_value - yhat
    * baseline_residual     - appraised_value - baseline
    * residual_sqr          - residual squared
    * baseline_residual_sqr - baseline_residual squared
    '''
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    train, validate, test = prepare.split_focused_zillow(df)

    # Work on an independent copy: the split may be a slice of df, and adding
    # columns to a slice triggers pandas' SettingWithCopyWarning.
    train = train.copy()

    ols_model = ols('appraised_value ~ bedrooms', data=train).fit()

    train['yhat'] = round(ols_model.predict(train), 2)
    train['baseline'] = train.appraised_value.mean()
    train['residual'] = train.appraised_value - train.yhat
    train['baseline_residual'] = train.appraised_value - train.baseline
    train['residual_sqr'] = train.residual**2
    train['baseline_residual_sqr'] = train.baseline_residual**2

    # The SSE/MSE/RMSE figures that used to be computed here were never
    # returned or printed, so they are left to the callers that need them.
    return train
# Example #17
# 0
def tax_rate_dist():
    '''
    Build the dataframe used to calculate the tax rate distribution per county.

    Starts from the uncleaned data, indexed by parcelid, keeps fips plus the
    tax value and tax amount (renamed tax_value / tax_amount), drops rows
    with nulls, adds tax_rate = tax_amount / tax_value and passes the frame
    through remove_outliers for tax_rate and then tax_value.
    '''
    # Uncleaned data on purpose: the cleaned frame has already lost outliers
    # and most columns.
    tax = acquire.acquire_zillow()
    tax.set_index('parcelid', inplace=True)

    # Source column -> name used in the result.
    renamed = {
        'fips': 'fips',
        'taxvaluedollarcnt': 'tax_value',
        'taxamount': 'tax_amount',
    }
    tax = tax[list(renamed)].rename(columns=renamed)
    tax = tax.dropna()

    tax['tax_rate'] = tax['tax_amount'] / tax['tax_value']

    for column in ('tax_rate', 'tax_value'):
        tax = remove_outliers(tax, column, 2.5)

    return tax
# Example #18
# 0
def add_to_train():
    '''Return the train split with OLS predictions and residual columns.

    Fits the formula 'appraised_value ~ bedrooms' on the train split and adds
    these columns:

    * yhat                  - model prediction, rounded to 2 decimals
    * baseline              - mean of appraised_value
    * residual              - appraised_value - yhat
    * baseline_residual     - appraised_value - baseline
    * residual_sqr          - residual squared
    * baseline_residual_sqr - baseline_residual squared
    '''
    # get the dataframe
    df = acquire.acquire_zillow()
    df = prepare.clean_zillow(df)
    df = prepare.focused_zillow(df)
    train, validate, test = prepare.split_focused_zillow(df)

    # Work on an independent copy: the split may be a slice of df, and adding
    # columns to a slice triggers pandas' SettingWithCopyWarning.
    train = train.copy()

    # create the OLS model
    ols_model = ols('appraised_value ~ bedrooms', data=train).fit()

    # make new features
    train['yhat'] = round(ols_model.predict(train), 2)
    train['baseline'] = train.appraised_value.mean()
    train['residual'] = train.appraised_value - train.yhat
    train['baseline_residual'] = train.appraised_value - train.baseline
    train['residual_sqr'] = train.residual**2
    train['baseline_residual_sqr'] = train.baseline_residual**2

    # The SSE/MSE/RMSE figures that used to be computed here were never
    # returned or printed, so they are left to the callers that need them.
    return train
# Example #19
# 0
def all_models_info():
    '''Fit four regressors and print their train and validate RMSE.

    Models, in order: OLS (LinearRegression), LassoLars(alpha=1.0),
    TweedieRegressor(power=1, alpha=0), and LinearRegression on degree-2
    polynomial features. Each model's predictions are stored on y_train and
    y_validate as appraised_value_pred_<lm|lars|glm|lm2>.

    Returns nothing; prints to stdout.
    '''
    # evaluate.xtrain_xval_xtest() takes no arguments, so the frames this
    # function used to acquire/prepare beforehand were never consumed.
    X_train, y_train, X_validate, y_validate, _, _ = (
        evaluate.xtrain_xval_xtest())

    def fit_and_score(model, features_train, features_validate, suffix):
        '''Fit on train, store predictions, return (train, validate) RMSE.'''
        column = f'appraised_value_pred_{suffix}'
        model.fit(features_train, y_train['appraised_value'])
        y_train[column] = model.predict(features_train)
        y_validate[column] = model.predict(features_validate)
        # `** 0.5` is the square root. The earlier `**1 / 2` parsed as
        # `(mse ** 1) / 2` and therefore reported half the MSE, not the RMSE.
        return (
            mean_squared_error(y_train['appraised_value'],
                               y_train[column])**0.5,
            mean_squared_error(y_validate['appraised_value'],
                               y_validate[column])**0.5,
        )

    # OLS Model. `normalize=` was removed from LinearRegression in
    # scikit-learn 1.2; for unpenalised least squares it does not change the
    # fitted predictions, so it is simply dropped.
    rmse_train_lm, rmse_validate_lm = fit_and_score(
        LinearRegression(), X_train, X_validate, 'lm')

    # LARS Model
    rmse_train_lars, rmse_validate_lars = fit_and_score(
        LassoLars(alpha=1.0), X_train, X_validate, 'lars')

    # GLM
    rmse_train_glm, rmse_validate_glm = fit_and_score(
        TweedieRegressor(power=1, alpha=0), X_train, X_validate, 'glm')

    # Polynomial features: fit the expansion on train only, then reuse it.
    pf = PolynomialFeatures(degree=2)
    X_train_degree2 = pf.fit_transform(X_train)
    X_validate_degree2 = pf.transform(X_validate)
    rmse_train_lm2, rmse_validate_lm2 = fit_and_score(
        LinearRegression(), X_train_degree2, X_validate_degree2, 'lm2')

    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ",
          rmse_train_lm, "\nValidation/Out-of-Sample: ", rmse_validate_lm)
    print("--------------------------------------------------------------")
    print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train_lars,
          "\nValidation/Out-of-Sample: ", rmse_validate_lars)
    print("--------------------------------------------------------------")
    print(
        "RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ",
        rmse_train_glm, "\nValidation/Out-of-Sample: ", rmse_validate_glm)
    print("--------------------------------------------------------------")
    print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ",
          rmse_train_lm2, "\nValidation/Out-of-Sample: ", rmse_validate_lm2)
# Example #20
# 0
def hist_ols_appraised_value():
    '''Plot histograms of actual and OLS-predicted appraised_value.

    Fits a LinearRegression on the train split returned by
    evaluate.xtrain_xval_xtest() and draws three stacked panels for the
    validate split: the actual values, the predicted values, and both
    overlaid.

    Returns nothing and does not call plt.show().
    '''
    # evaluate.xtrain_xval_xtest() takes no arguments, so the frames this
    # function used to acquire/prepare beforehand were never consumed.
    X_train, y_train, X_validate, y_validate, _, _ = (
        evaluate.xtrain_xval_xtest())

    # OLS model. `normalize=` was removed from LinearRegression in
    # scikit-learn 1.2; for unpenalised least squares it does not change the
    # fitted predictions, so it is simply dropped.
    lm = LinearRegression()
    lm.fit(X_train, y_train['appraised_value'])

    actual = y_validate['appraised_value']
    y_validate['appraised_value_pred_lm'] = lm.predict(X_validate)
    predicted = y_validate['appraised_value_pred_lm']

    # Set the style before the axes are created so it applies to them.
    sns.set(style="darkgrid")
    # One column of three panels. The earlier 3x5 grid did not match the
    # (3, 1, i) panels drawn afterwards, and the overall title and axis
    # labels were set on an axes that was not one of those panels.
    fig, (top, middle, bottom) = plt.subplots(3,
                                              1,
                                              figsize=(8, 16),
                                              sharey=True)
    fig.suptitle(
        "Comparing the Distribution of appraised_values to Distributions of Predicted appraised_values Linear Regression Models"
    )

    top.hist(actual, color='cyan', alpha=.5, ec='black')
    top.set_title('Actual appraised_values', size=15)

    middle.hist(predicted, color='lawngreen', alpha=.5, ec='black')
    middle.set_title('Model: LinearRegression', size=15)

    bottom.hist(actual,
                color='lawngreen',
                alpha=.5,
                label="Actual Final appraised_values",
                ec='black')
    bottom.hist(predicted,
                color='cyan',
                alpha=.5,
                label="Model: LinearRegression",
                ec='black')
    bottom.set_title("All Graphs Stacked", size=15)
    bottom.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')

    for panel in (top, middle, bottom):
        panel.set_ylabel("appraised_value Count", size=15)
    bottom.set_xlabel("appraised_value", size=15)