示例#1
0
def clean_zillow(cached=True):
    '''This function acquires and prepares the zillow data from a local csv, default. Passing cached=False acquires fresh data from sql and writes to csv.'''
    # use my aquire function to read data into a df from a csv file
    df = get_zillow_data()
    # drop duplicates
    df.drop_duplicates(inplace=True)
    # drop duplicate columns and remove columns with more than 50% nulls
    df = df.drop(columns=[
        'unitcnt', 'propertylandusedesc', 'heatingorsystemdesc',
        'propertyzoningdesc', 'heatingorsystemtypeid', 'taxdelinquencyflag',
        'taxdelinquencyyear', 'yardbuildingsqft17', 'finishedsquarefeet50',
        'finishedfloor1squarefeet', 'fireplacecnt', 'threequarterbathnbr',
        'pooltypeid7', 'poolcnt', 'numberofstories', 'airconditioningdesc',
        'garagetotalsqft', 'garagecarcnt', 'regionidneighborhood',
        'hashottuborspa', 'pooltypeid2', 'poolsizesum', 'pooltypeid10',
        'typeconstructiontypeid', 'typeconstructiondesc',
        'architecturalstyledesc', 'finishedsquarefeet6', 'fireplaceflag',
        'yardbuildingsqft26', 'finishedsquarefeet13', 'storytypeid',
        'storydesc', 'basementsqft', 'finishedsquarefeet15',
        'buildingclassdesc', 'architecturalstyletypeid',
        'airconditioningtypeid', 'buildingclasstypeid',
        'buildingqualitytypeid', 'decktypeid', 'architecturalstyletypeid.1',
        'airconditioningtypeid.1', 'heatingorsystemtypeid.1',
        'propertylandusetypeid.1', 'buildingclasstypeid.1', 'storytypeid.1',
        'typeconstructiontypeid.1', 'id.1', 'Unnamed: 0', 'calculatedbathnbr',
        'fips', 'latitude', 'longitude', 'regionidcounty', 'roomcnt',
        'yearbuilt', 'assessmentyear', 'propertycountylandusecode',
        'propertylandusetypeid', 'parcelid.2', 'parcelid.1'
    ])
    #removing columns
    df.replace(',', '', regex=True, inplace=True)
    #handling nan's
    df = df.fillna(df.mean())
    return df
def zillow_clean():

    df = a.get_zillow_data()

    df = handle_missing_values(df,
                               prop_required_column=.5,
                               prop_required_row=.75)

    cols_to_remove = [
        'buildingqualitytypeid', 'heatingorsystemtypeid', 'propertyzoningdesc',
        'censustractandblock'
    ]
    df = remove_columns(df, cols_to_remove)

    columns = [
        'calculatedbathnbr', 'fullbathcnt', 'regionidcity', 'regionidzip',
        'unitcnt', 'yearbuilt'
    ]
    strategy = 'most_frequent'
    df = imputer_9000(df, columns, strategy)

    columns = [
        'calculatedbathnbr', 'calculatedfinishedsquarefeet',
        'finishedsquarefeet12', 'lotsizesquarefeet',
        'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'taxamount',
        'landtaxvaluedollarcnt'
    ]
    strategy = 'most_frequent'
    df = imputer_9000(df, columns, strategy)

    return df
示例#3
0
def prep_zillow_data():
    df = acquire.get_zillow_data()
    df = df.drop(columns = {'airconditioningtypeid', 'architecturalstyletypeid', 'basementsqft',
                            'taxdelinquencyyear', 'buildingclasstypeid', 'decktypeid', 'finishedfloor1squarefeet',
                            'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50',
                            'finishedsquarefeet6', 'fireplacecnt', 'hashottuborspa', 'poolsizesum',
                            'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'threequarterbathnbr',
                            'typeconstructiontypeid', 'yardbuildingsqft17', 'yardbuildingsqft26', 'numberofstories',
                            'fireplaceflag', 'taxdelinquencyflag', 'buildingqualitytypeid', 'garagecarcnt',
                            'garagetotalsqft', 'poolcnt', 'regionidneighborhood', 'propertyzoningdesc',
                            'propertycountylandusecode', 'id', 'parcelid', 'transactiondate', 'finishedsquarefeet12',
                            'censustractandblock', 'logerror'})
    df['propertylandusetypeid'] = df['propertylandusetypeid'].astype(float)
    df['calculatedfinishedsquarefeet'] = df['calculatedfinishedsquarefeet'].fillna((df['calculatedfinishedsquarefeet'].mean()))
    df['calculatedbathnbr'] = df['calculatedbathnbr'].fillna((df['calculatedbathnbr'].mean()))
    df['fullbathcnt'] = df['fullbathcnt'].fillna((df['fullbathcnt'].mean()))
    df['heatingorsystemtypeid'] = df['heatingorsystemtypeid'].fillna((df['heatingorsystemtypeid'].mean()))
    df['lotsizesquarefeet'] = df['lotsizesquarefeet'].fillna((df['lotsizesquarefeet'].mean()))
    df['regionidcity'] = df['regionidcity'].fillna((df['regionidcity'].mean()))
    df['regionidzip'] = df['regionidzip'].fillna((df['regionidzip'].mean()))
    df['unitcnt'] = df['unitcnt'].fillna((df['unitcnt'].mean()))
    df['yearbuilt'] = df['yearbuilt'].fillna((df['yearbuilt'].mean()))
    df['calculatedfinishedsquarefeet'] = df.calculatedfinishedsquarefeet[(np.abs(stats.zscore(df.calculatedfinishedsquarefeet)) < 3)]
    df = df.dropna()
    return df
def wrangle_zillow_data():
    df = acquire.get_zillow_data()
    df = prepare.zillow_single_unit(df)
    df = prepare.remove_columns(df,['calculatedbathnbr','finishedsquarefeet12',\
        'fullbathcnt','propertycountylandusecode','unitcnt','structuretaxvaluedollarcnt',\
        'landtaxvaluedollarcnt','assessmentyear','propertyzoningdesc'])
    df = prepare.handle_missing_values(df)
    df.dropna(inplace=True)
    return df
示例#5
0
def wrangle_zillow_data():
    df = acquire.get_zillow_data()
    df = prepare.zillow_single_unit(df)
    df = prepare.remove_columns(df, [
        'finishedsquarefeet12', 'fullbathcnt', 'unitcnt',
        'structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt',
        'assessmentyear', 'propertyzoningdesc'
    ])
    df = prepare.handle_missing_values(df)
    return df
def final_prep():
    """
    Accepts DF. Performs all changes outlined in prep phase and returns scaled and unscaled versions of 
    train, validate, and test samples (6 in total).
    """
    # Acquiring data
    df = get_zillow_data()

    # Preparing data with changes outlined in prepare section of notebook
    drop_missing_columns(df)
    drop_selected_columns(df)
    df.dropna(inplace=True)
    df = d_type_convert(df)
    df = zillow_dummy(df)
    df = column_sort_rename(df)
    train, validate, test = split_data(df)
    train_scaled, validate_scaled, test_scaled = data_scaler(
        train, validate, test)

    # returning DFs
    return train, validate, test, train_scaled, validate_scaled, test_scaled
示例#7
0
def clean_zillow():
    #get the zillow data
    df = get_zillow_data()
    #handle the nan's
    df['bedroomcnt'].fillna(df['bedroomcnt'].mode()[0], inplace=True)
    df['bathroomcnt'].fillna(df['bathroomcnt'].mode()[0], inplace=True)
    df['calculatedfinishedsquarefeet'].fillna(
        df['calculatedfinishedsquarefeet'].mode()[0], inplace=True)
    df['fips'].fillna(df['fips'].mode()[0], inplace=True)
    df['taxamount'].fillna(df['taxamount'].mode()[0], inplace=True)
    df['taxvaluedollarcnt'].fillna(df['taxvaluedollarcnt'].mode()[0],
                                   inplace=True)
    df['lotsizesquarefeet'].fillna(df['lotsizesquarefeet'].mode()[0],
                                   inplace=True)
    # make new column for county names
    rating = []
    for row in df['fips']:
        if row == 6037.0: rating.append('los_angeles_county')
        elif row == 6059.0: rating.append('orange_county')
        elif row == 6111.0: rating.append('ventura_county')
        else: rating.append('no_county')
    df['county'] = rating
    #rename columns to make it easier to call later
    df = df.rename(
        columns={
            "bedroomcnt": "bedroom_count",
            "bathroomcnt": "bathroom_count",
            "lotsizesquarefeet": "lot_size",
            "calculatedfinishedsquarefeet": "square_feet",
            "taxamount": "tax_amount",
            "taxvaluedollarcnt": "tax_value"
        })
    # drop unneeded columns
    df = df.drop(columns=["Unnamed: 0", "parcelid", 'latitude', 'longitude'])
    # return the clean dataframe
    return df
示例#8
0
def prep_zillow_data():
    df = acquire.get_zillow_data()
    df = drop_nulls(df, prop_required_column=.70, prop_required_row=.70)
    df['finishedsquarefeet12'] = df['finishedsquarefeet12'].fillna(
        (df['finishedsquarefeet12'].mean()))
    df['lotsizesquarefeet'] = df['lotsizesquarefeet'].fillna(
        (df['lotsizesquarefeet'].mean()))
    df['regionidcity'] = df['regionidcity'].fillna((df['regionidcity'].mean()))
    df['fullbathcnt'] = df['fullbathcnt'].fillna((df['fullbathcnt'].mean()))
    df['calculatedbathnbr'] = df['calculatedbathnbr'].fillna(
        (df['calculatedbathnbr'].mean()))
    df['yearbuilt'] = df['yearbuilt'].fillna((df['yearbuilt'].mean()))
    df['censustractandblock'] = df['censustractandblock'].fillna(
        (df['censustractandblock'].mean()))
    df['calculatedfinishedsquarefeet'] = df[
        'calculatedfinishedsquarefeet'].fillna(
            (df['calculatedfinishedsquarefeet'].mean()))
    df['structuretaxvaluedollarcnt'] = df['structuretaxvaluedollarcnt'].fillna(
        (df['structuretaxvaluedollarcnt'].mean()))
    df['regionidzip'] = df['regionidzip'].fillna((df['regionidzip'].mean()))
    df = df.dropna()
    df = df.drop(columns=['pid', 'id'])
    county_dummies = pd.get_dummies(df['fips'])
    df = pd.concat((df, county_dummies), axis=1)
    df = df.rename(
        columns={
            6059.0: 'is_orange_county',
            6037.0: 'is_la_county',
            6111.0: 'is_ventura_county'
        })
    df = df[df.bedroomcnt < 8]
    df = df[df.bedroomcnt > 0]
    df = df[df.bathroomcnt > 0]
    df = df[df.bathroomcnt < 6]
    df.to_csv('prep_zillow.csv')
    return df
示例#9
0
def prep_zillow_data(target):
    """
    function takes in name of targeted variable column (taxvaluedollarcnt), acquires zillow house data, prepares it via removing null values, renaming columns, scaling data, and returning data split into all of the data frames needed to progress through project notebook.
    """
    # acquire zillow data
    df = get_zillow_data()
    # filter columns down to the target variable (taxvaluedollarcnt) and the final four columns from prep
    df = df[[
        'calculatedfinishedsquarefeet', 'bedroomcnt', 'bathroomcnt',
        'lotsizesquarefeet', 'taxvaluedollarcnt'
    ]]

    # drop na values
    df.dropna(inplace=True)

    # dropping extreme outliers
    df.drop(df[df['lotsizesquarefeet'] > 3000000].index, inplace=True)
    df.drop(df[df['calculatedfinishedsquarefeet'] > 35000].index, inplace=True)
    df.drop(df[df['taxvaluedollarcnt'] > 20000000].index, inplace=True)

    # splitting data
    train_validate, test = train_test_split(df, test_size=.2, random_state=123)
    train, validate = train_test_split(train_validate,
                                       test_size=.3,
                                       random_state=123)

    # specifying which columns to keep in outputted dataframe
    # x = features | y = target variable
    X_train = train.drop(columns=[target])
    y_train = train[[target]]

    X_validate = validate.drop(columns=[target])
    y_validate = validate[[target]]

    X_test = test.drop(columns=[target])
    y_test = test[[target]]

    # importing scaler
    scaler = sklearn.preprocessing.MinMaxScaler()

    # fitting scaler
    scaler.fit(X_train)

    # scaling data in dataframes
    X_train_scaled = pd.DataFrame(scaler.transform(X_train))
    X_validate_scaled = pd.DataFrame(scaler.transform(X_validate))
    X_test_scaled = pd.DataFrame(scaler.transform(X_test))

    # renaming data frame columns
    train.rename(columns={
        'taxvaluedollarcnt': 'tax_dollar_value',
        'calculatedfinishedsquarefeet': 'unit_sq_feet',
        'bedroomcnt': 'bedroom_count',
        'bathroomcnt': 'bathroom_count',
        'lotsizesquarefeet': 'lot_size_sq_feet'
    },
                 inplace=True)

    X_train_scaled.rename(columns={
        0: 'unit_sq_feet',
        1: 'bedroom_count',
        2: 'bathroom_count',
        3: 'lot_size_sq_feet'
    },
                          inplace=True)
    X_validate_scaled.rename(columns={
        0: 'unit_sq_feet',
        1: 'bedroom_count',
        2: 'bathroom_count',
        3: 'lot_size_sq_feet'
    },
                             inplace=True)
    X_test_scaled.rename(columns={
        0: 'unit_sq_feet',
        1: 'bedroom_count',
        2: 'bathroom_count',
        3: 'lot_size_sq_feet'
    },
                         inplace=True)

    X_train.rename(columns={
        'calculatedfinishedsquarefeet': 'unit_sq_feet',
        'bedroomcnt': 'bedroom_count',
        'bathroomcnt': 'bathroom_count',
        'lotsizesquarefeet': 'lot_size_sq_feet'
    },
                   inplace=True)
    X_validate.rename(columns={
        'calculatedfinishedsquarefeet': 'unit_sq_feet',
        'bedroomcnt': 'bedroom_count',
        'bathroomcnt': 'bathroom_count',
        'lotsizesquarefeet': 'lot_size_sq_feet'
    },
                      inplace=True)
    X_test.rename(columns={
        'calculatedfinishedsquarefeet': 'unit_sq_feet',
        'bedroomcnt': 'bedroom_count',
        'bathroomcnt': 'bathroom_count',
        'lotsizesquarefeet': 'lot_size_sq_feet'
    },
                  inplace=True)

    y_train.rename(columns={'taxvaluedollarcnt': 'tax_dollar_value'},
                   inplace=True)
    y_validate.rename(columns={'taxvaluedollarcnt': 'tax_dollar_value'},
                      inplace=True)
    y_test.rename(columns={'taxvaluedollarcnt': 'tax_dollar_value'},
                  inplace=True)

    # returning data frames
    return train, y_train, y_validate, y_test, X_train_scaled, X_train, X_validate, X_test, X_validate_scaled, X_test_scaled
示例#10
0
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import env

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

from acquire import get_zillow_data

# In[109]:

df = get_zillow_data()
df.head()

# In[110]:


# Let's figure out how much data is missing where
def nulls_by_col(df):
    num_missing = df.isnull().sum()
    rows = df.shape[0]
    pct_missing = num_missing / rows
    cols_missing = pd.DataFrame({
        'number_missing_rows': num_missing,
        'percent_rows_missing': pct_missing
    })
    return cols_missing
示例#11
0
def prep_zillow_data():
    df = acquire.get_zillow_data()
    df = missing_rows(df)
    df = missing_cols(df)
    df = drop_nulls(df, prop_required_column=.70, prop_required_row=.70)
    return df
示例#12
0
def prepare_zillow():
    '''
    Signature: prepare_zillow() -> pandas.core.frame.DataFrame
    Docstring:
        Prepare the zillow dataset for data EDA
        Return DataFrame of zillow dataset
    '''
    df = get_zillow_data()

    df.storydesc.fillna(0, inplace=True)
    df.storydesc.replace('Basement', 1, inplace=True)
    df['has_basement'] = df.storydesc

    df.hashottuborspa.fillna(0, inplace=True)
    df['has_hottub_or_spa'] = df.hashottuborspa

    df.poolcnt.fillna(0, inplace=True)
    df['has_pool'] = df.poolcnt

    df.poolsizesum.fillna(0, inplace=True)
    df['pool_area_sqft'] = df.poolsizesum

    df['has_patio'] = df.yardbuildingsqft17.notnull().astype(np.int)
    df['patio_area_sqft'] = df.yardbuildingsqft17.fillna(0)

    df['has_shed'] = df.yardbuildingsqft26.notnull().astype(np.int)
    df['basement_area_sqft'] = df.yardbuildingsqft26.fillna(0)

    features_to_drop = [
        'decktypeid',
        'buildingclasstypeid',
        'buildingqualitytypeid',
        'finishedsquarefeet6',
        'finishedsquarefeet13',
        'finishedsquarefeet15',
        'buildingclassdesc',
        'pooltypeid2',
        'pooltypeid7',
        'pooltypeid10',
        'regionidcity',
        'regionidcounty',
        'regionidzip',
        'typeconstructiontypeid',
        'typeconstructiondesc',
        'architecturalstyletypeid',
        'architecturalstyledesc',
        'storytypeid',
        'storydesc',
        'hashottuborspa',
        'poolcnt',
        'poolsizesum',
        'yardbuildingsqft17',
        'yardbuildingsqft26',
        'taxdelinquencyyear',
        'taxdelinquencyflag',
        'finishedsquarefeet50',
        'finishedfloor1squarefeet',
        'censustractandblock',
        'rawcensustractandblock',
        'propertylandusetypeid',
        'id',
        'assessmentyear',
        'finishedsquarefeet12',
        'bathroomcnt',
        'fullbathcnt',
        'basementsqft',
        'threequarterbathnbr',
        'propertylandusedesc',
        'propertycountylandusecode',
        'regionidcounty',
    ]

    df.drop(columns=features_to_drop, inplace=True)

    df.rename(columns={
        'lotsizesquarefeet': 'lot_size_sqft',
        'calculatedfinishedsquarefeet': 'livable_area_sqft',
        'structuretaxvaluedollarcnt': 'structure_tax',
        'taxvaluedollarcnt': 'taxable_value',
        'landtaxvaluedollarcnt': 'land_tax',
        'taxamount': 'property_tax',
        'lasttransactiondate': 'date_sold',
        'roomcnt': 'num_of_rooms',
        'yearbuilt': 'year_built',
        'bedroomcnt': 'num_of_bedrooms',
        'calculatedbathnbr': 'num_of_restrooms'
    },
              inplace=True)

    df = df[[
        'parcelid', 'num_of_bedrooms', 'num_of_restrooms', 'livable_area_sqft',
        'lot_size_sqft', 'year_built', 'has_basement', 'has_hottub_or_spa',
        'has_pool', 'pool_area_sqft', 'has_patio', 'patio_area_sqft',
        'has_shed', 'basement_area_sqft', 'property_tax', 'structure_tax',
        'land_tax', 'taxable_value', 'date_sold', 'fips', 'latitude',
        'longitude', 'logerror'
    ]]

    df.fips = df.fips.astype(np.int)
    df.has_hottub_or_spa = df.has_hottub_or_spa.astype(np.int)
    df.has_pool = df.has_pool.astype(np.int)
    df.num_of_bedrooms = df.num_of_bedrooms.replace(0, np.nan)
    df.latitude = df.latitude / 1_000_000
    df.longitude = df.longitude / 1_000_000

    return df
示例#13
0
# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# default pandas decimal number display format.
pd.options.display.float_format = '{:20,.2f}'.format

import acquire

df = acquire.get_zillow_data()


def nulls_by_col(df):
    # Look at the number missing.
    num_missing = df.isnull().sum()

    # number of rows
    rows = df.shape[0]

    # percent_missing
    pct_missing = num_missing / rows

    # Create dataframe of column name, num_rows missing, and pct of rows missing.
    cols_missing = pd.DataFrame({
        'num_rows': num_missing,