Пример #1
0
def wrangle():
    """Acquire the zillow data and return a cleaned DataFrame indexed by parcelid.

    Relies on the module-level names ``acquire``, ``prep``, ``query`` and
    ``db`` being available.

    Steps, in order:
      1. Keep one row per ``parcelid`` (the one with the latest
         ``transactiondate``), then keep only rows whose ``transactiondate``
         string starts with "2017".
      2. Drop the redundant ``*typeid`` / ``id`` columns.
      3. Keep rows described as "Single Family Residential" and exclude unit
         counts of 2 or 3.
      4. Apply ``prep.handle_missing_values`` with 0.5 / 0.5 proportions.
      5. Drop further low-information columns, then any row that still has a
         missing value.
      6. Use ``parcelid`` as the index.

    Returns:
        The cleaned DataFrame.
    """
    # get zillow data
    df = acquire.get_data(query, db)

    # Sorting newest-first means drop_duplicates (which keeps the first
    # occurrence) retains the most recent transaction for each parcel.
    df = df.sort_values("transactiondate", ascending=False).drop_duplicates("parcelid")
    # keep only 2017 values (transactiondate is accessed as a string column)
    df = df[df.transactiondate.str.startswith("2017")]

    # remove all the duplicate id columns
    # (non-inplace drop: df is a filtered slice, so mutating it in place is
    # what triggers pandas' chained-assignment warnings)
    df = df.drop(columns=[
        "typeconstructiontypeid",
        "storytypeid",
        "propertylandusetypeid",
        "heatingorsystemtypeid",
        "buildingclasstypeid",
        "architecturalstyletypeid",
        "airconditioningtypeid",
        "id",
    ])

    # keep single family homes and remove unit counts of 2 or 3
    # NOTE(review): only 2 and 3 are excluded; rows with a missing unitcnt or
    # any other count pass this filter -- confirm that is intended.
    df = df[df.propertylandusedesc == "Single Family Residential"]
    df = df[(df.unitcnt != 2) & (df.unitcnt != 3)]

    # Drop sparse columns/rows using 0.5 / 0.5 proportions (presumably the
    # minimum non-null share required per column and per row -- confirm
    # against prep.handle_missing_values).
    # The helper's result was previously discarded, which silently skips this
    # step if the helper returns a new frame instead of mutating its argument.
    # The None guard keeps this working if the helper is purely in-place.
    cleaned = prep.handle_missing_values(df, .5, .5)
    if cleaned is not None:
        df = cleaned

    # remove the following columns with not enough info or duplicate info
    df = df.drop(columns=[
        "finishedsquarefeet12",
        "buildingqualitytypeid",
        "fullbathcnt",
        "propertyzoningdesc",
        "unitcnt",
        "heatingorsystemdesc",
        "assessmentyear",
        "regionidcounty",
        "rawcensustractandblock",
        "calculatedbathnbr",
        "propertycountylandusecode",
    ])

    # remove remaining rows with blanks
    df = df.dropna()

    # set index as parcelid
    df = df.set_index("parcelid")

    return df
def main():
    """
    Script entry point.

    Fetches the data through ``ac.get_data()``, hands it to ``prep_data``
    (which yields two values; only the first is used here), then prints a
    completion message followed by the first ten rows of the first value.
    """
    raw = ac.get_data()
    prepared, _ = prep_data(raw)

    print('Done and doner.')
    preview = prepared.head(10)
    print(preview)
Пример #3
0
def wrangle_data():
    '''Build the prepared zillow DataFrame; takes no arguments.

    Runs the acquired data through drop_columns, impute_median and
    create_new_features (in that order), keeps only rows with more than zero
    bathrooms and more than zero bedrooms, and finally applies
    handle_outliers.
    '''
    frame = get_data()
    for step in (drop_columns, impute_median, create_new_features):
        frame = step(frame)

    # Rows must have at least one bathroom and at least one bedroom.
    has_bath = frame.bathroomcnt > 0
    has_bed = frame.bedroomcnt > 0
    frame = frame[has_bath & has_bed]

    return handle_outliers(frame)
Пример #4
0
def acquire_and_prep_data():
    """Acquire the zillow data and return a cleaned copy.

    Relies on the module-level ``acquire`` module.

    Cleaning steps:
      * drop exact duplicate rows and any row containing a missing value;
      * drop the ``fips`` and ``roomcnt`` columns;
      * cast the count / year / value columns to ``int``;
      * rename ``calculatedfinishedsquarefeet`` -> ``squarefeet`` and
        ``Name`` -> ``County``;
      * divide ``latitude`` and ``longitude`` by 1,000,000;
      * keep only rows with ``bedroomcnt > 0``.

    Returns:
        The cleaned DataFrame.
    """
    zillow = acquire.get_data()

    zillow = zillow.drop_duplicates().dropna()
    zillow = zillow.drop(columns=['fips', 'roomcnt'])

    # Cast in one call with a column -> dtype mapping rather than assigning
    # through attribute access, which pandas discourages for setting columns.
    int_columns = [
        'bedroomcnt',
        'calculatedfinishedsquarefeet',
        'fullbathcnt',
        'yearbuilt',
        'taxvaluedollarcnt',
    ]
    zillow = zillow.astype({column: 'int' for column in int_columns})

    zillow = zillow.rename(columns={'calculatedfinishedsquarefeet': 'squarefeet', 'Name': 'County'})

    # Scale the coordinates down by 1e6 (presumably stored as degrees * 1e6
    # -- confirm against the data source).
    zillow['latitude'] = zillow['latitude'] / 1000000
    zillow['longitude'] = zillow['longitude'] / 1000000

    # The original applied this exact filter twice; the second pass was a
    # no-op and has been removed.
    # NOTE(review): the duplicate may have been meant as a bathroom filter
    # (e.g. fullbathcnt > 0) -- confirm the intent before adding one.
    zillow = zillow[zillow.bedroomcnt > 0]

    return zillow
Пример #5
0
from acquire import get_data
from prep import prep_data

# Pull the raw data (from a .csv or a MySQL query)
raw = get_data()

# Hand the raw data to prep_data (null removal, per the original note)
df = prep_data(raw)

# Milestones before Friday:
# 2. Scale
# 3. Super basic Model

df.info()
df.describe()

# First pass for outlier detection:
# - Do the value counts and distribution make sense?
# - Is there anything way out of line here?
df["bedrooms"].value_counts()  # encode as discrete
df["bathrooms"].value_counts()  # encode as discrete
df["sqft"].value_counts()  # can bin or scale
df["taxvalue"].value_counts()  # scale this (also our target variable)
Пример #6
0
def get_mall():
    """Return whatever acquire.get_data yields for the customers query.

    Passes the SQL text and the "mall_customers" name straight through to
    ``acquire.get_data`` as its two positional arguments.
    """
    sql = "SELECT * FROM customers"
    database = "mall_customers"
    return acquire.get_data(sql, database)
Пример #7
0
import acquire
import pandas as pd

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning from this point on ("ignore" is installed as a
# catch-all filter, so nothing is shown regardless of category).
warnings.filterwarnings("ignore")

# Module-level load: acquire.get_data() runs as soon as this file is
# executed or imported, and its result is bound to the global name df.
df = acquire.get_data()


def get_single_unit(df):
    '''Return the rows of df whose propertylandusetypeid is 260, 261, 262 or 279.

    Columns with a repeated name are removed first (the first occurrence of
    each name is kept), then the rows are filtered on the type id. The input
    dataframe is not modified; a new dataframe is returned.

    NOTE(review): the original description calls these codes single family
    residential properties -- confirm against the land-use lookup table.
    '''
    single_unit_type_ids = [260, 261, 262, 279]

    # Boolean mask over the columns: True for the first occurrence of a name.
    first_occurrence = ~df.columns.duplicated()
    deduped = df.loc[:, first_occurrence]

    is_single_unit = deduped.propertylandusetypeid.isin(single_unit_type_ids)
    return deduped[is_single_unit]


def handle_missing_values(df, column_prop, row_prop):
    '''Drop sparse columns, then sparse rows, from df in place and return df.

    column_prop: share of the rows (0-1) that must be non-NA for a column to
                 be kept.
    row_prop:    share of the remaining columns (0-1) that must be non-NA for
                 a row to be kept.

    The row threshold is computed after the column pass, so it is based on
    the number of columns that survived. The passed-in dataframe itself is
    modified and is also the return value.
    '''
    # Column pass: minimum count of non-NA values a column needs.
    row_count = len(df)
    min_per_column = int(round(column_prop * row_count))
    df.dropna(axis="columns", thresh=min_per_column, inplace=True)

    # Row pass: minimum count of non-NA values a row needs.
    column_count = len(df.columns)
    min_per_row = int(round(row_prop * column_count))
    df.dropna(axis="index", thresh=min_per_row, inplace=True)

    return df