Example #1
def get_iris_data():
    url = env.get_db_url('iris_db')
    query = '''
    SELECT * FROM measurements
    JOIN species USING (species_id)
    '''
    return pd.read_sql(query, url)
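Every example in this collection assumes a local env.py (kept out of version control) that exposes get_db_url. A minimal sketch of that helper, assuming host, user, and password are credentials defined in env.py:

# env.py -- a minimal sketch; host, user, and password are assumed local settings
host = 'some.db.host'
user = 'some_user'
password = 'some_password'

def get_db_url(db_name):
    # pymysql driver assumed; any SQLAlchemy-style URL accepted by pd.read_sql works
    return f'mysql+pymysql://{user}:{password}@{host}/{db_name}'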
def get_zillow_data():
    query = '''
    select * 
    from `predictions_2017`
    left join properties_2017
    using(`parcelid`)
    left join `airconditioningtype`
    using (`airconditioningtypeid`)
    left join `architecturalstyletype` as arch
    using(`architecturalstyletypeid`)
    left join `buildingclasstype`
    using(`buildingclasstypeid`)
    left join `heatingorsystemtype`
    using(`heatingorsystemtypeid`)
    left join `propertylandusetype`
    using(`propertylandusetypeid`)
    left join `storytype`
    using(`storytypeid`)
    left join `typeconstructiontype`
    using(`typeconstructiontypeid`)
    where (`latitude` is not null) and (`longitude` is not null)
    '''
    df = pd.read_sql(query, get_db_url('zillow'))

    df = df.sort_values(by=['transactiondate'],
                        axis=0).drop_duplicates(keep='last', subset='parcelid')

    df.drop('id', axis=1, inplace=True)

    return df
def get_titanic_data():
    sql = """
    SELECT *
    FROM passengers
    """
    url = get_db_url('titanic_db')
    return pd.read_sql(sql, url)
Example #4
def get_data_from_mysql():
    query = """
    SELECT customer_id, monthly_charges, tenure, total_charges FROM customers
    JOIN contract_types USING (contract_type_id)
    WHERE contract_type_id = 3;
    """
    df = pd.read_sql(query, get_db_url("telco_churn"))
    return df
def get_iris_data():
    sql = """
    SELECT * 
    FROM measurements
    JOIN species USING (species_id)
    """
    url = get_db_url('iris_db')
    return pd.read_sql(sql, url)
def get_data_from_sql():
    sql = """
    SELECT customer_id, monthly_charges, tenure, total_charges
    FROM customers
    WHERE contract_type_id = 3
    """
    url = get_db_url('telco_churn')
    tc_df = pd.read_sql(sql, url)
    return tc_df
def get_iris_data():
    query = '''
    select * 
    from measurements
    join species 
    using(`species_id`)
    '''
    df = pd.read_sql(query, get_db_url('iris_db'))
    return df
Example #8
def wrangle_telco():
    url = get_db_url('telco_churn')
    query = ('''
        SELECT customer_id, monthly_charges, tenure, total_charges
        FROM customers
        WHERE contract_type_id = 3
    ''')
    df = pd.read_sql(query, url)

    df.total_charges = df.total_charges.str.strip()
    df.total_charges = df.total_charges.replace('', 0).astype(float)
    return df
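total_charges comes back from MySQL as text, and customers with zero tenure carry a blank, whitespace-only value, which is why astype(float) fails without the cleanup above. A toy illustration, with made-up values:

s = pd.Series(['29.85', ' ', '1889.5'])
s.str.strip().replace('', 0).astype(float)
# -> 29.85, 0.0, 1889.5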
def wrangle_telco():
    query = """SELECT customer_id,
    tenure,
    monthly_charges,
    total_charges
    FROM customers
    WHERE contract_type_id = 3;"""
    url = get_db_url("telco_churn")
    telco = pd.read_sql(query, url)
    telco.total_charges = telco.total_charges.str.strip()
    telco = telco.replace("", np.nan)
    telco = telco.dropna()
    telco.total_charges = telco.total_charges.astype("float")
    return telco
Example #10
def get_zillow_data():
    url = env.get_db_url('zillow')
    query = '''
    SELECT * FROM predictions_2017
    LEFT JOIN properties_2017 USING (parcelid)
    LEFT JOIN airconditioningtype USING (airconditioningtypeid)
    LEFT JOIN architecturalstyletype USING (architecturalstyletypeid)
    LEFT JOIN buildingclasstype USING (buildingclasstypeid)
    LEFT JOIN heatingorsystemtype USING (heatingorsystemtypeid)
    LEFT JOIN propertylandusetype USING (propertylandusetypeid)
    LEFT JOIN storytype USING (storytypeid)
    LEFT JOIN typeconstructiontype USING (typeconstructiontypeid)
    WHERE (latitude IS NOT NULL AND 
            longitude IS NOT NULL)'''
    df = pd.read_sql(query, url)

    # keep only the most recent transaction for each parcel
    df = df[df.groupby('parcelid').transactiondate.transform('max') == df.transactiondate]
    df.drop(columns='id', inplace=True)
    return df
Example #11
def get_titanic_data(cached=False):
    '''
    Return the titanic data as a pandas DataFrame. If cached is True and
    titanic_df.csv exists on disk, read the csv into a dataframe and return it.
    Otherwise, read the passengers table from the database into a dataframe,
    cache it as titanic_df.csv, and return the dataframe.
    '''
    #If the cached parameter is false, or the csv file is not on disk, read from the database into a dataframe
    if not cached or not os.path.isfile('titanic_df.csv'):
        query = '''
        SELECT * 
        FROM passengers;
        '''
        titanic_df = pd.read_sql(query, get_db_url('titanic_db'))
        #also cache the data we read from the db, to a file on disk
        titanic_df.to_csv('titanic_df.csv')
    else:
        #cached was True and the csv exists on disk; read it instead of going to the database
        titanic_df = pd.read_csv('titanic_df.csv', index_col=0)

    #return our dataframe regardless of its origin
    return titanic_df
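A possible usage sketch of the caching behavior (the first call hits MySQL and writes the csv; later calls can read from disk):

df = get_titanic_data()             # queries the database, writes titanic_df.csv
df = get_titanic_data(cached=True)  # reads titanic_df.csv instead of the database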
Example #12
def get_iris_data(cached=False):
    '''
    Return the iris data as a pandas DataFrame. If cached is True and
    iris_df.csv exists on disk, read the csv into a pandas df and return it.
    Otherwise, read the data from the Codeup db into a df, cache it as
    iris_df.csv, and return it to the caller.
    '''

    # read from the Codeup db into a df if cached is False or the csv is not on disk
    if not cached or not os.path.isfile('iris_df.csv'):
        query = '''
        SELECT * 
        FROM measurements
        JOIN species USING (species_id);
        '''
        iris_df = pd.read_sql(query, get_db_url('iris_db'))
        # cache it as a csv file
        iris_df.to_csv('iris_df.csv')

    else:  # cached is True and the csv exists on disk; read it into a pandas df
        iris_df = pd.read_csv('iris_df.csv', index_col=0)
    # return the iris df regardless of origin
    return iris_df
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import split_scale
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import wrangle
import env
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

url = env.get_db_url('zillow')


def prep_predictions():
    df = pd.read_sql("""

SELECT *
FROM predictions_2017
""", url)
    df['transactiondate'] = pd.to_datetime(df['transactiondate'])
    df = df[df.groupby('parcelid')['transactiondate'].transform('max') ==
            df['transactiondate']]

    return df
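The groupby/transform('max') comparison above keeps each row whose transactiondate equals the latest date for its parcelid. A minimal sketch of the pattern on made-up data:

toy = pd.DataFrame({
    'parcelid': [1, 1, 2],
    'transactiondate': pd.to_datetime(['2017-01-01', '2017-06-01', '2017-03-15']),
})
toy[toy.groupby('parcelid')['transactiondate'].transform('max') == toy['transactiondate']]
# parcel 1 keeps only its 2017-06-01 row; parcel 2 keeps its single row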

Example #14
#wrangle data
import warnings
warnings.filterwarnings('ignore')

from env import get_db_url
import pandas as pd
import numpy as np

# ### 1) Acquire customer_id, monthly_charges, tenure, and total_charges from the telco_churn database for all customers with a 2-year contract.

url = get_db_url('telco_churn')
df = pd.read_sql(
    '''
SELECT customer_id, monthly_charges, tenure, total_charges
FROM customers
WHERE contract_type_id = 3
''', url)

df.head(5)

df.shape
def wrangle_grades():
    grades = pd.read_csv("student_grades.csv")
    grades.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    df = grades.dropna().astype('int')
    df.drop(columns="student_id", inplace=True)
    return df
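The r'^\s*$' pattern above turns empty or whitespace-only strings into NaN so that dropna can remove them before the cast to int. A toy illustration, with made-up values:

s = pd.Series(['90', ' ', '', '85'])
s.replace(r'^\s*$', np.nan, regex=True)
# -> '90', NaN, NaN, '85'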


telco_query = """
SELECT c.customer_id, c.monthly_charges, c.tenure, c.total_charges
FROM customers AS c
JOIN contract_types AS ct USING(contract_type_id)
WHERE ct.contract_type = 'Two year';
"""

telco_url = get_db_url("telco_churn")


def wrangle_telco():
    """
    This function does the following:
        1. Queries data from the telco_churn database into a pandas DataFrame
        2. Cleans the total_charges feature
        3. Replaces any empty strings with np.nan
        4. Removes any rows with missing values
        5. Reassigns the total_charges feature as a float
        6. Returns a new pandas DataFrame
    """
    customers = pd.read_sql(telco_query, telco_url)
    customers.total_charges = customers.total_charges.str.strip()
    customers = customers.replace("", np.nan)
    customers = customers.dropna()
    customers.total_charges = customers.total_charges.astype(float)
    return customers
import pandas as pd
import numpy as np
import split_scale
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import wrangle
import env
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


url2 = env.get_db_url('titanic_db')

def prep_titanic():
    
    df = pd.read_sql("""

SELECT *
FROM passengers
"""
,url2)

    df.drop(columns=['deck'],inplace=True)
    df.fillna(np.nan,inplace=True)
    imp_mode = SimpleImputer(missing_values=np.nan,strategy='most_frequent')

    imp_mode.fit(df[['embarked']])
    df[['embarked']] = imp_mode.transform(df[['embarked']])

    return df
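SimpleImputer's most_frequent strategy fills missing entries with the column mode. A minimal sketch on made-up values:

demo = pd.DataFrame({'embarked': ['S', 'C', np.nan, 'S']})
SimpleImputer(strategy='most_frequent').fit_transform(demo)
# -> [['S'], ['C'], ['S'], ['S']]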
Example #17
def get_titanic_data():
    url = env.get_db_url('titanic_db')
    query = '''
    SELECT * FROM passengers
    '''
    return pd.read_sql(query, url)
def get_titanic_data():
    query = 'select * from passengers'
    df = pd.read_sql(query, get_db_url('titanic_db'))
    return df
Example #19
import pandas as pd
import numpy as np
import split_scale
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import wrangle
import env
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

url = env.get_db_url('telco_churn')


def clean_telco(df):
    df.total_charges = df.total_charges.replace(r'^\s*$', np.nan, regex=True)
    df = df[df.total_charges.isna() == False]
    df['total_charges'] = df['total_charges'].astype(float)
    df['churn'] = df.churn == 'Yes'
    df['senior_citizen'] = df.senior_citizen == 1
    df['is_male'] = df.gender == 'Male'
    df['paperless_billing'] = df.paperless_billing == 'Yes'

    df['family'] = (df.partner == 'Yes') | (df.dependents == 'Yes')

    df['phone'] = (df.phone_service == 'Yes') | (df.multiple_lines == 'Yes')
    df['streaming'] = (df.streaming_tv == 'Yes') | (df.streaming_movies == 'Yes')

    return df
def get_zillow_data():
    url = get_db_url("zillow")

    sql = """
    SELECT
    Z.parcelid,
    Z.basementsqft,
    Z.bathroomcnt,
    Z.bedroomcnt,
    Z.calculatedbathnbr,
    Z.finishedfloor1squarefeet,
    Z.calculatedfinishedsquarefeet,
    Z.finishedsquarefeet12,
    Z.finishedsquarefeet13,
    Z.finishedsquarefeet15,
    Z.finishedsquarefeet50,
    Z.finishedsquarefeet6,
    Z.fips,
    Z.fireplacecnt,
    Z.fullbathcnt,
    Z.garagecarcnt,
    Z.garagetotalsqft,
    Z.hashottuborspa,
    Z.latitude,
    Z.longitude,
    Z.lotsizesquarefeet,
    Z.poolcnt,
    Z.poolsizesum,
    Z.propertycountylandusecode,
    Z.propertyzoningdesc,
    Z.regionidcity,
    Z.regionidcounty,
    Z.regionidneighborhood,
    Z.regionidzip,
    Z.roomcnt,
    Z.threequarterbathnbr,
    Z.unitcnt,
    Z.yardbuildingsqft17,
    Z.yardbuildingsqft26,
    Z.yearbuilt,
    Z.numberofstories,
    Z.fireplaceflag,
    Z.structuretaxvaluedollarcnt,
    Z.taxvaluedollarcnt,
    Z.assessmentyear,
    Z.landtaxvaluedollarcnt,
    Z.taxamount,
    Z.taxdelinquencyflag,
    Z.taxdelinquencyyear,
    Z.censustractandblock,
    unique_properties.logerror,
    unique_properties.transactiondate,
    plt.propertylandusedesc,
    st.storydesc,
    ct.typeconstructiondesc,
    act.airconditioningdesc,
    bct.buildingclassdesc,
    hst.heatingorsystemdesc
    FROM 
    (SELECT 
    p17.parcelid,
    logerror,
    transactiondate
    FROM 
    predictions_2017 AS p17
    JOIN
    (SELECT 
    predictions_2017.parcelid,
    MAX(transactiondate) AS max_trans_date
    FROM predictions_2017
    GROUP BY predictions_2017.parcelid) AS pred_agg ON (p17.parcelid=pred_agg.parcelid) AND (pred_agg.max_trans_date=p17.transactiondate)) AS unique_properties
    LEFT JOIN properties_2017 AS Z ON (Z.parcelid=unique_properties.parcelid)
    LEFT JOIN propertylandusetype AS plt ON (Z.propertylandusetypeid=plt.propertylandusetypeid)
    LEFT JOIN storytype AS st ON (Z.storytypeid=st.storytypeid)
    LEFT JOIN typeconstructiontype AS ct ON (Z.typeconstructiontypeid=ct.typeconstructiontypeid)
    LEFT JOIN airconditioningtype AS act ON (Z.airconditioningtypeid=act.airconditioningtypeid)
    LEFT JOIN architecturalstyletype AS ast ON (Z.architecturalstyletypeid=ast.architecturalstyletypeid)
    LEFT JOIN buildingclasstype AS bct ON (Z.buildingclasstypeid=bct.buildingclasstypeid)
    LEFT JOIN heatingorsystemtype AS hst ON (Z.heatingorsystemtypeid=hst.heatingorsystemtypeid)
    WHERE Z.latitude IS NOT NULL AND Z.longitude IS NOT NULL
    """
    df = pd.read_sql(sql, url)
    return df
def clean_telco_data():
    #pull data
    query = '''
    select * 
    from customers as cust
    join `internet_service_types` as net
    on cust.`internet_service_type_id` = net.internet_service_type_id
    join `contract_types` as cont
    on cust.`contract_type_id` = cont.`contract_type_id`
    join payment_types as pmt
    using(`payment_type_id`);
    '''
    churn_df = pd.read_sql(query, get_db_url('telco_churn'))
    
    #drop duplicate columns
    churn_df = churn_df.loc[:, ~churn_df.columns.duplicated()]

    #drop duplicate rows
    churn_df = churn_df.drop_duplicates()

    #drop redundant foreign-key columns
    churn_df = churn_df.drop(columns=['contract_type_id',
                                      'internet_service_type_id',
                                      'payment_type_id'])
    
    #collapse 'No internet service' / 'No phone service' down to 'No'
    churn_df.replace('No internet service', 'No', inplace=True)
    churn_df.replace('No phone service', 'No', inplace=True)
    
    # blank strings to NaN, drop those rows, then cast total_charges to float
    churn_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    churn_df = churn_df.dropna(axis=0)
    churn_df.total_charges = churn_df.total_charges.astype(float)

    #get features and target
    target = 'churn'
    features = churn_df.columns.tolist()
    features.remove(target)
    features.remove('customer_id')

    #change churn column to boolean
    churn_df['churn'] = LabelEncoder().fit_transform(churn_df['churn']).astype(bool)
    churn_df.senior_citizen = churn_df.senior_citizen.astype(bool)
    
    #create new e-check column
    churn_df['e_check'] = churn_df.payment_type == 'Electronic check'

    #remove total_charges from the features list
    features.remove('total_charges')
    
    #remove columns with little effect on tenure
    features.remove('gender')
    features.remove('phone_service')
    features.remove('payment_type')
    features.remove('contract_type')
    features.remove('internet_service_type')
    features.remove('multiple_lines')

    #encode yes/no columns as booleans
    for i in features:
        if set(churn_df[i].unique()) == {'No', 'Yes'}:
            churn_df[i] = churn_df[i] == 'Yes'

    #one-hot encode the contract and internet service types
    churn_df = (churn_df.join(pd.get_dummies(churn_df.contract_type))
                        .join(pd.get_dummies(churn_df.internet_service_type)))
    
    #add to features
    new_features = pd.get_dummies(churn_df.contract_type).columns.tolist()
    new_features += pd.get_dummies(churn_df.internet_service_type).columns.tolist()
    features += new_features
    
    #split data
    train, test = split_scale.split_my_data(churn_df, stratify=churn_df.churn)
    return train, test, features, target
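A possible usage sketch (split_scale.split_my_data comes from the snippet's own helper module; the classifier and max_depth here are illustrative choices only):

train, test, features, target = clean_telco_data()
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]
clf = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
clf.score(X_test, y_test)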
Example #22
def get_zillow(sql):
    url = get_db_url('zillow')
    zillow_df = pd.read_sql(sql, url, index_col='id')
    return zillow_df
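A possible call, with a hypothetical query (column list and LIMIT chosen only for illustration; the id column must be selected because it is used as the index):

df = get_zillow("SELECT id, parcelid, bedroomcnt, bathroomcnt FROM properties_2017 LIMIT 100")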