import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import split_scale


def prep_titanic(df):
    df.drop(columns=['deck'], inplace=True)
    df.embark_town = df.embark_town.fillna('Southampton')
    df.embarked = df.embarked.fillna('S')

    train, test = split_scale.split_my_data(df, .8)

    encoder = OneHotEncoder(sparse=False)
    encoder.fit(train[['embarked']])
    cols = list(encoder.categories_[0])
    m_train = encoder.transform(train[['embarked']])
    m_test = encoder.transform(test[['embarked']])

    encoded_train = pd.DataFrame(m_train, columns=cols, index=train.index)
    encoded_test = pd.DataFrame(m_test, columns=cols, index=test.index)

    train = pd.concat([train, encoded_train], axis=1).drop(columns='embarked')
    test = pd.concat([test, encoded_test], axis=1).drop(columns='embarked')

    imputer = SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    train.age = imputer.transform(train[['age']])
    test.age = imputer.transform(test[['age']])

    train_to_scale = train[['age', 'fare']]
    test_to_scale = test[['age', 'fare']]
    scaler, train_scaled, test_scaled = \
        split_scale.min_max_scaler(train_to_scale, test_to_scale)

    train.update(train_scaled)
    test.update(test_scaled)

    return train, test
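
# Hypothetical usage, assuming an acquire helper along the lines of the iris
# example further below (get_titanic_data is an assumed name):
# train, test = prep_titanic(acquire.get_titanic_data())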
Example #2
def recursive_feature_elimination(features, target, dataframe, train_pct=0.8):
    cols = features + target

    train, test = ss.split_my_data(dataframe[cols], train_pct=train_pct)
    n = optimum_feature_count(train[features], train[target], test[features],
                              test[target])

    features = optimum_feature_names(train[features], train[target], n)

    train = train[features].join(train[target])
    test = test[features].join(test[target])
    return train, test, features
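
# Hedged sketches of the two helpers the function above relies on, assuming
# sklearn's RFE wrapped around a plain LinearRegression; the real
# optimum_feature_count / optimum_feature_names may differ.
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression


def optimum_feature_count(X_train, y_train, X_test, y_test):
    # Try every feature count and keep the one with the best test R^2.
    best_n, best_score = 1, float('-inf')
    for n in range(1, X_train.shape[1] + 1):
        rfe = RFE(LinearRegression(), n_features_to_select=n)
        rfe.fit(X_train, np.ravel(y_train))
        score = rfe.score(X_test, np.ravel(y_test))
        if score > best_score:
            best_n, best_score = n, score
    return best_n


def optimum_feature_names(X_train, y_train, n):
    # Refit RFE at the chosen count and report which columns survive.
    rfe = RFE(LinearRegression(), n_features_to_select=n)
    rfe.fit(X_train, np.ravel(y_train))
    return X_train.columns[rfe.support_].tolist()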
def split_scale_df(df):

    train, test = split_scale.split_my_data(df, train_ratio=.8, seed=123)

    scaler, train, test = split_scale.standard_scaler(train, test)

    X_train = train.drop(columns='tax_value')
    y_train = train[['tax_value']]
    X_test = test.drop(columns='tax_value')
    y_test = test[['tax_value']]
    # statsmodels formula API: regress tax_value on the remaining columns
    ols_model = ols('tax_value ~ ' + ' + '.join(X_train.columns),
                    data=train).fit()
    train['yhat'] = ols_model.predict(train)
    return train, test, X_train, y_train, X_test, y_test, ols_model
def prep_iris(df):
    df.drop(columns=['species_id', 'measurement_id'], inplace=True)
    df.columns = [
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'
    ]

    train, test = split_scale.split_my_data(df, .8)

    encoder = OneHotEncoder(sparse=False)
    encoder.fit(train[['species']])
    cols = list(encoder.categories_[0])
    m_train = encoder.transform(train[['species']])
    m_test = encoder.transform(test[['species']])
    encoded_train = pd.DataFrame(m_train, columns=cols, index=train.index)
    encoded_test = pd.DataFrame(m_test, columns=cols, index=test.index)
    train = pd.concat([train, encoded_train], axis=1).drop(columns='species')
    test = pd.concat([test, encoded_test], axis=1).drop(columns='species')

    return train, test
Example #5
import pandas as pd

import split_scale
from env import user, password, host  # assumes the DB credentials live in env.py

url = f'mysql+pymysql://{user}:{password}@{host}/zillow'

data = pd.read_sql(
    '''select id, calculatedfinishedsquarefeet, bedroomcnt, bathroomcnt,
        taxvaluedollarcnt
    from properties_2017
    join predictions_2017 using (id)
    join propertylandusetype using (propertylandusetypeid)
    where transactiondate between "2017-05-01" and "2017-06-30"
    and propertylandusetypeid not in ("31", "47", "246", "247", "248",
        "264", "265", "266", "267", "269", "270")
    and calculatedfinishedsquarefeet * bathroomcnt * bedroomcnt != 0
    and taxvaluedollarcnt != 0''', url)

data = data.set_index(data.id)

#sns.pairplot(data=data)

train, test = split_scale.split_my_data(data)

X_train = train.drop(columns=["id", "taxvaluedollarcnt"])
y_train = train[["taxvaluedollarcnt"]]

X_test = test.drop(columns=["id", "taxvaluedollarcnt"])
y_test = test[["taxvaluedollarcnt"]]

scaler, X_train_scaled, X_test_scaled = split_scale.standard_scaler(X_train, X_test)
#sns.heatmap(data.corr(), cmap='Blues', annot=True)

predictions = pd.DataFrame({
    'actual': y_train.taxvaluedollarcnt
}).reset_index(drop=True)
Example #6
import pandas as pd
import seaborn as sns

import env
import wrangle
import split_scale

# Acquire and prep data
df = wrangle.wrangle_telco()
df.head()
df.info()
df.drop(columns=['customer_id'], inplace=True)
df.head()

# Explore data
sns.pairplot(data=df)

# split data
train, test = split_scale.split_my_data(data=df, train_ratio=.80, seed=123)

# Scale

scaler, train_scaled, test_scaled = split_scale.standard_scaler(train, test)

# Separate into X and y dataframes

X_train = train.drop(columns=['total_charges'])
y_train = train[['total_charges']]

X_test = test.drop(columns=['total_charges'])
y_test = test[['total_charges']]

X_train_scaled = train_scaled.drop(columns=['total_charges'])
y_train_scaled = train_scaled[['total_charges']]
print(df.columns[df.isnull().any()])

df.monthly_charges.value_counts(sort=True, ascending=True)

df.describe()

df.groupby('tenure').mean().plot.bar(figsize=(16, 9), ec='black', width=.9)

col_names = ['customer_id', 'tenure', 'monthly_charges', 'total_charges']

X_train_scaled = df.copy()

X = df[['tenure', 'monthly_charges']]
y = df[['total_charges']]
train_pct = .25
X_train, X_test, y_train, y_test = split_my_data(X, y, train_pct)

assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# ### Standard Scaler
# $$x'={\frac {x-{\bar {x}}}{\sigma }}$$

X_train_standard_scaled, X_test_standard_scaled, standard_scaler = standard_scaler(
    X_train, X_test)

X_train_standard_scaled, X_test_standard_scaled

X_train_standard_scaled.head()

# ### Standard Scale Inverse
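
# A minimal sketch of undoing the scaling, assuming the scaler object returned
# above exposes sklearn's inverse_transform (x = x' * sigma + x_bar):
X_train_unscaled = pd.DataFrame(
    standard_scaler.inverse_transform(X_train_standard_scaled),
    columns=X_train_standard_scaled.columns,
    index=X_train_standard_scaled.index)

X_train_unscaled.head()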
Example #8
#### Feature Engineering for telco_churn data

import pandas as pd

from wrangle import wrangle_telco
from split_scale import split_my_data
import features

### SelectKBest - Top Features of Unscaled Data

## Step 1. Load Data
telco_df = wrangle_telco()
telco_df.head()
telco_X = telco_df[["monthly_charges", "tenure"]]
telco_y = telco_df["total_charges"]

## Step 2. Split data into X and y, train and test: four data frames
telco_X_train, telco_X_test, telco_y_train, telco_y_test = split_my_data(
    telco_X, telco_y, 0.80)

## Step 3. Run select_kbest_freg_unscaled
f_features = features.selectkbest_optimal_features(telco_X_train,
                                                   telco_y_train, 2)
Example #9
import warnings
warnings.filterwarnings("ignore")

import env
import wrangle as w
import split_scale as ss

df = w.wrangle_telco()

df

x = df[['tenure', 'monthly_charges']]

y = df[['total_charges']]

x_train, x_test, y_train, y_test = ss.split_my_data(x, y, train_pct=.8)

# 1. Write a function, select_kbest_freg_unscaled() that takes X_train, y_train and k as input
# (X_train and y_train should not be scaled!) and returns a list of the top k features. A sketch follows below.

from sklearn.feature_selection import SelectKBest, f_regression

k = 1

# print(x_train)
# print(y_train)

# print(x_test)
# print(y_test)
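
# A minimal sketch of the requested function, assuming f_regression scoring on
# the unscaled inputs (names taken from the prompt above):
def select_kbest_freg_unscaled(X_train, y_train, k):
    # Score each feature against the target with the F-test and keep the top k.
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(X_train, y_train.values.ravel())
    return X_train.columns[f_selector.get_support()].tolist()


select_kbest_freg_unscaled(x_train, y_train, k)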

Example #10
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

import env
import wrangle
import split_scale
from sklearn.feature_selection import SelectKBest, f_regression

df = wrangle.wrangle_telco()
X = df.drop(columns=['total_charges', 'customer_id'])
y = pd.DataFrame(df.total_charges)

x_train, x_test, y_train, y_test = split_scale.split_my_data(X, y)
#X_train


def select_kbest_freg(x_train, y_train, k):

    f_selector = SelectKBest(f_regression, k=k)

    f_selector.fit(x_train, y_train.values.ravel())

    f_support = f_selector.get_support()
    f_feature = x_train.loc[:, f_support].columns.tolist()

    return f_feature
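
# Hypothetical usage with the split above: k=1 keeps the strongest predictor.
select_kbest_freg(x_train, y_train, 1)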

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

import env
import wrangle
import split_scale

df = wrangle.wrangle_telco()

X = df[['tenure', 'monthly_charges', 'total_charges']]

X_train, X_test = split_scale.split_my_data(X)

# 1. Write a function, plot_variable_pairs(dataframe) that plots all of the pairwise relationships along with the regression line for each pair.

def plot_variable_pairs(df):
    train, test = train_test_split(df)
    return (sns.pairplot(data=train, kind='reg'),
            sns.pairplot(data=test, kind='reg'))

plot_variable_pairs(X)

# 2. Write a function, months_to_years(tenure_months,df) that returns your dataframe with a new feature tenure_years, in complete years as a customer.
def months_to_years(tenure_month, df):
    # Floor division gives complete years, per the prompt above.
    df['tenure_years'] = tenure_month // 12
    return df

months_to_years(df.tenure, df)
import pandas as pd
import numpy as np
import seaborn as sns
import split_scale as ss
from wrangle import wrangle_telco
import matplotlib.pyplot as plt

def plot_variable_pairs(df):
    graph = sns.PairGrid(df)
    graph.map_diag(plt.hist)
    graph.map_offdiag(sns.regplot)
    plt.show()

def months_to_years(tenure_months, df):
    df['tenure_years'] = tenure_months // 12
    return df

def plot_categorical_and_continuous_vars(categorical_var, continuous_var, df):
    sns.barplot(x=categorical_var, y=continuous_var, data=df)
    plt.show()
    sns.boxplot(x=categorical_var, y=continuous_var, data=df)
    plt.show()
    df.groupby(categorical_var)[continuous_var].sum().plot.pie()
    plt.show()

if __name__ == '__main__':
    telco = wrangle_telco()
    telco.set_index([telco.customer_id], inplace=True)
    train_telco, test_telco = ss.split_my_data(telco, .7, seed=123)
    plot_variable_pairs(telco)
    months_to_years(telco['tenure'], telco)
    plot_categorical_and_continuous_vars('tenure_years', 'total_charges', telco)
def clean_telco_data():
    #pull data
    query = '''
    select * 
    from customers as cust
    join `internet_service_types` as net
    on cust.`internet_service_type_id` = net.internet_service_type_id
    join `contract_types` as cont
    on cust.`contract_type_id` = cont.`contract_type_id`
    join payment_types as pmt
    using(`payment_type_id`);
    '''
    churn_df = pd.read_sql(query, get_db_url('telco_churn'))
    
    #for duplicate columns
    churn_df = churn_df.loc[:,~churn_df.columns.duplicated()]

    #for duplicate rows
    churn_df = churn_df.drop_duplicates()
    
    #drop redundant columns
    churn_df = (churn_df.drop('contract_type_id', axis = 1)
                        .drop('internet_service_type_id', axis = 1)
                        .drop('payment_type_id', axis = 1))
    
    #change 'No internet service' and 'No phone service' to just 'No'
    churn_df.replace('No internet service', 'No', inplace=True)
    churn_df.replace('No phone service', 'No', inplace=True)
    
    # change to float
    churn_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    churn_df = churn_df.dropna(axis=0)
    churn_df.total_charges = churn_df.total_charges.astype(float)

    #get features and target
    target = 'churn'
    features = churn_df.columns.tolist()
    features.remove(target)
    features.remove('customer_id')

    #change churn column to boolean
    churn_df['churn'] = LabelEncoder().fit_transform(churn_df['churn']).astype(bool)
    churn_df.senior_citizen = churn_df.senior_citizen.astype(bool)
    
    #create new e-check column
    churn_df['e_check'] = churn_df.payment_type == 'Electronic check'

    #remove total_charges from the features
    features.remove('total_charges')
    
    #remove columns with little effect on the target
    features.remove('gender')
    features.remove('phone_service')
    features.remove('payment_type')
    features.remove('contract_type')
    features.remove('internet_service_type')
    features.remove('multiple_lines')

    #encode yes/no columns
    for i in features:
        if churn_df[i].unique().tolist() == ['No', 'Yes'] or churn_df[i].unique().tolist() == ['Yes', 'No']:
            churn_df[i] = churn_df[i] == 'Yes'

    #one hot encode columns
    churn_df = (churn_df.join(pd.get_dummies(churn_df.contract_type))
                        .join(pd.get_dummies(churn_df.internet_service_type)))
    
    #add to features
    new_features = pd.get_dummies(churn_df.contract_type).columns.tolist()
    new_features += pd.get_dummies(churn_df.internet_service_type).columns.tolist()
    features += new_features
    
    #split data
    train, test = split_scale.split_my_data(churn_df, stratify=churn_df.churn)
    return train, test, features, target
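
# Hypothetical usage: unpack the split frames plus the model-ready column lists.
# train, test, features, target = clean_telco_data()
# X_train, y_train = train[features], train[target]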
Example #14
    return number_of_features


def top_n_features(X_train, y_train, n, model):
    cols = X_train.columns
    rfe = RFE(model, n_features_to_select=n)
    X_rfe = rfe.fit_transform(X_train, y_train)
    model.fit(X_rfe, y_train)
    features = list(cols[rfe.support_])
    return features


if __name__ == '__main__':
    seed = 43

    telco = wrangle.wrangle_telco()

    train, test = ss.split_my_data(telco, .8, seed)
    X_train = train.drop(columns='total_charges').set_index('customer_id')
    y_train = train[['customer_id', 'total_charges']].set_index('customer_id')
    X_test = test.drop(columns='total_charges').set_index('customer_id')
    y_test = test[['customer_id', 'total_charges']].set_index('customer_id')

    select_kbest_freg_unscaled(X_train, y_train, 1)
    x_scale = ss.standard_scaler(X_train, X_train)[1]
    y_scale = ss.standard_scaler(y_train, y_train)[1]
    select_kbest_freg_scaled(X_test, y_test, 1)
    ols_backward_elimination(x_scale, y_scale)
    lasso_cv_coef(x_scale, y_train)
    n = optimal_feature_n(X_train, y_train)
    # top_n_features requires an estimator; a plain LinearRegression is assumed
    top_n_features(X_train, y_train, n, LinearRegression())
def train_test(data_frame):
    train, test = split_scale.split_my_data(data_frame)
    return train, test
# Our scenario continues:

# As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer.

# Create a file, explore.py, that contains the following functions for exploring your variables (features & target).

# Write a function, plot_variable_pairs(dataframe) that plots all of the pairwise relationships along with the regression line for each pair.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import wrangle
import split_scale

# Per the scenario above: monthly_charges and tenure as features, total_charges as target.
df = wrangle.wrangle_telco()
X = df[['monthly_charges', 'tenure']]
y = df[['total_charges']]
X_train, X_test, y_train, y_test = split_scale.split_my_data(X, y)

train = pd.merge(X_train, y_train, left_index=True, right_index=True)
test = pd.merge(X_test, y_test, left_index=True, right_index=True)


def plot_variable_pairs(dataframe):
    plot = sns.pairplot(dataframe,
                        x_vars=["total_charges"],
                        y_vars=["monthly_charges", "tenure"],
                        kind="reg")
    return plot


# Write a function, months_to_years(tenure_months, df) that returns your dataframe with a new feature tenure_years, in complete years as a customer.
def months_to_years(tenure_months, df):
    # Floor division gives complete years as a customer.
    df["tenure_years"] = tenure_months // 12
    return df
# Fit the logistic regression classifier to your training sample
# and transform, i.e. make predictions on the training sample

import pandas as pd

from sklearn.linear_model import LogisticRegression
from acquire import get_iris_data
from split_scale import split_my_data

df = get_iris_data()
df.head()

X = df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
y = df[["species_name"]]

X_train, X_test, y_train, y_test = split_my_data(X, y, 0.7)

log_model = LogisticRegression(C=1, random_state=123,
                               solver='saga').fit(X_train, y_train.species_name)
y_train_pred = log_model.predict(X_train)

y_train_pred = pd.DataFrame(y_train_pred, columns=['predicted'], index=y_train.index)
y_train_pred
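
# A quick sanity check (hedged): training accuracy of the fit above, comparing
# the 'predicted' column against the original labels.
(y_train_pred.predicted == y_train.species_name).mean()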