def prep_titanic(df):
    """Clean the Titanic frame, split it, then encode, impute and scale.

    Steps, in order:
      1. drop ``deck`` and fill missing ``embark_town`` / ``embarked`` with
         'Southampton' / 'S';
      2. split 80/20 with ``split_scale.split_my_data``;
      3. one-hot encode ``embarked`` (encoder fit on train only);
      4. mean-impute ``age`` (imputer fit on train only);
      5. min-max scale ``age`` and ``fare`` via ``split_scale.min_max_scaler``.

    The caller's ``df`` is left untouched, so the function can be called
    repeatedly on the same frame.

    Returns:
        tuple: ``(train, test)`` DataFrames.
    """
    # drop() without inplace returns a new frame; the in-place version removed
    # 'deck' from the caller's object and made a second call raise KeyError.
    df = df.drop(columns=['deck'])
    df['embark_town'] = df['embark_town'].fillna('Southampton')
    df['embarked'] = df['embarked'].fillna('S')

    train, test = split_scale.split_my_data(df, .8)

    # NOTE(review): scikit-learn renamed ``sparse`` to ``sparse_output`` in 1.2;
    # switch the keyword once the minimum supported version allows it.
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(train[['embarked']])
    cols = list(encoder.categories_[0])

    encoded_train = pd.DataFrame(
        encoder.transform(train[['embarked']]), columns=cols, index=train.index)
    encoded_test = pd.DataFrame(
        encoder.transform(test[['embarked']]), columns=cols, index=test.index)

    train = pd.concat([train, encoded_train], axis=1).drop(columns='embarked')
    test = pd.concat([test, encoded_test], axis=1).drop(columns='embarked')

    # Learn the mean age from train only, then apply it to both splits.
    imputer = SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    train['age'] = imputer.transform(train[['age']])
    test['age'] = imputer.transform(test[['age']])

    # The fitted scaler object is not needed past this point.
    _, train_scaled, test_scaled = split_scale.min_max_scaler(
        train[['age', 'fare']], test[['age', 'fare']])

    # update() overwrites the matching columns in place with the scaled values.
    train.update(train_scaled)
    test.update(test_scaled)
    return train, test
def recursive_feature_elimination(features, target, dataframe, train_pct=0.8):
    """Split the data, pick the best feature subset, and trim both splits to it.

    Args:
        features: list of candidate feature column names.
        target: list of target column name(s); it is concatenated with
            ``features``, so it must be a list as well.
        dataframe: source DataFrame containing all of those columns.
        train_pct: forwarded to ``ss.split_my_data`` as ``train_pct``.

    Returns:
        tuple: ``(train, test, features)`` where both frames hold only the
        selected features plus the target, and ``features`` is whatever
        ``optimum_feature_names`` returned.
    """
    subset = dataframe[features + target]
    train, test = ss.split_my_data(subset, train_pct=train_pct)

    # How many features to keep is decided by the sibling helper using both
    # splits; the names are then chosen from the training split alone.
    best_n = optimum_feature_count(
        train[features], train[target], test[features], test[target])
    features = optimum_feature_names(train[features], train[target], best_n)

    train, test = (
        part[features].join(part[target]) for part in (train, test)
    )
    return train, test, features
def split_scale_df(df):
    """Split, standard-scale, and fit an OLS baseline for ``tax_value``.

    The frame is split 80/20 (seed 123) and scaled with
    ``split_scale.standard_scaler``; ``tax_value`` is then separated out as
    the target and an ``ols`` model is fit on the training split.  The model's
    predictions are stored on ``train`` as a new ``yhat`` column.

    Returns:
        tuple: ``(train, test, X_train, y_train, X_test, y_test, ols_model)``.
    """
    train, test = split_scale.split_my_data(df, train_ratio=.8, seed=123)
    # The fitted scaler itself is not used afterwards.
    _, train, test = split_scale.standard_scaler(train, test)

    X_train = train.drop(columns='tax_value')
    y_train = train[['tax_value']]
    X_test = test.drop(columns='tax_value')
    y_test = test[['tax_value']]

    # The formula refers to the local names ``y_train`` / ``X_train``, so those
    # variables must keep exactly these names.
    # NOTE(review): presumably ``ols`` is statsmodels.formula.api.ols — confirm.
    ols_model = ols('y_train ~ X_train', data=train).fit()

    # Predict from the features; the earlier code passed the target frame here.
    train['yhat'] = ols_model.predict(X_train)
    return train, test, X_train, y_train, X_test, y_test, ols_model
def prep_iris(df):
    """Tidy the iris frame, split it 80/20, and one-hot encode ``species``.

    ``species_id`` and ``measurement_id`` are dropped, and the five remaining
    columns are renamed to ``sepal_length``, ``sepal_width``, ``petal_length``,
    ``petal_width`` and ``species`` (in that positional order).  The encoder is
    fit on the training split only and applied to both splits.

    The caller's ``df`` is left untouched, so the function can be called
    repeatedly on the same frame.

    Returns:
        tuple: ``(train, test)`` DataFrames with one indicator column per
        species in place of the ``species`` column.
    """
    # drop() without inplace returns a new frame; dropping and renaming in
    # place altered the caller's object and made a second call fail.
    df = df.drop(columns=['species_id', 'measurement_id'])
    df.columns = [
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'
    ]

    train, test = split_scale.split_my_data(df, .8)

    # NOTE(review): scikit-learn renamed ``sparse`` to ``sparse_output`` in 1.2;
    # switch the keyword once the minimum supported version allows it.
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(train[['species']])
    cols = list(encoder.categories_[0])

    encoded_train = pd.DataFrame(
        encoder.transform(train[['species']]), columns=cols, index=train.index)
    encoded_test = pd.DataFrame(
        encoder.transform(test[['species']]), columns=cols, index=test.index)

    train = pd.concat([train, encoded_train], axis=1).drop(columns='species')
    test = pd.concat([test, encoded_test], axis=1).drop(columns='species')
    return train, test
# Connection string for the zillow schema; user/password/host are expected to
# be defined before this point.
url = f'mysql+pymysql://{user}:{password}@{host}/zillow'

# Properties with a transaction between 2017-05-01 and 2017-06-30, excluding
# the listed land-use type ids and any row with a zero size/bed/bath/tax value.
zillow_query = (
    'select id, calculatedfinishedsquarefeet, bedroomcnt, bathroomcnt, '
    'taxvaluedollarcnt from properties_2017 join predictions_2017 using (id) '
    'join propertylandusetype using (propertylandusetypeid) '
    'where transactiondate between "2017-05-01" and "2017-06-30" '
    'and propertylandusetypeid not in ("31", "47", "246", "247", "248","264", '
    '"265", "266","267", "269", "270" ) '
    'and calculatedfinishedsquarefeet * bathroomcnt * bedroomcnt != 0 '
    'and taxvaluedollarcnt != 0'
)
data = pd.read_sql(zillow_query, url)

# Index by id; the id column itself is kept and dropped from X further down.
data = data.set_index(data.id)
#sns.pairplot(data=data)

train, test = split_scale.split_my_data(data)

# Features: everything except the id and the target.
X_train = train.drop(columns=["id", "taxvaluedollarcnt"])
X_test = test.drop(columns=["id", "taxvaluedollarcnt"])

# Targets as single-column frames (one-row frame built from the Series, then
# transposed).
y_train = pd.DataFrame([train.taxvaluedollarcnt]).T
y_test = pd.DataFrame([test.taxvaluedollarcnt]).T

X_train_scaled = split_scale.standard_scaler(X_train)
#sns.heatmap(data.corr(), cmap='Blues', annot=True)

# Start the predictions table with the observed training target.
predictions = pd.DataFrame(
    {'actual': y_train['taxvaluedollarcnt']}
).reset_index(drop=True)
import env
import wrangle
import split_scale

# --- Acquire and prepare -----------------------------------------------------
df = wrangle.wrangle_telco()
df.head()
df.info()
# The customer id is an identifier, not a feature; remove it in place.
df.drop(columns='customer_id', inplace=True)
df.head()

# --- Explore -----------------------------------------------------------------
# NOTE(review): `sns` is not imported in the lines visible here — confirm it is
# imported elsewhere in the file.
sns.pairplot(data=df)

# --- Split (80/20, fixed seed) -----------------------------------------------
train, test = split_scale.split_my_data(data=df, train_ratio=.80, seed=123)

# --- Scale -------------------------------------------------------------------
scaler, train_scaled, test_scaled = split_scale.standard_scaler(train, test)

# --- Separate into X and y frames --------------------------------------------
X_train, y_train = train.drop(columns='total_charges'), train[['total_charges']]
X_test, y_test = test.drop(columns='total_charges'), test[['total_charges']]

X_train_scaled = train_scaled.drop(columns='total_charges')
y_train_scaled = train_scaled[['total_charges']]
# Which columns contain at least one null?
print(df.columns[df.isnull().any()])

# Quick looks at the data (results are only displayed in a notebook).
df['monthly_charges'].value_counts(sort=True, ascending=True)
df.describe()
df.groupby('tenure').mean().plot.bar(figsize=(16, 9), ec='black', width=.9)

col_names = ['customer_id', 'tenure', 'monthly_charges', 'total_charges']
X_train_scaled = df.copy()

# Two features, one target.
X = df[['tenure', 'monthly_charges']]
y = df[['total_charges']]

train_pct = .25
X_train, X_test, y_train, y_test = split_my_data(X, y, train_pct)

# Each feature split must line up row-for-row with its target split.
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# ### Standard Scaler
# $$x'={\frac {x-{\bar {x}}}{\sigma }}$$

# NOTE(review): the third value is bound to `standard_scaler`, which replaces
# the function of the same name from here on — confirm that is intended.
X_train_standard_scaled, X_test_standard_scaled, standard_scaler = (
    standard_scaler(X_train, X_test)
)
X_train_standard_scaled, X_test_standard_scaled
X_train_standard_scaled.head()

# ### Standard Scale Inverse
#### Feature Engineering for telco_churn data
import pandas as pd
from wrangle import wrangle_telco
from split_scale import split_my_data
import features

### SelectKBest - Top Features of Unscaled Data

## Step 1. Load the data and pick the feature frame / target series
telco_df = wrangle_telco()
telco_df.head()

telco_X = telco_df[["monthly_charges", "tenure"]]
telco_y = telco_df["total_charges"]

## Step 2. Split X and y into train and test (four objects in total)
(
    telco_X_train,
    telco_X_test,
    telco_y_train,
    telco_y_test,
) = split_my_data(telco_X, telco_y, 0.80)

## Step 3. Ask the features module for the top 2 features of the unscaled data
f_features = features.selectkbest_optimal_features(
    telco_X_train,
    telco_y_train,
    2,
)
import warnings
warnings.filterwarnings("ignore")

import env
import wrangle as w
import split_scale as ss
from sklearn.feature_selection import SelectKBest, f_regression

# Load the telco data (the bare expression only displays it in a notebook).
df = w.wrangle_telco()
df

# Two features, one target, split 80/20.
x = df[['tenure', 'monthly_charges']]
y = df[['total_charges']]
x_train, x_test, y_train, y_test = ss.split_my_data(x, y, train_pct=.8)

# 1. Write a function, select_kbest_freg_unscaled() that takes X_train, y_train and k as input
# (X_train and y_train should not be scaled!) and returns a list of the top k features.
k = 1

# print(x_train)
# print(y_train)
# print(x_test)
# print(y_test)
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
import env
import wrangle
import split_scale
from sklearn.feature_selection import SelectKBest, f_regression

# NOTE(review): `pd` is used below but not imported in the lines visible here —
# confirm it is imported elsewhere in the file.
df = wrangle.wrangle_telco()
X = df.drop(columns=['total_charges', 'customer_id'])
y = pd.DataFrame(df.total_charges)
x_train, x_test, y_train, y_test = split_scale.split_my_data(X, y)
#X_train


def select_kbest_freg(x_train, y_train, k):
    """Return the names of the ``k`` best features by univariate F-regression.

    Args:
        x_train: DataFrame of candidate features.
        y_train: target values aligned with ``x_train``.
        k: number of features to keep.

    Returns:
        list: column names of ``x_train`` selected by ``SelectKBest``.
    """
    # ``k`` is keyword-only in current scikit-learn; passing it positionally
    # raises TypeError there, while the keyword form works on old versions too.
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(x_train, y_train)

    # get_support() is a boolean mask over the columns of x_train.
    f_support = f_selector.get_support()
    return x_train.columns[f_support].tolist()
import numpy as numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
import env
import wrangle
import split_scale

df = wrangle.wrangle_telco()
X = df[['tenure', 'monthly_charges', 'total_charges']]
X_train, x_test = split_scale.split_my_data(X)


# 1. Write a function, plot_variable_pairs(dataframe) that plots all of the pairwise relationships along with the regression line for each pair.
def plot_variable_pairs(df):
    """Plot pairwise relationships with regression lines for a fresh split.

    The frame is split with ``train_test_split`` (default proportions, no
    seed) and one regression pairplot is drawn per split.

    Returns:
        tuple: ``(train_grid, test_grid)`` as returned by ``sns.pairplot``.
    """
    # NOTE(review): `sns` is not imported in the lines visible here — confirm
    # it is imported elsewhere in the file.
    train, test = train_test_split(df)
    return (
        sns.pairplot(data=train, kind='reg'),
        sns.pairplot(data=test, kind='reg'),
    )


plot_variable_pairs(X)


# 2. Write a function, months_to_years(tenure_months,df) that returns your dataframe with a new feature tenure_years, in complete years as a customer.
def months_to_years(tenure_month, df):
    """Add ``tenure_years`` (whole years completed) to ``df`` and return it.

    Args:
        tenure_month: tenure in months (a Series aligned with ``df``).
        df: DataFrame that receives the new column; modified in place.

    Returns:
        The same ``df`` with the ``tenure_years`` column added.
    """
    # Floor division counts only completed years: 11 months -> 0, 23 -> 1.
    # round() would have reported 11 months as a full year.
    df['tenure_years'] = tenure_month // 12
    return df


months_to_years(df.tenure, df)
import pandas as pd
import numpy as np
import seaborn as sns
import split_scale as ss
from wrangle import wrangle_telco
import matplotlib.pyplot as plt


def plot_variable_pairs(df):
    """Show a pair grid: histograms on the diagonal, regression plots elsewhere."""
    graph = sns.PairGrid(df)
    graph.map_diag(plt.hist)
    graph.map_offdiag(sns.regplot)
    plt.show()


def months_to_years(tenure_months, df):
    """Add ``tenure_years`` (whole years, via floor division) to ``df``.

    ``df`` is modified in place and also returned.
    """
    df['tenure_years'] = tenure_months // 12
    return df


def plot_categorical_and_continuous_vars(categorical_var, continuous_var, df):
    """Show a bar plot, a box plot and a pie chart for one variable pair.

    Args:
        categorical_var: name of the categorical column in ``df``.
        continuous_var: name of the continuous column in ``df``.
        df: DataFrame holding both columns.

    The bar and box plots put the categorical variable on the x axis and the
    continuous one on the y axis.  The pie chart shows each category's share
    of the summed continuous variable.
    """
    fig, (bar_ax, box_ax, pie_ax) = plt.subplots(1, 3, figsize=(18, 5))

    sns.barplot(x=categorical_var, y=continuous_var, data=df, ax=bar_ax)
    sns.boxplot(x=categorical_var, y=continuous_var, data=df, ax=box_ax)

    totals = df.groupby(categorical_var)[continuous_var].sum()
    totals.plot.pie(ax=pie_ax, autopct='%1.1f%%')
    pie_ax.set_ylabel('')  # the default label repeats the column name

    plt.show()


if __name__ == '__main__':
    # `seed` was referenced without being defined; any fixed value keeps the
    # split reproducible.
    seed = 123

    telco = wrangle_telco()
    telco.set_index([telco.customer_id], inplace=True)
    train_telco, test_telco = ss.split_my_data(telco, .7, seed)

    plot_variable_pairs(telco)
    months_to_years(telco['tenure'], telco)
    # NOTE(review): assumes wrangle_telco() yields a `total_charges` column;
    # `tenure_years` is created by months_to_years just above — confirm the
    # intended variable pair.
    plot_categorical_and_continuous_vars('tenure_years', 'total_charges', telco)
def clean_telco_data():
    """Load the telco churn tables, clean and encode them, and split the result.

    The customers table is joined to its internet-service, contract and
    payment lookup tables.  Duplicate columns and rows are removed, the lookup
    id columns are dropped, blank strings become NaN and those rows are
    dropped.  Yes/No feature columns become booleans, and contract and
    internet-service types are one-hot encoded.

    Returns:
        tuple: ``(train, test, features, target)`` where ``features`` is the
        list of model feature names and ``target`` is ``'churn'``.
    """
    # Pull the joined data.
    query = (
        ' select * from customers as cust '
        'join `internet_service_types` as net '
        'on cust.`internet_service_type_id` = net.internet_service_type_id '
        'join `contract_types` as cont '
        'on cust.`contract_type_id` = cont.`contract_type_id` '
        'join payment_types as pmt using(`payment_type_id`); '
    )
    churn_df = pd.read_sql(query, get_db_url('telco_churn'))

    # The joins repeat the key columns: keep the first of each name, then
    # remove duplicate rows.
    first_occurrence = ~churn_df.columns.duplicated()
    churn_df = churn_df.loc[:, first_occurrence]
    churn_df = churn_df.drop_duplicates()

    # The lookup ids are redundant once the descriptive columns are joined in.
    churn_df = churn_df.drop(
        columns=['contract_type_id', 'internet_service_type_id', 'payment_type_id']
    )

    # Collapse the "no service" answers into a plain 'No'.
    for no_service in ('No internet service', 'No phone service'):
        churn_df.replace(no_service, 'No', inplace=True)

    # Blank strings -> NaN -> dropped, so total_charges can be cast to float.
    churn_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    churn_df = churn_df.dropna(axis=0)
    churn_df['total_charges'] = churn_df['total_charges'].astype(float)

    # Candidate features: every column except the target and the identifier.
    target = 'churn'
    features = churn_df.columns.tolist()
    for excluded in (target, 'customer_id'):
        features.remove(excluded)

    # Boolean versions of churn and senior_citizen.
    churn_df['churn'] = LabelEncoder().fit_transform(churn_df['churn']).astype(bool)
    churn_df['senior_citizen'] = churn_df['senior_citizen'].astype(bool)

    # Flag electronic-check payers (added after the feature list was taken,
    # so it is not itself a feature).
    churn_df['e_check'] = churn_df['payment_type'] == 'Electronic check'

    # Drop total_charges and the columns judged to have little effect.
    for excluded in (
        'total_charges',
        'gender',
        'phone_service',
        'payment_type',
        'contract_type',
        'internet_service_type',
        'multiple_lines',
    ):
        features.remove(excluded)

    # Yes/No feature columns become booleans.
    for col in features:
        if set(churn_df[col].unique().tolist()) == {'No', 'Yes'}:
            churn_df[col] = churn_df[col] == 'Yes'

    # One-hot encode contract and internet-service type, joined on the
    # current row labels.
    contract_dummies = pd.get_dummies(churn_df.contract_type)
    internet_dummies = pd.get_dummies(churn_df.internet_service_type)
    row_labels = churn_df.index
    churn_df = (
        churn_df.join(contract_dummies, on=row_labels)
        .join(internet_dummies, on=row_labels)
    )

    # The indicator columns are features too.
    features += (
        contract_dummies.columns.tolist() + internet_dummies.columns.tolist()
    )

    # Stratify on churn.
    train, test = split_scale.split_my_data(churn_df, stratify=churn_df.churn)
    return train, test, features, target
return number_of_features def top_n_features(X_train, y_train, n, model): cols = X_train.columns rfe = RFE(model, n) X_rfe = rfe.fit_transform(X_train, y_train) model.fit(X_rfe, y_train) features = list(cols[rfe.support_]) return features if __name__ == '__main__': seed = 43 telco = wrangle.wrangle_telco() train, test = ss.split_my_data(telco, .8, seed) X_train = train.drop(columns='total_charges').set_index('customer_id') y_train = train[['customer_id', 'total_charges']].set_index('customer_id') X_test = test.drop(columns='total_charges') y_test = test[['total_charges']] select_kbest_freg_unscaled(X_train, y_train, 1) x_scale = ss.standard_scaler(X_train, X_train)[1] y_scale = ss.standard_scaler(y_train, y_train)[1] select_kbest_freg_scaled(X_test, y_test, 1) ols_backward_elimination(x_scale, y_scale) lasso_cv_coef(x_scale, y_train) n = optimal_feature_n(X_train, y_train) top_n_features(X_train, y_train, n)
def train_test(data_frame):
    """Split ``data_frame`` with ``split_scale.split_my_data`` defaults.

    Returns:
        tuple: ``(train, test)``.
    """
    train_part, test_part = split_scale.split_my_data(data_frame)
    return train_part, test_part
# Our scenario continues:
# As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer.

# Create a file, explore.py, that contains the following functions for exploring your variables (features & target).

# Write a function, plot_variable_pairs(dataframe) that plots all of the pairwise relationships along with the regression line for each pair.

import matplotlib.pyplot as plt
import pandas as pd  # needed for pd.merge below
import seaborn as sns
import wrangle
import split_scale

X_train, X_test, y_train, y_test = split_scale.split_my_data()

# Recombine features and target so each split can be plotted as one frame.
train = pd.merge(X_train, y_train, left_index=True, right_index=True)
test = pd.merge(X_test, y_test, left_index=True, right_index=True)


def plot_variable_pairs(dataframe):
    """Plot every pairwise relationship in ``dataframe`` with a regression line.

    Args:
        dataframe: DataFrame whose numeric columns are plotted against each
            other.

    Returns:
        The grid object returned by ``sns.pairplot``.
    """
    # Use the frame that was passed in (the module-level ``train`` was used
    # before) and draw the regression line the task asks for.
    return sns.pairplot(dataframe, kind="reg")


# Write a function, months_to_years(tenure_months, df) that returns your dataframe with a new feature tenure_years, in complete years as a customer.

def months_to_years(tenure_months, df):
    """Add ``tenure_years`` (whole years completed) to ``df`` and return it.

    Args:
        tenure_months: tenure in months (a Series aligned with ``df``).
        df: DataFrame that receives the new column; modified in place.

    Returns:
        The same ``df`` with the ``tenure_years`` column added.
    """
    # Floor division counts only completed years: 11 months -> 0, 23 -> 1.
    # round() would have reported 11 months as a full year.
    df["tenure_years"] = tenure_months // 12
    return df
# Fit the logistic regression classifier to your training sample
# and transform, i.e. make predictions on the training sample
import pandas as pd  # needed for pd.DataFrame below
from sklearn.linear_model import LogisticRegression
from acquire import get_iris_data
from split_scale import split_my_data

df = get_iris_data()
df.head()

X = df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
y = df[["species_name"]]
X_train, X_test, y_train, y_test = split_my_data(X, y, 0.7)

log_model = LogisticRegression(C=1, random_state=123,
                               solver='saga').fit(X_train, y_train)

# Keep the predictions and give them the training rows' index.  The earlier
# chained assignment (`... .set_index = y_train`) bound y_train_pred to y_train
# itself, so the "predictions" were the true labels.
# Assumes y_train is a pandas object (it is split from the DataFrame ``y``).
y_train_pred = pd.DataFrame(log_model.predict(X_train), index=y_train.index)
y_train_pred