def select_kbest_freg_scaled(X_train, y_train, k):
    """Standard-scale X_train and y_train, then return the names of the
    top ``k`` features ranked by the f_regression F-statistic.

    NOTE(review): despite the ``_scaled`` suffix, this function scales its
    inputs itself; callers should pass *unscaled* frames — confirm intended.

    Parameters
    ----------
    X_train : pandas.DataFrame of candidate features.
    y_train : pandas.DataFrame holding the target column.
    k : int, number of features to keep.

    Returns
    -------
    list[str] — column names of the k selected features.
    """
    # split_scale.standard_scaler is assumed to return
    # (scaler, scaled_train, scaled_test); the scaler object and the
    # test-side frames were previously bound but never used, so they are
    # discarded explicitly here.
    _, X_train_scaled_data, _ = split_scale.standard_scaler(X_train)
    _, y_train_scaled_data, _ = split_scale.standard_scaler(y_train)
    f_selector = SelectKBest(f_regression, k=k)
    # y is a single-column DataFrame; sklearn may warn about shape — TODO
    # confirm whether .values.ravel() is expected upstream.
    f_selector.fit(X_train_scaled_data, y_train_scaled_data)
    f_support = f_selector.get_support()
    return X_train_scaled_data.loc[:, f_support].columns.tolist()
def split_scale_df(df):
    """Split *df* 80/20 with a fixed seed, standard-scale both splits,
    carve out X/y frames for the ``tax_value`` target, and fit an OLS model.

    Returns (train, test, X_train, y_train, X_test, y_test, ols_model).
    NOTE(review): ``train``/``test`` are rebound to their *scaled* versions
    before X/y are extracted, so every returned frame is scaled — confirm
    that is intended.
    """
    train, test = split_scale.split_my_data(df, train_ratio=.8, seed=123)
    # Rebinds train/test to the scaled frames; the unscaled splits are lost.
    scaler, train, test = split_scale.standard_scaler(train, test)
    X_train = train.drop(columns='tax_value')
    y_train = train[['tax_value']]
    X_test = test.drop(columns='tax_value')
    y_test = test[['tax_value']]
    # NOTE(review): the formula string 'y_train ~ X_train' makes patsy resolve
    # the local *variables* y_train/X_train from the enclosing scope rather
    # than columns of ``train`` — this looks accidental; verify.
    ols_model = ols('y_train ~ X_train', data=train).fit()
    # NOTE(review): predict() is handed y_train (the target) instead of
    # exogenous X data — suspected bug; confirm against callers.
    train['yhat'] = ols_model.predict(y_train)
    return train, test, X_train, y_train, X_test, y_test, ols_model
data = data.set_index(data.id) #sns.pairplot(data=data) train, test = split_scale.split_my_data(data) X_train = train.drop(columns=["id", "taxvaluedollarcnt"]) y_train = pd.DataFrame([train.taxvaluedollarcnt]) y_train = y_train.transpose() X_test = test.drop(columns=["id", "taxvaluedollarcnt"]) y_test = pd.DataFrame([test.taxvaluedollarcnt]) y_test = y_test.transpose() X_train_scaled = split_scale.standard_scaler(X_train) #sns.heatmap(data.corr(), cmap='Blues', annot=True) predictions = pd.DataFrame({ 'actual': y_train.taxvaluedollarcnt }).reset_index(drop=True) # model 1 using square feet only lm1 = LinearRegression() lm1.fit(X_train_scaled[['calculatedfinishedsquarefeet']], y_train) lm1_predictions = lm1.predict(X_train_scaled[['calculatedfinishedsquarefeet']]) predictions['lm1'] = lm1_predictions # model 2 using square feet and bedroom count lm2 = LinearRegression() lm2.fit(X_train_scaled[['calculatedfinishedsquarefeet', 'bedroomcnt']],
# Telco feature-engineering prep: acquire, reduce to modeling columns,
# split, scale, and assemble X/y frames (plus a dummy 'filler' column).
df = wrangle.wrangle_telco()
df.dtypes
df = df[['monthly_charges', 'tenure', 'total_charges']]
sns.pairplot(data=df)
train, test = split_scale.split_my_data(df)  # split into train and test
# For feature engineering methods, we want to use the scaled data:
# scale the data using standard scaler
# NOTE(review): the scaler receives the *full* df rather than the train/test
# splits above — this leaks test information into the scaling and the
# "scaled" frames are not aligned with train/test; confirm intended.
scaler, train_scaled_data, test_scaled_data = \
    split_scale.standard_scaler(df)
# 'filler' is a constant dummy column (-1) added to every frame — presumably
# to satisfy a downstream column-count expectation; verify against caller.
train_scaled_data['filler'] = -1
test_scaled_data['filler'] = -1
test['filler'] = -1
train['filler'] = -1
X_train = train[['monthly_charges', 'tenure', 'filler']]
y_train = train[['total_charges']]
X_train_scaled_data = train_scaled_data[[
    'monthly_charges', 'tenure', 'filler'
]]
y_train_scaled_data = train_scaled_data[['total_charges']]
# to return to original values
# Acquire and prep data df = wrangle.wrangle_telco() df.head() df.info() df.drop(columns=['customer_id'], inplace=True) df.head() # Explore data sns.pairplot(data=df) # split data train, test = split_scale.split_my_data(data=df, train_ratio=.80, seed=123) # Scale scaler, train_scaled, test_scaled = split_scale.standard_scaler(train, test) # Seperate into X and y dataframes X_train = train.drop(columns=['total_charges']) y_train = train[['total_charges']] X_test = test.drop(columns=['total_charges']) y_test = test[['total_charges']] X_train_scaled = train_scaled.drop(columns=['total_charges']) y_train_scaled = train_scaled[['total_charges']] X_test_scaled = test_scaled.drop(columns=['total_charges']) y_test_scaled = test_scaled[['total_charges']]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
import env
import wrangle
import split_scale

# 1. Write a function, plot_variable_pairs(dataframe) that plots all of the
# pairwise relationships along with the regression line for each pair.

# NOTE(review): standard_scaler() is called with no arguments and its return
# value is discarded; the train_x_scaled_data / test_x_scaled_data /
# train_y_scaled_data / test_y_scaled_data names used below are otherwise
# undefined in this chunk — this block cannot run as written; confirm the
# intended call and unpacking.
split_scale.standard_scaler()
df_train_x = pd.DataFrame(train_x_scaled_data)
df_train_y = pd.DataFrame(train_y_scaled_data)
df_test_x = pd.DataFrame(test_x_scaled_data)
df_test_y = pd.DataFrame(test_y_scaled_data)
# Attach the target as an extra column so PairGrid plots it against features.
df_train_x['target'] = df_train_y
df_test_x['target'] = df_test_y
df_train_x.head()
df_test_x.head()
# BUG FIX: DataFrame.rename returns a *new* frame; the original code discarded
# the result, leaving the numeric column labels in place. Assign it back.
df_train_x = df_train_x.rename(columns={0: "monthly_charges", 1: "tenure"})
df_test_x = df_test_x.rename(columns={0: "monthly_charges", 1: "tenure"})
g = sns.PairGrid(df_train_x)
g.map_diag(plt.hist)
# Scaler walkthrough: split telco X/y, standard-scale, and invert the scaling.
col_names = ['customer_id', 'tenure', 'monthly_charges', 'total_charges']
# NOTE(review): despite the name, this is just an unscaled copy of df at this
# point; the actual scaled frames are produced further down.
X_train_scaled = df.copy()
X = df[['tenure', 'monthly_charges']]
y = df[['total_charges']]
# NOTE(review): named train_pct but its meaning (train vs test share) depends
# on split_my_data's signature — confirm .25 is the intended split.
train_pct = .25
X_train, X_test, y_train, y_test = split_my_data(X, y, train_pct)
# Sanity checks: X and y splits must stay aligned row-wise.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]
# ### Standard Scaler
# $$x'={\frac {x-{\bar {x}}}{\sigma }}$$
# NOTE(review): this rebinding shadows the standard_scaler *function* with the
# returned scaler object for the rest of the module.
X_train_standard_scaled, X_test_standard_scaled, standard_scaler = standard_scaler(
    X_train, X_test)
X_train_standard_scaled, X_test_standard_scaled
X_train_standard_scaled.head()
# ### Standard Scale Inverse
X_train_standard_unscaled, X_test_standard_unscaled = scale_inverse(
    X_train_standard_scaled, X_test_standard_scaled, standard_scaler)
X_train_standard_unscaled.head()
# ### Uniform Scaler
#
# - It smooths out unusual distributions, and it spreads out the most frequent values and reduces the impact of (marginal) outliers →∴ a robust preprocessing scheme.
# NOTE(review): stray statement — the tail of a function truncated above this
# chunk; left untouched (it is a syntax error at module level as-is).
return f_feature


# NOTE(review): this call passes three arguments, but the select_kbest_freg
# defined below only takes two — one of the two is from a different revision.
select_kbest_freg(x_train, y_train, 2)
#print(str(len(f_feature)), 'selected features')
#print(f_feature)
# Correlation heatmap of the training features.
plt.figure(figsize=(6, 5))
cor = x_train.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()
# ## Write a function, select_kbest_freg() that takes X_train, y_train (scaled) and k as input and returns a list of the top k features.
train_x_scaled_data, test_x_scaled_data, scaler_x_train, scaler_x_test = split_scale.standard_scaler(
    x_train, x_test)


def select_kbest_freg(train_x_scaled_data, k):
    """Return the top-k feature names by f_regression F-score.

    NOTE(review): the train_x_scaled_data parameter is ignored — fit() uses
    the module-level x_train / y_train instead; confirm which was intended.
    """
    # NOTE(review): recent sklearn makes k keyword-only; positional k here
    # raises TypeError on modern versions (use k=k).
    f_selector = SelectKBest(f_regression, k)
    f_selector.fit(x_train, y_train)
    f_support = f_selector.get_support()
    f_feature = x_train.loc[:, f_support].columns.tolist()
    return f_feature


select_kbest_freg(train_x_scaled_data, 2)
# Write a function, select_kbest_freg_unscaled() that takes X_train, y_train and k as input (X_train and y_train should not be scaled!) and returns a list of the top k features. X_train = train.drop(columns = "total_charges") y_train = train["total_charges"] X_test = test.drop(columns = "total_charges") y_test = test["total_charges"] def select_kbest_freg_unscaled(X_train, y_train, k): f_selector = SelectKBest(f_regression, k=k).fit(X_train, y_train) f_support = f_selector.get_support() f_feature = X_train.loc[:,f_support].columns.tolist() return f_feature # Write a function, select_kbest_freg_scaled() that takes X_train, y_train (scaled) and k as input and returns a list of the top k features. X_train_scaled, X_test_scaled, scaler = split_scale.standard_scaler(X_train, X_test) def select_kbest_freg_scaled(X_train, y_train, k): f_selector = SelectKBest(f_regression, k=k).fit(X_train, y_train) f_support = f_selector.get_support() f_feature = X_train.loc[:,f_support].columns.tolist() return f_feature # Write a function, ols_backware_elimination() that takes X_train and y_train (scaled) as input and returns selected features based on the ols backwards elimination method. def ols_backware_elimination(X_train, y_train): ols_model = sm.OLS(y_train, X_train) fit = ols_model.fit() cols = list(X_train.columns) while (len(cols)>0): X_1 = X_train[cols]
def select_kbest_freg_scaled(X_train, y_train, k):
    """Standard-scale X_train/y_train, then return the top-k feature names
    by f_regression F-score.

    BUG FIX: the original body ended with
    ``return select_kbest_freg_scaled(x_scale, y_scale, k)`` — unconditional
    self-recursion. Re-scaling already-standardized data is a fixed point,
    so the recursion could never terminate and raised RecursionError. The
    scaled frames should be handed to the *unscaled* selector instead,
    matching how the pair of functions is used in the __main__ block.

    Parameters
    ----------
    X_train : pandas.DataFrame of unscaled features.
    y_train : pandas.DataFrame with the unscaled target.
    k : int, number of features to keep.

    Returns
    -------
    list[str] — names of the k selected features.
    """
    # ss.standard_scaler(train, test) returns (scaler, train_scaled,
    # test_scaled); index [1] keeps only the scaled training frame.
    x_scale = ss.standard_scaler(X_train, X_train)[1]
    y_scale = ss.standard_scaler(y_train, y_train)[1]
    return select_kbest_freg_unscaled(x_scale, y_scale, k)
# NOTE(review): stray statement — the tail of a function truncated above this
# chunk; left untouched (it is a syntax error at module level as-is).
return number_of_features


def top_n_features(X_train, y_train, n, model):
    """Return the n feature names selected by RFE for the given estimator.

    Side effect: fits *model* on the RFE-reduced training data.
    """
    cols = X_train.columns
    # NOTE(review): recent sklearn requires the keyword form
    # RFE(model, n_features_to_select=n); positional n breaks there.
    rfe = RFE(model, n)
    X_rfe = rfe.fit_transform(X_train, y_train)
    model.fit(X_rfe, y_train)
    features = list(cols[rfe.support_])
    return features


if __name__ == '__main__':
    seed = 43
    telco = wrangle.wrangle_telco()
    train, test = ss.split_my_data(telco, .8, seed)
    # Index train-side frames by customer_id; note the test-side frames are
    # NOT re-indexed the same way — confirm that asymmetry is intended.
    X_train = train.drop(columns='total_charges').set_index('customer_id')
    y_train = train[['customer_id', 'total_charges']].set_index('customer_id')
    X_test = test.drop(columns='total_charges')
    y_test = test[['total_charges']]
    select_kbest_freg_unscaled(X_train, y_train, 1)
    # Scaling a split against itself: [1] keeps the scaled "train" frame.
    x_scale = ss.standard_scaler(X_train, X_train)[1]
    y_scale = ss.standard_scaler(y_train, y_train)[1]
    select_kbest_freg_scaled(X_test, y_test, 1)
    ols_backward_elimination(x_scale, y_scale)
    lasso_cv_coef(x_scale, y_train)
    n = optimal_feature_n(X_train, y_train)
    # NOTE(review): top_n_features requires a 4th ``model`` argument — this
    # call raises TypeError as written; confirm which estimator was intended.
    top_n_features(X_train, y_train, n)