def final_plot():
    """Produce the pair plot and the categorical/continuous plot for telco.

    The wrangled telco frame is indexed by ``customer_id`` and handed to
    ``plot_variable_pairs``. The same frame is then passed through
    ``months_to_years`` and that result is given to
    ``plot_categorical_and_continous_vars``.

    Returns:
        tuple: ``(plot_pairs, plot_category)`` -- whatever the two plotting
        helpers return, in that order.
    """
    telco = wrangle.wrangle_telco().set_index("customer_id")
    pair_result = plot_variable_pairs(telco)
    category_result = plot_categorical_and_continous_vars(months_to_years(telco))
    return pair_result, category_result
def scale_telco_data():
    """Split the wrangled telco data and attach min-max scaled columns.

    The frame from ``wrangle_telco()`` is split with ``telco_split`` and the
    three splits are passed to ``add_scaled_columns`` together with a fresh
    ``MinMaxScaler`` and the list of columns to scale.

    Returns:
        tuple: ``(train, validate, test)`` as returned by
        ``add_scaled_columns``.
    """
    train, validate, test = telco_split(wrangle_telco())
    min_max = sklearn.preprocessing.MinMaxScaler()
    numeric_columns = ['monthly_charges', 'tenure', 'total_charges']
    train, validate, test = add_scaled_columns(
        train, validate, test, min_max, numeric_columns
    )
    return train, validate, test
def scale_wrangle_telco(cached=True):
    '''
    Acquire the wrangled telco data, split it into train, validate and test,
    and append min-max scaled copies of the numeric columns to each split.

    The scaler is fitted on the train split only; the same fitted scaler is
    then used to transform all three splits. The new columns carry the
    original column name plus a '_scaled' suffix.

    cached is forwarded unchanged to wrangle_telco.

    Returns the tuple (train, validate, test).
    '''
    numeric_columns = ['monthly_charges', 'tenure', 'total_charges']
    scaled_names = [name + '_scaled' for name in numeric_columns]

    train, validate, test = telco_split(wrangle_telco(cached))

    # fit on train only so validate/test statistics never leak into scaling
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(train[numeric_columns])

    def with_scaled_columns(frame):
        # transform() yields a bare array, so the original index is
        # re-attached before concatenating side by side
        scaled = pd.DataFrame(
            scaler.transform(frame[numeric_columns]),
            columns=scaled_names,
            index=frame.index,
        )
        return pd.concat([frame, scaled], axis=1)

    train = with_scaled_columns(train)
    validate = with_scaled_columns(validate)
    test = with_scaled_columns(test)
    return train, validate, test
def scale_telco(df):
    '''
    Wrangle the telco dataframe via wrangle.wrangle_telco(), split it into
    three data sets (train, validate, test) and scale every column with
    scikit-learn's MinMaxScaler.

    The scaler is fitted on train only and then applied to all three splits.
    Each returned frame keeps the row index of the split it was built from,
    so scaled rows can still be matched to the unscaled data.

    Returns three DataFrames: train_scaled, validate_scaled, test_scaled.
    '''
    # NOTE(review): the ``df`` argument is immediately overwritten, so whatever
    # the caller passes is ignored. Kept as-is for backward compatibility --
    # confirm whether callers expect their own frame to be used.
    df = wrangle.wrangle_telco()
    train, validate, test = wrangle.train_validate_test_split(df)

    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(train)

    def _scaled_frame(split):
        # transform() returns a plain array; pass the split's own index so the
        # result does not fall back to a fresh 0..n-1 RangeIndex
        return pd.DataFrame(
            scaler.transform(split),
            columns=train.columns,
            index=split.index,
        )

    train_scaled = _scaled_frame(train)
    validate_scaled = _scaled_frame(validate)
    test_scaled = _scaled_frame(test)
    return train_scaled, validate_scaled, test_scaled
def prepare_for_split():
    """Return the feature frame and target series from the wrangled telco data.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the ``monthly_charges`` and
        ``tenure`` columns and ``y`` is the ``total_charges`` column.
    """
    telco = wrangle.wrangle_telco()
    feature_columns = ["monthly_charges", "tenure"]
    return telco[feature_columns], telco.total_charges
# Create a file, explore.py, that contains the following functions for exploring your variables (features & target). import matplotlib.pyplot as plt import pandas as pd import seaborn as sns import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, MinMaxScaler import warnings warnings.filterwarnings('ignore') import env import wrangle import split_scale df = wrangle.wrangle_telco() df.head() train, test = split_my_data_whole(df) train.head(), test.head() type(train) type(test) #1. Write a function, plot_variable_pairs(dataframe) that plots all of the pairwise relationships along with the regression line for each pair. def plot_variable_pairs(df): scaled_train, scaled_test = standard_scaler(train, test) df_plt = sns.jointplot('monthly_charges', 'tenure', data=train, kind='reg')
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from wrangle import wrangle_telco
import env
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

# Module-level data: loaded once at import time.
df = wrangle_telco()
# add this into split function
# X is every column except the id and the target; y is the target kept as a
# one-column DataFrame.
X = df.drop(columns=['customer_id', 'total_charges'])
y = pd.DataFrame(df['total_charges'])


def split_my_data(df, train_pct=.80, random_state=123):
    """Split ``df`` into train and test frames.

    Args:
        df: the frame to split.
        train_pct: passed to ``train_test_split`` as ``train_size``.
        random_state: passed through to ``train_test_split`` so repeated
            calls give the same split.

    Returns:
        tuple: ``(train, test)``.
    """
    train, test = train_test_split(df, train_size=train_pct, random_state=random_state)
    return train, test


def standard_scaler(train, test):
    """Standard-scale ``train`` and ``test`` using a scaler fitted on ``train``.

    Both scaled frames reuse the column names and the index values of the
    frame they were computed from.
    """
    # NOTE(review): no return statement is visible in this excerpt, so as shown
    # the function returns None -- presumably it should return the scaler and
    # both scaled frames; confirm against the full file.
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(train)
    # transform() returns a bare array, so columns and index are re-attached.
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns.values).set_index( [train.index.values])
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns.values).set_index( [test.index.values])
# pandas is used below as ``pd``; importing it here is harmless if an earlier
# part of the file already did so.
import pandas as pd
import seaborn as sns
# Call set_style rather than assigning to it: the previous
# ``sns.set_style = ("whitegrid")`` replaced the seaborn function with a
# string and never applied the style.
sns.set_style("whitegrid")
import statsmodels.api as sm
import wrangle
import split_scale
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.formula.api import ols
import warnings
from sklearn.feature_selection import RFE
warnings.filterwarnings("ignore")

# Module-level data: features and target are both indexed by customer_id and
# split separately with split_scale.split_my_data.
data = wrangle.wrangle_telco()
X = data.drop(columns='total_charges').set_index('customer_id')
y = pd.DataFrame(data.total_charges).set_index(data['customer_id'])
y_train, y_test = split_scale.split_my_data(y)
X_train, X_test = split_scale.split_my_data(X)


# 1.) Write a function, select_kbest_freg() that takes X_train, y_train and k
#     as input (X_train and y_train should not be scaled!) and returns a list
#     of the top k features.
def select_kbest_freg_unscaled(X_train, y_train, k):
    """Select the top ``k`` features of ``X_train`` by univariate F-regression.

    Args:
        X_train: feature DataFrame (unscaled).
        y_train: target values aligned with ``X_train``.
        k: number of features to keep.

    Returns:
        tuple: a 3-tuple of
            * ``(count, 'selected features')`` where ``count`` is the number
              of selected features as a string,
            * the list of selected column names,
            * ``scores_`` from the fitted selector (one score per input
              feature).
    """
    f_selector = SelectKBest(f_regression, k=k).fit(X_train, y_train)
    # boolean mask over the columns of X_train marking the kept features
    f_support = f_selector.get_support()
    f_feature = X_train.loc[:, f_support].columns.tolist()
    return (str(len(f_feature)), 'selected features'), (f_feature), (f_selector.scores_)
#### Feature Engineering for telco_churn data import pandas as pd from wrangle import wrangle_telco from split_scale import split_my_data import features ### SelectKBest - Top Features of Unscaled Data ## Step 1. Load Data telco_df = wrangle_telco() telco_df.head() telco_X = telco_df[["monthly_charges", "tenure"]] telco_y = telco_df["total_charges"] ## Step 2. Split Data to X and y, and test and train = 4 data frames telco_X_train, telco_X_test, telco_y_train, telco_y_test = split_my_data( telco_X, telco_y, 0.80) ## Step 3. Run select_kbest_freg_unscaled f_features = features.selectkbest_optimal_features(telco_X_train, telco_y_train, 2)
#Create split_scale.py that will contain the functions that follow. #Each scaler function should create the object, fit and transform both train and test. #They should return the scaler, train dataframe scaled, test dataframe scaled. # Be sure your indices represent the original indices from train/test, as those represent the indices from the original dataframe. # Be sure to set a random state where applicable for reproducibility! from wrangle import wrangle_telco import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler # For this project we are wrangling data from the telco-churn database. # This function pulls the data and cleans it. customers = wrangle_telco() # We isolate our X and y variables for train_pct = .8 def pull_X_y(train, test, y): X_train = train.drop(columns=y) y_train = train[[y]] X_test = test.drop(columns=y) y_test = test[[y]] return X_train, y_train, X_test, y_test # Function used to split the data. Although we do produce 4 new datasets (X["train", "test"] and y["train","test"]) def split_my_data(X, y, train_pct): X_train, X_test, y_train, y_test = train_test_split(X,
# I need to do this within an average of $5.00 per customer. import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, MinMaxScaler import warnings warnings.filterwarnings("ignore") import env import wrangle as w import split_scale as ss df = w.wrangle_telco() df x = df[['tenure', 'monthly_charges']] y = df[['total_charges']] x_train, x_test, y_train, y_test = ss.split_my_data(x, y, train_pct=.8) # 1. Write a function, select_kbest_freg_unscaled() that takes X_train, y_train and k as input # (X_train and y_train should not be scaled!) and returns a list of the top k features. from sklearn.feature_selection import SelectKBest, f_regression k = 1
import pandas as pd
import numpy as np
import seaborn as sns
import split_scale as ss
from wrangle import wrangle_telco
import matplotlib.pyplot as plt


def plot_variable_pairs(df):
    """Show a grid of all pairwise relationships in ``df``.

    Histograms are drawn on the diagonal and scatter plots with a fitted
    regression line (``sns.regplot``) everywhere else. The figure is shown
    with ``plt.show()``; nothing is returned.
    """
    graph = sns.PairGrid(df)
    graph.map_diag(plt.hist)
    graph.map_offdiag(sns.regplot)
    plt.show()


def months_to_years(tenure_months, df):
    """Add a ``tenure_years`` column holding whole years of tenure.

    Args:
        tenure_months: tenure in months (scalar or Series aligned with
            ``df``); it is floor-divided by 12.
        df: DataFrame that receives the new column. It is modified in place.

    Returns:
        The same ``df`` object, now with ``tenure_years``.
    """
    df['tenure_years'] = tenure_months // 12
    return df


def plot_categorical_and_continuous_vars(categorical_var, continuous_var, df):
    """Plot a continuous variable against a categorical one in three ways.

    Replaces the earlier pseudo-code placeholder (``bar plot / box plot /
    pie chart``), which was not valid Python and stopped the module from
    importing.

    Args:
        categorical_var: name of the categorical column in ``df``.
        continuous_var: name of the continuous column in ``df``.
        df: DataFrame containing both columns.

    The figure contains, left to right: a bar plot of ``continuous_var`` per
    category, a box plot of its distribution per category, and a pie chart
    of how many rows fall into each category. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    fig, (bar_ax, box_ax, pie_ax) = plt.subplots(1, 3, figsize=(18, 5))

    sns.barplot(x=categorical_var, y=continuous_var, data=df, ax=bar_ax)
    sns.boxplot(x=categorical_var, y=continuous_var, data=df, ax=box_ax)

    # The pie shows category frequencies: row counts are always
    # non-negative, which a pie chart requires.
    category_counts = df[categorical_var].value_counts()
    pie_ax.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%')
    pie_ax.set_title(categorical_var)

    plt.show()


if __name__ == '__main__':
    # ``seed`` was previously referenced without being defined.
    seed = 123
    telco = wrangle_telco()
    telco.set_index([telco.customer_id], inplace=True)
    train_telco, test_telco = ss.split_my_data(telco, .7, seed)
    plot_variable_pairs(telco)
    months_to_years(telco['tenure'], telco)
    # The previous call passed no arguments although three are required;
    # use the tenure_years column created just above as the category.
    plot_categorical_and_continuous_vars('tenure_years', 'total_charges', telco)
def get_X_y():
    """Return features and target from the wrangled telco data.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the frame without the
        ``customer_id`` and ``total_charges`` columns and ``y`` is the
        ``total_charges`` column.
    """
    telco = wrangle.wrangle_telco()
    excluded = ['customer_id', 'total_charges']
    return telco.drop(columns=excluded), telco.total_charges
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import wrangle
import env
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Module-level data: wrangled telco frame indexed by customer_id, with the
# two feature columns in X and the target as a one-column DataFrame in y.
df = wrangle.wrangle_telco().set_index("customer_id")
X = df.loc[:, ("tenure", "monthly_charges")]
y = pd.DataFrame(df.total_charges)


# split dataframe into train(train_percent: 80%) & test(20%)
def split_my_data(df):
    """Split ``df`` 80/20 into train and test with a fixed random state.

    Returns:
        tuple: ``(train, test)``.
    """
    train_part, test_part = train_test_split(df, train_size=0.8, random_state=123)
    return train_part, test_part

# split_my_data(df)


# standard
def perform_standard_scaler(train, test):
    """Standard-scale ``train`` and ``test`` with a scaler fitted on ``train``.

    Returns:
        tuple: ``(scaler, train_scaled, test_scaled)``. Each scaled frame
        carries the column names and index values of the frame it was
        computed from.
    """
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(train)

    def to_scaled_frame(frame):
        # transform() yields a bare array; re-attach column names and then
        # the original index values
        scaled = pd.DataFrame(scaler.transform(frame), columns=frame.columns.values)
        return scaled.set_index([frame.index.values])

    train_scaled = to_scaled_frame(train)
    test_scaled = to_scaled_frame(test)
    return scaler, train_scaled, test_scaled
def prepare_telco_for_split():
    """Return the wrangled telco frame without its ``customer_id`` column.

    The column is dropped in place on the frame returned by
    ``wrangle.wrangle_telco()``, and that same frame object is returned.
    """
    telco = wrangle.wrangle_telco()
    telco.drop(columns=["customer_id"], inplace=True)
    return telco
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from env import user, password, host
import wrangle
import split_scale
from statsmodels.formula.api import ols
from math import sqrt
from sklearn.feature_selection import SelectKBest

# Our scenario continues:
# As a customer analyst, I want to know who has spent the most money with us
# over their lifetime. I have monthly charges and tenure, so I think I will be
# able to use those two attributes as features to estimate total_charges. I
# need to do this within an average of $5.00 per customer.

# The result of this call is discarded.
wrangle.wrangle_telco()
# NOTE(review): get_db_url is not imported in the visible part of this file,
# and its return value is discarded -- confirm where it is defined and whether
# this call is needed.
get_db_url(user, host, password, database="telco_churn")
# NOTE(review): wrangle_telco is called unqualified although only the wrangle
# module is imported above -- presumably wrangle.wrangle_telco(); confirm.
telco = wrangle_telco()
# The following bare expressions only display output in an interactive
# session; telco.info() additionally prints a summary.
telco
telco.head()
telco.describe()
telco.info()
telco.dtypes
telco.columns.values

# 80/20 split with a fixed random state, then drop the id column from both
# parts.
train, test = train_test_split(telco, train_size=0.80, random_state=123)
train = train.drop('customer_id', axis=1)
test = test.drop('customer_id', axis=1)
train.head()