def clean_telco(cached=False): df = get_telco_data() # use my aquire function to read data into a df from a csv file # drop duplicates df.drop_duplicates(inplace=True) # drop and rename columns df = df.drop(columns=[ 'payment_type_id', 'contract_type_id.1', 'contract_type_id', 'internet_service_type_id' ]) df['years_tenure'] = df.tenure / 12 df['has_streaming'] = df["streaming_tv" or "streaming_movies"] == 'Yes' df['is_family'] = df["partner" or "dependents"] == 'Yes' df['has_phones'] = df['phone_service' or 'multiple_lines'] == 'Yes' df['has_security_features'] = df['online_security' or 'online_backup'] == 'Yes' df['years_tenure'] = df.tenure / 12 # create dummy columns for churn telco_dummies = pd.get_dummies(df.churn, drop_first=True) # add dummy columns to df df = pd.concat([df, telco_dummies], axis=1) # rename dummy columns df = df.rename(columns={'Yes': 'is_churn'}) return df
def prep_telco(cached=True): df = get_telco_data(cached) df = df.drop(columns=[ 'customer_id', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'internet_service_type_id', 'contract_type_id', 'payment_type_id', 'partner', 'dependents', 'multiple_lines', 'streaming_tv', 'streaming_movies', 'total_charges' ]) df_dummies = pd.get_dummies(df.contract_type) df = pd.concat([df, df_dummies], axis=1) df = df.drop(columns='contract_type') df_dummies = pd.get_dummies(df.internet_service_type) df = pd.concat([df, df_dummies], axis=1) df = df.drop(columns='internet_service_type') df_dummies = pd.get_dummies(df.payment_type) df = pd.concat([df, df_dummies], axis=1) df = df.drop(columns='payment_type') df.loc[df['phone_service'] == 'No', 'phone_service'] = 0 df.loc[df['phone_service'] == 'Yes', 'phone_service'] = 1 df.loc[df['paperless_billing'] == 'No', 'paperless_billing'] = 0 df.loc[df['paperless_billing'] == 'Yes', 'paperless_billing'] = 1 df.loc[df['churn'] == 'No', 'churn'] = 0 df.loc[df['churn'] == 'Yes', 'churn'] = 1 df_dummies = pd.get_dummies(df.gender) df = pd.concat([df, df_dummies], axis=1) df = df.drop(columns='gender') df['phone_service'] = df.phone_service.astype('int') df['paperless_billing'] = df.paperless_billing.astype('int') df['churn'] = df.churn.astype('int') return df
def clean_telco(): ''' clean_telco will take in the telco_churn data as df, replace "No service" string with "No", convert categorical variable into dummy/indicator variables, drop columns with duplicate information as well as statistically invalid columns, normalize the column titles, rename some titles for legibility, convert total_charges to dtype='float64' and fill null values in total_charges with 0 since the customers tenure was 0 and have yet to pay their first bill. Finally,this function converts 'Yes' to 1 and 'No' to 0. ''' df = get_telco_data() df.replace('No phone service', 'No', inplace=True) df.replace('No internet service', 'No', inplace=True) dummy_df = pd.get_dummies(df[['gender', 'contract_type', 'payment_type']], drop_first=True) dummy2_df = pd.get_dummies(df[['internet_service_type']], drop_first=False) dropcols = [ 'gender', 'payment_type_id', 'internet_service_type_id', 'contract_type_id', 'internet_service_type_none' ] df = pd.concat([df, dummy_df, dummy2_df], axis=1) df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace( '(', '').str.replace(')', '') df.drop(columns=dropcols, inplace=True) df.columns = [ 'customer_id', 'senior_citizen', 'partner', 'dependents', 'tenure', 'phone_service', 'multiple_lines', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing', 'monthly_charges', 'total_charges', 'churn', 'contract_type', 'internet_service_type', 'payment_type', 'gender_male', 'one_year_contract', 'two_year_contract', 'credit_card_payment', 'e_check_payment', 'mailed_check_payment', 'dsl_internet', 'fiber_optic_internet' ] df.total_charges = pd.to_numeric(df.total_charges, errors='coerce').astype('float64') df.total_charges = df.total_charges.fillna(value=0) df = df.replace("Yes", 1).replace("No", 0) return df
def wrangle_telco(): df = get_telco_data() df['total_charges'] = df.total_charges.where((df.tenure != 0), 0) df['total_charges'] = df.total_charges.astype(float) train_validate, test = train_test_split(df, test_size=.2, random_state=123) train, validate = train_test_split(train_validate, test_size=.3, random_state=123) return train, validate, test
def telco_data_prep(data_split=False): ''' Load telco_data as a pandas DataFrame Clean numeric types and creates dummy variables ''' # Call function to load the telco dataset df = get_telco_data() # Clean and cast total_charges column from type object to float df.total_charges = df.total_charges.str.strip() # New customers that have recently signed up have value of ''. '' == 0 tenure and 0 total charges df.total_charges = df.total_charges.replace( '', 0) # Replace empty strings with 0, np.nan iteration 2 df.total_charges = df.total_charges.astype( "float") # Cast the entire column from type string to float # Drop the observations where customers have not paid their first month with telco df.dropna(inplace=True) # Replace target variable strings('Yes'/'No') with int's(1/0) df.churn = np.where(df.churn == 'Yes', 1, 0) # Clean tenure columns df.rename(columns={'tenure': 'tenure_in_months'}, inplace=True) df['tenure_in_years'] = round(df.tenure_in_months / 12, 2) # Collect the column name where values are categorical/strings/objects encoded_columns = df.nunique()[df.nunique() <= 4].index.to_list( ) # Columns with 4 or less unique values encoded_columns.remove( 'churn' ) # remove churn from this list, we've already converted the values to binary outcomes encoded_attributes = pd.get_dummies( df[encoded_columns], drop_first=True ) # with the remaining object columns, create dummy variables. df = pd.concat( [df, encoded_attributes], axis=1) # concat the original dataframe with the encoded attributes. # encoded_columns.append('customer_id') # append categorical columns with customer_id df.drop( columns=encoded_columns, inplace=True) # drop all columns that are represented by pd.dummies churn = df[['churn']] df.drop(columns='churn', inplace=True) df = pd.concat([df, churn], axis=1) return df
def wrangle_telco(): ''' wrangle_telco will read in our telco data for two year contract customers via a sql query, clean the data down to monthly charges, total charges, tenure, and customer id, replace whitespace values in total charges with zeros where appropriate and then split the data return: train, validate, and test sets of telco data ''' df = clean_telco(acquire.get_telco_data()) return split_data(df)
def wrangle_telco(): df = acquire.get_telco_data() df.tenure.replace(0, 1, inplace=True) df.total_charges = df.total_charges.str.strip() df.total_charges.replace('', df.monthly_charges, inplace=True) df.total_charges = df.total_charges.astype(float) df['automatic_payment'] = ((df['payment_type_id'] == 3) | (df['payment_type_id'] == 4)) scaler = MinMaxScaler() df['monthly_charges_scaled'] = scaler.fit_transform( df['monthly_charges'].values.reshape(-1, 1)) df['tenure_3_or_less'] = df['tenure'] <= 3 return df
def wrangle_telco(): ''' This function acquires/loads `telco_churn` data from a SQL database using SQL and Pandas. Cleans the acquired data and returns a dataframe to be used in EDA and Modeling. ''' df = get_telco_data() df['total_charges'] = df.total_charges.str.strip() df.total_charges.replace("", 0, inplace=True) df['total_charges'] = df.total_charges.astype('float') df = df[['customer_id', 'tenure', 'monthly_charges', 'total_charges']] train, validate, test = split_data(df) return train, validate, test
def wrangle_telco(): ''' Transforms data brought in from SQL to handle nulls in tenure Turn the total_charges column to a float Add calculated fields ''' df = acquire.get_telco_data() df.tenure.replace(0, 1, inplace=True) df.total_charges = df.total_charges.str.strip() df.total_charges.replace('', df.monthly_charges, inplace=True) df.total_charges = df.total_charges.astype(float) df['automatic_payment'] = ((df['payment_type_id'] == 3) | (df['payment_type_id'] == 4)) scaler = MinMaxScaler() df['monthly_charges_scaled'] = scaler.fit_transform(df['monthly_charges'].values.reshape(-1,1)) df['tenure_3_or_less'] = df['tenure']<=3 return df
def prep_telco_data(): # Use acquire file to import telco data df = get_telco_data() # Drop duplicates if there are any df.drop_duplicates(inplace=True) # payment_type_cols = [1, 2, 3, 4] df['is_automatic_payment'] = df.payment_type_id.replace( payment_type_cols, [0, 0, 1, 1]) no_internet = [ 'streaming_movies', 'tech_support', 'streaming_tv', 'online_security', 'online_backup', 'device_protection' ] df[no_internet] = df[no_internet].replace({ 'No internet service': 0, 'No': 1, 'Yes': 2 }) binary_cols = [ 'churn', 'partner', 'dependents', 'phone_service', 'multiple_lines', 'paperless_billing' ] df[binary_cols] = df[binary_cols].replace('Yes', 1).replace('No', 0) df.rename(columns={'gender': 'male'}, inplace=True) df['male'] = df['male'].replace('Male', 1).replace('Female', 0) df['total_charges'] = df.total_charges.where((df.tenure != 0), 0) df = df.astype({'total_charges': 'float64'}) df['phone_lines'] = df['multiple_lines'].replace({ 'No phone service': 0, 'No': 1, 'Yes': 2 }) df.drop(columns='multiple_lines') df.rename(columns={'tenure': 'monthly_tenure'}, inplace=True) df['yearly_tenure'] = round(df.monthly_tenure / 12, 2) df['part_depend'] = df['partner'] + df['dependents'] df = df.drop(df[((df['monthly_tenure'].sort_values() == 0))].index) internet_types = [1, 2, 3] df['fiber'] = df['internet_service_type_id'].replace( internet_types, [0, 1, 0]) train_validate, test = train_test_split(df, test_size=.2, random_state=123) train, validate = train_test_split(train_validate, test_size=.3, random_state=123) return train, validate, test
def run_model(): # Get the Data df = acquire.get_telco_data() # Prepare the Data df = prepare.drop_columns(df) df = prepare.fix_dtypes(df) # Add Features df = features.create_features(df) # Encode DataFrame df = encode.encode_df(df) # Select features to be used in the model cols = ['contract_type', 'tenure', 'monthly_charges', 'payment_type', 'has_internet'] X = df[cols] y = df.churn # Create and fit the model forest = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=123).fit(X, y) # Create a DataFrame to hold predictions results = pd.DataFrame( {'Costumer_ID': df.customer_id, 'Model_Predictions': forest.predict(X), 'Model_Probabilities': forest.predict_proba(X)[:,1] }) # Generate csv results.to_csv('model_results.csv') return results
def model_and_prediction(): # acquire data df = ac.get_telco_data() # prepare data encoder, train, validate, test = pr.prep_telco(df) scaler, train, validate, test = pr.min_max_scaler(train, validate, test) # select desired features X = ['tenure', 'monthly_charges', 'contract_type_id', 'senior_citizen'] y = ['churn'] X_train = train[X] y_train = train[y] X_test = test[X] y_test = test[y] #scale the whole dataset df[["tenure", "monthly_charges" ]] = scaler.transform(df[["tenure", "monthly_charges"]]) # create a model trained from train dataset rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', min_samples_leaf=3, n_estimators=100, max_depth=20, random_state=123) rf.fit(X_train, y_train) # Make prediction of each customer df['predicted'] = rf.predict( df[['tenure', 'monthly_charges', 'contract_type_id', 'senior_citizen']]) #Show prediction of each customer df["probability"] = rf.predict_proba( df[['tenure', 'monthly_charges', 'contract_type_id', 'senior_citizen']])[:, 1] # encode int to string df['predicted'] = df.predicted.apply(lambda x: 'Yes' if x == 1 else 'No') # Select columns for csv file df = df.rename(columns={'churn': 'actual'}) df = df[['customer_id', 'actual', 'predicted', "probability"]] # write dataframe to a csv file df.to_csv('prediction.csv')
def telco_data_prep(): df = get_telco_data() df.total_charges = df.total_charges.str.strip() df.total_charges = df.total_charges.replace('', 0) df.churn.replace(['Yes', 'No'], [1,0], inplace = True) return df
from sklearn.model_selection import train_test_split from sklearn.impute import SimpleImputer from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler # ignore warnings import warnings warnings.filterwarnings("ignore") from acquire import get_telco_data # In[93]: telco=get_telco_data() # In[91]: #Getting just a look at what the data frame looks like by using .head() telco.head() # In[94]: # Describing the numerical data telco.describe()
random_state=123) train, validate = train_test_split(train_and_validate, test_size=.15, random_state=123) return train, test, validate # # Telco Churn Exercise Problem # Throughout the exercises for Regression in Python lessons, you will use the following example scenario: As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer. # - The first step will be to acquire and prep the data. # In[20]: df = acquire.get_telco_data() # In[21]: df.shape # In[22]: df.head() # In[23]: df.dtypes # In[24]:
def prep_telco(): ''' No arguments taken. Returns a df of the telco customer churn data in the Codeup SQL database. ''' # acquire the data from csv (or sql if no csv exists) and assign it as a DataFrame df = get_telco_data() # set the index to be customer_id df = df.set_index('customer_id') # cut down on the complexity of data by combining the variables for add-on packages # into one variable, num_add_ons, that adds up the number of add-on services each customer has # map strings into 1s and 0s, 1 meaning that the customer has that add-on df.online_security = df.online_security.map({ 'Yes': 1, 'No': 0, 'No internet service': 0 }) df.online_backup = df.online_backup.map({ 'Yes': 1, 'No': 0, 'No internet service': 0 }) df.device_protection = df.device_protection.map({ 'Yes': 1, 'No': 0, 'No internet service': 0 }) df.tech_support = df.tech_support.map({ 'Yes': 1, 'No': 0, 'No internet service': 0 }) df.streaming_tv = df.streaming_tv.map({ 'Yes': 1, 'No': 0, 'No internet service': 0 }) df.streaming_movies = df.streaming_movies.map({ 'Yes': 1, 'No': 0, 'No internet service': 0 }) # add and put into new column, num_add_ons (number of add-on services) df['num_add_ons'] = (df.online_security + df.online_backup + df.device_protection + df.tech_support + df.streaming_tv + df.streaming_movies) # drop the add-on columns we don't need anymore. df = df.drop(columns=[ 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies' ]) # encode phone_service and gender df.phone_service = df.phone_service.map({'Yes': 1, 'No': 0}) # make new column is_male and encode df['is_male'] = df.gender.map({'Male': 1, 'Female': 0}) # Now drop gender column df = df.drop(columns=['gender']) # encode partner, dependents, and churn df.partner = df.partner.map({'Yes': 1, 'No': 0}) df.dependents = df.dependents.map({'Yes': 1, 'No': 0}) df.churn = df.churn.map({'Yes': 1, 'No': 0}) # Right now the contract types are map from 1 to 3, so I'm going to re-map it starting from 0. # 0 is Month-to-Month # 1 is Yearly # 2 is Two Year df.contract_type_id = df.contract_type_id.map({1: 0, 2: 1, 3: 2}) # Going to re-map the internet_type_id column in the same way. # 0 is No Internet Service # 1 is DSL # 2 is Fiber Optic df.internet_service_type_id = df.internet_service_type_id.map({ 3: 0, 1: 1, 2: 2 }) # Now to add a column called tenure_yrs that represents tenure in years df['tenure_yrs'] = round((df.tenure / 12), 2) # Rename columns to shorter names for readability and ease of use: # contract_type_id --> contract_type # phone_service --> phone # internet_service_type_id --> internet_type # senior_citizen --> senior # dependents --> depend df = df.rename( columns={ "contract_type_id": "contract_type", "phone_service": "phone", "internet_service_type_id": "internet_type", "senior_citizen": "senior", "dependents": "depend" }) # convert total_charges to a float and drop the rows with null total_charges df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce') df = df[~df.total_charges.isnull()] # Data is now tidy and ready to split into train, validate, and test. return df
def prep_telco_data(cached=True): # use my acquire function to read data into a df from a csv file df = pd.DataFrame(get_telco_data()) #set index as customer_id df = df.set_index('customer_id') #rows full of Nan values rows_to_drop = ([ 6895, 6686, 6615, 6605, 6569, 6143, 2132, 2048, 2029, 1949, 1878 ]) #dropping the Nan values df.drop(df.index[rows_to_drop], inplace=True) #changing data type of total charges to float df.total_charges = pd.to_numeric(df.total_charges) # change device_protection into numeric values df.device_protection = df.device_protection.apply(online_checker) # change tech_suppport into numeric values df.tech_support = df.tech_support.apply(online_checker) # change online security into numeric values df.online_security = df.online_security.apply(online_checker) #change online backup into numeric values df.online_backup = df.online_backup.apply(online_checker) #change streaming tv into numeric values df.streaming_tv = df.streaming_tv.apply(online_checker) #change streaming movies into numeric values df.streaming_movies = df.streaming_movies.apply(online_checker) #change multiple lines into numeric values df.multiple_lines = df.multiple_lines.apply(online_checker) #create a new column with tenure by the year df['tenure_by_year'] = df.tenure / 12 #change multiple_lines into a numeric value df.multiple_lines = df.multiple_lines.apply(online_checker) #create dummy values for gender, churn, and paperless_billing telco_dummies = pd.get_dummies(df[[ 'gender', 'partner', 'dependents', 'phone_service', 'churn', 'paperless_billing' ]], drop_first=True) #create dummy values for internet internet_dummie = pd.get_dummies(df['internet_service_type_id']) #rename the dummy columns internet_dummie = internet_dummie.rename(columns={ 1: 'DSL', 2: 'Fiber', 3: 'No_internet' }) #create dummy variable for payment payment_dummie = pd.get_dummies(df['payment_type_id']) #rename the dummy columns payment_dummie = payment_dummie.rename(columns={ 1: 'echeck', 2: 'mailed_check', 3: 'bank_transfer', 4: 'credit_card' }) #create dummy variable for contract type contract_dummie = pd.get_dummies(df['contract_type_id']) #rename dummy columns contract_dummie = contract_dummie.rename(columns={ 1: 'Month2month', 2: '1year', 3: '2year' }) #add dummy values into the main dataframe df = pd.concat( [df, telco_dummies, payment_dummie, contract_dummie, internet_dummie], axis=1) #list duplicate columns col_to_drop = [ 'payment_type_id', 'gender', 'partner', 'dependents', 'phone_service', 'churn', 'paperless_billing', 'contract_type', 'internet_service_type', 'payment_type' ] #drop duplicate columns df = df.drop(columns=col_to_drop) #split data into train, validate and test subsets train, validate, test = telco_split(df) return train, validate, test
def prep_telco_churn_data(splain=local_settings.splain, **kwargs): ''' prep_telco_churn_data(splain=local_settings.splain, **kwargs) RETURNS: df, encoder, scaler ''' df = get_telco_data(splain=splain) df = df.set_index('customer_id') category_cols=[ 'partner_deps_id', 'partner_deps', 'gender', 'contract_type_id', 'internet_service_type_id', 'internet_service_type', 'contract_type', 'payment_type_id', 'payment_type', 'phone_service_type', 'phone_service_id' ] retype_cols(df=df, cols=category_cols, to_dtype='category') boolean_cols = [ 'is_male', 'is_female', 'senior_citizen', 'partner', 'dependents', 'family', 'thru_first_month', 'thru_first_quarter', 'thru_first_half', 'thru_first_year', 'thru_first_term', 'phone_service', 'multiple_lines', 'internet_service', 'has_dsl', 'has_fiber', 'online_security', 'online_backup', 'online_security_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies', 'streaming_services', 'streaming_dsl', 'streaming_fiber', 'on_contract', 'paperless_billing', 'manual_mtm', 'auto_pay' ] retype_cols(df=df, cols=boolean_cols, to_dtype='bool') string_cols = [ 'partner_deps', 'gender', 'internet_service_type', 'contract_type', 'payment_type', 'phone_service_type', ] #df_dtypes = pd.DataFrame(df.dtypes).rename(columns={0:'dtype'}).reset_index() dfo = set_dfo(dfo_df=df, y_column='churn', splain=True, string_cols=string_cols) return dfo #encoder, scaler