def prep_titanic():
    '''
    Load the titanic data through get_titanic_data() and return a single
    prepared DataFrame (no train/validate/test split happens here).

    Rows without an `embarked` value are removed, `sex` and `embarked` are
    replaced by dummy columns (first level dropped), several unused columns
    are dropped, and null `age` values are filled with the mean age.
    '''
    raw = get_titanic_data()

    # keep only the passengers that have an embarked value
    known_port = raw[raw.embarked.notnull()]

    # one-hot encode the two categorical columns and append them to the frame
    encoded = pd.get_dummies(known_port[['sex', 'embarked']], drop_first=True)
    combined = pd.concat([known_port, encoded], axis=1)

    # remove the source columns of the dummies plus the other unused columns
    unwanted = [
        'passenger_id', 'deck', 'sex', 'embarked', 'class', 'embark_town'
    ]
    combined = combined.drop(columns=unwanted)

    # mean-impute age; the imputer is fitted on the whole frame
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    fitted = mean_imputer.fit(combined[['age']])
    combined[['age']] = fitted.transform(combined[['age']])

    return combined
def prep_titanic(cached=True):
    '''
    Load the titanic data with get_titanic_data(cached), prepare it, and
    return the train, validate, and test DataFrames.

    `cached` is passed straight through to get_titanic_data().
    '''
    titanic = get_titanic_data(cached)

    # discard rows that have no embarked value
    titanic = titanic.dropna(subset=['embarked'])

    # append dummy columns for embarked (first level dropped)
    embarked_dummies = pd.get_dummies(titanic.embarked, drop_first=True)
    titanic = pd.concat([titanic, embarked_dummies], axis=1)

    # deck is removed entirely
    titanic = titanic.drop(columns='deck')

    # split first, then fill the null ages within the splits
    train, validate, test = titanic_split(titanic)
    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test
def prep_titanic_data(cached=True):
    '''
    Load the titanic data with get_titanic_data(cached), encode and trim it,
    and return the train, validate, and test DataFrames.
    '''
    passengers = get_titanic_data(cached)

    # only rows with an embarked value are kept
    passengers = passengers[passengers.embarked.notnull()]

    # dummy columns for sex and embarked (first level of each dropped)
    dummies = pd.get_dummies(passengers[['sex', 'embarked']], drop_first=True)
    passengers = pd.concat([passengers, dummies], axis=1)

    # drop the encoded originals together with the other unused columns
    to_remove = [
        'deck', 'sex', 'embarked', 'class', 'embark_town', 'passenger_id'
    ]
    passengers = passengers.drop(columns=to_remove)

    # split into train, validate, test, then impute null ages per split
    train, validate, test = titanic_split(passengers)
    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test
def prep_titanic(df=None):
    """
    Return a transformed copy of the titanic dataset for exploratory analysis.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Untransformed titanic data. When omitted, get_titanic_data() is
        called at call time. (Using the call as the default value evaluated
        it once at definition time, and the in-place drop below then mutated
        that shared frame, so a second default call failed on the missing
        `deck` column.)

    Returns
    -------
    pandas.DataFrame
        The non-object columns of `df` without `deck`, followed by dummy
        columns (first level dropped) for `embark_town`, `class` and `sex`.
        The frame passed in is left unmodified.
    """
    if df is None:
        df = get_titanic_data()

    # NOTE: dropping rows with a missing `embarked` value and min-max scaling
    # of `age`/`fare` were disabled in the original code and stay disabled.

    # Throw the deck overboard because there are too many missing values.
    # A non-mutating drop keeps the caller's frame intact.
    df = df.drop(columns=['deck'])

    # Create dummy variables for the categorical columns. Rows with a missing
    # value are kept; get_dummies leaves all of their dummy columns at zero.
    encoded_embarked = pd.get_dummies(df.embark_town, drop_first=True)
    encoded_class = pd.get_dummies(df['class'], drop_first=True)
    encoded_sex = pd.get_dummies(df.sex, drop_first=True)

    # Keep only the non-object columns of the original data.
    df = df.select_dtypes(exclude='O')

    # Add the encoded columns to the dataframe.
    df = pd.concat([df, encoded_embarked, encoded_class, encoded_sex], axis=1)

    return df
def prep_titanic_data(splain=local_settings.splain, **kwargs):
    '''
    prep_titanic_data(splain=local_settings.splain, **kwargs)
    RETURNS: df, encoder, scaler

    Steps performed on the titanic data:
      1. Load it with get_titanic_data(splain=splain).
      2. Drop the deck, embarked and passenger_id columns.
      3. Handle missing embark_town values with simpute().
      4. Encode embark_town with encode_col(), which also returns the encoder.
      5. Scale age and fare with a MinMaxScaler fitted on this data.

    Extra keyword arguments are accepted but not used by this function.
    '''
    titanic = get_titanic_data(splain=splain)

    # columns that are not needed downstream
    titanic.drop(columns=['deck', 'embarked', 'passenger_id'], inplace=True)

    # fill in and then encode the town of embarkation
    titanic = simpute(df=titanic, column='embark_town', splain=splain)
    titanic, encoder = encode_col(df=titanic, col='embark_town')

    # min-max scale the two numeric columns, overwriting them in place
    numeric_cols = ['age', 'fare']
    scaler = MinMaxScaler()
    scaler.fit(titanic[numeric_cols])
    titanic[numeric_cols] = scaler.transform(titanic[numeric_cols])

    return titanic, encoder, scaler
def prep_titanic_data(cached=True):
    '''
    Takes the titanic data, does data prep, and returns the train, validate,
    and test data splits (in that order).
    '''
    # use my acquire function to read data into a df from a csv file
    df = get_titanic_data(cached)

    # drop rows where embarked/embark town are null values
    df = df[~df.embarked.isnull()]

    # encode embarked and sex using dummy columns
    titanic_dummies = pd.get_dummies(df[['sex', 'embarked']], drop_first=True)

    # join dummy columns back to df
    df = pd.concat([df, titanic_dummies], axis=1)

    # drop the encoded originals and the other unused columns
    df = df.drop(columns=[
        'deck', 'sex', 'embarked', 'class', 'embark_town', 'passenger_id'
    ])

    # split data into train, validate, test dfs
    train, validate, test = titanic_split(df)

    # impute mean of age into null values in age column
    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test


#################### Scale Any Data Set ##################

def _append_scaled(frame, scaler, columns, scaled_names):
    '''Return `frame` with the scaler's output for `columns` appended.'''
    scaled = pd.DataFrame(
        scaler.transform(frame[columns]),
        columns=scaled_names,
        index=frame.index,  # keep row alignment with the source frame
    )
    return pd.concat([frame, scaled], axis=1)


def add_scaled_columns(train, validate, test, scaler, columns_to_scale):
    '''
    Fit `scaler` on the train split only and append scaled copies of
    `columns_to_scale` to each split.

    The new columns are named '<column>_scaled'. The input frames are not
    modified; three new frames are returned as (train, validate, test).
    `columns_to_scale` may be any iterable of column names; it is turned into
    a list so that a tuple is not mistaken for a single column key.
    '''
    columns = list(columns_to_scale)
    new_column_names = [c + '_scaled' for c in columns]

    # the scaler only ever learns from the training data
    scaler.fit(train[columns])

    train = _append_scaled(train, scaler, columns, new_column_names)
    validate = _append_scaled(validate, scaler, columns, new_column_names)
    test = _append_scaled(test, scaler, columns, new_column_names)

    return train, validate, test
def prep_titanic():
    """
    Load the titanic data via acquire.get_titanic_data(), remove rows with a
    null `embarked` or `embark_town`, drop four unused columns, and return
    the result of split_data() as (train, test, validate).
    """
    passengers = acquire.get_titanic_data()

    # filter one required column at a time, embarked first
    for required in ('embarked', 'embark_town'):
        passengers = passengers[passengers[required].notnull()]

    passengers = passengers.drop(
        columns=['passenger_id', 'pclass', 'embark_town', 'deck']
    )

    train, test, validate = split_data(passengers)
    return train, test, validate
def titanic_prep(cached=True):
    """
    Load the titanic data via acquire.get_titanic_data(), prepare it, and
    return the train, validate, and test DataFrames.

    NOTE(review): `cached` is accepted but is not forwarded to
    acquire.get_titanic_data() -- confirm whether that is intended.
    """
    titanic = acquire.get_titanic_data()

    # rows with no embarked value are discarded
    titanic = titanic.dropna(subset=['embarked'])

    # append dummy columns for embarked (first level dropped), then drop deck
    embarked_dummies = pd.get_dummies(titanic.embarked, drop_first=True)
    titanic = pd.concat([titanic, embarked_dummies], axis=1).drop(columns='deck')

    # split, then fill the null ages within the splits
    train, validate, test = titanic_split(titanic)
    train, validate, test = impute_mean_age(train, validate, test)
    return train, validate, test
def prep_titanic_exercise():
    """
    Load the titanic data via acquire.get_titanic_data() and return one
    prepared DataFrame.

    Rows with a null `embark_town` are removed, `deck` is dropped, dummy
    columns for every `embarked` level are appended, and a new `impute_age`
    column holds `age` with nulls replaced by the mean age. The original
    `age` and `embarked` columns are kept.
    """
    titanic = acquire.get_titanic_data()
    titanic = titanic[~titanic.embark_town.isnull()]

    # drop() returns a new frame; the result must be assigned or `deck`
    # silently stays in the data
    titanic = titanic.drop(columns=['deck'])

    # one dummy column per embarked level (no level is dropped here)
    titanic_dummies = pd.get_dummies(titanic['embarked'])
    titanic = pd.concat([titanic, titanic_dummies], axis=1)

    # mean-impute age into a separate column
    imputer = SimpleImputer(strategy='mean')
    imputer = imputer.fit(titanic[['age']])
    titanic['impute_age'] = imputer.transform(titanic[['age']])

    return titanic
def prep_titanic():
    """
    Load the titanic data via acquire.get_titanic_data(), encode `embarked`
    and `sex` as dummy columns, drop the unused columns, and return the
    train, validate, and test DataFrames with null ages imputed.
    """
    titanic = acquire.get_titanic_data()

    # rows with no embarked value are discarded
    titanic = titanic[titanic.embarked.notnull()]

    # dummy columns (first level of each dropped) are appended to the frame
    encoded = pd.get_dummies(titanic[['embarked', 'sex']], drop_first=True)
    titanic = pd.concat([titanic, encoded], axis=1)

    # remove the encoded originals and the other unused columns
    unused = ['deck', 'passenger_id', 'sex', 'embarked', 'embark_town', 'class']
    titanic = titanic.drop(columns=unused)

    train, validate, test = titanic_split(titanic)
    train, validate, test = impute_mean_age(train, validate, test)
    return train, validate, test
def prep_titanic():
    """
    Load the titanic data via acquire.get_titanic_data() and return prepared
    (train, test) DataFrames.

    `deck`, `class` and `embark_town` are dropped, null `embarked` values are
    filled with 'S' and the column is cast with astype("|S"). The data is
    split 80/20 with random_state=123, then passed through encode_embarked(),
    scale_age_and_fare() and fillna_age() in that order.
    """
    titanic = acquire.get_titanic_data()
    titanic = titanic.drop(columns=['deck', 'class', 'embark_town'])

    # fill missing ports with 'S', then cast the column
    titanic.embarked = titanic.embarked.fillna('S').astype("|S")

    train, test = sklearn.model_selection.train_test_split(
        titanic, random_state=123, train_size=.8
    )

    # each helper takes and returns the (train, test) pair
    train, test = encode_embarked(train, test)
    train, test = scale_age_and_fare(train, test)
    train, test = fillna_age(train, test)
    return train, test
def prep_titanic():
    """
    Load the titanic data via acquire.get_titanic_data() and return
    (train, test, int_encoder).

    The frame is indexed by `passenger_id`, null `embark_town`/`embarked`
    values become 'Other', and `deck` is dropped. After a stratified 70/30
    split on `survived` (random_state=123), a most-frequent imputer, a
    LabelEncoder for `embarked` (new column `embarked_encoded`) and a
    MinMaxScaler for `age`/`fare` are each fitted on train and applied to
    both splits. `int_encoder` is the fitted LabelEncoder.
    """
    # Acquire titanic dataset
    df_titanic = acquire.get_titanic_data()

    # Make the passenger_id the index of the dataset
    df_titanic.set_index('passenger_id', inplace=True)

    # Fill null ports with the placeholder 'Other'. Assign the result back:
    # an in-place fillna on `df.column` acts on a temporary object and does
    # not update the frame when pandas Copy-on-Write is active.
    df_titanic['embark_town'] = df_titanic['embark_town'].fillna('Other')
    df_titanic['embarked'] = df_titanic['embarked'].fillna('Other')

    # Deck column had 688 null values out of 891 rows. Because the majority
    # of values are empty there is not enough information to use it.
    df_titanic.drop(columns=['deck'], inplace=True)

    # Split dataframe into train, test
    train, test = train_test_split(df_titanic, test_size=.3,
                                   random_state=123,
                                   stratify=df_titanic.survived)
    # work on independent copies so the column assignments below do not
    # write into views of df_titanic
    train = train.copy()
    test = test.copy()

    # Fill np.nan in embarked/embark_town with the most frequent train value.
    # The transformed train values are now stored (the result was previously
    # discarded). After the 'Other' fill above no nulls are expected here.
    port_cols = ['embarked', 'embark_town']
    imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    train[port_cols] = imp_mode.fit_transform(train[port_cols])
    test[port_cols] = imp_mode.transform(test[port_cols])

    # Change categorical values in 'embarked' to numerical values.
    # LabelEncoder works on 1-d input, so pass the Series, not a frame.
    # NOTE(review): transform() fails on labels that only occur in test
    # (e.g. a rare 'Other') -- confirm the split always covers every label.
    int_encoder = LabelEncoder()
    int_encoder.fit(train['embarked'])
    train['embarked_encoded'] = int_encoder.transform(train['embarked'])
    test['embarked_encoded'] = int_encoder.transform(test['embarked'])

    # Scale age and fare using MinMaxScaler fitted on train only
    scaler = MinMaxScaler()
    train[['age', 'fare']] = scaler.fit_transform(train[['age', 'fare']])
    test[['age', 'fare']] = scaler.transform(test[['age', 'fare']])

    return train, test, int_encoder
def prep_titanic():
    """
    Load the titanic data via acquire.get_titanic_data() and return one
    prepared DataFrame.

    Null `embark_town` values become 'Other' and null `embarked` values
    become 'Unknown', `deck` is dropped, `embarked` is label-encoded, and
    `age` and `fare` are each min-max scaled with their own scaler. All
    transformers are fitted on the full frame.
    """
    df = acquire.get_titanic_data()

    # Assign the filled columns back: an in-place fillna on `df.column` acts
    # on a temporary object and does not update the frame when pandas
    # Copy-on-Write is active.
    df['embark_town'] = df['embark_town'].fillna('Other')
    df['embarked'] = df['embarked'].fillna('Unknown')

    df.drop(columns=['deck'], inplace=True)

    encoder = LabelEncoder()
    df['embarked'] = encoder.fit_transform(df['embarked'])

    # fit_transform returns an (n, 1) array; flatten it for a single column
    for column in ('age', 'fare'):
        scaler = MinMaxScaler()
        df[column] = scaler.fit_transform(df[[column]]).ravel()

    return df
def prep_titanic():
    """
    Load the titanic data via acquire.get_titanic_data(), remove rows with a
    null `embarked` or `embark_town`, append dummy columns for `embark_town`
    and `sex` (first level dropped), drop five columns, and return the result
    of split_data() as (train, test, validate).
    """
    passengers = acquire.get_titanic_data()

    # a row must have both port columns populated to be kept
    passengers = passengers.dropna(subset=['embarked', 'embark_town'])

    # prefixed dummy columns: embark_town first, then sex
    town_dummies = pd.get_dummies(passengers[['embark_town']], drop_first=True)
    sex_dummies = pd.get_dummies(passengers[['sex']], drop_first=True)
    passengers = pd.concat([passengers, town_dummies, sex_dummies], axis=1)

    passengers = passengers.drop(
        columns=['passenger_id', 'pclass', 'embarked', 'deck', 'sex']
    )

    train, test, validate = split_data(passengers)
    return train, test, validate
def prep_titanic():
    """
    Load the titanic data via acquire.get_titanic_data() and return one
    prepared DataFrame.

    Null `embark_town` values become 'Other' and null `embarked` values
    become 'Unknown', `deck` is dropped, `embarked` is label-encoded, and
    `fare` and `age` are min-max scaled together. All transformers are
    fitted on the full frame.
    """
    df_titanic = acquire.get_titanic_data()

    # Assign the filled columns back: an in-place fillna on `df.column` acts
    # on a temporary object and does not update the frame when pandas
    # Copy-on-Write is active.
    df_titanic['embark_town'] = df_titanic['embark_town'].fillna('Other')
    df_titanic['embarked'] = df_titanic['embarked'].fillna('Unknown')

    df_titanic.drop('deck', inplace=True, axis=1)

    lab_enc = LabelEncoder()
    lab_enc.fit(df_titanic.embarked)
    df_titanic['embarked'] = lab_enc.transform(df_titanic.embarked)

    # The scaler is fitted on two columns, so its output has two columns and
    # must be written back to both; assigning it to `fare` alone cannot work
    # and left `age` unscaled.
    scaled_cols = ['fare', 'age']
    scaler = MinMaxScaler()
    scaler.fit(df_titanic[scaled_cols])
    df_titanic[scaled_cols] = scaler.transform(df_titanic[scaled_cols])

    return df_titanic


#USE df.nunique()<5 instead of this temp list
# def pick_viable_categories(df):
#     discretes = df.select_dtypes(include='object')
#     temp = []
#     for column in discretes:
#         columnSeriesObj = discretes[column]
#         if len(columnSeriesObj.unique()) < 4:
#             temp.append(columnSeriesObj.name)
#     return temp

# def plot_viable_categories(target, df):
#     x = pick_viable_categories(df)
#     _, ax = plt.subplots(nrows=1, ncols=len(x), figsize=(16,5))
#     average_rate = df.target.mean()
#     for i, feature in enumerate(x):
#         sns.barplot(feature, target, data=df_titanic, ax=ax[i], alpha=.5)
#         ax[i].set_ylabel('average_rate')
#         ax[i].axhline(average_rate, ls='--', color='grey')

# def pick_viable_regressors():
#     regressors = df_titanic.select_dtypes(include=['float64','int64'])
#     temp = []
#     for column in regressors:
#         columnSeriesObj = regressors[column]
#         temp.append(columnSeriesObj.name)
#     return temp
import pandas as pd

from acquire import get_titanic_data
from prepare import prep_titanic_data


def set_features(df, target, *features):
    """
    Split `df` into a feature frame X and a target frame y.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data containing the feature and target columns.
    target : str
        Name of the target column.
    *features : str
        Names of the feature columns. When none are given, the columns
        'pclass', 'age', 'fare', 'sibsp' and 'parch' are used. (The passed
        names were previously ignored in favour of that fixed list.)

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        X holds the feature columns; y is a one-column frame with `target`.
    """
    columns = list(features) or ['pclass', 'age', 'fare', 'sibsp', 'parch']
    X = df[columns]
    y = df[[target]]
    return X, y


# Get and prepare the data
df = prep_titanic_data(get_titanic_data())

# Set the features
features = ['pclass', 'age', 'fare', 'sibsp', 'parch']
target = 'survived'

X, y = set_features(
    df,
    target,
    *features,
)
from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report from sklearn.metrics import confusion_matrix import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns # ignore warnings import warnings warnings.filterwarnings("ignore") from acquire import get_titanic_data from prepare import prepare_titanic_data df = prepare_titanic_data(get_titanic_data()) X = df[['pclass','age','fare','sibsp','parch']] y = df[['survived']] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123) X_train.head() # Create the logistic regression object logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga') # Fit the model to the training data logit.fit(X_train, y_train) print('Coefficient: \n', logit.coef_)
df = acquire.get_iris_data() df = df.drop(columns=['species_id', 'species_id.1']).rename( columns={'species_name': 'species'}) species_dummies = pd.get_dummies(df.species, drop_first=True) df = pd.concat([df, species_dummies], axis=1) return df # In[9]: prepped = iris_prep() prepped.sample(3) # In[26]: titanic = acquire.get_titanic_data() titanic.head() # In[27]: ##handling nulls titanic[titanic.embark_town.isnull()] titanic[titanic.embarked.isnull()] # In[28]: titanic = titanic[~titanic.embarked.isnull()] titanic.info() # In[29]:
def encode_species_col(iris_df):
    """
    Return a copy of `iris_df` with a `species_encode` column that holds the
    label-encoded values of the `species` column.
    """
    from sklearn import preprocessing

    encoder = preprocessing.LabelEncoder()
    encoder.fit(iris_df.species)
    return iris_df.assign(species_encode=encoder.transform(iris_df.species))


def prep_iris(iris_df):
    """
    Run `iris_df` through drop_columns, rename_columns and
    encode_species_col, in that order, and return the result.
    """
    return (
        iris_df
        .pipe(drop_columns)
        .pipe(rename_columns)
        .pipe(encode_species_col)
    )


# 2. Titanic Data

# Use the function you defined in acquire.py to load the titanic data set.
from acquire import get_titanic_data

titanic_df = get_titanic_data()
# print(titanic_df)

# Write the code to perform the operations below. (Do this yourself, don't copy from the curriculum.)

# a. Handle the missing values in the embark_town and embarked columns.
# The filled column is assigned back: an in-place fillna on `df.column` acts
# on a temporary object and does not update the frame when pandas
# Copy-on-Write is active.
# print(titanic_df['embark_town'].unique())
titanic_df['embark_town'] = titanic_df['embark_town'].fillna(value='Unknown')
# print(titanic_df)

# print(titanic_df['embarked'].unique())
titanic_df['embarked'] = titanic_df['embarked'].fillna(value='Unknown')

# b. Remove the deck column.
titanic_df = titanic_df.drop(['deck'], axis=1)
# print(titanic_df)
import pandas as pd import numpy as np %matplotlib inline import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split import acquire as a import prepare as p df = a.get_titanic_data() df = p.prep_titanic_data(df) def loopy_graphs(df, target): features = list(df.columns[(df.dtypes == object) | (df.nunique()<5)]) pop_rate = df[target].mean() for i, feature in enumerate(features): sns.barplot(feature,target,data=df,alpha=.6) plt.show() def plot_violin(features, target, df): for descrete in df[features].select_dtypes([object,int]).columns.tolist(): if df[descrete].nunique() <= 5: