def feature_engineering(x_input):
    """Median-impute Age/Fare and one-hot encode Sex/Embarked on a copy of *x_input*.

    The fitted transformers are stored in the module-level globals
    MEDIAN_IMPUTER and OHCE so the same learned parameters can be reused
    later. Returns the transformed DataFrame.
    """
    print("\n*****FUNCTION feature_engineering*****")

    frame = x_input.copy(deep=True)

    global MEDIAN_IMPUTER
    global OHCE

    # Learn the per-column medians for Age and Fare, then fill the gaps.
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(imputation_method='median',
                                           variables=['Age', 'Fare'])
    MEDIAN_IMPUTER.fit(frame)
    frame = MEDIAN_IMPUTER.transform(frame)
    print(MEDIAN_IMPUTER.imputer_dict_)

    # One-hot encode Sex and Embarked, dropping the last level of each
    # to avoid the redundant dummy column.
    OHCE = ce.OneHotCategoricalEncoder(variables=['Sex', 'Embarked'],
                                       drop_last=True)
    OHCE.fit(frame)
    frame = OHCE.transform(frame)
    print(OHCE.encoder_dict_)

    # Transformed df - no nulls expected after imputation.
    print("Null Count after Missing Data Imputation:")
    print(frame.isnull().sum())

    # Transformed df - dummy vars created.
    print("Dummy Variables after OHE:")
    display(frame.head())

    return frame
def impute_missing_values(self, data, cols_with_missing_values):
    """
    Method Name: impute_missing_values
    Description: This method replaces all the missing values in the Dataframe
                 using a frequent-category imputer for categorical columns and
                 a median imputer for numerical columns.
    Output: A Dataframe which has all the missing values imputed.
    On Failure: Raise Exception (chained to the original error).
    """
    self.logger_object.log(self.file_object,
                           'Entered the impute_missing_values method of the Preprocessor class')
    self.data = data
    self.cols_with_missing_values = cols_with_missing_values
    try:
        for col in self.cols_with_missing_values:
            if self.data[col].dtypes == 'O':
                # Object dtype -> categorical: fill with the most frequent value.
                self.imputer_s = mdi.CategoricalVariableImputer(
                    imputation_method='frequent', variables=[col])
                self.data[col] = self.imputer_s.fit_transform(self.data[[col]])
            else:
                # Numerical column: fill with the column median.
                self.imputer_n = mdi.MeanMedianImputer(
                    imputation_method='median', variables=[col])
                self.data[col] = self.imputer_n.fit_transform(self.data[[col]])
        self.logger_object.log(self.file_object,
                               'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
        return self.data
    except Exception as e:
        self.logger_object.log(self.file_object,
                               'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e))
        self.logger_object.log(self.file_object,
                               'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
        # Bug fix: the original `raise Exception()` discarded the underlying
        # error. Keep the Exception type (callers catch it) but carry the
        # message and chain the cause for debuggability.
        raise Exception(
            'Imputing missing values failed in impute_missing_values: '
            + str(e)) from e
def imputeYearofRecord(self, traindata, testdata):
    """Median-impute 'Year of Record' (fitted on train only) and sqrt-transform it.

    Returns the (traindata, testdata) pair with the column imputed and
    square-root transformed in both splits.
    """
    yor_imputer = mdi.MeanMedianImputer(imputation_method='median',
                                        variables=['Year of Record'])
    # Fit on the training split only so the test split gets the same median
    # (no leakage from test data).
    yor_imputer.fit(traindata)
    print("Replacing NaNs with Median in Column Year of Record: ",
          yor_imputer.imputer_dict_)

    traindata = yor_imputer.transform(traindata)
    testdata = yor_imputer.transform(testdata)

    # Square-root transform of the imputed column on both splits.
    for split in (traindata, testdata):
        split['Year of Record'] = split['Year of Record'] ** (1 / 2)

    return traindata, testdata
def feature_engineering(x_train_input, x_test_input):
    """Median-impute TotalCharges and one-hot encode the categorical columns.

    Both transformers are fitted on the training split only and then applied
    to train and test; they are kept in the module-level globals
    MEDIAN_IMPUTER / OHCE for later reuse. Returns (train, test) copies.
    """
    print("\n*****FUNCTION feature_engineering*****")

    train = x_train_input.copy(deep=True)
    test = x_test_input.copy(deep=True)

    global MEDIAN_IMPUTER
    global OHCE

    # Median imputation for TotalCharges: fit on train, apply to both splits.
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(imputation_method='median',
                                           variables=['TotalCharges'])
    MEDIAN_IMPUTER.fit(train)
    train = MEDIAN_IMPUTER.transform(train)
    print(MEDIAN_IMPUTER.imputer_dict_)
    test = MEDIAN_IMPUTER.transform(test)

    # Null check after imputation (result intentionally not printed,
    # matching the original behaviour).
    train.isnull().sum()

    # One-hot encode the categorical variables, dropping the last level of
    # each to avoid the redundant dummy column.
    categorical_columns = [
        'Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod',
    ]
    OHCE = ce.OneHotCategoricalEncoder(variables=categorical_columns,
                                       drop_last=True)
    OHCE.fit(train)
    train = OHCE.transform(train)
    print(OHCE.encoder_dict_)
    test = OHCE.transform(test)

    # Show the dummy variables created on each split.
    print(train.head())
    print(test.head())

    return (train, test)
def feature_engineering(x_train_input, x_test_input):
    """Median-impute Age/Fare and one-hot encode Sex/Embarked/Pclass.

    Both transformers are fitted on the training split only and then applied
    to train and test; they are kept in the module-level globals
    MEDIAN_IMPUTER / OHCE for later reuse. Returns (train, test) copies.
    """
    print("\n*****FUNCTION feature_engineering*****")

    train = x_train_input.copy(deep=True)
    test = x_test_input.copy(deep=True)

    global MEDIAN_IMPUTER
    global OHCE

    # Median imputation for Age and Fare: fit on train, apply to both splits.
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(imputation_method='median',
                                           variables=['Age', 'Fare'])
    MEDIAN_IMPUTER.fit(train)
    train = MEDIAN_IMPUTER.transform(train)
    print(MEDIAN_IMPUTER.imputer_dict_)
    test = MEDIAN_IMPUTER.transform(test)

    # One-hot encode the categorical variables, dropping the last level of
    # each to avoid the redundant dummy column.
    OHCE = ce.OneHotCategoricalEncoder(variables=['Sex', 'Embarked', 'Pclass'],
                                       drop_last=True)
    OHCE.fit(train)
    train = OHCE.transform(train)
    print(OHCE.encoder_dict_)
    test = OHCE.transform(test)

    # Show the dummy variables created on each split.
    print(train.head())
    print(test.head())

    return (train, test)
def feature_engineering(df_input):
    """Visualise class balance and distributions, then impute and encode.

    Plots a pie chart of 'Survived', histograms of Age and Fare, and runs a
    pipeline that median-imputes Age/Fare and one-hot encodes Sex and
    Embarked. Null counts and previews are printed before and after.
    Returns the transformed copy of *df_input*.
    """
    print("\n*****FUNCTION feature_engineering\n")
    df = df_input.copy(deep=True)

    # Class-balance check for the pie chart.
    df['Survived'].value_counts(sort=True)  # NOTE: result unused; kept as-is

    # Plotting parameters.
    plt.figure(figsize=(5, 5))
    sizes = df['Survived'].value_counts(sort=True)
    colors = ["grey", 'purple']
    labels = ['No', 'Yes']

    # Pie chart of the target distribution.
    plt.pie(
        sizes,
        colors=colors,
        labels=labels,
        autopct='%1.1f%%',
        shadow=True,
        startangle=270,
    )
    plt.title('Percentage of Churn in Dataset')
    plt.show()

    # Distributions of the numeric columns to be imputed.
    df['Age'].hist(bins=30)
    plt.show()
    df['Fare'].hist(bins=30)
    plt.show()

    print("\nBEFORE PIPELINE\n")
    disp_null_counts(df)

    # Feature-engineering pipeline: replace NA/NaN with the column median,
    # then one-hot encode Sex (dropping the last level) and Embarked
    # (keeping all levels).
    titanic_pipe = Pipeline([
        ('median_imputer',
         mdi.MeanMedianImputer(imputation_method='median',
                               variables=['Age', 'Fare'])),
        ('ohce1',
         ce.OneHotCategoricalEncoder(variables=['Sex'], drop_last=True)),
        ('ohce2',
         ce.OneHotCategoricalEncoder(variables=['Embarked'], drop_last=False)),
    ])

    print("\nAFTER PIPELINE definition, before pipeline fit&transform\n")
    disp_null_counts(df)
    print("\ndf unique value counts\n", df.nunique(axis=0), sep="")
    print("\n", df.groupby('Age').size().reset_index(name="Age count"), sep="")

    # fit learns the medians/categories; transform applies them. The same
    # fitted pipeline can later transform new data with the learned values.
    titanic_pipe.fit(df)
    df = titanic_pipe.transform(df)

    print("\nafter pipeline fit&transform\n")
    disp_null_counts(df)
    print("\ndf unique value counts\n", df.nunique(axis=0), sep="")
    print("\n", df.groupby('Age').size().reset_index(name="Age count"), sep="")

    # Transformed df - no nulls expected after imputation.
    print("\nNulls after transformation\n", df.isnull().sum(), sep="")

    # Transformed df - dummy vars created.
    print("\ndf head after pipeline transform (ohe and median)\n",
          df.head(), sep="")

    return df
'Census_IsVirtualDevice', 'Census_IsTouchEnabled', 'Census_IsPenCapable', 'Census_IsAlwaysOnAlwaysConnectedCapable', 'Wdft_IsGamer', 'Wdft_RegionIdentifier'] """ testset = all_dataset[all_dataset.HasDetections == -1] trainset = all_dataset[all_dataset.HasDetections != -1] # seperate original train and test sets using -1 sentinel value del all_dataset gc.collect() cat_imputer = missing_data_imputers.FrequentCategoryImputer(variables=dropped_machine_id_cat_names) # missing value imputer for categorical values (the most frequent value) cont_imputer = missing_data_imputers.MeanMedianImputer(variables=dropped_targets_cont_names) # missing value imputer for numeric values (median value) cat_imputer.fit(trainset) trainset = cat_imputer.transform(trainset) # cat_imputer fit and transform for train set print(trainset.head()) testset = cat_imputer.transform(testset) # cat_imputer (only) transform for test set print(testset.head()) rare_cat_enc = categorical_encoders.RareLabelCategoricalEncoder(variables=dropped_machine_id_cat_names) rare_cat_enc.fit(trainset) trainset = rare_cat_enc.transform(trainset) # find rare (using a threshold value) categorical values from train set and replace these values with 'rare', this technique is useful to prevent overfitting because of CountFrequencyCategoricalEncoder testset = rare_cat_enc.transform(testset) # rare_cat_enc (only) transform for test set cf_cat_enc = categorical_encoders.CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=dropped_machine_id_cat_names) cf_cat_enc.fit(trainset)
# Preprocessing pipeline: build, fit on the training frame, and preview.
# All categorical columns except the target get the categorical treatments.
object_cols_x_target = [c for c in object_cols if c != target]
# Dates and the free-text address column are dropped at the end.
cols_to_drop = datetime_cols + ['address']
p = pipe([
    # Add missing indicators for numeric features
    ("num_nan_ind", mdi.AddMissingIndicator(variables=numeric_cols)),
    # Add missing level for categorical features
    ("fill_cat_nas", mdi.CategoricalVariableImputer(
        fill_value="_MISSING_", variables=object_cols_x_target)),
    # Bin rare levels (below 2% frequency) of categorical variables
    ("rare_cats", ce.RareLabelCategoricalEncoder(
        tol=0.02, replace_with="_RARE_", variables=object_cols_x_target)),
    # Impute missing numeric variables with median
    ("rmmean", mdi.MeanMedianImputer(
        imputation_method='median', variables=numeric_cols)),
    # Drop dates and address columns
    ("drop_date", fs.DropFeatures(features_to_drop=cols_to_drop))])

# In[13]:

# Fit the pipeline on the training data and transform it in one step.
df_train_prepped = p.fit_transform(df_train)

# In[14]:

# Preview the prepared training frame.
df_train_prepped.head()
X_train[discrete] = X_train[discrete].astype('O') X_test[discrete] = X_test[discrete].astype('O') from feature_engine import variable_transformers as vt house_pipe = Pipeline([ # missing data imputation - section 4 ('missing_ind', mdi.AddNaNBinaryImputer( variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])), ('imputer_num', mdi.MeanMedianImputer( imputation_method='median', variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])), ('imputer_cat', mdi.CategoricalVariableImputer(variables=categorical)), # categorical encoding - section 6 ('rare_label_enc', ce.RareLabelCategoricalEncoder(tol=0.05, n_categories=6, variables=categorical + discrete)), ('categorical_enc', ce.OrdinalCategoricalEncoder(encoding_method='ordered', variables=categorical + discrete)), # discretisation + encoding - section 8