Exemplo n.º 1
0
def feature_engineering(x_input):
    """Median-impute Age/Fare and one-hot encode Sex/Embarked.

    Fits the transformers on *x_input* and returns a transformed deep copy.
    The fitted transformers are kept in module-level globals so they can be
    reused later to transform new data with the same learned parameters.

    :param x_input: pandas DataFrame with 'Age', 'Fare', 'Sex', 'Embarked'.
    :return: transformed DataFrame copy.
    """
    print("\n*****FUNCTION feature_engineering*****")

    x = x_input.copy(deep=True)  # never mutate the caller's frame
    global MEDIAN_IMPUTER
    global OHCE

    # Median imputation for the numeric columns with missing values.
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(imputation_method='median',
                                           variables=['Age', 'Fare'])
    MEDIAN_IMPUTER.fit(x)
    x = MEDIAN_IMPUTER.transform(x)
    print(MEDIAN_IMPUTER.imputer_dict_)

    # One-hot encode categoricals; drop the last dummy per variable to
    # avoid perfect collinearity.
    OHCE = ce.OneHotCategoricalEncoder(variables=['Sex', 'Embarked'],
                                       drop_last=True)
    OHCE.fit(x)
    x = OHCE.transform(x)
    print(OHCE.encoder_dict_)

    # Transformed df - No Nulls after imputation
    print("Null Count after Missing Data Imputation:")
    print(x.isnull().sum())

    # Transformed df - dummy vars created
    # BUG FIX: `display` is an IPython/Jupyter-only builtin and raises
    # NameError in a plain Python run; use print() like the sibling
    # feature_engineering implementations in this file.
    print("Dummy Variables after OHE:")
    print(x.head())

    return (x)
    def impute_missing_values(self, data, cols_with_missing_values):
        """Impute missing values column by column.

        Object-dtype (categorical) columns are filled with the most frequent
        value via feature_engine's CategoricalVariableImputer; all other
        columns are filled with the median via MeanMedianImputer.

        :param data: pandas DataFrame to impute (also stored on ``self.data``).
        :param cols_with_missing_values: iterable of column names to impute.
        :return: the DataFrame with the listed columns imputed.
        :raises Exception: a bare ``Exception()`` (original cause is only
            logged, not chained) when any imputation step fails.
        """
        self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
        self.data= data
        self.cols_with_missing_values=cols_with_missing_values
        try:
            for col in self.cols_with_missing_values:
                # dtype 'O' (object) is treated as categorical.
                if self.data[col].dtypes == 'O':
                    self.imputer_s=mdi.CategoricalVariableImputer(imputation_method='frequent',variables=[col])
                    # NOTE(review): fit_transform on data[[col]] returns a
                    # one-column DataFrame which is assigned back to a single
                    # column; this relies on pandas alignment behaviour —
                    # confirm it works with the pandas version in use.
                    self.data[col]=self.imputer_s.fit_transform(self.data[[col]])
                else:
                    self.imputer_n=mdi.MeanMedianImputer(imputation_method='median',variables=[col])
                    self.data[col]=self.imputer_n.fit_transform(self.data[[col]])

            self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.data
        except Exception as e:
            self.logger_object.log(self.file_object,'Exception occured in impute_missing_values method of the Preprocessor class. Exception message:  ' + str(e))
            self.logger_object.log(self.file_object,'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
            raise Exception()
Exemplo n.º 3
0
    def imputeYearofRecord(self, traindata, testdata):
        """Median-impute 'Year of Record' (fit on train only), then sqrt it.

        The imputer learns the median from the training frame alone and is
        applied to both frames; afterwards the column is square-root
        transformed in each. Returns the (traindata, testdata) pair.
        """
        year_imputer = mdi.MeanMedianImputer(imputation_method='median',
                                             variables=['Year of Record'])
        year_imputer.fit(traindata)
        print("Replacing NaNs with Median in Column Year of Record: ",
              year_imputer.imputer_dict_)

        # Apply the fitted imputer and the sqrt transform to both frames.
        frames = [year_imputer.transform(frame)
                  for frame in (traindata, testdata)]
        for frame in frames:
            frame['Year of Record'] = frame['Year of Record'] ** (1 / 2)

        traindata, testdata = frames
        return traindata, testdata
Exemplo n.º 4
0
def feature_engineering(x_train_input, x_test_input):
    """Median-impute TotalCharges and one-hot encode the Telco categoricals.

    Transformers are fitted on the training frame only and stored in
    module-level globals, then applied to deep copies of both frames.

    :param x_train_input: training DataFrame (Telco churn schema).
    :param x_test_input: test DataFrame with the same columns.
    :return: tuple (x_train, x_test) of transformed copies.
    """
    print("\n*****FUNCTION feature_engineering*****")

    x_train = x_train_input.copy(deep=True)
    x_test = x_test_input.copy(deep=True)

    global MEDIAN_IMPUTER
    global OHCE

    # Median Imputation for TotalCharges
    # (an earlier revision of this comment said "Age, Fare" — stale copy-paste)
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(imputation_method='median',
                                           variables=['TotalCharges'])
    # fit,transform x_train
    MEDIAN_IMPUTER.fit(x_train)
    x_train = MEDIAN_IMPUTER.transform(x_train)
    print(MEDIAN_IMPUTER.imputer_dict_)

    # transform x_test (reuses the median learned from x_train)
    x_test = MEDIAN_IMPUTER.transform(x_test)

    # Transformed df - No Nulls after imputation
    # NOTE(review): result is computed but not printed/assigned — no effect.
    x_train.isnull().sum()

    # OHE for Categorical Vars; drop_last avoids perfect collinearity.
    OHCE = ce.OneHotCategoricalEncoder(variables=[
        'Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod'
    ],
                                       drop_last=True)

    # fit,transform x_train
    OHCE.fit(x_train)
    x_train = OHCE.transform(x_train)
    print(OHCE.encoder_dict_)

    # transform x_test (reuses categories learned from x_train)
    x_test = OHCE.transform(x_test)

    # Transformed x_train - dummy vars created
    print(x_train.head())

    # Transformed x_test - dummy vars created
    print(x_test.head())

    return (x_train, x_test)
Exemplo n.º 5
0
def feature_engineering(x_train_input, x_test_input):
    """Median-impute Age/Fare and one-hot encode Sex/Embarked/Pclass.

    Both transformers are fitted on the training frame only, stored in
    module-level globals for later reuse, and applied to deep copies of
    the train and test frames.

    :param x_train_input: training DataFrame (Titanic schema).
    :param x_test_input: test DataFrame with the same columns.
    :return: tuple (x_train, x_test) of transformed copies.
    """
    print("\n*****FUNCTION feature_engineering*****")

    global MEDIAN_IMPUTER
    global OHCE

    # Work on deep copies so the caller's frames stay untouched.
    x_train = x_train_input.copy(deep=True)
    x_test = x_test_input.copy(deep=True)

    # Numeric imputation: learn the medians from the training data only.
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(imputation_method='median',
                                           variables=['Age', 'Fare'])
    x_train = MEDIAN_IMPUTER.fit(x_train).transform(x_train)
    print(MEDIAN_IMPUTER.imputer_dict_)
    x_test = MEDIAN_IMPUTER.transform(x_test)

    # Categorical encoding: one dummy per level, last level dropped.
    OHCE = ce.OneHotCategoricalEncoder(variables=['Sex', 'Embarked', 'Pclass'],
                                       drop_last=True)
    x_train = OHCE.fit(x_train).transform(x_train)
    print(OHCE.encoder_dict_)
    x_test = OHCE.transform(x_test)

    # Show the dummy-variable frames after transformation.
    print(x_train.head())
    print(x_test.head())

    return (x_train, x_test)
Exemplo n.º 6
0
def feature_engineering(df_input):
    """Explore the Titanic frame, then median-impute and one-hot encode it.

    Plots the Survived class balance as a pie chart plus Age/Fare histograms,
    builds a feature_engine Pipeline (median imputation for Age/Fare, OHE for
    Sex with drop_last and Embarked without), fits and applies it, and returns
    the transformed deep copy.

    :param df_input: pandas DataFrame with Survived/Age/Fare/Sex/Embarked.
    :return: transformed DataFrame.
    """
    print("\n*****FUNCTION feature_engineering\n")

    df = df_input.copy(deep=True)

    # Create a Pie Chart to check Balance
    df['Survived'].value_counts(sort=True)

    #Plotting Parameters
    plt.figure(figsize=(5, 5))
    sizes = df['Survived'].value_counts(sort=True)
    colors = ["grey", 'purple']
    labels = ['No', 'Yes']

    #Plot
    plt.pie(
        sizes,
        colors=colors,
        labels=labels,
        autopct='%1.1f%%',
        shadow=True,
        startangle=270,
    )

    # BUG FIX: title previously said 'Percentage of Churn in Dataset'
    # (copy-paste from a churn example) but this chart shows the Titanic
    # 'Survived' balance.
    plt.title('Percentage of Survival in Dataset')
    plt.show()

    df['Age'].hist(bins=30)
    plt.show()

    df['Fare'].hist(bins=30)
    plt.show()

    print("\nBEFORE PIPELINE\n")
    disp_null_counts(df)

    # Set up a Feature Engineering pipeline

    titanic_pipe = Pipeline([
        # replace NA, NaNs, nulls with median of the non-null cells
        ('median_imputer',
         mdi.MeanMedianImputer(imputation_method='median',
                               variables=['Age', 'Fare'])),
        # Sex drops the last dummy; Embarked deliberately keeps all levels.
        ('ohce1', ce.OneHotCategoricalEncoder(variables=['Sex'],
                                              drop_last=True)),
        ('ohce2',
         ce.OneHotCategoricalEncoder(variables=['Embarked'], drop_last=False)),
    ])

    ###
    ###
    ### ALTERNATIVE METHOD OF DOING ABOVE OHCE One-Hot Encoding
    ###
    #     df['Sex'].replace(['male','female'],[0,1], inplace = True)
    #     onehot = pd.get_dummies(df['Embarked'])
    #     df = df.drop('Embarked', axis = 'columns')
    #     df = df.join(onehot)

    #     df['Embarked'].replace(['C','Q','S'],[0,1,2], inplace = True)
    ###
    ###
    ###

    print("\nAFTER PIPELINE definition, before pipeline fit&transform\n")
    disp_null_counts(df)
    print("\ndf unique value counts\n", df.nunique(axis=0), sep="")
    print("\n", df.groupby('Age').size().reset_index(name="Age count"), sep="")

    # Fit will ONLY learn the mean, median values
    # Transform will Apply the changes to the df
    #
    # use the mean and median from the training data for
    # transform of the new data for the trained model
    titanic_pipe.fit(df)
    df = titanic_pipe.transform(df)

    print("\nafter pipeline fit&transform\n")

    disp_null_counts(df)
    print("\ndf unique value counts\n", df.nunique(axis=0), sep="")
    print("\n", df.groupby('Age').size().reset_index(name="Age count"), sep="")

    # Transformed df - No Nulls after imputation
    print("\nNulls after transformation\n", df.isnull().sum(), sep="")

    # Transformed df - dummy vars created
    print("\ndf head after pipeline transform (ohe and median)\n",
          df.head(),
          sep="")

    return (df)
Exemplo n.º 7
0
 'Census_IsVirtualDevice',
 'Census_IsTouchEnabled',
 'Census_IsPenCapable',
 'Census_IsAlwaysOnAlwaysConnectedCapable',
 'Wdft_IsGamer',
 'Wdft_RegionIdentifier']
"""

# Split the concatenated frame back into the original train/test sets:
# test rows were tagged with the sentinel HasDetections == -1.
testset = all_dataset[all_dataset.HasDetections == -1]
trainset = all_dataset[all_dataset.HasDetections != -1] # seperate original train and test sets using -1 sentinel value

# Free the combined frame before the memory-heavy transforms below.
del all_dataset
gc.collect()

cat_imputer = missing_data_imputers.FrequentCategoryImputer(variables=dropped_machine_id_cat_names) # missing value imputer for categorical values (the most frequent value)
cont_imputer = missing_data_imputers.MeanMedianImputer(variables=dropped_targets_cont_names) # missing value imputer for numeric values (median value)

# Fit imputers/encoders on the training set only, then apply to both sets
# to avoid leaking test-set statistics into the learned parameters.
cat_imputer.fit(trainset)
trainset = cat_imputer.transform(trainset) # cat_imputer fit and transform for train set
print(trainset.head())

testset = cat_imputer.transform(testset) # cat_imputer (only) transform for test set 
print(testset.head())

# NOTE(review): cont_imputer is constructed above but never fitted or applied
# in this snippet — confirm numeric imputation happens elsewhere.
rare_cat_enc = categorical_encoders.RareLabelCategoricalEncoder(variables=dropped_machine_id_cat_names)
rare_cat_enc.fit(trainset)
trainset = rare_cat_enc.transform(trainset) # find rare (using a threshold value) categorical values from train set and replace these values with 'rare', this technique is useful to prevent overfitting because of CountFrequencyCategoricalEncoder
testset = rare_cat_enc.transform(testset) # rare_cat_enc (only) transform for test set

cf_cat_enc = categorical_encoders.CountFrequencyCategoricalEncoder(encoding_method='frequency', variables=dropped_machine_id_cat_names)
cf_cat_enc.fit(trainset)
Exemplo n.º 8
0
# All object (categorical) columns except the target itself.
object_cols_x_target = [c for c in object_cols if c != target]

# Date columns and the free-text address are dropped at the end of the pipe.
cols_to_drop = datetime_cols + ['address']

p = pipe([
    # Add missing indicators for numeric features
    ("num_nan_ind",mdi.AddMissingIndicator(variables=numeric_cols)),
    # Add missing level for categorical features
    ("fill_cat_nas",mdi.CategoricalVariableImputer(
        fill_value = "_MISSING_", variables=object_cols_x_target)),
    # Bin rare levels of categorical variables
    ("rare_cats",ce.RareLabelCategoricalEncoder(
        tol = 0.02, replace_with = "_RARE_", variables=object_cols_x_target)),
    # Impute missing numeric variables with median
    ("rmmean",mdi.MeanMedianImputer(imputation_method = 'median',variables=numeric_cols)),
    # Drop dates and address columns
    ("drop_date",fs.DropFeatures(features_to_drop=cols_to_drop))])


# In[13]:


# Fit all pipeline steps on the training frame and transform it in one go.
df_train_prepped = p.fit_transform(df_train)


# In[14]:


df_train_prepped.head()
# Cast discrete numeric columns to object dtype so the categorical
# encoders downstream treat them as categories rather than numbers.
X_train[discrete] = X_train[discrete].astype('O')
X_test[discrete] = X_test[discrete].astype('O')

from feature_engine import variable_transformers as vt

house_pipe = Pipeline([

    # missing data imputation - section 4
    ('missing_ind',
     mdi.AddNaNBinaryImputer(
         variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])),
    
    ('imputer_num',
     mdi.MeanMedianImputer(
         imputation_method='median',
         variables=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])),
    
    ('imputer_cat', mdi.CategoricalVariableImputer(variables=categorical)),

    
    # categorical encoding - section 6
    ('rare_label_enc',
     ce.RareLabelCategoricalEncoder(tol=0.05,
                                    n_categories=6,
                                    variables=categorical + discrete)),
    ('categorical_enc',
     ce.OrdinalCategoricalEncoder(encoding_method='ordered',
                                  variables=categorical + discrete)),

    # discretisation + encoding - section 8