Exemplo n.º 1
0
def feature_engineering(x_input):
    """Median-impute Age/Fare and one-hot encode Sex/Embarked.

    The fitted transformers are stashed in the module-level globals
    MEDIAN_IMPUTER and OHCE so later code can reapply them to new data.
    Returns the transformed copy of the input frame.
    """
    print("\n*****FUNCTION feature_engineering*****")

    global MEDIAN_IMPUTER
    global OHCE

    df = x_input.copy(deep=True)

    # Learn medians on the data, then fill the nulls in Age and Fare.
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(
        imputation_method='median', variables=['Age', 'Fare'])
    MEDIAN_IMPUTER.fit(df)
    df = MEDIAN_IMPUTER.transform(df)
    print(MEDIAN_IMPUTER.imputer_dict_)

    # One-hot encode, dropping the last dummy (k-1 encoding).
    OHCE = ce.OneHotCategoricalEncoder(
        variables=['Sex', 'Embarked'], drop_last=True)
    OHCE.fit(df)
    df = OHCE.transform(df)
    print(OHCE.encoder_dict_)

    # Transformed df - No Nulls after imputation
    print("Null Count after Missing Data Imputation:")
    print(df.isnull().sum())

    # Transformed df - dummy vars created (display: notebook-only helper)
    print("Dummy Variables after OHE:")
    display(df.head())

    return df
Exemplo n.º 2
0
def feature_engineering(x_train_input, x_test_input):
    """Median-impute TotalCharges and one-hot encode the categorical
    columns of the (telco-style) dataset.

    The imputer and encoder are fit on the TRAINING split only, then
    applied to both splits, avoiding test-set leakage. Fitted objects
    are kept in the globals MEDIAN_IMPUTER and OHCE for later reuse.
    Returns the transformed (x_train, x_test) pair.
    """
    print("\n*****FUNCTION feature_engineering*****")

    x_train = x_train_input.copy(deep=True)
    x_test = x_test_input.copy(deep=True)

    global MEDIAN_IMPUTER
    global OHCE

    # Median Imputation for TotalCharges (comment previously claimed
    # Age/Fare — stale copy from the Titanic example)
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(imputation_method='median',
                                           variables=['TotalCharges'])
    # fit,transform x_train
    MEDIAN_IMPUTER.fit(x_train)
    x_train = MEDIAN_IMPUTER.transform(x_train)
    print(MEDIAN_IMPUTER.imputer_dict_)

    # transform x_test
    x_test = MEDIAN_IMPUTER.transform(x_test)

    # Transformed df - No Nulls after imputation
    # BUG FIX: the result was computed and discarded; print it as the
    # other examples in this file do.
    print(x_train.isnull().sum())

    # OHE for Categorical Vars
    OHCE = ce.OneHotCategoricalEncoder(variables=[
        'Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod'
    ],
                                       drop_last=True)

    # fit,transform x_train
    OHCE.fit(x_train)
    x_train = OHCE.transform(x_train)
    print(OHCE.encoder_dict_)

    # transform x_test
    x_test = OHCE.transform(x_test)

    # Transformed x_train - dummy vars created
    print(x_train.head())

    # Transformed x_test - dummy vars created
    print(x_test.head())

    return (x_train, x_test)
Exemplo n.º 3
0
def feature_engineering(x_train_input, x_test_input):
    """Median-impute Age/Fare and one-hot encode Sex/Embarked/Pclass.

    Both transformers are fit on the TRAINING split only and then
    applied to train and test, so no information leaks from the test
    set. The fitted objects are published via the module-level globals
    MEDIAN_IMPUTER and OHCE. Returns (x_train, x_test) transformed.
    """
    print("\n*****FUNCTION feature_engineering*****")

    global MEDIAN_IMPUTER
    global OHCE

    train = x_train_input.copy(deep=True)
    test = x_test_input.copy(deep=True)

    # Numeric imputation: learn the medians from the training data only.
    MEDIAN_IMPUTER = mdi.MeanMedianImputer(
        imputation_method='median', variables=['Age', 'Fare'])
    MEDIAN_IMPUTER.fit(train)

    train = MEDIAN_IMPUTER.transform(train)
    test = MEDIAN_IMPUTER.transform(test)
    print(MEDIAN_IMPUTER.imputer_dict_)

    # Categorical encoding: k-1 dummies, categories learned on train.
    OHCE = ce.OneHotCategoricalEncoder(
        variables=['Sex', 'Embarked', 'Pclass'], drop_last=True)
    OHCE.fit(train)

    train = OHCE.transform(train)
    test = OHCE.transform(test)
    print(OHCE.encoder_dict_)

    # Show the dummy variables created on each split.
    print(train.head())
    print(test.head())

    return (train, test)
Exemplo n.º 4
0
def feature_engineering(df_input):
    """Explore the Titanic frame, then impute and one-hot encode it
    via a sklearn Pipeline of feature_engine transformers.

    Plots the Survived class balance and Age/Fare histograms, builds a
    pipeline (median imputation for Age/Fare, k-1 OHE for Sex, full OHE
    for Embarked), fits and applies it on the same frame, and returns
    the transformed copy.
    """
    print("\n*****FUNCTION feature_engineering\n")

    df = df_input.copy(deep=True)

    # Pie chart of the target class balance.
    plt.figure(figsize=(5, 5))
    sizes = df['Survived'].value_counts(sort=True)
    colors = ["grey", 'purple']
    labels = ['No', 'Yes']

    plt.pie(
        sizes,
        colors=colors,
        labels=labels,
        autopct='%1.1f%%',
        shadow=True,
        startangle=270,
    )

    # BUG FIX: title said 'Percentage of Churn in Dataset' — a
    # copy-paste from the churn example; this plot shows Survived.
    plt.title('Percentage of Survival in Dataset')
    plt.show()

    df['Age'].hist(bins=30)
    plt.show()

    df['Fare'].hist(bins=30)
    plt.show()

    print("\nBEFORE PIPELINE\n")
    disp_null_counts(df)

    # Feature-engineering pipeline: imputation first, then encoding.
    titanic_pipe = Pipeline([
        # replace NA, NaNs, nulls with median of the non-null cells
        ('median_imputer',
         mdi.MeanMedianImputer(imputation_method='median',
                               variables=['Age', 'Fare'])),
        # k-1 dummies for Sex; ALL dummies for Embarked (drop_last=False)
        ('ohce1', ce.OneHotCategoricalEncoder(variables=['Sex'],
                                              drop_last=True)),
        ('ohce2',
         ce.OneHotCategoricalEncoder(variables=['Embarked'], drop_last=False)),
    ])
    # NOTE: the same encoding could be done manually with
    # Series.replace / pd.get_dummies, but the pipeline keeps the fitted
    # parameters reusable for new data.

    print("\nAFTER PIPELINE definition, before pipeline fit&transform\n")
    disp_null_counts(df)
    print("\ndf unique value counts\n", df.nunique(axis=0), sep="")
    print("\n", df.groupby('Age').size().reset_index(name="Age count"), sep="")

    # Fit only LEARNS the medians/categories; transform APPLIES them.
    # The learned values would be reused to transform new data for the
    # trained model.
    titanic_pipe.fit(df)
    df = titanic_pipe.transform(df)

    print("\nafter pipeline fit&transform\n")

    disp_null_counts(df)
    print("\ndf unique value counts\n", df.nunique(axis=0), sep="")
    print("\n", df.groupby('Age').size().reset_index(name="Age count"), sep="")

    # Transformed df - No Nulls after imputation
    print("\nNulls after transformation\n", df.isnull().sum(), sep="")

    # Transformed df - dummy vars created
    print("\ndf head after pipeline transform (ohe and median)\n",
          df.head(),
          sep="")

    return (df)
    # frequent label categorical encoder
    ('rare_encoder', 
        ce.RareLabelCategoricalEncoder(tol=0.02, 
                                        n_categories=4,
                                        variables=config.VARS_WITH_RARE,
                                        replace_with='Other', 
                                        return_object=True)),

    # target guided ordinal categorical variable encoder
    ('ordinal_encoder', 
        ce.OrdinalCategoricalEncoder(encoding_method='ordered',
                                     variables=config.ORDINAL_VARS)),
    
    # nominal categorical variable encoder (one hot)
    ('nominal_encoder', 
        ce.OneHotCategoricalEncoder(variables=config.NOMINAL_VARS,
                                    drop_last=True)),
        
    # Yeo-Johnson numerical variable transformer
    ('yeo_johnson_transformer', 
        vt.YeoJohnsonTransformer(variables=config.NUMERICAL_VARS)),
    
    # scaler
    ('min_max_scaler', MinMaxScaler()),
    
    # logistic regression classifier
    ('log_classifier', 
        LogisticRegression(class_weight='balanced', 
                          random_state=config.RANDOM_STATE, 
                          n_jobs=-1))
])
Exemplo n.º 6
0
def one_hot_encoding(X_train, X_test, variable):
    """One-hot encode `variable` (all categories, k-1 dummies).

    The encoder is fit on X_train only, then applied to both splits so
    the test set cannot leak categories into the encoding. Returns the
    transformed (X_train, X_test) pair.
    """
    ohe = ce.OneHotCategoricalEncoder(top_categories=None,
                                      variables=variable,
                                      drop_last=True)
    ohe.fit(X_train)
    train_enc = ohe.transform(X_train)
    test_enc = ohe.transform(X_test)
    return train_enc, test_enc