Example #1
def feature_engineered(df):
    # keep only the engineered features
    df = feature_engineering.drop_useless_columns(df)
    df = encode_days_as_costumer(df)
    df = feature_engineering.partner_binary(df)
    df = feature_engineering.responsiveness_share(df)
    df = feature_engineering.alcoholic(df)
    df = feature_engineering.income_housemember(df)
    df = feature_engineering.kids_home(df)
    df = feature_engineering.income_share(df)
    df = feature_engineering.veggie(df)
    df = feature_engineering.phd(df)
    df = feature_engineering.ave_purchase(df)
    df = feature_engineering.tutti_frutti(df)
    df = df.drop(columns=[
        "Year_Birth", "Income", 'MntWines', 'MntFruits', 'MntMeatProducts',
        'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
        'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
        'NumStorePurchases', 'NumWebVisitsMonth', 'Dt_Customer', 'Recency',
        'Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'AcceptedCmp3',
        'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
        'Complain'
    ])
    df = outlier_IQR(
        df, columns=['income_housemember', 'income_share', 'ave_purchase'])

    return df
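
outlier_IQR is a project helper that only appears as a call in these examples. A minimal sketch of the shape it presumably has, assuming a row filter with the usual 1.5*IQR fences; the fence factor and the drop-rather-than-clip behaviour are assumptions, not the project's confirmed implementation:

import pandas as pd

def outlier_IQR(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    # Keep only rows that lie inside the 1.5*IQR fences of every listed column.
    for col in columns:
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        iqr = q3 - q1
        df = df[df[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
    return df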
Example #2
def box_cox_pipeline(df):
    # delete unwanted columns
    df = feature_engineering.drop_useless_columns(df)
    df = encode_education(df)
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = impute_income_KNN(df)
    df = encode_days_as_costumer(df)

    bx_cx_trans_dict = {
        "log": np.log,
        "sqrt": np.sqrt,
        "exp": np.exp,
        "**1/4": lambda x: np.power(x, 0.25),
        "**2": lambda x: np.power(x, 2),
        "**4": lambda x: np.power(x, 4)
    }

    # treat oddly distributed values in the following columns
    columns = [
        "Income", "Kidhome", "Teenhome", 'MntWines', 'MntFruits',
        'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
        'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
        'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
        'Recency'
    ]
    # apply each candidate power transformation and store the result as a new column
    for feature in columns:
        for trans_key, trans_value in bx_cx_trans_dict.items():
            # apply the transformation and flag non-finite results
            feature_trans = np.round(trans_value(df[feature]), 4)
            feature_trans.loc[~np.isfinite(feature_trans)] = -5
            df[str(feature) + str(trans_key)] = feature_trans
    return df
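
The loop in box_cox_pipeline only materialises the candidate columns (e.g. Incomelog, Incomesqrt); picking the best one is left implicit. A hedged sketch of how the least-skewed candidate could be selected afterwards, assuming scipy is available; least_skewed_variant is a hypothetical helper, not part of the original code:

from scipy.stats import skew

def least_skewed_variant(df, feature, trans_keys):
    # Return the name of the candidate column (original column included)
    # whose absolute skewness is smallest.
    candidates = [feature] + [feature + key for key in trans_keys]
    return min(candidates, key=lambda col: abs(skew(df[col], nan_policy="omit")))

For example, least_skewed_variant(df, "Income", bx_cx_trans_dict) would return whichever of Income, Incomelog, Incomesqrt, ... is closest to symmetric.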
Example #3
def chop_off(df):
    # delete unwanted columns
    df = feature_engineering.drop_useless_columns(df)

    # drop rows with missing values
    df = df.dropna()
    df = encode_days_as_costumer(df)
    # treat anomalous values and outliers
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    df = outlier_IQR(df,
                     columns=[
                         "Year_Birth", "Income", 'MntWines', 'MntFruits',
                         'MntMeatProducts', 'MntFishProducts',
                         'MntSweetProducts', 'MntGoldProds',
                         'NumDealsPurchases', 'NumWebPurchases',
                         'NumCatalogPurchases', 'NumStorePurchases',
                         'NumWebVisitsMonth', 'Recency'
                     ])

    # encoding
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = one_hot_encoding(df, columns=["Education"])

    # a cutoff based on a chi-squared test could follow here (not implemented)

    return df
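
anomalies_treatment is another project helper that is only visible as a call. A plausible minimal sketch, assuming it simply drops the rows whose value in the given column is one of the listed anomalies (e.g. the "YOLO" and "Absurd" marital statuses):

import pandas as pd

def anomalies_treatment(df: pd.DataFrame, column: str, anomalies: list) -> pd.DataFrame:
    # Drop rows whose value in the given column matches one of the listed anomalies.
    return df[~df[column].isin(anomalies)]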
Example #4
def small_pipeline(df):
    df = impute_income_KNN(df)
    df = df.drop(["Kidhome", "Teenhome"], axis=1)
    df = feature_engineering.drop_useless_columns(df)
    df = encode_days_as_costumer(df)

    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = one_hot_encoding(df, columns=["Education"])

    return df
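
one_hot_encoding is used throughout but never shown. Assuming it is a thin wrapper around pandas' get_dummies, it might look like the sketch below; whether the original keeps or drops the first level of each category is not known:

import pandas as pd

def one_hot_encoding(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    # One-hot encode the listed categorical columns in place of the originals.
    return pd.get_dummies(df, columns=columns)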
Example #5
def joris_preprocessing_pipeline(df):
    df = impute_income_KNN(df)
    df = feature_engineering.partner_binary(df)
    df = feature_engineering.income_housemember(df)
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = one_hot_encoding(df, columns=["Education"])
    df = encode_days_as_costumer(df)
    df = feature_engineering.drop_useless_columns(df)
    df = replace_income(df)
    df = feature_engineering.responsiveness_share(df)
    df = feature_engineering.ave_purchase(df)
    df = feature_engineering.income_share(df)
    return df
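
impute_income_KNN is likewise only visible as a call. A sketch under the assumption that it fills missing Income values with scikit-learn's KNNImputer fitted on the numeric columns; the neighbour count and the column selection are guesses:

import pandas as pd
from sklearn.impute import KNNImputer

def impute_income_KNN(df: pd.DataFrame, n_neighbors: int = 5) -> pd.DataFrame:
    # Impute missing values (notably Income) from the k nearest rows,
    # using only the numeric columns as the feature space.
    df = df.copy()
    numeric = df.select_dtypes(include="number")
    df[numeric.columns] = KNNImputer(n_neighbors=n_neighbors).fit_transform(numeric)
    return df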
Example #6
def simple_pipeline(df):
    # delete unwanted columns
    df = feature_engineering.drop_useless_columns(df)
    # treat anomalous values
    df = marital_others(df)
    df = encode_days_as_costumer(df)
    # drop rows with missing values
    df = df.dropna()
    # encoding
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = one_hot_encoding(df, columns=["Education"])

    return df
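
encode_days_as_costumer (the spelling follows the project's own identifier) appears in nearly every pipeline but is not defined here. A minimal sketch, assuming it converts the Dt_Customer enrolment date into the number of days a person has been a customer, measured against the newest enrolment in the data; the reference date and the new column name are assumptions:

import pandas as pd

def encode_days_as_costumer(df: pd.DataFrame) -> pd.DataFrame:
    # Add the number of days since enrolment, relative to the latest enrolment date.
    df = df.copy()
    enrolled = pd.to_datetime(df["Dt_Customer"])
    df["days_as_costumer"] = (enrolled.max() - enrolled).dt.days
    return df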
Example #7
def bin_it_preprocessing_pipeline(df):
    df = impute_income_KNN(df)
    df = feature_engineering.partner_binary(df)
    df = feature_engineering.income_housemember(df)
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = one_hot_encoding(df, columns=["Education"])
    df = encode_days_as_costumer(df)
    df = feature_engineering.drop_useless_columns(df)
    df = replace_income(df)
    df = feature_engineering.responsiveness_share(df)
    df = feature_engineering.ave_purchase(df)
    df = feature_engineering.income_share(df)
    df = Binning_Features(df, "Income", n_bins=5)
    df = Binning_Features(df, "MntWines", n_bins=5)
    df = Binning_Features(df, "MntFruits", n_bins=5)
    df = Binning_Features(df, "MntMeatProducts", n_bins=5)
    df = Binning_Features(df, "MntFishProducts", n_bins=5)
    df = Binning_Features(df, "MntSweetProducts", n_bins=5)
    df = Binning_Features(df, "MntGoldProds", n_bins=5)
    return df
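
Binning_Features is called once per monetary column above. A compact sketch of what it might do, assuming equal-frequency binning with pandas qcut; the original could equally use equal-width bins or scikit-learn's KBinsDiscretizer:

import pandas as pd

def Binning_Features(df: pd.DataFrame, column: str, n_bins: int = 5) -> pd.DataFrame:
    # Replace a numeric column with equal-frequency bin indices 0 .. n_bins-1.
    df = df.copy()
    df[column] = pd.qcut(df[column], q=n_bins, labels=False, duplicates="drop")
    return df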
Example #8
def morten_preprocessing_pipeline(df):
    """
    One version of a preprocessing pipeline. The individual decisions are justified in Data_CLeaning.ipynb.
    """
    df = remove_birthyear(df, 1940)
    df = missing_imputer(df, "Income", "median")
    df = outlier_cutoff(df, "MntSweetProducts", 210)
    df = outlier_cutoff(df, "MntMeatProducts", 1250)
    df = outlier_cutoff(df, "MntGoldProds", 250)
    df = outlier_value_imputer(df, "NumWebPurchases", 11, 11)
    df = outlier_value_imputer(df, "NumCatalogPurchases", 11, 11)
    df = outlier_value_imputer(df, "NumWebVisitsMonth", 9, 9)
    df = anomalies_treatment(df, "Marital_Status", ["YOLO", "Absurd"])
    df = encode_education(df)
    df = feature_engineering.partner_binary(df)
    df = one_hot_encoding(df, columns=["Marital_Status"])
    df = encode_days_as_costumer(df)
    df = feature_engineering.drop_useless_columns(df)
    df = feature_engineering.responsiveness_share(df)
    df = df.drop(columns=["Complain"])
    return df
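
outlier_cutoff and outlier_value_imputer are used above with hard-coded thresholds. Plausible minimal sketches, assuming the first drops rows beyond a threshold and the second caps values at a replacement value; the signatures are inferred from the calls, not confirmed:

import pandas as pd

def outlier_cutoff(df: pd.DataFrame, column: str, threshold) -> pd.DataFrame:
    # Drop rows whose value in the column exceeds the threshold.
    return df[df[column] <= threshold]

def outlier_value_imputer(df: pd.DataFrame, column: str, threshold, value) -> pd.DataFrame:
    # Replace values above the threshold with a fixed value (here used to cap the column).
    df = df.copy()
    df.loc[df[column] > threshold, column] = value
    return df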