def auth_flag_features(source_df, field, path=path.path, nrows=None):
    """
    Compute, per card_id, the share of transactions whose `field`
    (e.g. authorized_flag) equals 'Y' out of all 'Y'/'N' rows.

    :param source_df: file name of the historical/new transactions csv
    :param field: column holding the 'Y'/'N' flag to summarise
    :param path: directory containing source_df
    :param nrows: optional row limit passed to pd.read_csv
    :return: DataFrame indexed by card_id with a single "<prefix>_Y" column
    :raises Exception: re-raises any failure from pd.read_csv
    """
    _log.info("Creating features from {}".format(field))
    # File-name prefix ("historical"/"new") namespaces the output column.
    prefix = source_df.split("_")[0]

    source_df = "{}/{}".format(path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df, usecols=["card_id", field], nrows=nrows)
    except Exception as e:
        # Bug fix: the original swallowed the error, which left `df`
        # undefined and surfaced as a confusing NameError further down.
        _log.exception(e)
        raise
    _log.info("Successfully read from {}".format(source_df))

    df["dummy"] = 1  # per-row counter for the aggregation below

    _log.info("Computing successful and unsuccesful authorizations")
    # card_id x {'N','Y'} counts; a card missing one level yields NaN there,
    # which propagates into the ratio below.
    df_agg = (df.groupby([field, "card_id"]).agg({
        "dummy": np.sum
    }).reset_index().pivot_table(index="card_id",
                                 columns=field,
                                 values="dummy"))
    df_agg["total"] = df_agg['N'] + df_agg['Y']
    df_agg['Y'] = df_agg['Y'] / df_agg["total"]
    df_agg.drop(columns=["N", "total"], inplace=True)
    df_agg.rename(columns={'Y': (prefix + '_Y')}, inplace=True)
    _log.info("Succesfully computed features for {}".format(field))

    return df_agg
# Example #2
def field_to_features(source_df, field, path=path.path, nrows=None):
    """
    Pivot a categorical field into per-card transaction-count columns.

    :param source_df: historical or new transactions csv file name
    :param field: column whose distinct levels become count columns
    :param path: directory containing source_df
    :param nrows: optional row limit passed to pd.read_csv
    :return: DataFrame with card_id, one "<prefix>_<field>_<level>" count
        column per distinct level, and "<prefix>_max_txn_<field>" holding
        the level with the most transactions for that card
    :raises Exception: re-raises any failure from pd.read_csv
    """
    _log.info("Creating features from {}".format(field))
    prefix = source_df.split("_")[0]

    source_df = "{}/{}".format(path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df, usecols=["card_id", field], nrows=nrows)
    except Exception as e:
        # Bug fix: re-raise instead of swallowing; otherwise `df` is
        # undefined below and the failure shows up as a NameError.
        _log.exception(e)
        raise
    _log.info("Successfully read from {}".format(source_df))
    # Sentinel for missing values so NaNs form their own pivot column.
    df.loc[pd.isnull(df[field]), field] = -100

    # Map raw levels to namespaced column names, e.g. 3 -> "new_city_id_3".
    keys = pd.Series(list(set(df[field])))
    vals = keys.apply(lambda x: "{}_{}_{}".format(prefix, field, x))
    rename_dict = dict(zip(keys, vals))
    _log.info("Rename dict: {}".format(rename_dict))
    df["dummy"] = 1  # per-row counter for the aggregation below
    df_agg = (df.groupby(["card_id", field]).agg({
        "dummy": np.sum
    }).reset_index().pivot_table(index="card_id",
                                 columns=field,
                                 values="dummy",
                                 fill_value=0).reset_index())
    # Level with the highest txn count per card (computed before renaming,
    # so the result stores the raw level value).
    field_name = prefix + "_max_txn_" + str(field)
    df_agg[field_name] = df_agg.drop(columns="card_id").idxmax(axis=1)
    df_agg.rename(columns=rename_dict, inplace=True)
    _log.info("Successfully computed feature")
    return df_agg
# Example #3
def frequency_features_from_field(source_df):
    """
    Compute per-card transaction frequency features: the maximum and the
    average number of transactions in a (transacting) day.

    :param source_df: file name of the historical/new transactions csv with
        the derived "date" column (see purchase_date_features_df_gen)
    :return: DataFrame with card_id plus "<prefix>_max_freq_in_day" and
        "<prefix>_avg_freq_in_transacting_day" columns
    :raises Exception: re-raises any failure from pd.read_csv
    """
    _log.info("Computing average and max frequency transactions in a day")
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path.feature_path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df)
    except Exception as e:
        # Bug fix: re-raise instead of continuing with an undefined `df`.
        _log.exception(e)
        raise
    df["dummy"] = 1  # per-row counter for the aggregations below
    # np.max / np.mean aggregations surface as "amax" / "mean" labels.
    day_features_rename_dict = {
        "amax": "{}_{}".format(prefix, "max_freq_in_day"),
        "mean": "{}_{}".format(prefix, "avg_freq_in_transacting_day")
    }
    # txn count per (card, day), then per-card max and mean — the mean is
    # over transacting days only (days with zero txns are absent).
    day_features = (df.groupby(["card_id", "date"]).agg({
        "dummy": np.sum
    }).reset_index().groupby(["card_id"]).agg({
        "dummy": [np.max, np.mean]
    }).reset_index())

    # Flatten the MultiIndex columns into a plain frame and rename.
    day_features = (pd.DataFrame([
        day_features["card_id"], day_features["dummy"]["amax"],
        day_features["dummy"]["mean"]
    ]).T.rename(columns=day_features_rename_dict))
    _log.info("Successfully computed feature")
    return day_features
# Example #4
def get_avg_days_bw_purchases(source_df):
    """
    Compute the average number of days between consecutive purchases for
    every card_id.

    :param source_df: file name of the historical/new transactions csv with
        derived purchase-date features (see purchase_date_features_df_gen)
    :return: DataFrame with card_id and "<prefix>_avg_days_bw_purchases"
    :raises Exception: re-raises any failure from pd.read_csv
    """
    _log.info("Computing average number of days between consecutive purchases")
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path.feature_path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df)
    except Exception as e:
        # Bug fix: re-raise instead of continuing with an undefined `df`.
        _log.exception(e)
        raise
    # NOTE(review): shift(-1) pairs each row with the next row of the same
    # card in *file order* — assumes rows are sorted by purchase date; confirm.
    df["next_purchase_date"] = df.groupby("card_id").shift(-1)["purchase_date"]
    df.loc[
        pd.isnull(df["next_purchase_date"]),
        "next_purchase_date"] = dt.datetime.now()  # should be a better value
    df["purchase_date"] = pd.to_datetime(df["purchase_date"])
    df["next_purchase_date"] = pd.to_datetime(df["next_purchase_date"])
    df["days_between_purchases"] = (df["next_purchase_date"] -
                                    df["purchase_date"]).dt.days
    days_bw_purchase = (df.groupby("card_id").agg({
        "days_between_purchases": np.mean
    }).reset_index().rename(columns={
        # Bug fix: the original called .rename({"mean": ...}), which maps
        # *index* labels and never matched — the aggregated column keeps its
        # source name "days_between_purchases", so the rename was a no-op.
        "days_between_purchases":
        "{}_{}".format(prefix, "avg_days_bw_purchases")
    }))
    _log.info("Successfully computed feature")
    return days_bw_purchase
from config import path
from services.get_successful_auth_perc import auth_flag_features
from utils.log_setup import _log

# Build the authorized-flag share feature for the new-merchant transactions
# and persist it under the feature directory.
auth_features_df = auth_flag_features("new_merchant_transactions.csv",
                                      "authorized_flag")
target_csv = "{}/{}_{}.csv".format(path.feature_path, "new",
                                   "new_auth_flag_features")
_log.info("Writing to {}".format(target_csv))
auth_features_df.to_csv(target_csv)
# Example #6
def get_unique_count_and_mode(source_df, field, path=path.path, nrows=None):
    """
    Returns number of distinct levels and the mode for a given field, per card.

    :param source_df: historical/new transactions csv file name
    :param field: column to summarise
    :param path: directory containing source_df
    :param nrows: optional row limit passed to pd.read_csv
    :return: DataFrame with card_id, the modal level of `field`, its count
        ("<prefix>_max_count_<field>") and the number of distinct levels
        ("<prefix>_unique_<field>")
    :raises Exception: re-raises any failure from pd.read_csv
    """
    _log.info("Creating features from {}".format(field))
    prefix = source_df.split("_")[0]

    source_df = "{}/{}".format(path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df, usecols=["card_id", field], nrows=nrows)
    except Exception as e:
        # Bug fix: re-raise instead of swallowing; otherwise `df` is
        # undefined below and the failure shows up as a NameError.
        _log.exception(e)
        raise
    _log.info("Successfully read from {}".format(source_df))

    # Impute missing values with the global mode before aggregating.
    if df[field].isnull().any():
        field_mode = df[field].mode()[0]
        df.loc[pd.isnull(df[field]), field] = field_mode

    _log.info("Fetching no. of distinct merchants transacted on")
    df_uniq = df.groupby("card_id").agg({field: pd.Series.nunique}).reset_index().rename(columns={field: "{}_{}_{}".format(prefix, "unique", field)})

    _log.info("Computing mode of merchants")
    # Count rows per (card, level), keep the level(s) with the max count …
    df_agg = df.groupby(["card_id", field]).agg({field: np.count_nonzero}).rename(columns={field: "count"}).reset_index()
    df_max = df_agg.groupby("card_id").agg({"count": np.max}).reset_index()
    df_max = df_agg.merge(df_max, how="inner", on=["card_id", "count"]).drop_duplicates().reset_index(drop=True)
    # … then break ties by keeping the first occurrence only.
    df_max['rank'] = df_max.groupby(['card_id']).cumcount() + 1
    df_max = df_max[df_max['rank'] == 1]
    df = df_max.merge(df_uniq, on="card_id", how="inner")
    df.rename(columns={"count":"{}_{}_{}".format(prefix, "max_count", field)}, inplace=True)

    _log.info("Succesfully computed mode and levels for {}".format(field))
    return df
from config import path
from services.distinct_levels_and_mode import get_unique_count_and_mode
from utils.log_setup import _log

_log.info("-----------------------------------------------------------------------------------------------------------")
_log.info("|                                Running merchant_metadata feature maker                                  |")
_log.info("-----------------------------------------------------------------------------------------------------------")

# Every (source column, output file stem) pair gets the same treatment:
# distinct-level count + per-card mode, dumped to its own csv.
for field, out_stem in (
        ("merchant_category_id", "historical_merc_cat_id_features"),
        ("merchant_id", "historical_merc_id_features"),
        ("state_id", "historical_state_id_features"),
        ("city_id", "historical_city_id_features")):
    _log.info("Running {} feature maker".format(field))
    features = get_unique_count_and_mode("historical_transactions.csv", field)
    features.to_csv("{}/{}.csv".format(path.feature_path, out_stem))
    del features  # free the frame before processing the next field
    _log.info("Completed running {} feature maker".format(field))
# Example #8
def get_purchase_amt_dist(source_df,
                          field="purchase_amount",
                          path=path.path,
                          nrows=None):
    """
    Returns min, max and the distribution of purchase amounts split at zero
    (count and sum of amounts >= 0 and < 0), per card.

    :param source_df: historical/new transactions csv file name
    :param field: amount column to summarise
    :param path: directory containing source_df
    :param nrows: no. of rows to be read
    :return: df with card-wise purchase amount details
    :raises Exception: re-raises any failure from pd.read_csv
    """
    _log.info("Creating features from {}".format(field))
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path, source_df)

    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df, usecols=["card_id", field], nrows=nrows)
        _log.info("Successfully read from {}".format(source_df))
    except Exception as e:
        # Bug fix: re-raise instead of swallowing; otherwise `df` is
        # undefined below and the failure shows up as a NameError.
        _log.exception(e)
        raise

    _log.info("Computing distribution of purchase amount")
    # count_/sum_* are module-level helper aggregators defined alongside
    # this function (presumably split the amounts at zero — confirm there).
    func_to_be_applied = [
        count_greater_than_equal_to_zero, sum_greater_than_equal_to_zero,
        count_less_than_zero, sum_less_than_zero, min, max
    ]

    # Maps the aggregator-derived column labels to namespaced feature names.
    rename_dict = create_rename_dict(prefix, field, func_to_be_applied)

    _log.info(("Creating final df"))
    df_features = df.groupby("card_id").agg({
        field: func_to_be_applied
    }).reset_index()
    # Flatten the MultiIndex produced by the multi-aggregator agg().
    df_features = pd.concat(
        [pd.DataFrame(df_features["card_id"]), df_features[field]],
        axis=1,
        sort=False)
    _log.info("Renaming columns: {}".format(rename_dict))
    df_features.rename(columns=rename_dict, inplace=True)

    return df_features
from config import path
from services.purchase_amount_distribution import get_purchase_amt_dist
from utils.log_setup import _log

_log.info("-----------------------------------------------------------------------------------------------------------")
_log.info("|                               Running purchase amount feature maker                                     |")
_log.info("-----------------------------------------------------------------------------------------------------------")

# Compute card-wise purchase-amount distribution features and persist them.
purchase_amt_df = get_purchase_amt_dist("historical_transactions.csv")

out_csv = "{}/{}_{}.csv".format(path.feature_path, "historical", "purchase_amount_features")
_log.info("Writing to {}".format(out_csv))
purchase_amt_df.to_csv(out_csv)

_log.info("-----------------------------------------------------------------------------------------------------------")
_log.info("|                             Successfully ran purchase amount feature maker                              |")
_log.info("-----------------------------------------------------------------------------------------------------------")
# Example #10
from config import path
from services.field_to_features import field_to_features
from utils.log_setup import _log

_log.info(
    "-----------------------------------------------------------------------------------------------------------"
)
_log.info(
    "|                                     Running category feature maker                                      |"
)
_log.info(
    "-----------------------------------------------------------------------------------------------------------"
)

# Both category columns get the identical pivot treatment; the banner text,
# source column and output file stem vary per run.
CATEGORY_RUNS = (
    ("|                                    Running category 1 feature maker                                     |",
     "category_1", "historical_cat1_features"),
    ("|                                    Running category 2 feature maker                                     |",
     "category_2", "historical_cat2_features"),
)

for banner, field, out_stem in CATEGORY_RUNS:
    _log.info(banner)
    cat_features = field_to_features("historical_transactions.csv", field)
    cat_features.to_csv("{}/{}.csv".format(path.feature_path, out_stem))
    del cat_features  # free the frame before the next category
from config import path
from services.lag_distribution import month_lag_distribution
from utils.log_setup import _log

_log.info(
    "-----------------------------------------------------------------------------------------------------------"
)
_log.info(
    "|                                  Running month lag feature maker                                        |"
)
_log.info(
    "-----------------------------------------------------------------------------------------------------------"
)

# Compute the month-lag distribution features and persist them.
month_lag_df = month_lag_distribution("historical_transactions.csv")

out_csv = "{}/{}_{}.csv".format(path.feature_path, "historical",
                                "month_lag_features")
_log.info("Writing to {}".format(out_csv))
month_lag_df.to_csv(out_csv)

_log.info(
    "-----------------------------------------------------------------------------------------------------------"
)
_log.info(
    "|                                Successfully ran month lag feature maker                                 |"
)
_log.info(
    "-----------------------------------------------------------------------------------------------------------"
)
# Example #12
from config import path
from services.purchase_date_df_gen import purchase_date_features_df_gen, get_avg_days_bw_purchases, \
    frequency_features_from_field
from services.field_to_features import field_to_features
from utils.log_setup import _log

_log.info(
    "-----------------------------------------------------------------------------------------------------------"
)
_log.info(
    "|                                 Running purchase date feature maker                                     |"
)
_log.info(
    "-----------------------------------------------------------------------------------------------------------"
)
_log.info(
    "Creating necessary columns to extract features from purchase_date param")
# Derive the raw purchase-date columns (time_of_day, day_of_week, month, date)
# and get back the name of the intermediate csv they were written to.
raw_file_name = purchase_date_features_df_gen("new_merchant_transactions.csv")

_log.info("Computing {}, {}, {}".format("time_of_day_features",
                                        "day_of_week_features",
                                        "month_features"))
# Pivot each derived column into per-card count features.
derived = {
    col: field_to_features(raw_file_name, col, path=path.feature_path)
    for col in ("time_of_day", "day_of_week", "month")
}
time_of_day_features = derived["time_of_day"]
day_of_week_features = derived["day_of_week"]
month_features = derived["month"]
# Example #13
def purchase_date_features_df_gen(source_df):
    """
    Create preliminary purchase-date columns (time_of_day, day_of_week,
    month, date) and write them to an intermediate csv in the feature dir.

    :param source_df: historical/new transactions csv file name
    :return: file name (not full path) of the csv with the derived columns
    :raises Exception: re-raises any failure from pd.read_csv
    """
    _log.info(
        "Creating necessary columns for extracting purchase date features")
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path.path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        # dtypes_specifier supplies dtype/parse_dates config; presumably it
        # parses purchase_date into datetimes — the .date() call below
        # requires that. TODO confirm against dtypes_specifier.
        df = pd.read_csv(source_df,
                         usecols=["card_id", "purchase_date"],
                         dtype=dtypes_specifier.dtypes,
                         parse_dates=dtypes_specifier.parse_dates)
    except Exception as e:
        # Bug fix: re-raise instead of swallowing; otherwise `df` is
        # undefined below and the failure shows up as a NameError.
        _log.exception(e)
        raise

    # Each derived column comes from a project time_fragments helper.
    _log.info("Creating feature: {}".format("time_of_day"))
    df["time_of_day"] = df["purchase_date"].apply(
        time_fragments.time_of_day_fragment)
    _log.info("Creating feature: {}".format("day_of_week"))
    df["day_of_week"] = df["purchase_date"].apply(time_fragments.day_of_week)
    _log.info("Creating feature: {}".format("month"))
    df["month"] = df["purchase_date"].apply(time_fragments.month_from_date)
    _log.info("Creating feature: {}".format("date"))
    df["date"] = df["purchase_date"].apply(lambda x: x.date())

    # Return only the bare file name; callers prepend path.feature_path.
    written_file_name = "{}_{}.csv".format(prefix,
                                           "purchase_date_features_raw")
    written_file = "{}/{}_{}.csv".format(path.feature_path, prefix,
                                         "purchase_date_features_raw")
    _log.info("Writing file {}".format(written_file))
    df.to_csv(written_file)
    _log.info("Process successfully completed")
    return written_file_name