def auth_flag_features(source_df, field, path=path.path, nrows=None):
    """
    Compute, per card, the share of successful ('Y') authorizations.

    :param source_df: file name of historical/new transactions csv; the text
        before the first underscore is used as the feature-name prefix
    :param field: authorization-flag column to aggregate (expects 'Y'/'N' values)
    :param path: directory containing source_df
    :param nrows: number of rows to read (None reads all)
    :return: df indexed by card_id with a single '<prefix>_Y' column holding
        Y / (Y + N) for each card
    """
    _log.info("Creating features from {}".format(field))
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df, usecols=["card_id", field], nrows=nrows)
    except Exception as e:
        # Re-raise after logging: silently continuing would only hit a
        # NameError on `df` below and mask the real failure.
        _log.exception(e)
        raise
    _log.info("Successfully read from {}".format(source_df))
    df["dummy"] = 1
    _log.info("Computing successful and unsuccesful authorizations")
    # Count transactions per (flag, card) and pivot to one column per flag.
    df_agg = (df.groupby([field, "card_id"]).agg({
        "dummy": np.sum
    }).reset_index().pivot_table(index="card_id",
                                 columns=field,
                                 values="dummy"))
    # NOTE(review): a card with only 'Y' (or only 'N') rows gets NaN here
    # because the missing pivot column is NaN — confirm downstream handles it.
    df_agg["total"] = df_agg['N'] + df_agg['Y']
    df_agg['Y'] = df_agg['Y'] / df_agg["total"]
    df_agg.drop(columns=["N", "total"], inplace=True)
    df_agg.rename(columns={'Y': (prefix + '_Y')}, inplace=True)
    _log.info("Succesfully computed features for {}".format(field))
    return df_agg
def frequency_features_from_field(source_df):
    """
    Compute per-card daily transaction frequency features.

    :param source_df: file name (under path.feature_path) of the
        historical/new transactions csv with essential derived features
        (must contain 'card_id' and 'date'; see purchase_date_features_df_gen)
    :return: df with card_id, '<prefix>_max_freq_in_day' (max transactions on
        any single day) and '<prefix>_avg_freq_in_transacting_day' (mean
        transactions per day, over days with at least one transaction)
    """
    _log.info("Computing average and max frequency transactions in a day")
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path.feature_path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df)
    except Exception as e:
        # Re-raise after logging: continuing would only hit a NameError on
        # `df` below and mask the real failure.
        _log.exception(e)
        raise
    df["dummy"] = 1
    # Keys below ('amax', 'mean') are the column names numpy aggregators
    # produce in the grouped result.
    day_features_rename_dict = {
        "amax": "{}_{}".format(prefix, "max_freq_in_day"),
        "mean": "{}_{}".format(prefix, "avg_freq_in_transacting_day")
    }
    # First count transactions per (card, day), then take max/mean per card.
    day_features = (df.groupby(["card_id", "date"]).agg({
        "dummy": np.sum
    }).reset_index().groupby(["card_id"]).agg({
        "dummy": [np.max, np.mean]
    }).reset_index())
    # Flatten the two-level columns into a plain frame and apply final names.
    day_features = (pd.DataFrame([
        day_features["card_id"], day_features["dummy"]["amax"],
        day_features["dummy"]["mean"]
    ]).T.rename(columns=day_features_rename_dict))
    _log.info("Successfully computed feature")
    return day_features
def get_avg_days_bw_purchases(source_df):
    """
    Compute the average number of days between consecutive purchases per card.

    :param source_df: file name (under path.feature_path) of the
        historical/new transactions csv with essential derived features
        (must contain 'card_id' and 'purchase_date')
    :return: df with card_id and '<prefix>_avg_days_bw_purchases'
    """
    _log.info("Computing average number of days between consecutive purchases")
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path.feature_path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df)
    except Exception as e:
        # Re-raise after logging: continuing would only hit a NameError on
        # `df` below and mask the real failure.
        _log.exception(e)
        raise
    # shift(-1) only yields the *next* purchase if rows are in chronological
    # order within each card; the raw csv carries no such guarantee, so
    # parse the dates first and sort explicitly.
    df["purchase_date"] = pd.to_datetime(df["purchase_date"])
    df = df.sort_values(["card_id", "purchase_date"])
    df["next_purchase_date"] = df.groupby("card_id").shift(-1)["purchase_date"]
    # Each card's last purchase has no successor; anchor it to "now".
    df.loc[
        pd.isnull(df["next_purchase_date"]),
        "next_purchase_date"] = dt.datetime.now()  # should be a better value
    df["days_between_purchases"] = (df["next_purchase_date"] -
                                    df["purchase_date"]).dt.days
    # Fix: the original called rename({"mean": ...}) which renames *index*
    # labels (and "mean" never exists) — the aggregated column keeps the
    # name "days_between_purchases", so rename that column explicitly.
    days_bw_purchase = (df.groupby("card_id").agg({
        "days_between_purchases": np.mean
    }).reset_index().rename(
        columns={
            "days_between_purchases":
            "{}_{}".format(prefix, "avg_days_bw_purchases")
        }))
    _log.info("Successfully computed feature")
    return days_bw_purchase
def get_purchase_amt_dist(source_df,
                          field="purchase_amount",
                          path=path.path,
                          nrows=None):
    """
    Compute per-card distribution features of the purchase amount: counts and
    sums of amounts >= 0 and < 0, plus the min and max amount.

    :param source_df: file name of historical/new transactions csv; the text
        before the first underscore is used as the feature-name prefix
    :param field: amount column to aggregate
    :param path: directory containing source_df
    :param nrows: number of rows to read (None reads all)
    :return: df with card_id and one column per aggregation, renamed via
        create_rename_dict(prefix, field, funcs)
    """
    _log.info("Creating features from {}".format(field))
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df, usecols=["card_id", field], nrows=nrows)
        _log.info("Successfully read from {}".format(source_df))
    except Exception as e:
        # Re-raise after logging: continuing would only hit a NameError on
        # `df` below and mask the real failure.
        _log.exception(e)
        raise
    _log.info("Computing distribution of purchase amount")
    func_to_be_applied = [
        count_greater_than_equal_to_zero, sum_greater_than_equal_to_zero,
        count_less_than_zero, sum_less_than_zero, min, max
    ]
    rename_dict = create_rename_dict(prefix, field, func_to_be_applied)
    _log.info(("Creating final df"))
    df_features = df.groupby("card_id").agg({
        field: func_to_be_applied
    }).reset_index()
    # Flatten: glue the card_id level back onto the per-function columns.
    df_features = pd.concat(
        [pd.DataFrame(df_features["card_id"]), df_features[field]],
        axis=1,
        sort=False)
    _log.info("Renaming columns: {}".format(rename_dict))
    df_features.rename(columns=rename_dict, inplace=True)
    return df_features
def field_to_features(source_df, field, path=path.path, nrows=None):
    """
    One-hot-style transaction counts per level of a categorical field.

    :param source_df: file name of historical or new transactions csv; the
        text before the first underscore is used as the feature-name prefix
    :param field: column whose distinct levels become feature columns
    :param path: directory containing source_df
    :param nrows: number of rows to read (None reads all)
    :return: df with card_id, one '<prefix>_<field>_<level>' count column per
        distinct level (NaNs bucketed under level -100), and
        '<prefix>_max_txn_<field>' holding the level with the most transactions
    """
    _log.info("Creating features from {}".format(field))
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df, usecols=["card_id", field], nrows=nrows)
    except Exception as e:
        # Re-raise after logging: continuing would only hit a NameError on
        # `df` below and mask the real failure.
        _log.exception(e)
        raise
    _log.info("Successfully read from {}".format(source_df))
    # Sentinel level for missing values so NaNs get their own column.
    df.loc[pd.isnull(df[field]), field] = -100
    keys = pd.Series(list(set(df[field])))
    vals = keys.apply(lambda x: "{}_{}_{}".format(prefix, field, x))
    rename_dict = dict(zip(keys, vals))
    _log.info("Rename dict: {}".format(rename_dict))
    df["dummy"] = 1
    # Count transactions per (card, level) and pivot levels to columns.
    df_agg = (df.groupby(["card_id", field]).agg({
        "dummy": np.sum
    }).reset_index().pivot_table(index="card_id",
                                 columns=field,
                                 values="dummy",
                                 fill_value=0).reset_index())
    # idxmax runs before the rename, so the new column holds raw level values.
    field_name = prefix + "_max_txn_" + str(field)
    df_agg[field_name] = df_agg.drop(columns="card_id").idxmax(axis=1)
    df_agg.rename(columns=rename_dict, inplace=True)
    _log.info("Successfully computed feature")
    return df_agg
def get_unique_count_and_mode(source_df, field, path=path.path, nrows=None):
    """
    Returns number of levels and mode for a given field, per card.

    :param source_df: file name of historical/new transactions csv; the text
        before the first underscore is used as the feature-name prefix
    :param field: feature to be extracted from
    :param path: directory containing source_df
    :param nrows: number of rows to read (None reads all)
    :return: df with card_id, the field's modal value, its transaction count
        ('<prefix>_max_count_<field>'), a tie-break 'rank' column (always 1),
        and the distinct-level count ('<prefix>_unique_<field>')
    """
    _log.info("Creating features from {}".format(field))
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df, usecols=["card_id", field], nrows=nrows)
    except Exception as e:
        # Re-raise after logging: continuing would only hit a NameError on
        # `df` below and mask the real failure.
        _log.exception(e)
        raise
    _log.info("Successfully read from {}".format(source_df))
    # Impute missing values with the global mode before per-card stats.
    if pd.isnull(df[field]).any():
        field_mode = df[field].mode()[0]
        df.loc[pd.isnull(df[field]), field] = field_mode
    _log.info("Fetching no. of distinct merchants transacted on")
    df_uniq = df.groupby("card_id").agg({
        field: pd.Series.nunique
    }).reset_index().rename(
        columns={field: "{}_{}_{}".format(prefix, "unique", field)})
    _log.info("Computing mode of merchants")
    # Fix: the original aggregated with np.count_nonzero, which undercounts
    # any level whose value is 0/empty; group *size* is the intended count.
    df_agg = df.groupby(["card_id", field]).size().reset_index(name="count")
    df_max = df_agg.groupby("card_id").agg({"count": np.max}).reset_index()
    df_max = df_agg.merge(df_max, how="inner",
                          on=["card_id", "count"
                              ]).drop_duplicates().reset_index(drop=True)
    # Break ties deterministically: keep only the first modal level per card.
    df_max['rank'] = df_max.groupby(['card_id']).cumcount() + 1
    df_max = df_max[df_max['rank'] == 1]
    df = df_max.merge(df_uniq, on="card_id", how="inner")
    df.rename(
        columns={"count": "{}_{}_{}".format(prefix, "max_count", field)},
        inplace=True)
    _log.info("Succesfully computed mode and levels for {}".format(field))
    return df
def purchase_date_features_df_gen(source_df):
    """
    Create preliminary date columns used to derive purchase-date features.

    Reads card_id and purchase_date, derives time_of_day, day_of_week, month
    and date columns via the time_fragments helpers, and writes the result to
    path.feature_path as '<prefix>_purchase_date_features_raw.csv'.

    :param source_df: file name of historical/new transactions csv; the text
        before the first underscore is used as the output-file prefix
    :return: file name (not full path) of the written csv
    """
    _log.info(
        "Creating necessary columns for extracting purchase date features")
    prefix = source_df.split("_")[0]
    source_df = "{}/{}".format(path.path, source_df)
    _log.info("Reading from {}".format(source_df))
    try:
        df = pd.read_csv(source_df,
                         usecols=["card_id", "purchase_date"],
                         dtype=dtypes_specifier.dtypes,
                         parse_dates=dtypes_specifier.parse_dates)
    except Exception as e:
        # Re-raise after logging: continuing would only hit a NameError on
        # `df` below and mask the real failure.
        _log.exception(e)
        raise
    _log.info("Creating feature: {}".format("time_of_day"))
    df["time_of_day"] = df["purchase_date"].apply(
        time_fragments.time_of_day_fragment)
    _log.info("Creating feature: {}".format("day_of_week"))
    df["day_of_week"] = df["purchase_date"].apply(time_fragments.day_of_week)
    _log.info("Creating feature: {}".format("month"))
    df["month"] = df["purchase_date"].apply(time_fragments.month_from_date)
    _log.info("Creating feature: {}".format("date"))
    # assumes purchase_date was parsed to datetime via parse_dates — TODO confirm
    df["date"] = df["purchase_date"].apply(lambda x: x.date())
    written_file_name = "{}_{}.csv".format(prefix,
                                           "purchase_date_features_raw")
    written_file = "{}/{}_{}.csv".format(path.feature_path, prefix,
                                         "purchase_date_features_raw")
    _log.info("Writing file {}".format(written_file))
    # NOTE(review): index column is written too (default index=True);
    # downstream readers appear to tolerate the extra column — confirm.
    df.to_csv(written_file)
    _log.info("Process successfully completed")
    return written_file_name