def df_encoder(state='TX', sel_feature='primary_merchant_name'): ''' Returns ------- [df, df_null, embedding_maps[sel_feature]] ''' connection = create_connection(db_name=acc.YDB_name, db_user=acc.YDB_user, db_password=acc.YDB_password, db_host=acc.YDB_host, db_port=acc.YDB_port) states = [ 'NJ', 'AL', 'WA', 'NM', 'TX', 'AR', 'NY', 'CT', 'MI', 'CO', 'CA', 'PA', 'IN', 'OK', 'MD', 'AK', 'VA', 'GA', 'NC', 'TN', 'OH', 'IL', 'FL', 'AZ', 'DC', 'LA', 'KY', 'KS', 'IA', 'SC', 'WI', 'DE', 'HI', 'MT', 'MO', 'NV', 'ID', 'MN', 'MS', 'OR', 'UT', 'NH', 'MA', 'WV', 'NE', 'ND', 'RI', 'VT', 'WY', 'ME', 'SD', 'PR', 'GU' ] fields = [ 'unique_mem_id', 'unique_bank_account_id', 'unique_bank_transaction_id', 'amount', 'currency', 'description', 'transaction_base_type', 'transaction_category_name', 'primary_merchant_name', 'city', 'state', 'transaction_origin', 'optimized_transaction_date', 'account_type', 'account_source_type', 'account_score', 'user_score', 'panel_file_created_date' ] try: if state in states: filter_query = f"SELECT {', '.join(field for field in fields)} \ FROM bank_record WHERE unique_mem_id IN ( \ SELECT unique_mem_id \ FROM user_demographic \ WHERE state = '{state}' LIMIT 1)" transaction_query = execute_read_query(connection, filter_query) df = pd.DataFrame(transaction_query, columns=fields) print(f"{len(df)} transactions for random user in state: {state}.") except OperationalError as e: print(f"The error '{e}' occurred") connection.rollback # first conversion from date to datetime objects; then conversion to unix df['panel_file_created_date'] = pd.to_datetime( df['panel_file_created_date']) df['optimized_transaction_date'] = pd.to_datetime( df['optimized_transaction_date']) # set optimized transaction_date as index for later df.set_index('optimized_transaction_date', drop=False, inplace=True) ''' After successfully loading the data, columns that are of no importance have been removed and missing values replaced Then the dataframe is ready to be encoded to get rid of all non-numerical data ''' try: # Include below if need unique ID's later: df['unique_mem_id'] = df['unique_mem_id'].astype('str', errors='ignore') df['unique_bank_account_id'] = df['unique_bank_account_id'].astype( 'str', errors='ignore') df['unique_bank_transaction_id'] = df[ 'unique_bank_transaction_id'].astype('str', errors='ignore') df['amount'] = df['amount'].astype('float64') df['transaction_base_type'] = df['transaction_base_type'].replace( to_replace=["debit", "credit"], value=[1, 0]) except (TypeError, OSError, ValueError) as e: print(f"Problem with conversion: {e}") # attempt to convert date objects to unix timestamps as numeric value (fl64) if they have no missing values; otherwise they are being dropped date_features = ['optimized_transaction_date', 'panel_file_created_date'] try: for feature in date_features: if df[feature].isnull().sum() == 0: df[feature] = df[feature].apply(lambda x: dt.timestamp(x)) else: df = df.drop(columns=feature, axis=1) print(f"Column {feature} dropped") except (TypeError, OSError, ValueError) as e: print(f"Problem with conversion: {e}") ''' The columns PRIMARY_MERCHANT_NAME; CITY, STATE, DESCRIPTION, TRANSACTION_CATEGORY_NAME, CURRENCY are encoded manually and cleared of empty values ''' encoding_features = [ 'primary_merchant_name', 'city', 'state', 'description', 'transaction_category_name', 'transaction_origin', 'currency' ] # dropping currency if there is only one if len(df['currency'].value_counts()) == 1: df = df.drop(columns=['currency'], axis=1) encoding_features.remove('currency') UNKNOWN_TOKEN = '<unknown>' embedding_maps = {} for feature in encoding_features: unique_list = df[feature].unique().astype('str').tolist() unique_list.append(UNKNOWN_TOKEN) le = LabelEncoder() le.fit_transform(unique_list) embedding_maps[feature] = dict( zip(le.classes_, le.transform(le.classes_))) # APPLICATION TO OUR DATASET df[feature] = df[feature].apply( lambda x: x if x in embedding_maps[feature] else UNKNOWN_TOKEN) df[feature] = df[feature].map(lambda x: le.transform([x])[0] if type(x) == str else x) #drop all features left with empty (NaN) values df = df.dropna() # extract rows with empty value of selected feature into own db df_null = df[df[sel_feature] == 0] df = df[df[sel_feature] != 0] return [df, df_null, embedding_maps[sel_feature]]
def db_insert_section(section=1): connection = create_connection(db_name=acc.YDB_name, db_user=acc.YDB_user, db_password=acc.YDB_password, db_host=acc.YDB_host, db_port=acc.YDB_port) fields = [ 'unique_mem_id', 'unique_bank_account_id', 'unique_bank_transaction_id', 'amount', 'currency', 'description', 'transaction_base_type', 'transaction_category_name', 'primary_merchant_name', 'city', 'state', 'transaction_origin', 'optimized_transaction_date', 'account_type', 'account_source_type', 'account_score', 'user_score', 'panel_file_created_date' ] try: filter_query = f"(SELECT {', '.join(field for field in fields)} FROM card_record \ WHERE unique_mem_id IN (SELECT unique_mem_id FROM user_demographic\ ORDER BY unique_mem_id ASC LIMIT 10000 OFFSET\ {10000*(section-1)})) UNION ALL (SELECT {', '.join(field for field in fields)} \ FROM bank_record WHERE unique_mem_id IN (SELECT unique_mem_id FROM \ user_demographic ORDER BY unique_mem_id ASC LIMIT 10000 OFFSET {10000*(section-1)}))" transaction_query = execute_read_query(connection, filter_query) main_df = pd.DataFrame(transaction_query, columns=fields) print(f"{len(transaction_query)} transactions.") except OperationalError as e: print(f"The error '{e}' occurred") connection.rollback for num, user in enumerate(main_df.groupby('unique_mem_id')): print( f"user {user[0]}, {num+1}/10000 users, {round(((num+1)/10000)*100, 2)}%." ) # running functions in order df, df_null, embedding_map = df_encoder( df=main_df, sel_feature='primary_merchant_name') X_features, X_test, X_train, y_test, y_train = split_data( full_df=df, null_df=df_null) grid_search_svc = svc_class(X_train, X_test, y_train, y_test) prediction_df, predictions = add_pred(grid_search=grid_search_svc, X_features=X_features, label='primary_merchant_name') merchants = [] for i in predictions: for val, enc in embedding_map.items(): if enc in predictions: merchants.append(val) else: merchants.append("unseen in training") merchants_short = merchants[:len(predictions)] # attach merchants to prediction_df; list will be turned into df column prediction_df = prediction_df.assign(merchants=merchants_short) card_ess = ['Rewards', 'Transfers', 'Refunds/Adjustments', 'Gifts'] card_non_ess = [ 'Groceries', 'Automotive/Fuel', 'Home Improvement', 'Travel', 'Restaurants', 'Healthcare/Medical', 'Credit Card Payments', 'Electronics/General Merchandise', 'Entertainment/Recreation', 'Postage/Shipping', 'Other Expenses', 'Personal/Family', 'Service Charges/Fees', 'Services/Supplies', 'Utilities', 'Office Expenses', 'Cable/Satellite/Telecom', 'Subscriptions/Renewals', 'Insurance' ] bank_ess = [ 'Deposits', 'Salary/Regular Income', 'Transfers', 'Investment/Retirement Income', 'Rewards', 'Other Income', 'Refunds/Adjustments', 'Interest Income', 'Gifts', 'Expense Reimbursement' ] bank_non_ess = [ 'Service Charges/Fees', 'Credit Card Payments', 'Utilities', 'Healthcare/Medical', 'Loans', 'Check Payment', 'Electronics/General Merchandise', 'Groceries', 'Automotive/Fuel', 'Restaurants', 'Personal/Family', 'Entertainment/Recreation', 'Services/Supplies', 'Other Expenses', 'ATM/Cash Withdrawals', 'Cable/Satellite/Telecom', 'Postage/Shipping', 'Insurance', 'Travel', 'Taxes', 'Home Improvement', 'Education', 'Charitable Giving', 'Subscriptions/Renewals', 'Rent', 'Office Expenses', 'Mortgage' ] # Iterate through rows and mark transactions try: tr_class = pd.Series([], dtype='object') for index, i in enumerate( prediction_df['transaction_category_name']): if i in bank_ess: tr_class[index] = "essential" elif i in card_ess: tr_class[index] = "essential" elif i in bank_non_ess: tr_class[index] = "non_essential" elif i in card_non_ess: tr_class[index] = "non_essential" else: tr_class[index] = "unknown" prediction_df = prediction_df.assign( tr_essentiality=tr_class.values) except BaseException as error: print(f"column is already existing or following: {error}") cash_env = [ 'Rewards', 'Transfers', 'Refunds/Adjustments', 'Interest Income', 'Restaurants', 'Electronics/General Merchandise', 'Entertainment/Recreation', 'Postage/Shipping', 'Other Expenses', 'Other Income', 'Expense Reimbursement', 'Personal/Family', 'Travel', 'Office Expenses', 'Deposits', 'Salary/Regular Income', 'Investment/Retirement Income', 'ATM/Cash Withdrawals' ] bill_env = [ 'Check Payment', 'Rent', 'Mortgage', 'Subscriptions/Renewals', 'Healthcare/Medical', 'Credit Card Payments', 'Service Charges/Fees', 'Services/Supplies', 'Utilities', 'Insurance', 'Taxes', 'Home Improvement', 'Cable/Satellite/Telecom' ] car_env = ['Automotive/Fuel'] gro_env = ['Groceries'] chari_env = ['Charitable Giving', 'Gifts'] #iterate through rows and create a new columns with a note that it is either an expense or income try: envel_cat = pd.Series([], dtype='object') for index, i in enumerate( prediction_df['transaction_category_name']): if i in cash_env: envel_cat[index] = "cash" elif i in bill_env: envel_cat[index] = "bill" elif i in car_env: envel_cat[index] = "automobile" elif i in gro_env: envel_cat[index] = "groceries" elif i in chari_env: envel_cat[index] = "charity" else: envel_cat[index] = "not_classified" prediction_df = prediction_df.assign( envelope_category=envel_cat.values) except BaseException as error: print(f"envelope column is already existing or following {error}") db_name = "postgres" db_user = "******" db_pw = "envel" db_host = "0.0.0.0" db_port = "5432" ''' Always use %s placeholder for queries; psycopg2 will convert most data automatically For special cases or conversion problems use adapters or "AsIs" Enclose the tuples in square brackets or leave without square brackets (no performance diff) Dataframes have to be split up into tuples and passed as list ''' try: connection = create_connection(db_name=db_name, db_user=db_user, db_password=db_pw, db_host=db_host, db_port=db_port) print("Sesson_1\n-------------") cursor = connection.cursor() sql_insert_query = """ INSERT INTO test (test_col, test_col_2, test_col_3, test_col_4, test_col_5, test_col_6, test_col_7, test_col_8, test_col_9, test_col_10, test_col_11, test_col_12, test_col_13, test_col_14, test_col_15, test_col_16, test_col_17, test_col_18, test_col_19, test_col_20) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); """ for i in prediction_df.itertuples(): # executemany() to insert multiple rows rows # one-element-tuple with (i, ) cursor.execute(sql_insert_query, ([ i.unique_mem_id, i.unique_bank_account_id, i.unique_bank_transaction_id, i.amount, i.description, i.transaction_base_type, i.transaction_category_name, i.city, i.state, i.transaction_origin, i.optimized_transaction_date, i.account_type, i.account_source_type, i.account_score, i.user_score, i.panel_file_created_date, i.primary_merchant_name, i.merchants, i.tr_essentiality, i.envelope_category ])) connection.commit() print(len(prediction_df), "record(s) inserted successfully.") except (Exception, psycopg2.Error) as error: print("Failed inserting record; {}".format(error)) finally: # closing database connection. if (connection): cursor.close() connection.close() print("Operation completed.\nPostgreSQL connection is closed.") print("=========================") return 'insert done'
def pull_df(rng=4, spending_report=False, plots=False): ''' Parameters ---------- rng : int, Random Seed for user picker. The default is 4. spending_report : bool, Save a spending report in directory if True. Default is False. plots : bool, Plots various graphs if True. Default is False. Returns ------- df. ''' connection = create_connection(db_name=acc.YDB_name, db_user=acc.YDB_user, db_password=acc.YDB_password, db_host=acc.YDB_host, db_port=acc.YDB_port) # establish connection to get user IDs for all users in MA filter_query = f"SELECT unique_mem_id, state, city, income_class FROM user_demographic WHERE state = 'MA'" transaction_query = execute_read_query(connection, filter_query) query_df = pd.DataFrame( transaction_query, columns=['unique_mem_id', 'state', 'city', 'income_class']) try: for i in pd.Series(query_df['unique_mem_id'].unique()).sample( n=1, random_state=rng): filter_query = f"SELECT * FROM bank_record WHERE unique_mem_id = '{i}'" transaction_query = execute_read_query(connection, filter_query) df = pd.DataFrame( transaction_query, columns=[ 'unique_mem_id', 'unique_bank_account_id', 'unique_bank_transaction_id', 'amount', 'currency', 'description', 'transaction_date', 'post_date', 'transaction_base_type', 'transaction_category_name', 'primary_merchant_name', 'secondary_merchant_name', 'city', 'state', 'zip_code', 'transaction_origin', 'factual_category', 'factual_id', 'file_created_date', 'optimized_transaction_date', 'yodlee_transaction_status', 'mcc_raw', 'mcc_inferred', 'swipe_date', 'panel_file_created_date', 'update_type', 'is_outlier', 'change_source', 'account_type', 'account_source_type', 'account_score', 'user_score', 'lag', 'is_duplicate' ]) print(f"User {i} has {len(df)} transactions on record.") # all these columns are empty or almost empty and contain no viable information df = df.drop(columns=[ 'secondary_merchant_name', 'swipe_date', 'update_type', 'is_outlier', 'is_duplicate', 'change_source', 'lag', 'mcc_inferred', 'mcc_raw', 'factual_id', 'factual_category', 'zip_code', 'yodlee_transaction_status', 'file_created_date', 'panel_file_created_date', 'account_source_type', 'account_type', 'account_score', 'user_score', 'post_date', 'transaction_date' ], axis=1) except OperationalError as e: print(f"The error '{e}' occurred") connection.rollback ''' Plotting of various relations The Counter object keeps track of permutations in a dictionary which can then be read and used as labels ''' if plots: # Pie chart States state_ct = Counter(list(df['state'])) # The * operator can be used in conjunction with zip() to unzip the list. labels, values = zip(*state_ct.items()) # Pie chart, where the slices will be ordered and plotted counter-clockwise: fig1, ax = plt.subplots(figsize=(20, 12)) ax.pie(values, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90) # Equal aspect ratio ensures that pie is drawn as a circle. ax.axis('equal') #ax.title('Transaction locations of user {df[unique_mem_id][0]}') ax.legend(loc='center right') plt.show() # Pie chart transaction type trans_ct = Counter(list(df['transaction_category_name'])) # The * operator can be used in conjunction with zip() to unzip the list. labels_2, values_2 = zip(*trans_ct.items()) #Pie chart, where the slices will be ordered and plotted counter-clockwise: fig1, ax = plt.subplots(figsize=(20, 12)) ax.pie(values_2, labels=labels_2, autopct='%1.1f%%', shadow=True, startangle=90) # Equal aspect ratio ensures that pie is drawn as a circle. ax.axis('equal') ax.title("Transaction categories") ax.legend(loc='center right') plt.show() ''' Generate a spending report of the unaltered dataframe Use the datetime columns just defined This report measures either the sum or mean of transactions happening on various days of the week/or wihtin a week or a month over the course of the year ''' df['optimized_transaction_date'] = pd.to_datetime( df['optimized_transaction_date']) # set optimized transaction_date as index for later df.set_index('optimized_transaction_date', drop=False, inplace=True) df = df.drop(['unique_bank_account_id', 'unique_bank_transaction_id'], axis=1) # generate the spending report with the above randomly picked user ID if spending_report: create_spending_report(df=df.copy()) return df
def df_encoder(rng=4, spending_report=False, plots=False, include_lag_features=True): ''' Parameters ---------- rng : int, Random Seed for user picker. The default is 4. spending_report : bool, Save a spending report in directory if True. Default is False. plots : bool, Plots various graphs if True. Default is False. include_lag_features : include lag feature 'amount' to database with 3, 7, and 30 day rolls. Default is True Returns ------- bank_df. ''' connection = create_connection(db_name=acc.YDB_name, db_user=acc.YDB_user, db_password=acc.YDB_password, db_host=acc.YDB_host, db_port=acc.YDB_port) # establish connection to get user IDs for all users in MA filter_query = f"SELECT unique_mem_id, state, city, income_class FROM user_demographic WHERE state = 'MA'" transaction_query = execute_read_query(connection, filter_query) query_df = pd.DataFrame( transaction_query, columns=['unique_mem_id', 'state', 'city', 'income_class']) # dateframe to gather bank data from one randomly chosen user # test user 1= 4 # test user 2= 8 try: for i in pd.Series(query_df['unique_mem_id'].unique()).sample( n=1, random_state=rng): print(i) filter_query = f"SELECT * FROM bank_record WHERE unique_mem_id = '{i}'" transaction_query = execute_read_query(connection, filter_query) bank_df = pd.DataFrame( transaction_query, columns=[ 'unique_mem_id', 'unique_bank_account_id', 'unique_bank_transaction_id', 'amount', 'currency', 'description', 'transaction_date', 'post_date', 'transaction_base_type', 'transaction_category_name', 'primary_merchant_name', 'secondary_merchant_name', 'city', 'state', 'zip_code', 'transaction_origin', 'factual_category', 'factual_id', 'file_created_date', 'optimized_transaction_date', 'yodlee_transaction_status', 'mcc_raw', 'mcc_inferred', 'swipe_date', 'panel_file_created_date', 'update_type', 'is_outlier', 'change_source', 'account_type', 'account_source_type', 'account_score', 'user_score', 'lag', 'is_duplicate' ]) print(f"User {i} has {len(bank_df)} transactions on record.") #all these columns are empty or almost empty and contain no viable information bank_df = bank_df.drop(columns=[ 'secondary_merchant_name', 'swipe_date', 'update_type', 'is_outlier', 'is_duplicate', 'change_source', 'lag', 'mcc_inferred', 'mcc_raw', 'factual_id', 'factual_category', 'zip_code', 'yodlee_transaction_status', 'file_created_date', 'panel_file_created_date', 'account_type', 'account_source_type', 'account_score' ], axis=1) except OperationalError as e: print(f"The error '{e}' occurred") connection.rollback ''' Plotting of various relations The Counter object keeps track of permutations in a dictionary which can then be read and used as labels ''' if plots: # Pie chart States state_ct = Counter(list(bank_df['state'])) # The * operator can be used in conjunction with zip() to unzip the list. labels, values = zip(*state_ct.items()) # Pie chart, where the slices will be ordered and plotted counter-clockwise: fig1, ax = plt.subplots(figsize=(20, 12)) ax.pie(values, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90) # Equal aspect ratio ensures that pie is drawn as a circle. ax.axis('equal') #ax.title('Transaction locations of user {bank_df[unique_mem_id][0]}') ax.legend(loc='center right') plt.show() # Pie chart transaction type trans_ct = Counter(list(bank_df['transaction_category_name'])) # The * operator can be used in conjunction with zip() to unzip the list. labels_2, values_2 = zip(*trans_ct.items()) #Pie chart, where the slices will be ordered and plotted counter-clockwise: fig1, ax = plt.subplots(figsize=(20, 12)) ax.pie(values_2, labels=labels_2, autopct='%1.1f%%', shadow=True, startangle=90) # Equal aspect ratio ensures that pie is drawn as a circle. ax.axis('equal') #ax.title('Transaction categories of user {bank_df[unique_mem_id][0]}') ax.legend(loc='center right') plt.show() ''' Generate a spending report of the unaltered dataframe Use the datetime columns just defined This report measures either the sum or mean of transactions happening on various days of the week/or wihtin a week or a month over the course of the year ''' # convert all date col from date to datetime objects # date objects will block Select K Best if not converted # first conversion from date to datetime objects; then conversion to unix bank_df['post_date'] = pd.to_datetime(bank_df['post_date']) bank_df['transaction_date'] = pd.to_datetime(bank_df['transaction_date']) bank_df['optimized_transaction_date'] = pd.to_datetime( bank_df['optimized_transaction_date']) bank_df['file_created_date'] = pd.to_datetime(bank_df['file_created_date']) bank_df['panel_file_created_date'] = pd.to_datetime( bank_df['panel_file_created_date']) # set optimized transaction_date as index for later bank_df.set_index('optimized_transaction_date', drop=False, inplace=True) # generate the spending report with the above randomly picked user ID if spending_report: create_spending_report(df=bank_df.copy()) ''' After successfully loading the data, columns that are of no importance have been removed and missing values replaced Then the dataframe is ready to be encoded to get rid of all non-numerical data ''' try: # Include below if need unique ID's later: # bank_df['unique_mem_id'] = bank_df['unique_mem_id'].astype( # 'str', errors='ignore') # bank_df['unique_bank_account_id'] = bank_df['unique_bank_account_id'].astype( # 'str', errors='ignore') # bank_df['unique_bank_transaction_id'] = bank_df['unique_bank_transaction_id'].astype( # 'str', errors='ignore') bank_df['amount'] = bank_df['amount'].astype('float64') bank_df['transaction_base_type'] = bank_df[ 'transaction_base_type'].replace(to_replace=["debit", "credit"], value=[1, 0]) except (TypeError, OSError, ValueError) as e: print(f"Problem with conversion: {e}") # attempt to convert date objects to unix timestamps as numeric value (fl64) if they have no missing values; otherwise they are being dropped date_features = [ 'post_date', 'transaction_date', 'optimized_transaction_date', 'file_created_date', 'panel_file_created_date' ] try: for feature in date_features: if bank_df[feature].isnull().sum() == 0: bank_df[feature] = bank_df[feature].apply( lambda x: dt.timestamp(x)) else: bank_df = bank_df.drop(columns=feature, axis=1) print(f"Column {feature} dropped") except (TypeError, OSError, ValueError) as e: print(f"Problem with conversion: {e}") ''' The columns PRIMARY_MERCHANT_NAME; CITY, STATE, DESCRIPTION, TRANSACTION_CATEGORY_NAME, CURRENCY are encoded manually and cleared of empty values ''' encoding_features = [ 'primary_merchant_name', 'city', 'state', 'description', 'transaction_category_name', 'transaction_origin', 'currency' ] UNKNOWN_TOKEN = '<unknown>' embedding_maps = {} for feature in encoding_features: unique_list = bank_df[feature].unique().astype('str').tolist() unique_list.append(UNKNOWN_TOKEN) le = LabelEncoder() le.fit_transform(unique_list) embedding_maps[feature] = dict( zip(le.classes_, le.transform(le.classes_))) # APPLICATION TO OUR DATASET bank_df[feature] = bank_df[feature].apply( lambda x: x if x in embedding_maps[feature] else UNKNOWN_TOKEN) bank_df[feature] = bank_df[feature].map(lambda x: le.transform([x])[0] if type(x) == str else x) # dropping currency if there is only one if len(bank_df['currency'].value_counts()) == 1: bank_df = bank_df.drop(columns=['currency'], axis=1) ''' IMPORTANT The lagging features produce NaN for the first two rows due to unavailability of values NaNs need to be dropped to make scaling and selection of features working ''' if include_lag_features: #FEATURE ENGINEERING #typical engineered features based on lagging metrics #mean + stdev of past 3d/7d/30d/ + rolling volume date_index = bank_df.index.values bank_df.reset_index(drop=True, inplace=True) #pick lag features to iterate through and calculate features lag_features = ["amount"] #set up time frames; how many days/months back/forth t1 = 3 t2 = 7 t3 = 30 #rolling values for all columns ready to be processed bank_df_rolled_3d = bank_df[lag_features].rolling(window=t1, min_periods=0) bank_df_rolled_7d = bank_df[lag_features].rolling(window=t2, min_periods=0) bank_df_rolled_30d = bank_df[lag_features].rolling(window=t3, min_periods=0) #calculate the mean with a shifting time window bank_df_mean_3d = bank_df_rolled_3d.mean().shift( periods=1).reset_index().astype(np.float32) bank_df_mean_7d = bank_df_rolled_7d.mean().shift( periods=1).reset_index().astype(np.float32) bank_df_mean_30d = bank_df_rolled_30d.mean().shift( periods=1).reset_index().astype(np.float32) #calculate the std dev with a shifting time window bank_df_std_3d = bank_df_rolled_3d.std().shift( periods=1).reset_index().astype(np.float32) bank_df_std_7d = bank_df_rolled_7d.std().shift( periods=1).reset_index().astype(np.float32) bank_df_std_30d = bank_df_rolled_30d.std().shift( periods=1).reset_index().astype(np.float32) for feature in lag_features: bank_df[f"{feature}_mean_lag{t1}"] = bank_df_mean_3d[feature] bank_df[f"{feature}_mean_lag{t2}"] = bank_df_mean_7d[feature] bank_df[f"{feature}_mean_lag{t3}"] = bank_df_mean_30d[feature] bank_df[f"{feature}_std_lag{t1}"] = bank_df_std_3d[feature] bank_df[f"{feature}_std_lag{t2}"] = bank_df_std_7d[feature] bank_df[f"{feature}_std_lag{t3}"] = bank_df_std_30d[feature] bank_df.set_index(date_index, drop=False, inplace=True) #drop all features left with empty (NaN) values bank_df = bank_df.dropna() #drop user IDs to avoid overfitting with useless information bank_df = bank_df.drop([ 'unique_mem_id', 'unique_bank_account_id', 'unique_bank_transaction_id' ], axis=1) if plots: # seaborn plots ax_desc = bank_df['description'].astype('int64', errors='ignore') ax_amount = bank_df['amount'].astype('int64', errors='ignore') sns.pairplot(bank_df) sns.boxplot(x=ax_desc, y=ax_amount) sns.heatmap(bank_df) return bank_df