def list_insert(values):
    """
    Insert every element of ``values`` as one row into ``test.test_col_2``.

    The connection settings are read from the module-level names
    ``db_name``, ``db_user``, ``db_pw``, ``db_host`` and ``db_port``.

    Parameters
    ----------
    values : list/array/iterable
        Values to be inserted into the databank; each element becomes one row.

    Returns
    -------
    str
        Always 'done'. Failures are printed, not raised.
    """
    # Both handles start as None so that the cleanup in `finally` is safe
    # even when the connection could not be established.
    connection = None
    cursor = None
    try:
        # Materialise once: plain iterators/generators have no len().
        rows = [(value, ) for value in values]
        connection = create_connection(db_name=db_name,
                                       db_user=db_user,
                                       db_password=db_pw,
                                       db_host=db_host,
                                       db_port=db_port)
        print("-------------")
        cursor = connection.cursor()
        sql_insert_query = """
        INSERT INTO test (test_col_2)
        VALUES (%s);
        """
        # one-element tuple per row; psycopg2 fills the %s placeholder
        cursor.executemany(sql_insert_query, rows)
        connection.commit()
        print(len(rows), "record(s) inserted successfully. \n ------------")
    except Exception as error:
        # psycopg2.Error derives from Exception, so it is covered here too
        print("Failed inserting record {}".format(error))
    finally:
        # closing database connection.
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()
            print("Operation accomplished.\nPostgreSQL connection is closed.")
    print("================")
    return 'done'
def df_encoder(state='TX', sel_feature='primary_merchant_name'):
    '''
    Pull the bank transactions of one user living in ``state`` and
    label-encode the categorical columns.

    Parameters
    ----------
    state : str
        Two-letter state/territory code; must be one of the codes listed in
        ``states`` below. The default is 'TX'.
    sel_feature : str
        Encoded column used to split the result into two frames.
        The default is 'primary_merchant_name'.

    Returns
    -------
    [df, df_null, embedding_maps[sel_feature]]
        df : rows whose encoded ``sel_feature`` is not 0.
        df_null : rows whose encoded ``sel_feature`` equals 0.
        embedding_maps[sel_feature] : dict mapping original value -> code.

    Raises
    ------
    ValueError
        If ``state`` is not one of the supported codes.
    OperationalError
        Re-raised (after a rollback) when the read query fails.
    '''
    states = [
        'NJ', 'AL', 'WA', 'NM', 'TX', 'AR', 'NY', 'CT', 'MI', 'CO', 'CA', 'PA',
        'IN', 'OK', 'MD', 'AK', 'VA', 'GA', 'NC', 'TN', 'OH', 'IL', 'FL', 'AZ',
        'DC', 'LA', 'KY', 'KS', 'IA', 'SC', 'WI', 'DE', 'HI', 'MT', 'MO', 'NV',
        'ID', 'MN', 'MS', 'OR', 'UT', 'NH', 'MA', 'WV', 'NE', 'ND', 'RI', 'VT',
        'WY', 'ME', 'SD', 'PR', 'GU'
    ]
    fields = [
        'unique_mem_id', 'unique_bank_account_id',
        'unique_bank_transaction_id', 'amount', 'currency', 'description',
        'transaction_base_type', 'transaction_category_name',
        'primary_merchant_name', 'city', 'state', 'transaction_origin',
        'optimized_transaction_date', 'account_type', 'account_source_type',
        'account_score', 'user_score', 'panel_file_created_date'
    ]

    # Validate before opening a connection: the code is interpolated into
    # the SQL text, so only whitelisted values may pass. Previously an
    # unknown state skipped the query and crashed later on an unbound `df`.
    if state not in states:
        raise ValueError(f"Unsupported state code: {state!r}")

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)
    try:
        filter_query = (f"SELECT {', '.join(fields)} "
                        "FROM bank_record WHERE unique_mem_id IN ( "
                        "SELECT unique_mem_id "
                        "FROM user_demographic "
                        f"WHERE state = '{state}' LIMIT 1)")
        transaction_query = execute_read_query(connection, filter_query)
        df = pd.DataFrame(transaction_query, columns=fields)
        print(f"{len(df)} transactions for random user in state: {state}.")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback()
        # without data nothing below can work; surface the real error
        raise
    finally:
        connection.close()

    # first conversion from date to datetime objects; then conversion to unix
    df['panel_file_created_date'] = pd.to_datetime(
        df['panel_file_created_date'])
    df['optimized_transaction_date'] = pd.to_datetime(
        df['optimized_transaction_date'])
    # set optimized transaction_date as index for later
    df.set_index('optimized_transaction_date', drop=False, inplace=True)

    # After loading, IDs are cast to strings, the amount to float and the
    # transaction base type to 1 (debit) / 0 (credit), so that the frame can
    # be encoded to get rid of all non-numerical data.
    try:
        # Include below if need unique ID's later:
        df['unique_mem_id'] = df['unique_mem_id'].astype('str',
                                                         errors='ignore')
        df['unique_bank_account_id'] = df['unique_bank_account_id'].astype(
            'str', errors='ignore')
        df['unique_bank_transaction_id'] = df[
            'unique_bank_transaction_id'].astype('str', errors='ignore')
        df['amount'] = df['amount'].astype('float64')
        df['transaction_base_type'] = df['transaction_base_type'].replace(
            to_replace=["debit", "credit"], value=[1, 0])
    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")

    # attempt to convert date objects to unix timestamps as numeric value
    # (fl64) if they have no missing values; otherwise they are being dropped
    date_features = ['optimized_transaction_date', 'panel_file_created_date']
    try:
        for feature in date_features:
            if df[feature].isnull().sum() == 0:
                df[feature] = df[feature].apply(dt.timestamp)
            else:
                df = df.drop(columns=feature)
                print(f"Column {feature} dropped")
    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")

    # The columns PRIMARY_MERCHANT_NAME, CITY, STATE, DESCRIPTION,
    # TRANSACTION_CATEGORY_NAME, TRANSACTION_ORIGIN and CURRENCY are encoded
    # manually and cleared of empty values.
    encoding_features = [
        'primary_merchant_name', 'city', 'state', 'description',
        'transaction_category_name', 'transaction_origin', 'currency'
    ]
    # dropping currency if there is only one
    if df['currency'].nunique() == 1:
        df = df.drop(columns=['currency'])
        encoding_features.remove('currency')

    UNKNOWN_TOKEN = '<unknown>'
    embedding_maps = {}
    for feature in encoding_features:
        unique_list = df[feature].unique().astype('str').tolist()
        unique_list.append(UNKNOWN_TOKEN)
        le = LabelEncoder()
        le.fit(unique_list)
        mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        embedding_maps[feature] = mapping
        unknown_code = mapping[UNKNOWN_TOKEN]

        # APPLICATION TO OUR DATASET
        # Strings get their own code; anything else (None, NaN, numbers)
        # gets the code of the unknown token. A dict lookup replaces the
        # former per-row le.transform() call with the same result.
        df[feature] = df[feature].map(
            lambda x, mapping=mapping, unknown_code=unknown_code:
            mapping[x] if isinstance(x, str) and x in mapping
            else unknown_code)

    # drop all features left with empty (NaN) values
    df = df.dropna()

    # extract rows with empty value of selected feature into own db
    # NOTE(review): this treats code 0 as "empty"; LabelEncoder assigns 0 to
    # the first class in sorted order - confirm that is the empty value.
    df_null = df[df[sel_feature] == 0]
    df = df[df[sel_feature] != 0]

    return [df, df_null, embedding_maps[sel_feature]]
# Connection settings of the target PostgreSQL database.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from configuration instead.
db_name = "postgres"
db_user = "******"
db_pw = "envel"
db_host = "0.0.0.0"
db_port = "5432"
''' Always use %s placeholder for queries; psycopg2 will convert most data automatically For special cases or conversion problems use adapters or "AsIs" Enclose the tuples in square brackets or leave without square brackets (no performance diff) Dataframes have to be split up into tuples and passed as list '''
# NOTE(review): this script fragment is truncated in the file - the `try`
# has no `except`/`finally` and the `for` loop below has no body.
try:
    connection = create_connection(db_name=db_name,
                                   db_user=db_user,
                                   db_password=db_pw,
                                   db_host=db_host,
                                   db_port=db_port)
    cursor = connection.cursor()
    # 20 target columns, 20 placeholders
    sql_insert_query = """ INSERT INTO test (test_col, test_col_2, test_col_3, test_col_4, test_col_5, test_col_6, test_col_7, test_col_8, test_col_9, test_col_10, test_col_11, test_col_12, test_col_13, test_col_14, test_col_15, test_col_16, test_col_17, test_col_18, test_col_19, test_col_20) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); """
    # `feat_df` is not defined in this fragment; presumably a DataFrame
    # created earlier in the script - verify.
    for i in feat_df.itertuples():
        # executemany() could be used to insert multiple rows at once
def db_filler(section, store_model=False):
    '''
    Train a merchant classifier on pulled transaction data and write the
    decoded predictions into column ``test_col_2`` of table ``test``.

    Parameters
    ----------
    section : int
        Currently unused. It was only referenced by a since-disabled query
        that paged through the users in blocks of 10000.
    store_model : bool
        If True the trained model is pickled via ``store_pickle``.
        The default is False.

    Returns
    -------
    str
        Always 'completed'. Insert failures are printed, not raised.
    '''
    # dropped amount_mean_lag7 to avoid errors
    # dont put label here
    feat_list = ['description', 'transaction_category_name', 'amount', 'city',
                 'state', 'transaction_base_type', 'transaction_origin']

    # TEST DF NOT ENCODED!!
    # The data currently comes from a single test user (seed 22) instead of
    # the paged card_record/bank_record query that `section` was meant for.
    df = pull_df(rng=22,
                 spending_report=False,
                 plots=False)

    # NOTE(review): the source formatting was lost; the whole pipeline is
    # assumed to run once per user group - confirm the intended nesting.
    for num, user in enumerate(df.groupby('unique_mem_id')):
        print(f"User: {user[0]}, {num+1}/10000 users, {round(((num+1)/10000)*100, 2)}%.")

        # NOTE(review): the full frame is passed, not the group `user[1]`;
        # kept as in the original - verify whether per-user data is wanted.
        main_df, embedding_map, label_map = encoding_step(
            df=df,
            spending_report=False,
            include_lag_features=False)

        X_train, X_train_scaled, X_train_minmax, X_test, X_test_scaled, \
            X_test_minmax, y_train, y_test = split_data_feat(
                df=main_df,
                features=feat_list,
                test_size=0.2,
                label='primary_merchant_name')

        # convert train data to ndarray to avoid feature_names mismatch error
        X_array = X_train.values
        y_array = y_train.values
        Xt_array = X_test.values
        yt_array = y_test.values
        # X_train and y_train used to train pipeline
        clf_object = pipeline_xgb(x=X_array,
                                  y=y_array,
                                  test_features=Xt_array,
                                  test_target=yt_array)

        # array object
        y_pred = clf_object.predict(Xt_array)

        # label_map maps merchant name -> code; invert it once and decode
        # each prediction individually. The former nested loop tested
        # `enc in y_pred` against the whole array and appended
        # len(label_map) entries per prediction.
        code_to_merchant = {enc: val for val, enc in label_map.items()}
        merchants = [code_to_merchant.get(code, "unseen in training")
                     for code in y_pred]

        if store_model:
            # store trained model as pickle
            store_pickle(model=clf_object,
                         file_name=f"trained_model_{user[0]}")

        # NOTE(review): credentials are hard-coded in source.
        db_name = "postgres"
        db_user = "******"
        db_pw = "envel"
        db_host = "0.0.0.0"
        db_port = "5432"

        # Always use %s placeholder for queries; psycopg2 will convert most
        # data automatically.
        # Both handles start as None so the cleanup in `finally` is safe
        # even when the connection could not be established.
        connection = None
        cursor = None
        try:
            connection = create_connection(db_name=db_name,
                                           db_user=db_user,
                                           db_password=db_pw,
                                           db_host=db_host,
                                           db_port=db_port)
            print("-------------")
            cursor = connection.cursor()
            sql_insert_query = """
            INSERT INTO test (test_col_2)
            VALUES (%s);
            """
            # one-element tuple per row; passing the bare string would be
            # split into individual letters
            cursor.executemany(sql_insert_query,
                               [(merchant, ) for merchant in merchants])
            connection.commit()
            print(len(merchants), "record(s) inserted successfully.")
        except Exception as error:
            # psycopg2.Error derives from Exception, so it is covered too
            print("Failed inserting record; {}".format(error))
        finally:
            # closing database connection.
            if cursor is not None:
                cursor.close()
            if connection is not None:
                connection.close()
                print("Operation completed.\nPostgreSQL connection is closed.")
        print("=========================")

    return 'completed'
def essentiality_cat(df):
    '''
    POSTGRESQL COLUMN - CLASSIFICATION OF TRANSACTIONS
    Label every value of ``df['transaction_category_name']`` as "essential",
    "non_essential" or "unknown" and insert the labels, one row each, into
    column ``test_col_12`` of table ``test``.

    The category names in the lists below are taken directly from the Yodlee
    dataset; the lists can be appended at will.

    Parameters
    ----------
    df : DataFrame
        Must provide the column 'transaction_category_name'.

    Returns
    -------
    str
        Always 'essentiality insert complete'. Failures are printed,
        not raised.
    '''
    # essential/ non-essential transactions
    card_ess = ['Rewards', 'Transfers', 'Refunds/Adjustments', 'Gifts']
    card_non_ess = [
        'Groceries', 'Automotive/Fuel', 'Home Improvement', 'Travel',
        'Restaurants', 'Healthcare/Medical', 'Credit Card Payments',
        'Electronics/General Merchandise', 'Entertainment/Recreation',
        'Postage/Shipping', 'Other Expenses', 'Personal/Family',
        'Service Charges/Fees', 'Services/Supplies', 'Utilities',
        'Office Expenses', 'Cable/Satellite/Telecom',
        'Subscriptions/Renewals', 'Insurance'
    ]
    bank_ess = [
        'Deposits', 'Salary/Regular Income', 'Transfers',
        'Investment/Retirement Income', 'Rewards', 'Other Income',
        'Refunds/Adjustments', 'Interest Income', 'Gifts',
        'Expense Reimbursement'
    ]
    bank_non_ess = [
        'Service Charges/Fees', 'Credit Card Payments', 'Utilities',
        'Healthcare/Medical', 'Loans', 'Check Payment',
        'Electronics/General Merchandise', 'Groceries', 'Automotive/Fuel',
        'Restaurants', 'Personal/Family', 'Entertainment/Recreation',
        'Services/Supplies', 'Other Expenses', 'ATM/Cash Withdrawals',
        'Cable/Satellite/Telecom', 'Postage/Shipping', 'Insurance', 'Travel',
        'Taxes', 'Home Improvement', 'Education', 'Charitable Giving',
        'Subscriptions/Renewals', 'Rent', 'Office Expenses', 'Mortgage'
    ]
    essential = frozenset(bank_ess) | frozenset(card_ess)
    non_essential = frozenset(bank_non_ess) | frozenset(card_non_ess)

    def classify(category):
        # essential is checked first, as in the original if/elif chain
        if category in essential:
            return "essential"
        if category in non_essential:
            return "non_essential"
        return "unknown"

    # Iterate through rows and mark transactions
    tr_class = []
    try:
        tr_class = [classify(category)
                    for category in df['transaction_category_name']]
    except Exception as error:
        print(f"column is already existing or following: {error}")

    # NOTE(review): credentials are hard-coded in source.
    db_name = "postgres"
    db_user = "******"
    db_pw = "envel"
    db_host = "0.0.0.0"
    db_port = "5432"

    # Always use %s placeholder for queries; psycopg2 will convert most data
    # automatically. For special cases or conversion problems use adapters
    # or "AsIs". Dataframes have to be split up into tuples and passed as
    # list.
    # Both handles start as None so the cleanup in `finally` is safe even
    # when the connection could not be established.
    connection = None
    cursor = None
    try:
        connection = create_connection(db_name=db_name,
                                       db_user=db_user,
                                       db_password=db_pw,
                                       db_host=db_host,
                                       db_port=db_port)
        print("-------------")
        cursor = connection.cursor()
        sql_insert_query = """
        INSERT INTO test (test_col_12)
        VALUES (%s);
        """
        # one-element tuple per row
        cursor.executemany(sql_insert_query,
                           [(label, ) for label in tr_class])
        connection.commit()
        # report what was actually written, not the size of the input
        print(len(tr_class), "record(s) inserted successfully.")
    except Exception as error:
        # psycopg2.Error derives from Exception, so it is covered here too
        print("Failed inserting record; {}".format(error))
    finally:
        # closing database connection.
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()
            print("Operation completed.\nPostgreSQL connection is closed.")
    print("=========================")
    return 'essentiality insert complete'
def pull_df(rng=4, spending_report=False, plots=False):
    '''
    Pull all bank transactions of one randomly picked user from MA.

    Parameters
    ----------
    rng : int, Random Seed for user picker. The default is 4.
    spending_report : bool, Save a spending report in directory if True.
        Default is False.
    plots : bool, Plots various graphs if True. Default is False.

    Returns
    -------
    df : DataFrame
        The user's transactions indexed by 'optimized_transaction_date'
        (the column itself is kept), without the mostly empty columns and
        without the bank account/transaction IDs.

    Raises
    ------
    OperationalError
        Re-raised (after a rollback) when a read query fails.
    '''
    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)
    try:
        # get user IDs for all users in MA
        filter_query = "SELECT unique_mem_id, state, city, income_class FROM user_demographic WHERE state = 'MA'"
        transaction_query = execute_read_query(connection, filter_query)
        query_df = pd.DataFrame(
            transaction_query,
            columns=['unique_mem_id', 'state', 'city', 'income_class'])

        # pick exactly one user; the seed makes the choice reproducible
        user_id = pd.Series(query_df['unique_mem_id'].unique()).sample(
            n=1, random_state=rng).iloc[0]
        filter_query = f"SELECT * FROM bank_record WHERE unique_mem_id = '{user_id}'"
        transaction_query = execute_read_query(connection, filter_query)
        df = pd.DataFrame(
            transaction_query,
            columns=[
                'unique_mem_id', 'unique_bank_account_id',
                'unique_bank_transaction_id', 'amount', 'currency',
                'description', 'transaction_date', 'post_date',
                'transaction_base_type', 'transaction_category_name',
                'primary_merchant_name', 'secondary_merchant_name', 'city',
                'state', 'zip_code', 'transaction_origin',
                'factual_category', 'factual_id', 'file_created_date',
                'optimized_transaction_date', 'yodlee_transaction_status',
                'mcc_raw', 'mcc_inferred', 'swipe_date',
                'panel_file_created_date', 'update_type', 'is_outlier',
                'change_source', 'account_type', 'account_source_type',
                'account_score', 'user_score', 'lag', 'is_duplicate'
            ])
        print(f"User {user_id} has {len(df)} transactions on record.")
        # all these columns are empty or almost empty and contain no viable
        # information
        df = df.drop(columns=[
            'secondary_merchant_name', 'swipe_date', 'update_type',
            'is_outlier', 'is_duplicate', 'change_source', 'lag',
            'mcc_inferred', 'mcc_raw', 'factual_id', 'factual_category',
            'zip_code', 'yodlee_transaction_status', 'file_created_date',
            'panel_file_created_date', 'account_source_type', 'account_type',
            'account_score', 'user_score', 'post_date', 'transaction_date'
        ])
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback()
        # `df` does not exist at this point; surface the real error instead
        # of failing later on an unbound name
        raise
    finally:
        connection.close()

    # Plotting of various relations.
    # The Counter object keeps track of permutations in a dictionary which
    # can then be read and used as labels.
    if plots:
        # Pie chart States
        state_ct = Counter(list(df['state']))
        # The * operator can be used in conjunction with zip() to unzip the
        # list.
        labels, values = zip(*state_ct.items())
        # Pie chart, where the slices will be ordered and plotted
        # counter-clockwise:
        _, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values,
               labels=labels,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        ax.legend(loc='center right')
        plt.show()

        # Pie chart transaction type
        trans_ct = Counter(list(df['transaction_category_name']))
        labels_2, values_2 = zip(*trans_ct.items())
        _, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values_2,
               labels=labels_2,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        ax.axis('equal')
        # `ax.title` is a Text attribute, not a method; the title has to be
        # set through set_title()
        ax.set_title("Transaction categories")
        ax.legend(loc='center right')
        plt.show()

    # Prepare the datetime column/index used by the spending report, which
    # measures either the sum or mean of transactions happening on various
    # days of the week/or within a week or a month over the course of the
    # year.
    df['optimized_transaction_date'] = pd.to_datetime(
        df['optimized_transaction_date'])
    # set optimized transaction_date as index for later
    df.set_index('optimized_transaction_date', drop=False, inplace=True)
    df = df.drop(['unique_bank_account_id', 'unique_bank_transaction_id'],
                 axis=1)

    # generate the spending report with the above randomly picked user ID
    if spending_report:
        create_spending_report(df=df.copy())

    return df
def db_insert_section(section=1):
    '''
    Pull the card and bank transactions of one block of 10000 users,
    predict merchants, classify every transaction and insert the result
    into table ``test``.

    Parameters
    ----------
    section : int
        1-based index of the user block: users are ordered by
        'unique_mem_id' and paged with LIMIT 10000 and
        OFFSET 10000*(section-1). The default is 1.

    Returns
    -------
    str
        Always 'insert done'. Insert failures are printed, not raised.

    Raises
    ------
    OperationalError
        Re-raised (after a rollback) when the read query fails.
    '''
    fields = [
        'unique_mem_id', 'unique_bank_account_id',
        'unique_bank_transaction_id', 'amount', 'currency', 'description',
        'transaction_base_type', 'transaction_category_name',
        'primary_merchant_name', 'city', 'state', 'transaction_origin',
        'optimized_transaction_date', 'account_type', 'account_source_type',
        'account_score', 'user_score', 'panel_file_created_date'
    ]
    field_list = ', '.join(fields)
    user_block = ("SELECT unique_mem_id FROM user_demographic "
                  "ORDER BY unique_mem_id ASC LIMIT 10000 "
                  f"OFFSET {10000*(section-1)}")
    filter_query = (f"(SELECT {field_list} FROM card_record "
                    f"WHERE unique_mem_id IN ({user_block})) "
                    "UNION ALL "
                    f"(SELECT {field_list} FROM bank_record "
                    f"WHERE unique_mem_id IN ({user_block}))")

    # The source connection gets its own name and is closed after the read;
    # it used to be leaked when `connection` was rebound further down.
    source_connection = create_connection(db_name=acc.YDB_name,
                                          db_user=acc.YDB_user,
                                          db_password=acc.YDB_password,
                                          db_host=acc.YDB_host,
                                          db_port=acc.YDB_port)
    try:
        transaction_query = execute_read_query(source_connection,
                                               filter_query)
        main_df = pd.DataFrame(transaction_query, columns=fields)
        print(f"{len(main_df)} transactions.")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        source_connection.rollback()
        # `main_df` does not exist at this point; surface the real error
        raise
    finally:
        source_connection.close()

    # essential/ non-essential transactions
    card_ess = ['Rewards', 'Transfers', 'Refunds/Adjustments', 'Gifts']
    card_non_ess = [
        'Groceries', 'Automotive/Fuel', 'Home Improvement', 'Travel',
        'Restaurants', 'Healthcare/Medical', 'Credit Card Payments',
        'Electronics/General Merchandise', 'Entertainment/Recreation',
        'Postage/Shipping', 'Other Expenses', 'Personal/Family',
        'Service Charges/Fees', 'Services/Supplies', 'Utilities',
        'Office Expenses', 'Cable/Satellite/Telecom',
        'Subscriptions/Renewals', 'Insurance'
    ]
    bank_ess = [
        'Deposits', 'Salary/Regular Income', 'Transfers',
        'Investment/Retirement Income', 'Rewards', 'Other Income',
        'Refunds/Adjustments', 'Interest Income', 'Gifts',
        'Expense Reimbursement'
    ]
    bank_non_ess = [
        'Service Charges/Fees', 'Credit Card Payments', 'Utilities',
        'Healthcare/Medical', 'Loans', 'Check Payment',
        'Electronics/General Merchandise', 'Groceries', 'Automotive/Fuel',
        'Restaurants', 'Personal/Family', 'Entertainment/Recreation',
        'Services/Supplies', 'Other Expenses', 'ATM/Cash Withdrawals',
        'Cable/Satellite/Telecom', 'Postage/Shipping', 'Insurance', 'Travel',
        'Taxes', 'Home Improvement', 'Education', 'Charitable Giving',
        'Subscriptions/Renewals', 'Rent', 'Office Expenses', 'Mortgage'
    ]
    # envelope allocation
    cash_env = [
        'Rewards', 'Transfers', 'Refunds/Adjustments', 'Interest Income',
        'Restaurants', 'Electronics/General Merchandise',
        'Entertainment/Recreation', 'Postage/Shipping', 'Other Expenses',
        'Other Income', 'Expense Reimbursement', 'Personal/Family', 'Travel',
        'Office Expenses', 'Deposits', 'Salary/Regular Income',
        'Investment/Retirement Income', 'ATM/Cash Withdrawals'
    ]
    bill_env = [
        'Check Payment', 'Rent', 'Mortgage', 'Subscriptions/Renewals',
        'Healthcare/Medical', 'Credit Card Payments', 'Service Charges/Fees',
        'Services/Supplies', 'Utilities', 'Insurance', 'Taxes',
        'Home Improvement', 'Cable/Satellite/Telecom'
    ]
    car_env = ['Automotive/Fuel']
    gro_env = ['Groceries']
    chari_env = ['Charitable Giving', 'Gifts']

    # Lookup tables built once; setdefault keeps the first label so the
    # priority of the original if/elif chains is preserved.
    essentiality = {}
    for label, names in (("essential", bank_ess + card_ess),
                         ("non_essential", bank_non_ess + card_non_ess)):
        for name in names:
            essentiality.setdefault(name, label)
    envelopes = {}
    for label, names in (("cash", cash_env), ("bill", bill_env),
                         ("automobile", car_env), ("groceries", gro_env),
                         ("charity", chari_env)):
        for name in names:
            envelopes.setdefault(name, label)

    # NOTE(review): the source formatting was lost; the whole pipeline is
    # assumed to run once per user group - confirm the intended nesting.
    for num, user in enumerate(main_df.groupby('unique_mem_id')):
        print(
            f"user {user[0]}, {num+1}/10000 users, {round(((num+1)/10000)*100, 2)}%."
        )

        # running functions in order
        # NOTE(review): the full frame is passed, not the group `user[1]`;
        # also verify that the df_encoder in scope accepts a `df` keyword.
        df, df_null, embedding_map = df_encoder(
            df=main_df, sel_feature='primary_merchant_name')
        X_features, X_test, X_train, y_test, y_train = split_data(
            full_df=df, null_df=df_null)
        grid_search_svc = svc_class(X_train, X_test, y_train, y_test)
        prediction_df, predictions = add_pred(grid_search=grid_search_svc,
                                              X_features=X_features,
                                              label='primary_merchant_name')

        # embedding_map maps merchant name -> code; invert it once and
        # decode each prediction individually. The former nested loop tested
        # `enc in predictions` against the whole array and was then cut to
        # len(predictions), which did not line up with the predictions.
        code_to_merchant = {enc: val for val, enc in embedding_map.items()}
        merchants = [code_to_merchant.get(code, "unseen in training")
                     for code in predictions]
        # attach merchants to prediction_df; list will be turned into df
        # column
        prediction_df = prediction_df.assign(merchants=merchants)

        # mark transactions as essential / non-essential
        try:
            prediction_df = prediction_df.assign(tr_essentiality=[
                essentiality.get(category, "unknown")
                for category in prediction_df['transaction_category_name']
            ])
        except Exception as error:
            print(f"column is already existing or following: {error}")

        # allocate transactions to envelopes
        try:
            prediction_df = prediction_df.assign(envelope_category=[
                envelopes.get(category, "not_classified")
                for category in prediction_df['transaction_category_name']
            ])
        except Exception as error:
            print(f"envelope column is already existing or following {error}")

        # NOTE(review): credentials are hard-coded in source.
        db_name = "postgres"
        db_user = "******"
        db_pw = "envel"
        db_host = "0.0.0.0"
        db_port = "5432"

        # Always use %s placeholder for queries; psycopg2 will convert most
        # data automatically. For special cases or conversion problems use
        # adapters or "AsIs". Dataframes have to be split up into tuples
        # and passed as list.
        # Both handles start as None so the cleanup in `finally` is safe
        # even when the connection could not be established.
        connection = None
        cursor = None
        try:
            connection = create_connection(db_name=db_name,
                                           db_user=db_user,
                                           db_password=db_pw,
                                           db_host=db_host,
                                           db_port=db_port)
            print("Sesson_1\n-------------")
            cursor = connection.cursor()
            sql_insert_query = """
            INSERT INTO test (test_col, test_col_2, test_col_3, test_col_4,
                            test_col_5, test_col_6, test_col_7, test_col_8,
                            test_col_9, test_col_10, test_col_11, test_col_12,
                            test_col_13, test_col_14, test_col_15, test_col_16,
                            test_col_17, test_col_18, test_col_19, test_col_20)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
            """
            # one 20-element row per record, in the column order above
            rows = [[
                i.unique_mem_id, i.unique_bank_account_id,
                i.unique_bank_transaction_id, i.amount, i.description,
                i.transaction_base_type, i.transaction_category_name, i.city,
                i.state, i.transaction_origin, i.optimized_transaction_date,
                i.account_type, i.account_source_type, i.account_score,
                i.user_score, i.panel_file_created_date,
                i.primary_merchant_name, i.merchants, i.tr_essentiality,
                i.envelope_category
            ] for i in prediction_df.itertuples()]
            cursor.executemany(sql_insert_query, rows)
            connection.commit()
            print(len(rows), "record(s) inserted successfully.")
        except Exception as error:
            # psycopg2.Error derives from Exception, so it is covered too
            print("Failed inserting record; {}".format(error))
        finally:
            # closing database connection.
            if cursor is not None:
                cursor.close()
            if connection is not None:
                connection.close()
                print("Operation completed.\nPostgreSQL connection is closed.")
        print("=========================")

    return 'insert done'
def envelope_cat(df):
    '''
    POSTGRE-SQL COLUMNS - ALLOCATION TO ENVELOPES
    Allocate every value of ``df['transaction_category_name']`` to an
    envelope ("cash", "bill", "automobile", "groceries", "charity" or
    "NOT_CLASSIFIED") and insert the labels, one row each, into column
    ``test_col_13`` of table ``test``.

    Bills describe as of 3/26/2020 all kinds of payment whose occurrence
    are beyond one's control, that come due and for which non-compliance
    has severe consequences. All other kinds of payments that are of
    optional nature and can be avoided are classified as cash.

    Parameters
    ----------
    df : DataFrame
        Must provide the column 'transaction_category_name'.

    Returns
    -------
    str
        Always 'envelope categotization complete'. Failures are printed,
        not raised.
    '''
    cash_env = [
        'Rewards', 'Transfers', 'Refunds/Adjustments', 'Interest Income',
        'Restaurants', 'Electronics/General Merchandise',
        'Entertainment/Recreation', 'Postage/Shipping', 'Other Expenses',
        'Other Income', 'Expense Reimbursement', 'Personal/Family', 'Travel',
        'Office Expenses', 'Deposits', 'Salary/Regular Income',
        'Investment/Retirement Income', 'ATM/Cash Withdrawals'
    ]
    bill_env = [
        'Check Payment', 'Rent', 'Mortgage', 'Subscriptions/Renewals',
        'Healthcare/Medical', 'Credit Card Payments', 'Service Charges/Fees',
        'Services/Supplies', 'Utilities', 'Insurance', 'Taxes',
        'Home Improvement', 'Cable/Satellite/Telecom'
    ]
    car_env = ['Automotive/Fuel']
    gro_env = ['Groceries']
    chari_env = ['Charitable Giving', 'Gifts']

    # Lookup table built once; setdefault keeps the first label so the
    # priority of the original if/elif chain is preserved.
    envelopes = {}
    for label, names in (("cash", cash_env), ("bill", bill_env),
                         ("automobile", car_env), ("groceries", gro_env),
                         ("charity", chari_env)):
        for name in names:
            envelopes.setdefault(name, label)

    # iterate through rows and note the envelope of every transaction
    envel_cat = []
    try:
        envel_cat = [envelopes.get(category, "NOT_CLASSIFIED")
                     for category in df['transaction_category_name']]
    except Exception as error:
        print(f"envelope column is already existing or following {error}")

    # NOTE(review): credentials are hard-coded in source.
    db_name = "postgres"
    db_user = "******"
    db_pw = "envel"
    db_host = "0.0.0.0"
    db_port = "5432"

    # Always use %s placeholder for queries; psycopg2 will convert most data
    # automatically. For special cases or conversion problems use adapters
    # or "AsIs". Dataframes have to be split up into tuples and passed as
    # list.
    # Both handles start as None so the cleanup in `finally` is safe even
    # when the connection could not be established.
    connection = None
    cursor = None
    try:
        connection = create_connection(db_name=db_name,
                                       db_user=db_user,
                                       db_password=db_pw,
                                       db_host=db_host,
                                       db_port=db_port)
        print("-------------")
        cursor = connection.cursor()
        sql_insert_query = """
        INSERT INTO test (test_col_13)
        VALUES (%s);
        """
        # one-element tuple per row
        cursor.executemany(sql_insert_query,
                           [(label, ) for label in envel_cat])
        connection.commit()
        # report what was actually written, not the size of the input
        print(len(envel_cat), "record(s) inserted successfully.")
    except Exception as error:
        # psycopg2.Error derives from Exception, so it is covered here too
        print("Failed inserting record; {}".format(error))
    finally:
        # closing database connection.
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()
            print("Operation completed.\nPostgreSQL connection is closed.")
    print("=========================")
    return 'envelope categotization complete'
def df_encoder(rng=4,
               spending_report=False,
               plots=False,
               include_lag_features=True):
    '''
    Load the bank records of one randomly picked MA user and encode them.

    The frame is reduced to informative columns, date columns are converted
    to unix timestamps, text columns are label-encoded, optional lag
    features are added, rows containing NaN are dropped and the ID columns
    are removed.

    Parameters
    ----------
    rng : int, Random seed for the user picker. The default is 4.
    spending_report : bool, Pass a copy of the frame to
        create_spending_report if True. Default is False.
    plots : bool, Plots various graphs if True. Default is False.
    include_lag_features : bool, Add shifted rolling mean and standard
        deviation of 'amount' over windows of 3, 7 and 30 rows.
        Default is True.

    Returns
    -------
    bank_df : pandas.DataFrame

    Raises
    ------
    OperationalError
        Re-raised after printing and rolling back when a query fails.
    '''
    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)
    try:
        # user IDs for all users in MA
        filter_query = "SELECT unique_mem_id, state, city, income_class FROM user_demographic WHERE state = 'MA'"
        transaction_query = execute_read_query(connection, filter_query)
        query_df = pd.DataFrame(
            transaction_query,
            columns=['unique_mem_id', 'state', 'city', 'income_class'])

        # gather bank data from one randomly chosen user
        # test user 1= 4
        # test user 2= 8
        user_id = pd.Series(query_df['unique_mem_id'].unique()).sample(
            n=1, random_state=rng).iloc[0]
        print(user_id)
        filter_query = f"SELECT * FROM bank_record WHERE unique_mem_id = '{user_id}'"
        transaction_query = execute_read_query(connection, filter_query)
        bank_df = pd.DataFrame(
            transaction_query,
            columns=[
                'unique_mem_id', 'unique_bank_account_id',
                'unique_bank_transaction_id', 'amount', 'currency',
                'description', 'transaction_date', 'post_date',
                'transaction_base_type', 'transaction_category_name',
                'primary_merchant_name', 'secondary_merchant_name', 'city',
                'state', 'zip_code', 'transaction_origin',
                'factual_category', 'factual_id', 'file_created_date',
                'optimized_transaction_date', 'yodlee_transaction_status',
                'mcc_raw', 'mcc_inferred', 'swipe_date',
                'panel_file_created_date', 'update_type', 'is_outlier',
                'change_source', 'account_type', 'account_source_type',
                'account_score', 'user_score', 'lag', 'is_duplicate'
            ])
        print(f"User {user_id} has {len(bank_df)} transactions on record.")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback()
        # without data there is nothing to encode; surface the real error
        # instead of failing later on an unbound bank_df
        raise
    finally:
        # all data is in memory at this point; release the connection
        if connection is not None:
            connection.close()

    # all these columns are empty or almost empty and contain no viable
    # information
    bank_df = bank_df.drop(columns=[
        'secondary_merchant_name', 'swipe_date', 'update_type', 'is_outlier',
        'is_duplicate', 'change_source', 'lag', 'mcc_inferred', 'mcc_raw',
        'factual_id', 'factual_category', 'zip_code',
        'yodlee_transaction_status', 'file_created_date',
        'panel_file_created_date', 'account_type', 'account_source_type',
        'account_score'
    ])

    if plots:
        _plot_share_pie(bank_df['state'])
        _plot_share_pie(bank_df['transaction_category_name'])

    # convert all date columns from date to datetime objects;
    # date objects will block Select K Best if not converted.
    # Some of the candidates were dropped above, so only the columns that are
    # still present are touched (indexing a dropped column raises KeyError).
    date_features = [
        feature for feature in ('post_date', 'transaction_date',
                                'optimized_transaction_date',
                                'file_created_date',
                                'panel_file_created_date')
        if feature in bank_df.columns
    ]
    for feature in date_features:
        bank_df[feature] = pd.to_datetime(bank_df[feature])

    # set optimized transaction_date as index for later
    bank_df.set_index('optimized_transaction_date', drop=False, inplace=True)

    # generate the spending report with the above randomly picked user ID
    if spending_report:
        create_spending_report(df=bank_df.copy())

    # After loading, the frame is encoded to get rid of all non-numerical data
    try:
        bank_df['amount'] = bank_df['amount'].astype('float64')
        bank_df['transaction_base_type'] = bank_df[
            'transaction_base_type'].replace(to_replace=["debit", "credit"],
                                             value=[1, 0])
    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")

    # convert datetime columns to unix timestamps (float) if they have no
    # missing values; otherwise the column is dropped
    try:
        for feature in date_features:
            if not bank_df[feature].isnull().any():
                bank_df[feature] = bank_df[feature].apply(dt.timestamp)
            else:
                bank_df = bank_df.drop(columns=feature)
                print(f"Column {feature} dropped")
    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")

    # PRIMARY_MERCHANT_NAME, CITY, STATE, DESCRIPTION,
    # TRANSACTION_CATEGORY_NAME, TRANSACTION_ORIGIN and CURRENCY are
    # label-encoded; values without a label fall back to the unknown token
    bank_df = _encode_categorical_features(bank_df, [
        'primary_merchant_name', 'city', 'state', 'description',
        'transaction_category_name', 'transaction_origin', 'currency'
    ])

    # dropping currency if there is only one
    if len(bank_df['currency'].value_counts()) == 1:
        bank_df = bank_df.drop(columns=['currency'])

    # IMPORTANT
    # The lagging features produce NaN for the first rows due to
    # unavailability of values. NaNs need to be dropped to make scaling and
    # selection of features work.
    if include_lag_features:
        bank_df = _add_lag_features(bank_df)

    # drop all rows left with empty (NaN) values
    bank_df = bank_df.dropna()
    # drop user IDs to avoid overfitting with useless information
    bank_df = bank_df.drop(columns=[
        'unique_mem_id', 'unique_bank_account_id',
        'unique_bank_transaction_id'
    ])

    if plots:
        # seaborn plots
        ax_desc = bank_df['description'].astype('int64', errors='ignore')
        ax_amount = bank_df['amount'].astype('int64', errors='ignore')
        sns.pairplot(bank_df)
        sns.boxplot(x=ax_desc, y=ax_amount)
        sns.heatmap(bank_df)

    return bank_df


def _plot_share_pie(values):
    '''Show a pie chart with the relative frequency of each value.'''
    # The Counter keeps track of occurrences in a dictionary whose keys are
    # used as labels; zip(*...) unzips it into two parallel tuples.
    counts = Counter(list(values))
    labels, sizes = zip(*counts.items())
    # slices are ordered and plotted counter-clockwise
    _, ax = plt.subplots(figsize=(20, 12))
    ax.pie(sizes,
           labels=labels,
           autopct='%1.1f%%',
           shadow=True,
           startangle=90)
    # Equal aspect ratio ensures that pie is drawn as a circle.
    ax.axis('equal')
    ax.legend(loc='center right')
    plt.show()


def _encode_categorical_features(bank_df,
                                 encoding_features,
                                 unknown_token='<unknown>'):
    '''Replace each listed column with its LabelEncoder integer codes.'''
    for feature in encoding_features:
        unique_list = bank_df[feature].unique().astype('str').tolist()
        unique_list.append(unknown_token)
        le = LabelEncoder()
        le.fit(unique_list)
        code_by_label = dict(zip(le.classes_, le.transform(le.classes_)))
        unknown_code = code_by_label[unknown_token]
        # Labels are strings, so non-string values (None/NaN) miss the table
        # and receive the unknown code. The lambda is consumed immediately,
        # so closing over the loop variables is safe.
        bank_df[feature] = bank_df[feature].map(
            lambda x: code_by_label.get(x, unknown_code))
    return bank_df


def _add_lag_features(bank_df, lag_features=("amount", ), windows=(3, 7, 30)):
    '''Add shifted rolling mean/std columns for every lag feature.'''
    # typical engineered features based on lagging metrics:
    # mean + stdev over the previous rows of each window
    date_index = bank_df.index.values
    # positional index so the rolling results align row by row
    bank_df = bank_df.reset_index(drop=True)
    for feature in lag_features:
        for stat in ("mean", "std"):
            for window in windows:
                rolled = bank_df[feature].rolling(window=window,
                                                  min_periods=0)
                # shift by one row so a row only sees earlier transactions
                lagged = getattr(rolled, stat)().shift(periods=1)
                bank_df[f"{feature}_{stat}_lag{window}"] = lagged.astype(
                    np.float32)
    # restore the date index that was saved before the reset
    bank_df.set_index(date_index, drop=False, inplace=True)
    return bank_df
def df_insert_query(df):
    """
    Insert selected columns of a dataframe row by row into table `test`.

    Parameters
    ---------
    df : pandas.DataFrame
        Frame that is to be inserted into the databank. It has to provide
        the columns unique_mem_id, amount, currency, description,
        transaction_base_type, transaction_category_name,
        primary_merchant_name, city, state, transaction_origin and
        optimized_transaction_date.

    Returns
    --------
    str
        The fixed message 'insert completed'. Failures are printed, not
        raised.
    """
    db_name = "postgres"
    db_user = "******"
    db_pw = "envel"
    db_host = "0.0.0.0"
    db_port = "5432"

    # dataframe columns in the order of the placeholders in the query below
    source_columns = [
        'unique_mem_id', 'amount', 'currency', 'description',
        'transaction_base_type', 'transaction_category_name',
        'primary_merchant_name', 'city', 'state', 'transaction_origin',
        'optimized_transaction_date',
    ]

    # Always use the %s placeholder for queries; psycopg2 converts most data
    # automatically. For special cases or conversion problems use adapters or
    # "AsIs". Values are passed as one list (or tuple) per statement.

    # Bound up front so the cleanup in `finally` never hits an unbound name
    # when connecting or opening the cursor fails.
    connection = None
    cursor = None
    try:
        connection = create_connection(db_name=db_name,
                                       db_user=db_user,
                                       db_password=db_pw,
                                       db_host=db_host,
                                       db_port=db_port)
        print("-------------")
        cursor = connection.cursor()
        sql_insert_query = """
        INSERT INTO test (test_col, test_col_2, test_col_3, test_col_4,
                        test_col_5, test_col_6, test_col_7, test_col_8,
                        test_col_9, test_col_10, test_col_11)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        # split the frame up into rows and insert them one by one
        for row in df.itertuples():
            cursor.execute(
                sql_insert_query,
                [getattr(row, column) for column in source_columns])

        connection.commit()
        print(len(df), "record(s) inserted successfully.")

    except (Exception, psycopg2.Error) as error:
        print("Failed inserting record; {}".format(error))

    finally:
        # closing database connection.
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()
            print("Operation completed.\nPostgreSQL connection is closed.")
    print("=========================")
    return 'insert completed'