def list_insert(values):
    """
    Parameters.
    ------------
    values. List/array/iterable. Values to be inserted into the databank.

    Returns.
    -----------
    'Operation complete' + disconnect
    """

    try:
        connection = create_connection(db_name=db_name,
                                       db_user=db_user,
                                       db_password=db_pw,
                                       db_host=db_host,
                                       db_port=db_port)
        print("-------------")
        cursor = connection.cursor()
        sql_insert_query = """
        INSERT INTO test (test_col_2)
        VALUES (%s);
        """

        # tuple or list works
        for i in values:
            # executemany() to insert multiple rows rows
            cursor.execute(sql_insert_query, (i, ))

        connection.commit()
        print(len(values), "record(s) inserted successfully. \n ------------")

    except (Exception, psycopg2.Error) as error:
        print("Failed inserting record {}".format(error))

    finally:
        # closing database connection.
        if (connection):
            cursor.close()
            connection.close()
            print("Operation accomplished.\nPostgreSQL connection is closed.")
    print("================")
    return 'done'
def df_encoder(state='TX', sel_feature='primary_merchant_name'):
    '''
    Returns
    -------
    [df, df_null, embedding_maps[sel_feature]]
    '''

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)

    states = [
        'NJ', 'AL', 'WA', 'NM', 'TX', 'AR', 'NY', 'CT', 'MI', 'CO', 'CA', 'PA',
        'IN', 'OK', 'MD', 'AK', 'VA', 'GA', 'NC', 'TN', 'OH', 'IL', 'FL', 'AZ',
        'DC', 'LA', 'KY', 'KS', 'IA', 'SC', 'WI', 'DE', 'HI', 'MT', 'MO', 'NV',
        'ID', 'MN', 'MS', 'OR', 'UT', 'NH', 'MA', 'WV', 'NE', 'ND', 'RI', 'VT',
        'WY', 'ME', 'SD', 'PR', 'GU'
    ]

    fields = [
        'unique_mem_id', 'unique_bank_account_id',
        'unique_bank_transaction_id', 'amount', 'currency', 'description',
        'transaction_base_type', 'transaction_category_name',
        'primary_merchant_name', 'city', 'state', 'transaction_origin',
        'optimized_transaction_date', 'account_type', 'account_source_type',
        'account_score', 'user_score', 'panel_file_created_date'
    ]

    try:
        if state in states:
            filter_query = f"SELECT {', '.join(field for field in fields)} \
                            FROM bank_record WHERE unique_mem_id IN ( \
                             SELECT unique_mem_id \
                             FROM user_demographic \
                             WHERE state = '{state}' LIMIT 1)"

            transaction_query = execute_read_query(connection, filter_query)
            df = pd.DataFrame(transaction_query, columns=fields)
            print(f"{len(df)} transactions for random user in state: {state}.")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback

    # first conversion from date to datetime objects; then conversion to unix
    df['panel_file_created_date'] = pd.to_datetime(
        df['panel_file_created_date'])
    df['optimized_transaction_date'] = pd.to_datetime(
        df['optimized_transaction_date'])

    # set optimized transaction_date as index for later
    df.set_index('optimized_transaction_date', drop=False, inplace=True)
    '''
    After successfully loading the data, columns that are of no importance have been removed and missing values replaced
    Then the dataframe is ready to be encoded to get rid of all non-numerical data
    '''
    try:
        # Include below if need unique ID's later:
        df['unique_mem_id'] = df['unique_mem_id'].astype('str',
                                                         errors='ignore')
        df['unique_bank_account_id'] = df['unique_bank_account_id'].astype(
            'str', errors='ignore')
        df['unique_bank_transaction_id'] = df[
            'unique_bank_transaction_id'].astype('str', errors='ignore')
        df['amount'] = df['amount'].astype('float64')
        df['transaction_base_type'] = df['transaction_base_type'].replace(
            to_replace=["debit", "credit"], value=[1, 0])
    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")

    # attempt to convert date objects to unix timestamps as numeric value (fl64) if they have no missing values; otherwise they are being dropped
    date_features = ['optimized_transaction_date', 'panel_file_created_date']
    try:
        for feature in date_features:
            if df[feature].isnull().sum() == 0:
                df[feature] = df[feature].apply(lambda x: dt.timestamp(x))
            else:
                df = df.drop(columns=feature, axis=1)
                print(f"Column {feature} dropped")

    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")
    '''
    The columns PRIMARY_MERCHANT_NAME; CITY, STATE, DESCRIPTION, TRANSACTION_CATEGORY_NAME, CURRENCY
    are encoded manually and cleared of empty values
    '''
    encoding_features = [
        'primary_merchant_name', 'city', 'state', 'description',
        'transaction_category_name', 'transaction_origin', 'currency'
    ]

    # dropping currency if there is only one
    if len(df['currency'].value_counts()) == 1:
        df = df.drop(columns=['currency'], axis=1)
        encoding_features.remove('currency')

    UNKNOWN_TOKEN = '<unknown>'
    embedding_maps = {}
    for feature in encoding_features:
        unique_list = df[feature].unique().astype('str').tolist()
        unique_list.append(UNKNOWN_TOKEN)
        le = LabelEncoder()
        le.fit_transform(unique_list)
        embedding_maps[feature] = dict(
            zip(le.classes_, le.transform(le.classes_)))

        # APPLICATION TO OUR DATASET
        df[feature] = df[feature].apply(
            lambda x: x if x in embedding_maps[feature] else UNKNOWN_TOKEN)
        df[feature] = df[feature].map(lambda x: le.transform([x])[0]
                                      if type(x) == str else x)

    #drop all features left with empty (NaN) values
    df = df.dropna()

    # extract rows with empty value of selected feature into own db
    df_null = df[df[sel_feature] == 0]
    df = df[df[sel_feature] != 0]

    return [df, df_null, embedding_maps[sel_feature]]
db_name = "postgres"
db_user = "******"
db_pw = "envel"
db_host = "0.0.0.0"
db_port = "5432"
'''
Always use %s placeholder for queries; psycopg2 will convert most data automatically
For special cases or conversion problems use adapters or "AsIs"
Enclose the tuples in square brackets or leave without square brackets (no performance diff)
Dataframes have to be split up into tuples and passed as list
'''

try:
    connection = create_connection(db_name=db_name,
                                   db_user=db_user,
                                   db_password=db_pw,
                                   db_host=db_host,
                                   db_port=db_port)
    cursor = connection.cursor()
    sql_insert_query = """
    INSERT INTO test (test_col, test_col_2, test_col_3, test_col_4, test_col_5,
                      test_col_6, test_col_7, test_col_8, test_col_9, test_col_10,
                      test_col_11, test_col_12, test_col_13, test_col_14,
                      test_col_15, test_col_16, test_col_17, test_col_18,
                      test_col_19, test_col_20)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
            %s, %s, %s, %s, %s, %s, %s);
    """

    for i in feat_df.itertuples():
        # executemany() to insert multiple rows rows
def db_filler(section, store_model=False):


    # dropped amount_mean_lag7 to avoid errors
    # keep optimized_transaction_date as it becomes index
    fields = ['unique_mem_id', 'optimized_transaction_date', 'amount', 'description',
              'primary_merchant_name', 'transaction_category_name', 'state',
              'city', 'transaction_base_type', 'transaction_origin']

    # dont put label here
    feat_list = ['description', 'transaction_category_name', 'amount', 'city',
                 'state', 'transaction_base_type', 'transaction_origin']
    # connection = create_connection(db_name=acc.YDB_name,
    #                                 db_user=acc.YDB_user,
    #                                 db_password=acc.YDB_password,
    #                                 db_host=acc.YDB_host,
    #                                 db_port=acc.YDB_port)

    # # pull data and encode
    # # section from 1 - 10
    # try:
    #     filter_query = f"(SELECT {', '.join(field for field in fields)} FROM card_record \
    #                         WHERE unique_mem_id IN \
    #                         (SELECT unique_mem_id FROM user_demographic \
    #                          ORDER BY unique_mem_id ASC limit 10000 offset {10000*(section-1)})) \
    #                         UNION ALL (SELECT {', '.join(field for field in fields)} \
    #                        FROM bank_record WHERE unique_mem_id IN \
    #                        (SELECT unique_mem_id FROM user_demographic \
    #                         ORDER BY unique_mem_id ASC limit 10000 offset {10000*(section-1)}))"
    #     transaction_query = execute_read_query(connection, filter_query)
    #     df = pd.DataFrame(transaction_query, columns=fields)
    #     print(f"{len(transaction_query)} transactions.")
    # except OperationalError as e:
    #     print(f"The error '{e}' occurred")
    #     connection.rollback

    #################################################
    # TEST DF NOT ENCODED!!
    # test_df_1 = pull_df(rng=20,
    #                   spending_report=False,
    #                   plots=False)
    # test_df_2 = pull_df(rng=22,
    #                   spending_report=False,
    #                   plots=False)
    # df = pd.concat([test_df_1, test_df_2], axis=0)
    df = pull_df(rng=22,
                 spending_report=False,
                 plots=False)
    ########################################

    for num, user in enumerate(df.groupby('unique_mem_id')):
        print(f"User: {user[0]}, {num+1}/10000 users, {round(((num+1)/10000)*100, 2)}%.")

        main_df, embedding_map, label_map = encoding_step(df=df,
                                                          spending_report=False,
                                                          include_lag_features=False)


        X_train, X_train_scaled, X_train_minmax, X_test, X_test_scaled, \
            X_test_minmax, y_train, y_test = split_data_feat(df=main_df,
                                                             features=feat_list,
                                                             test_size=0.2,
                                                             label='primary_merchant_name')

        # convert train data to ndarray to avoid feature_names mismatch error
        X_array = X_train.values
        y_array = y_train.values
        Xt_array = X_test.values
        yt_array = y_test.values
        # X_train and y_train used to train pipeline
        clf_object = pipeline_xgb(x=X_array,
                                  y=y_array,
                                  test_features=Xt_array,
                                  test_target=yt_array)

        # array object
        y_pred = clf_object.predict(Xt_array)

        merchants = []
        # label_map ('merchant' : '1')
        for i in y_pred:
            for val, enc in label_map.items():
                if enc in y_pred:
                    merchants.append(val)
                else:
                    merchants.append("unseen in training")

        if store_model:
            # store trained model as pickle
            store_pickle(model=clf_object,
                         file_name=f"trained_model_{user[0]}")


        db_name = "postgres"
        db_user = "******"
        db_pw = "envel"
        db_host = "0.0.0.0"
        db_port = "5432"

        '''
        Always use %s placeholder for queries; psycopg2 will convert most data automatically
        '''

        try:
            connection = create_connection(db_name=db_name,
                                            db_user=db_user,
                                            db_password=db_pw,
                                            db_host=db_host,
                                            db_port=db_port)
            print("-------------")
            cursor = connection.cursor()
            sql_insert_query = """
            INSERT INTO test (test_col_2)
            VALUES (%s);
            """
            # merch_list = np.ndarray(['Tatte Bakery', 'Star Market', 'Stop n Shop', 'Auto Parts Shop',
            #               'Trader Joes', 'Insomnia Cookies'])

            # this line would take individual letters as tuples and inserts
            # values = tuple([tuple(row) for row in merchants])
            # tuple or list works
            values = merchants
            for i in values:
            # executemany() to insert multiple rows rows
            # one-element-tuple with (i, )
                cursor.execute(sql_insert_query, (i, ))

            connection.commit()
            print(len(values), "record(s) inserted successfully.")

        except (Exception, psycopg2.Error) as error:
            print("Failed inserting record; {}".format(error))

        finally:
            # closing database connection.
            if (connection):
                cursor.close()
                connection.close()
                print("Operation completed.\nPostgreSQL connection is closed.")
        print("=========================")


    return 'completed'
示例#5
0
def essentiality_cat(df):
    '''
    POSTGRESQL COLUMN - CLASSIFICATION OF TRANSACTIONS
    Following lists contains the categories to classify transactions either as expense or income
    names taken directly from the Yodlee dataset; can be appended at will
    '''
    #essential/ non-essential transactions
    card_ess = ['Rewards', 'Transfers', 'Refunds/Adjustments', 'Gifts']
    card_non_ess = [
        'Groceries', 'Automotive/Fuel', 'Home Improvement', 'Travel',
        'Restaurants', 'Healthcare/Medical', 'Credit Card Payments',
        'Electronics/General Merchandise', 'Entertainment/Recreation',
        'Postage/Shipping', 'Other Expenses', 'Personal/Family',
        'Service Charges/Fees', 'Services/Supplies', 'Utilities',
        'Office Expenses', 'Cable/Satellite/Telecom', 'Subscriptions/Renewals',
        'Insurance'
    ]
    bank_ess = [
        'Deposits', 'Salary/Regular Income', 'Transfers',
        'Investment/Retirement Income', 'Rewards', 'Other Income',
        'Refunds/Adjustments', 'Interest Income', 'Gifts',
        'Expense Reimbursement'
    ]
    bank_non_ess = [
        'Service Charges/Fees', 'Credit Card Payments', 'Utilities',
        'Healthcare/Medical', 'Loans', 'Check Payment',
        'Electronics/General Merchandise', 'Groceries', 'Automotive/Fuel',
        'Restaurants', 'Personal/Family', 'Entertainment/Recreation',
        'Services/Supplies', 'Other Expenses', 'ATM/Cash Withdrawals',
        'Cable/Satellite/Telecom', 'Postage/Shipping', 'Insurance', 'Travel',
        'Taxes', 'Home Improvement', 'Education', 'Charitable Giving',
        'Subscriptions/Renewals', 'Rent', 'Office Expenses', 'Mortgage'
    ]

    # Iterate through rows and mark transactions
    try:
        tr_class = pd.Series([], dtype='object')
        for index, i in enumerate(df['transaction_category_name']):
            if i in bank_ess:
                tr_class[index] = "essential"
            elif i in card_ess:
                tr_class[index] = "essential"
            elif i in bank_non_ess:
                tr_class[index] = "non_essential"
            elif i in card_non_ess:
                tr_class[index] = "non_essential"
            else:
                tr_class[index] = "unknown"
    except BaseException as error:
        print("column is already existing or following: {error}")

    db_name = "postgres"
    db_user = "******"
    db_pw = "envel"
    db_host = "0.0.0.0"
    db_port = "5432"
    '''
    Always use %s placeholder for queries; psycopg2 will convert most data automatically
    For special cases or conversion problems use adapters or "AsIs"
    Enclose the tuples in square brackets or leave without square brackets (no performance diff)
    Dataframes have to be split up into tuples and passed as list
    '''

    try:
        connection = create_connection(db_name=db_name,
                                       db_user=db_user,
                                       db_password=db_pw,
                                       db_host=db_host,
                                       db_port=db_port)
        print("-------------")
        cursor = connection.cursor()
        sql_insert_query = """
        INSERT INTO test (test_col_12)
        VALUES (%s);
        """

        # tuple or list works
        tup_tr = tuple(tr_class)
        for i in tup_tr:

            # one-element-tuple with (i, )
            cursor.execute(sql_insert_query, (i, ))

        connection.commit()
        print(len(df), "record(s) inserted successfully.")

    except (Exception, psycopg2.Error) as error:
        print("Failed inserting record; {}".format(error))

    finally:
        # closing database connection.
        if (connection):
            cursor.close()
            connection.close()
            print("Operation completed.\nPostgreSQL connection is closed.")
    print("=========================")

    return 'essentiality insert complete'
示例#6
0
def pull_df(rng=4, spending_report=False, plots=False):
    '''
    Parameters
    ----------
    rng : int, Random Seed for user picker. The default is 4.
    spending_report : bool, Save a spending report in directory if True. Default is False.
    plots : bool, Plots various graphs if True. Default is False.
    Returns
    -------
    df.
    '''

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)

    # establish connection to get user IDs for all users in MA
    filter_query = f"SELECT unique_mem_id, state, city, income_class FROM user_demographic WHERE state = 'MA'"
    transaction_query = execute_read_query(connection, filter_query)
    query_df = pd.DataFrame(
        transaction_query,
        columns=['unique_mem_id', 'state', 'city', 'income_class'])

    try:
        for i in pd.Series(query_df['unique_mem_id'].unique()).sample(
                n=1, random_state=rng):
            filter_query = f"SELECT * FROM bank_record WHERE unique_mem_id = '{i}'"
            transaction_query = execute_read_query(connection, filter_query)
            df = pd.DataFrame(
                transaction_query,
                columns=[
                    'unique_mem_id', 'unique_bank_account_id',
                    'unique_bank_transaction_id', 'amount', 'currency',
                    'description', 'transaction_date', 'post_date',
                    'transaction_base_type', 'transaction_category_name',
                    'primary_merchant_name', 'secondary_merchant_name', 'city',
                    'state', 'zip_code', 'transaction_origin',
                    'factual_category', 'factual_id', 'file_created_date',
                    'optimized_transaction_date', 'yodlee_transaction_status',
                    'mcc_raw', 'mcc_inferred', 'swipe_date',
                    'panel_file_created_date', 'update_type', 'is_outlier',
                    'change_source', 'account_type', 'account_source_type',
                    'account_score', 'user_score', 'lag', 'is_duplicate'
                ])
            print(f"User {i} has {len(df)} transactions on record.")
            # all these columns are empty or almost empty and contain no viable information
            df = df.drop(columns=[
                'secondary_merchant_name', 'swipe_date', 'update_type',
                'is_outlier', 'is_duplicate', 'change_source', 'lag',
                'mcc_inferred', 'mcc_raw', 'factual_id', 'factual_category',
                'zip_code', 'yodlee_transaction_status', 'file_created_date',
                'panel_file_created_date', 'account_source_type',
                'account_type', 'account_score', 'user_score', 'post_date',
                'transaction_date'
            ],
                         axis=1)
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback
    '''
    Plotting of various relations
    The Counter object keeps track of permutations in a dictionary which can then be read and
    used as labels
    '''
    if plots:
        # Pie chart States
        state_ct = Counter(list(df['state']))
        # The * operator can be used in conjunction with zip() to unzip the list.
        labels, values = zip(*state_ct.items())
        # Pie chart, where the slices will be ordered and plotted counter-clockwise:
        fig1, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values,
               labels=labels,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        #ax.title('Transaction locations of user {df[unique_mem_id][0]}')
        ax.legend(loc='center right')
        plt.show()

        # Pie chart transaction type
        trans_ct = Counter(list(df['transaction_category_name']))
        # The * operator can be used in conjunction with zip() to unzip the list.
        labels_2, values_2 = zip(*trans_ct.items())
        #Pie chart, where the slices will be ordered and plotted counter-clockwise:
        fig1, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values_2,
               labels=labels_2,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        ax.title("Transaction categories")
        ax.legend(loc='center right')
        plt.show()
    '''
    Generate a spending report of the unaltered dataframe
    Use the datetime columns just defined
    This report measures either the sum or mean of transactions happening
    on various days of the week/or wihtin a week or a month over the course of the year
    '''

    df['optimized_transaction_date'] = pd.to_datetime(
        df['optimized_transaction_date'])

    # set optimized transaction_date as index for later
    df.set_index('optimized_transaction_date', drop=False, inplace=True)

    df = df.drop(['unique_bank_account_id', 'unique_bank_transaction_id'],
                 axis=1)
    # generate the spending report with the above randomly picked user ID
    if spending_report:
        create_spending_report(df=df.copy())

    return df
示例#7
0
def db_insert_section(section=1):

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)

    fields = [
        'unique_mem_id', 'unique_bank_account_id',
        'unique_bank_transaction_id', 'amount', 'currency', 'description',
        'transaction_base_type', 'transaction_category_name',
        'primary_merchant_name', 'city', 'state', 'transaction_origin',
        'optimized_transaction_date', 'account_type', 'account_source_type',
        'account_score', 'user_score', 'panel_file_created_date'
    ]

    try:
        filter_query = f"(SELECT {', '.join(field for field in fields)} FROM card_record \
                        WHERE unique_mem_id IN (SELECT unique_mem_id FROM user_demographic\
                        ORDER BY unique_mem_id ASC LIMIT 10000 OFFSET\
                        {10000*(section-1)})) UNION ALL (SELECT {', '.join(field for field in fields)} \
                         FROM bank_record WHERE unique_mem_id IN (SELECT unique_mem_id FROM \
                          user_demographic ORDER BY unique_mem_id ASC LIMIT 10000 OFFSET {10000*(section-1)}))"

        transaction_query = execute_read_query(connection, filter_query)
        main_df = pd.DataFrame(transaction_query, columns=fields)
        print(f"{len(transaction_query)} transactions.")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback

    for num, user in enumerate(main_df.groupby('unique_mem_id')):
        print(
            f"user {user[0]}, {num+1}/10000 users, {round(((num+1)/10000)*100, 2)}%."
        )

        # running functions in order
        df, df_null, embedding_map = df_encoder(
            df=main_df, sel_feature='primary_merchant_name')

        X_features, X_test, X_train, y_test, y_train = split_data(
            full_df=df, null_df=df_null)

        grid_search_svc = svc_class(X_train, X_test, y_train, y_test)

        prediction_df, predictions = add_pred(grid_search=grid_search_svc,
                                              X_features=X_features,
                                              label='primary_merchant_name')

        merchants = []

        for i in predictions:
            for val, enc in embedding_map.items():
                if enc in predictions:
                    merchants.append(val)
                else:
                    merchants.append("unseen in training")
        merchants_short = merchants[:len(predictions)]

        # attach merchants to prediction_df; list will be turned into df column
        prediction_df = prediction_df.assign(merchants=merchants_short)

        card_ess = ['Rewards', 'Transfers', 'Refunds/Adjustments', 'Gifts']
        card_non_ess = [
            'Groceries', 'Automotive/Fuel', 'Home Improvement', 'Travel',
            'Restaurants', 'Healthcare/Medical', 'Credit Card Payments',
            'Electronics/General Merchandise', 'Entertainment/Recreation',
            'Postage/Shipping', 'Other Expenses', 'Personal/Family',
            'Service Charges/Fees', 'Services/Supplies', 'Utilities',
            'Office Expenses', 'Cable/Satellite/Telecom',
            'Subscriptions/Renewals', 'Insurance'
        ]
        bank_ess = [
            'Deposits', 'Salary/Regular Income', 'Transfers',
            'Investment/Retirement Income', 'Rewards', 'Other Income',
            'Refunds/Adjustments', 'Interest Income', 'Gifts',
            'Expense Reimbursement'
        ]
        bank_non_ess = [
            'Service Charges/Fees', 'Credit Card Payments', 'Utilities',
            'Healthcare/Medical', 'Loans', 'Check Payment',
            'Electronics/General Merchandise', 'Groceries', 'Automotive/Fuel',
            'Restaurants', 'Personal/Family', 'Entertainment/Recreation',
            'Services/Supplies', 'Other Expenses', 'ATM/Cash Withdrawals',
            'Cable/Satellite/Telecom', 'Postage/Shipping', 'Insurance',
            'Travel', 'Taxes', 'Home Improvement', 'Education',
            'Charitable Giving', 'Subscriptions/Renewals', 'Rent',
            'Office Expenses', 'Mortgage'
        ]

        # Iterate through rows and mark transactions
        try:
            tr_class = pd.Series([], dtype='object')
            for index, i in enumerate(
                    prediction_df['transaction_category_name']):
                if i in bank_ess:
                    tr_class[index] = "essential"
                elif i in card_ess:
                    tr_class[index] = "essential"
                elif i in bank_non_ess:
                    tr_class[index] = "non_essential"
                elif i in card_non_ess:
                    tr_class[index] = "non_essential"
                else:
                    tr_class[index] = "unknown"
            prediction_df = prediction_df.assign(
                tr_essentiality=tr_class.values)
        except BaseException as error:
            print(f"column is already existing or following: {error}")

        cash_env = [
            'Rewards', 'Transfers', 'Refunds/Adjustments', 'Interest Income',
            'Restaurants', 'Electronics/General Merchandise',
            'Entertainment/Recreation', 'Postage/Shipping', 'Other Expenses',
            'Other Income', 'Expense Reimbursement', 'Personal/Family',
            'Travel', 'Office Expenses', 'Deposits', 'Salary/Regular Income',
            'Investment/Retirement Income', 'ATM/Cash Withdrawals'
        ]

        bill_env = [
            'Check Payment', 'Rent', 'Mortgage', 'Subscriptions/Renewals',
            'Healthcare/Medical', 'Credit Card Payments',
            'Service Charges/Fees', 'Services/Supplies', 'Utilities',
            'Insurance', 'Taxes', 'Home Improvement', 'Cable/Satellite/Telecom'
        ]

        car_env = ['Automotive/Fuel']

        gro_env = ['Groceries']

        chari_env = ['Charitable Giving', 'Gifts']

        #iterate through rows and create a new columns with a note that it is either an expense or income
        try:
            envel_cat = pd.Series([], dtype='object')
            for index, i in enumerate(
                    prediction_df['transaction_category_name']):
                if i in cash_env:
                    envel_cat[index] = "cash"
                elif i in bill_env:
                    envel_cat[index] = "bill"
                elif i in car_env:
                    envel_cat[index] = "automobile"
                elif i in gro_env:
                    envel_cat[index] = "groceries"
                elif i in chari_env:
                    envel_cat[index] = "charity"
                else:
                    envel_cat[index] = "not_classified"
            prediction_df = prediction_df.assign(
                envelope_category=envel_cat.values)
        except BaseException as error:
            print(f"envelope column is already existing or following {error}")

        db_name = "postgres"
        db_user = "******"
        db_pw = "envel"
        db_host = "0.0.0.0"
        db_port = "5432"
        '''
        Always use %s placeholder for queries; psycopg2 will convert most data automatically
        For special cases or conversion problems use adapters or "AsIs"
        Enclose the tuples in square brackets or leave without square brackets (no performance diff)
        Dataframes have to be split up into tuples and passed as list
        '''

        try:
            connection = create_connection(db_name=db_name,
                                           db_user=db_user,
                                           db_password=db_pw,
                                           db_host=db_host,
                                           db_port=db_port)
            print("Sesson_1\n-------------")
            cursor = connection.cursor()
            sql_insert_query = """
            INSERT INTO test (test_col, test_col_2, test_col_3, test_col_4, test_col_5,
                              test_col_6, test_col_7, test_col_8, test_col_9, test_col_10,
                              test_col_11, test_col_12, test_col_13, test_col_14,
                              test_col_15, test_col_16, test_col_17, test_col_18,
                              test_col_19, test_col_20)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s);
            """

            for i in prediction_df.itertuples():
                # executemany() to insert multiple rows rows
                # one-element-tuple with (i, )
                cursor.execute(sql_insert_query, ([
                    i.unique_mem_id, i.unique_bank_account_id,
                    i.unique_bank_transaction_id, i.amount, i.description,
                    i.transaction_base_type, i.transaction_category_name,
                    i.city, i.state, i.transaction_origin,
                    i.optimized_transaction_date, i.account_type,
                    i.account_source_type, i.account_score, i.user_score,
                    i.panel_file_created_date, i.primary_merchant_name,
                    i.merchants, i.tr_essentiality, i.envelope_category
                ]))

            connection.commit()
            print(len(prediction_df), "record(s) inserted successfully.")

        except (Exception, psycopg2.Error) as error:
            print("Failed inserting record; {}".format(error))

        finally:
            # closing database connection.
            if (connection):
                cursor.close()
                connection.close()
                print("Operation completed.\nPostgreSQL connection is closed.")
        print("=========================")
        return 'insert done'
示例#8
0
def envelope_cat(df):
    '''
    POSTGRE-SQL COLUMNS - ALLOCATION TO ENVELOPES
    This section adds a classification of transaction categories to allow a proper allocation to either the cash or the bills envelope
    Bills describe as of 3/26/2020 all kinds of payment whose occurrence are beyond one's control,
    that come due and for which non-compliance has severe consequences
    All other kinds of payments that are of optional nature and can be avoided are classifed as cash
    '''

    cash_env = [
        'Rewards', 'Transfers', 'Refunds/Adjustments', 'Interest Income',
        'Restaurants', 'Electronics/General Merchandise',
        'Entertainment/Recreation', 'Postage/Shipping', 'Other Expenses',
        'Other Income', 'Expense Reimbursement', 'Personal/Family', 'Travel',
        'Office Expenses', 'Deposits', 'Salary/Regular Income',
        'Investment/Retirement Income', 'ATM/Cash Withdrawals'
    ]

    bill_env = [
        'Check Payment', 'Rent', 'Mortgage', 'Subscriptions/Renewals',
        'Healthcare/Medical', 'Credit Card Payments', 'Service Charges/Fees',
        'Services/Supplies', 'Utilities', 'Insurance', 'Taxes',
        'Home Improvement', 'Cable/Satellite/Telecom'
    ]

    car_env = ['Automotive/Fuel']

    gro_env = ['Groceries']

    chari_env = ['Charitable Giving', 'Gifts']

    #iterate through rows and create a new columns with a note that it is either an expense or income
    try:
        envel_cat = pd.Series([], dtype='object')
        for index, i in enumerate(df['transaction_category_name']):
            if i in cash_env:
                envel_cat[index] = "cash"
            elif i in bill_env:
                envel_cat[index] = "bill"
            elif i in car_env:
                envel_cat[index] = "automobile"
            elif i in gro_env:
                envel_cat[index] = "groceries"
            elif i in chari_env:
                envel_cat[index] = "charity"
            else:
                envel_cat[index] = "NOT_CLASSIFIED"
    except BaseException as error:
        print("envelope column is already existing or following {error}")

    db_name = "postgres"
    db_user = "******"
    db_pw = "envel"
    db_host = "0.0.0.0"
    db_port = "5432"
    '''
    Always use %s placeholder for queries; psycopg2 will convert most data automatically
    For special cases or conversion problems use adapters or "AsIs"
    Enclose the tuples in square brackets or leave without square brackets (no performance diff)
    Dataframes have to be split up into tuples and passed as list
    '''

    try:
        connection = create_connection(db_name=db_name,
                                       db_user=db_user,
                                       db_password=db_pw,
                                       db_host=db_host,
                                       db_port=db_port)
        print("-------------")
        cursor = connection.cursor()
        sql_insert_query = """
        INSERT INTO test (test_col_13)
        VALUES (%s);
        """

        # tuple or list works
        tup_ec = tuple(envel_cat)
        for i in tup_ec:

            # one-element-tuple with (i, )
            cursor.execute(sql_insert_query, (i, ))

        connection.commit()
        print(len(df), "record(s) inserted successfully.")

    except (Exception, psycopg2.Error) as error:
        print("Failed inserting record; {}".format(error))

    finally:
        # closing database connection.
        if (connection):
            cursor.close()
            connection.close()
            print("Operation completed.\nPostgreSQL connection is closed.")
    print("=========================")

    return 'envelope categotization complete'
示例#9
0
def df_encoder(rng=4,
               spending_report=False,
               plots=False,
               include_lag_features=True):
    '''
    Parameters
    ----------
    rng : int, Random Seed for user picker. The default is 4.
    spending_report : bool, Save a spending report in directory if True. Default is False.
    plots : bool, Plots various graphs if True. Default is False.
    include_lag_features : include lag feature 'amount' to database with 3, 7, and 30 day rolls. Default is True

    Returns
    -------
    bank_df.
    '''

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)

    # establish connection to get user IDs for all users in MA
    filter_query = f"SELECT unique_mem_id, state, city, income_class FROM user_demographic WHERE state = 'MA'"
    transaction_query = execute_read_query(connection, filter_query)
    query_df = pd.DataFrame(
        transaction_query,
        columns=['unique_mem_id', 'state', 'city', 'income_class'])

    # dateframe to gather bank data from one randomly chosen user
    # test user 1= 4
    # test user 2= 8
    try:
        for i in pd.Series(query_df['unique_mem_id'].unique()).sample(
                n=1, random_state=rng):
            print(i)
            filter_query = f"SELECT * FROM bank_record WHERE unique_mem_id = '{i}'"
            transaction_query = execute_read_query(connection, filter_query)
            bank_df = pd.DataFrame(
                transaction_query,
                columns=[
                    'unique_mem_id', 'unique_bank_account_id',
                    'unique_bank_transaction_id', 'amount', 'currency',
                    'description', 'transaction_date', 'post_date',
                    'transaction_base_type', 'transaction_category_name',
                    'primary_merchant_name', 'secondary_merchant_name', 'city',
                    'state', 'zip_code', 'transaction_origin',
                    'factual_category', 'factual_id', 'file_created_date',
                    'optimized_transaction_date', 'yodlee_transaction_status',
                    'mcc_raw', 'mcc_inferred', 'swipe_date',
                    'panel_file_created_date', 'update_type', 'is_outlier',
                    'change_source', 'account_type', 'account_source_type',
                    'account_score', 'user_score', 'lag', 'is_duplicate'
                ])
            print(f"User {i} has {len(bank_df)} transactions on record.")
            #all these columns are empty or almost empty and contain no viable information
            bank_df = bank_df.drop(columns=[
                'secondary_merchant_name', 'swipe_date', 'update_type',
                'is_outlier', 'is_duplicate', 'change_source', 'lag',
                'mcc_inferred', 'mcc_raw', 'factual_id', 'factual_category',
                'zip_code', 'yodlee_transaction_status', 'file_created_date',
                'panel_file_created_date', 'account_type',
                'account_source_type', 'account_score'
            ],
                                   axis=1)
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback
    '''
    Plotting of various relations
    The Counter object keeps track of permutations in a dictionary which can then be read and
    used as labels
    '''
    if plots:
        # Pie chart States
        state_ct = Counter(list(bank_df['state']))
        # The * operator can be used in conjunction with zip() to unzip the list.
        labels, values = zip(*state_ct.items())
        # Pie chart, where the slices will be ordered and plotted counter-clockwise:
        fig1, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values,
               labels=labels,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        #ax.title('Transaction locations of user {bank_df[unique_mem_id][0]}')
        ax.legend(loc='center right')
        plt.show()

        # Pie chart transaction type
        trans_ct = Counter(list(bank_df['transaction_category_name']))
        # The * operator can be used in conjunction with zip() to unzip the list.
        labels_2, values_2 = zip(*trans_ct.items())
        #Pie chart, where the slices will be ordered and plotted counter-clockwise:
        fig1, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values_2,
               labels=labels_2,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        #ax.title('Transaction categories of user {bank_df[unique_mem_id][0]}')
        ax.legend(loc='center right')
        plt.show()
    '''
    Generate a spending report of the unaltered dataframe
    Use the datetime columns just defined
    This report measures either the sum or mean of transactions happening
    on various days of the week/or wihtin a week or a month  over the course of the year
    '''
    # convert all date col from date to datetime objects
    # date objects will block Select K Best if not converted
    # first conversion from date to datetime objects; then conversion to unix
    bank_df['post_date'] = pd.to_datetime(bank_df['post_date'])
    bank_df['transaction_date'] = pd.to_datetime(bank_df['transaction_date'])
    bank_df['optimized_transaction_date'] = pd.to_datetime(
        bank_df['optimized_transaction_date'])
    bank_df['file_created_date'] = pd.to_datetime(bank_df['file_created_date'])
    bank_df['panel_file_created_date'] = pd.to_datetime(
        bank_df['panel_file_created_date'])

    # set optimized transaction_date as index for later
    bank_df.set_index('optimized_transaction_date', drop=False, inplace=True)

    # generate the spending report with the above randomly picked user ID
    if spending_report:
        create_spending_report(df=bank_df.copy())
    '''
    After successfully loading the data, columns that are of no importance have been removed and missing values replaced
    Then the dataframe is ready to be encoded to get rid of all non-numerical data
    '''
    try:
        # Include below if need unique ID's later:
        # bank_df['unique_mem_id'] = bank_df['unique_mem_id'].astype(
        #     'str', errors='ignore')
        # bank_df['unique_bank_account_id'] = bank_df['unique_bank_account_id'].astype(
        #     'str', errors='ignore')
        # bank_df['unique_bank_transaction_id'] = bank_df['unique_bank_transaction_id'].astype(
        #     'str', errors='ignore')
        bank_df['amount'] = bank_df['amount'].astype('float64')
        bank_df['transaction_base_type'] = bank_df[
            'transaction_base_type'].replace(to_replace=["debit", "credit"],
                                             value=[1, 0])
    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")

    # attempt to convert date objects to unix timestamps as numeric value (fl64) if they have no missing values; otherwise they are being dropped
    date_features = [
        'post_date', 'transaction_date', 'optimized_transaction_date',
        'file_created_date', 'panel_file_created_date'
    ]
    try:
        for feature in date_features:
            if bank_df[feature].isnull().sum() == 0:
                bank_df[feature] = bank_df[feature].apply(
                    lambda x: dt.timestamp(x))
            else:
                bank_df = bank_df.drop(columns=feature, axis=1)
                print(f"Column {feature} dropped")

    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")
    '''
    The columns PRIMARY_MERCHANT_NAME; CITY, STATE, DESCRIPTION, TRANSACTION_CATEGORY_NAME, CURRENCY
    are encoded manually and cleared of empty values
    '''
    encoding_features = [
        'primary_merchant_name', 'city', 'state', 'description',
        'transaction_category_name', 'transaction_origin', 'currency'
    ]
    UNKNOWN_TOKEN = '<unknown>'
    embedding_maps = {}
    for feature in encoding_features:
        unique_list = bank_df[feature].unique().astype('str').tolist()
        unique_list.append(UNKNOWN_TOKEN)
        le = LabelEncoder()
        le.fit_transform(unique_list)
        embedding_maps[feature] = dict(
            zip(le.classes_, le.transform(le.classes_)))

        # APPLICATION TO OUR DATASET
        bank_df[feature] = bank_df[feature].apply(
            lambda x: x if x in embedding_maps[feature] else UNKNOWN_TOKEN)
        bank_df[feature] = bank_df[feature].map(lambda x: le.transform([x])[0]
                                                if type(x) == str else x)

    # dropping currency if there is only one
    if len(bank_df['currency'].value_counts()) == 1:
        bank_df = bank_df.drop(columns=['currency'], axis=1)
    '''
    IMPORTANT
    The lagging features produce NaN for the first two rows due to unavailability
    of values
    NaNs need to be dropped to make scaling and selection of features working
    '''
    if include_lag_features:
        #FEATURE ENGINEERING
        #typical engineered features based on lagging metrics
        #mean + stdev of past 3d/7d/30d/ + rolling volume
        date_index = bank_df.index.values
        bank_df.reset_index(drop=True, inplace=True)
        #pick lag features to iterate through and calculate features
        lag_features = ["amount"]
        #set up time frames; how many days/months back/forth
        t1 = 3
        t2 = 7
        t3 = 30
        #rolling values for all columns ready to be processed
        bank_df_rolled_3d = bank_df[lag_features].rolling(window=t1,
                                                          min_periods=0)
        bank_df_rolled_7d = bank_df[lag_features].rolling(window=t2,
                                                          min_periods=0)
        bank_df_rolled_30d = bank_df[lag_features].rolling(window=t3,
                                                           min_periods=0)

        #calculate the mean with a shifting time window
        bank_df_mean_3d = bank_df_rolled_3d.mean().shift(
            periods=1).reset_index().astype(np.float32)
        bank_df_mean_7d = bank_df_rolled_7d.mean().shift(
            periods=1).reset_index().astype(np.float32)
        bank_df_mean_30d = bank_df_rolled_30d.mean().shift(
            periods=1).reset_index().astype(np.float32)

        #calculate the std dev with a shifting time window
        bank_df_std_3d = bank_df_rolled_3d.std().shift(
            periods=1).reset_index().astype(np.float32)
        bank_df_std_7d = bank_df_rolled_7d.std().shift(
            periods=1).reset_index().astype(np.float32)
        bank_df_std_30d = bank_df_rolled_30d.std().shift(
            periods=1).reset_index().astype(np.float32)

        for feature in lag_features:
            bank_df[f"{feature}_mean_lag{t1}"] = bank_df_mean_3d[feature]
            bank_df[f"{feature}_mean_lag{t2}"] = bank_df_mean_7d[feature]
            bank_df[f"{feature}_mean_lag{t3}"] = bank_df_mean_30d[feature]
            bank_df[f"{feature}_std_lag{t1}"] = bank_df_std_3d[feature]
            bank_df[f"{feature}_std_lag{t2}"] = bank_df_std_7d[feature]
            bank_df[f"{feature}_std_lag{t3}"] = bank_df_std_30d[feature]

        bank_df.set_index(date_index, drop=False, inplace=True)

    #drop all features left with empty (NaN) values
    bank_df = bank_df.dropna()
    #drop user IDs to avoid overfitting with useless information
    bank_df = bank_df.drop([
        'unique_mem_id', 'unique_bank_account_id', 'unique_bank_transaction_id'
    ],
                           axis=1)

    if plots:
        # seaborn plots
        ax_desc = bank_df['description'].astype('int64', errors='ignore')
        ax_amount = bank_df['amount'].astype('int64', errors='ignore')
        sns.pairplot(bank_df)
        sns.boxplot(x=ax_desc, y=ax_amount)
        sns.heatmap(bank_df)

    return bank_df
示例#10
0
def df_insert_query(df):
    
    """
    Parameters
    ---------
    df. pandas dataframe that is to be inserted into a databank.
    Returns
    --------
    'insert complete' msg.
    """



    # create the placeholders for the columns that will be fitted with values
    #col_records = ", ".join(["%s"] * len(df.columns))

    db_name = "postgres"
    db_user = "******"
    db_pw = "envel"
    db_host = "0.0.0.0"
    db_port = "5432"
    
    '''
    Always use %s placeholder for queries; psycopg2 will convert most data automatically
    For special cases or conversion problems use adapters or "AsIs"
    Enclose the tuples in square brackets or leave without square brackets (no performance diff)
    '''

    try:
        connection = create_connection(db_name=db_name,
                                        db_user=db_user,
                                        db_password=db_pw,
                                        db_host=db_host,
                                        db_port=db_port)
        print("-------------")
        cursor = connection.cursor()
        sql_insert_query = """
        INSERT INTO test (test_col, test_col_2, test_col_3, test_col_4,
                          test_col_5, test_col_6, test_col_7, test_col_8,
                          test_col_9, test_col_10, test_col_11)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        # merch_list = np.ndarray(['Tatte Bakery', 'Star Market', 'Stop n Shop', 'Auto Parts Shop',
        #               'Trader Joes', 'Insomnia Cookies'])

        # tuple or list works
        # split up into tuples and pass as list
        for i in df.itertuples():
        # executemany() to insert multiple rows rows
        # one-element-tuple with (i, )
            cursor.execute(sql_insert_query, ([i.unique_mem_id, i.amount, i.currency,
                                               i.description, i.transaction_base_type,
                                               i.transaction_category_name,
                                               i.primary_merchant_name, i.city,
                                               i.state, i.transaction_origin,
                                               i.optimized_transaction_date]))

        connection.commit()
        print(len(df), "record(s) inserted successfully.")

    except (Exception, psycopg2.Error) as error:
        print("Failed inserting record; {}".format(error))

    finally:
        # closing database connection.
        if (connection):
            cursor.close()
            connection.close()
            print("Operation completed.\nPostgreSQL connection is closed.")
    print("=========================")

    return'insert completed'