def generate_person_id_construction(schema):
    # create the person ID lookup from the valid BirthDate/LastName combinations
    query = """
    select distinct concat("BirthDate","LastName") as "combinaison"
    from "CUSTOMERS"."MASTER_ID"
    where "GoodCombinaison" = 1
    """
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    data = pd.read_sql(query, con=engine)
    engine.close()

    query = """
    CREATE TABLE "{}"."ID_PERSON"(
        "PERSON_ID" SERIAL,
        "combinaison" TEXT
    )
    """.format(schema)
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    engine.execute(query)
    engine.close()

    InsertTableIntoDatabase(data,
                            TlbName="ID_PERSON",
                            Schema=schema,
                            database_name="Creacard_Calypso",
                            database_type="Postgres",
                            DropTable=False,
                            InsertInParrell=False)

    query = """
    update "CUSTOMERS"."MASTER_ID"
    set "PERSON_ID" = T1."PERSON_ID"
    from "CUSTOMERS"."ID_PERSON" as T1
    where concat("CUSTOMERS"."MASTER_ID"."BirthDate",
                 "CUSTOMERS"."MASTER_ID"."LastName") = T1."combinaison"
    """
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    engine.execute(query)
    engine.close()

    # cards without a valid combination fall back to a USER_ID/MOBILE_ID composite
    query = """
    update "CUSTOMERS"."MASTER_ID"
    set "PERSON_ID" = concat("USER_ID",'_',"MOBILE_ID")
    where "GoodCombinaison" = 0
    and "PERSON_ID" is null
    """
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    engine.execute(query)
    engine.close()
def update_output_div(input_value):
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    query = """
    SELECT "table_name"
    FROM "information_schema"."tables"
    WHERE table_schema IN ('{}')
    """.format(input_value)
    data = pd.read_sql(query, con=engine)
    engine.close()

    tmp_list = data["table_name"].tolist()
    del data

    # default the selected value to the first table of the schema, if any
    val = "" if not tmp_list else tmp_list[0]
    options = [{'label': i, 'value': i} for i in tmp_list]
    # return both the dropdown options and the default selected value
    return options, val
def add_fees_others_transactions(database_type, database_name, _year, _month, _day, **kwargs):
    _tlbname = kwargs.get('tlbname', "FEES_TRANSACTIONS")
    _schema = kwargs.get('schema', "TRANSACTIONS")

    date_start = datetime.datetime(_year, _month, _day)
    date_start_cond = str(date_start)[0:10]
    end_date = date_start + datetime.timedelta(days=1)
    end_date = str(end_date)[0:10]

    engine = connect_to_database(database_type, database_name).CreateEngine()

    # check whether this date has already been processed
    query = """
    select count(*) from "{}"."{}"
    where "TransactionTime" >= '{}'
    and "TransactionTime" < '{}'
    """.format(_schema, _tlbname, date_start_cond, end_date)
    data = pd.read_sql(query, con=engine)

    if data.iloc[0, 0] == 0:
        querytmp = """
        SELECT "CardHolderID","MCC","Fee","Surcharge","TransactionTP","TransactionTime","Currency",
        "CardVPUType", "MerchantAddress", "MerchantCity", "MerchantCountry", "MerchantID", "TransactionID"
        FROM "TRANSACTIONS_MONTHLY"."MONTHLY_TRANSACTIONS_{}"
        where "DebitCredit" IN ('Debit')
        and "TransactionTP" ~* 'fee'
        and "TransactionTP" !~* 'reversal'
        and "TransactionResult" = 'APPROVED'
        and "TransactionTime" >= '{}'
        and "TransactionTime" < '{}'
        """.format(str(date_start.year) + str(date_start.month), date_start_cond, end_date)

        query = """
        insert into "{}"."{}"
        {}
        """.format(_schema, _tlbname, querytmp)
        engine.execute(query)

        # FX fees can arrive negative; store the surcharge as an absolute value
        query_update = """
        update "TRANSACTIONS"."FEES_TRANSACTIONS"
        set "Surcharge" = ABS("Surcharge")
        where "TransactionTP" = 'FX Fee'
        and "Surcharge" < 0
        """
        engine.execute(query_update)
        engine.close()
    else:
        print("this date has already been processed")
def create_master_id(schema):
    query = """
    CREATE TABLE "{}"."MASTER_ID" as
    select *,
    null::integer as "MOBILE_ID",
    null::bigint as "USER_ID",
    null::integer as "CONTACT_ID",
    null::text as "PERSON_ID",
    null::bigint as "MOVIDON_ID"
    from "{}"."TMP_USER_ID"
    """.format(schema, schema)
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    engine.execute(query)
    engine.close()

    query = """
    ALTER TABLE "CUSTOMERS"."MASTER_ID"
    ALTER COLUMN "CONTACT_ID" TYPE VARCHAR(50)
    """
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    engine.execute(query)
    engine.close()
def add_new_atm_transactions(database_type, database_name, _year, _month, _day, **kwargs):
    _tlbname = kwargs.get('tlbname', "ATM_TRANSACTIONS")
    _schema = kwargs.get('schema', "TRANSACTIONS")

    date_start = datetime.datetime(_year, _month, _day)
    date_start_cond = str(date_start)[0:10]
    end_date = date_start + datetime.timedelta(days=1)
    end_date = str(end_date)[0:10]

    engine = connect_to_database(database_type, database_name).CreateEngine()

    # check whether this date has already been processed
    query = """
    select count(*) from "{}"."{}"
    where "TransactionTime" >= '{}'
    and "TransactionTime" < '{}'
    """.format(_schema, _tlbname, date_start_cond, end_date)
    data = pd.read_sql(query, con=engine)

    if data.iloc[0, 0] == 0:
        querytmp = """
        SELECT "CardHolderID","MCC","Amount","MerchantName","TransactionTime","Currency",
        "CardVPUType", "MerchantAddress", "MerchantCity", "MerchantCountry", "MerchantID", "TransactionID",
        CASE WHEN "TransactionTP" in ('ATM International') then 1 else 0 end as "IsInternational","TransactionTP"
        FROM "TRANSACTIONS_MONTHLY"."MONTHLY_TRANSACTIONS_{}"
        where "TransactionTP" IN ('ATM Domestic','ATM International')
        and "DebitCredit" IN ('Debit')
        and "TransactionResult" = 'APPROVED'
        and "TransactionTime" >= '{}'
        and "TransactionTime" < '{}'
        """.format(str(date_start.year) + str(date_start.month), date_start_cond, end_date)

        query = """
        insert into "{}"."{}"
        {}
        """.format(_schema, _tlbname, querytmp)
        engine.execute(query)
        engine.close()
    else:
        print("this date has already been processed")
def add_new_loads_transactions(database_type, database_name, _year, _month, _day, **kwargs):
    _tlbname = kwargs.get('tlbname', "LOADS_TRANSACTIONS")
    _schema = kwargs.get('schema', "TRANSACTIONS")

    date_start = datetime.datetime(_year, _month, _day)
    date_start_cond = str(date_start)[0:10]
    end_date = date_start + datetime.timedelta(days=1)
    end_date = str(end_date)[0:10]

    engine = connect_to_database(database_type, database_name).CreateEngine()

    # check whether this date has already been processed
    query = """
    select count(*) from "{}"."{}"
    where "TransactionTime" >= '{}'
    and "TransactionTime" < '{}'
    """.format(_schema, _tlbname, date_start_cond, end_date)
    data = pd.read_sql(query, con=engine)

    if data.iloc[0, 0] == 0:
        querytmp = """
        SELECT "CardHolderID","MCC","Amount","TransactionTP","TransactionTime","Currency",
        "CardVPUType", "MerchantAddress", "MerchantCity", "MerchantCountry", "MerchantID", "TransactionID"
        FROM "TRANSACTIONS_MONTHLY"."MONTHLY_TRANSACTIONS_{}"
        WHERE "DebitCredit" IN ('Credit')
        and "TransactionResult" = 'APPROVED'
        AND "TransactionTP" IN ('Voucher load','Terminal Load','Sepa Incoming Payment','Card to Card In','INTERNET DEBIT/CREDIT')
        and "TransactionTime" >= '{}'
        and "TransactionTime" < '{}'
        """.format(str(date_start.year) + str(date_start.month), date_start_cond, end_date)

        query = """
        insert into "{}"."{}"
        {}
        """.format(_schema, _tlbname, querytmp)
        engine.execute(query)
        engine.close()
    else:
        print("this date has already been processed")
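# The three daily loaders above (fees, ATM, loads) share the same idempotence
# pattern: count the rows already stored for the target day and insert only
# when that count is zero, so re-running a day is safe. A minimal sketch of a
# backfill driver built on that property follows; the date range and the
# "Creacard_Calypso" connection are illustrative, taken from defaults used
# elsewhere in this file, not from a documented entry point.

def backfill_transactions(start, days):
    # hypothetical driver: walks a date range and (re-)runs each daily loader;
    # already-processed days are skipped by the count(*) guard inside each loader
    for offset in range(days):
        day = start + datetime.timedelta(days=offset)
        add_fees_others_transactions("Postgres", "Creacard_Calypso",
                                     day.year, day.month, day.day)
        add_new_atm_transactions("Postgres", "Creacard_Calypso",
                                 day.year, day.month, day.day)
        add_new_loads_transactions("Postgres", "Creacard_Calypso",
                                   day.year, day.month, day.day)

# example: backfill_transactions(datetime.datetime(2019, 6, 1), 7)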
def copy_to_csv(query, local_filename, local_folder, connexion_name, **kwargs):
    tic = time.time()
    to_compress = kwargs.get('compression', False)
    _remove_csv_file = kwargs.get('remove_csv_file', False)
    _delimiter = kwargs.get('delimiter', ';')

    # suffix the file name with the current date (YYYYMMDD)
    local_filename = local_filename + "_" + \
        str(datetime.datetime.now())[0:10].replace("-", "") + ".csv"
    local_path = local_folder + local_filename

    # extract the query result to csv with the built-in postgres COPY command,
    # passing the query and the destination file
    cmd = """
    COPY ({}) TO '{}' WITH CSV HEADER DELIMITER '{}';
    """.format(query, local_path, _delimiter)
    engine = connect_to_database("Postgres", connexion_name).CreateEngine()
    engine.execute(cmd)
    engine.close()

    if to_compress:
        cmd = "Compress-Archive " + local_path + " " + \
            local_folder + local_filename.replace(".csv", "") + ".zip"
        p = subprocess.Popen(['powershell.exe', cmd])
        # wait until the compression process finishes
        p.wait()
        print("compression of the file is finished")
        local_path = local_folder + local_filename.replace(".csv", "") + ".zip"
        if _remove_csv_file:
            os.remove(local_folder + local_filename)

    print("file extraction from postgres took {} seconds".format(time.time() - tic))
    return local_path, local_filename
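# copy_to_csv relies on the server-side COPY ... TO, so the destination path
# must be writable by the Postgres server process itself, not just by this
# script. A minimal usage sketch; the query, folder and connection name are
# illustrative assumptions:
#
#   path, name = copy_to_csv('SELECT * FROM "TRANSACTIONS"."FEES_TRANSACTIONS"',
#                            "fees_export", "C:/exports/", "Creacard_Calypso",
#                            compression=True, remove_csv_file=True)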
def insert_into_postgres_copyfrom(df, database_type, database_name, schema, TlbName, **kwargs):
    _encod = kwargs.get('encoding', 'utf-8')

    # data formatting before the insert
    for date_var in df.dtypes[df.dtypes == "datetime64[ns]"].index:
        df[date_var] = df[date_var].astype(str).replace("NaT", "None")
    for var_name in df.dtypes[df.dtypes == "object"].index:
        # strip commas so values cannot break the csv buffer used by copy_from
        df[var_name] = df[var_name].str.replace(",", "")
        df[var_name] = df[var_name].astype('category')

    output = StringIO.StringIO()  # Python 2 spelling; on Python 3 use io.StringIO()
    # write NaN as the literal "None" so it matches the null marker given to copy_from
    df.reset_index(drop=True).to_csv(output, header=None, index=False, sep=",",
                                     na_rep="None", encoding=_encod)
    output.seek(0)

    con = connect_to_database(database_type, database_name).CreateEngine()
    cur = con.connection.cursor()
    cur.copy_from(output, '"{}"."{}"'.format(schema, TlbName), sep=",", null="None")
    con.connection.commit()
    con.close()
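# copy_from streams the whole frame through a single COPY, which is much
# faster than row-wise INSERTs for large frames. Minimal usage sketch; the
# frame, schema and table names are illustrative assumptions:
#
#   df = pd.DataFrame({"CardHolderID": ["a1", "b2"], "Amount": [10.0, 5.5]})
#   insert_into_postgres_copyfrom(df, "Postgres", "Creacard_Calypso",
#                                 "TMP_UPDATE", "TMP_DEMO_TABLE")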
def update_output_div(schema, tlb_name):
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    # note the placeholder order: first the table name, then the schema
    query = """
    select column_name, data_type from information_schema.columns
    where table_name = '{}'
    and table_schema IN ('{}')
    """.format(tlb_name, schema)
    data = pd.read_sql(query, con=engine)
    engine.close()

    col_name = list({"name": i, "id": i} for i in data.columns)
    datas = data.to_dict('records')
    return col_name, datas
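# Both update_output_div helpers read like Dash callback bodies. A hedged
# sketch of how the first one could be wired to the app defined below; the
# component ids ('schema-dropdown', 'table-dropdown') are assumptions for
# illustration, not taken from this code:
#
#   @app.callback([Output('table-dropdown', 'options'),
#                  Output('table-dropdown', 'value')],
#                 [Input('schema-dropdown', 'value')])
#   def refresh_tables(schema):
#       return update_output_div(schema)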
def calypso_ids_production(schema_main, connexion_postgres):
    """Compute new ids and ids that changed over time

    Parameters
    -----------
    schema_main: str
        schema where the ids are stored
    connexion_postgres: str
        name of the postgres connection (referring to the conf_python file)
    """
    # extract the exclusion conditions stored on the computer
    if sys.platform == "win32":
        folder_json = os.path.expanduser(
            '~') + "\\conf_python\\unique_id_conditions.json"
    else:
        folder_json = os.environ[
            'HOME'] + "/conf_python/unique_id_conditions.json"

    with open(folder_json, 'r') as JSON:
        conditions = json.load(JSON)

    condition = conditions["exclusion_cartes"]["request"]
    condition_on_email = conditions["condition_email"]["dataframe"]

    # generate new ids and handle customer information that changed
    # Step 0:
    # - extract new cards
    # - extract cards already associated to a user id but whose information changed
    engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
    query = """
    select T1."CardHolderID",
    T1."NoMobile",
    lower(T1."Email") as "Email",
    T1."FirstName",
    T1."LastName",
    T1."BirthDate",
    T1."PostCode",
    T1."Address1",
    T1."Address2",
    T1."ActivationDate"
    from "CARD_STATUS"."STATUS_CARTES" as T1
    left join "{}"."MASTER_ID" as T2
    on T1."CardHolderID" = T2."CardHolderID"
    where (T1."NoMobile" is not null)
    and (T1."Email" !~* '.*creacard.*|.*prepaidfinancial.*|.*financial.*')
    and T2."USER_ID" is null
    UNION ALL
    select T1."CardHolderID",
    T1."NoMobile",
    lower(T1."Email") as "Email",
    T1."FirstName",
    T1."LastName",
    T1."BirthDate",
    T1."PostCode",
    T1."Address1",
    T1."Address2",
    T1."ActivationDate"
    from "CARD_STATUS"."STATUS_CARTES" as T1
    Join(
        select "CardHolderID"
        from "CARD_STATUS"."CHANGE_CUSTOMERS_CARTES"
        where "dt_change" >= date(now() - INTERVAL '1 DAY')::timestamp
        and ("Is_ch_BirthDate" = 1 or "Is_ch_Email" = 1
             or "Is_ch_LastName" = 1 or "Is_ch_NoMobile" = 1)
        and "NoMobile" is not null
        and "Email" !~* '.*creacard.*|.*prepaidfinancial.*|.*financial.*'
    ) as T2
    on T1."CardHolderID" = T2."CardHolderID"
    """.format(schema_main)
    data = pd.read_sql(query, con=engine)
    engine.close()

    # associate new ids to the identified cards
    if not data.empty:
        for var in [
                "FirstName", "LastName", "Address1", "Address2", "PostCode",
                "Email"
        ]:
            data[var] = data[var].str.encode('utf-8').astype(str)
            data.loc[data[var].isnull(), var] = ""
            data[var] = data[var].str.strip(" ")
            data[var] = data[var].str.replace(" ", "")
            data[var] = data[var].str.lower()

        data = data[~data["Email"].str.contains(
            '.*creacard.*|.*prepaidfinancial.*|.*financial.*', regex=True)]

        data["GoodEmail"] = 1
        data.loc[data["Email"].str.contains(condition_on_email, regex=True),
                 "GoodEmail"] = 0

        data["GoodCombinaison"] = 1
        data.loc[(data["LastName"].str.contains(
            conditions["condition_combinaison"]["LastName"], regex=True))
                 | (data["BirthDate"].isnull())
                 | (data["BirthDate"].isin(conditions["condition_combinaison"]
                                           ["BirthDate"].split(","))),
                 "GoodCombinaison"] = 0

        # delete a leading "00" at the start of the number
        data["NoMobile"] = data["NoMobile"].str.replace("^00", "", regex=True)
        # remove a trailing ".0"
        data["NoMobile"] = data["NoMobile"].str.replace(r"\.0$", "", regex=True)
        # delete literal '|' characters
        data["NoMobile"] = data["NoMobile"].str.replace(r"\|", "", regex=True)

        query = """
        DROP TABLE IF EXISTS "{}"."TMP_USER_ID" CASCADE
        """.format(schema_main)
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        CREATE TABLE "{}"."TMP_USER_ID"(
            "CardHolderID" VARCHAR(50),
            "NoMobile" TEXT,
            "Email" TEXT,
            "FirstName" TEXT,
            "LastName" TEXT,
            "BirthDate" TEXT,
            "PostCode" TEXT,
            "Address1" TEXT,
            "Address2" TEXT,
            "ActivationDate" timestamp without time zone,
            "GoodEmail" INTEGER,
            "GoodCombinaison" INTEGER,
            "MOBILE_ID" INTEGER,
            "USER_ID" BIGINT,
            "CONTACT_ID" VARCHAR(50),
            "PERSON_ID" TEXT,
            "MOVIDON_ID" BIGINT
        )
        """.format(schema_main)
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        data = data[~data["NoMobile"].isnull()]
        data["MOBILE_ID"] = None
        data["USER_ID"] = None
        data["CONTACT_ID"] = None
        data["PERSON_ID"] = None
        data["MOVIDON_ID"] = None

        InsertTableIntoDatabase(data,
                                TlbName="TMP_USER_ID",
                                Schema=schema_main,
                                database_name=connexion_postgres,
                                database_type="Postgres",
                                DropTable=False,
                                InsertInParrell=False)

        # STEP 1: handle new mobile ids
        query = """
        update "{}"."TMP_USER_ID"
        set "MOBILE_ID" = T1."MOBILE_ID"
        from "{}"."ID_MOBILE" as T1
        where "{}"."TMP_USER_ID"."NoMobile" = T1."NoMobile"
        """.format(schema_main, schema_main, schema_main)
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)

        query = """
        select "NoMobile", count(*) as "NUM_CARTES"
        from "CUSTOMERS"."TMP_USER_ID"
        where "MOBILE_ID" is null
        group by "NoMobile"
        """
        data = pd.read_sql(query, con=engine)
        engine.close()

        if not data.empty:
            # the ID column was created with serial, so there is no need to
            # set it from python: postgres automatically assigns an
            # incremental MOBILE_ID to each newly inserted row
            InsertTableIntoDatabase(data,
                                    TlbName="ID_MOBILE",
                                    Schema='CUSTOMERS',
                                    database_name=connexion_postgres,
                                    database_type="Postgres",
                                    DropTable=False,
                                    InsertInParrell=False)
            query = """
            update "CUSTOMERS"."TMP_USER_ID"
            set "MOBILE_ID" = T1."MOBILE_ID"
            from "CUSTOMERS"."ID_MOBILE" as T1
            where "CUSTOMERS"."TMP_USER_ID"."NoMobile" = T1."NoMobile"
            and "CUSTOMERS"."TMP_USER_ID"."MOBILE_ID" is null
            """
            engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
            engine.execute(query)
            engine.close()

        # STEP 2: handle new CONTACT_ID
        query = """
        update "CUSTOMERS"."TMP_USER_ID"
        set "CONTACT_ID" = "CardHolderID"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        # STEP 3: handle new PERSON_ID
        query = """
        update "CUSTOMERS"."TMP_USER_ID"
        set "PERSON_ID" = T1."PERSON_ID"
        from "CUSTOMERS"."ID_PERSON" as T1
        where concat("CUSTOMERS"."TMP_USER_ID"."BirthDate",
                     "CUSTOMERS"."TMP_USER_ID"."LastName") = T1."combinaison"
        and "CUSTOMERS"."TMP_USER_ID"."GoodCombinaison" = 1
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)

        query = """
        select distinct concat("BirthDate", "LastName") as "combinaison"
        from "CUSTOMERS"."TMP_USER_ID"
        where "PERSON_ID" is null
        and "GoodCombinaison" = 1
        """
        data = pd.read_sql(query, con=engine)
        engine.close()

        if not data.empty:
            # again, the serial column assigns the new PERSON_ID automatically
            # and incrementally on insert
            InsertTableIntoDatabase(data,
                                    TlbName="ID_PERSON",
                                    Schema='CUSTOMERS',
                                    database_name=connexion_postgres,
                                    database_type="Postgres",
                                    DropTable=False,
                                    InsertInParrell=False)
            query = """
            update "CUSTOMERS"."TMP_USER_ID"
            set "PERSON_ID" = T1."PERSON_ID"
            from "CUSTOMERS"."ID_PERSON" as T1
            where concat("CUSTOMERS"."TMP_USER_ID"."BirthDate",
                         "CUSTOMERS"."TMP_USER_ID"."LastName") = T1."combinaison"
            and "CUSTOMERS"."TMP_USER_ID"."GoodCombinaison" = 1
            and "CUSTOMERS"."TMP_USER_ID"."PERSON_ID" is null
            """
            engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
            engine.execute(query)
            engine.close()

        # STEP 4.1: make sure that we keep the max user id
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        query = """
        select max("USER_ID") as "user_id_max" from "CUSTOMERS"."MASTER_ID"
        """
        id_max = pd.read_sql(query, con=engine)
        id_max = int(id_max.loc[0, "user_id_max"])
        engine.close()

        # STEP 4.2: identify cards that can be associated to a known USER_ID
        query = """
        UPDATE "CUSTOMERS"."TMP_USER_ID"
        set "USER_ID" = T1."USER_ID"
        from (select "CardHolderID", "USER_ID"::integer
              from "CUSTOMERS"."MASTER_ID") as T1
        where "CUSTOMERS"."TMP_USER_ID"."CardHolderID" = T1."CardHolderID"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        update "CUSTOMERS"."TMP_USER_ID"
        set "USER_ID" = T1."USER_ID"
        from "CUSTOMERS"."ID_USER" as T1
        where "CUSTOMERS"."TMP_USER_ID"."NoMobile" = T1."NoMobile"
        and "CUSTOMERS"."TMP_USER_ID"."USER_ID" is null
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        update "CUSTOMERS"."TMP_USER_ID"
        set "USER_ID" = T1."USER_ID"
        from "CUSTOMERS"."ID_USER" as T1
        where "CUSTOMERS"."TMP_USER_ID"."GoodEmail" = 1
        and "CUSTOMERS"."TMP_USER_ID"."Email" = T1."Email"
        and "CUSTOMERS"."TMP_USER_ID"."USER_ID" is null
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        update "CUSTOMERS"."TMP_USER_ID"
        set "USER_ID" = T1."USER_ID"
        from "CUSTOMERS"."ID_USER" as T1
        where concat("CUSTOMERS"."TMP_USER_ID"."BirthDate",
                     "CUSTOMERS"."TMP_USER_ID"."LastName") = T1."combinaison"
        and "CUSTOMERS"."TMP_USER_ID"."GoodCombinaison" = 1
        and "CUSTOMERS"."TMP_USER_ID"."USER_ID" is null
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)

        # STEP 4.4: extract all cards (new cards and cards already associated
        # to a USER_ID) and re-associate the USER_ID with the sorting
        # algorithm, so the USER_ID always stays homogeneous and unique
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        query = """
        select * from "CUSTOMERS"."TMP_USER_ID"
        """
        data = pd.read_sql(query, con=engine)
        data["combinaison"] = data["BirthDate"] + data["LastName"]
        data = data[[
            "NoMobile", "Email", "combinaison", "GoodEmail", "GoodCombinaison",
            "USER_ID"
        ]]

        query = """
        select "NoMobile", "Email", "combinaison", "GoodEmail",
               "GoodCombinaison", "USER_ID"
        from "CUSTOMERS"."ID_USER"
        """
        data_bis = pd.read_sql(query, con=engine)
        data_bis["USER_ID"] = data_bis["USER_ID"].astype(float)

        data = pd.concat([data, data_bis], axis=0)
        user_id = data[~data.duplicated(keep='first')]

        # iterate until each mobile number, email and combination maps to a
        # single USER_ID (always keeping the minimum id)
        tic = time.time()
        converged = False
        while not converged:
            tmp_user_id = user_id.groupby(
                "NoMobile")["USER_ID"].min().reset_index()
            tmp_user_id.columns = ["NoMobile", "TMP_USER_ID"]
            user_id = pd.merge(user_id, tmp_user_id, on="NoMobile", how="inner")
            user_id["USER_ID"] = user_id["TMP_USER_ID"]
            user_id = user_id.drop(columns='TMP_USER_ID', axis=1)

            tmp_user_id = user_id[user_id["GoodEmail"] == 1].groupby(
                "Email")["USER_ID"].min().reset_index()
            tmp_user_id.columns = ["Email", "TMP_USER_ID"]
            user_id = pd.merge(user_id, tmp_user_id, on="Email", how="left")
            user_id.loc[~user_id["TMP_USER_ID"].isnull(),
                        "USER_ID"] = user_id["TMP_USER_ID"]
            user_id = user_id.drop(columns='TMP_USER_ID', axis=1)

            tmp_user_id = user_id[user_id["GoodCombinaison"] == 1].groupby(
                "combinaison")["USER_ID"].min().reset_index()
            tmp_user_id.columns = ["combinaison", "TMP_USER_ID"]
            user_id = pd.merge(user_id, tmp_user_id, on="combinaison", how="left")
            user_id.loc[~user_id["TMP_USER_ID"].isnull(),
                        "USER_ID"] = user_id["TMP_USER_ID"]
            user_id = user_id.drop(columns='TMP_USER_ID', axis=1)

            non_unique_num = user_id.groupby(
                "NoMobile")["USER_ID"].nunique().sort_values().reset_index()
            non_unique_num = non_unique_num.loc[non_unique_num["USER_ID"] > 1,
                                                "NoMobile"]
            non_unique_email = user_id[user_id["GoodEmail"] == 1].groupby(
                "Email")["USER_ID"].nunique().sort_values().reset_index()
            non_unique_email = non_unique_email.loc[
                non_unique_email["USER_ID"] > 1, "Email"]
            non_unique_combi = user_id[user_id["GoodCombinaison"] == 1].groupby(
                "combinaison")["USER_ID"].nunique().sort_values().reset_index()
            non_unique_combi = non_unique_combi.loc[
                non_unique_combi["USER_ID"] > 1, "combinaison"]

            converged = not ((len(non_unique_num) > 0)
                             or (len(non_unique_email) > 0)
                             or (len(non_unique_combi) > 0))
        toc = time.time() - tic

        tmp_use_id = user_id[~user_id["USER_ID"].isnull()]

        # STEP 4.5: associate a new user_id to cards that do not have one yet
        user_id = user_id[user_id["USER_ID"].isnull()]
        user_id = compute_user_id(user_id, last_user_id=id_max)

        # STEP 4.6: replace the whole ID_USER table by the new ids
        user_id = pd.concat([user_id, tmp_use_id], axis=0)
        user_id = user_id.reset_index(drop=True)
        query = """
        delete from "CUSTOMERS"."ID_USER"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        InsertTableIntoDatabase(user_id,
                                TlbName="ID_USER",
                                Schema='CUSTOMERS',
                                database_name=connexion_postgres,
                                database_type="Postgres",
                                DropTable=False,
                                InsertInParrell=False)

        # STEP 5: keep track of cards whose user id changed
        # STEP 5.1: snapshot the current MASTER_ID associations
        query = """
        CREATE TABLE "CUSTOMERS"."TMP_MASTER_ID" as
        SELECT "CardHolderID", "USER_ID" from "CUSTOMERS"."MASTER_ID"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        # STEP 5.1.1: delete the already identified cards from MASTER_ID
        query = """
        delete from "CUSTOMERS"."MASTER_ID"
        where "CardHolderID" in (select T1."CardHolderID"
                                 from "CUSTOMERS"."TMP_USER_ID" as T1
                                 inner join "CUSTOMERS"."MASTER_ID" as T2
                                 ON T1."CardHolderID" = T2."CardHolderID")
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        # STEP 5.2: add the new cards to MASTER_ID
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        query = """
        select * from "CUSTOMERS"."TMP_USER_ID"
        """
        data = pd.read_sql(query, con=engine)
        engine.close()
        data["PERSON_ID"] = data["PERSON_ID"].astype(str)
        data["PERSON_ID"] = data["PERSON_ID"].str.replace(r"\.0$", "", regex=True)
        InsertTableIntoDatabase(data,
                                TlbName="MASTER_ID",
                                Schema='CUSTOMERS',
                                database_name=connexion_postgres,
                                database_type="Postgres",
                                DropTable=False,
                                InsertInParrell=False)

        # STEP 5.3: update the new user ids
        query = """
        UPDATE "CUSTOMERS"."MASTER_ID" SET "USER_ID" = NULL
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        update "CUSTOMERS"."MASTER_ID"
        set "USER_ID" = T1."USER_ID"
        from(
            select distinct "NoMobile", "USER_ID"
            from "CUSTOMERS"."ID_USER") as T1
        where "CUSTOMERS"."MASTER_ID"."NoMobile" = T1."NoMobile"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        # STEP 5.4: store the user ids that changed
        query = """
        select T3.* from(
            select T1.*, T2."USER_ID" as "oth_user_id"
            from "CUSTOMERS"."MASTER_ID" as T1
            INNER JOIN "CUSTOMERS"."TMP_MASTER_ID" as T2
            on T1."CardHolderID" = T2."CardHolderID"
        ) as T3
        where T3."USER_ID" <> T3."oth_user_id"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        data = pd.read_sql(query, con=engine)
        engine.close()
        data["USER_ID"] = data["oth_user_id"]
        data = data.drop(columns=["oth_user_id"], axis=1)
        data["dt_change"] = datetime.datetime.now() - datetime.timedelta(days=1)
        data["PERSON_ID"] = data["PERSON_ID"].astype(str)
        data["PERSON_ID"] = data["PERSON_ID"].str.replace(r"\.0$", "", regex=True)

        # insert these cards into the table that tracks id changes over time
        InsertTableIntoDatabase(data,
                                TlbName="CHANGE_IDS",
                                Schema='CUSTOMERS',
                                database_name=connexion_postgres,
                                database_type="Postgres",
                                DropTable=False,
                                InsertInParrell=False)

        query = """
        drop table "CUSTOMERS"."TMP_MASTER_ID" cascade
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        # STEP 5.5: make sure that all changes are taken into account
        query = """
        drop table if exists "CUSTOMERS"."TMP_CHANGES_IDS" cascade;
        create table "CUSTOMERS"."TMP_CHANGES_IDS" as
        select T2.*, now() as "dt_change"
        from(
            select distinct T1."USER_ID" as "user_id_change",
                   T2."USER_ID" as "user_id_current"
            from "CUSTOMERS"."CHANGE_IDS" as T1
            inner join "CUSTOMERS"."MASTER_ID" as T2
            on T1."CardHolderID" = T2."CardHolderID"
            where T1."dt_change" >= date(now()-interval '1 days')
        ) as T3
        inner join "CUSTOMERS"."MASTER_ID" as T2
        on T2."USER_ID" = T3."user_id_change";
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        update "CUSTOMERS"."ID_USER"
        set "USER_ID" = T3."user_id_current"
        from(
            select distinct T1."USER_ID" as "user_id_change",
                   T2."USER_ID" as "user_id_current"
            from "CUSTOMERS"."CHANGE_IDS" as T1
            inner join "CUSTOMERS"."MASTER_ID" as T2
            on T1."CardHolderID" = T2."CardHolderID"
            where T1."dt_change" >= date(now()-interval '1 days')
        ) as T3
        where "CUSTOMERS"."ID_USER"."USER_ID" = T3."user_id_change"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        insert into "CUSTOMERS"."CHANGE_IDS"
        select * from "CUSTOMERS"."TMP_CHANGES_IDS"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        drop table if exists "CUSTOMERS"."TMP_CHANGES_IDS" cascade
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        UPDATE "CUSTOMERS"."MASTER_ID" SET "USER_ID" = NULL
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        query = """
        update "CUSTOMERS"."MASTER_ID"
        set "USER_ID" = T1."USER_ID"
        from(
            select distinct "NoMobile", "USER_ID"
            from "CUSTOMERS"."ID_USER") as T1
        where "CUSTOMERS"."MASTER_ID"."NoMobile" = T1."NoMobile"
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()

        # STEP 5.6: associate a person id to the cards that do not have one
        query = """
        update "CUSTOMERS"."MASTER_ID"
        set "PERSON_ID" = concat("USER_ID",'_',"MOBILE_ID")
        where "GoodCombinaison" = 0
        """
        engine = connect_to_database("Postgres", connexion_postgres).CreateEngine()
        engine.execute(query)
        engine.close()
def CsvToDataBase(FilePath, TlbName, Schema, ingestion_params, **kwargs):
    """Insert a csv into a database table using pandas

    Parameters
    -----------
    FilePath : str
        path of the .csv file to insert
    TlbName : str
        name of the targeted table in the database
    Schema: str
        schema where the table is stored in the database
    ingestion_params: dict
        standardized dictionary of ingestion parameters (see FromCsvToDataBase)
    PreprocessingCsv: dict -- optional
        dictionary with a function that transforms a pandas DataFrame and a
        set of optional arguments for that function:
        - 'function' = function object
        - 'KeyWords' = dict -- optional args for the function
    use_engine : sqlalchemy create_engine object -- optional
        engine object & connection to the database from sqlalchemy; created
        from database_type/database_name when not provided
    """
    InsertInTheSameTable = kwargs.get('InsertInTheSameTable', None)
    PreprocessingCsv = kwargs.get('PreprocessingCsv', None)
    _use_credentials = kwargs.get('_use_credentials', None)
    _use_conf = kwargs.get('_use_conf', None)
    engine = kwargs.get('use_engine', None)
    logger = kwargs.get('logger', None)
    SizeChunck = kwargs.get('SizeChunck', None)
    database_type = kwargs.get('database_type', None)
    database_name = kwargs.get('database_name', None)

    _was_engine = True
    _num_lines_csv = None
    if engine is None:
        engine = connect_to_database(database_type,
                                     database_name,
                                     _use_credentials=_use_credentials,
                                     _use_conf=_use_conf).CreateEngine()
        _was_engine = False

    try:
        if InsertInTheSameTable:
            if PreprocessingCsv is not None:
                F = PreprocessingCsv['function']
                # read the data through the configured protocol
                Data = read_csv_protocole(ingestion_params["protocole_type"],
                                          ingestion_params["protocole_name"],
                                          FilePath,
                                          ingestion_params["csv_params"],
                                          copy_to_local=ingestion_params["copy_to_filesystem"])
                if PreprocessingCsv['KeyWords'] is not None:
                    Data = F(Data, FilePath, PreprocessingCsv['KeyWords'])
                else:
                    Data = F(Data, FilePath)
                _num_lines_csv = Data.shape[0]
                InsertToPostgre(Data, TlbName=TlbName, engine=engine,
                                schema=Schema, DropTable=False,
                                SizeChunck=SizeChunck)
                print("file {} was successfully inserted".format(FilePath))
            else:
                # read the data through the configured protocol
                Data = read_csv_protocole(ingestion_params["protocole_type"],
                                          ingestion_params["protocole_name"],
                                          FilePath,
                                          ingestion_params["csv_params"],
                                          copy_to_local=ingestion_params["copy_to_filesystem"])
                _num_lines_csv = Data.shape[0]
                InsertToPostgre(Data, TlbName=TlbName, engine=engine,
                                schema=Schema, DropTable=False,
                                SizeChunck=SizeChunck)
                print("file {} was successfully inserted".format(FilePath))
        else:
            if TlbName is None:
                TlbName = FilePath.split('/')[-1].replace(".csv", "")
            if PreprocessingCsv is not None:
                F = PreprocessingCsv['function']
                # read the data through the configured protocol
                Data = read_csv_protocole(ingestion_params["protocole_type"],
                                          ingestion_params["protocole_name"],
                                          FilePath,
                                          ingestion_params["csv_params"],
                                          copy_to_local=ingestion_params["copy_to_filesystem"])
                if PreprocessingCsv['KeyWords'] is not None:
                    Data = F(Data, FilePath, PreprocessingCsv['KeyWords'])
                else:
                    Data = F(Data, FilePath)
                _num_lines_csv = Data.shape[0]
                InsertToPostgre(Data, TlbName=TlbName, engine=engine,
                                schema=Schema, DropTable=True,
                                SizeChunck=SizeChunck)
                print("file {} was successfully inserted".format(FilePath))
            else:
                Data = pd.read_csv(FilePath)
                _num_lines_csv = Data.shape[0]
                InsertToPostgre(Data, TlbName=TlbName, engine=engine,
                                schema=Schema, DropTable=True,
                                SizeChunck=SizeChunck)
                print("file {} was successfully inserted".format(FilePath))
    except Exception as e:
        if logger is not None:
            logger.error(e, exc_info=True)
        else:
            print(e)

    if not _was_engine:
        engine.close()

    _outputs = [FilePath.split("/")[-1], _num_lines_csv]
    return _outputs
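# CsvToDataBase accepts the preprocessing hook as a dict with a 'function'
# and optional 'KeyWords' keys. A hedged sketch of the expected shape; the
# drop_empty_rows helper and the paths are made up for illustration, and
# params_ingestion follows the shape documented in FromCsvToDataBase below:
#
#   def drop_empty_rows(Data, FilePath, KeyWords=None):
#       return Data.dropna(how='all')
#
#   CsvToDataBase("/data/in/file.csv", "MY_TABLE", "TMP_UPDATE",
#                 ingestion_params=params_ingestion,
#                 database_type="Postgres", database_name="Creacard_Calypso",
#                 PreprocessingCsv={'function': drop_empty_rows, 'KeyWords': None})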
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import dash_table

from creacard_connectors.database_connector import connect_to_database
import pandas as pd

# extract the available schemas, excluding the system ones
engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
query = """
select "schema_name" from information_schema.schemata
where "schema_name" !~* '^pg_'
and "schema_name" not in ('information_schema')
"""
data = pd.read_sql(query, con=engine)
list_schema = data["schema_name"].tolist()
del data
engine.close()

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div([
def daily_card_status2(Data, filepath, **kwargs):
    logger = kwargs.get('logger', None)

    # table parameters for the temporary table
    TableParameter = {}
    TableParameter["ActivationDate"] = "timestamp without time zone"
    TableParameter["Address1"] = "TEXT"
    TableParameter["Address2"] = "TEXT"
    TableParameter["ApplicationName"] = "VARCHAR (50)"
    TableParameter["AvailableBalance"] = "double precision"
    TableParameter["BirthDate"] = "timestamp without time zone"
    TableParameter["CardHolderID"] = "VARCHAR (50)"
    TableParameter["CardStatus"] = "VARCHAR (100)"
    TableParameter["City"] = "VARCHAR (100)"
    TableParameter["Country"] = "VARCHAR (50)"
    TableParameter["CreationDate"] = "timestamp without time zone"
    TableParameter["DistributorCode"] = "INTEGER"
    TableParameter["Email"] = "TEXT"
    TableParameter["ExpirationDate"] = "timestamp without time zone"
    TableParameter["FirstName"] = "TEXT"
    TableParameter["IBAN"] = "TEXT"
    TableParameter["IsExcludedAddress"] = "INTEGER"
    TableParameter["IsRenewal"] = "INTEGER"
    TableParameter["KYC_Status"] = "VARCHAR (50)"
    TableParameter["LastName"] = "TEXT"
    TableParameter["LastChangeDate"] = "timestamp without time zone"
    TableParameter["LastAddressDate"] = "timestamp without time zone"
    TableParameter["LastCustomerDate"] = "timestamp without time zone"
    TableParameter["NoMobile"] = "TEXT"
    TableParameter["PostCode"] = "VARCHAR (50)"
    TableParameter["Programme"] = "VARCHAR (50)"
    TableParameter["RenewalDate"] = "timestamp without time zone"
    TableParameter["UpdateDate"] = "timestamp without time zone"

    keepcol = [
        "CardHolderID", "Email", "FirstName", "LastName", "City", "Country",
        "Card Status", "DistributorCode", "ApplicationName", "Date of Birth",
        "IBAN", "CreatedDate", "UpdatedDate", "Address1", "Address2",
        "PostCode", "KYC Status", "expirydate", "AvailableBalance",
        "NoMobile", "Programme"
    ]

    #### Step 1: extract the data from the file and keep only updated rows
    # extract the file date
    FileName = filepath.split('/')[-1].replace(".csv", "")
    DateFile = pd.to_datetime(
        FileName.split("-")[1] + "-" + FileName.split("-")[2] + "-" +
        FileName.split("-")[3])

    # based on the file date, pick the appropriate column names
    if DateFile > pd.to_datetime('2019-03-12'):
        col_names = [
            "CardHolderID", "Cardnumber", "Email", "FirstName", "LastName",
            "City", "Country", "Card Status", "DistributorCode",
            "ApplicationName", "Date of Birth", "SortCodeAccNum", "IBAN",
            "CreatedDate", "UpdatedDate", "Address1", "Address2", "PostCode",
            "KYC Status", "expirydate", "AvailableBalance", "UDF2",
            "NoMobile", "Programme", "VPVR"
        ]
    elif DateFile < pd.to_datetime('2019-01-16'):
        col_names = [
            "CardHolderID", "Cardnumber", "Email", "FirstName", "LastName",
            "City", "Country", "Card Status", "DistributorCode",
            "ApplicationName", "Date of Birth", "SortCodeAccNum", "IBAN",
            "CreatedDate", "UpdatedDate", "Address1", "Address2", "PostCode",
            "KYC Status", "expirydate"
        ]
    else:
        col_names = [
            "CardHolderID", "Cardnumber", "Email", "FirstName", "LastName",
            "City", "Country", "Card Status", "DistributorCode",
            "ApplicationName", "Date of Birth", "SortCodeAccNum", "IBAN",
            "CreatedDate", "UpdatedDate", "Address1", "Address2", "PostCode",
            "KYC Status", "expirydate", "AvailableBalance", "UDF2",
            "NoMobile", "UDF3", "VPVR"
        ]

    # apply the column names to the dataframe
    Data.columns = col_names

    # store the missing columns
    missing_columns = list(set(keepcol).difference(col_names))
    if missing_columns:
        # if the list is not empty, add the missing columns to the dataframe
        for col in missing_columns:
            Data[col] = None

    # store the current values, to detect changes later on
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    query = """
    select distinct "CardHolderID","CardStatus","KYC_Status"
    from "CARD_STATUS"."STATUS_CARTES"
    """
    data_current = pd.read_sql(query, con=engine)
    data_current["CardHolderID"] = data_current["CardHolderID"].astype(str)
    data_current["KYC_Status"] = data_current["KYC_Status"].astype(str)
    data_current["CardStatus"] = data_current["CardStatus"].astype(str)

    #### Step 2: transform the data
    # convert the date columns to pd.datetime so the date format is
    # consistent across the database
    Data["UpdatedDate"] = pd.to_datetime(Data["UpdatedDate"],
                                         format="%b %d %Y %I:%M%p",
                                         errors='coerce')
    Data["CreatedDate"] = pd.to_datetime(Data["CreatedDate"],
                                         format="%b %d %Y %I:%M%p",
                                         errors='coerce')
    Data["Date of Birth"] = pd.to_datetime(Data["Date of Birth"],
                                           format="%b %d %Y %I:%M%p",
                                           errors='coerce')

    # transform expirydate (YYMM becomes the first day of that month)
    Data["expirydate"] = Data["expirydate"].astype(str)
    Data["expirydate"] = "20" + Data["expirydate"].str[0:2] + "-" + Data[
        "expirydate"].str[2:] + "-01"
    Data["expirydate"] = pd.to_datetime(Data["expirydate"],
                                        format='%Y-%m-%d',
                                        errors='coerce')

    Data = Data[keepcol]

    # addresses to flag as excluded
    AddressToRemove = [
        "77 OXFORD STREET LONDON", "17 RUE D ORLEANS", "TSA 51760",
        "77 Oxford Street London", "36 CARNABY STREET",
        "36 CARNABY STREET LONDON", "36 CARNABY STREET LONDON", "ADDRESS",
        "17 RUE D ORLEANS PARIS", "CreaCard Espana S L Paseo de Gracia 59",
        "36 Carnaby Street London",
        "CREACARD SA Pl Marcel Broodthaers 8 Box 5",
        "17 Rue D Orleans Paris", "CREACARD ESPANA S L PASEO DE GRACIA 59",
        "CreaCard 17 rue d Orleans",
        "CREACARD SA PL MARCEL BROODTHAERS 8 BOX 75",
        "CREACARD SA PL MARCEL BROODTHAERS 8 BOX 75", "36 Carnaby Street",
        "77 OXFORD STREET"
    ]
    Data["IsExcludedAddress"] = (
        Data.Address1.isin(AddressToRemove)).astype(int)

    Data["ActivationDate"] = pd.NaT
    Data["IsRenewal"] = 0
    Data["RenewalDate"] = pd.NaT
    Data["LastChangeDate"] = pd.NaT
    Data["LastAddressDate"] = pd.NaT
    Data["LastCustomerDate"] = pd.NaT
    Data = Data[sorted(Data.columns)]

    colnames = [
        "ActivationDate", "Address1", "Address2", "ApplicationName",
        "AvailableBalance", "CardStatus", "CardHolderID", "City", "Country",
        "CreationDate", "BirthDate", "DistributorCode", "Email", "FirstName",
        "IBAN", "IsExcludedAddress", "IsRenewal", "KYC_Status",
        "LastAddressDate", "LastChangeDate", "LastCustomerDate", "LastName",
        "NoMobile", "PostCode", "Programme", "RenewalDate", "UpdateDate",
        "ExpirationDate"
    ]
    Data.columns = colnames
    Data = Data[sorted(Data.columns)]

    # map the numeric KYC codes to their labels
    Data.loc[(Data["KYC_Status"] == '0') | (Data["KYC_Status"] == '0.0') |
             (Data["KYC_Status"] == 0), "KYC_Status"] = 'Anonyme'
    Data.loc[(Data["KYC_Status"] == '1') | (Data["KYC_Status"] == '1.0') |
             (Data["KYC_Status"] == 1), "KYC_Status"] = 'SDD'
    Data.loc[(Data["KYC_Status"] == '2') | (Data["KYC_Status"] == '2.0') |
             (Data["KYC_Status"] == 2), "KYC_Status"] = 'KYC'
    Data.loc[(Data["KYC_Status"] == '3') | (Data["KYC_Status"] == '3.0') |
             (Data["KYC_Status"] == 3), "KYC_Status"] = 'KYC LITE'

    Data["DistributorCode"] = Data["DistributorCode"].fillna(-1)
    Data["DistributorCode"] = Data["DistributorCode"].astype(int)
    Data["CardHolderID"] = Data["CardHolderID"].astype(str)
    Data["KYC_Status"] = Data["KYC_Status"].astype(str)
    Data["CardStatus"] = Data["CardStatus"].astype(str)
    # DistributorCode is an int at this point, so compare against ints
    Data.loc[Data["DistributorCode"].isin([203, 914, 915]), "IsRenewal"] = 1
    Data = Data[sorted(Data.columns)]

    # delete a leading "00" at the start of the number
    Data["NoMobile"] = Data["NoMobile"].str.replace("^00", "", regex=True)
    # remove a trailing ".0"
    Data["NoMobile"] = Data["NoMobile"].str.replace(r"\.0$", "", regex=True)
    # delete literal '|' characters
    Data["NoMobile"] = Data["NoMobile"].str.replace(r"\|", "", regex=True)

    # Step 1: identify the rows whose status changed
    data_new = Data[["CardHolderID", "CardStatus", "KYC_Status"]]
    outer_join = data_current.merge(data_new, how='outer', indicator=True)
    outer_join = outer_join[outer_join["_merge"] == "right_only"]

    # Step 2: identify the new cardholder ids
    new_card_holder_id = set(outer_join["CardHolderID"].unique()).difference(
        data_current["CardHolderID"].unique())

    ### Step 3: insert the old values into the changes table
    data_to_change = data_current[data_current["CardHolderID"].isin(
        set(outer_join.loc[
            ~outer_join["CardHolderID"].isin(new_card_holder_id),
            "CardHolderID"]))]

    FileName = filepath.split('/')[-1].replace(".csv", "")
    DateFile = pd.to_datetime(
        FileName.split("-")[1] + "-" + FileName.split("-")[2] + "-" +
        FileName.split("-")[3]) - datetime.timedelta(days=1)
    data_to_change["dt_change"] = DateFile

    InsertTableIntoDatabase(data_to_change, "CHANGE_STATUS_CARTES",
                            "CARD_STATUS", "Postgres", "Creacard_Calypso",
                            DropTable=False)

    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    TlbName = "STATUS_CARTES"
    schema = "CARD_STATUS"
    database_type = "Postgres"
    database_name = "Creacard_Calypso"

    query_delete = """
    delete from "CARD_STATUS"."STATUS_CARTES"
    """
    tic = time.time()
    engine.execute(query_delete)
    print("delete took {} seconds".format(time.time() - tic))
    engine.close()

    data = splitDataFrameIntoSmaller(Data, chunkSize=100000)
    # keep at least one worker on small machines
    num_process = max(1, int(multiprocessing.cpu_count() / 4))
    tic = time.time()
    pool = Pool(num_process)
    pool.map(
        partial(insert_into_postgres_copyfrom,
                database_type=database_type,
                database_name=database_name,
                schema=schema,
                TlbName=TlbName), data)
    pool.close()
    pool.join()
    toc = time.time() - tic
    print("ingestion was done in {} seconds".format(toc))

    ### update the LastChangeDate column (KYC & card status)
    con_postgres = connect_to_database(database_type,
                                       database_name).CreateEngine()
    query = """
    UPDATE "CARD_STATUS"."STATUS_CARTES"
    SET "LastChangeDate" = T1."max_date"
    FROM(
        SELECT max("dt_change") as "max_date", "CardHolderID"
        FROM "CARD_STATUS"."CHANGE_STATUS_CARTES"
        GROUP BY "CardHolderID"
    ) as T1
    WHERE "CARD_STATUS"."STATUS_CARTES"."CardHolderID" = T1."CardHolderID"
    """
    con_postgres.execute(query)
    con_postgres.close()

    con_postgres = connect_to_database(database_type,
                                       database_name).CreateEngine()
    query = """
    update "CARD_STATUS"."STATUS_CARTES" as T1
    SET "ActivationDate" = "ActivationTime"
    FROM "CARD_STATUS"."ACTIVATION_REPORT" as T2
    WHERE T1."CardHolderID" = T2."CardHolderID"
    and "ActivationDate" is null
    """
    con_postgres.execute(query)
    con_postgres.close()
def FromCsvToDataBase(ListOfPath, database_type, database_name, Schema,
                      ingestion_params, **kwargs):
    """Insert all the .csv files of a list into the database

    Required Parameters
    -----------
    ListOfPath: list of str
        paths of the .csv files to ingest
    database_type : str
        type of the database (in your configuration files)
    database_name: str
        name of the database configuration file (ex: Postgres_calypso)
    Schema: str
        name of the schema where the table must be written
    ingestion_params: dict
        standardized dictionary of ingestion parameters, ex:
            params_ingestion = dict()
            params_ingestion["protocole_type"] = "LOCAL" -- FTP or SFTP protocol type where the .csv files are located
            params_ingestion["protocole_name"] = "" -- protocol name (i.e. in the configuration files)
            params_ingestion["csv_params"] = csv_params -- csv reading parameters (ex: csv_params = {'sep': ","})
            params_ingestion["copy_to_filesystem"] = destination_copy (dict) -- dictionary with two fields, ex:
                destination_copy = dict()
                destination_copy["destination_folder"] = folder_2 -- path of the filesystem where the data must be duplicated
                destination_copy["csv_destination_params"] = {'sep': ",", 'index': False} -- writing parameters

    Optional Parameters (**kwargs)
    -----------
    TlbName : str
        name of the targeted table in the database
    InsertInParrell : Boolean -- default value False
        True if the insertion has to be done in parallel
    InsertInTheSameTable : Boolean -- default value False
        True if every dataframe has to be inserted into the same table at each loop
    PreprocessingCsv: dict -- optional
        dictionary with a function that transforms a pandas DataFrame and a
        set of optional arguments for that function:
        - 'function' = function object
        - 'KeyWords' = dict -- optional args for the function
    logger: logger object
        logger used to report errors from the running function
    TableDict: dict
        dictionary of the postgres types associated to the ingested variables
    """
    TlbName = kwargs.get('TlbName', None)
    InsertInParrell = kwargs.get('InsertInParrell', False)
    InsertInTheSameTable = kwargs.get('InsertInTheSameTable', False)
    PreprocessingCsv = kwargs.get('PreprocessingCsv', None)
    logger = kwargs.get('logger', None)
    TableDict = kwargs.get('TableDict', None)
    SizeChunck = kwargs.get('SizeChunck', 10000)
    NumWorkers = kwargs.get('NumWorkers', 3)
    _use_credentials = kwargs.get('_use_credentials', None)
    _use_conf = kwargs.get('_use_conf', None)

    engine = connect_to_database(database_type,
                                 database_name,
                                 _use_credentials=_use_credentials,
                                 _use_conf=_use_conf).CreateEngine()

    # test whether the targeted schema exists
    if isinstance(Schema, str):
        if not db.IsSchemaExist(engine, Schema):
            db.CreateSchema(engine, Schema)

    if InsertInTheSameTable:
        if TableDict is not None:
            try:
                # the table is created automatically if it does not exist yet
                if not db.table_exists(engine, TlbName, Schema):
                    db.CreateTable(engine, TlbName, Schema, TableDict)
            except Exception as e:
                if logger is not None:
                    logger.error(e, exc_info=True)
                else:
                    print(e)
        else:
            raise ValueError("You must specify a dictionary to insert into the same table")

        # the .csv files are ingested in parallel if the user asked for it
        if InsertInParrell:
            # each worker opens its own connection, so release this one
            engine.close()
            engine = None
            if NumWorkers is None:
                # store the number of available workers
                NbWorkers = multiprocessing.cpu_count() - 1
            else:
                NbWorkers = NumWorkers
            print(".csv simultaneous ingestion of {} files using {} Workers is launched".format(len(ListOfPath), NbWorkers))
            _lines_file = []
            tic = time.time()
            p = ThreadPool(NbWorkers)
            _lines_file.append(p.map(partial(db.CsvToDataBase,
                                             TlbName=TlbName,
                                             Schema=Schema,
                                             ingestion_params=ingestion_params,
                                             logger=logger,
                                             SizeChunck=SizeChunck,
                                             PreprocessingCsv=PreprocessingCsv,
                                             InsertInTheSameTable=True,
                                             database_type=database_type,
                                             database_name=database_name),
                                     ListOfPath))
            p.close()
            p.join()
            toc = time.time() - tic
            print(".csv files were successfully ingested in parallel into the table {} in {} seconds".format(TlbName, toc))
        else:
            _lines_file = []
            tic = time.time()
            for i in ListOfPath:
                _lines_file.append(db.CsvToDataBase(i,
                                                    TlbName=TlbName,
                                                    Schema=Schema,
                                                    SizeChunck=SizeChunck,
                                                    database_type=database_type,
                                                    database_name=database_name,
                                                    ingestion_params=ingestion_params,
                                                    PreprocessingCsv=PreprocessingCsv,
                                                    use_engine=engine,
                                                    InsertInTheSameTable=True))
                print("{} was successfully ingested".format(i))
            toc = time.time() - tic
            print(".csv files were successfully ingested into the table {} in {} seconds".format(TlbName, toc))
    else:
        # the .csv files are ingested in parallel if the user asked for it
        if InsertInParrell:
            # each worker opens its own connection, so release this one
            engine.close()
            engine = None
            if NumWorkers is None:
                # store the number of available workers
                NbWorkers = multiprocessing.cpu_count() - 1
            else:
                NbWorkers = NumWorkers
            print(".csv simultaneous ingestion of {} files using {} Workers is launched".format(len(ListOfPath), NbWorkers))
            _lines_file = []
            tic = time.time()
            p = ThreadPool(NbWorkers)
            _lines_file.append(p.map(partial(db.CsvToDataBase,
                                             TlbName=TlbName,
                                             Schema=Schema,
                                             logger=logger,
                                             SizeChunck=SizeChunck,
                                             ingestion_params=ingestion_params,
                                             PreprocessingCsv=PreprocessingCsv,
                                             InsertInTheSameTable=False,
                                             database_type=database_type,
                                             database_name=database_name,
                                             use_engine=None),
                                     ListOfPath))
            p.close()
            p.join()
            toc = time.time() - tic
            print(".csv files were successfully ingested in parallel into the table {} in {} seconds".format(TlbName, toc))
        else:
            _lines_file = []
            tic = time.time()
            for i in ListOfPath:
                _lines_file.append(db.CsvToDataBase(i,
                                                    TlbName=TlbName,
                                                    Schema=Schema,
                                                    SizeChunck=SizeChunck,
                                                    PreprocessingCsv=PreprocessingCsv,
                                                    ingestion_params=ingestion_params,
                                                    use_engine=engine,
                                                    database_type=database_type,
                                                    database_name=database_name,
                                                    InsertInTheSameTable=False))
                print("{} was successfully ingested".format(i))
            toc = time.time() - tic
            print(".csv files were successfully ingested into the table {} in {} seconds".format(TlbName, toc))

    if engine is not None:
        engine.close()
    return _lines_file
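# When several files feed one table in parallel, a TableDict is mandatory so
# the table can be created up front before the workers start. Hedged sketch;
# the column types and paths are illustrative assumptions:
#
#   tbl = {"CardHolderID": "VARCHAR (50)", "Amount": "double precision"}
#   FromCsvToDataBase(["/data/in/a.csv", "/data/in/b.csv"],
#                     "Postgres", "Creacard_Calypso", "TMP_UPDATE",
#                     params_ingestion, TlbName="DEMO_STACK",
#                     InsertInTheSameTable=True, InsertInParrell=True,
#                     TableDict=tbl, NumWorkers=2)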
def InsertTableIntoDatabase(Data, TlbName, Schema, database_type,
                            database_name, **kwargs):
    """Insert a pandas DataFrame into a database table

    Required Parameters
    -----------
    Data : pandas DataFrame
        data to ingest
    TlbName : str
        name of the targeted table in the database
    Schema: str
        schema where the table is stored in the database
    database_type : str
        type of the database (in your configuration files)
    database_name: str
        name of the database configuration file

    Optional Parameters (**kwargs)
    -----------
    logger: logger object
        logger used to report errors from the running function
    TableDict: dict
        dictionary of the postgres types associated to the ingested variables
    DropTable : Boolean -- default value False
        True if the table has to be dropped before ingestion
    InsertInParrell : Boolean -- default value False
        True if the insertion has to be done in parallel
    """
    InsertInParrell = kwargs.get('InsertInParrell', False)
    SizeParrell = kwargs.get('SizeParrell', 10000)
    logger = kwargs.get('logger', None)
    TableDict = kwargs.get('TableDict', None)
    DropTable = kwargs.get('DropTable', False)
    SizeChunck = kwargs.get('SizeChunck', 10000)
    NumWorkers = kwargs.get('NumWorkers', 3)
    _use_credentials = kwargs.get('_use_credentials', None)
    _use_conf = kwargs.get('_use_conf', None)
    engine = kwargs.get('use_engine', None)

    if engine is None:
        engine = connect_to_database(database_type,
                                     database_name,
                                     _use_credentials=_use_credentials,
                                     _use_conf=_use_conf).CreateEngine()

    # test whether the targeted schema exists
    if isinstance(Schema, str):
        if not db.IsSchemaExist(engine, Schema):
            db.CreateSchema(engine, Schema)
    else:
        raise ValueError("'Schema' must have a string format")

    # variable types in postgres; infer them from the frame when not provided
    if TableDict is None:
        TableDict = CreateDictionnaryType(Data)
    try:
        # check whether the table exists
        if not db.table_exists(engine, TlbName, Schema):
            db.CreateTable(engine, TlbName, Schema, TableDict)
        else:
            if DropTable:
                metadata = MetaData()
                TlbObject = Table(TlbName, metadata, schema=Schema)
                TlbObject.drop(engine)
                db.CreateTable(engine, TlbName, Schema, TableDict)
    except Exception as e:
        if logger is not None:
            logger.error(e, exc_info=True)
        else:
            print(e)

    # insert in parallel or not
    if InsertInParrell:
        del engine
        if NumWorkers is None:
            # store the number of available workers
            NbWorkers = multiprocessing.cpu_count() - 1
        else:
            NbWorkers = NumWorkers
        # split the data into chunks of SizeParrell rows
        DataSplitted = splitDataFrameIntoSmaller(Data, chunkSize=SizeParrell)
        print("Launch multi-insertion of samples of {} rows on {} Workers".format(SizeParrell, NbWorkers))
        tic = time.time()
        p = ThreadPool(NbWorkers)
        p.map(partial(db.InsertToPostgre,
                      engine=None,
                      TlbName=TlbName,
                      schema=Schema,
                      DropTable=False,
                      database_type=database_type,
                      database_name=database_name,
                      SizeChunck=SizeChunck,
                      _use_credentials=_use_credentials,
                      _use_conf=_use_conf), DataSplitted)
        p.close()
        p.join()
        toc = time.time() - tic
        print("The DataFrame was successfully ingested in parallel into the table {} in {} seconds".format(TlbName, toc))
    else:
        tic = time.time()
        db.InsertToPostgre(Data,
                           engine=engine,
                           TlbName=TlbName,
                           schema=Schema,
                           DropTable=False,
                           SizeChunck=SizeChunck)
        toc = time.time() - tic
        print("The DataFrame was successfully ingested into the table {} in {} seconds".format(TlbName, toc))
    return
def fill_univers_sous_univers(database_type, database_name, schema, TlbName):
    # refresh the categorisation dictionaries
    create_update_dictionnaries_categorisation(database_type, database_name)
    engine = connect_to_database(database_type, database_name).CreateEngine()

    # Step 1.1 - MCC codes with note 1 -- update categories and sub-categories
    query = """
    UPDATE "{}"."{}"
    SET "UNIVERS" = T4."UNIVERS_DATABASE",
        "SOUS_UNIVERS" = T4."SOUS_UNIVERS_DATABASE"
    from(
        -- Join MCC categories, univers database & MCC database
        select T3."UNIVERS_DATABASE",T3."SOUS_UNIVERS_DATABASE",T2."MCC" as "MCC_DATABASE"
        from "REFERENTIEL"."MCC_CATEGORIES" as T1
        INNER JOIN "REFERENTIEL"."MCC_CODE_LINK" as T2
        ON T1."MCC_CODE" = T2."MCC_CODE"
        INNER JOIN "REFERENTIEL"."UNIVERS_DESCRIPTION" as T3
        ON T1."UNIVERS" = T3."UNIVERS" and T1."SOUS_UNIVERS" = T3."SOUS_UNIVERS"
        WHERE T1."NOTE" in ('1')
    ) as T4
    WHERE T4."MCC_DATABASE" = "MCC"
    """.format(schema, TlbName)
    tic = time.time()
    engine.execute(query)
    print("update of note-1 MCC categories was done in {} seconds".format(time.time() - tic))

    # Step 1.2 - MCC codes with note 1 -- excluded regex
    query = """
    select T3."UNIVERS_DATABASE",T3."SOUS_UNIVERS_DATABASE",T2."MCC" as "MCC_DATABASE",T1."NEW_REGEX" as "REGEX"
    from "REFERENTIEL"."REGEX_EXCLUDED" as T1
    INNER JOIN "REFERENTIEL"."MCC_CODE_LINK" as T2
    ON T1."MCC_CODE" = T2."MCC_CODE"
    INNER JOIN "REFERENTIEL"."UNIVERS_DESCRIPTION" as T3
    ON T1."UNIVERS" = T3."UNIVERS" and T1."SOUS_UNIVERS" = T3."SOUS_UNIVERS"
    INNER JOIN "REFERENTIEL"."MCC_CATEGORIES" as T4
    ON T1."MCC_CODE" = T4."MCC_CODE"
    WHERE T4."NOTE" = 1
    """
    tic = time.time()
    DataRegex = pd.read_sql(query, con=engine)
    print("extraction of the note-1 excluded regex was done in {} seconds".format(time.time() - tic))

    tic = time.time()
    for i in range(0, len(DataRegex)):
        ExcludedRegex(NumRow=i,
                      DataRegex=DataRegex,
                      engine=engine,
                      TlbName=TlbName,
                      schema=schema)
    print("application of the note-1 excluded regex was done in {} seconds".format(time.time() - tic))

    # Step 2.1 - MCC codes with notes 0 and 2 -- update categories and sub-categories
    query = """
    UPDATE "{}"."{}"
    SET "UNIVERS" = T4."UNIVERS_DATABASE",
        "SOUS_UNIVERS" = T4."SOUS_UNIVERS_DATABASE"
    from(
        -- Join MCC categories, univers database & MCC database
        select T3."UNIVERS_DATABASE",T3."SOUS_UNIVERS_DATABASE",T2."MCC" as "MCC_DATABASE"
        from "REFERENTIEL"."MCC_CATEGORIES" as T1
        INNER JOIN "REFERENTIEL"."MCC_CODE_LINK" as T2
        ON T1."MCC_CODE" = T2."MCC_CODE"
        INNER JOIN "REFERENTIEL"."UNIVERS_DESCRIPTION" as T3
        ON T1."UNIVERS" = T3."UNIVERS" and T1."SOUS_UNIVERS" = T3."SOUS_UNIVERS"
        WHERE T1."NOTE" in ('2','0')
    ) as T4
    WHERE T4."MCC_DATABASE" = "MCC"
    """.format(schema, TlbName)
    tic = time.time()
    engine.execute(query)
    print("update of note-0/2 MCC categories was done in {} seconds".format(time.time() - tic))

    # Step 2.2 - MCC codes with notes 0 and 2 -- excluded regex
    query = """
    select T3."UNIVERS_DATABASE",T3."SOUS_UNIVERS_DATABASE",T2."MCC" as "MCC_DATABASE",T1."NEW_REGEX" as "REGEX"
    from "REFERENTIEL"."REGEX_EXCLUDED" as T1
    INNER JOIN "REFERENTIEL"."MCC_CODE_LINK" as T2
    ON T1."MCC_CODE" = T2."MCC_CODE"
    INNER JOIN "REFERENTIEL"."UNIVERS_DESCRIPTION" as T3
    ON T1."UNIVERS" = T3."UNIVERS" and T1."SOUS_UNIVERS" = T3."SOUS_UNIVERS"
    INNER JOIN "REFERENTIEL"."MCC_CATEGORIES" as T4
    ON T1."MCC_CODE" = T4."MCC_CODE"
    WHERE T4."NOTE" in ('0','2')
    """
    DataRegex = pd.read_sql(query, con=engine)
    tic = time.time()
    for i in range(0, len(DataRegex)):
        ExcludedRegex(NumRow=i,
                      DataRegex=DataRegex,
                      engine=engine,
                      TlbName=TlbName,
                      schema=schema)
    print("application of the note-0/2 excluded regex was done in {} seconds".format(time.time() - tic))

    # Step 3 - regex additions
    # Step 3.1 -- included regex
    query = """
    select T3."UNIVERS_DATABASE",T3."SOUS_UNIVERS_DATABASE", T1."NEW_REGEX" as "REGEX"
    from "REFERENTIEL"."REGEX_INCLUDED" as T1
    INNER JOIN "REFERENTIEL"."UNIVERS_DESCRIPTION" as T3
    ON T1."UNIVERS" = T3."UNIVERS" and T1."SOUS_UNIVERS" = T3."SOUS_UNIVERS"
    """
    DataRegex = pd.read_sql(query, con=engine)
    tic = time.time()
    for i in range(0, len(DataRegex)):
        IncludedRegex(NumRow=i,
                      DataRegex=DataRegex,
                      engine=engine,
                      TlbName=TlbName,
                      schema=schema)
    print("application of the included regex was done in {} seconds".format(time.time() - tic))
    engine.close()
def create_update_dictionnaries_categorisation(database_type, database_name):
    engine = connect_to_database(database_type, database_name).CreateEngine()
    CreateSchema(engine, "REFERENTIEL")

    # conf files location
    if sys.platform == "win32":
        Folder = os.path.expanduser('~') + "\\conf_python\\categorisation_univers\\"
    else:
        Folder = os.environ['HOME'] + "/conf_python/categorisation_univers/"

    # referential tables
    FileDescription = "description_univers.xlsx"
    DataDescritpion = pd.read_excel(Folder + FileDescription)
    TableParameter = {}
    TableParameter["UNIVERS_DATABASE"] = "VARCHAR (50)"
    TableParameter["SOUS_UNIVERS_DATABASE"] = "VARCHAR (100)"
    TableParameter["UNIVERS"] = "VARCHAR (50)"
    TableParameter["SOUS_UNIVERS"] = "VARCHAR (100)"
    TableParameter["DESCRIPTION"] = "TEXT"
    InsertTableIntoDatabase(DataDescritpion,
                            "UNIVERS_DESCRIPTION",
                            "REFERENTIEL",
                            database_type,
                            database_name,
                            DropTable=True,
                            TableDict=TableParameter)
    del DataDescritpion
    del FileDescription

    # link between MCC and MCC_CODE
    FileMCC = "mcc_code_link.xlsx"
    DataFileMCC = pd.read_excel(Folder + FileMCC,
                                dtype={'MCC_CODE': str, 'MCC': object})
    TableParameter = {}
    TableParameter["MCC_CODE"] = "VARCHAR (20)"
    TableParameter["MCC"] = "TEXT"
    InsertTableIntoDatabase(DataFileMCC,
                            "MCC_CODE_LINK",
                            "REFERENTIEL",
                            database_type,
                            database_name,
                            DropTable=True,
                            TableDict=TableParameter)
    del FileMCC
    del DataFileMCC

    # ingest MCC_CATEGORIES
    FileMCCCat = "mcc_categories.xlsx"
    DataMCCCat = pd.read_excel(Folder + FileMCCCat)
    TableParameter = {}
    TableParameter["MCC_NAME"] = "TEXT"
    TableParameter["SOUS_UNIVERS"] = "VARCHAR (100)"
    TableParameter["UNIVERS"] = "VARCHAR (50)"
    TableParameter["MCC_CODE"] = "VARCHAR (20)"
    TableParameter["NOTE"] = "INTEGER"
    InsertTableIntoDatabase(DataMCCCat,
                            "MCC_CATEGORIES",
                            "REFERENTIEL",
                            database_type,
                            database_name,
                            DropTable=True,
                            TableDict=TableParameter)
    del FileMCCCat
    del DataMCCCat

    # regex exclusion
    FileRegexExclu = "regex_merchant.xlsx"
    Data1 = pd.read_excel(Folder + FileRegexExclu, sheet_name='Regex exclu')
    Data1 = Data1[~Data1.UNIVERS.isna()]
    TableParameter = {}
    TableParameter["MCC"] = "TEXT"
    TableParameter["Regex"] = "TEXT"
    TableParameter["UNIVERS"] = "VARCHAR (50)"
    TableParameter["SOUS_UNIVERS"] = "VARCHAR (100)"
    TableParameter["NEW_REGEX"] = "TEXT"
    TableParameter["MCC_CODE"] = "VARCHAR (20)"
    InsertTableIntoDatabase(Data1,
                            "REGEX_EXCLUDED",
                            "REFERENTIEL",
                            database_type,
                            database_name,
                            DropTable=True,
                            TableDict=TableParameter)
    del FileRegexExclu
    del Data1

    # update the MCC code for the excluded regex
    query = """
    UPDATE "REFERENTIEL"."REGEX_EXCLUDED"
    set "MCC_CODE" = T2."MCC_CODE"
    FROM "REFERENTIEL"."MCC_CATEGORIES" AS T2
    where "MCC" = T2."MCC_NAME"
    """
    engine.execute(query)

    # regex inclusion
    FileRegexExclu = "regex_ajout.xlsx"
    Data1 = pd.read_excel(Folder + FileRegexExclu, sheet_name='Regex ajout')
    Data1 = Data1[~Data1.UNIVERS.isna()]
    TableParameter = {}
    TableParameter["Regex"] = "TEXT"
    TableParameter["UNIVERS"] = "VARCHAR (50)"
    TableParameter["SOUS_UNIVERS"] = "VARCHAR (100)"
    TableParameter["NEW_REGEX"] = "TEXT"
    InsertTableIntoDatabase(Data1,
                            "REGEX_INCLUDED",
                            "REFERENTIEL",
                            database_type,
                            database_name,
                            DropTable=True,
                            TableDict=TableParameter)
    del FileRegexExclu
    del Data1
    engine.close()
def InsertToPostgre(Data, TlbName, schema, engine, **kwargs):
    """Insert a pandas DataFrame into a database table

    Required Parameters
    -----------
    Data : pandas DataFrame
        Data to insert into the database
    TlbName : str
        Name of the targeted table in the database
    schema : str
        Schema where the table is stored in the database
    engine : sqlalchemy create_engine object
        Engine object & connection to the database from sqlalchemy;
        if None, a connection is created from database_type/database_name

    Optional Parameters (**kwargs)
    -----------
    logger : logger object
        Logger to get logs from the running function in case of errors
    DropTable : boolean -- default value False
        True if the table has to be dropped before ingestion
    SizeChunck : integer -- default value None
        Chunk size passed to pandas.DataFrame.to_sql
    TableDict : dict -- default value None
        Mapping of column names to Postgres types used to create the table
    """

    # store the value of each optional argument;
    # if an argument is missing, the default value is used
    logger = kwargs.get('logger', None)
    DropTable = kwargs.get('DropTable', False)  # was kwargs.get('TableDict', ...): wrong key
    SizeChunck = kwargs.get('SizeChunck', None)
    TableDict = kwargs.get('TableDict', None)
    _use_credentials = kwargs.get('_use_credentials', None)
    _use_conf = kwargs.get('_use_conf', None)
    database_type = kwargs.get('database_type', None)
    database_name = kwargs.get('database_name', None)

    _was_engine = True
    if engine is None:
        try:
            engine = connect_to_database(database_type, database_name,
                                         _use_credentials=_use_credentials,
                                         _use_conf=_use_conf).CreateEngine()
            _was_engine = False
        except:
            raise

    if DropTable:
        try:
            # Postgres column types: use the provided mapping if any,
            # otherwise infer it from the DataFrame
            if TableDict is None:
                TableDict = CreateDictionnaryType(Data)

            # create the table if it does not exist, otherwise drop and recreate it
            if not table_exists(engine, TlbName, schema):
                CreateTable(engine, TlbName, schema, TableDict)
            else:
                metadata = MetaData()
                TlbObject = Table(TlbName, metadata, schema=schema)
                TlbObject.drop(engine)
                CreateTable(engine, TlbName, schema, TableDict)

            # insert into the table
            if SizeChunck is None:
                Data.to_sql(TlbName, con=engine, if_exists='append',
                            schema=schema, index=False)
            else:
                Data.to_sql(TlbName, con=engine, if_exists='append',
                            schema=schema, index=False, chunksize=SizeChunck)
        except Exception as e:
            if logger is not None:
                logger.error(e, exc_info=True)
            else:
                print(e)
    else:
        try:
            if SizeChunck is None:
                Data.to_sql(TlbName, con=engine, if_exists='append',
                            schema=schema, index=False)
            else:
                Data.to_sql(TlbName, con=engine, if_exists='append',
                            schema=schema, index=False, chunksize=SizeChunck)
        except Exception as e:
            if logger is not None:
                logger.error(e, exc_info=True)
            else:
                print(e)

    if not _was_engine:
        engine.close()
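# Hedged usage sketch for InsertToPostgre: the DataFrame, table name and type
# mapping below are illustrative only. With DropTable=True and an explicit
# TableDict the target table is recreated before ingestion; without them the
# frame is simply appended. The function is defined but never called here.
def _example_insert_to_postgre():
    example_df = pd.DataFrame({"CardHolderID": ["123"],
                               "AvailableBalance": [10.5]})
    InsertToPostgre(example_df, "EXAMPLE_TABLE", "TMP_UPDATE", engine=None,
                    database_type="Postgres", database_name="Creacard_Calypso",
                    DropTable=True,
                    TableDict={"CardHolderID": "VARCHAR (50)",
                               "AvailableBalance": "double precision"},
                    SizeChunck=10000)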
def add_new_pos_transactions(database_type, database_name, _year, _month, _day, **kwargs):
    _tlbname = kwargs.get('tlbname', "POS_TRANSACTIONS")
    _schema = kwargs.get('schema', "TRANSACTIONS")

    date_start = datetime.datetime(_year, _month, _day)
    date_start_cond = str(date_start)[0:10]
    end_date = date_start + datetime.timedelta(days=1)
    end_date = str(end_date)[0:10]

    engine = connect_to_database(database_type, database_name).CreateEngine()

    # check if the date has already been processed
    query = """
    select count(*) from "{}"."{}"
    where "TransactionTime" >= '{}' and "TransactionTime" < '{}'
    """.format(_schema, _tlbname, date_start_cond, end_date)
    data = pd.read_sql(query, con=engine)

    if data.iloc[0, 0] == 0:
        query = """
        SELECT "CardHolderID","MCC","Amount","MerchantName","TransactionTime","Currency",
        "CardVPUType", "MerchantAddress", "MerchantCity", "MerchantCountry", "MerchantID", "TransactionID",
        CASE WHEN "TransactionTP" in ('POS International') then 1 else 0 end as "IsPOSInternational",
        "TransactionTP", '' as "UNIVERS", '' as "SOUS_UNIVERS"
        FROM "TRANSACTIONS_MONTHLY"."MONTHLY_TRANSACTIONS_{}"
        where "TransactionTP" IN ('POS International','POS Domestic')
        and "DebitCredit" IN ('Debit')
        and "TransactionResult" = 'APPROVED'
        and "TransactionTime" >= '{}' and "TransactionTime" < '{}'
        """.format(str(date_start.year) + str(date_start.month), date_start_cond, end_date)
        data = pd.read_sql(query, con=engine)

        if not data.empty:
            # get the type of each column
            columns_type = create_dictionnary_type_from_table(engine, "POS_TRANSACTIONS")

            # recreate the temporary table for POS transactions
            query = """
            DROP TABLE IF EXISTS "TMP_UPDATE"."TMP_POS_TRANSACTIONS"
            """
            engine.execute(query)

            CreateTable(engine, "TMP_POS_TRANSACTIONS", "TMP_UPDATE", columns_type, keep_order=True)

            # insert into the temporary table
            InsertTableIntoDatabase(data, TlbName="TMP_POS_TRANSACTIONS", Schema="TMP_UPDATE",
                                    database_type=database_type, database_name=database_name,
                                    DropTable=False)

            tic = time.time()
            fill_univers_sous_univers(database_type, database_name,
                                      "TMP_UPDATE", "TMP_POS_TRANSACTIONS")
            print("categorisation was done in {} seconds".format(time.time() - tic))

            engine = connect_to_database(database_type, database_name).CreateEngine()
            query = """
            insert into "{}"."{}"
            select * from "TMP_UPDATE"."TMP_POS_TRANSACTIONS"
            """.format(_schema, _tlbname)
            engine.execute(query)

            # drop the temporary table
            query = """
            DROP TABLE IF EXISTS "TMP_UPDATE"."TMP_POS_TRANSACTIONS"
            """
            engine.execute(query)
            engine.close()
        else:
            print("No data for this date")
    else:
        print("this date has already been processed")
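# Hedged sketch -- create_dictionnary_type_from_table is not defined in this
# module. Based on its use above (it feeds CreateTable with a column-to-type
# mapping), a minimal version could read information_schema; the default
# schema name 'TRANSACTIONS' below is an assumption.
def create_dictionnary_type_from_table_sketch(engine, table, schema="TRANSACTIONS"):
    query = """
    select "column_name", "data_type"
    from "information_schema"."columns"
    where "table_name" = '{}' and "table_schema" = '{}'
    order by "ordinal_position"
    """.format(table, schema)
    cols = pd.read_sql(query, con=engine)
    return dict(zip(cols["column_name"], cols["data_type"]))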
def add_new_others_transactions(database_type, database_name, _year, _month, _day, **kwargs):
    _tlbname = kwargs.get('tlbname', "OTHER_TRANSACTIONS")
    _schema = kwargs.get('schema', "TRANSACTIONS")

    date_start = datetime.datetime(_year, _month, _day)
    date_start_cond = str(date_start)[0:10]
    end_date = date_start + datetime.timedelta(days=1)
    end_date = str(end_date)[0:10]

    engine = connect_to_database(database_type, database_name).CreateEngine()

    # check if the date has already been processed
    query = """
    select count(*) from "{}"."{}"
    where "TransactionTime" >= '{}' and "TransactionTime" < '{}'
    """.format(_schema, _tlbname, date_start_cond, end_date)
    data = pd.read_sql(query, con=engine)

    if data.iloc[0, 0] == 0:
        querytmp = """
        select "CardHolderID","MCC","Amount","MerchantName","TransactionTP","TransactionTime","Currency",
        "CardVPUType", "MerchantAddress", "MerchantCity", "MerchantCountry", "MerchantID", "TransactionID",
        "DebitCredit",
        CASE WHEN "TransactionTP" ~* ('reversal') THEN 1 ELSE 0 END AS "IsReversal",
        CASE WHEN "TransactionTP" ~* ('fee') THEN 1 ELSE 0 END AS "IsFee"
        from "TRANSACTIONS_MONTHLY"."MONTHLY_TRANSACTIONS_{}"
        WHERE "TransactionTP" NOT IN (
        'ATM Domestic','ATM Domestic Fee','ATM International','ATM International Fee','BalanceInquiry fee','FX Fee',
        'Bank Payment fee','Bank Transfer Fee','Batch Load Fee','Card Fee','Card Load Fee','Card Load at Payzone Fee',
        'Card To Card Transfer Fee','Card to Card In','Cash Advance Fee','Decline Fee','Deposit To Card API Fee',
        'INTERNET DEBIT/CREDIT','IVR Fee','InternetDrCrFee','KYC Card Upgrade Fee','Monthly Fee','POS Domestic',
        'POS International','POS International Fee','Paytrail Load Fee','Post Office Fee','RefundFee','Replacement Card Fee',
        'Replacement Card In','SEPA Outgoing Payment Fee','SMS Balance Inquiry fee','SMS Fee','SMS Lock UnLock Fee',
        'Sepa Credit Fee','Sepa Incoming Payment','Sepa Incoming Payment Fee','Terminal Load','Terminal load fee',
        'Upgrade to Physical Fee','Voucher load','Voucher load fee')
        AND "TransactionTP" !~* ('auth')
        and "TransactionTime" >= '{}' and "TransactionTime" < '{}'
        """.format(str(date_start.year) + str(date_start.month), date_start_cond, end_date)

        query = """
        insert into "{}"."{}"
        {}
        """.format(_schema, _tlbname, querytmp)
        engine.execute(query)
        engine.close()
    else:
        print("this date has already been processed")
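# Hedged usage sketch: the daily loaders above are idempotent (each one skips a
# date that already has rows in the target table), so a catch-up loop over a
# date range can safely be re-run. The connection values mirror those used
# elsewhere in this module; the date range is illustrative.
def _example_daily_backfill():
    start = datetime.date(2019, 3, 1)
    for offset in range(7):
        d = start + datetime.timedelta(days=offset)
        add_new_pos_transactions("Postgres", "Creacard_Calypso", d.year, d.month, d.day)
        add_new_others_transactions("Postgres", "Creacard_Calypso", d.year, d.month, d.day)
        add_fees_others_transactions("Postgres", "Creacard_Calypso", d.year, d.month, d.day)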
def daily_card_status2(Data, filepath, database_type, database_name):

    #### constant variables
    # table parameters for the temporary table
    TableParameter = {}
    TableParameter["ActivationDate"] = "timestamp without time zone"
    TableParameter["Address1"] = "TEXT"
    TableParameter["Address2"] = "TEXT"
    TableParameter["ApplicationName"] = "VARCHAR (50)"
    TableParameter["AvailableBalance"] = "double precision"
    TableParameter["BirthDate"] = "timestamp without time zone"
    TableParameter["CardHolderID"] = "VARCHAR (50)"
    TableParameter["CardStatus"] = "VARCHAR (100)"
    TableParameter["City"] = "VARCHAR (100)"
    TableParameter["Country"] = "VARCHAR (50)"
    TableParameter["CreationDate"] = "timestamp without time zone"
    TableParameter["DistributorCode"] = "INTEGER"
    TableParameter["Email"] = "TEXT"
    TableParameter["ExpirationDate"] = "timestamp without time zone"
    TableParameter["FirstName"] = "TEXT"
    TableParameter["IBAN"] = "TEXT"
    TableParameter["IsExcludedAddress"] = "INTEGER"
    TableParameter["IsRenewal"] = "INTEGER"
    TableParameter["KYC_Status"] = "VARCHAR (50)"
    TableParameter["LastName"] = "TEXT"
    TableParameter["NoMobile"] = "TEXT"
    TableParameter["PostCode"] = "VARCHAR (50)"
    TableParameter["Programme"] = "VARCHAR (50)"
    TableParameter["RenewalDate"] = "timestamp without time zone"
    TableParameter["UpdateBalanceDate"] = "timestamp without time zone"
    TableParameter["UpdateDate"] = "timestamp without time zone"

    keepcol = [
        "CardHolderID", "Email", "FirstName", "LastName", "City", "Country",
        "Card Status", "DistributorCode", "ApplicationName", "Date of Birth",
        "IBAN", "CreatedDate", "UpdatedDate", "Address1", "Address2",
        "PostCode", "KYC Status", "expirydate", "AvailableBalance",
        "NoMobile", "Programme"
    ]

    #### Step 1: Extract the data from the file and keep only updated data
    # extract the file date
    FileName = filepath.split('/')[-1].replace(".csv", "")
    DateFile = pd.to_datetime(FileName.split("-")[1] + "-" +
                              FileName.split("-")[2] + "-" +
                              FileName.split("-")[3])

    # based on the file date, identify the appropriate column names
    if DateFile > pd.to_datetime('2019-03-12'):
        col_names = [
            "CardHolderID", "Cardnumber", "Email", "FirstName", "LastName",
            "City", "Country", "Card Status", "DistributorCode",
            "ApplicationName", "Date of Birth", "SortCodeAccNum", "IBAN",
            "CreatedDate", "UpdatedDate", "Address1", "Address2", "PostCode",
            "KYC Status", "expirydate", "AvailableBalance", "UDF2",
            "NoMobile", "Programme", "VPVR"
        ]
    elif DateFile < pd.to_datetime('2019-01-16'):
        col_names = [
            "CardHolderID", "Cardnumber", "Email", "FirstName", "LastName",
            "City", "Country", "Card Status", "DistributorCode",
            "ApplicationName", "Date of Birth", "SortCodeAccNum", "IBAN",
            "CreatedDate", "UpdatedDate", "Address1", "Address2", "PostCode",
            "KYC Status", "expirydate"
        ]
    else:
        col_names = [
            "CardHolderID", "Cardnumber", "Email", "FirstName", "LastName",
            "City", "Country", "Card Status", "DistributorCode",
            "ApplicationName", "Date of Birth", "SortCodeAccNum", "IBAN",
            "CreatedDate", "UpdatedDate", "Address1", "Address2", "PostCode",
            "KYC Status", "expirydate", "AvailableBalance", "UDF2",
            "NoMobile", "UDF3", "VPVR"
        ]

    # assign the column names to the dataframe
    Data.columns = col_names

    # store the missing columns
    missing_columns = list(set(keepcol).difference(col_names))
    if missing_columns:
        # if the list is not empty, add the missing columns to the dataframe
        for col in missing_columns:
            Data[col] = None

    # keep track of the available balance (copy to avoid mutating a view)
    tmp_available_balance = Data[["CardHolderID", "AvailableBalance"]].copy()
    tmp_available_balance["UpdateBalanceDate"] = datetime.datetime.now()

    # store the current values before they change
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    query = """
    select distinct "CardHolderID","CardStatus","KYC_Status"
    from "CARD_STATUS"."STATUS_CARTES"
    """
    data_current = pd.read_sql(query, con=engine)
    data_current["CardHolderID"] = data_current["CardHolderID"].astype(str)
    data_current["KYC_Status"] = data_current["KYC_Status"].astype(str)
    data_current["CardStatus"] = data_current["CardStatus"].astype(str)

    #### Step 2: Transform the data
    # transform date columns to pd.datetime format in order to have a
    # consistent date format over the database
    Data["UpdatedDate"] = pd.to_datetime(Data["UpdatedDate"], format="%b %d %Y %I:%M%p", errors='coerce')
    Data["CreatedDate"] = pd.to_datetime(Data["CreatedDate"], format="%b %d %Y %I:%M%p", errors='coerce')
    Data["Date of Birth"] = pd.to_datetime(Data["Date of Birth"], format="%b %d %Y %I:%M%p", errors='coerce')

    # transform expirydate ('YYMM' -> '20YY-MM-01')
    Data["expirydate"] = Data["expirydate"].astype(str)
    Data["expirydate"] = "20" + Data["expirydate"].str[0:2] + "-" + Data["expirydate"].str[2:] + "-01"
    Data["expirydate"] = pd.to_datetime(Data["expirydate"], format='%Y-%m-%d', errors='coerce')

    Data = Data[keepcol]

    # addresses to exclude (internal / distributor addresses)
    AddressToRemove = [
        "77 OXFORD STREET LONDON", "17 RUE D ORLEANS", "TSA 51760",
        "77 Oxford Street London", "36 CARNABY STREET",
        "36 CARNABY STREET LONDON", "ADDRESS", "17 RUE D ORLEANS PARIS",
        "CreaCard Espana S L Paseo de Gracia 59", "36 Carnaby Street London",
        "CREACARD SA Pl Marcel Broodthaers 8 Box 5", "17 Rue D Orleans Paris",
        "CREACARD ESPANA S L PASEO DE GRACIA 59", "CreaCard 17 rue d Orleans",
        "CREACARD SA PL MARCEL BROODTHAERS 8 BOX 75", "36 Carnaby Street",
        "77 OXFORD STREET"
    ]
    Data["IsExcludedAddress"] = (Data.Address1.isin(AddressToRemove)).astype(int)

    Data["ActivationDate"] = pd.NaT
    Data["IsRenewal"] = 0
    Data["RenewalDate"] = pd.NaT

    Data = Data[sorted(Data.columns)]
    colnames = [
        "ActivationDate", "Address1", "Address2", "ApplicationName",
        "AvailableBalance", "CardStatus", "CardHolderID", "City", "Country",
        "CreationDate", "BirthDate", "DistributorCode", "Email", "FirstName",
        "IBAN", "IsExcludedAddress", "IsRenewal", "KYC_Status", "LastName",
        "NoMobile", "PostCode", "Programme", "RenewalDate", "UpdateDate",
        "ExpirationDate"
    ]
    Data.columns = colnames

    Data["UpdateBalanceDate"] = datetime.datetime.now()
    Data = Data[sorted(Data.columns)]

    # map numeric KYC codes to their labels
    Data.loc[(Data["KYC_Status"] == '0') | (Data["KYC_Status"] == '0.0') |
             (Data["KYC_Status"] == 0), "KYC_Status"] = 'Anonyme'
    Data.loc[(Data["KYC_Status"] == '1') | (Data["KYC_Status"] == '1.0') |
             (Data["KYC_Status"] == 1), "KYC_Status"] = 'SDD'
    Data.loc[(Data["KYC_Status"] == '2') | (Data["KYC_Status"] == '2.0') |
             (Data["KYC_Status"] == 2), "KYC_Status"] = 'KYC'
    Data.loc[(Data["KYC_Status"] == '3') | (Data["KYC_Status"] == '3.0') |
             (Data["KYC_Status"] == 3), "KYC_Status"] = 'KYC LITE'

    Data["DistributorCode"] = Data["DistributorCode"].fillna(-1)
    Data["DistributorCode"] = Data["DistributorCode"].astype(int)
    Data["CardHolderID"] = Data["CardHolderID"].astype(str)
    Data["KYC_Status"] = Data["KYC_Status"].astype(str)
    Data["CardStatus"] = Data["CardStatus"].astype(str)

    # set 1: identify rows whose status changed
    data_new = Data[["CardHolderID", "CardStatus", "KYC_Status"]]
    outer_join = data_current.merge(data_new, how='outer', indicator=True)
    outer_join = outer_join[outer_join["_merge"] == "right_only"]

    # set 2: identify new CardHolderIDs
    new_card_holder_id = set(outer_join["CardHolderID"].unique()).difference(
        data_current["CardHolderID"].unique())

    # set 3: insert the old values into the changes table
    data_to_change = data_current[data_current["CardHolderID"].isin(
        set(outer_join.loc[~outer_join["CardHolderID"].isin(new_card_holder_id),
                           "CardHolderID"]))]

    FileName = filepath.split('/')[-1].replace(".csv", "")
    DateFile = pd.to_datetime(FileName.split("-")[1] + "-" +
                              FileName.split("-")[2] + "-" +
                              FileName.split("-")[3]) - datetime.timedelta(days=1)
    data_to_change["dt_change"] = DateFile

    InsertTableIntoDatabase(data_to_change, "CHANGE_STATUS_CARTES", "CARD_STATUS",
                            "Postgres", "Creacard_Calypso", DropTable=False)

    # find new CardHolderIDs to update + CardHolderIDs whose record changed
    DateFile = pd.to_datetime(FileName.split("-")[1] + "-" +
                              FileName.split("-")[2] + "-" +
                              FileName.split("-")[3])
    update_set = Data[(Data["UpdateDate"] >= DateFile) &
                      (Data["UpdateDate"] < DateFile + pd.Timedelta(days=1))]
    update_set = update_set.reset_index(drop=True)

    import numpy as np
    changed_ids = pd.DataFrame(
        np.concatenate((outer_join["CardHolderID"].unique(),
                        update_set["CardHolderID"].unique()), axis=0))
    changed_ids.columns = ["CardHolderID"]
    changed_ids = changed_ids[changed_ids["CardHolderID"] != '0']
    to_update = Data[Data["CardHolderID"].isin(changed_ids["CardHolderID"].unique())]

    #### Step 3: Load these data into a temporary table
    con_postgres = connect_to_database(database_type, database_name).CreateEngine()
    query = """
    DROP TABLE IF EXISTS "TMP_UPDATE"."TMP_STATUS_CARTES"
    """
    con_postgres.execute(query)
    con_postgres.close()

    InsertTableIntoDatabase(to_update, "TMP_STATUS_CARTES", "TMP_UPDATE",
                            database_type, database_name,
                            DropTable=True, TableDict=TableParameter, SizeChunck=10000)

    #### Step 4: Update new values
    query_delete = """
    DELETE FROM "CARD_STATUS"."STATUS_CARTES"
    USING "TMP_UPDATE"."TMP_STATUS_CARTES"
    WHERE "CARD_STATUS"."STATUS_CARTES"."CardHolderID" = "TMP_UPDATE"."TMP_STATUS_CARTES"."CardHolderID"
    """
    con_postgres = connect_to_database(database_type, database_name).CreateEngine()
    con_postgres.execute(query_delete)
    con_postgres.close()

    query_update = """
    UPDATE "TMP_UPDATE"."TMP_STATUS_CARTES"
    SET "IsRenewal" = CASE WHEN "DistributorCode" in ('203','914','915') then 1 else 0 end
    """
    con_postgres = connect_to_database(database_type, database_name).CreateEngine()
    con_postgres.execute(query_update)
    con_postgres.close()

    query = """
    INSERT INTO "CARD_STATUS"."STATUS_CARTES"
    SELECT * FROM "TMP_UPDATE"."TMP_STATUS_CARTES"
    """
    con_postgres = connect_to_database(database_type, database_name).CreateEngine()
    con_postgres.execute(query)
    con_postgres.close()

    # drop the temporary table
    con_postgres = connect_to_database(database_type, database_name).CreateEngine()
    query = """
    DROP TABLE IF EXISTS "TMP_UPDATE"."TMP_STATUS_CARTES"
    """
    con_postgres.execute(query)
    con_postgres.close()

    #### Step 5: Update the available balance for all CardHolderIDs
    tlb_param_balance = dict()
    tlb_param_balance["AvailableBalance"] = "double precision"
    tlb_param_balance["CardHolderID"] = "VARCHAR (50)"
    tlb_param_balance["UpdateBalanceDate"] = "timestamp without time zone"

    con_postgres = connect_to_database(database_type, database_name).CreateEngine()
    query = """
    DROP TABLE IF EXISTS "TMP_UPDATE"."TMP_AVAILABLE_BALANCE"
    """
    con_postgres.execute(query)
    con_postgres.close()

    InsertTableIntoDatabase(tmp_available_balance, "TMP_AVAILABLE_BALANCE", "TMP_UPDATE",
                            database_type, database_name,
                            DropTable=True, TableDict=tlb_param_balance, SizeChunck=10000)

    con_postgres = connect_to_database(database_type, database_name).CreateEngine()
"CARD_STATUS"."STATUS_CARTES" SET "AvailableBalance" = T1."AvailableBalance", "UpdateBalanceDate" = T1."UpdateBalanceDate" from "TMP_UPDATE"."TMP_AVAILABLE_BALANCE" as T1 WHERE "CARD_STATUS"."STATUS_CARTES"."CardHolderID" = T1."CardHolderID" """ con_postgres.execute(query_balance) con_postgres.close() con_postgres = connect_to_database(database_type, database_name).CreateEngine() query = """ DROP TABLE IF EXISTS "TMP_UPDATE"."TMP_AVAILABLE_BALANCE" """ con_postgres.execute(query) con_postgres.close() con_postgres = connect_to_database(database_type, database_name).CreateEngine() query = """ update "CARD_STATUS"."STATUS_CARTES" as T1 SET "ActivationDate" = "ActivationTime" FROM "CARD_STATUS"."ACTIVATION_REPORT" as T2 WHERE T1."CardHolderID" = T2."CardHolderID" and "ActivationDate" is null """ con_postgres.execute(query) con_postgres.close()
def create_tmp_id(schema, tlb, schema_main):

    if sys.platform == "win32":
        folder_json = os.path.expanduser('~') + "\\conf_python\\unique_id_conditions.json"
    else:
        folder_json = os.environ['HOME'] + "/conf_python/unique_id_conditions.json"

    with open(folder_json, 'r') as JSON:
        conditions = json.load(JSON)

    condition = conditions["exclusion_cartes"]["request"]
    condition_on_email = conditions["condition_email"]["dataframe"]

    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    CreateSchema(engine, schema_main)

    query = """
    select "CardHolderID", "NoMobile", lower("Email") as "Email", "FirstName",
    "LastName", "BirthDate", "PostCode", "Address1", "Address2", "ActivationDate"
    from "{}"."{}"
    where {}
    """.format(schema, tlb, condition)
    data = pd.read_sql(query, con=engine)
    engine.close()

    # normalise the string columns: encoding, nulls, spaces and case
    for var in ["FirstName", "LastName", "Address1", "Address2", "PostCode", "Email"]:
        data[var] = data[var].str.encode('utf-8').astype(str)
        data.loc[data[var].isnull(), var] = ""
        data[var] = data[var].str.strip(" ")
        data[var] = data[var].str.replace(" ", "")
        data[var] = data[var].str.lower()

    # drop internal accounts
    data = data[~data["Email"].str.contains('.*creacard.*|.*prepaidfinancial.*|.*financial.*', regex=True)]

    data["GoodEmail"] = 1
    data.loc[data["Email"].str.contains(condition_on_email, regex=True), "GoodEmail"] = 0

    data["GoodCombinaison"] = 1
    data.loc[(data["LastName"].str.contains(conditions["condition_combinaison"]["LastName"], regex=True)) |
             (data["BirthDate"].isnull()) |
             (data["BirthDate"].isin(conditions["condition_combinaison"]["BirthDate"].split(","))),
             "GoodCombinaison"] = 0

    # delete a leading "00" at the start of the number
    data["NoMobile"] = data["NoMobile"].str.replace("^00", "", regex=True)
    # strip a trailing ".0" left over from float conversion
    data["NoMobile"] = data["NoMobile"].str.replace(r"\.0$", "", regex=True)
    # remove literal '|' characters
    data["NoMobile"] = data["NoMobile"].str.replace(r"\|", "", regex=True)

    query = """
    DROP TABLE IF EXISTS "CUSTOMERS"."TMP_USER_ID" CASCADE
    """
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    engine.execute(query)
    engine.close()

    query = """
    CREATE TABLE "{}"."TMP_USER_ID"(
        "CardHolderID" VARCHAR(50),
        "NoMobile" TEXT,
        "Email" TEXT,
        "FirstName" TEXT,
        "LastName" TEXT,
        "BirthDate" TEXT,
        "PostCode" TEXT,
        "Address1" TEXT,
        "Address2" TEXT,
        "ActivationDate" timestamp without time zone,
        "GoodEmail" INTEGER,
        "GoodCombinaison" INTEGER
    )
    """.format(schema_main)
    engine = connect_to_database("Postgres", "Creacard_Calypso").CreateEngine()
    engine.execute(query)
    engine.close()

    data = data[~data["NoMobile"].isnull()]

    InsertTableIntoDatabase(data, TlbName="TMP_USER_ID", Schema=schema_main,
                            database_name="Creacard_Calypso", database_type="Postgres",
                            DropTable=False, InstertInParrell=False)
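# Hedged sketch of the unique_id_conditions.json layout that create_tmp_id
# expects, reconstructed from the keys read above. The key names are grounded
# in the code; every value is an illustrative placeholder, not the production
# configuration.
# {
#     "exclusion_cartes":      {"request": "\"IsExcludedAddress\" = 0"},
#     "condition_email":       {"dataframe": ".*test.*"},
#     "condition_combinaison": {"LastName": "^test$",
#                               "BirthDate": "1900-01-01,1970-01-01"}
# }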