def __init__(self, meta, logger=None, chuck_size=5000000, time_out=3600): """ :param meta: Parameters from config file :param logger: logger handler :param chuck_size: The chuck size when reading data for Pandas :param time_out: time out when uploading data to Azure Storage """ self.meta = meta self.account_name = self.meta.get("azure_storage_account_name") self.container_name = self.meta.get("azure_storage_blob_container") self.logger = logger if logger else Logger(log_level="info", target="console", vendor_key=-1, retailer_key=-1, sql_conn=None) self.account_key = get_password(username=self.account_name, meta=self.meta) self.blob_service = BlockBlobService(self.account_name, self.account_key) self.dw_conn = DWOperation(meta=self.meta) self.sql = "" self.parq_filename = "" self.local_path = "" self.chuck_size = chuck_size # 5,000,000 rows as a chuck self.time_out = time_out # time secs
def __init__(self, meta, params, logger=None): self.meta = meta self.params = params self.file_ext = self.params.get("fileExt") if self.params.get( "fileExt") else "txt" self.zip_flag = True if str(self.params.get( "zipFlag", "N")).upper() in ["Y", "YES", "T", "TRUE"] else False self._debug = "Y" if str(self.params.get( 'debug', 'N')).upper() in ["Y", "YES", "T", "TRUE"] else 'N' __log_level = 'DEBUG' if self._debug == "Y" else 'INFO' self.logger = logger if logger else Logger(log_level="info", target="console", vendor_key=-1, retailer_key=-1, sql_conn=None) self.logger.set_level(log_level=__log_level) self.logger.set_keys(log_id="{}".format(self.params["jobId"])) self.provision_url = self.meta[ "api_provision_str"] + "/filesDelivery/layout/attributes?cycle_key=" self.capacity = Capacity(meta=self.meta) self.rsi_username = self.meta.get("fileshare_username") self.rsi_folder = self.meta.get("fileshare_folder") self.cmn_schema = self.meta.get("db_conn_vertica_common_schema", "common") self.app_conn = MSOperation(meta=self.meta) self.vertica_conn = DWOperation(meta=self.meta) self.dumper_context = self.get_context() self._layout_payload = {}
def get_table_def(self, raw_table_name): sql = "SELECT ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA FROM META_SCD_TABLE_DEF WHERE TABLE_NAME='%s'" % raw_table_name if self.db_type: sql += " AND DB_TYPE='%s'" % self.db_type if self.table_schema: sql += " AND TABLE_SCHEMA='%s'" % self.table_schema rs = self.mssql.query(sql) if len(rs) == 0: raise ValueError( '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] The table is not found.' % (raw_table_name, db_type, table_schema)) elif len(rs) > 1: self.logger.info('|ID|TABLE_NAME|DB_TYPE|TABLE_SCHEMA|') for r in rs: self.logger.info( '|%s|%s|%s|%s|' % (r.ID, r.TABLE_NAME, r.DB_TYPE, r.TABLE_SCHEMA)) raise ValueError( '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] More than one records returned for the table.' % (raw_table_name, db_type, table_schema)) else: if rs[0].DB_TYPE == 'MSSQL': self.target_conn = self.mssql else: self.vtc = DWOperation(meta=self.meta) self.target_conn = self.vtc return rs[0]
def _process_data_cleanup(self): """Process Data Cleanup :return: """ try: self._dw = DWOperation(meta=self.context["meta"]) lst_warning = [" none warning"] for cfg in eval(self.context["config_json"]): try: if cfg["type"].upper() == "TABLE": self._cleanup_table(cfg) elif cfg["type"].upper() == "DATA": self._cleanup_data(cfg) self.body["message"] = "{} cleanup done".format(cfg["type"]) self._logger.info(self.body) except Exception as msg: self.body["status"] = StepStatus.ERROR.name self.body["message"] = ("Data Cleanup error with config {} --||## {}".format(cfg, msg)) self._logger.warning(self.body) lst_warning.append(cfg) continue self.body["status"] = StepStatus.SUCCESS.name self.body["message"] = ("Data Cleanup done with config {}, {}".format(cfg, "; ".join(lst_warning))) self._logger.info(self.body) except Exception as msg: self.body["status"] = StepStatus.ERROR.name self._logger.error(msg) finally: if hasattr(self, "_dw"): self._dw.close_connection() self.body["message"] = "Data Cleanup end" self._logger.info(self.body)
def __init__(self, context): self.context = context self.meta = self.context["meta"] self._logger = self.context["logger"] self._db = MSOperation(meta=self.meta) self._dw = DWOperation(meta=self.meta) self.bundle_base_url = self.meta["api_osa_bundle_str"] self.alert_base_url = self.meta["api_alerts_str"] self._common_schema = self.meta["db_conn_vertica_common_schema"] self._dim_calendar = "DIM_CALENDAR" self._dim_product = "DIM_PRODUCT" self._dim_store = "DIM_STORE" self._meta_intvn = "ALT_META_INTERVENTION" self._dim_base_dir = self.meta["azure_storage_dim_root"] self._azure_storage_account_name = self.meta[ "azure_storage_account_name"] self._azure_storage_blob_container = self.meta[ "azure_storage_blob_container"] self._prefix_name = "IRIS_" self.azure_uploader = UploadToBlobStorage(meta=self.meta, logger=self._logger) self.body = { "job_id": self.context["jobId"], "step_id": self.context["stepId"], "status": StepStatus.RUNNING.name } self.capacity = Capacity(meta=self.meta) self.cycle_key = self._get_cycle_key()
def __init__(self, context): self.context = {**context, **context["meta"]}.copy() self._hub_ids = context["hub_ids"] self._logger = context["logger"] self._db = MSOperation(meta=context["meta"]) self._dw = DWOperation(meta=context["meta"]) self._common_schema = context["meta"]["db_conn_vertica_common_schema"] # self._dw_conn_vertica = self.context["dw_conn_vertica"] self._all_dim = self.context["all_dim"] self._dim_calendar = "DIM_CALENDAR" self._dim_product = "DIM_PRODUCT" self._dim_store = "DIM_STORE" self._product_cis_src_tbl = "PS_ITEM_DIM_EXT" self._store_cis_src_tbl = "PS_STORE_DIM_EXT" self._dct_vendor_retailer_hub = {} self._dct_vendor_hub = {} self._dct_retailer_hub = {} self._all_vendor_retailer_hub = self._get_avaliable_vr()
def _populate_source_config(self, source_config): self.logger.debug("The source config is: %s" % source_config) _src_config = {} if os.name == 'nt': _src_config["temp_file_path"] = "d:" elif os.name == 'posix': _src_config["temp_file_path"] = "/tmp" # Getting user account from config.properties file first. if self.meta.get("db_conn_vertica_rdp_username"): _src_config["dw.etluser.id"] = self.meta.get( "db_conn_vertica_rdp_username") if self.meta.get("db_conn_vertica_rdp_password"): _src_config["dw.etluser.password"] = self.meta.get( "db_conn_vertica_rdp_password") else: _pmp_pwd = get_password( username=self.meta.get("db_conn_vertica_rdp_username"), meta=self.meta) # The pwd should be encrypted in order to: 1, align with else part, 2, pass it to db.sync_data module _src_config["dw.etluser.password"] = Crypto().encrypt(_pmp_pwd) # if not configed then get them directly from RDP config. else: _src_config["dw.etluser.id"] = source_config.get("dw.etluser.id") # the pwd is encrypted _src_config["dw.etluser.password"] = source_config.get( "dw.etluser.password") # required info for calling sync_data module. _src_config["dw.server.name"] = source_config.get("dw.server.name") _src_config["dw.db.name"] = source_config.get("dw.db.name") _src_config["dw.db.portno"] = source_config.get("dw.db.portno", 5433) _src_config["dw.schema.name"] = source_config.get("dw.schema.name") self.logger.debug("srouce config is: %s" % _src_config) self.dct_sync_data["source_config"] = _src_config # Create the connection to RDP Vertica Cluster. which is the source Vertica cluster rdp_meta = copy.deepcopy(self.meta) tmp_rdp_meta = { 'db_conn_vertica_servername': _src_config["dw.server.name"], 'db_conn_vertica_port': _src_config["dw.db.portno"], 'db_conn_vertica_dbname': _src_config["dw.db.name"], 'db_conn_vertica_username': _src_config["dw.etluser.id"], 'db_conn_vertica_password': _src_config["dw.etluser.password"], 'db_conn_vertica_password_encrypted': "true" } rdp_meta.update(tmp_rdp_meta) self.logger.debug("rdp config is: %s" % rdp_meta) rdp_connection = DWOperation(meta=rdp_meta) self.dct_sync_data["source_dw"] = rdp_connection
def __init__(self, context): self.context = context self.meta = self.context["meta"] self._logger = self.context["logger"] self.body = { "job_id": self.context["jobId"], "step_id": self.context["stepId"], "status": StepStatus.RUNNING.name } self._db = MSOperation(meta=self.meta) self._dw = DWOperation(meta=self.meta) self._common_schema = self.meta["db_conn_vertica_common_schema"] self._check_spd_url = "{}/availablecycle/rc/action".format( self.meta["api_alerts_str"]) self.pacific_job_def_name = "OSAPacificNotification"
def test_gen_vtc_test_data(): mssql = MSOperation(meta=meta) vtc = DWOperation(meta=meta) try: vtc.execute('drop table common.test_columns') except: print('table doesn\'t exists') vtc.execute( 'create table common.test_columns as select * from columns') rs = mssql.query( 'select id from META_SCD_TABLE_DEF where table_name=\'test_columns\'' ) if not rs: sql = ''' INSERT INTO META_SCD_TABLE_DEF(TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA, CREATE_TIME) SELECT 'test_columns', '["table_schema", "table_name", "column_name"]', 'VERTICA', '(COMMON)', GETDATE() ''' mssql.execute(sql) '''
class SyncDimData: """ Get product and store dimension data from hub. DIM_PRODUCT and DIM_STORE save pivoted data, tall table PRODUCT and STORE save attribute that config in AP_CONFIG_META_ATTRIBUTE """ def __init__(self, context): self.context = {**context, **context["meta"]}.copy() self._hub_ids = context["hub_ids"] self._logger = context["logger"] self._db = MSOperation(meta=context["meta"]) self._dw = DWOperation(meta=context["meta"]) self._common_schema = context["meta"]["db_conn_vertica_common_schema"] # self._dw_conn_vertica = self.context["dw_conn_vertica"] self._all_dim = self.context["all_dim"] self._dim_calendar = "DIM_CALENDAR" self._dim_product = "DIM_PRODUCT" self._dim_store = "DIM_STORE" self._product_cis_src_tbl = "PS_ITEM_DIM_EXT" self._store_cis_src_tbl = "PS_STORE_DIM_EXT" self._dct_vendor_retailer_hub = {} self._dct_vendor_hub = {} self._dct_retailer_hub = {} self._all_vendor_retailer_hub = self._get_avaliable_vr() def _get_column(self): """Get attribute from config table :return: """ self._dct_table = {"DIM_PRODUCT": "", "DIM_STORE": ""} self._dct_key = {"DIM_PRODUCT": "ITEM_KEY", "DIM_STORE": "STORE_KEY"} self._dct_table_column = {"PRODUCT": [], "STORE": []} sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 1" self._logger.debug(sql) for row in self._db.query(sql): self._dct_table_column["PRODUCT"].append(row[0]) sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 2" self._logger.debug(sql) for row in self._db.query(sql): self._dct_table_column["STORE"].append(row[0]) for k, v in self._dct_table_column.items(): self._dct_table_column[k] = """'{}'""".format("', '".join(v)) def _get_source_config(self, hub_id): """Get source config :return: """ self._source_dw = SiloOperation(hub_id=hub_id, meta=self.context["meta"]) if self.context["meta"]["db_conn_vertica_silo_username"]: self._source_dw.config.json_data["configs"][ "dw.etluser.id"] = self._source_dw.username self._source_dw.config.json_data["configs"][ "dw.etluser.password"] = Crypto().encrypt( self._source_dw.password) silo_type = self._source_dw.config.get_property( "etl.silo.type", "None").upper() # TODO For hub, silo_type is MD, not find para for this, default host if silo_type in ("WMSSC", "SASSC", "WMCAT", "WMINTL"): self._active_status = "" else: self._active_status = " AND D.ACTIVE = 'T'" def _get_avaliable_vr(self): """ getting all avaliable vendor and retailer from cycle mapping table. and get related hub_id via config service. :return: dict like {'vendor,retailer': 'hub_id', ...} """ sql = "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_MAPPING " \ "UNION " \ "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_RC_MAPPING" self._logger.debug(sql) dct_vendor_retailer_hub = {} # dct_vendor_retailer_hub = dict(self._db.query(sql)) for v_r in self._db.query(sql): try: config = Config(meta=self.context["meta"], vendor_key=v_r.VENDOR_KEY, retailer_key=v_r.RETAILER_KEY) hub_id = config.get_hub_id() _key = str(v_r.VENDOR_KEY) + ',' + str(v_r.RETAILER_KEY) dct_vendor_retailer_hub[_key] = hub_id # in case there is no config returned for given vendor & retailer, then skip this vendor & retailer. except Exception as e: # self._logger.warning(str(e)) self._logger.warning( "Seems there is no silo configed for vendor: %s and retailer: %s" % (str(v_r.VENDOR_KEY), str(v_r.RETAILER_KEY))) continue return dct_vendor_retailer_hub def _get_vendor_retailer_from_hub(self, hub_id): """Get vendor retailer from table AP_ALERT_CYCLE_MAPPING, only keep the first one that appear in multiple hubs :return: """ self._str_vendor_retailer_filter = "" self._str_vendor_filter = "" self._str_retailer_filter = "" lst_vendor_retailer_filter = [] lst_vendor_filter = [] lst_retailer_filter = [] dct_vendor_retailer_hub = self._all_vendor_retailer_hub sql = ( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ " " DISTINCT VENDOR_KEY, RETAILER_KEY FROM {self._meta_schema}.CONFIG_CUSTOMERS" " WHERE VENDOR_NAME != 'Alert' AND HUB_ID = '{hub_id}'" " ORDER BY VENDOR_KEY, RETAILER_KEY".format(hub_id=hub_id, self=self)) self._logger.debug(sql) vendor_retailers = self._source_dw.query(sql) for row in vendor_retailers: _vr = "{row.VENDOR_KEY},{row.RETAILER_KEY}".format(row=row) # same vendor & retailer could have multi hubs. # So only getting the one which matches with the hub from Config service. if dct_vendor_retailer_hub.get(_vr) == hub_id or self._all_dim: if _vr not in self._dct_vendor_retailer_hub: self._dct_vendor_retailer_hub[_vr] = hub_id lst_vendor_retailer_filter.append( "({row.VENDOR_KEY},{row.RETAILER_KEY})".format( row=row)) if str(row.VENDOR_KEY) not in self._dct_vendor_hub: self._dct_vendor_hub["{}".format(row.VENDOR_KEY)] = hub_id lst_vendor_filter.append("{}".format(row.VENDOR_KEY)) if str(row.RETAILER_KEY) not in self._dct_retailer_hub: self._dct_retailer_hub["{}".format( row.RETAILER_KEY)] = hub_id lst_retailer_filter.append("{}".format(row.RETAILER_KEY)) if lst_vendor_retailer_filter: self._str_vendor_retailer_filter = ( " AND (VENDOR_KEY, RETAILER_KEY) IN ({})" "".format(",".join(lst_vendor_retailer_filter))) self._logger.debug(self._str_vendor_retailer_filter) if lst_vendor_filter: self._str_vendor_filter = (" AND VENDOR_KEY IN ({})".format( ",".join(list(set(lst_vendor_filter))))) self._logger.debug(self._str_vendor_filter) if lst_retailer_filter: self._str_retailer_filter = (" AND RETAILER_KEY IN ({})".format( ",".join(list(set(lst_retailer_filter))))) self._logger.debug(self._str_retailer_filter) def _recreate_stage_table(self): """ Creating related stage tables on IRIS DW. :return: """ lst_sql = [] for table in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]): lst_sql.append(""" DROP TABLE IF EXISTS {self._common_schema}.STAGE_{table}; CREATE TABLE {self._common_schema}.STAGE_{table} LIKE {self._common_schema}.{table} INCLUDING PROJECTIONS; """.format(self=self, table=table)) sql = ''.join(lst_sql) self._logger.info(sql) self._dw.execute(sql) # creating staging table for loading CIS source data. _sql = """ DROP TABLE IF EXISTS {cmnSchema}.STAGE_{itemTable}_CIS; CREATE TABLE {cmnSchema}.STAGE_{itemTable}_CIS LIKE {cmnSchema}.{itemTable} INCLUDING PROJECTIONS; DROP TABLE IF EXISTS {cmnSchema}.STAGE_{storeTable}_CIS; CREATE TABLE {cmnSchema}.STAGE_{storeTable}_CIS LIKE {cmnSchema}.{storeTable} INCLUDING PROJECTIONS; """.format(cmnSchema=self._common_schema, itemTable=self._dim_product, storeTable=self._dim_store) self._logger.info(_sql) self._dw.execute(_sql) def _truncate_stage_table(self): """Truncate stage table :return: """ lst_sql = [] for k in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]): lst_sql.append( "TRUNCATE TABLE {self._common_schema}.STAGE_{k}".format( self=self, k=k)) sql = ';\n'.join(lst_sql) self._logger.debug(sql) self._dw.execute(sql) def _analyze_table(self): """Analyze table :return: """ lst_sql = [] for k in self._dct_table.keys(): lst_sql.append( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.{k}')" "".format(self=self, k=k)) sql = ';\n'.join(lst_sql) self._logger.debug(sql) self._dw.execute(sql) def _export_calendar_data(self, hub_id): """ Export calendar data to common schema from source schema if no data in DIM_CALENDAR table :param hub_id: :return: """ cnt = self._dw.query_scalar( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ COUNT(*) " "FROM {self._common_schema}.{self._dim_calendar}".format( self=self)) if cnt != 0: self._logger.info( "Calendar data exists, skip hub_id {}.".format(hub_id)) else: col = ( 'PERIOD_KEY, CALENDAR_KEY, CALENDARNAME, YEAR, YEARNAME, QUARTER, QUARTERNAME, MONTH, MONTHNAME, ' 'PERIOD, PERIODNAME, WEEKENDED, WEEKENDEDNAME, WEEKBEGIN, WEEKBEGINNAME, YEARWEEKNAME, YEARWEEK, ' 'YEARMONTHWEEKNAME, LY_PERIOD_KEY, NY_PERIOD_KEY, DATE_NAME, DATE_VALUE, CAL_PERIOD_KEY, ' '"2YA_PERIOD_KEY", "3YA_PERIOD_KEY", "4YA_PERIOD_KEY"') sql = """ SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CAL.PERIOD_KEY AS PERIOD_KEY, META.CALENDAR_KEY AS CALENDAR_KEY, MAX(META.CALENDAR_NAME) AS CALENDARNAME, MAX(CAL."YEAR") AS "YEAR", MAX(CAL.YEARNAME) AS YEARNAME, MAX(CAL.YEARQUARTER) AS QUARTER, MAX(CAL.YEARQUARTERNAME) AS QUARTERNAME, MAX(CAL.YEARPERIOD) AS "MONTH", MAX(CAL.YEARPERIODNAME) AS MONTHNAME, MAX(CAL.YEARMONTH) AS PERIOD, MAX(CAL.YEARMONTHNAME) AS PERIODNAME, MAX(CAL.WEEKENDED) AS WEEKENDED, MAX(CAL.WEEKENDEDNAME) AS WEEKENDEDNAME, MAX(CAL.WEEKBEGIN) AS WEEKBEGIN, MAX(CAL.WEEKBEGINNAME) AS WEEKBEGINNAME, (((MAX(CAL.YEAR))::VARCHAR|| ' - Wk '::VARCHAR(6)) || SUBSTR(('00'::VARCHAR(2) ||(MAX(CAL.WEEK))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.WEEK))::VARCHAR)) - 2) + 1),2) ) AS YEARWEEKNAME, MAX(CAL.YEARWEEK) AS YEARWEEK, (((((MAX(CAL.YEAR))::VARCHAR|| ' - '::VARCHAR(3)) || SUBSTR (('00'::VARCHAR(2) ||(MAX(CAL.MONTH))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.MONTH))::VARCHAR)) - 2) + 1),2)) || ' Wk '::VARCHAR(4)) || SUBSTR(('00'::VARCHAR(2) ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(META.MONTH) ORDER BY CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(CAL.MONTH) ORDER BY CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR)) - 2) + 1),2)) AS YEARMONTHWEEKNAME, MAX(CAL.LY_PERIOD_KEY) LY_PERIOD_KEY, MAX(CAL.NY_PERIOD_KEY) NY_PERIOD_KEY, TO_CHAR(TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD'), 'YYYY-MM-DD') DATE_NAME, TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD') DATE_VALUE, CAL.PERIOD_KEY CAL_PERIOD_KEY, MAX("2YA_PERIOD_KEY") "2YA_PERIOD_KEY", MAX("3YA_PERIOD_KEY") "3YA_PERIOD_KEY", MAX("4YA_PERIOD_KEY") "4YA_PERIOD_KEY" FROM ( (SELECT 2 AS CALENDAR_KEY, 'STD' AS PREFIX, 1 AS "MONTH", '445 (Standard)' AS CALENDAR_NAME) META JOIN (SELECT C.PERIOD_KEY AS PERIOD_KEY, C.CALENDAR_KEY AS CALENDAR_KEY, C.YEAR, C.WEEK, C.MONTH, (C.YEAR)::VARCHAR(4) AS YEARNAME, QE.YEARQUARTER, QE.YEARQUARTERNAME, ME.YEARPERIOD AS YEARMONTH, ME.YEARPERIODNAME AS YEARMONTHNAME, ME.YEARPERIOD, ME.YEARPERIODNAME, WE.YEARWEEK AS YEARWEEK, WE.WEEKBEGIN, TO_CHAR(TO_DATE ((WE.WEEKBEGIN)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10) ) AS WEEKBEGINNAME, WE.WEEKENDED, TO_CHAR(TO_DATE ((WE.WEEKENDED)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10) ) AS WEEKENDEDNAME, C.LY_PERIOD_KEY, C.NY_PERIOD_KEY, YA2.LY_PERIOD_KEY "2YA_PERIOD_KEY", YA3.LY_PERIOD_KEY "3YA_PERIOD_KEY", YA4.LY_PERIOD_KEY "4YA_PERIOD_KEY" FROM ((({self._dim_schema}.CALENDAR_PERIOD C JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY, MIN(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKBEGIN, MAX(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKENDED, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.WEEK AS WEEK, CASE WHEN (CALENDAR_PERIOD.WEEK > 2000) THEN CALENDAR_PERIOD.WEEK ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.WEEK) END AS YEARWEEK FROM {self._dim_schema}.CALENDAR_PERIOD WHERE ((CALENDAR_PERIOD.WEEK_CODE = 'B'::CHAR(1)) OR (CALENDAR_PERIOD.WEEK_CODE = 'E'::CHAR(1))) GROUP BY CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.WEEK) WE ON ( ( (C.CALENDAR_KEY = WE.CALENDAR_KEY) AND (C.YEAR = WE.YEAR) AND (C.WEEK = WE.WEEK)))) JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.MONTH AS PERIOD, CASE WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN (CALENDAR_PERIOD.MONTH)::VARCHAR(6) ELSE CONCAT (CONCAT (CONCAT ('Period '::VARCHAR(7),(CALENDAR_PERIOD.MONTH)::VARCHAR(2)), ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4)) END AS YEARPERIODNAME, CASE WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN CALENDAR_PERIOD.MONTH ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.MONTH) END AS YEARPERIOD FROM {self._dim_schema}.CALENDAR_PERIOD WHERE ((CALENDAR_PERIOD.MONTH_CODE = 'B'::CHAR (1)) OR (CALENDAR_PERIOD.MONTH_CODE = 'E'::CHAR (1))) GROUP BY CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.MONTH) ME ON ( ( (C.CALENDAR_KEY = ME.CALENDAR_KEY) AND (C.YEAR = ME.YEAR) AND (C.MONTH = ME.PERIOD)))) JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.QUARTER AS QUARTER, CASE WHEN CALENDAR_PERIOD.QUARTER > 2000 THEN CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1), (SUBSTR((CALENDAR_PERIOD.QUARTER)::VARCHAR(6), 6))::VARCHAR(2)), ', '::VARCHAR(2)),(CALENDAR_PERIOD.QUARTER)::VARCHAR(4)) ELSE CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1),(CALENDAR_PERIOD.QUARTER)::VARCHAR(2)), ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4)) END AS YEARQUARTERNAME, CASE WHEN (CALENDAR_PERIOD.QUARTER > 2000) THEN CALENDAR_PERIOD.QUARTER ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.QUARTER) END AS YEARQUARTER FROM {self._dim_schema}.CALENDAR_PERIOD WHERE ((CALENDAR_PERIOD.QUARTER_CODE = 'B'::CHAR (1)) OR (CALENDAR_PERIOD.QUARTER_CODE = 'E'::CHAR (1))) GROUP BY CALENDAR_PERIOD.CALENDAR_KEY, CALENDAR_PERIOD.YEAR, CALENDAR_PERIOD.QUARTER) QE ON (((C.CALENDAR_KEY = QE.CALENDAR_KEY) AND (C.YEAR = QE.YEAR) AND (C.QUARTER = QE.QUARTER)))) LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA2 ON C.CALENDAR_KEY = YA2.CALENDAR_KEY AND C.LY_PERIOD_KEY = YA2.PERIOD_KEY LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA3 ON C.CALENDAR_KEY = YA3.CALENDAR_KEY AND YA2.LY_PERIOD_KEY = YA3.PERIOD_KEY LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA4 ON C.CALENDAR_KEY = YA4.CALENDAR_KEY AND YA3.LY_PERIOD_KEY = YA4.PERIOD_KEY WHERE (C.YEAR >= ((DATE_PART ('year'::VARCHAR (4),(NOW ())::TIMESTAMP))::INT- 5))) CAL ON ( (CAL.CALENDAR_KEY = META.CALENDAR_KEY)) ) GROUP BY CAL.PERIOD_KEY,META.CALENDAR_KEY; """.format(self=self) dct_sync_data = self.context.copy() dct_sync_data["source_dw"] = self._source_dw dct_sync_data["source_config"] = self._source_dw.config.json_data[ "configs"] dct_sync_data["target_dw_schema"] = self._common_schema dct_sync_data["target_dw_table"] = self._dim_calendar dct_sync_data["target_column"] = col dct_sync_data["source_sql"] = sql self._logger.debug(sql) sync_data(dct_sync_data) self._logger.info( "Sync hub_id {} dim calendar done".format(hub_id)) def _export_product_data(self, hub_id, cis_source=False): """ Export product data to common schema from source schema :param hub_id: :return: """ _target_stage_table = "STAGE_{}".format(self._dim_product) _src_item_table = "PRODUCT" if cis_source: _target_stage_table = "STAGE_{}_CIS".format(self._dim_product) _src_item_table = "STAGE_{}".format(self._product_cis_src_tbl) # Creating related staging table for CIS source in HUB dw. # Since CIS source tables are external tables, can NOT be used to join other tables directly. sql = """ DROP TABLE IF EXISTS {dimSchema}.{stgItemTable}; CREATE TABLE {dimSchema}.{stgItemTable} AS SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC_ID AS UPC, ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' AS ACTIVE FROM {dimSchema}.{srcItemTable} WHERE 0=0 {filters}; """.format(dimSchema=self._dim_schema, stgItemTable=_src_item_table, srcItemTable=self._product_cis_src_tbl, filters=self._str_vendor_retailer_filter) self._logger.info(sql) self._source_dw.execute(sql) col = ( " RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC, ITEM_DESCRIPTION, ITEM_GROUP, OSM_ITEM_NBR, OSM_ITEM_STATUS," "OSM_ITEM_TYPE, OSM_BRAND, OSM_UNIT_PRICE, OSM_CATEGORY, OSM_MAJOR_CATEGORY_NO, OSM_MAJOR_CATEGORY, " "OSM_SUB_CATEGORY_NO, OSM_SUB_CATEGORY, OSM_VENDOR_STK_NBR, OSM_VENDOR_PACK_COST, VENDOR_PACK_QTY, " "OSM_WHSE_PACK_COST, OSM_WHSE_PACK_QTY, DSD_IND, VENDOR_NAME ") sql = """ SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY, PV.UPC, AGPG.ITEM_DESCRIPTION, CASE WHEN AGPG.ITEM_GROUP IS NULL OR AGPG.ITEM_GROUP = '000000' THEN PV.UPC ELSE AGPG.ITEM_GROUP END AS ITEM_GROUP, AGPG.OSM_ITEM_NBR, AGPG.OSM_ITEM_STATUS, AGPG.OSM_ITEM_TYPE, AGPG.OSM_BRAND,AGPG.OSM_UNIT_PRICE, AGPG.OSM_CATEGORY, AGPG.OSM_MAJOR_CATEGORY_NO, AGPG.OSM_MAJOR_CATEGORY, AGPG.OSM_SUB_CATEGORY_NO, AGPG.OSM_SUB_CATEGORY, AGPG.OSM_VENDOR_STK_NBR, AGPG.OSM_VENDOR_PACK_COST, CASE WHEN length(AGPG.VENDOR_PACK_QTY) < 15 THEN CASE WHEN REGEXP_LIKE(SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15), '^[0-9]*(\.[0-9]+)?$') THEN CASE WHEN SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC <= 1 THEN 1 ELSE SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC END ELSE 1 END ELSE 1 END VENDOR_PACK_QTY, AGPG.OSM_WHSE_PACK_COST, AGPG.OSM_WHSE_PACK_QTY, CASE WHEN AGPG.OSM_ITEM_TYPE IN ('07','37') THEN '1' ELSE '0' END AS DSD_IND, CC.VENDOR_NAME FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, VENDOR_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC JOIN ( SELECT D.VENDOR_KEY, D.ITEM_KEY, MAX(D.UPC) UPC FROM {self._dim_schema}.{itemTable} D JOIN {self._dim_schema}.VENDOR V on V.VENDOR_KEY = D.VENDOR_KEY WHERE D.ITEM_KEY NOT IN (SELECT ITEM_KEY FROM {self._dim_schema}.PRODUCT_GLOBAL_FILTER) {self._active_status} GROUP BY D.VENDOR_KEY, D.ITEM_KEY ) PV ON PV.VENDOR_KEY = CC.VENDOR_KEY LEFT JOIN ( SELECT AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_GROUP_X' THEN ATTRIBVALUE END) ITEM_GROUP , MAX(CASE WHEN AGP.ATTRIBNAME = 'VENDOR_PACK_QTY_X' THEN ATTRIBVALUE END) VENDOR_PACK_QTY , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_DESCRIPTION_X' THEN ATTRIBVALUE END) ITEM_DESCRIPTION , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_NBR_X' THEN ATTRIBVALUE END) OSM_ITEM_NBR , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_STATUS_X' THEN ATTRIBVALUE END) OSM_ITEM_STATUS , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_TYPE_X' THEN ATTRIBVALUE END) OSM_ITEM_TYPE , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_QTY_X' THEN ATTRIBVALUE END) OSM_WHSE_PACK_QTY , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_NO_X' THEN ATTRIBVALUE END) OSM_MAJOR_CATEGORY_NO , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_X' THEN ATTRIBVALUE END) OSM_MAJOR_CATEGORY , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_CATEGORY_X' THEN ATTRIBVALUE END) OSM_CATEGORY , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_NO_X' THEN ATTRIBVALUE END) OSM_SUB_CATEGORY_NO , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_X' THEN ATTRIBVALUE END) OSM_SUB_CATEGORY , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_BRAND_X' THEN ATTRIBVALUE END) OSM_BRAND , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_UNIT_PRICE_X' THEN ATTRIBVALUE END) OSM_UNIT_PRICE , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_STK_NBR_X' THEN ATTRIBVALUE END) OSM_VENDOR_STK_NBR , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_PACK_COST_X' THEN ATTRIBVALUE END) OSM_VENDOR_PACK_COST , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_COST_X' THEN ATTRIBVALUE END) OSM_WHSE_PACK_COST FROM ( SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(ATTR_GROUP, '_X') ATTRIBNAME, D.ITEM_KEY, D.ATTRIBVALUE FROM {self._meta_schema}.ATTRIBUTEGROUP AG JOIN {self._dim_schema}.{itemTable} D ON D.VENDOR_KEY = AG.VENDOR_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status} WHERE AG.ATTR_GROUP IN ('ITEM_GROUP','VENDOR_PACK_QTY','ITEM_DESCRIPTION','OSM_ITEM_NBR' ,'OSM_ITEM_STATUS','OSM_ITEM_TYPE','OSM_WHSE_PACK_QTY','OSM_MAJOR_CATEGORY_NO' ,'OSM_MAJOR_CATEGORY','OSM_CATEGORY','OSM_SUB_CATEGORY_NO','OSM_SUB_CATEGORY','OSM_BRAND', 'OSM_UNIT_PRICE','OSM_VENDOR_STK_NBR','OSM_VENDOR_PACK_COST','OSM_WHSE_PACK_COST') ) AGP GROUP BY AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY ) AGPG ON AGPG.RETAILER_KEY = CC.RETAILER_KEY AND AGPG.VENDOR_KEY = PV.VENDOR_KEY AND AGPG.ITEM_KEY =PV.ITEM_KEY WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY) NOT IN (SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY FROM {self._dim_schema}.PRODUCT_DEFAULT_FILTER) """.format(self=self, itemTable=_src_item_table) dct_sync_data = self.context.copy() dct_sync_data["source_dw"] = self._source_dw dct_sync_data["source_config"] = self._source_dw.config.json_data[ "configs"] dct_sync_data["target_dw_schema"] = self._common_schema dct_sync_data["target_dw_table"] = _target_stage_table dct_sync_data["target_column"] = col dct_sync_data["source_sql"] = sql self._logger.debug(sql) sync_data(dct_sync_data) # sync tall table(not for CIS). if self._str_vendor_filter and not cis_source: self._product_columns = self._dct_table_column["PRODUCT"] col = " VENDOR_KEY, ITEM_KEY, UPC, ATTRIBNAME, ATTRIBVALUE, ACTIVE " sql = ( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.PRODUCT WHERE ATTRIBNAME " "IN ({self._product_columns}) {self._str_vendor_filter}". format(self=self, col=col)) dct_sync_data["target_dw_table"] = "STAGE_PRODUCT" dct_sync_data["target_column"] = col dct_sync_data["source_sql"] = sql self._logger.debug(sql) sync_data(dct_sync_data) def _export_store_data(self, hub_id, cis_source=False): """ Export store data to common schema table stage_<table_name> from source schema <table_name> :param hub_id: :return: """ _target_store_table = "STAGE_{}".format(self._dim_store) _src_store_table = "STORE" if cis_source: _target_store_table = "STAGE_{}_CIS".format(self._dim_store) _src_store_table = "STAGE_{}".format(self._store_cis_src_tbl) # Creating related staging table for CIS source in HUB dw. # Since CIS source tables are external tables, can NOT be used to join other tables directly. sql = """ DROP TABLE IF EXISTS {dimSchema}.{stgStoreTable}; CREATE TABLE {dimSchema}.{stgStoreTable} AS SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID AS STOREID, ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' as ACTIVE FROM {dimSchema}.{srcStoreTable} WHERE 0=0 {filters}; """.format(dimSchema=self._dim_schema, stgStoreTable=_src_store_table, srcStoreTable=self._store_cis_src_tbl, filters=self._str_vendor_retailer_filter) self._logger.info(sql) self._source_dw.execute(sql) col = ( " RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID, MARKET_CLUSTER, STATE, CITY, ZIP, OSM_REGION, " "OSM_DISTRICT, RSI_ALERT_TYPE, RETAILER_NAME, RSI_BANNER, PRIME_DC, STORE_NAME" ) sql = """ SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY, SR.STOREID, AGSG.MARKET_CLUSTER, SR.STATE, CASE WHEN SR.CITY IS NULL AND STATE IS NULL THEN 'N/A' WHEN SR.CITY IS NOT NULL AND STATE IS NULL THEN CONCAT(SR.CITY, ', N/A') WHEN SR.CITY IS NULL AND STATE IS NOT NULL THEN CONCAT('N/A, ' , SR.STATE) WHEN SR.CITY IS NOT NULL AND STATE IS NOT NULL THEN CONCAT(CONCAT(SR.CITY, ', '), SR.STATE) END CITY, SR.ZIP, AGSG.OSM_REGION, AGSG.OSM_DISTRICT, COALESCE(SR.RSI_ALERT_TYPE, '0') RSI_ALERT_TYPE, CC.RETAILER_NAME, coalesce(AGSG.RSI_BANNER, 'unmapped') AS RSI_BANNER, AGSG.PRIME_DC, AGSG.STORE_NAME FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, RETAILER_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC JOIN ( SELECT D.RETAILER_KEY, D.STORE_KEY, D.STOREID, MAX(CASE WHEN D.ATTRIBNAME = 'STATE' THEN D.ATTRIBVALUE END) STATE, MAX(CASE WHEN D.ATTRIBNAME = 'CITY' THEN D.ATTRIBVALUE END) CITY, MAX(CASE WHEN D.ATTRIBNAME = 'ZIP' THEN D.ATTRIBVALUE END) ZIP, MAX(CASE WHEN D.ATTRIBNAME = 'RSI_ALERT_TYPE' THEN D.ATTRIBVALUE END) RSI_ALERT_TYPE FROM {self._dim_schema}.{storeTable} D JOIN {self._dim_schema}.RETAILER R ON R.RETAILER_KEY = D.RETAILER_KEY WHERE STORE_KEY NOT IN (SELECT STORE_KEY FROM {self._dim_schema}.STORES_GLOBAL_FILTER) {self._active_status} GROUP BY D.RETAILER_KEY, D.STORE_KEY, D.STOREID ) SR ON SR.RETAILER_KEY = CC.RETAILER_KEY LEFT JOIN ( SELECT AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY, MAX(CASE WHEN AGS.ATTRIBNAME = 'MARKET_CLUSTER_X' THEN ATTRIBVALUE END) MARKET_CLUSTER, MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_REGION_X' THEN ATTRIBVALUE END) OSM_REGION, MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_DISTRICT_X' THEN ATTRIBVALUE END) OSM_DISTRICT, MAX(CASE WHEN AGS.ATTRIBNAME = 'RSI_BANNER_X' THEN ATTRIBVALUE END) RSI_BANNER, MAX(CASE WHEN AGS.ATTRIBNAME = 'PRIME_DC_X' THEN ATTRIBVALUE END) PRIME_DC, MAX(CASE WHEN AGS.ATTRIBNAME = 'SAR_STORE_NAME_X' THEN ATTRIBVALUE END) STORE_NAME FROM ( SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(AG.ATTR_GROUP, '_X') ATTRIBNAME, D.STORE_KEY, D.ATTRIBVALUE FROM {self._meta_schema}.ATTRIBUTEGROUP AG JOIN {self._dim_schema}.{storeTable} D ON D.RETAILER_KEY = AG.RETAILER_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status} WHERE AG.ATTR_GROUP IN ('RSI_BANNER','PRIME_DC', 'MARKET_CLUSTER', 'OSM_REGION', 'OSM_DISTRICT', 'SAR_STORE_NAME') ) AGS GROUP BY AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY ) AGSG ON AGSG.RETAILER_KEY = SR.RETAILER_KEY AND AGSG.VENDOR_KEY = CC.VENDOR_KEY AND AGSG.STORE_KEY = SR.STORE_KEY WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY) NOT IN (SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY FROM {self._dim_schema}.STORES_DEFAULT_FILTER); """.format(self=self, storeTable=_src_store_table) dct_sync_data = self.context.copy() dct_sync_data["source_dw"] = self._source_dw dct_sync_data["source_config"] = self._source_dw.config.json_data[ "configs"] dct_sync_data["target_dw_schema"] = self._common_schema dct_sync_data["target_dw_table"] = _target_store_table dct_sync_data["target_column"] = col dct_sync_data["source_sql"] = sql self._logger.debug(sql) sync_data(dct_sync_data) # sync tall table(not for CIS) if self._str_retailer_filter and not cis_source: self._store_columns = self._dct_table_column["STORE"] col = " RETAILER_KEY, STORE_KEY, STOREID, ATTRIBNAME, ATTRIBVALUE, ACTIVE " sql = ( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.STORE WHERE ATTRIBNAME " "IN ({self._store_columns}) {self._str_retailer_filter}". format(self=self, col=col)) dct_sync_data["target_dw_table"] = "STAGE_STORE" dct_sync_data["target_column"] = col dct_sync_data["source_sql"] = sql self._logger.debug(sql) sync_data(dct_sync_data) def _load_product_data_cis(self, hub_id): """ Sync data from CIS source. if the same item_key exists in both CIS and NXG HUBs, the CIS one should override the one in NXG side. :param hub_id: :return: """ # dump CIS data into cis staging table: stage_dim_product_cis self._export_product_data(hub_id=hub_id, cis_source=True) # Below part is moved from method self._export_product_data. Put it here when all product data are ready. # Override nxg items with CIS if there are duplicated items on both NXG and CIS sql = """ DELETE FROM {cmnSchema}.STAGE_{itemTable} WHERE (vendor_key, retailer_key, item_key) IN (SELECT vendor_key, retailer_key, item_key FROM {cmnSchema}.STAGE_{itemTable}_CIS); INSERT INTO {cmnSchema}.STAGE_{itemTable} SELECT * FROM {cmnSchema}.STAGE_{itemTable}_CIS; """.format(itemTable=self._dim_product, cmnSchema=self._common_schema) self._logger.info(sql) self._dw.execute(sql) # updating column OSM_MAJOR_CATEGORY when NXG and CIS sources are all done. sql = """ DROP TABLE IF EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY; CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ SELECT distinct OSM_SUB_CATEGORY_NO,MAX(OSM_SUB_CATEGORY) as OSM_SUB_CATEGORY FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_SUB_CATEGORY_NO; DROP TABLE IF EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY; CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ SELECT distinct OSM_MAJOR_CATEGORY_NO,MAX(OSM_MAJOR_CATEGORY) as OSM_MAJOR_CATEGORY FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_MAJOR_CATEGORY_NO; """.format(self=self) self._logger.info(sql) self._dw.execute(sql) sql = (""" SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.STAGE_{self._dim_product}'); INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_SUB_CATEGORY_CONSISTENCY SELECT a.OSM_SUB_CATEGORY_NO, MAX(a.OSM_SUB_CATEGORY) AS OSM_SUB_CATEGORY FROM {self._common_schema}.STAGE_{self._dim_product} a LEFT JOIN TEMP_OSM_SUB_CATEGORY_CONSISTENCY b on a.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO WHERE a.OSM_SUB_CATEGORY_NO IS NOT NULL AND a.OSM_SUB_CATEGORY_NO <> '' AND b.OSM_SUB_CATEGORY_NO IS NULL GROUP BY a.OSM_SUB_CATEGORY_NO; UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product} SET OSM_SUB_CATEGORY = b.OSM_SUB_CATEGORY FROM (SELECT * FROM TEMP_OSM_SUB_CATEGORY_CONSISTENCY) b WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO; /*UPDATE {self._common_schema}.STAGE_{self._dim_product} SET ITEM_GROUP = B.ITEM_GROUP FROM $schemaName.OLAP_ITEM B WHERE {self._common_schema}.STAGE_{self._dim_product}.ITEM_KEY = B.ITEM_KEY;*/ INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY SELECT a.OSM_MAJOR_CATEGORY_NO, MAX(a.OSM_MAJOR_CATEGORY) AS OSM_MAJOR_CATEGORY FROM {self._common_schema}.STAGE_{self._dim_product} a LEFT JOIN TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY b on a.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO WHERE a.OSM_MAJOR_CATEGORY_NO IS NOT NULL AND a.OSM_MAJOR_CATEGORY_NO <> '' AND b.OSM_MAJOR_CATEGORY_NO IS NULL GROUP BY a.OSM_MAJOR_CATEGORY_NO; UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product} SET OSM_MAJOR_CATEGORY = b.OSM_MAJOR_CATEGORY FROM (SELECT * FROM TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY) b WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO; """.format(self=self)) self._logger.info(sql) self._dw.execute(sql) def _load_store_data_cis(self, hub_id): """ Sync data from CIS source. # if the same store_key exists in CIS and NXG HUBs, the CIS one should override the one in NXG side. :param hub_id: :return: """ # dump CIS data into cis staging table: stage_dim_store_cis self._export_store_data(hub_id=hub_id, cis_source=True) # # Override nxg stores with CIS if there are duplicated items on both NXG and CIS sql = """ DELETE FROM {cmnSchema}.STAGE_{storeTable} WHERE (vendor_key, retailer_key, store_key) IN (SELECT vendor_key, retailer_key, store_key FROM {cmnSchema}.STAGE_{storeTable}_CIS ); INSERT INTO {cmnSchema}.STAGE_{storeTable} SELECT * FROM {cmnSchema}.STAGE_{storeTable}_CIS; """.format(storeTable=self._dim_store, cmnSchema=self._common_schema) self._logger.info(sql) self._dw.execute(sql) def _swap_data(self): """ Switch partition from stage table to final table. :return: """ sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY " "FROM {self._common_schema}.STAGE_{self._dim_product}".format( self=self)) self._logger.debug(sql) for row in self._dw.query(sql): self._dw.switch_partition( schema_name=self._common_schema, table_name=self._dim_product, partition_name=row[0], stage_schema_name=self._common_schema, stage_table_name='STAGE_{self._dim_product}'.format(self=self)) sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY " "FROM {self._common_schema}.STAGE_PRODUCT".format(self=self)) self._logger.debug(sql) for row in self._dw.query(sql): self._dw.switch_partition(schema_name=self._common_schema, table_name='PRODUCT', partition_name=row[0], stage_schema_name=self._common_schema, stage_table_name='STAGE_PRODUCT') sql = ( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY " "FROM {self._common_schema}.STAGE_{self._dim_store}".format( self=self)) self._logger.debug(sql) for row in self._dw.query(sql): self._dw.switch_partition( schema_name=self._common_schema, table_name=self._dim_store, partition_name=row[0], stage_schema_name=self._common_schema, stage_table_name='STAGE_{self._dim_store}'.format(self=self)) sql = ( "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY " "FROM {self._common_schema}.STAGE_STORE".format(self=self)) self._logger.debug(sql) for row in self._dw.query(sql): self._dw.switch_partition(schema_name=self._common_schema, table_name='STORE', partition_name=row[0], stage_schema_name=self._common_schema, stage_table_name='STAGE_STORE') def _process_sync_dim_data(self): """ Process to sync dim data :return: """ try: self._get_column() self._recreate_stage_table() for hub_id in self._hub_ids: try: self._dim_schema = "DIM_{hub_id}".format(hub_id=hub_id) self._meta_schema = "METADATA_{hub_id}".format( hub_id=hub_id) self._logger.info( "Sync dim data start hub_id {hub_id}".format( hub_id=hub_id)) self._get_source_config(hub_id) self._export_calendar_data(hub_id) self._get_vendor_retailer_from_hub(hub_id) if not self._str_vendor_retailer_filter: self._logger.warning( "hub_id {} does not have any matched product and store data" .format(hub_id)) continue self._export_product_data(hub_id) self._export_store_data(hub_id) # handle CIS source. self._load_product_data_cis(hub_id) self._load_store_data_cis(hub_id) self._logger.info( "Sync hub_id {hub_id} dim data to stage table.".format( hub_id=hub_id)) except Exception as msg: self._logger.warning(msg) continue self._swap_data() # self._truncate_stage_table() self._logger.info("Sync dim data done from hubs {hub_ids}".format( hub_ids=self._hub_ids)) self._analyze_table() except Exception as msg: self._logger.error(msg) finally: pass def process(self): try: self._process_sync_dim_data() except Exception as e: self._logger.error(e) finally: if hasattr(self, "_dw"): self._dw.close_connection() if hasattr(self, "_source_dw"): self._source_dw.close_connection() if hasattr(self, "_db"): self._db.close_connection()
parse.add_option("--vendor_key", action="store", dest="vendor_key") parse.add_option("--meta", action="store", dest="meta") (options, args) = parse.parse_args() meta = json.loads(options.meta) schema_name = meta['schema'] common_schema = meta['common_schema'] #schemas = [common_schema, schema_name] schemas = [schema_name] logger = Logger(log_level="info", target="console", vendor_key=options.vendor_key, retailer_key=options.retailer_key) dw = DWOperation(meta=meta, logger=logger) try: for s in schemas: dw.execute(''' CREATE SCHEMA IF NOT EXISTS %(schema_name)s DEFAULT INCLUDE SCHEMA PRIVILEGES; GRANT USAGE ON SCHEMA %(schema_name)s TO read_internalusers; GRANT SELECT ON SCHEMA %(schema_name)s TO read_internalusers; GRANT USAGE ON SCHEMA %(schema_name)s TO write_internalusers; GRANT SELECT,INSERT,UPDATE,DELETE,TRUNCATE ON SCHEMA %(schema_name)s TO write_internalusers; ''' % {'schema_name': s}) logger.info( '%s is created, and permission has been granted for read_internalusers and write_internalusers.' % s) for env in ['uat', 'pp', 'Prod']:
def __init__(self, meta={}, to_physical_table=False, no_delete_key=False, logger=None): logger = logger if logger else Logger(log_level="info", vendor_key=-1, retailer_key=-1, module_name="Persist_Feedback") vertica_conn = DWOperation(meta=meta, logger=logger) redis_conn = RedisOperation(meta=meta, logger=logger) c = redis_conn.get_connection() column_mapping = { 'from_key': [ # Save a value from key into dataset # ['key name in dataset', 'index of key elements splitted by :'] [ 'UPDATE_TIME', 4 ] #FEEDBACK:342:6:38726000097879:20190219013621 --> get key.split(':')[4] ], 'hardcode': [ # ['key name in dataset', 'hardcode value'] ['EVENT_KEY', '-1'], ], 'from_redis': [ # ['table column', 'key name in dataset', 'table column type', 'value updatable', 'target'] # Be noted, the table column type, if the value needs to be quoted, use VARCHAR, else use NUMBER # so int, float, number -> NUMBER # varchar, date, timestamp -> VARCHAR # updatable means the value of the column would be updated if alerts already exist [ 'ALERT_ID', 'ALERT_ID', 'NUMBER', 'NON-UPDATABLE', 'TABLE|MOBILE' ], [ 'VENDOR_KEY', 'VENDOR_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE|MOBILE' ], [ 'RETAILER_KEY', 'RETAILER_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE|MOBILE' ], ['ITEM_KEY', 'ITEM_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE'], ['STORE_KEY', 'STORE_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE'], [ 'PERIOD_KEY', 'PERIOD_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE' ], [ 'STORE_REP', 'STORE_REP', 'VARCHAR', 'NON-UPDATABLE', 'TABLE' ], [ 'STORE_VISITED_PERIOD_KEY', 'STORE_VISITED_PERIOD_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE' ], [ 'FEEDBACK_DESCRIPTION', 'FEEDBACK_DESCRIPTION', 'VARCHAR', 'UPDATABLE', 'TABLE|MOBILE' ], [ 'ON_HAND_PHYSICAL_COUNT', 'ON_HAND_PHYSICAL_COUNT', 'NUMBER', 'UPDATABLE', 'TABLE|MOBILE' ], [ 'ON_HAND_CAO_COUNT', 'ON_HAND_CAO_COUNT', 'NUMBER', 'UPDATABLE', 'TABLE|MOBILE' ], ['SOURCE', 'SOURCE', 'VARCHAR', 'NON-UPDATABLE', 'TABLE'], [ 'CYCLE_KEY', 'CYCLE_KEY', 'NUMBER', 'NON-UPDATABLE', 'MOBILE' ], [ 'UPDATE_TIME', 'UPDATE_TIME', 'VARCHAR', 'NON-UPDATABLE', 'MOBILE' ], ['EVENT_KEY', 'EVENT_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE'], ] } self.vars = { "meta": meta, "logger": logger, "column_mapping": column_mapping, "vertica_conn": vertica_conn, "redis_conn": c, "to_physical_table": to_physical_table, "no_delete_key": no_delete_key } capacity = Capacity(meta) self.vars['capacity'] = capacity
class DataCleanup: """ Cleanup outdated Data For table: it will drop the table end with job id which less then job id from api /common/job/cleanup Gor data, it will drop partition set in config json """ def __init__(self, context): self.context = context.copy() self._logger = context["logger"] self.body = {"job_id": context["jobId"], "step_id": context["stepId"], "status": StepStatus.RUNNING.name, "message": "Data Cleanup start"} self._logger.info(self.body) def get_url_response(self, url, method="POST", **kwargs): """Get response from url :return: response """ self._logger.info(url) if method.upper() not in ["GET", "POST", "PUT", "DELETE"]: method = "GET" response = requests.request(method=method, url=url, verify=False, **kwargs) self._logger.info(response.json()) return response def _get_boundary_job_id(self): """Get boundary job id :return: boundary_job_id """ boundary_job_id = None response = self.get_url_response(url="{}/job/cleanup".format(self.context["meta"]["api_job_str"]), method="DELETE", headers=self.context["header"]).json() if response["status"].upper() == StepStatus.SUCCESS.name: if response["data"]: boundary_job_id = response["data"] self.body["message"] = "Boundary job id is {}".format(boundary_job_id) self._logger.info(self.body) else: self.body["message"] = "None boundary job id".format(boundary_job_id) self._logger.warning(self.body) else: self.body["message"] = response["error"] self._logger.error(response) return boundary_job_id def _get_outdated_data(self, dct_job_id): """Get outdated data :return: dictionary of outdated data """ dct = {} boundary_job_id = self._get_boundary_job_id() if boundary_job_id: for k, v in dct_job_id.items(): if int(k) < int(boundary_job_id): dct[k] = v return dct def _cleanup_table(self, config): """Cleanup dw table :return: """ dct_table = {} sql = ("SELECT /*+ LABEL(GX_IRIS_DATA_CLEANUP)*/ TABLE_SCHEMA || '.' || TABLE_NAME AS SCHEMA_TABLE" " , REGEXP_SUBSTR(TABLE_NAME, '(.+)_(\d+)$', 1, 1, '', 2) AS JOB_ID FROM TABLES " "WHERE TABLE_SCHEMA IN ( " " SELECT SCHEMA_NAME FROM SCHEMATA WHERE SCHEMA_OWNER = '{}' AND SCHEMA_NAME NOT IN ('{}'))" "AND (REGEXP_ILIKE(TABLE_NAME,'^{}$'))" "".format(self.context["meta"]["db_conn_vertica_username"], "','".join(config["excludeSchema"]), "$') OR REGEXP_ILIKE(TABLE_NAME,'^".join(config["namePattern"]))) self._logger.info(sql) for row in self._dw.query(sql): dct_table[row.JOB_ID] = row.SCHEMA_TABLE if dct_table: dct_outdated_data = self._get_outdated_data(dct_table) if dct_outdated_data: sql = "DROP TABLE IF EXISTS {} CASCADE".format(",".join(dct_outdated_data.values())) self._logger.info(sql) self._dw.execute(sql) else: self.body["message"] = "None table cleanup" self._logger.warning(self.body) else: self.body["message"] = "Not found table to cleanup" self._logger.warning(self.body) def _cleanup_data(self, config): """Cleanup dw table partition, SEQ_NUM logic relate to OSA-471 :return: """ if "SEQ_NUM" in config["partitionTable"]: sql = ("SELECT /*+ LABEL(GX_IRIS_DATA_CLEANUP)*/ DISTINCT INCIDENT_ID_START//1000000000%36500 AS SEQ_NUM " "FROM {}.DIM_INCIDENT_ID_START " "WHERE PERIOD_KEY = {}".format( self.context["meta"]["db_conn_vertica_common_schema"], (datetime.datetime.now() - datetime.timedelta(days=config["reserveDays"])).strftime("%Y%m%d"))) self._logger.info(sql) seq_num = self._dw.query_scalar(sql) if seq_num: sql = ("SELECT /*+ LABEL(GX_IRIS_DATA_CLEANUP)*/ SCHEMA_TABLE, PARTITION_KEY FROM ( " " SELECT DISTINCT PA.PARTITION_KEY, PA.TABLE_SCHEMA||'.'||PR.ANCHOR_TABLE_NAME SCHEMA_TABLE " " FROM PARTITIONS PA " " JOIN PROJECTIONS PR ON PR.PROJECTION_ID = PA.PROJECTION_ID " " WHERE PA.TABLE_SCHEMA IN ( " " SELECT SCHEMA_NAME FROM SCHEMATA " " WHERE SCHEMA_OWNER = '{}' AND SCHEMA_NAME NOT IN ('{}')) " " AND PR.ANCHOR_TABLE_NAME IN ('{}')) T " "WHERE T.PARTITION_KEY < {} " "".format(self.context["meta"]["db_conn_vertica_username"], "','".join(config["excludeSchema"]), "','".join(config["partitionTable"]['SEQ_NUM']), seq_num)) self._logger.info(sql) for row in self._dw.query(sql): sql = "SELECT /*+ LABEL(GX_IRIS_DATA_CLEANUP)*/ DROP_PARTITION('{}', {})" \ "".format(row.SCHEMA_TABLE, row.PARTITION_KEY) self._logger.info(sql) self._dw.execute(sql) else: self.body["message"] = self._logger.warning("Not found SEQ_NUM to cleanup") self._logger.warning(self.body) else: self.body["message"] = "None data cleanup" self._logger.warning(self.body) def _process_data_cleanup(self): """Process Data Cleanup :return: """ try: self._dw = DWOperation(meta=self.context["meta"]) lst_warning = [" none warning"] for cfg in eval(self.context["config_json"]): try: if cfg["type"].upper() == "TABLE": self._cleanup_table(cfg) elif cfg["type"].upper() == "DATA": self._cleanup_data(cfg) self.body["message"] = "{} cleanup done".format(cfg["type"]) self._logger.info(self.body) except Exception as msg: self.body["status"] = StepStatus.ERROR.name self.body["message"] = ("Data Cleanup error with config {} --||## {}".format(cfg, msg)) self._logger.warning(self.body) lst_warning.append(cfg) continue self.body["status"] = StepStatus.SUCCESS.name self.body["message"] = ("Data Cleanup done with config {}, {}".format(cfg, "; ".join(lst_warning))) self._logger.info(self.body) except Exception as msg: self.body["status"] = StepStatus.ERROR.name self._logger.error(msg) finally: if hasattr(self, "_dw"): self._dw.close_connection() self.body["message"] = "Data Cleanup end" self._logger.info(self.body) def process(self): self._process_data_cleanup()
def __init__(self, meta, params=None, init_flag=False, logger=None): """ # sync feedback data from RDP side to IRIS(OSA) side by incremental via event_key. # 1, only sync those vendor&retailer which applied OSA Service. # 2, for Those new vendor&retailer, copy all historical data when initialization. :param meta: [mandatory] config data from config.properties file :param params: 2 cases here. depends on whether sync rdp feedback for whole RDP or new customer. see below. 1, rdp_id: if rdp_id was given, then sync all data for this given RDP. otherwise, sync data from all related RDPs. Noted: rdp_id will be passed when calling this service via REST API. 2, vendor_key: mandatory only when init_flag is True. retailer_key: mandatory only when init_flag is True Noted: These 2 parameters will not be passed from REST API but called directly by deploy scripts. :param init_flag: if init_flag is True: then only sync feedback data for given vendor & retailer. This is used when introducing new customer. if init_flat is False: sync all customers' data from RDP periodically(e.g. sync daily). :param logger: """ self.meta = meta self._params = {} if params is None else params self._rdp_id = self._params.get("rdpId", None) self._fact_type = 'fdbk' self._init_flag = init_flag self._vendor_key = self._params.get("vendor_key", None) self._retailer_key = self._params.get("retailer_key", None) self._debug = self._params.get('debug', 'N') self._default_rdp = "RDP_AUX" self._log_file = './log/sync_fdbk_%s_%s.log' % ( self._rdp_id, datetime.datetime.now().strftime('%Y%m%d')) self.logger = logger if logger else Logger(log_level="debug", target="console|file", vendor_key=-1, retailer_key=-1, log_file=self._log_file, sql_conn=None) self.osa_app_conn = MSOperation(meta=self.meta, logger=self.logger) self.osa_dw_conn = DWOperation(meta=self.meta, logger=self.logger) self.max_event_key = None # we already know feedback table name of RDP self.source_table_rdp = "DS_FACT_FEEDBACK" # source table in RDP side. self.staging_import_table_osa = "STAGE_FACT_FEEDBACK_RDP" # used to store sync data from RDP table (same structure as table DS_FACT_FEEDBACK) self.target_table_osa = "FACT_FEEDBACK" # final table in OSA side self.capacity = Capacity(meta=meta) self.dct_sync_data = copy.deepcopy( self.meta) # required for calling sync_data module self.dct_sync_data[ "meta"] = self.meta # required for calling sync_data module self.dct_sync_data["target_osa_conn"] = self.osa_dw_conn self.dct_sync_data["target_dw_schema"] = self.meta[ 'db_conn_vertica_common_schema'] self.dct_sync_data["target_dw_table"] = self.staging_import_table_osa self.dct_sync_data["logger"] = self.logger # [True|False(default)] True: direct connection between Vertica clusters. False: using vsql. self.dct_sync_data["dw_conn_vertica"] = False # self.dct_sync_data["dw_conn_vertica"] = True self.transfer = TransferData(dct_sync_data=self.dct_sync_data)
class Feedback(object): def __init__(self, meta, params=None, init_flag=False, logger=None): """ # sync feedback data from RDP side to IRIS(OSA) side by incremental via event_key. # 1, only sync those vendor&retailer which applied OSA Service. # 2, for Those new vendor&retailer, copy all historical data when initialization. :param meta: [mandatory] config data from config.properties file :param params: 2 cases here. depends on whether sync rdp feedback for whole RDP or new customer. see below. 1, rdp_id: if rdp_id was given, then sync all data for this given RDP. otherwise, sync data from all related RDPs. Noted: rdp_id will be passed when calling this service via REST API. 2, vendor_key: mandatory only when init_flag is True. retailer_key: mandatory only when init_flag is True Noted: These 2 parameters will not be passed from REST API but called directly by deploy scripts. :param init_flag: if init_flag is True: then only sync feedback data for given vendor & retailer. This is used when introducing new customer. if init_flat is False: sync all customers' data from RDP periodically(e.g. sync daily). :param logger: """ self.meta = meta self._params = {} if params is None else params self._rdp_id = self._params.get("rdpId", None) self._fact_type = 'fdbk' self._init_flag = init_flag self._vendor_key = self._params.get("vendor_key", None) self._retailer_key = self._params.get("retailer_key", None) self._debug = self._params.get('debug', 'N') self._default_rdp = "RDP_AUX" self._log_file = './log/sync_fdbk_%s_%s.log' % ( self._rdp_id, datetime.datetime.now().strftime('%Y%m%d')) self.logger = logger if logger else Logger(log_level="debug", target="console|file", vendor_key=-1, retailer_key=-1, log_file=self._log_file, sql_conn=None) self.osa_app_conn = MSOperation(meta=self.meta, logger=self.logger) self.osa_dw_conn = DWOperation(meta=self.meta, logger=self.logger) self.max_event_key = None # we already know feedback table name of RDP self.source_table_rdp = "DS_FACT_FEEDBACK" # source table in RDP side. self.staging_import_table_osa = "STAGE_FACT_FEEDBACK_RDP" # used to store sync data from RDP table (same structure as table DS_FACT_FEEDBACK) self.target_table_osa = "FACT_FEEDBACK" # final table in OSA side self.capacity = Capacity(meta=meta) self.dct_sync_data = copy.deepcopy( self.meta) # required for calling sync_data module self.dct_sync_data[ "meta"] = self.meta # required for calling sync_data module self.dct_sync_data["target_osa_conn"] = self.osa_dw_conn self.dct_sync_data["target_dw_schema"] = self.meta[ 'db_conn_vertica_common_schema'] self.dct_sync_data["target_dw_table"] = self.staging_import_table_osa self.dct_sync_data["logger"] = self.logger # [True|False(default)] True: direct connection between Vertica clusters. False: using vsql. self.dct_sync_data["dw_conn_vertica"] = False # self.dct_sync_data["dw_conn_vertica"] = True self.transfer = TransferData(dct_sync_data=self.dct_sync_data) def _populate_source_config(self, source_config): self.logger.debug("The source config is: %s" % source_config) _src_config = {} if os.name == 'nt': _src_config["temp_file_path"] = "d:" elif os.name == 'posix': _src_config["temp_file_path"] = "/tmp" # Getting user account from config.properties file first. if self.meta.get("db_conn_vertica_rdp_username"): _src_config["dw.etluser.id"] = self.meta.get( "db_conn_vertica_rdp_username") if self.meta.get("db_conn_vertica_rdp_password"): _src_config["dw.etluser.password"] = self.meta.get( "db_conn_vertica_rdp_password") else: _pmp_pwd = get_password( username=self.meta.get("db_conn_vertica_rdp_username"), meta=self.meta) # The pwd should be encrypted in order to: 1, align with else part, 2, pass it to db.sync_data module _src_config["dw.etluser.password"] = Crypto().encrypt(_pmp_pwd) # if not configed then get them directly from RDP config. else: _src_config["dw.etluser.id"] = source_config.get("dw.etluser.id") # the pwd is encrypted _src_config["dw.etluser.password"] = source_config.get( "dw.etluser.password") # required info for calling sync_data module. _src_config["dw.server.name"] = source_config.get("dw.server.name") _src_config["dw.db.name"] = source_config.get("dw.db.name") _src_config["dw.db.portno"] = source_config.get("dw.db.portno", 5433) _src_config["dw.schema.name"] = source_config.get("dw.schema.name") self.logger.debug("srouce config is: %s" % _src_config) self.dct_sync_data["source_config"] = _src_config # Create the connection to RDP Vertica Cluster. which is the source Vertica cluster rdp_meta = copy.deepcopy(self.meta) tmp_rdp_meta = { 'db_conn_vertica_servername': _src_config["dw.server.name"], 'db_conn_vertica_port': _src_config["dw.db.portno"], 'db_conn_vertica_dbname': _src_config["dw.db.name"], 'db_conn_vertica_username': _src_config["dw.etluser.id"], 'db_conn_vertica_password': _src_config["dw.etluser.password"], 'db_conn_vertica_password_encrypted': "true" } rdp_meta.update(tmp_rdp_meta) self.logger.debug("rdp config is: %s" % rdp_meta) rdp_connection = DWOperation(meta=rdp_meta) self.dct_sync_data["source_dw"] = rdp_connection def main_process(self): try: # if not introducing new customer and _rdp_id was given, # then we will sync all feedback data from given RDP for registered users. if self._init_flag is False and self._rdp_id: try: rdp_config = Config(meta=self.meta, hub_id=self._rdp_id).json_data if not rdp_config['configs']: raise Warning( "There is no configs returned for RDP: %s." "Please check if this RDP registered in CP with below URL." "%s/properties/rdps?factType=fdbk" % (self._rdp_id, self.meta["api_config_str"])) # exit(StepStatus.SUCCESS.value) _rdp_schema = rdp_config['configs'].get('dw.schema.name') self.logger.info("Started to sync data from rdp: %s" % _rdp_schema) # self.dct_sync_data["source_config"] = rdp_config['configs'] self._populate_source_config(rdp_config['configs']) self.initialize() _flag = self.load_data() if _flag: # if no data, then no need to process & update variables table. self.process_data() sql = """ IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}') INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME) VALUES ('{eventType}', '{value}', '', getdate(), getdate()) ELSE UPDATE VARIABLES SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}',UPDATE_TIME = getdate() WHERE VARIABLE_NAME = '{eventType}' """.format(eventType=_rdp_schema, value=self.max_event_key) self.logger.info(sql) self.osa_app_conn.execute(sql) self.logger.info("Data sync done for RDP: %s" % _rdp_schema) except Exception as e: self.logger.warning(e) raise # exit(StepStatus.SUCCESS.value) # exit(0) otherwise Docker container will fail. # Else we will get all RDPs from REST API: http://10.172.36.75/config/properties/rdps?factType=fdbk # There could be multi RDPs(e.g. for SVR & WM). if so, loop all RDPs elif self._init_flag is False and self._rdp_id is None: try: rdp_configs = Config( meta=self.meta, rdp_info=True, rdp_fact_type=self._fact_type).json_data if not rdp_configs: raise Warning( "No feedback related RDP found." "Please check if any data returned from below URL." "%s/properties/rdps?factType=fdbk" % (self.meta["api_config_str"])) # exit(StepStatus.SUCCESS.value) for rdp_config in rdp_configs: _rdp_schema = rdp_config['configs'].get( 'dw.schema.name') self.logger.info("Started to sync data from rdp: %s" % _rdp_schema) # self.dct_sync_data["source_config"] = rdp_config['configs'] self._populate_source_config(rdp_config['configs']) self.initialize() _flag = self.load_data() if _flag: # if no data, then no need to process & update variables table. self.process_data() sql = """ IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}') INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME) VALUES ('{eventType}', '{value}', '', getdate(), getdate()) ELSE UPDATE VARIABLES SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}',UPDATE_TIME = getdate() WHERE VARIABLE_NAME = '{eventType}' """.format(eventType=_rdp_schema, value=self.max_event_key) self.logger.info(sql) self.osa_app_conn.execute(sql) self.logger.info("Data sync done for RDP: %s" % _rdp_schema) except Exception as e: self.logger.warning(e) raise elif self._init_flag is True: if self._vendor_key is None or self._retailer_key is None: self.logger.warning( "vendor_key and retailer_key are required when initilize feedback for new customer" ) raise ValueError # getting fdbk related rdps. try: rdp_configs = Config( meta=self.meta, rdp_info=True, rdp_fact_type=self._fact_type).json_data if not rdp_configs: self.logger.warning( "No feedback related RDP found." "Please check if any data returned from below URL." "%s/properties/rdps?factType=fdbk" % (self.meta["api_config_str"])) exit(StepStatus.SUCCESS.value) fdbk_rdps = [ str(rdp_config["rdpId"]).upper() for rdp_config in rdp_configs ] # change table name in case conflict with normal sync process self.dct_sync_data[ "target_dw_table"] = "{0}_{1}_{2}".format( self.staging_import_table_osa, self._vendor_key, self._retailer_key) _silo_config = Config( meta=self.meta, vendor_key=self._vendor_key, retailer_key=self._retailer_key).json_data _silo_type = _silo_config['configs'].get( 'etl.silo.type', 'SVR') _rdp_id = _silo_config['configs'].get('rdp.db.name') # RDP_AUX is default rdp id for feedback etl on PRODUCTION. # 1, if there is no RDP for given silo. then exit. if not _rdp_id or str(_rdp_id).strip() == '': self.logger.warning( "There is no RDP silo configed for the given vendor:%s " "and retailer:%s. So no need to sync feedback." % (self._vendor_key, self._retailer_key)) exit(StepStatus.SUCCESS.value) # 2, Getting configed RDP list, and check if there are feedback related RDPs. _tmp_rdp_lst = str(_rdp_id).upper().split(sep=",") _rdp_lst = [_tmp.strip() for _tmp in _tmp_rdp_lst] # common_rdps is RDP silo configed for syncing feedback data for given silo(vendor&retailer) common_rdps = list(set(_rdp_lst).intersection(fdbk_rdps)) if common_rdps is None: self.logger.warning( "There is no RDP silo configed for the given vendor:%s " "and retailer:%s. So no need to sync feedback." % (self._vendor_key, self._retailer_key)) exit(StepStatus.SUCCESS.value) # If there is 1 or more than 1 feedback related rdps configed, then loop them to sync feedback data, # Normally, there should be only 1. or no feedback rdp configed. for common_rdp in common_rdps: _rdp_id = common_rdp self.logger.info( "Started to sync data from rdp: %s for given vendor:%s and retailer:%s. " % (_rdp_id, self._vendor_key, self._retailer_key)) # if RDP is not RDP_AUX, Won't exit but log a warning. if _rdp_id != self._default_rdp: self.logger.warning( "Please be noted: The RDP is:%s. It is not RDP_AUX." % _rdp_id) # WM silos are also following above logic. # all hosted silo are ultilizing RDP_AUX to transfer feedback data. not sure about Walmart. # if str(_silo_type).upper() in ["WMSSC", "WMCAT", "SASSC", "WMINTL"]: # _rdp_id = self._default_rdp # WM rdp is RDP_AUX as well? rdp_config = Config(meta=self.meta, hub_id=_rdp_id).json_data if not rdp_config['configs']: self.logger.warning( "There is no configs for RDP: %s. Please check following URL:" "%s/properties/%s/%s" % (_rdp_id, self.meta["api_config_str"], _rdp_id, _rdp_id)) exit(StepStatus.SUCCESS.value) _rdp_schema = rdp_config['configs'].get( 'dw.schema.name') self.logger.info( "Started to init feedback data from rdp: %s for " "given vendor:%s and retailer:%s " % (_rdp_schema, self._vendor_key, self._retailer_key)) # self.dct_sync_data["source_config"] = rdp_config['configs'] self._populate_source_config(rdp_config['configs']) self.initialize() _flag = self.load_data() if _flag: # if no data, then no need to process. self.process_data() self.logger.info("Data sync done for RDP: %s" % _rdp_id) except Exception as e: self.logger.warning(e) self.logger.warning( "Please check if any warning or error messages when doing the initialization!" ) finally: if self.osa_app_conn: self.osa_app_conn.close_connection() if self.osa_dw_conn: self.osa_dw_conn.close_connection() def initialize(self): """ Create local temp tables , and DDLs required to process this fact type :return: """ self.logger.info("Initialize...") # recreate this table for every RDP. no need to truncate any longer. # sql = "TRUNCATE TABLE {cmnSchema}.{targetTable}"\ # .format(cmnSchema=self.dct_sync_data["target_dw_schema"], targetTable=self.staging_import_table_osa) # self.logger.info(sql) # self.osa_dw.execute(sql) sql = """ --Store data from RDP table. DROP TABLE IF EXISTS {cmnSchema}.{importTable}; CREATE TABLE {cmnSchema}.{importTable} ( EVENT_KEY int NOT NULL, RETAILER_KEY int, VENDOR_KEY int, STORE_VISIT_DATE date, PERIOD_KEY int NOT NULL, TYPE varchar(1), TYPE_DATE varchar(10), ALERT_ID int, ALERT_TYPE varchar(64), MERCHANDISER_STORE_NUMBER varchar(512), STORE_ID varchar(512), MERCHANDISER_UPC varchar(512), INNER_UPC varchar(512), MERCHANDISER varchar(100), STORE_REP varchar(1000), SOURCE varchar(1000), BEGIN_STATUS varchar(255), ACTION varchar(255), FEEDBACK_DESCRIPTION varchar(255), FEEDBACK_HOTLINEREPORTDATE date, FEEDBACK_ISININVENTORY varchar(5), ZIP_CODE varchar(64), ARTS_CHAIN_NAME varchar(255), UPC_STATUS varchar(255), MSI varchar(255) ) UNSEGMENTED ALL NODES; """.format(cmnSchema=self.dct_sync_data["target_dw_schema"], importTable=self.dct_sync_data["target_dw_table"]) self.logger.info(sql) self.osa_dw_conn.execute(sql) def load_data(self): """ # Load data from RDP table ds_fact_feedback to local temp tables. There is an column event_key which is incremental for all customers in ds_fact_feedback table. we can save the snapshot of this column to variable table, and do the incremental every time based on this column. There are few cases here: 1, Routinely, There will be a scheduled job to sync the whole feedback data for valid customers from related RDP silo. And save the snapshot of the event_key from previous loading for next incremental loading. 2, if on-boarding a new vendor & retailer customer. Getting rdp_event_key from variable for related RDP silo. (rdp_event_key is from previous loading) and then sync feedback data from related RDP silo only for this given customer when event_key < rdp_event_key. Then case1 will take care the rest of feedback data. :return: """ rdp_schema = self.dct_sync_data["source_config"].get('dw.schema.name') # rdp_aux.ds_fact_feedback source_table = "{rdpSchema}.{rdptableName}"\ .format(rdpSchema=rdp_schema, rdptableName=self.source_table_rdp) # common.stage_fact_feedback_rdp target_table = "{dwSchema}.{importTable}"\ .format(dwSchema=self.dct_sync_data["target_dw_schema"], importTable=self.dct_sync_data["target_dw_table"]) self.logger.info( "Ready to load Data from {srouceTable} to {targetTable}".format( targetTable=target_table, srouceTable=source_table)) insert_columns = " EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISIT_DATE, PERIOD_KEY, TYPE, TYPE_DATE," \ " ALERT_ID, ALERT_TYPE, MERCHANDISER_STORE_NUMBER, STORE_ID, MERCHANDISER_UPC, INNER_UPC," \ " MERCHANDISER, STORE_REP, SOURCE, BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION," \ " FEEDBACK_HOTLINEREPORTDATE, FEEDBACK_ISININVENTORY, ZIP_CODE, ARTS_CHAIN_NAME, UPC_STATUS, MSI " try: self.logger.info( "Getting the previous Event_key from last run for incremental load." ) _event_sql = "SELECT VARIABLE_VALUE FROM variables " \ "WHERE VARIABLE_NAME = '{rdpName}'".format(rdpName=rdp_schema) self.logger.info(_event_sql) event_key = self.osa_app_conn.query_scalar(_event_sql) self.logger.info( "Getting customer info which only applied OSA services as filter" ) sql = "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_MAPPING " \ "UNION " \ "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_RC_MAPPING" self.logger.info(sql) results = self.osa_app_conn.query(sql) if not results: raise Warning( "There is no data in table AP_ALERT_CYCLE_MAPPING. Please check sql: %s" % sql) # exit(StepStatus.SUCCESS.value) user_filters = [ 'SELECT ' + str(result.retailer_key) + ',' + str(result.vendor_key) for result in results ] user_filter_str = ' UNION ALL '.join(user_filters) self.logger.info("Customer filters are: %s" % user_filter_str) # incremental filter from RDP table where_sql = "EVENT_KEY > {eventKey} AND SOURCE != 'ARIA' " \ "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})"\ .format(eventKey=event_key, userFilter=user_filter_str) # TODO2DONE: how to set default value? use -1 # copy all if there is no value in variables table. if not event_key: self.logger.warning( "There is no value set in variables table for RDP:{name}, " "So copy the whole table".format(name=rdp_schema)) where_sql = " SOURCE != 'ARIA' AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})"\ .format(eventKey=event_key, userFilter=user_filter_str) event_key = -1 # check if this is the first run. if self._init_flag is True: if event_key == -1: # event_key is None self.logger.warning( "There is no event_key logged in variables table for the given RDP: %s." "So Let's wait for the routine job to sync the whole rdp feedback data" % rdp_schema) return False self.logger.info("Generating init feedback filters") where_sql = "EVENT_KEY <= {eventKey} AND SOURCE != 'ARIA' " \ "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter}) " \ "AND RETAILER_KEY={retailerKey} AND VENDOR_KEY={vendorKey} "\ .format(eventKey=event_key, userFilter=user_filter_str, retailerKey=self._retailer_key, vendorKey=self._vendor_key) self.logger.debug("The filters are: %s" % where_sql) # form the fetch query from RDP and then Insert into the target table fetch_query = """ SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ {insertQuery} FROM {sourceTable} WHERE {whereSql} """.format(insertQuery=insert_columns, sourceTable=source_table, whereSql=where_sql) self.logger.info("fetch_query is : %s" % fetch_query) self.logger.info( ">>Loading {factType} Data from event_key:{eventKey} start at: {timestamp}<<" .format(factType=self._fact_type, eventKey=event_key, timestamp=datetime.datetime.now())) self.dct_sync_data["target_column"] = insert_columns self.dct_sync_data["source_sql"] = fetch_query row_count = self.transfer.transfer_data( dct_sync_data=self.dct_sync_data) self.logger.info( ">>Done loaded {cnt} rows from event_key:{eventKey} completed at: {timestamp}<<" .format(cnt=row_count, factType=self._fact_type, eventKey=event_key, timestamp=datetime.datetime.now())) # if no data transfered, then update variables with previous value. sql = "SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ nvl(max(event_key), {oldEventKey}) " \ "FROM {schemaName}.{importTable} "\ .format(schemaName=self.dct_sync_data["target_dw_schema"], importTable=self.dct_sync_data["target_dw_table"], oldEventKey=event_key) self.logger.info(sql) self.max_event_key = self.osa_dw_conn.query_scalar(sql) # max_event_key = -1 # testing purpose if self.max_event_key == -1: self.logger.warning( "There is no feedback data in RDP table: {0}".format( source_table)) return False return True except Exception as e: self.logger.warning(e) raise finally: pass def process_data(self): """ after load_data part completes. sync data from temp table to related schemas. :return: """ try: self.logger.info("Processing feedback start...") # loop retailer to insert feedback data sql = "SELECT DISTINCT retailer_key " \ "FROM {cmnSchema}.{importTable}"\ .format(cmnSchema=self.dct_sync_data["target_dw_schema"], importTable=self.dct_sync_data["target_dw_table"]) self.logger.info(sql) retailers = self.osa_dw_conn.query(sql) if retailers.rowcount == 0: self.logger.warning( "There is no data in table {cmnSchema}.{importTable}." "It could be no incremental data. Please check fetch_query against RDP database" .format(cmnSchema=self.dct_sync_data["target_dw_schema"], importTable=self.dct_sync_data["target_dw_table"])) for retailer in retailers: retailer_key = retailer.retailer_key osa_schema = self.capacity.get_retailer_schema_name( retailer_key) # Finally, run the sql msg = "Processing fdbk data within retailer {retailerKey}:{retailerName}"\ .format(retailerKey=retailer_key, retailerName=osa_schema) self.logger.info(msg) # Normally, There should NOT be duplicated alert_id transfered by incremental. # But should consider this case here. Delete existing alertid from target table # TODO: delete could have performance issue. consider using switch partition delete_sql = "DELETE FROM {osaSchema}.{targetTable} " \ "WHERE alert_id IN (SELECT alert_id FROM {cmnSchema}.{importTable} )"\ .format(targetTable=self.target_table_osa, osaSchema=osa_schema, cmnSchema=self.dct_sync_data["target_dw_schema"], importTable=self.dct_sync_data["target_dw_table"]) self.logger.info(delete_sql) self.osa_dw_conn.execute(delete_sql) # inserting feedback data into final table fact_feedback from processed table. sql = """ INSERT INTO {osaSchema}.{targetTable} (EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISITED_PERIOD_KEY, PERIOD_KEY, ALERT_ID, STORE_KEY, MERCHANDISER_STORE_NUMBER, STORE_ID, ITEM_KEY, MERCHANDISER_UPC, UPC, MERCHANDISER, STORE_REP, SOURCE, BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION, ON_HAND_PHYSICAL_COUNT, ON_HAND_CAO_COUNT ) SELECT stage.EVENT_KEY, stage.RETAILER_KEY, stage.VENDOR_KEY, TO_CHAR(stage.STORE_VISIT_DATE, 'YYYYMMDD')::int AS STORE_VISITED_PERIOD_KEY, stage.PERIOD_KEY, stage.ALERT_ID, store.STORE_KEY AS STORE_KEY, stage.MERCHANDISER_STORE_NUMBER, COALESCE(store.STORE_ID , alert.STOREID, stage.STORE_ID) AS STORE_ID, item.ITEM_KEY AS ITEM_KEY, stage.MERCHANDISER_UPC, COALESCE(item.UPC, alert.UPC, stage.INNER_UPC, stage.MERCHANDISER_UPC) AS UPC, stage.MERCHANDISER, stage.STORE_REP, stage.SOURCE, stage.BEGIN_STATUS, stage.ACTION, stage.FEEDBACK_DESCRIPTION, 0 AS ON_HAND_PHYSICAL_COUNT, 0 AS ON_HAND_CAO_COUNT FROM {cmnSchema}.{importTable} stage LEFT JOIN {osaSchema}.FACT_PROCESSED_ALERT alert ON stage.alert_id = alert.alert_id AND alert.issuanceid = 0 AND alert.retailer_key = {retailerKey} AND stage.vendor_key = alert.vendor_key INNER JOIN {cmnSchema}.DIM_PRODUCT item ON item.retailer_key = {retailerKey} AND alert.vendor_key = item.vendor_key AND item.item_key = alert.item_key INNER JOIN {cmnSchema}.DIM_STORE store ON store.retailer_key = {retailerKey} AND alert.vendor_key = store.vendor_key AND store.store_key = alert.store_key WHERE stage.retailer_key = {retailerKey} """.format(osaSchema=osa_schema, targetTable=self.target_table_osa, cmnSchema=self.dct_sync_data["target_dw_schema"], importTable=self.dct_sync_data["target_dw_table"], retailerKey=retailer_key) self.logger.info( "SQL used to load data to related schema. %s" % sql) self.osa_dw_conn.execute(sql) self.logger.info("Processing feedback ended...") except Exception as e: self.logger.warning("Process data for RDP {0} failed: {1}".format( self._rdp_id, e)) raise finally: if self._debug.upper() == 'N': _drop_sql = "DROP TABLE IF EXISTS {schemaName}.{importTable};" \ .format(schemaName=self.dct_sync_data["target_dw_schema"], importTable=self.dct_sync_data["target_dw_table"]) self.logger.info(_drop_sql) self.osa_dw_conn.execute(_drop_sql)
class DumpMain(object): def __init__(self, meta, params, logger=None): self.meta = meta self.params = params self.file_ext = self.params.get("fileExt") if self.params.get( "fileExt") else "txt" self.zip_flag = True if str(self.params.get( "zipFlag", "N")).upper() in ["Y", "YES", "T", "TRUE"] else False self._debug = "Y" if str(self.params.get( 'debug', 'N')).upper() in ["Y", "YES", "T", "TRUE"] else 'N' __log_level = 'DEBUG' if self._debug == "Y" else 'INFO' self.logger = logger if logger else Logger(log_level="info", target="console", vendor_key=-1, retailer_key=-1, sql_conn=None) self.logger.set_level(log_level=__log_level) self.logger.set_keys(log_id="{}".format(self.params["jobId"])) self.provision_url = self.meta[ "api_provision_str"] + "/filesDelivery/layout/attributes?cycle_key=" self.capacity = Capacity(meta=self.meta) self.rsi_username = self.meta.get("fileshare_username") self.rsi_folder = self.meta.get("fileshare_folder") self.cmn_schema = self.meta.get("db_conn_vertica_common_schema", "common") self.app_conn = MSOperation(meta=self.meta) self.vertica_conn = DWOperation(meta=self.meta) self.dumper_context = self.get_context() self._layout_payload = {} def main_process(self): """ Getting AFM related parameters. 2 cases here. 1, called by AFM directly: Alerts Delivery is coupled with AFM. So it is better to call by AFM. Then we can get parameters directly from AFM via self.params 2, called via REST API. in case failure when called by AFM, we also support the REST API while we can manually trigger this. And getting params from REST API Body. :return: """ try: # Checking the cancel status from previous step. _cancel_flag = self.params.get("cancel_flag", False) if _cancel_flag: return self.params.get("message") self.logger.debug("The context is: %s" % str(self.dumper_context)) init = Initialize(self.dumper_context) afm_params = init.init_process() _cycle_key = afm_params.get("cycleKey") _retailer_name = self.capacity.get_retailer_schema_name( retailer_key=afm_params["retailerKey"]) self.dumper_context["retailer_name"] = _retailer_name self.dumper_context["schema"] = _retailer_name self.dumper_context["cycle_key"] = _cycle_key self._layout_payload = self._get_layout() self._process(afm_params) except Warning as e: return str(e) except Exception as e: self.logger.warning(str(e)) raise def get_context(self): context = { "meta": self.meta, "params": self.params, "dw_conn": self.vertica_conn, "app_conn": self.app_conn, "logger": self.logger } return context def _get_layout(self): # Getting layout fields for alert delivery. _cycle_key = self.dumper_context["cycle_key"] _delivery_layout_api = "{0}{1}".format(self.provision_url, _cycle_key) self.logger.info( "Alert delivery related columns(payload param) is not given, " "So just calling provision API: %s to retrieve them." % _delivery_layout_api) # _header = {"tokenid": "eyJhbGciOiJIUzI1NiJ9.eyJjb29raWVOYW1lIjoicnNpU3NvTmV4dEdlbiIsImNvb2tpZVZhbHVlIjoiQVFJQzV3TTJMWTRTZmN6ZmRuLVVQazA2b2NnRzVWaTlZRFc1cHZZQzF6b3djbXMuKkFBSlRTUUFDTURFQUFsTkxBQkkyTVRBNU1EUTBPRFUwTXpZeU1UWTBORFlBQWxNeEFBQS4qIiwic3RhdHVzIjoic3VjY2VzcyIsInVzZXJJZCI6ImJlbi53dUByc2ljb3JwLmxvY2FsIiwiaWF0IjoxNTQ0Njg5OTY3fQ.AzCgFFHXHo3J1M4fk-17T8fBLwReDQDb4p-DXUcBm_M"} _header = {} resp = requests.get(_delivery_layout_api, _header, verify=False) if resp.text == "invalid token": self.logger.error( "ERROR: invalid tokenid, Please update the tokenid manually. " "Then rerun this script again!!!") if resp.status_code != requests.codes.ok or str( resp.json().get("status")).lower() != "success": self.logger.warning("The response result is: %s" % resp.json()) self.logger.error("Calling API failed. Refer to API: %s" % _delivery_layout_api) # _payload = resp.json()["data"]["payload"] # e.g. list({"key1":"value1"}, {}, {}) _payload = resp.json()["data"] self.logger.debug("The layout payload is: %s" % _payload) return _payload def _gen_query(self, delivery_key): # getting the payload for given delivery key. _payload = self._layout_payload.get(str(delivery_key))["payload"] self.logger.debug("The payload is: %s" % str(_payload)) # only getting data where field preselect is True. _payload = list(filter(lambda x: x["preselect"] is True, _payload)) self.logger.debug("The filtered payload is: %s" % str(_payload)) if not _payload: self.logger.warning( "There is no layout configed for this delivery.") return None, None # It is required to order by seq, so need to convert list(dict) to list(tuple) to sort the data. # Otherwise, the following code will not be working correctly. # _payload_list should be like: e.g. [(1, 'Alert', 'ALERT_ID', 'ALERT ID', None), (), ...] # Noted: Please don't change below fields order for generating _payload_list. _payload_list = [(ele["sequence"], ele["dimension_type"], ele["Field"], ele["display_name"], ele["sort_by"]) for ele in _payload] self.logger.debug("Converted payload is: %s" % str(_payload_list)) # Fields should be listed based on sequence. In case seq is None, then putting those columns to the last. # if many elements with None value, then they will be sorted randomly. _payload_list.sort( key=lambda x: x[0] if x[0] is not None else 999) # Assuming less than 999 columns. self.logger.debug("Sorted payload is: %s" % str(_payload_list)) display_fields_lst = [ ] # getting display name like: ["alert type", ...] for displaying on delivery file. columns_with_alias = [ ] # getting fields with format like: [alert."alert_type" as "alert type", ...] table_list = set( ) # getting required table list. ("Alert", "Store", ...) for columns in _payload_list: self.logger.debug( "Current display name is: \"%s\" for column: %s" % (columns[3], columns[2])) table_list.add(str(columns[1]).lower()) if columns[3] is None: # display name could be None display_fields_lst.append(columns[2]) columns_with_alias.append(columns[1] + '."' + columns[2] + '" AS "' + columns[2] + '"') else: display_fields_lst.append(columns[3]) columns_with_alias.append(columns[1] + '."' + columns[2] + '" AS "' + columns[3] + '"') # if there is no "Alert Date" enabled, then manually added this field to the beginning. if "Alert Date".lower() not in list( map(lambda x: str(x).lower(), display_fields_lst)): display_fields_lst.insert(0, "Alert Date") columns_with_alias.insert(0, 'ALERT.period_key AS "Alert Date"') _display_fields_str = ",".join([ '"' + column + '"' for column in display_fields_lst ]) # combine them with comma(,) _required_columns_prefix_tmp = ",".join( [column for column in columns_with_alias]) # combine them # replace one mandatory fields alert_type to intervention_name. Since it is not the column name in fact table. _required_columns_prefix = re.sub( '(?i)' + re.escape('ALERT."ALERT_TYPE"'), 'type.intervention_name', _required_columns_prefix_tmp) # Getting columns which sort_by is True(which is enabled sorting). And sort it. sort_by_list = sorted(list( filter(lambda x: x[4] is True, _payload_list)), key=lambda x: x[4]) if sort_by_list: _sort_columns = ",".join([ ele[1] + '."' + ele[2] + '" DESC NULLS LAST' for ele in sort_by_list ]) else: _sort_columns = '1' self.logger.debug(_sort_columns) _sort_columns = re.sub('(?i)' + re.escape('ALERT."ALERT_TYPE"'), 'type.intervention_name', _sort_columns) _fdbk_required = True if "feedback" in table_list else False _item_required = True if "product" in table_list else False _store_required = True # mandatory # Generating the initial query. _init_sql = """ SELECT {columns}, ROW_NUMBER() OVER(ORDER BY store.STORE_ID, {sortedFields} ) AS rn FROM {schema}.FACT_PROCESSED_ALERT alert INNER JOIN {cmnSchema}.alt_meta_intervention type ON alert.InterventionKey = type.Intervention_Key """\ .format(columns=_required_columns_prefix, sortedFields=_sort_columns, schema=self.dumper_context["schema"], cmnSchema=self.cmn_schema) if _fdbk_required: _init_sql += """ LEFT JOIN {schema}.FACT_FEEDBACK Feedback ON alert.alert_id = Feedback.alert_id"""\ .format(schema=self.dumper_context["schema"]) if _item_required: _init_sql += """ INNER JOIN {cmnSchema}.dim_product Product ON alert.item_key = Product.item_key AND alert.vendor_key = Product.vendor_key AND alert.retailer_key = Product.retailer_key"""\ .format(cmnSchema=self.cmn_schema) if _store_required: _init_sql += """ INNER JOIN {cmnSchema}.dim_store Store ON alert.store_key = store.store_key AND alert.vendor_key = store.vendor_key AND alert.retailer_key = store.retailer_key"""\ .format(cmnSchema=self.cmn_schema) # always apply the period_key filter for given vendor/retailer _init_sql += """ WHERE 1=1 AND alert.IssuanceId = 0 AND (alert.vendor_key, alert.retailer_key, alert.period_key) IN ( SELECT vendor_key, retailer_key, alert_day FROM TMP_RAW_ALERTS_INFO ) """ return _display_fields_str, _init_sql def _process(self, afm_params): try: _cycle_key = afm_params.get("cycleKey") # Reading configuration from meta table under IRIS MSSQL. # Getting all owners(includes both SVR & RETAILER rule) according to given cycle_key. # the delivery file will be dumped by owner. sql = """ SELECT d.ID AS DELIVERY_KEY, d.CYCLE_KEY, d.RETAILER_KEY, d.DELIVERY_NAME, d.FILTERS, d.DELIMITER, d.OWNER, ep.SERVER, ep.EXTRACTION_FOLDER, ep.USERNAME, ep.PASSWORD, ep.MAIL_SUBJECT, ep.MAIL_BODY, ep.MAIL_RECPSCC, ep.MAIL_RECPSTO, ep.DELIVERY_TYPE FROM AP_META_DELIVERIES d INNER JOIN AP_META_ENDPOINTS ep ON d.ENDPOINT_ID = ep.ID WHERE d.cycle_key = {0} AND d.ENABLED = 'T' AND ep.ENABLED = 'T' """.format(_cycle_key) self.logger.info(sql) meta_rows = self.app_conn.query(sql) self.logger.debug("The meta data is: %s" % str(meta_rows)) if not meta_rows: raise Warning( "There is no endpoint or delivery configed. Please check meta table!" ) # There could be multi owners for the given cycle but with different filters. # This is required by PM. And we need to generate separate files for every single row. for meta_data in meta_rows: # 1, Getting the initial source query _delivery_key = meta_data.DELIVERY_KEY required_columns, _init_src_query = self._gen_query( delivery_key=_delivery_key) if required_columns is None and _init_src_query is None: self.logger.warning( "Seems no layout configed for delivery key: %s" % _delivery_key) continue self.logger.info("The initial source query is: %s" % _init_src_query) delivery_type = meta_data.DELIVERY_TYPE if str.lower(delivery_type) == 'customer': meta_data = meta_data._replace( EXTRACTION_FOLDER=self.rsi_folder, USERNAME=self.rsi_username) if meta_data.USERNAME is None: self.logger.warning( "There is no username configed for delivery key: %s" % _delivery_key) continue _pmp_pwd = get_password(username=meta_data.USERNAME, meta=self.meta) if _pmp_pwd: meta_data = meta_data._replace(PASSWORD=_pmp_pwd) self.logger.info("Start to dump & delivery for meta: %s" % str(meta_data)) _src_query = _init_src_query # 2, checking if any filters applied. (e.g alert_type, category etc.) # User might wants to dump only given alert types of data. This should be configurable. # So far, we support 2 types of filters: alert_type & category # TODO: confirm the filter format with UI team. Currently filters are configed with json format. # e.g. {"alert_type": "d-void,phantom", "category":"cat1,cat2"} _filters_raw = meta_data.FILTERS if not _filters_raw or _filters_raw == "": self.logger.info("No filters applied.") else: self.logger.info("The filters are: %s" % _filters_raw) _filters = json.loads(str(_filters_raw).lower().strip()) alert_type_str = _filters.get( "alert_type", None) # e.g. phantom,d-void,shelf oos if alert_type_str is not None and str( alert_type_str).strip() != '': alert_type = ','.join( "'" + str(ele).strip() + "'" for ele in str(alert_type_str).split(',')) _src_query += " AND type.intervention_name IN ({type})".format( type=alert_type) category_str = _filters.get("category", None) if category_str is not None and str( category_str).strip() != '': category_type = ','.join( "'" + str(ele).strip() + "'" for ele in str(category_str).split(',')) _src_query += " AND Product.OSM_CATEGORY IN ({cat_type})".format( cat_type=category_type) # The owner format should be like: owner1 or owner1,owner2,... _owners = str(meta_data.OWNER) if not _owners: # owner is the mandatory filter for every delivery. raise ValueError( "There is no owner configed in delivery meta table") _owner_in_str = ",".join("'" + ele.strip() + "'" for ele in _owners.split(",")) _src_query += " AND alert.owner IN ({owner}) ".format( owner=_owner_in_str) _final_src_query = """ SELECT {columns} FROM ({query}) x ORDER BY rn """.format(columns=required_columns, query=_src_query) self.logger.info("The final source sql is: %s" % _final_src_query) # delivery file name should be: <delivery_name>_<YYYYMMDD>.<fileExt>. e.g. <delivery_name>_20180101.txt curr_folder = os.path.dirname(os.path.realpath(__file__)) target_filename = meta_data.DELIVERY_NAME + "_" + datetime.datetime.now( ).strftime('%Y%m%d') # delivery file will be dumped to "<curr_dir>/data" folder temporarily. abs_target_filename = curr_folder + os.sep + "data" + os.sep + target_filename + '.' + self.file_ext zip_filename = curr_folder + os.sep + "data" + os.sep + target_filename + '.zip' # Getting data delimiter. e.g. ',' delimiter = str(meta_data.DELIMITER).strip() if len(delimiter) != 1: raise ValueError("delimiter should be 1 char") # start to dump data self.dumper = dd.DumpData(context=self.dumper_context) # dump data from source db self.logger.info("Dumping data into file: %s" % abs_target_filename) _dump_flag = self.dumper.dump_data( src_sql=_final_src_query, output_file=abs_target_filename, delimiter=delimiter) self.logger.debug("The dump flag is: %s" % _dump_flag) # dump alerts succeeded. if _dump_flag is True: self.logger.info("Dumping data is done!") # check the zip flag if self.zip_flag: _flat_file_size = round( os.path.getsize(abs_target_filename) / 1024 / 1024) self.logger.debug("The flat file size is: %s" % _flat_file_size) self.logger.info("zipping file: %s" % abs_target_filename) with zipfile.ZipFile(zip_filename, 'w') as z: z.write(abs_target_filename, os.path.basename(abs_target_filename)) abs_target_filename = zip_filename self.logger.info("The zip file name is: %s" % abs_target_filename) # start to send data file self.logger.info( "Starting uploading delivery file to dest folder!") self.sender = sd.SendData(context=self.dumper_context) self.sender.delivery_file(meta_data=meta_data, src_file=abs_target_filename) else: self.logger.warning( "There is no data returned or dump data failed. " "Please refer to previous log to get the related source query." ) self.logger.info("Alert delivery process is done") except Warning as e: raise except Exception: raise finally: if self.vertica_conn: self.vertica_conn.close_connection() if self.app_conn: self.app_conn.close_connection()
SEP = os.path.sep cwd = os.path.dirname(os.path.realpath(__file__)) generic_main_file = cwd + SEP + '..' + SEP + 'main.py' CONFIG_FILE = cwd + SEP + '..' + SEP + '..' + SEP + 'config' + SEP + 'config.properties' exec(open(generic_main_file).read()) _common_schema = "common" _dim_product = "DIM_PRODUCT" _dim_store = "DIM_STORE" _dim_calendar = "DIM_CALENDAR" _meta_intvn = "ALT_META_INTERVENTION" _dim_base_dir = meta.get("azure_storage_dim_root") _prefix_name = "IRIS_" _dw_conn = DWOperation(meta=meta) # Getting retailer list from dim table retailer_list_in_table = [] _retailer_sql = "SELECT DISTINCT retailer_key FROM {schema}.{table} where retailer_key in (6, 5240)"\ .format(schema=_common_schema, table=_dim_store) res = _dw_conn.query(_retailer_sql) for ele in res: retailer_list_in_table.append(ele.retailer_key) print(retailer_list_in_table) capacity = Capacity(meta=meta) azure_uploader = UploadToBlobStorage(meta=meta) # sql_for_item = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_product) # sql_for_store = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_store) # sql_for_cal = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_calendar)
class UploadToBlobStorage(object): def __init__(self, meta, logger=None, chuck_size=5000000, time_out=3600): """ :param meta: Parameters from config file :param logger: logger handler :param chuck_size: The chuck size when reading data for Pandas :param time_out: time out when uploading data to Azure Storage """ self.meta = meta self.account_name = self.meta.get("azure_storage_account_name") self.container_name = self.meta.get("azure_storage_blob_container") self.logger = logger if logger else Logger(log_level="info", target="console", vendor_key=-1, retailer_key=-1, sql_conn=None) self.account_key = get_password(username=self.account_name, meta=self.meta) self.blob_service = BlockBlobService(self.account_name, self.account_key) self.dw_conn = DWOperation(meta=self.meta) self.sql = "" self.parq_filename = "" self.local_path = "" self.chuck_size = chuck_size # 5,000,000 rows as a chuck self.time_out = time_out # time secs def upload_azure_main(self, parq_filename, sql, azure_base_dir, azure_sub_dir): self.parq_filename = parq_filename self._prepare_work() self.dump_data_to_parquet(sql) self.upload_to_blob_storage(azure_base_dir=azure_base_dir, azure_sub_dir=azure_sub_dir) def _prepare_work(self): """ Check if the local dest folder exists: The local dest path should be like: <currdir>/data/20190403/IRIS_DIM_PRODUCT/ 1, if exists, then remove existing dir. Meaning this is not the first time run this script for given table & day. That case, we should remove the existing files. 2, not exists. Then create the folder. :return: """ _day = datetime.datetime.now().strftime('%Y%m%d') self.local_path = os.path.join(os.path.realpath(os.curdir), "data", _day, self.parq_filename) _flag = os.path.exists(self.local_path) if _flag: shutil.rmtree(self.local_path) time.sleep( 0.5 ) # in case there are many files to be deleted. then below makedirs might fail. os.makedirs(self.local_path) def dump_data_to_parquet(self, sql): self.logger.info( "Loading data to Pandas DataFrame from db based on sql: %s" % self.sql) file_index = 0 for df in pd.read_sql(sql=sql, con=self.dw_conn.get_connection(), chunksize=self.chuck_size): final_parq_name = "%s-PART-%04d.parquet" % (self.parq_filename, file_index) abs_parq_file = os.path.join(self.local_path, final_parq_name) self.logger.info("Dumping DataFrame data into parquet file: %s" % abs_parq_file) df.to_parquet(abs_parq_file) self.logger.info("Done dumping data into file: %s" % abs_parq_file) # csv_file = "%s/test-PART-%04d.csv" % (self.local_path, file_index) # df.to_csv(csv_file, index=False) file_index += 1 def clear_blobs(self, folder=None): """ Remove all existing files from azure folder. :param folder: :return: """ blobs = self.blob_service.list_blobs( container_name=self.container_name, prefix=folder) for blob in blobs: self.logger.info("Deleting blob file: %s from Azure blob storage" % blob.name) self.blob_service.delete_blob(container_name=self.container_name, blob_name=blob.name) def upload_to_blob_storage(self, azure_base_dir="DIM_DATA", azure_sub_dir=""): _azure_dir = azure_base_dir + "/" + azure_sub_dir self.logger.info( "Started to clear all existing blobs from Azure under given path: %s" % _azure_dir) self.clear_blobs(folder=_azure_dir) self.logger.info("Done clear all existing blobs.") self.logger.info( "Start to upload parquet file into Azure Blob Storage") for files in glob.glob(os.path.join(self.local_path, '*.parquet')): self.logger.info("Uploading blob file: %s" % files) self.blob_service.create_blob_from_path( container_name=self.container_name, blob_name="{0}/{1}".format( _azure_dir, os.path.basename(files)), # the sep in Azure is always '/' file_path=files, timeout=self.time_out) self.logger.info("Done uploaded into Azure Blob Storage")
% (str(i + 1), vendor_key, retailer_key, schema_name)) body_sqls = [] i += 1 current_date_dt = start_date_dt + datetime.timedelta(days=i) if body_sqls: sql = head_sql + ' UNION '.join(body_sqls) dw.execute(sql) logger.info( 'Totally %s records for vendor_key=%s, retailer_key=%s have been inserted into %s.DIM_INCIDENT_ID_START' % (str(delta), vendor_key, retailer_key, schema_name)) try: dw = DWOperation(meta=meta, logger=logger) try: check_sql = 'SELECT COUNT(*) FROM %s.DIM_VENDOR_RETAILER WHERE VENDOR_KEY=%s AND RETAILER_KEY=%s' % ( schema_name, str(options.vendor_key), str(options.retailer_key)) count = dw.query_scalar(check_sql) if count == 0: insert_sql = 'INSERT INTO %s.DIM_VENDOR_RETAILER (VENDOR_KEY, RETAILER_KEY) VALUES(%s, %s)' % ( schema_name, str(options.vendor_key), str( options.retailer_key)) dw.execute(insert_sql) except pyodbc.ProgrammingError: logger.info('%s.DIM_VENDOR_RETAILER doesn' 't exist, if you run updateSQL before update' % schema_name) logger.info(
class Scd(object): def __init__(self, sql, actioner, log_detail=True, batch_size=100, meta={}, db_type=None, table_schema=None, logger=None): self.meta = meta self.actioner = actioner self.log_detail = log_detail self.batch_size = batch_size self.logger = logger if logger else Logger(log_level="info", vendor_key=-1, retailer_key=-1, module_name="scd") self.sql = sql self.raw_sql_splitted = self.sql.split(' ') self.raw_sql_splitted_trimed = [ str for str in self.raw_sql_splitted if str != '' ] self.db_type = db_type self.table_schema = table_schema self.logger.info('[SQL] %s' % self.sql) self.logger.info('[SQL splitted] %s' % self.raw_sql_splitted) self.mssql = MSOperation(meta=self.meta) self.vtc = None self.saved_records_count = 0 def set_insert_delete_update(self): cmd = self.raw_sql_splitted_trimed[0].upper() if cmd in ('DELETE', 'UPDATE', 'INSERT'): self.cmd = cmd else: raise RuntimeError('%s is not supported.' % cmd) def locate_keyword(self, keyword, sql_splitted=[]): # find the location of the keyword firstly appears in the sql # e.g. locate WHERE # UPDATE TEST SET COL = ' WHERE ' WHERE ID=1, the first ' WHERE ' is not a keyword, so return the index of the 2nd WHERE keyword = keyword.upper() base_seq_splitted = sql_splitted if sql_splitted else self.raw_sql_splitted # if sql_splitted is specified, locate keyword in sql_splitted, otherwise locate keyword in raw_sql_splitted index_keyword = [ k for k, v in enumerate(base_seq_splitted) if keyword in v.upper() ] for ind in index_keyword: count_of_single_quote_ahead_of_where = sum([ base_seq_splitted[i].count("'") for i in range(0, ind) if "'" in base_seq_splitted[i] ]) i = base_seq_splitted[ind].upper().index(keyword) count_of_single_quote_ahead_of_where += base_seq_splitted[ind][ 0:i].count("'") if count_of_single_quote_ahead_of_where % 2 == 0: # the KEYWORD we want to capture is the one has n*2 "'" ahead of it return ind return -1 # KEYWORD not found def locate_keyword_combination( self, keywords): #--mainly for parsing merge, may need to be updated # find the location of the first keyword, and to check if 2nd, 3rd keywords are just following the 1st keyword in the SQL # if match all the keywords, return the index of the first keyword # else return -1 keywords_list = [ w.upper().strip() for w in keywords.split(' ') if w.strip() != '' ] sql_splitted = [*self.raw_sql_splitted] base_ind = 0 self.logger.info('Locating %s' % keywords_list) while True: self.logger.info('Locating 1st keyword "%s"' % keywords_list[0]) ind = self.locate_keyword(keywords_list[0], sql_splitted) if ind == -1: self.logger.warning('Keyword not found.') return -1 following = [ v.upper().strip() for v in sql_splitted[ind + 1:] if v.strip() != '' ] self.logger.info('Getting following %s', following) if following[0:len(keywords_list) - 1] == keywords_list[1:]: self.logger.info('Keywords matched!') return base_ind + ind else: self.logger.warning('Keywords not matched.') sql_splitted = sql_splitted[ind + 1:] base_ind += ind def get_merge_where( self ): #--TODO, not done yet, the sql gramma "update from" for vertica and sqlserver are different, need more time to work on this index_using = self.locate_keyword('USING') index_on = self.locate_keyword('ON') index_when_match = self.locate_keyword_combination('WHEN MATCHED THEN') index_when_not_match = self.locate_keyword_combination( 'WHEN NOT MATCHED THEN') def get_where_condition(self): index_where = self.locate_keyword('WHERE') return '' if index_where == -1 else ' '.join( self.raw_sql_splitted[index_where:]) def get_table_name(self): self.set_insert_delete_update() return self.raw_sql_splitted_trimed[1].upper( ) if self.cmd == 'UPDATE' else self.raw_sql_splitted_trimed[2].upper() def get_raw_table_name(self, table_name): return table_name.split('.')[-1] def get_table_def(self, raw_table_name): sql = "SELECT ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA FROM META_SCD_TABLE_DEF WHERE TABLE_NAME='%s'" % raw_table_name if self.db_type: sql += " AND DB_TYPE='%s'" % self.db_type if self.table_schema: sql += " AND TABLE_SCHEMA='%s'" % self.table_schema rs = self.mssql.query(sql) if len(rs) == 0: raise ValueError( '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] The table is not found.' % (raw_table_name, db_type, table_schema)) elif len(rs) > 1: self.logger.info('|ID|TABLE_NAME|DB_TYPE|TABLE_SCHEMA|') for r in rs: self.logger.info( '|%s|%s|%s|%s|' % (r.ID, r.TABLE_NAME, r.DB_TYPE, r.TABLE_SCHEMA)) raise ValueError( '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] More than one records returned for the table.' % (raw_table_name, db_type, table_schema)) else: if rs[0].DB_TYPE == 'MSSQL': self.target_conn = self.mssql else: self.vtc = DWOperation(meta=self.meta) self.target_conn = self.vtc return rs[0] def get_mssql_table_ddl_def(self, db_name, table_name, table_schema): ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE FROM %s.INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = N'%s' AND TABLE_SCHEMA = N'%s' ORDER BY ORDINAL_POSITION ''' % (db_name, table_name, table_schema) rs = self.mssql.query(ddl_sql) #return dict([('"%s"'%r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs]) return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs]) def get_vertica_table_ddl_def(self, table_name, table_schema): ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE FROM COLUMNS WHERE TABLE_NAME = '%s' AND TABLE_SCHEMA = '%s' ORDER BY ORDINAL_POSITION ''' % (table_name, table_schema) rs = self.vtc.query(ddl_sql) #return dict([('"%s"'%r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs]) return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs]) def save_to_db(self, table_def, recs): pk = json.loads(table_def.PK) batch_sqls = [] columns = 'TABLE_DEF_ID, PK_VALUE, ACTION, RECORD, UPDATE_TIME, ACTIONER' if self.log_detail else 'TABLE_DEF_ID, PK_VALUE, ACTION, UPDATE_TIME, ACTIONER' counter = 0 now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") for rec in recs: pk_dict = collections.OrderedDict() for k in pk: pk_dict[k] = rec[k] pk_value = json.dumps(pk_dict).replace("'", "''") if self.log_detail: batch_sqls.append( '''SELECT %s, '%s', '%s', '%s', '%s', '%s' ''' % (table_def.ID, pk_value, self.cmd, json.dumps(rec).replace( "'", "''"), now, self.actioner)) else: batch_sqls.append( '''SELECT %s, '%s', '%s', '%s', '%s' ''' % (table_def.ID, pk_value, self.cmd, now, self.actioner)) counter += 1 if len(batch_sqls) % self.batch_size == 0: self.mssql.execute( '''INSERT INTO META_SCD_CHANGE_LOG(%s) %s''' % (columns, ' UNION ALL '.join(batch_sqls))) batch_sqls = [] self.logger.info('%s has been logged.' % counter) if len(batch_sqls) > 0: self.mssql.execute('''INSERT INTO META_SCD_CHANGE_LOG(%s) %s''' % (columns, ' UNION ALL '.join(batch_sqls))) self.logger.info('%s has been logged.' % counter) @Timer() def save_data_to_be_changed(self, table_def, where_condition): if table_def.DB_TYPE == 'MSSQL': table_schema = 'dbo' if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA table_ddl_def = self.get_mssql_table_ddl_def( self.meta['db_conn_mssql_dbname'], table_def.TABLE_NAME, table_schema) else: # vertica table_schema = self.meta[ 'db_conn_vertica_common_schema'] if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA table_ddl_def = self.get_vertica_table_ddl_def( table_def.TABLE_NAME, table_schema) if not self.log_detail: # only dump primary key columns pk = json.loads(table_def.PK) table_ddl_def = dict([(c, table_ddl_def[c]) for c in table_ddl_def if c in pk]) self.logger.info(table_ddl_def) query_columns = ','.join([ '"%s"' % c if ' ' in c else c for c in list(table_ddl_def.keys()) ]) pull_data_sql = 'SELECT %s FROM %s.%s %s' % ( query_columns, table_schema, table_def.TABLE_NAME, where_condition) self.logger.info(pull_data_sql) rs = self.target_conn.query(pull_data_sql) recs = [] for r in rs: rec = {} for c in list(table_ddl_def.keys()): #rec[c] = str(eval("r."+c)) if 'TIME' not in table_ddl_def[c].upper() and 'DATE' not in table_ddl_def[c].upper() else eval("r."+c).strftime("%Y-%m-%d %H:%M:%S") rec[c] = str(eval('r.__getattribute__("%s")' % c)) recs.append(rec.copy()) self.saved_records_count = len(recs) self.logger.info('%s records will be saved.' % self.saved_records_count) self.save_to_db(table_def, recs) @Timer() def execute_sql(self): self.target_conn.execute(self.sql) def main(self): try: try: table_name = self.get_table_name() raw_table_name = self.get_raw_table_name(table_name) table_def = self.get_table_def(raw_table_name) saved = False if self.cmd != 'INSERT': where_condition = self.get_where_condition() self.save_data_to_be_changed(table_def, where_condition) saved = True except Exception as e: self.logger.warning(traceback.format_exc()) self.logger.warning( 'Failed to save the to-be-changed data, start to executing sql.' ) self.execute_sql() return '(%(db_type)s)%(table_schema)s.%(table_name)s - impacted %(row_count)s rows, saved %(status)s' % { 'db_type': table_def.DB_TYPE, 'table_schema': table_def.TABLE_SCHEMA, 'table_name': table_def.TABLE_NAME, 'row_count': self.saved_records_count, 'status': 'N/A' if self.cmd == 'INSERT' else 'successfully' if saved else 'unsuccessfully' } finally: if self.mssql: self.mssql.close_connection() if self.vtc: self.vtc.close_connection()