Example #1
    def __init__(self, meta, logger=None, chuck_size=5000000, time_out=3600):
        """
        :param meta:  Parameters from config file
        :param logger:  logger handler
        :param chuck_size: chunk size (number of rows) used when reading data with pandas
        :param time_out: timeout in seconds when uploading data to Azure Storage
        """

        self.meta = meta
        self.account_name = self.meta.get("azure_storage_account_name")
        self.container_name = self.meta.get("azure_storage_blob_container")
        self.logger = logger if logger else Logger(log_level="info",
                                                   target="console",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   sql_conn=None)

        self.account_key = get_password(username=self.account_name,
                                        meta=self.meta)
        self.blob_service = BlockBlobService(self.account_name,
                                             self.account_key)
        self.dw_conn = DWOperation(meta=self.meta)

        self.sql = ""
        self.parq_filename = ""
        self.local_path = ""
        self.chuck_size = chuck_size  # 5,000,000 rows per chunk
        self.time_out = time_out  # timeout in seconds
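
The chuck_size value above is typically fed to pandas' chunked readers. Below is a minimal, hypothetical sketch of that pattern using an in-memory sqlite3 table as a stand-in for the warehouse connection; the table, column names, and tiny chunk size are illustrative only.

import sqlite3

import pandas as pd

# Stand-in data source; the real class reads from a data warehouse connection.
conn = sqlite3.connect(":memory:")
conn.executescript(
    "CREATE TABLE sales (item_key INTEGER, qty INTEGER);"
    "INSERT INTO sales VALUES (1, 10), (2, 20), (3, 30);"
)

chunk_size = 2  # the source defaults to 5,000,000 rows per chunk
for chunk in pd.read_sql("SELECT * FROM sales", conn, chunksize=chunk_size):
    # Each chunk is a DataFrame of at most chunk_size rows; the real code would
    # write each chunk out (for example to a local Parquet file) before uploading.
    print(len(chunk), "rows in this chunk")
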
Example #2
    def __init__(self, meta, params, logger=None):
        self.meta = meta
        self.params = params
        self.file_ext = self.params.get("fileExt") if self.params.get(
            "fileExt") else "txt"
        self.zip_flag = True if str(self.params.get(
            "zipFlag", "N")).upper() in ["Y", "YES", "T", "TRUE"] else False

        self._debug = "Y" if str(self.params.get(
            'debug', 'N')).upper() in ["Y", "YES", "T", "TRUE"] else 'N'
        __log_level = 'DEBUG' if self._debug == "Y" else 'INFO'
        self.logger = logger if logger else Logger(log_level="info",
                                                   target="console",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   sql_conn=None)
        self.logger.set_level(log_level=__log_level)
        self.logger.set_keys(log_id="{}".format(self.params["jobId"]))

        self.provision_url = self.meta[
            "api_provision_str"] + "/filesDelivery/layout/attributes?cycle_key="
        self.capacity = Capacity(meta=self.meta)
        self.rsi_username = self.meta.get("fileshare_username")
        self.rsi_folder = self.meta.get("fileshare_folder")
        self.cmn_schema = self.meta.get("db_conn_vertica_common_schema",
                                        "common")

        self.app_conn = MSOperation(meta=self.meta)
        self.vertica_conn = DWOperation(meta=self.meta)

        self.dumper_context = self.get_context()
        self._layout_payload = {}
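
The "Y"/"YES"/"T"/"TRUE" flag handling above recurs in several of these constructors; the helper below is a small sketch of the same logic (the helper name is made up, not from the source).

def parse_flag(value, default="N"):
    """Return True when the value spells an affirmative flag."""
    return str(value if value is not None else default).upper() in ("Y", "YES", "T", "TRUE")

params = {"zipFlag": "yes", "debug": "N"}
zip_flag = parse_flag(params.get("zipFlag", "N"))                        # True
log_level = "DEBUG" if parse_flag(params.get("debug", "N")) else "INFO"  # "INFO"
print(zip_flag, log_level)
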
Example #3
 def get_table_def(self, raw_table_name):
     sql = "SELECT ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA FROM META_SCD_TABLE_DEF WHERE TABLE_NAME='%s'" % raw_table_name
     if self.db_type:
         sql += " AND DB_TYPE='%s'" % self.db_type
     if self.table_schema:
         sql += " AND TABLE_SCHEMA='%s'" % self.table_schema
     rs = self.mssql.query(sql)
     if len(rs) == 0:
         raise ValueError(
             '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] The table was not found.'
             % (raw_table_name, self.db_type, self.table_schema))
     elif len(rs) > 1:
         self.logger.info('|ID|TABLE_NAME|DB_TYPE|TABLE_SCHEMA|')
         for r in rs:
             self.logger.info(
                 '|%s|%s|%s|%s|' %
                 (r.ID, r.TABLE_NAME, r.DB_TYPE, r.TABLE_SCHEMA))
         raise ValueError(
             '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] More than one record was returned for the table.'
             % (raw_table_name, self.db_type, self.table_schema))
     else:
         if rs[0].DB_TYPE == 'MSSQL':
             self.target_conn = self.mssql
         else:
             self.vtc = DWOperation(meta=self.meta)
             self.target_conn = self.vtc
         return rs[0]
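
get_table_def interpolates the table name, DB type, and schema straight into the SQL string. The sketch below performs the same lookup with DB-API placeholders instead; it uses sqlite3 purely as a stand-in, so the "?" placeholder style and the seeded row are assumptions rather than the project's actual database layer.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE META_SCD_TABLE_DEF (ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA)")
conn.execute("INSERT INTO META_SCD_TABLE_DEF VALUES (1, 'test_columns', '[]', 'VERTICA', 'COMMON')")

def get_table_def(conn, raw_table_name, db_type=None, table_schema=None):
    sql = ("SELECT ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA "
           "FROM META_SCD_TABLE_DEF WHERE TABLE_NAME = ?")
    args = [raw_table_name]
    if db_type:  # optional filters become extra placeholders
        sql += " AND DB_TYPE = ?"
        args.append(db_type)
    if table_schema:
        sql += " AND TABLE_SCHEMA = ?"
        args.append(table_schema)
    rows = conn.execute(sql, args).fetchall()
    if not rows:
        raise ValueError("table not found: %s" % raw_table_name)
    if len(rows) > 1:
        raise ValueError("more than one record returned for: %s" % raw_table_name)
    return rows[0]

print(get_table_def(conn, "test_columns", db_type="VERTICA"))
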
Example #4
 def _process_data_cleanup(self):
     """Process Data Cleanup
     :return:
     """
     try:
         self._dw = DWOperation(meta=self.context["meta"])
         lst_warning = [" none warning"]
         for cfg in eval(self.context["config_json"]):
             try:
                 if cfg["type"].upper() == "TABLE":
                     self._cleanup_table(cfg)
                 elif cfg["type"].upper() == "DATA":
                     self._cleanup_data(cfg)
                 self.body["message"] = "{} cleanup done".format(cfg["type"])
                 self._logger.info(self.body)
             except Exception as msg:
                 self.body["status"] = StepStatus.ERROR.name
                 self.body["message"] = ("Data Cleanup error with config {}  --||##  {}".format(cfg, msg))
                 self._logger.warning(self.body)
                 lst_warning.append(cfg)
                 continue
         self.body["status"] = StepStatus.SUCCESS.name
         self.body["message"] = ("Data Cleanup done with config {}, {}".format(cfg, "; ".join(lst_warning)))
         self._logger.info(self.body)
     except Exception as msg:
         self.body["status"] = StepStatus.ERROR.name
         self._logger.error(msg)
     finally:
         if hasattr(self, "_dw"):
             self._dw.close_connection()
         self.body["message"] = "Data Cleanup end"
         self._logger.info(self.body)
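
_process_data_cleanup parses config_json with eval. If the payload is plain JSON (an assumption; the source does not say), json.loads yields the same list of cleanup configs without evaluating arbitrary code. The "name" key below is illustrative.

import json

config_json = '[{"type": "TABLE", "name": "STAGE_FOO"}, {"type": "DATA", "name": "FACT_BAR"}]'

# json.loads only parses data, whereas eval() evaluates arbitrary Python expressions.
for cfg in json.loads(config_json):
    if cfg["type"].upper() == "TABLE":
        print("would clean up table:", cfg["name"])
    elif cfg["type"].upper() == "DATA":
        print("would clean up data:", cfg["name"])
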
Example #5
 def __init__(self, context):
     self.context = context
     self.meta = self.context["meta"]
     self._logger = self.context["logger"]
     self._db = MSOperation(meta=self.meta)
     self._dw = DWOperation(meta=self.meta)
     self.bundle_base_url = self.meta["api_osa_bundle_str"]
     self.alert_base_url = self.meta["api_alerts_str"]
     self._common_schema = self.meta["db_conn_vertica_common_schema"]
     self._dim_calendar = "DIM_CALENDAR"
     self._dim_product = "DIM_PRODUCT"
     self._dim_store = "DIM_STORE"
     self._meta_intvn = "ALT_META_INTERVENTION"
     self._dim_base_dir = self.meta["azure_storage_dim_root"]
     self._azure_storage_account_name = self.meta[
         "azure_storage_account_name"]
     self._azure_storage_blob_container = self.meta[
         "azure_storage_blob_container"]
     self._prefix_name = "IRIS_"
     self.azure_uploader = UploadToBlobStorage(meta=self.meta,
                                               logger=self._logger)
     self.body = {
         "job_id": self.context["jobId"],
         "step_id": self.context["stepId"],
         "status": StepStatus.RUNNING.name
     }
     self.capacity = Capacity(meta=self.meta)
     self.cycle_key = self._get_cycle_key()
Example #6
 def __init__(self, context):
     self.context = {**context, **context["meta"]}.copy()
     self._hub_ids = context["hub_ids"]
     self._logger = context["logger"]
     self._db = MSOperation(meta=context["meta"])
     self._dw = DWOperation(meta=context["meta"])
     self._common_schema = context["meta"]["db_conn_vertica_common_schema"]
     # self._dw_conn_vertica = self.context["dw_conn_vertica"]
     self._all_dim = self.context["all_dim"]
     self._dim_calendar = "DIM_CALENDAR"
     self._dim_product = "DIM_PRODUCT"
     self._dim_store = "DIM_STORE"
     self._product_cis_src_tbl = "PS_ITEM_DIM_EXT"
     self._store_cis_src_tbl = "PS_STORE_DIM_EXT"
     self._dct_vendor_retailer_hub = {}
     self._dct_vendor_hub = {}
     self._dct_retailer_hub = {}
     self._all_vendor_retailer_hub = self._get_avaliable_vr()
Example #7
    def _populate_source_config(self, source_config):
        self.logger.debug("The source config is: %s" % source_config)

        _src_config = {}
        if os.name == 'nt':
            _src_config["temp_file_path"] = "d:"
        elif os.name == 'posix':
            _src_config["temp_file_path"] = "/tmp"

        # Getting user account from config.properties file first.
        if self.meta.get("db_conn_vertica_rdp_username"):
            _src_config["dw.etluser.id"] = self.meta.get(
                "db_conn_vertica_rdp_username")
            if self.meta.get("db_conn_vertica_rdp_password"):
                _src_config["dw.etluser.password"] = self.meta.get(
                    "db_conn_vertica_rdp_password")
            else:
                _pmp_pwd = get_password(
                    username=self.meta.get("db_conn_vertica_rdp_username"),
                    meta=self.meta)
                # The password should be encrypted in order to (1) align with the else branch and (2) be passed to the db.sync_data module.
                _src_config["dw.etluser.password"] = Crypto().encrypt(_pmp_pwd)
        # If not configured, get them directly from the RDP config.
        else:
            _src_config["dw.etluser.id"] = source_config.get("dw.etluser.id")
            # the pwd is encrypted
            _src_config["dw.etluser.password"] = source_config.get(
                "dw.etluser.password")

        # required info for calling sync_data module.
        _src_config["dw.server.name"] = source_config.get("dw.server.name")
        _src_config["dw.db.name"] = source_config.get("dw.db.name")
        _src_config["dw.db.portno"] = source_config.get("dw.db.portno", 5433)
        _src_config["dw.schema.name"] = source_config.get("dw.schema.name")

        self.logger.debug("srouce config is: %s" % _src_config)
        self.dct_sync_data["source_config"] = _src_config

        # Create the connection to the RDP Vertica cluster, which is the source Vertica cluster.
        rdp_meta = copy.deepcopy(self.meta)
        tmp_rdp_meta = {
            'db_conn_vertica_servername': _src_config["dw.server.name"],
            'db_conn_vertica_port': _src_config["dw.db.portno"],
            'db_conn_vertica_dbname': _src_config["dw.db.name"],
            'db_conn_vertica_username': _src_config["dw.etluser.id"],
            'db_conn_vertica_password': _src_config["dw.etluser.password"],
            'db_conn_vertica_password_encrypted': "true"
        }
        rdp_meta.update(tmp_rdp_meta)
        self.logger.debug("rdp config is: %s" % rdp_meta)
        rdp_connection = DWOperation(meta=rdp_meta)
        self.dct_sync_data["source_dw"] = rdp_connection
Example #8
 def __init__(self, context):
     self.context = context
     self.meta = self.context["meta"]
     self._logger = self.context["logger"]
     self.body = {
         "job_id": self.context["jobId"],
         "step_id": self.context["stepId"],
         "status": StepStatus.RUNNING.name
     }
     self._db = MSOperation(meta=self.meta)
     self._dw = DWOperation(meta=self.meta)
     self._common_schema = self.meta["db_conn_vertica_common_schema"]
     self._check_spd_url = "{}/availablecycle/rc/action".format(
         self.meta["api_alerts_str"])
     self.pacific_job_def_name = "OSAPacificNotification"
Example #9
    def test_gen_vtc_test_data():
        mssql = MSOperation(meta=meta)
        vtc = DWOperation(meta=meta)
        try:
            vtc.execute('drop table common.test_columns')
        except Exception:
            print("table doesn't exist")
        vtc.execute(
            'create table common.test_columns as select * from columns')

        rs = mssql.query(
            'select id from META_SCD_TABLE_DEF where table_name=\'test_columns\''
        )
        if not rs:
            sql = '''
            INSERT INTO META_SCD_TABLE_DEF(TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA, CREATE_TIME)
            SELECT 'test_columns', '["table_schema", "table_name", "column_name"]', 'VERTICA', '(COMMON)', GETDATE()
            '''
            mssql.execute(sql)
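
A self-contained sketch of the seed-if-missing pattern this test uses (query first, insert only when nothing comes back), shown against an in-memory sqlite3 database with a simplified table layout:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE META_SCD_TABLE_DEF (TABLE_NAME TEXT, PK TEXT, DB_TYPE TEXT, TABLE_SCHEMA TEXT)")

def seed_table_def(conn, table_name):
    rs = conn.execute(
        "SELECT rowid FROM META_SCD_TABLE_DEF WHERE TABLE_NAME = ?", (table_name,)
    ).fetchall()
    if not rs:  # insert the definition only once
        conn.execute(
            "INSERT INTO META_SCD_TABLE_DEF VALUES (?, ?, ?, ?)",
            (table_name, '["table_schema", "table_name", "column_name"]', "VERTICA", "COMMON"),
        )

seed_table_def(conn, "test_columns")
seed_table_def(conn, "test_columns")  # second call is a no-op
print(conn.execute("SELECT COUNT(*) FROM META_SCD_TABLE_DEF").fetchone()[0])  # prints 1
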
Example #10
class SyncDimData:
    """
    Get product and store dimension data from the hubs. DIM_PRODUCT and DIM_STORE hold the pivoted data,
    while the tall tables PRODUCT and STORE hold the attributes configured in AP_CONFIG_META_ATTRIBUTE.
    """
    def __init__(self, context):
        self.context = {**context, **context["meta"]}.copy()
        self._hub_ids = context["hub_ids"]
        self._logger = context["logger"]
        self._db = MSOperation(meta=context["meta"])
        self._dw = DWOperation(meta=context["meta"])
        self._common_schema = context["meta"]["db_conn_vertica_common_schema"]
        # self._dw_conn_vertica = self.context["dw_conn_vertica"]
        self._all_dim = self.context["all_dim"]
        self._dim_calendar = "DIM_CALENDAR"
        self._dim_product = "DIM_PRODUCT"
        self._dim_store = "DIM_STORE"
        self._product_cis_src_tbl = "PS_ITEM_DIM_EXT"
        self._store_cis_src_tbl = "PS_STORE_DIM_EXT"
        self._dct_vendor_retailer_hub = {}
        self._dct_vendor_hub = {}
        self._dct_retailer_hub = {}
        self._all_vendor_retailer_hub = self._get_avaliable_vr()

    def _get_column(self):
        """Get attribute from config table
        :return:
        """
        self._dct_table = {"DIM_PRODUCT": "", "DIM_STORE": ""}
        self._dct_key = {"DIM_PRODUCT": "ITEM_KEY", "DIM_STORE": "STORE_KEY"}
        self._dct_table_column = {"PRODUCT": [], "STORE": []}
        sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 1"
        self._logger.debug(sql)
        for row in self._db.query(sql):
            self._dct_table_column["PRODUCT"].append(row[0])
        sql = "SELECT DISTINCT KEY_COLUMN FROM AP_CONFIG_META_ATTRIBUTE WHERE DIMENSION_TYPE = 2"
        self._logger.debug(sql)
        for row in self._db.query(sql):
            self._dct_table_column["STORE"].append(row[0])
        for k, v in self._dct_table_column.items():
            self._dct_table_column[k] = """'{}'""".format("', '".join(v))

    def _get_source_config(self, hub_id):
        """Get source config
        :return:
        """
        self._source_dw = SiloOperation(hub_id=hub_id,
                                        meta=self.context["meta"])
        if self.context["meta"]["db_conn_vertica_silo_username"]:
            self._source_dw.config.json_data["configs"][
                "dw.etluser.id"] = self._source_dw.username
            self._source_dw.config.json_data["configs"][
                "dw.etluser.password"] = Crypto().encrypt(
                    self._source_dw.password)
        silo_type = self._source_dw.config.get_property(
            "etl.silo.type", "None").upper()
        # TODO: for a hub, silo_type is MD; no parameter was found for this case, so the default branch is used.
        if silo_type in ("WMSSC", "SASSC", "WMCAT", "WMINTL"):
            self._active_status = ""
        else:
            self._active_status = " AND D.ACTIVE = 'T'"

    def _get_avaliable_vr(self):
        """
        Get all available vendors and retailers from the cycle mapping tables,
        and get related hub_id via config service.
        :return: dict like {'vendor,retailer': 'hub_id', ...}
        """

        sql = "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_MAPPING " \
              "UNION " \
              "SELECT DISTINCT VENDOR_KEY, RETAILER_KEY FROM AP_ALERT_CYCLE_RC_MAPPING"
        self._logger.debug(sql)
        dct_vendor_retailer_hub = {}
        # dct_vendor_retailer_hub = dict(self._db.query(sql))
        for v_r in self._db.query(sql):
            try:
                config = Config(meta=self.context["meta"],
                                vendor_key=v_r.VENDOR_KEY,
                                retailer_key=v_r.RETAILER_KEY)
                hub_id = config.get_hub_id()
                _key = str(v_r.VENDOR_KEY) + ',' + str(v_r.RETAILER_KEY)
                dct_vendor_retailer_hub[_key] = hub_id
            # If no config is returned for the given vendor & retailer, skip this pair.
            except Exception as e:
                # self._logger.warning(str(e))
                self._logger.warning(
                    "Seems there is no silo configed for vendor: %s and retailer: %s"
                    % (str(v_r.VENDOR_KEY), str(v_r.RETAILER_KEY)))
                continue

        return dct_vendor_retailer_hub

    def _get_vendor_retailer_from_hub(self, hub_id):
        """Get vendor retailer from table AP_ALERT_CYCLE_MAPPING, only keep the first one that appear in multiple hubs
        :return:
        """
        self._str_vendor_retailer_filter = ""
        self._str_vendor_filter = ""
        self._str_retailer_filter = ""
        lst_vendor_retailer_filter = []
        lst_vendor_filter = []
        lst_retailer_filter = []
        dct_vendor_retailer_hub = self._all_vendor_retailer_hub

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ "
            "    DISTINCT VENDOR_KEY, RETAILER_KEY FROM {self._meta_schema}.CONFIG_CUSTOMERS"
            " WHERE VENDOR_NAME != 'Alert' AND HUB_ID = '{hub_id}'"
            " ORDER BY VENDOR_KEY, RETAILER_KEY".format(hub_id=hub_id,
                                                        self=self))
        self._logger.debug(sql)
        vendor_retailers = self._source_dw.query(sql)
        for row in vendor_retailers:
            _vr = "{row.VENDOR_KEY},{row.RETAILER_KEY}".format(row=row)
            # The same vendor & retailer pair can exist in multiple hubs,
            # so only keep the one that matches the hub returned by the Config service.
            if dct_vendor_retailer_hub.get(_vr) == hub_id or self._all_dim:
                if _vr not in self._dct_vendor_retailer_hub:
                    self._dct_vendor_retailer_hub[_vr] = hub_id
                    lst_vendor_retailer_filter.append(
                        "({row.VENDOR_KEY},{row.RETAILER_KEY})".format(
                            row=row))
                if str(row.VENDOR_KEY) not in self._dct_vendor_hub:
                    self._dct_vendor_hub["{}".format(row.VENDOR_KEY)] = hub_id
                    lst_vendor_filter.append("{}".format(row.VENDOR_KEY))
                if str(row.RETAILER_KEY) not in self._dct_retailer_hub:
                    self._dct_retailer_hub["{}".format(
                        row.RETAILER_KEY)] = hub_id
                    lst_retailer_filter.append("{}".format(row.RETAILER_KEY))
        if lst_vendor_retailer_filter:
            self._str_vendor_retailer_filter = (
                " AND (VENDOR_KEY, RETAILER_KEY) IN ({})"
                "".format(",".join(lst_vendor_retailer_filter)))
            self._logger.debug(self._str_vendor_retailer_filter)
        if lst_vendor_filter:
            self._str_vendor_filter = (" AND VENDOR_KEY IN ({})".format(
                ",".join(list(set(lst_vendor_filter)))))
            self._logger.debug(self._str_vendor_filter)
        if lst_retailer_filter:
            self._str_retailer_filter = (" AND RETAILER_KEY IN ({})".format(
                ",".join(list(set(lst_retailer_filter)))))
            self._logger.debug(self._str_retailer_filter)

    def _recreate_stage_table(self):
        """
        Creating related stage tables on IRIS DW.
        :return:
        """
        lst_sql = []
        for table in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]):
            lst_sql.append("""
                DROP TABLE IF EXISTS {self._common_schema}.STAGE_{table};
                CREATE TABLE {self._common_schema}.STAGE_{table} LIKE {self._common_schema}.{table} INCLUDING PROJECTIONS;
            """.format(self=self, table=table))
        sql = ''.join(lst_sql)
        self._logger.info(sql)
        self._dw.execute(sql)

        # creating staging table for loading CIS source data.
        _sql = """
         DROP TABLE IF EXISTS {cmnSchema}.STAGE_{itemTable}_CIS;
         CREATE TABLE {cmnSchema}.STAGE_{itemTable}_CIS LIKE {cmnSchema}.{itemTable} INCLUDING PROJECTIONS;
         
         DROP TABLE IF EXISTS {cmnSchema}.STAGE_{storeTable}_CIS;
         CREATE TABLE {cmnSchema}.STAGE_{storeTable}_CIS LIKE {cmnSchema}.{storeTable} INCLUDING PROJECTIONS;
         """.format(cmnSchema=self._common_schema,
                    itemTable=self._dim_product,
                    storeTable=self._dim_store)
        self._logger.info(_sql)
        self._dw.execute(_sql)

    def _truncate_stage_table(self):
        """Truncate stage table
        :return:
        """
        lst_sql = []
        for k in (list(self._dct_table.keys()) + ["PRODUCT", "STORE"]):
            lst_sql.append(
                "TRUNCATE TABLE {self._common_schema}.STAGE_{k}".format(
                    self=self, k=k))
        sql = ';\n'.join(lst_sql)
        self._logger.debug(sql)
        self._dw.execute(sql)

    def _analyze_table(self):
        """Analyze table
        :return:
        """
        lst_sql = []
        for k in self._dct_table.keys():
            lst_sql.append(
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.{k}')"
                "".format(self=self, k=k))
        sql = ';\n'.join(lst_sql)
        self._logger.debug(sql)
        self._dw.execute(sql)

    def _export_calendar_data(self, hub_id):
        """ Export calendar data to common schema from source schema if no data in DIM_CALENDAR table
        :param hub_id:
        :return:
        """
        cnt = self._dw.query_scalar(
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ COUNT(*) "
            "FROM {self._common_schema}.{self._dim_calendar}".format(
                self=self))
        if cnt != 0:
            self._logger.info(
                "Calendar data exists, skip hub_id {}.".format(hub_id))
        else:
            col = (
                'PERIOD_KEY, CALENDAR_KEY, CALENDARNAME, YEAR, YEARNAME, QUARTER, QUARTERNAME, MONTH, MONTHNAME, '
                'PERIOD, PERIODNAME, WEEKENDED, WEEKENDEDNAME, WEEKBEGIN, WEEKBEGINNAME, YEARWEEKNAME, YEARWEEK, '
                'YEARMONTHWEEKNAME, LY_PERIOD_KEY, NY_PERIOD_KEY, DATE_NAME, DATE_VALUE, CAL_PERIOD_KEY, '
                '"2YA_PERIOD_KEY", "3YA_PERIOD_KEY", "4YA_PERIOD_KEY"')
            sql = """
            SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CAL.PERIOD_KEY AS PERIOD_KEY,
                   META.CALENDAR_KEY AS CALENDAR_KEY,
                   MAX(META.CALENDAR_NAME) AS CALENDARNAME,
                   MAX(CAL."YEAR") AS "YEAR",
                   MAX(CAL.YEARNAME) AS YEARNAME,
                   MAX(CAL.YEARQUARTER) AS QUARTER,
                   MAX(CAL.YEARQUARTERNAME) AS QUARTERNAME,
                   MAX(CAL.YEARPERIOD) AS "MONTH",
                   MAX(CAL.YEARPERIODNAME) AS MONTHNAME,
                   MAX(CAL.YEARMONTH) AS PERIOD,
                   MAX(CAL.YEARMONTHNAME) AS PERIODNAME,
                   MAX(CAL.WEEKENDED) AS WEEKENDED,
                   MAX(CAL.WEEKENDEDNAME) AS WEEKENDEDNAME,
                   MAX(CAL.WEEKBEGIN) AS WEEKBEGIN,
                   MAX(CAL.WEEKBEGINNAME) AS WEEKBEGINNAME,
                   (((MAX(CAL.YEAR))::VARCHAR|| ' - Wk '::VARCHAR(6)) || SUBSTR(('00'::VARCHAR(2) 
                   ||(MAX(CAL.WEEK))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.WEEK))::VARCHAR)) - 2) + 1),2)
                   ) AS YEARWEEKNAME,
                   MAX(CAL.YEARWEEK) AS YEARWEEK,
                   (((((MAX(CAL.YEAR))::VARCHAR|| ' - '::VARCHAR(3)) || SUBSTR (('00'::VARCHAR(2) 
                   ||(MAX(CAL.MONTH))::VARCHAR),((LENGTH(('00'::VARCHAR(2) ||(MAX(CAL.MONTH))::VARCHAR)) - 2) + 1),2)) 
                   || ' Wk '::VARCHAR(4)) || SUBSTR(('00'::VARCHAR(2) 
                   ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(META.MONTH) 
                   ORDER BY CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR),((LENGTH(('00'::VARCHAR(2) 
                   ||(CEILING(((RANK() OVER (PARTITION BY META.CALENDAR_KEY,MAX(CAL.YEAR),MAX(CAL.MONTH) ORDER BY 
                   CAL.PERIOD_KEY)*1.0) / 7::NUMERIC(18,0))))::VARCHAR)) - 2) + 1),2)) AS YEARMONTHWEEKNAME,
                   MAX(CAL.LY_PERIOD_KEY) LY_PERIOD_KEY,
                   MAX(CAL.NY_PERIOD_KEY) NY_PERIOD_KEY,
                   TO_CHAR(TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD'), 'YYYY-MM-DD') DATE_NAME,
                   TO_DATE(CAL.PERIOD_KEY::VARCHAR, 'YYYYMMDD') DATE_VALUE,
                   CAL.PERIOD_KEY CAL_PERIOD_KEY,
                   MAX("2YA_PERIOD_KEY") "2YA_PERIOD_KEY",  MAX("3YA_PERIOD_KEY") "3YA_PERIOD_KEY", 
                   MAX("4YA_PERIOD_KEY") "4YA_PERIOD_KEY"
            FROM (
                (SELECT 2 AS CALENDAR_KEY, 'STD' AS PREFIX, 1 AS "MONTH", '445 (Standard)' AS CALENDAR_NAME) META
                  JOIN (SELECT C.PERIOD_KEY AS PERIOD_KEY,
                        C.CALENDAR_KEY AS CALENDAR_KEY,
                        C.YEAR,
                        C.WEEK,
                        C.MONTH,
                        (C.YEAR)::VARCHAR(4) AS YEARNAME,
                        QE.YEARQUARTER,
                        QE.YEARQUARTERNAME,
                        ME.YEARPERIOD AS YEARMONTH,
                        ME.YEARPERIODNAME AS YEARMONTHNAME,
                        ME.YEARPERIOD,
                        ME.YEARPERIODNAME,
                        WE.YEARWEEK AS YEARWEEK,
                        WE.WEEKBEGIN,
                        TO_CHAR(TO_DATE ((WE.WEEKBEGIN)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10)
                        ) AS WEEKBEGINNAME,
                        WE.WEEKENDED,
                        TO_CHAR(TO_DATE ((WE.WEEKENDED)::VARCHAR,'YYYYMMDD'::VARCHAR(8)),'MM/DD/YYYY'::VARCHAR(10)
                        ) AS WEEKENDEDNAME,
                        C.LY_PERIOD_KEY,
                        C.NY_PERIOD_KEY,
                        YA2.LY_PERIOD_KEY "2YA_PERIOD_KEY",  YA3.LY_PERIOD_KEY "3YA_PERIOD_KEY", 
                        YA4.LY_PERIOD_KEY "4YA_PERIOD_KEY"
                    FROM ((({self._dim_schema}.CALENDAR_PERIOD C
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            MIN(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKBEGIN,
                            MAX(CALENDAR_PERIOD.PERIOD_KEY) AS WEEKENDED,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.WEEK AS WEEK,
                            CASE
                            WHEN (CALENDAR_PERIOD.WEEK > 2000) THEN CALENDAR_PERIOD.WEEK
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.WEEK)
                            END AS YEARWEEK
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.WEEK_CODE = 'B'::CHAR(1)) OR (CALENDAR_PERIOD.WEEK_CODE = 'E'::CHAR(1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.WEEK) WE
                      ON ( ( (C.CALENDAR_KEY = WE.CALENDAR_KEY)
                      AND (C.YEAR = WE.YEAR)
                      AND (C.WEEK = WE.WEEK))))
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.MONTH AS PERIOD,
                            CASE
                            WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN (CALENDAR_PERIOD.MONTH)::VARCHAR(6)
                            ELSE CONCAT (CONCAT (CONCAT ('Period '::VARCHAR(7),(CALENDAR_PERIOD.MONTH)::VARCHAR(2)), 
                                 ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4))
                            END AS YEARPERIODNAME,
                            CASE
                            WHEN (CALENDAR_PERIOD.MONTH > 2000) THEN CALENDAR_PERIOD.MONTH
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.MONTH)
                            END AS YEARPERIOD
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.MONTH_CODE = 'B'::CHAR (1)) 
                                OR (CALENDAR_PERIOD.MONTH_CODE = 'E'::CHAR (1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.MONTH) ME
                      ON ( ( (C.CALENDAR_KEY = ME.CALENDAR_KEY)
                      AND (C.YEAR = ME.YEAR)
                      AND (C.MONTH = ME.PERIOD))))
                      JOIN (SELECT CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.QUARTER AS QUARTER,
                            CASE WHEN CALENDAR_PERIOD.QUARTER > 2000 THEN CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1), 
                                (SUBSTR((CALENDAR_PERIOD.QUARTER)::VARCHAR(6), 6))::VARCHAR(2)), 
                                ', '::VARCHAR(2)),(CALENDAR_PERIOD.QUARTER)::VARCHAR(4)) 
                            ELSE CONCAT(CONCAT (CONCAT ('Q'::VARCHAR(1),(CALENDAR_PERIOD.QUARTER)::VARCHAR(2)), 
                                ', '::VARCHAR(2)),(CALENDAR_PERIOD.YEAR)::VARCHAR(4)) 
                            END AS  YEARQUARTERNAME,
                            CASE
                            WHEN (CALENDAR_PERIOD.QUARTER > 2000) THEN CALENDAR_PERIOD.QUARTER
                            ELSE ((CALENDAR_PERIOD.YEAR*100) + CALENDAR_PERIOD.QUARTER)
                            END AS YEARQUARTER
                        FROM {self._dim_schema}.CALENDAR_PERIOD
                        WHERE ((CALENDAR_PERIOD.QUARTER_CODE = 'B'::CHAR (1)) 
                                OR (CALENDAR_PERIOD.QUARTER_CODE = 'E'::CHAR (1)))
                        GROUP BY CALENDAR_PERIOD.CALENDAR_KEY,
                            CALENDAR_PERIOD.YEAR,
                            CALENDAR_PERIOD.QUARTER) QE
                      ON (((C.CALENDAR_KEY = QE.CALENDAR_KEY) AND (C.YEAR = QE.YEAR) AND (C.QUARTER = QE.QUARTER))))
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA2 
                      ON C.CALENDAR_KEY = YA2.CALENDAR_KEY AND C.LY_PERIOD_KEY = YA2.PERIOD_KEY
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA3 
                      ON C.CALENDAR_KEY = YA3.CALENDAR_KEY AND YA2.LY_PERIOD_KEY = YA3.PERIOD_KEY
                      LEFT JOIN {self._dim_schema}.CALENDAR_PERIOD YA4 
                      ON C.CALENDAR_KEY = YA4.CALENDAR_KEY AND YA3.LY_PERIOD_KEY = YA4.PERIOD_KEY        
                    WHERE (C.YEAR >= ((DATE_PART ('year'::VARCHAR (4),(NOW ())::TIMESTAMP))::INT- 5))) CAL 
                  ON ( (CAL.CALENDAR_KEY = META.CALENDAR_KEY))
            )
            GROUP BY CAL.PERIOD_KEY,META.CALENDAR_KEY;
            """.format(self=self)
            dct_sync_data = self.context.copy()
            dct_sync_data["source_dw"] = self._source_dw
            dct_sync_data["source_config"] = self._source_dw.config.json_data[
                "configs"]
            dct_sync_data["target_dw_schema"] = self._common_schema
            dct_sync_data["target_dw_table"] = self._dim_calendar
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)
            self._logger.info(
                "Sync hub_id {} dim calendar done".format(hub_id))

    def _export_product_data(self, hub_id, cis_source=False):
        """
        Export product data to common schema from source schema
        :param hub_id:
        :return:
        """

        _target_stage_table = "STAGE_{}".format(self._dim_product)
        _src_item_table = "PRODUCT"
        if cis_source:
            _target_stage_table = "STAGE_{}_CIS".format(self._dim_product)
            _src_item_table = "STAGE_{}".format(self._product_cis_src_tbl)

            # Creating related staging table for CIS source in HUB dw.
            # Since CIS source tables are external tables, they cannot be joined with other tables directly.
            sql = """
            DROP TABLE IF EXISTS {dimSchema}.{stgItemTable};
            CREATE TABLE {dimSchema}.{stgItemTable} AS 
            SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC_ID AS UPC, 
                   ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' AS ACTIVE 
            FROM {dimSchema}.{srcItemTable}
            WHERE 0=0 {filters};
            """.format(dimSchema=self._dim_schema,
                       stgItemTable=_src_item_table,
                       srcItemTable=self._product_cis_src_tbl,
                       filters=self._str_vendor_retailer_filter)
            self._logger.info(sql)
            self._source_dw.execute(sql)

        col = (
            " RETAILER_KEY, VENDOR_KEY, ITEM_KEY, UPC, ITEM_DESCRIPTION, ITEM_GROUP, OSM_ITEM_NBR, OSM_ITEM_STATUS,"
            "OSM_ITEM_TYPE, OSM_BRAND, OSM_UNIT_PRICE, OSM_CATEGORY, OSM_MAJOR_CATEGORY_NO, OSM_MAJOR_CATEGORY, "
            "OSM_SUB_CATEGORY_NO, OSM_SUB_CATEGORY, OSM_VENDOR_STK_NBR, OSM_VENDOR_PACK_COST, VENDOR_PACK_QTY, "
            "OSM_WHSE_PACK_COST, OSM_WHSE_PACK_QTY, DSD_IND, VENDOR_NAME ")
        sql = """
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
            CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY, PV.UPC, AGPG.ITEM_DESCRIPTION, 
            CASE WHEN AGPG.ITEM_GROUP IS NULL OR AGPG.ITEM_GROUP = '000000' THEN PV.UPC ELSE AGPG.ITEM_GROUP END AS 
            ITEM_GROUP, AGPG.OSM_ITEM_NBR, AGPG.OSM_ITEM_STATUS, AGPG.OSM_ITEM_TYPE, AGPG.OSM_BRAND,AGPG.OSM_UNIT_PRICE, 
            AGPG.OSM_CATEGORY, AGPG.OSM_MAJOR_CATEGORY_NO, AGPG.OSM_MAJOR_CATEGORY, AGPG.OSM_SUB_CATEGORY_NO, 
            AGPG.OSM_SUB_CATEGORY, AGPG.OSM_VENDOR_STK_NBR, AGPG.OSM_VENDOR_PACK_COST, 
            CASE WHEN length(AGPG.VENDOR_PACK_QTY) < 15
                THEN CASE WHEN REGEXP_LIKE(SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15), '^[0-9]*(\.[0-9]+)?$') 
                    THEN CASE WHEN SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC <= 1 THEN 1
                        ELSE SUBSTR(REPLACE(AGPG.VENDOR_PACK_QTY, ',', ''),0,15)::NUMERIC END 
                    ELSE 1 END 
                ELSE 1 END VENDOR_PACK_QTY, AGPG.OSM_WHSE_PACK_COST, AGPG.OSM_WHSE_PACK_QTY, 
            CASE WHEN AGPG.OSM_ITEM_TYPE IN ('07','37') THEN '1' ELSE '0' END AS DSD_IND, CC.VENDOR_NAME
        FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, VENDOR_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS
              WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC
        JOIN (
            SELECT D.VENDOR_KEY, D.ITEM_KEY, MAX(D.UPC) UPC FROM {self._dim_schema}.{itemTable} D
            JOIN {self._dim_schema}.VENDOR V on V.VENDOR_KEY = D.VENDOR_KEY
            WHERE D.ITEM_KEY NOT IN (SELECT ITEM_KEY FROM {self._dim_schema}.PRODUCT_GLOBAL_FILTER) 
            {self._active_status}
            GROUP BY D.VENDOR_KEY, D.ITEM_KEY
        ) PV ON PV.VENDOR_KEY = CC.VENDOR_KEY
        LEFT JOIN (
            SELECT AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_GROUP_X' THEN ATTRIBVALUE END) ITEM_GROUP
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'VENDOR_PACK_QTY_X' THEN ATTRIBVALUE END) VENDOR_PACK_QTY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'ITEM_DESCRIPTION_X' THEN ATTRIBVALUE END) ITEM_DESCRIPTION
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_NBR_X' THEN ATTRIBVALUE END)     OSM_ITEM_NBR
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_STATUS_X' THEN ATTRIBVALUE END)     OSM_ITEM_STATUS
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_ITEM_TYPE_X' THEN ATTRIBVALUE END)     OSM_ITEM_TYPE
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_QTY_X' THEN ATTRIBVALUE END)     OSM_WHSE_PACK_QTY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_NO_X' THEN ATTRIBVALUE END)     OSM_MAJOR_CATEGORY_NO
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_MAJOR_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_MAJOR_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_NO_X' THEN ATTRIBVALUE END)     OSM_SUB_CATEGORY_NO
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_SUB_CATEGORY_X' THEN ATTRIBVALUE END)     OSM_SUB_CATEGORY
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_BRAND_X' THEN ATTRIBVALUE END)     OSM_BRAND
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_UNIT_PRICE_X' THEN ATTRIBVALUE END)     OSM_UNIT_PRICE
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_STK_NBR_X' THEN ATTRIBVALUE END)     OSM_VENDOR_STK_NBR
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_VENDOR_PACK_COST_X' THEN ATTRIBVALUE END)     OSM_VENDOR_PACK_COST
            , MAX(CASE WHEN AGP.ATTRIBNAME = 'OSM_WHSE_PACK_COST_X' THEN ATTRIBVALUE END)     OSM_WHSE_PACK_COST
            FROM (
                SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(ATTR_GROUP, '_X') ATTRIBNAME, D.ITEM_KEY, D.ATTRIBVALUE
                      FROM {self._meta_schema}.ATTRIBUTEGROUP AG
                      JOIN {self._dim_schema}.{itemTable} D
                      ON D.VENDOR_KEY = AG.VENDOR_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status}
                      WHERE AG.ATTR_GROUP IN ('ITEM_GROUP','VENDOR_PACK_QTY','ITEM_DESCRIPTION','OSM_ITEM_NBR'
                          ,'OSM_ITEM_STATUS','OSM_ITEM_TYPE','OSM_WHSE_PACK_QTY','OSM_MAJOR_CATEGORY_NO'
                          ,'OSM_MAJOR_CATEGORY','OSM_CATEGORY','OSM_SUB_CATEGORY_NO','OSM_SUB_CATEGORY','OSM_BRAND',
                          'OSM_UNIT_PRICE','OSM_VENDOR_STK_NBR','OSM_VENDOR_PACK_COST','OSM_WHSE_PACK_COST')
            ) AGP
            GROUP BY AGP.RETAILER_KEY, AGP.VENDOR_KEY, AGP.ITEM_KEY
        ) AGPG ON AGPG.RETAILER_KEY = CC.RETAILER_KEY AND AGPG.VENDOR_KEY = PV.VENDOR_KEY AND AGPG.ITEM_KEY =PV.ITEM_KEY
        WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, PV.ITEM_KEY) NOT IN
                (SELECT RETAILER_KEY, VENDOR_KEY, ITEM_KEY FROM {self._dim_schema}.PRODUCT_DEFAULT_FILTER)
        """.format(self=self, itemTable=_src_item_table)
        dct_sync_data = self.context.copy()
        dct_sync_data["source_dw"] = self._source_dw
        dct_sync_data["source_config"] = self._source_dw.config.json_data[
            "configs"]
        dct_sync_data["target_dw_schema"] = self._common_schema
        dct_sync_data["target_dw_table"] = _target_stage_table
        dct_sync_data["target_column"] = col
        dct_sync_data["source_sql"] = sql
        self._logger.debug(sql)
        sync_data(dct_sync_data)

        # Sync the tall table (not for CIS).
        if self._str_vendor_filter and not cis_source:
            self._product_columns = self._dct_table_column["PRODUCT"]
            col = " VENDOR_KEY, ITEM_KEY, UPC, ATTRIBNAME, ATTRIBVALUE, ACTIVE "
            sql = (
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.PRODUCT WHERE ATTRIBNAME "
                "IN ({self._product_columns}) {self._str_vendor_filter}".
                format(self=self, col=col))
            dct_sync_data["target_dw_table"] = "STAGE_PRODUCT"
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)

    def _export_store_data(self, hub_id, cis_source=False):
        """
        Export store data to common schema table stage_<table_name> from source schema <table_name>
        :param hub_id:
        :return:
        """

        _target_store_table = "STAGE_{}".format(self._dim_store)
        _src_store_table = "STORE"
        if cis_source:
            _target_store_table = "STAGE_{}_CIS".format(self._dim_store)
            _src_store_table = "STAGE_{}".format(self._store_cis_src_tbl)

            # Creating related staging table for CIS source in HUB dw.
            # Since CIS source tables are external tables, they cannot be joined with other tables directly.
            sql = """
            DROP TABLE IF EXISTS {dimSchema}.{stgStoreTable};
            CREATE TABLE {dimSchema}.{stgStoreTable} AS 
            SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID AS STOREID, 
                   ATTRIBUTE_NAME as ATTRIBNAME, ATTRIBUTE_VALUE as ATTRIBVALUE, 'T' as ACTIVE 
            FROM {dimSchema}.{srcStoreTable}
            WHERE 0=0 {filters};
            """.format(dimSchema=self._dim_schema,
                       stgStoreTable=_src_store_table,
                       srcStoreTable=self._store_cis_src_tbl,
                       filters=self._str_vendor_retailer_filter)
            self._logger.info(sql)
            self._source_dw.execute(sql)

        col = (
            " RETAILER_KEY, VENDOR_KEY, STORE_KEY, STORE_ID, MARKET_CLUSTER, STATE, CITY, ZIP, OSM_REGION, "
            "OSM_DISTRICT, RSI_ALERT_TYPE, RETAILER_NAME, RSI_BANNER, PRIME_DC, STORE_NAME"
        )
        sql = """
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY, SR.STOREID, 
            AGSG.MARKET_CLUSTER, SR.STATE,
            CASE WHEN SR.CITY IS NULL AND STATE IS NULL THEN 'N/A'
                WHEN SR.CITY IS NOT NULL AND STATE IS NULL THEN CONCAT(SR.CITY, ', N/A')
                WHEN SR.CITY IS NULL AND STATE IS NOT NULL THEN CONCAT('N/A, ' , SR.STATE)
                WHEN SR.CITY IS NOT NULL AND STATE IS NOT NULL THEN CONCAT(CONCAT(SR.CITY, ', '), SR.STATE) END CITY,
            SR.ZIP, AGSG.OSM_REGION, AGSG.OSM_DISTRICT, COALESCE(SR.RSI_ALERT_TYPE, '0') RSI_ALERT_TYPE,
            CC.RETAILER_NAME, coalesce(AGSG.RSI_BANNER, 'unmapped') AS RSI_BANNER, AGSG.PRIME_DC, AGSG.STORE_NAME
        FROM (SELECT DISTINCT RETAILER_KEY, VENDOR_KEY, RETAILER_NAME FROM {self._meta_schema}.CONFIG_CUSTOMERS
              WHERE VENDOR_NAME != 'Alert' {self._str_vendor_retailer_filter}) CC
        JOIN (
            SELECT D.RETAILER_KEY, D.STORE_KEY, D.STOREID,
                MAX(CASE WHEN D.ATTRIBNAME = 'STATE' THEN D.ATTRIBVALUE END) STATE,
                MAX(CASE WHEN D.ATTRIBNAME = 'CITY' THEN D.ATTRIBVALUE END) CITY,
                MAX(CASE WHEN D.ATTRIBNAME = 'ZIP' THEN D.ATTRIBVALUE END) ZIP,
                MAX(CASE WHEN D.ATTRIBNAME = 'RSI_ALERT_TYPE' THEN D.ATTRIBVALUE END) RSI_ALERT_TYPE
            FROM {self._dim_schema}.{storeTable} D
            JOIN {self._dim_schema}.RETAILER R ON R.RETAILER_KEY = D.RETAILER_KEY
            WHERE STORE_KEY NOT IN (SELECT STORE_KEY FROM {self._dim_schema}.STORES_GLOBAL_FILTER) {self._active_status}
            GROUP BY D.RETAILER_KEY, D.STORE_KEY, D.STOREID
        ) SR ON SR.RETAILER_KEY = CC.RETAILER_KEY
        LEFT JOIN (
            SELECT AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'MARKET_CLUSTER_X' THEN ATTRIBVALUE END) MARKET_CLUSTER,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_REGION_X' THEN ATTRIBVALUE END) OSM_REGION,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'OSM_DISTRICT_X' THEN ATTRIBVALUE END) OSM_DISTRICT,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'RSI_BANNER_X' THEN ATTRIBVALUE END) RSI_BANNER,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'PRIME_DC_X' THEN ATTRIBVALUE END) PRIME_DC,
                MAX(CASE WHEN AGS.ATTRIBNAME = 'SAR_STORE_NAME_X' THEN ATTRIBVALUE END) STORE_NAME
                FROM (
                    SELECT AG.RETAILER_KEY, AG.VENDOR_KEY, CONCAT(AG.ATTR_GROUP, '_X') ATTRIBNAME, D.STORE_KEY, 
                        D.ATTRIBVALUE
                    FROM {self._meta_schema}.ATTRIBUTEGROUP AG
                    JOIN {self._dim_schema}.{storeTable} D
                    ON D.RETAILER_KEY = AG.RETAILER_KEY AND D.ATTRIBNAME = AG.ATTR_NAME {self._active_status}
                    WHERE AG.ATTR_GROUP IN ('RSI_BANNER','PRIME_DC', 'MARKET_CLUSTER', 'OSM_REGION', 'OSM_DISTRICT', 'SAR_STORE_NAME')
                ) AGS
                GROUP BY AGS.RETAILER_KEY, AGS.VENDOR_KEY, AGS.STORE_KEY
        ) AGSG 
        ON AGSG.RETAILER_KEY = SR.RETAILER_KEY AND AGSG.VENDOR_KEY = CC.VENDOR_KEY
        AND AGSG.STORE_KEY = SR.STORE_KEY
        WHERE (CC.RETAILER_KEY, CC.VENDOR_KEY, SR.STORE_KEY) NOT IN
                (SELECT RETAILER_KEY, VENDOR_KEY, STORE_KEY FROM {self._dim_schema}.STORES_DEFAULT_FILTER);
        """.format(self=self, storeTable=_src_store_table)
        dct_sync_data = self.context.copy()
        dct_sync_data["source_dw"] = self._source_dw
        dct_sync_data["source_config"] = self._source_dw.config.json_data[
            "configs"]
        dct_sync_data["target_dw_schema"] = self._common_schema
        dct_sync_data["target_dw_table"] = _target_store_table
        dct_sync_data["target_column"] = col
        dct_sync_data["source_sql"] = sql
        self._logger.debug(sql)
        sync_data(dct_sync_data)

        # Sync the tall table (not for CIS).
        if self._str_retailer_filter and not cis_source:
            self._store_columns = self._dct_table_column["STORE"]
            col = " RETAILER_KEY, STORE_KEY, STOREID, ATTRIBNAME, ATTRIBVALUE, ACTIVE "
            sql = (
                "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {col} FROM {self._dim_schema}.STORE WHERE ATTRIBNAME "
                "IN ({self._store_columns}) {self._str_retailer_filter}".
                format(self=self, col=col))
            dct_sync_data["target_dw_table"] = "STAGE_STORE"
            dct_sync_data["target_column"] = col
            dct_sync_data["source_sql"] = sql
            self._logger.debug(sql)
            sync_data(dct_sync_data)

    def _load_product_data_cis(self, hub_id):
        """
        Sync data from CIS source.
        If the same item_key exists in both the CIS and NXG hubs, the CIS record overrides the NXG one.
        :param hub_id:
        :return:
        """

        # dump CIS data into cis staging table: stage_dim_product_cis
        self._export_product_data(hub_id=hub_id, cis_source=True)

        # This part was moved out of self._export_product_data; it runs here once all product data is ready.
        # Override NXG items with CIS items when the same items exist on both sides.
        sql = """
         DELETE FROM {cmnSchema}.STAGE_{itemTable} 
         WHERE (vendor_key, retailer_key, item_key) IN
         (SELECT vendor_key, retailer_key, item_key
          FROM {cmnSchema}.STAGE_{itemTable}_CIS);
         
         INSERT INTO {cmnSchema}.STAGE_{itemTable} 
         SELECT * FROM {cmnSchema}.STAGE_{itemTable}_CIS;
        """.format(itemTable=self._dim_product, cmnSchema=self._common_schema)
        self._logger.info(sql)
        self._dw.execute(sql)

        # Update the OSM_SUB_CATEGORY / OSM_MAJOR_CATEGORY columns once both the NXG and CIS sources are loaded.
        sql = """
        DROP TABLE IF EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY;
        CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_SUB_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS 
        AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
        SELECT distinct OSM_SUB_CATEGORY_NO,MAX(OSM_SUB_CATEGORY) as OSM_SUB_CATEGORY 
        FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_SUB_CATEGORY_NO;

        DROP TABLE IF EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY;
        CREATE LOCAL TEMP TABLE IF NOT EXISTS TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY ON COMMIT PRESERVE ROWS 
        AS /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ 
        SELECT distinct OSM_MAJOR_CATEGORY_NO,MAX(OSM_MAJOR_CATEGORY) as OSM_MAJOR_CATEGORY 
        FROM {self._common_schema}.{self._dim_product} GROUP BY OSM_MAJOR_CATEGORY_NO;
        """.format(self=self)
        self._logger.info(sql)
        self._dw.execute(sql)

        sql = ("""
        SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ ANALYZE_STATISTICS('{self._common_schema}.STAGE_{self._dim_product}');

        INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_SUB_CATEGORY_CONSISTENCY
        SELECT a.OSM_SUB_CATEGORY_NO, MAX(a.OSM_SUB_CATEGORY) AS OSM_SUB_CATEGORY
        FROM {self._common_schema}.STAGE_{self._dim_product} a
        LEFT JOIN TEMP_OSM_SUB_CATEGORY_CONSISTENCY b on a.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO
        WHERE a.OSM_SUB_CATEGORY_NO IS NOT NULL AND a.OSM_SUB_CATEGORY_NO <> '' AND b.OSM_SUB_CATEGORY_NO IS NULL
        GROUP BY a.OSM_SUB_CATEGORY_NO;

        UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product}
        SET OSM_SUB_CATEGORY = b.OSM_SUB_CATEGORY
        FROM 
        (SELECT * FROM TEMP_OSM_SUB_CATEGORY_CONSISTENCY) b
        WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_SUB_CATEGORY_NO = b.OSM_SUB_CATEGORY_NO;
        /*UPDATE {self._common_schema}.STAGE_{self._dim_product}
        SET ITEM_GROUP = B.ITEM_GROUP
        FROM $schemaName.OLAP_ITEM B
        WHERE {self._common_schema}.STAGE_{self._dim_product}.ITEM_KEY = B.ITEM_KEY;*/

        INSERT /*+ DIRECT, LABEL(GX_IRIS_SYNC_DIM_DATA)*/ INTO TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY
        SELECT a.OSM_MAJOR_CATEGORY_NO, MAX(a.OSM_MAJOR_CATEGORY) AS OSM_MAJOR_CATEGORY
        FROM {self._common_schema}.STAGE_{self._dim_product} a
        LEFT JOIN TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY b on a.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO
        WHERE a.OSM_MAJOR_CATEGORY_NO IS NOT NULL AND a.OSM_MAJOR_CATEGORY_NO <> '' AND b.OSM_MAJOR_CATEGORY_NO IS NULL
        GROUP BY a.OSM_MAJOR_CATEGORY_NO;

        UPDATE /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ {self._common_schema}.STAGE_{self._dim_product}
        SET OSM_MAJOR_CATEGORY = b.OSM_MAJOR_CATEGORY
        FROM 
        (SELECT * FROM TEMP_OSM_MAJOR_CATEGORY_CONSISTENCY) b
        WHERE {self._common_schema}.STAGE_{self._dim_product}.OSM_MAJOR_CATEGORY_NO = b.OSM_MAJOR_CATEGORY_NO;
        """.format(self=self))
        self._logger.info(sql)
        self._dw.execute(sql)

    def _load_store_data_cis(self, hub_id):
        """
        Sync data from CIS source.
        If the same store_key exists in both the CIS and NXG hubs, the CIS record overrides the NXG one.
        :param hub_id:
        :return:
        """

        # dump CIS data into cis staging table: stage_dim_store_cis
        self._export_store_data(hub_id=hub_id, cis_source=True)

        # Override NXG stores with CIS stores when the same stores exist on both sides.
        sql = """
        DELETE FROM {cmnSchema}.STAGE_{storeTable} 
        WHERE (vendor_key, retailer_key, store_key) 
        IN (SELECT vendor_key, retailer_key, store_key 
            FROM {cmnSchema}.STAGE_{storeTable}_CIS );
        
        INSERT INTO {cmnSchema}.STAGE_{storeTable}  
        SELECT * FROM {cmnSchema}.STAGE_{storeTable}_CIS;
        """.format(storeTable=self._dim_store, cmnSchema=self._common_schema)
        self._logger.info(sql)
        self._dw.execute(sql)

    def _swap_data(self):
        """
        Switch partition from stage table to final table.
        :return:
        """

        sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY "
               "FROM {self._common_schema}.STAGE_{self._dim_product}".format(
                   self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(
                schema_name=self._common_schema,
                table_name=self._dim_product,
                partition_name=row[0],
                stage_schema_name=self._common_schema,
                stage_table_name='STAGE_{self._dim_product}'.format(self=self))

        sql = ("SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT VENDOR_KEY "
               "FROM {self._common_schema}.STAGE_PRODUCT".format(self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(schema_name=self._common_schema,
                                      table_name='PRODUCT',
                                      partition_name=row[0],
                                      stage_schema_name=self._common_schema,
                                      stage_table_name='STAGE_PRODUCT')

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY "
            "FROM {self._common_schema}.STAGE_{self._dim_store}".format(
                self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(
                schema_name=self._common_schema,
                table_name=self._dim_store,
                partition_name=row[0],
                stage_schema_name=self._common_schema,
                stage_table_name='STAGE_{self._dim_store}'.format(self=self))

        sql = (
            "SELECT /*+ LABEL(GX_IRIS_SYNC_DIM_DATA)*/ DISTINCT RETAILER_KEY "
            "FROM {self._common_schema}.STAGE_STORE".format(self=self))
        self._logger.debug(sql)
        for row in self._dw.query(sql):
            self._dw.switch_partition(schema_name=self._common_schema,
                                      table_name='STORE',
                                      partition_name=row[0],
                                      stage_schema_name=self._common_schema,
                                      stage_table_name='STAGE_STORE')

    def _process_sync_dim_data(self):
        """
        Process to sync dim data
        :return:
        """
        try:
            self._get_column()
            self._recreate_stage_table()
            for hub_id in self._hub_ids:
                try:
                    self._dim_schema = "DIM_{hub_id}".format(hub_id=hub_id)
                    self._meta_schema = "METADATA_{hub_id}".format(
                        hub_id=hub_id)
                    self._logger.info(
                        "Sync dim data start hub_id {hub_id}".format(
                            hub_id=hub_id))
                    self._get_source_config(hub_id)
                    self._export_calendar_data(hub_id)
                    self._get_vendor_retailer_from_hub(hub_id)

                    if not self._str_vendor_retailer_filter:
                        self._logger.warning(
                            "hub_id {} does not have any matched product and store data"
                            .format(hub_id))
                        continue

                    self._export_product_data(hub_id)
                    self._export_store_data(hub_id)

                    # handle CIS source.
                    self._load_product_data_cis(hub_id)
                    self._load_store_data_cis(hub_id)

                    self._logger.info(
                        "Sync hub_id {hub_id} dim data to stage table.".format(
                            hub_id=hub_id))
                except Exception as msg:
                    self._logger.warning(msg)
                    continue

            self._swap_data()
            # self._truncate_stage_table()
            self._logger.info("Sync dim data done from hubs {hub_ids}".format(
                hub_ids=self._hub_ids))
            self._analyze_table()
        except Exception as msg:
            self._logger.error(msg)
        finally:
            pass

    def process(self):
        try:
            self._process_sync_dim_data()

        except Exception as e:
            self._logger.error(e)

        finally:
            if hasattr(self, "_dw"):
                self._dw.close_connection()
            if hasattr(self, "_source_dw"):
                self._source_dw.close_connection()
            if hasattr(self, "_db"):
                self._db.close_connection()
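
Several methods above assemble SQL filter fragments from (VENDOR_KEY, RETAILER_KEY) pairs (see _get_vendor_retailer_from_hub). The following is a small standalone sketch of that string assembly; the key values are made up.

pairs = [(101, 5), (101, 7), (202, 5)]  # (VENDOR_KEY, RETAILER_KEY) pairs, illustrative only

vendor_retailer_filter = " AND (VENDOR_KEY, RETAILER_KEY) IN ({})".format(
    ",".join("({},{})".format(v, r) for v, r in pairs))
vendor_filter = " AND VENDOR_KEY IN ({})".format(
    ",".join(sorted({str(v) for v, _ in pairs})))
retailer_filter = " AND RETAILER_KEY IN ({})".format(
    ",".join(sorted({str(r) for _, r in pairs})))

print(vendor_retailer_filter)  # AND (VENDOR_KEY, RETAILER_KEY) IN ((101,5),(101,7),(202,5))
print(vendor_filter)           # AND VENDOR_KEY IN (101,202)
print(retailer_filter)         # AND RETAILER_KEY IN (5,7)
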
Example #11
parse.add_option("--vendor_key", action="store", dest="vendor_key")
parse.add_option("--meta", action="store", dest="meta")
(options, args) = parse.parse_args()

meta = json.loads(options.meta)

schema_name = meta['schema']
common_schema = meta['common_schema']
#schemas = [common_schema, schema_name]
schemas = [schema_name]

logger = Logger(log_level="info",
                target="console",
                vendor_key=options.vendor_key,
                retailer_key=options.retailer_key)
dw = DWOperation(meta=meta, logger=logger)

try:
    for s in schemas:
        dw.execute('''
CREATE SCHEMA IF NOT EXISTS %(schema_name)s DEFAULT INCLUDE SCHEMA PRIVILEGES;
GRANT USAGE ON SCHEMA %(schema_name)s TO read_internalusers;
GRANT SELECT ON SCHEMA %(schema_name)s TO read_internalusers;
GRANT USAGE ON SCHEMA %(schema_name)s TO write_internalusers;
GRANT SELECT,INSERT,UPDATE,DELETE,TRUNCATE ON SCHEMA %(schema_name)s TO write_internalusers;
        ''' % {'schema_name': s})
        logger.info(
            '%s is created, and permission has been granted for read_internalusers and write_internalusers.'
            % s)

        for env in ['uat', 'pp', 'Prod']:
Example #12
    def __init__(self,
                 meta={},
                 to_physical_table=False,
                 no_delete_key=False,
                 logger=None):

        logger = logger if logger else Logger(log_level="info",
                                              vendor_key=-1,
                                              retailer_key=-1,
                                              module_name="Persist_Feedback")
        vertica_conn = DWOperation(meta=meta, logger=logger)
        redis_conn = RedisOperation(meta=meta, logger=logger)
        c = redis_conn.get_connection()
        column_mapping = {
            'from_key': [
                # Save a value taken from the Redis key into the dataset
                # ['key name in dataset', 'index of the key element after splitting the key on ":"']
                [
                    'UPDATE_TIME', 4
                ]  #FEEDBACK:342:6:38726000097879:20190219013621  --> get key.split(':')[4]
            ],
            'hardcode': [
                # ['key name in dataset', 'hardcode value']
                ['EVENT_KEY', '-1'],
            ],
            'from_redis': [
                # ['table column', 'key name in dataset', 'table column type', 'value updatable', 'target']
                # Note on the table column type: use VARCHAR if the value needs to be quoted, otherwise NUMBER,
                # so int, float, number -> NUMBER
                # varchar, date, timestamp -> VARCHAR
                # updatable means the value of the column will be updated if the alert already exists
                [
                    'ALERT_ID', 'ALERT_ID', 'NUMBER', 'NON-UPDATABLE',
                    'TABLE|MOBILE'
                ],
                [
                    'VENDOR_KEY', 'VENDOR_KEY', 'NUMBER', 'NON-UPDATABLE',
                    'TABLE|MOBILE'
                ],
                [
                    'RETAILER_KEY', 'RETAILER_KEY', 'NUMBER', 'NON-UPDATABLE',
                    'TABLE|MOBILE'
                ],
                ['ITEM_KEY', 'ITEM_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE'],
                ['STORE_KEY', 'STORE_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE'],
                [
                    'PERIOD_KEY', 'PERIOD_KEY', 'NUMBER', 'NON-UPDATABLE',
                    'TABLE'
                ],
                [
                    'STORE_REP', 'STORE_REP', 'VARCHAR', 'NON-UPDATABLE',
                    'TABLE'
                ],
                [
                    'STORE_VISITED_PERIOD_KEY', 'STORE_VISITED_PERIOD_KEY',
                    'NUMBER', 'NON-UPDATABLE', 'TABLE'
                ],
                [
                    'FEEDBACK_DESCRIPTION', 'FEEDBACK_DESCRIPTION', 'VARCHAR',
                    'UPDATABLE', 'TABLE|MOBILE'
                ],
                [
                    'ON_HAND_PHYSICAL_COUNT', 'ON_HAND_PHYSICAL_COUNT',
                    'NUMBER', 'UPDATABLE', 'TABLE|MOBILE'
                ],
                [
                    'ON_HAND_CAO_COUNT', 'ON_HAND_CAO_COUNT', 'NUMBER',
                    'UPDATABLE', 'TABLE|MOBILE'
                ],
                ['SOURCE', 'SOURCE', 'VARCHAR', 'NON-UPDATABLE', 'TABLE'],
                [
                    'CYCLE_KEY', 'CYCLE_KEY', 'NUMBER', 'NON-UPDATABLE',
                    'MOBILE'
                ],
                [
                    'UPDATE_TIME', 'UPDATE_TIME', 'VARCHAR', 'NON-UPDATABLE',
                    'MOBILE'
                ],
                ['EVENT_KEY', 'EVENT_KEY', 'NUMBER', 'NON-UPDATABLE', 'TABLE'],
            ]
        }

        self.vars = {
            "meta": meta,
            "logger": logger,
            "column_mapping": column_mapping,
            "vertica_conn": vertica_conn,
            "redis_conn": c,
            "to_physical_table": to_physical_table,
            "no_delete_key": no_delete_key
        }

        capacity = Capacity(meta)
        self.vars['capacity'] = capacity
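
# A hedged illustration (not part of the original example): how a 'from_key' rule
# above could be applied to a Redis key. Each rule names a dataset column and the
# index of the element obtained by splitting the key on ':'. The helper name below
# is an assumption for illustration only, not an API of this module.
def _apply_from_key_rules(redis_key, from_key_rules):
    parts = redis_key.split(':')
    return {column: parts[index] for column, index in from_key_rules}

# _apply_from_key_rules("FEEDBACK:342:6:38726000097879:20190219013621", [['UPDATE_TIME', 4]])
# -> {'UPDATE_TIME': '20190219013621'}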
Exemplo n.º 13
0
class DataCleanup:
    """
    Cleanup outdated data.
    For tables: drop tables whose names end with a job id lower than the boundary job id returned by the API /common/job/cleanup.
    For data: drop the partitions configured in the config JSON.
    """

    def __init__(self, context):
        self.context = context.copy()
        self._logger = context["logger"]
        self.body = {"job_id": context["jobId"], "step_id": context["stepId"], "status": StepStatus.RUNNING.name,
                     "message": "Data Cleanup start"}
        self._logger.info(self.body)

    def get_url_response(self, url, method="POST", **kwargs):
        """Get response from url
        :return: response
        """
        self._logger.info(url)
        if method.upper() not in ["GET", "POST", "PUT", "DELETE"]:
            method = "GET"
        response = requests.request(method=method, url=url, verify=False, **kwargs)
        self._logger.info(response.json())
        return response

    def _get_boundary_job_id(self):
        """Get boundary job id
        :return: boundary_job_id
        """
        boundary_job_id = None
        response = self.get_url_response(url="{}/job/cleanup".format(self.context["meta"]["api_job_str"]),
                                         method="DELETE", headers=self.context["header"]).json()
        if response["status"].upper() == StepStatus.SUCCESS.name:
            if response["data"]:
                boundary_job_id = response["data"]
                self.body["message"] = "Boundary job id is {}".format(boundary_job_id)
                self._logger.info(self.body)
            else:
                self.body["message"] = "None boundary job id".format(boundary_job_id)
                self._logger.warning(self.body)
        else:
            self.body["message"] = response["error"]
            self._logger.error(response)
        return boundary_job_id

    def _get_outdated_data(self, dct_job_id):
        """Get outdated data
        :return: dictionary of outdated data
        """
        dct = {}
        boundary_job_id = self._get_boundary_job_id()
        if boundary_job_id:
            for k, v in dct_job_id.items():
                if int(k) < int(boundary_job_id):
                    dct[k] = v
        return dct

    def _cleanup_table(self, config):
        """Cleanup dw table
        :return:
        """
        dct_table = {}
        sql = ("SELECT /*+ LABEL(GX_IRIS_DATA_CLEANUP)*/ TABLE_SCHEMA || '.' || TABLE_NAME AS SCHEMA_TABLE"
               "    , REGEXP_SUBSTR(TABLE_NAME, '(.+)_(\d+)$', 1, 1, '', 2) AS JOB_ID FROM TABLES "
               "WHERE TABLE_SCHEMA IN ( "
               "    SELECT SCHEMA_NAME FROM SCHEMATA WHERE SCHEMA_OWNER = '{}' AND SCHEMA_NAME NOT IN ('{}'))"
               "AND (REGEXP_ILIKE(TABLE_NAME,'^{}$'))"
               "".format(self.context["meta"]["db_conn_vertica_username"], "','".join(config["excludeSchema"]),
                         "$') OR REGEXP_ILIKE(TABLE_NAME,'^".join(config["namePattern"])))
        self._logger.info(sql)
        for row in self._dw.query(sql):
            dct_table[row.JOB_ID] = row.SCHEMA_TABLE
        if dct_table:
            dct_outdated_data = self._get_outdated_data(dct_table)
            if dct_outdated_data:
                sql = "DROP TABLE IF EXISTS {} CASCADE".format(",".join(dct_outdated_data.values()))
                self._logger.info(sql)
                self._dw.execute(sql)
            else:
                self.body["message"] = "None table cleanup"
                self._logger.warning(self.body)
        else:
            self.body["message"] = "Not found table to cleanup"
            self._logger.warning(self.body)

    def _cleanup_data(self, config):
        """Cleanup dw table partition, SEQ_NUM logic relate to OSA-471
        :return:
        """
        if "SEQ_NUM" in config["partitionTable"]:
            sql = ("SELECT /*+ LABEL(GX_IRIS_DATA_CLEANUP)*/ DISTINCT INCIDENT_ID_START//1000000000%36500 AS SEQ_NUM "
                   "FROM {}.DIM_INCIDENT_ID_START "
                   "WHERE PERIOD_KEY = {}".format(
                    self.context["meta"]["db_conn_vertica_common_schema"],
                    (datetime.datetime.now() - datetime.timedelta(days=config["reserveDays"])).strftime("%Y%m%d")))
            self._logger.info(sql)
            seq_num = self._dw.query_scalar(sql)
            if seq_num:
                sql = ("SELECT /*+ LABEL(GX_IRIS_DATA_CLEANUP)*/ SCHEMA_TABLE, PARTITION_KEY FROM ( "
                       "    SELECT DISTINCT PA.PARTITION_KEY, PA.TABLE_SCHEMA||'.'||PR.ANCHOR_TABLE_NAME SCHEMA_TABLE "
                       "    FROM PARTITIONS PA "
                       "    JOIN PROJECTIONS PR ON PR.PROJECTION_ID = PA.PROJECTION_ID "
                       "    WHERE PA.TABLE_SCHEMA IN ( "
                       "        SELECT SCHEMA_NAME FROM SCHEMATA "
                       "        WHERE SCHEMA_OWNER = '{}' AND SCHEMA_NAME NOT IN ('{}')) "
                       "    AND PR.ANCHOR_TABLE_NAME IN ('{}')) T "
                       "WHERE T.PARTITION_KEY < {} "
                       "".format(self.context["meta"]["db_conn_vertica_username"], "','".join(config["excludeSchema"]),
                                 "','".join(config["partitionTable"]['SEQ_NUM']), seq_num))
                self._logger.info(sql)
                for row in self._dw.query(sql):
                    sql = "SELECT /*+ LABEL(GX_IRIS_DATA_CLEANUP)*/ DROP_PARTITION('{}', {})" \
                          "".format(row.SCHEMA_TABLE, row.PARTITION_KEY)
                    self._logger.info(sql)
                    self._dw.execute(sql)
            else:
                self.body["message"] = self._logger.warning("Not found SEQ_NUM to cleanup")
                self._logger.warning(self.body)
        else:
            self.body["message"] = "None data cleanup"
            self._logger.warning(self.body)

    def _process_data_cleanup(self):
        """Process Data Cleanup
        :return:
        """
        try:
            self._dw = DWOperation(meta=self.context["meta"])
            lst_warning = [" none warning"]
            for cfg in eval(self.context["config_json"]):
                try:
                    if cfg["type"].upper() == "TABLE":
                        self._cleanup_table(cfg)
                    elif cfg["type"].upper() == "DATA":
                        self._cleanup_data(cfg)
                    self.body["message"] = "{} cleanup done".format(cfg["type"])
                    self._logger.info(self.body)
                except Exception as msg:
                    self.body["status"] = StepStatus.ERROR.name
                    self.body["message"] = ("Data Cleanup error with config {}  --||##  {}".format(cfg, msg))
                    self._logger.warning(self.body)
                    lst_warning.append(str(cfg))
                    continue
            self.body["status"] = StepStatus.SUCCESS.name
            self.body["message"] = ("Data Cleanup done with config {}, {}".format(cfg, "; ".join(lst_warning)))
            self._logger.info(self.body)
        except Exception as msg:
            self.body["status"] = StepStatus.ERROR.name
            self._logger.error(msg)
        finally:
            if hasattr(self, "_dw"):
                self._dw.close_connection()
            self.body["message"] = "Data Cleanup end"
            self._logger.info(self.body)

    def process(self):
        self._process_data_cleanup()
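
# A hedged usage sketch (assumptions, not from the original source): DataCleanup
# expects a context dict carrying the keys it reads above -- "logger", "meta",
# "jobId", "stepId", "header" and "config_json". The meta keys mirror the ones
# referenced in _get_boundary_job_id/_cleanup_table/_cleanup_data; all concrete
# values below are placeholders.
sample_context = {
    "logger": Logger(log_level="info", target="console",
                     vendor_key=-1, retailer_key=-1, sql_conn=None),
    "meta": {
        "api_job_str": "https://iris.example.com/api",    # placeholder URL
        "db_conn_vertica_username": "etl_user",           # schema owner used in the queries
        "db_conn_vertica_common_schema": "common",
    },
    "jobId": 12345,
    "stepId": 1,
    "header": {},
    # config_json is eval()'d by _process_data_cleanup, so it is a string
    # holding a list of cleanup configs.
    "config_json": str([
        {"type": "TABLE", "excludeSchema": ["common"],
         "namePattern": ["STAGE_.+_\\d+"]},
        {"type": "DATA", "excludeSchema": ["common"],
         "partitionTable": {"SEQ_NUM": ["FACT_PROCESSED_ALERT"]},
         "reserveDays": 90},
    ]),
}
# DataCleanup(sample_context).process()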
Exemplo n.º 14
0
    def __init__(self, meta, params=None, init_flag=False, logger=None):
        """
        # Sync feedback data from the RDP side to the IRIS (OSA) side incrementally via event_key.
        # 1. Only sync those vendor & retailer pairs which have applied for the OSA service.
        # 2. For new vendor & retailer pairs, copy all historical data during initialization.

        :param meta: [mandatory] config data from the config.properties file
        :param params: two cases, depending on whether we sync RDP feedback for a whole RDP or for a new customer:
            1. rdp_id: if rdp_id is given, sync all data for that RDP; otherwise, sync data from all related RDPs.
               Note: rdp_id is passed when calling this service via the REST API.
            2. vendor_key:   mandatory only when init_flag is True.
               retailer_key: mandatory only when init_flag is True.
               Note: these two parameters are not passed from the REST API but supplied directly by deploy scripts.
        :param init_flag:
            if init_flag is True: only sync feedback data for the given vendor & retailer. Used when introducing a new customer.
            if init_flag is False: sync all customers' data from RDP periodically (e.g. sync daily).
        :param logger:
        """
        self.meta = meta
        self._params = {} if params is None else params
        self._rdp_id = self._params.get("rdpId", None)
        self._fact_type = 'fdbk'
        self._init_flag = init_flag
        self._vendor_key = self._params.get("vendor_key", None)
        self._retailer_key = self._params.get("retailer_key", None)
        self._debug = self._params.get('debug', 'N')
        self._default_rdp = "RDP_AUX"

        self._log_file = './log/sync_fdbk_%s_%s.log' % (
            self._rdp_id, datetime.datetime.now().strftime('%Y%m%d'))
        self.logger = logger if logger else Logger(log_level="debug",
                                                   target="console|file",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   log_file=self._log_file,
                                                   sql_conn=None)

        self.osa_app_conn = MSOperation(meta=self.meta, logger=self.logger)
        self.osa_dw_conn = DWOperation(meta=self.meta, logger=self.logger)
        self.max_event_key = None

        # we already know feedback table name of RDP
        self.source_table_rdp = "DS_FACT_FEEDBACK"  # source table in RDP side.
        self.staging_import_table_osa = "STAGE_FACT_FEEDBACK_RDP"  # used to store sync data from RDP table (same structure as table DS_FACT_FEEDBACK)
        self.target_table_osa = "FACT_FEEDBACK"  # final table in OSA side
        self.capacity = Capacity(meta=meta)

        self.dct_sync_data = copy.deepcopy(
            self.meta)  # required for calling sync_data module
        self.dct_sync_data[
            "meta"] = self.meta  # required for calling sync_data module
        self.dct_sync_data["target_osa_conn"] = self.osa_dw_conn
        self.dct_sync_data["target_dw_schema"] = self.meta[
            'db_conn_vertica_common_schema']
        self.dct_sync_data["target_dw_table"] = self.staging_import_table_osa
        self.dct_sync_data["logger"] = self.logger

        # [True|False(default)] True: direct connection between Vertica clusters. False: using vsql.
        self.dct_sync_data["dw_conn_vertica"] = False
        # self.dct_sync_data["dw_conn_vertica"] = True

        self.transfer = TransferData(dct_sync_data=self.dct_sync_data)
Exemplo n.º 15
0
class Feedback(object):
    def __init__(self, meta, params=None, init_flag=False, logger=None):
        """
        # Sync feedback data from the RDP side to the IRIS (OSA) side incrementally via event_key.
        # 1. Only sync those vendor & retailer pairs which have applied for the OSA service.
        # 2. For new vendor & retailer pairs, copy all historical data during initialization.

        :param meta: [mandatory] config data from the config.properties file
        :param params: two cases, depending on whether we sync RDP feedback for a whole RDP or for a new customer:
            1. rdp_id: if rdp_id is given, sync all data for that RDP; otherwise, sync data from all related RDPs.
               Note: rdp_id is passed when calling this service via the REST API.
            2. vendor_key:   mandatory only when init_flag is True.
               retailer_key: mandatory only when init_flag is True.
               Note: these two parameters are not passed from the REST API but supplied directly by deploy scripts.
        :param init_flag:
            if init_flag is True: only sync feedback data for the given vendor & retailer. Used when introducing a new customer.
            if init_flag is False: sync all customers' data from RDP periodically (e.g. sync daily).
        :param logger:
        """
        self.meta = meta
        self._params = {} if params is None else params
        self._rdp_id = self._params.get("rdpId", None)
        self._fact_type = 'fdbk'
        self._init_flag = init_flag
        self._vendor_key = self._params.get("vendor_key", None)
        self._retailer_key = self._params.get("retailer_key", None)
        self._debug = self._params.get('debug', 'N')
        self._default_rdp = "RDP_AUX"

        self._log_file = './log/sync_fdbk_%s_%s.log' % (
            self._rdp_id, datetime.datetime.now().strftime('%Y%m%d'))
        self.logger = logger if logger else Logger(log_level="debug",
                                                   target="console|file",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   log_file=self._log_file,
                                                   sql_conn=None)

        self.osa_app_conn = MSOperation(meta=self.meta, logger=self.logger)
        self.osa_dw_conn = DWOperation(meta=self.meta, logger=self.logger)
        self.max_event_key = None

        # we already know feedback table name of RDP
        self.source_table_rdp = "DS_FACT_FEEDBACK"  # source table in RDP side.
        self.staging_import_table_osa = "STAGE_FACT_FEEDBACK_RDP"  # used to store sync data from RDP table (same structure as table DS_FACT_FEEDBACK)
        self.target_table_osa = "FACT_FEEDBACK"  # final table in OSA side
        self.capacity = Capacity(meta=meta)

        self.dct_sync_data = copy.deepcopy(
            self.meta)  # required for calling sync_data module
        self.dct_sync_data[
            "meta"] = self.meta  # required for calling sync_data module
        self.dct_sync_data["target_osa_conn"] = self.osa_dw_conn
        self.dct_sync_data["target_dw_schema"] = self.meta[
            'db_conn_vertica_common_schema']
        self.dct_sync_data["target_dw_table"] = self.staging_import_table_osa
        self.dct_sync_data["logger"] = self.logger

        # [True|False(default)] True: direct connection between Vertica clusters. False: using vsql.
        self.dct_sync_data["dw_conn_vertica"] = False
        # self.dct_sync_data["dw_conn_vertica"] = True

        self.transfer = TransferData(dct_sync_data=self.dct_sync_data)

    def _populate_source_config(self, source_config):
        self.logger.debug("The source config is: %s" % source_config)

        _src_config = {}
        if os.name == 'nt':
            _src_config["temp_file_path"] = "d:"
        elif os.name == 'posix':
            _src_config["temp_file_path"] = "/tmp"

        # Getting user account from config.properties file first.
        if self.meta.get("db_conn_vertica_rdp_username"):
            _src_config["dw.etluser.id"] = self.meta.get(
                "db_conn_vertica_rdp_username")
            if self.meta.get("db_conn_vertica_rdp_password"):
                _src_config["dw.etluser.password"] = self.meta.get(
                    "db_conn_vertica_rdp_password")
            else:
                _pmp_pwd = get_password(
                    username=self.meta.get("db_conn_vertica_rdp_username"),
                    meta=self.meta)
                # The pwd should be encrypted in order to: 1, align with else part, 2, pass it to db.sync_data module
                _src_config["dw.etluser.password"] = Crypto().encrypt(_pmp_pwd)
        # If not configured, get them directly from the RDP config.
        else:
            _src_config["dw.etluser.id"] = source_config.get("dw.etluser.id")
            # the pwd is encrypted
            _src_config["dw.etluser.password"] = source_config.get(
                "dw.etluser.password")

        # required info for calling sync_data module.
        _src_config["dw.server.name"] = source_config.get("dw.server.name")
        _src_config["dw.db.name"] = source_config.get("dw.db.name")
        _src_config["dw.db.portno"] = source_config.get("dw.db.portno", 5433)
        _src_config["dw.schema.name"] = source_config.get("dw.schema.name")

        self.logger.debug("srouce config is: %s" % _src_config)
        self.dct_sync_data["source_config"] = _src_config

        # Create the connection to RDP Vertica Cluster. which is the source Vertica cluster
        rdp_meta = copy.deepcopy(self.meta)
        tmp_rdp_meta = {
            'db_conn_vertica_servername': _src_config["dw.server.name"],
            'db_conn_vertica_port': _src_config["dw.db.portno"],
            'db_conn_vertica_dbname': _src_config["dw.db.name"],
            'db_conn_vertica_username': _src_config["dw.etluser.id"],
            'db_conn_vertica_password': _src_config["dw.etluser.password"],
            'db_conn_vertica_password_encrypted': "true"
        }
        rdp_meta.update(tmp_rdp_meta)
        self.logger.debug("rdp config is: %s" % rdp_meta)
        rdp_connection = DWOperation(meta=rdp_meta)
        self.dct_sync_data["source_dw"] = rdp_connection

    def main_process(self):
        try:
            # if not introducing new customer and _rdp_id was given,
            # then we will sync all feedback data from given RDP for registered users.
            if self._init_flag is False and self._rdp_id:
                try:
                    rdp_config = Config(meta=self.meta,
                                        hub_id=self._rdp_id).json_data
                    if not rdp_config['configs']:
                        raise Warning(
                            "There is no configs returned for RDP: %s."
                            "Please check if this RDP registered in CP with below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self._rdp_id, self.meta["api_config_str"]))
                        # exit(StepStatus.SUCCESS.value)

                    _rdp_schema = rdp_config['configs'].get('dw.schema.name')
                    self.logger.info("Started to sync data from rdp: %s" %
                                     _rdp_schema)

                    # self.dct_sync_data["source_config"] = rdp_config['configs']
                    self._populate_source_config(rdp_config['configs'])

                    self.initialize()

                    _flag = self.load_data()
                    if _flag:
                        # if no data, then no need to process & update variables table.
                        self.process_data()

                        sql = """
                        IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}')
                            INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME)
                            VALUES ('{eventType}', '{value}', '', getdate(), getdate())
                        ELSE
                            UPDATE VARIABLES
                            SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}',UPDATE_TIME = getdate()
                            WHERE VARIABLE_NAME = '{eventType}'
                        """.format(eventType=_rdp_schema,
                                   value=self.max_event_key)
                        self.logger.info(sql)
                        self.osa_app_conn.execute(sql)

                    self.logger.info("Data sync done for RDP: %s" %
                                     _rdp_schema)

                except Exception as e:
                    self.logger.warning(e)
                    raise
                    # exit(StepStatus.SUCCESS.value)  # exit(0) otherwise Docker container will fail.

            # Else we will get all RDPs from REST API: http://10.172.36.75/config/properties/rdps?factType=fdbk
            # There could be multi RDPs(e.g. for SVR & WM). if so, loop all RDPs
            elif self._init_flag is False and self._rdp_id is None:
                try:
                    rdp_configs = Config(
                        meta=self.meta,
                        rdp_info=True,
                        rdp_fact_type=self._fact_type).json_data
                    if not rdp_configs:
                        raise Warning(
                            "No feedback related RDP found."
                            "Please check if any data returned from below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self.meta["api_config_str"]))
                        # exit(StepStatus.SUCCESS.value)

                    for rdp_config in rdp_configs:
                        _rdp_schema = rdp_config['configs'].get(
                            'dw.schema.name')
                        self.logger.info("Started to sync data from rdp: %s" %
                                         _rdp_schema)

                        # self.dct_sync_data["source_config"] = rdp_config['configs']
                        self._populate_source_config(rdp_config['configs'])

                        self.initialize()

                        _flag = self.load_data()
                        if _flag:
                            # if no data, then no need to process & update variables table.
                            self.process_data()

                            sql = """
                            IF NOT EXISTS(SELECT * FROM VARIABLES WHERE VARIABLE_NAME = '{eventType}')
                                INSERT INTO VARIABLES (VARIABLE_NAME, VARIABLE_VALUE, PREVIOUS_VALUE, INSERT_TIME, UPDATE_TIME)
                                VALUES ('{eventType}', '{value}', '', getdate(), getdate())
                            ELSE
                                UPDATE VARIABLES
                                SET PREVIOUS_VALUE = VARIABLE_VALUE, VARIABLE_VALUE = '{value}',UPDATE_TIME = getdate()
                                WHERE VARIABLE_NAME = '{eventType}'
                            """.format(eventType=_rdp_schema,
                                       value=self.max_event_key)
                            self.logger.info(sql)
                            self.osa_app_conn.execute(sql)

                        self.logger.info("Data sync done for RDP: %s" %
                                         _rdp_schema)

                except Exception as e:
                    self.logger.warning(e)
                    raise

            elif self._init_flag is True:
                if self._vendor_key is None or self._retailer_key is None:
                    self.logger.warning(
                        "vendor_key and retailer_key are required when initilize feedback for new customer"
                    )
                    raise ValueError

                # getting fdbk related rdps.
                try:
                    rdp_configs = Config(
                        meta=self.meta,
                        rdp_info=True,
                        rdp_fact_type=self._fact_type).json_data
                    if not rdp_configs:
                        self.logger.warning(
                            "No feedback related RDP found."
                            "Please check if any data returned from below URL."
                            "%s/properties/rdps?factType=fdbk" %
                            (self.meta["api_config_str"]))
                        exit(StepStatus.SUCCESS.value)

                    fdbk_rdps = [
                        str(rdp_config["rdpId"]).upper()
                        for rdp_config in rdp_configs
                    ]

                    # change table name in case conflict with normal sync process
                    self.dct_sync_data[
                        "target_dw_table"] = "{0}_{1}_{2}".format(
                            self.staging_import_table_osa, self._vendor_key,
                            self._retailer_key)

                    _silo_config = Config(
                        meta=self.meta,
                        vendor_key=self._vendor_key,
                        retailer_key=self._retailer_key).json_data
                    _silo_type = _silo_config['configs'].get(
                        'etl.silo.type', 'SVR')
                    _rdp_id = _silo_config['configs'].get('rdp.db.name')

                    # RDP_AUX is default rdp id for feedback etl on PRODUCTION.
                    # 1, if there is no RDP for given silo. then exit.
                    if not _rdp_id or str(_rdp_id).strip() == '':
                        self.logger.warning(
                            "There is no RDP silo configed for the given vendor:%s "
                            "and retailer:%s. So no need to sync feedback." %
                            (self._vendor_key, self._retailer_key))
                        exit(StepStatus.SUCCESS.value)

                    # 2, Getting configed RDP list, and check if there are feedback related RDPs.
                    _tmp_rdp_lst = str(_rdp_id).upper().split(sep=",")
                    _rdp_lst = [_tmp.strip() for _tmp in _tmp_rdp_lst]
                    # common_rdps are the RDP silos configured for syncing feedback data for the given silo (vendor & retailer)
                    common_rdps = list(set(_rdp_lst).intersection(fdbk_rdps))
                    # list(...) never returns None, so check for an empty intersection instead
                    if not common_rdps:
                        self.logger.warning(
                            "There is no feedback RDP silo configured for the given vendor:%s "
                            "and retailer:%s. So no need to sync feedback." %
                            (self._vendor_key, self._retailer_key))
                        exit(StepStatus.SUCCESS.value)

                    # If one or more feedback related RDPs are configured, loop over them to sync feedback data.
                    # Normally, there should be only one, or no feedback RDP configured at all.
                    for common_rdp in common_rdps:
                        _rdp_id = common_rdp
                        self.logger.info(
                            "Started to sync data from rdp: %s for given vendor:%s and retailer:%s. "
                            % (_rdp_id, self._vendor_key, self._retailer_key))

                        # if RDP is not RDP_AUX, Won't exit but log a warning.
                        if _rdp_id != self._default_rdp:
                            self.logger.warning(
                                "Please be noted: The RDP is:%s. It is not RDP_AUX."
                                % _rdp_id)

                        # WM silos also follow the above logic.
                        # All hosted silos utilize RDP_AUX to transfer feedback data; not sure about Walmart.
                        # if str(_silo_type).upper() in ["WMSSC", "WMCAT", "SASSC", "WMINTL"]:
                        #     _rdp_id = self._default_rdp  # WM rdp is RDP_AUX as well?

                        rdp_config = Config(meta=self.meta,
                                            hub_id=_rdp_id).json_data
                        if not rdp_config['configs']:
                            self.logger.warning(
                                "There is no configs for RDP: %s. Please check following URL:"
                                "%s/properties/%s/%s" %
                                (_rdp_id, self.meta["api_config_str"], _rdp_id,
                                 _rdp_id))
                            exit(StepStatus.SUCCESS.value)

                        _rdp_schema = rdp_config['configs'].get(
                            'dw.schema.name')
                        self.logger.info(
                            "Started to init feedback data from rdp: %s for "
                            "given vendor:%s and retailer:%s " %
                            (_rdp_schema, self._vendor_key,
                             self._retailer_key))

                        # self.dct_sync_data["source_config"] = rdp_config['configs']
                        self._populate_source_config(rdp_config['configs'])

                        self.initialize()

                        _flag = self.load_data()
                        if _flag:
                            # if no data, then no need to process.
                            self.process_data()

                        self.logger.info("Data sync done for RDP: %s" %
                                         _rdp_id)

                except Exception as e:
                    self.logger.warning(e)
                    self.logger.warning(
                        "Please check if any warning or error messages when doing the initialization!"
                    )

        finally:
            if self.osa_app_conn:
                self.osa_app_conn.close_connection()
            if self.osa_dw_conn:
                self.osa_dw_conn.close_connection()

    def initialize(self):
        """
        Create local temp tables , and DDLs required to process this fact type
        :return:
        """
        self.logger.info("Initialize...")

        # recreate this table for every RDP. no need to truncate any longer.
        # sql = "TRUNCATE TABLE {cmnSchema}.{targetTable}"\
        #     .format(cmnSchema=self.dct_sync_data["target_dw_schema"], targetTable=self.staging_import_table_osa)
        # self.logger.info(sql)
        # self.osa_dw.execute(sql)

        sql = """
        --Store data from RDP table.
        DROP TABLE IF EXISTS {cmnSchema}.{importTable};
        CREATE TABLE {cmnSchema}.{importTable}
        (
            EVENT_KEY int NOT NULL,
            RETAILER_KEY int,
            VENDOR_KEY int,
            STORE_VISIT_DATE date,
            PERIOD_KEY int NOT NULL,
            TYPE varchar(1),
            TYPE_DATE varchar(10),
            ALERT_ID int,
            ALERT_TYPE varchar(64),
            MERCHANDISER_STORE_NUMBER varchar(512),
            STORE_ID varchar(512),
            MERCHANDISER_UPC varchar(512),
            INNER_UPC varchar(512),
            MERCHANDISER varchar(100),
            STORE_REP varchar(1000),
            SOURCE varchar(1000),
            BEGIN_STATUS varchar(255),
            ACTION varchar(255),
            FEEDBACK_DESCRIPTION varchar(255),
            FEEDBACK_HOTLINEREPORTDATE date,
            FEEDBACK_ISININVENTORY varchar(5),
            ZIP_CODE varchar(64),
            ARTS_CHAIN_NAME varchar(255),
            UPC_STATUS varchar(255),
            MSI varchar(255)
        )
        UNSEGMENTED ALL NODES;
        """.format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                   importTable=self.dct_sync_data["target_dw_table"])
        self.logger.info(sql)
        self.osa_dw_conn.execute(sql)

    def load_data(self):
        """
        Load data from the RDP table ds_fact_feedback into local temp tables.
        There is a column event_key which is incremental across all customers in the ds_fact_feedback table.
        We save a snapshot of this column to the variables table and do the incremental load based on it every time.
        There are a few cases here:
        1. Routinely, a scheduled job syncs the whole feedback data set for valid customers from the related RDP silo,
           and saves the snapshot of the event_key from the previous load for the next incremental load.
        2. When on-boarding a new vendor & retailer customer, get rdp_event_key from the variables table for the related
           RDP silo (rdp_event_key comes from the previous load), then sync feedback data from that RDP silo only for
           this customer where event_key < rdp_event_key. Case 1 then takes care of the rest of the feedback data.
        :return:
        """
        rdp_schema = self.dct_sync_data["source_config"].get('dw.schema.name')

        # rdp_aux.ds_fact_feedback
        source_table = "{rdpSchema}.{rdptableName}"\
            .format(rdpSchema=rdp_schema,
                    rdptableName=self.source_table_rdp)
        # common.stage_fact_feedback_rdp
        target_table = "{dwSchema}.{importTable}"\
            .format(dwSchema=self.dct_sync_data["target_dw_schema"],
                    importTable=self.dct_sync_data["target_dw_table"])

        self.logger.info(
            "Ready to load data from {sourceTable} to {targetTable}".format(
                targetTable=target_table, sourceTable=source_table))

        insert_columns = " EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISIT_DATE, PERIOD_KEY, TYPE, TYPE_DATE," \
                         " ALERT_ID, ALERT_TYPE, MERCHANDISER_STORE_NUMBER, STORE_ID, MERCHANDISER_UPC, INNER_UPC," \
                         " MERCHANDISER, STORE_REP, SOURCE, BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION," \
                         " FEEDBACK_HOTLINEREPORTDATE, FEEDBACK_ISININVENTORY, ZIP_CODE, ARTS_CHAIN_NAME, UPC_STATUS, MSI "

        try:
            self.logger.info(
                "Getting the previous Event_key from last run for incremental load."
            )
            _event_sql = "SELECT VARIABLE_VALUE FROM variables " \
                         "WHERE VARIABLE_NAME = '{rdpName}'".format(rdpName=rdp_schema)
            self.logger.info(_event_sql)
            event_key = self.osa_app_conn.query_scalar(_event_sql)

            self.logger.info(
                "Getting customer info which only applied OSA services as filter"
            )
            sql = "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_MAPPING " \
                  "UNION " \
                  "SELECT DISTINCT retailer_key, vendor_key FROM AP_ALERT_CYCLE_RC_MAPPING"
            self.logger.info(sql)
            results = self.osa_app_conn.query(sql)
            if not results:
                raise Warning(
                    "There is no data in table AP_ALERT_CYCLE_MAPPING. Please check sql: %s"
                    % sql)
                # exit(StepStatus.SUCCESS.value)

            user_filters = [
                'SELECT ' + str(result.retailer_key) + ',' +
                str(result.vendor_key) for result in results
            ]
            user_filter_str = ' UNION ALL '.join(user_filters)
            self.logger.info("Customer filters are: %s" % user_filter_str)

            # incremental filter from RDP table
            where_sql = "EVENT_KEY > {eventKey} AND SOURCE != 'ARIA' " \
                        "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})"\
                .format(eventKey=event_key,
                        userFilter=user_filter_str)

            # TODO2DONE: how to set default value? use -1
            # copy all if there is no value in variables table.
            if not event_key:
                self.logger.warning(
                    "There is no value set in variables table for RDP:{name}, "
                    "So copy the whole table".format(name=rdp_schema))
                where_sql = " SOURCE != 'ARIA' AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter})"\
                    .format(eventKey=event_key,
                            userFilter=user_filter_str)
                event_key = -1  # check if this is the first run.

            if self._init_flag is True:
                if event_key == -1:  # event_key is None
                    self.logger.warning(
                        "There is no event_key logged in variables table for the given RDP: %s."
                        "So Let's wait for the routine job to sync the whole rdp feedback data"
                        % rdp_schema)
                    return False

                self.logger.info("Generating init feedback filters")
                where_sql = "EVENT_KEY <= {eventKey} AND SOURCE != 'ARIA' " \
                            "AND (RETAILER_KEY, VENDOR_KEY) in ({userFilter}) " \
                            "AND RETAILER_KEY={retailerKey} AND VENDOR_KEY={vendorKey} "\
                    .format(eventKey=event_key,
                            userFilter=user_filter_str,
                            retailerKey=self._retailer_key,
                            vendorKey=self._vendor_key)

            self.logger.debug("The filters are: %s" % where_sql)

            # form the fetch query from RDP and then Insert into the target table
            fetch_query = """
            SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ {insertQuery} FROM {sourceTable}
            WHERE {whereSql}
            """.format(insertQuery=insert_columns,
                       sourceTable=source_table,
                       whereSql=where_sql)
            self.logger.info("fetch_query is : %s" % fetch_query)

            self.logger.info(
                ">>Loading {factType} Data from event_key:{eventKey} start at: {timestamp}<<"
                .format(factType=self._fact_type,
                        eventKey=event_key,
                        timestamp=datetime.datetime.now()))

            self.dct_sync_data["target_column"] = insert_columns
            self.dct_sync_data["source_sql"] = fetch_query

            row_count = self.transfer.transfer_data(
                dct_sync_data=self.dct_sync_data)

            self.logger.info(
                ">>Done loaded {cnt} rows from event_key:{eventKey} completed at: {timestamp}<<"
                .format(cnt=row_count,
                        factType=self._fact_type,
                        eventKey=event_key,
                        timestamp=datetime.datetime.now()))

            # if no data was transferred, then update variables with the previous value.
            sql = "SELECT /*+ label(GX_IRIS_SYNCFEEDBACK)*/ nvl(max(event_key), {oldEventKey}) " \
                  "FROM {schemaName}.{importTable} "\
                .format(schemaName=self.dct_sync_data["target_dw_schema"],
                        importTable=self.dct_sync_data["target_dw_table"],
                        oldEventKey=event_key)
            self.logger.info(sql)
            self.max_event_key = self.osa_dw_conn.query_scalar(sql)

            # max_event_key = -1   # testing purpose
            if self.max_event_key == -1:
                self.logger.warning(
                    "There is no feedback data in RDP table: {0}".format(
                        source_table))
                return False

            return True

        except Exception as e:
            self.logger.warning(e)
            raise

        finally:
            pass

    def process_data(self):
        """
        after load_data part completes. sync data from temp table to related schemas.
        :return:
        """
        try:
            self.logger.info("Processing feedback start...")

            # loop retailer to insert feedback data
            sql = "SELECT DISTINCT retailer_key " \
                  "FROM {cmnSchema}.{importTable}"\
                .format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                        importTable=self.dct_sync_data["target_dw_table"])
            self.logger.info(sql)
            retailers = self.osa_dw_conn.query(sql)

            if retailers.rowcount == 0:
                self.logger.warning(
                    "There is no data in table {cmnSchema}.{importTable}."
                    "It could be no incremental data. Please check fetch_query against RDP database"
                    .format(cmnSchema=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"]))

            for retailer in retailers:
                retailer_key = retailer.retailer_key
                osa_schema = self.capacity.get_retailer_schema_name(
                    retailer_key)

                # Finally, run the sql
                msg = "Processing fdbk data within retailer {retailerKey}:{retailerName}"\
                    .format(retailerKey=retailer_key, retailerName=osa_schema)
                self.logger.info(msg)

                # Normally, there should NOT be duplicate alert_ids transferred by the incremental load.
                # But we handle this case here anyway: delete existing alert_ids from the target table.
                # TODO: delete could have performance issue. consider using switch partition
                delete_sql = "DELETE FROM {osaSchema}.{targetTable} " \
                             "WHERE alert_id IN (SELECT alert_id FROM {cmnSchema}.{importTable} )"\
                    .format(targetTable=self.target_table_osa,
                            osaSchema=osa_schema,
                            cmnSchema=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"])
                self.logger.info(delete_sql)
                self.osa_dw_conn.execute(delete_sql)

                # inserting feedback data into final table fact_feedback from processed table.
                sql = """
                INSERT INTO {osaSchema}.{targetTable}
                (EVENT_KEY, RETAILER_KEY, VENDOR_KEY, STORE_VISITED_PERIOD_KEY, PERIOD_KEY,
                 ALERT_ID, STORE_KEY, MERCHANDISER_STORE_NUMBER,
                 STORE_ID, ITEM_KEY, MERCHANDISER_UPC, UPC, MERCHANDISER, STORE_REP, SOURCE,
                 BEGIN_STATUS, ACTION, FEEDBACK_DESCRIPTION,
                 ON_HAND_PHYSICAL_COUNT, ON_HAND_CAO_COUNT
                )
                SELECT stage.EVENT_KEY, stage.RETAILER_KEY, stage.VENDOR_KEY,
                       TO_CHAR(stage.STORE_VISIT_DATE, 'YYYYMMDD')::int AS STORE_VISITED_PERIOD_KEY,
                       stage.PERIOD_KEY,
                       stage.ALERT_ID,
                       store.STORE_KEY AS STORE_KEY,
                       stage.MERCHANDISER_STORE_NUMBER,
                       COALESCE(store.STORE_ID , alert.STOREID, stage.STORE_ID) AS STORE_ID,
                       item.ITEM_KEY AS ITEM_KEY,
                       stage.MERCHANDISER_UPC,
                       COALESCE(item.UPC, alert.UPC, stage.INNER_UPC, stage.MERCHANDISER_UPC) AS UPC,
                       stage.MERCHANDISER, stage.STORE_REP, stage.SOURCE,
                       stage.BEGIN_STATUS, stage.ACTION, stage.FEEDBACK_DESCRIPTION,
                       0 AS ON_HAND_PHYSICAL_COUNT,
                       0 AS ON_HAND_CAO_COUNT
                FROM {cmnSchema}.{importTable} stage
                LEFT JOIN {osaSchema}.FACT_PROCESSED_ALERT alert
                ON stage.alert_id = alert.alert_id AND alert.issuanceid = 0
                AND alert.retailer_key = {retailerKey} AND stage.vendor_key = alert.vendor_key
                INNER JOIN {cmnSchema}.DIM_PRODUCT item
                ON item.retailer_key = {retailerKey} AND alert.vendor_key = item.vendor_key
                AND item.item_key = alert.item_key
                INNER JOIN {cmnSchema}.DIM_STORE store
                ON store.retailer_key = {retailerKey} AND alert.vendor_key = store.vendor_key
                AND store.store_key = alert.store_key
                WHERE stage.retailer_key = {retailerKey}
                """.format(osaSchema=osa_schema,
                           targetTable=self.target_table_osa,
                           cmnSchema=self.dct_sync_data["target_dw_schema"],
                           importTable=self.dct_sync_data["target_dw_table"],
                           retailerKey=retailer_key)
                self.logger.info(
                    "SQL used to load data to related schema. %s" % sql)
                self.osa_dw_conn.execute(sql)

            self.logger.info("Processing feedback ended...")

        except Exception as e:
            self.logger.warning("Process data for RDP {0} failed: {1}".format(
                self._rdp_id, e))
            raise

        finally:
            if self._debug.upper() == 'N':
                _drop_sql = "DROP TABLE IF EXISTS {schemaName}.{importTable};" \
                    .format(schemaName=self.dct_sync_data["target_dw_schema"],
                            importTable=self.dct_sync_data["target_dw_table"])
                self.logger.info(_drop_sql)
                self.osa_dw_conn.execute(_drop_sql)
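
# A hedged usage sketch (assumptions, not from the original source) of the two
# entry points described in the class docstring above. `meta` stands for the
# parsed config.properties dict used throughout these examples; the rdpId,
# vendor_key and retailer_key values are placeholders.
#
# Routine sync of all registered customers from one RDP:
#   Feedback(meta=meta, params={"rdpId": "RDP_AUX"}).main_process()
#
# One-off initialization for a newly on-boarded vendor/retailer pair:
#   Feedback(meta=meta,
#            params={"vendor_key": 342, "retailer_key": 6},
#            init_flag=True).main_process()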
Exemplo n.º 16
0
class DumpMain(object):
    def __init__(self, meta, params, logger=None):
        self.meta = meta
        self.params = params
        self.file_ext = self.params.get("fileExt") if self.params.get(
            "fileExt") else "txt"
        self.zip_flag = True if str(self.params.get(
            "zipFlag", "N")).upper() in ["Y", "YES", "T", "TRUE"] else False

        self._debug = "Y" if str(self.params.get(
            'debug', 'N')).upper() in ["Y", "YES", "T", "TRUE"] else 'N'
        __log_level = 'DEBUG' if self._debug == "Y" else 'INFO'
        self.logger = logger if logger else Logger(log_level="info",
                                                   target="console",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   sql_conn=None)
        self.logger.set_level(log_level=__log_level)
        self.logger.set_keys(log_id="{}".format(self.params["jobId"]))

        self.provision_url = self.meta[
            "api_provision_str"] + "/filesDelivery/layout/attributes?cycle_key="
        self.capacity = Capacity(meta=self.meta)
        self.rsi_username = self.meta.get("fileshare_username")
        self.rsi_folder = self.meta.get("fileshare_folder")
        self.cmn_schema = self.meta.get("db_conn_vertica_common_schema",
                                        "common")

        self.app_conn = MSOperation(meta=self.meta)
        self.vertica_conn = DWOperation(meta=self.meta)

        self.dumper_context = self.get_context()
        self._layout_payload = {}

    def main_process(self):
        """
        Getting AFM related parameters. There are two cases here.
        1. Called by AFM directly:
           Alert delivery is coupled with AFM, so it is preferably called by AFM; parameters then come directly from AFM via self.params.
        2. Called via the REST API:
           In case of failure when called by AFM, we also support the REST API so this can be triggered manually, with params taken from the REST API body.
        :return:
        """
        try:
            # Checking the cancel status from previous step.
            _cancel_flag = self.params.get("cancel_flag", False)
            if _cancel_flag:
                return self.params.get("message")

            self.logger.debug("The context is: %s" % str(self.dumper_context))
            init = Initialize(self.dumper_context)
            afm_params = init.init_process()
            _cycle_key = afm_params.get("cycleKey")

            _retailer_name = self.capacity.get_retailer_schema_name(
                retailer_key=afm_params["retailerKey"])
            self.dumper_context["retailer_name"] = _retailer_name
            self.dumper_context["schema"] = _retailer_name
            self.dumper_context["cycle_key"] = _cycle_key

            self._layout_payload = self._get_layout()
            self._process(afm_params)

        except Warning as e:
            return str(e)

        except Exception as e:
            self.logger.warning(str(e))
            raise

    def get_context(self):
        context = {
            "meta": self.meta,
            "params": self.params,
            "dw_conn": self.vertica_conn,
            "app_conn": self.app_conn,
            "logger": self.logger
        }

        return context

    def _get_layout(self):
        # Getting layout fields for alert delivery.
        _cycle_key = self.dumper_context["cycle_key"]
        _delivery_layout_api = "{0}{1}".format(self.provision_url, _cycle_key)

        self.logger.info(
            "Alert delivery related columns(payload param) is not given, "
            "So just calling provision API: %s to retrieve them." %
            _delivery_layout_api)
        # _header = {"tokenid": "eyJhbGciOiJIUzI1NiJ9.eyJjb29raWVOYW1lIjoicnNpU3NvTmV4dEdlbiIsImNvb2tpZVZhbHVlIjoiQVFJQzV3TTJMWTRTZmN6ZmRuLVVQazA2b2NnRzVWaTlZRFc1cHZZQzF6b3djbXMuKkFBSlRTUUFDTURFQUFsTkxBQkkyTVRBNU1EUTBPRFUwTXpZeU1UWTBORFlBQWxNeEFBQS4qIiwic3RhdHVzIjoic3VjY2VzcyIsInVzZXJJZCI6ImJlbi53dUByc2ljb3JwLmxvY2FsIiwiaWF0IjoxNTQ0Njg5OTY3fQ.AzCgFFHXHo3J1M4fk-17T8fBLwReDQDb4p-DXUcBm_M"}
        _header = {}
        resp = requests.get(_delivery_layout_api, headers=_header, verify=False)
        if resp.text == "invalid token":
            self.logger.error(
                "ERROR: invalid tokenid, Please update the tokenid manually. "
                "Then rerun this script again!!!")

        if resp.status_code != requests.codes.ok or str(
                resp.json().get("status")).lower() != "success":
            self.logger.warning("The response result is: %s" % resp.json())
            self.logger.error("Calling API failed. Refer to API: %s" %
                              _delivery_layout_api)

        # _payload = resp.json()["data"]["payload"]   # e.g. list({"key1":"value1"}, {}, {})
        _payload = resp.json()["data"]

        self.logger.debug("The layout payload is: %s" % _payload)
        return _payload

    def _gen_query(self, delivery_key):
        # getting the payload for given delivery key.
        _payload = self._layout_payload.get(str(delivery_key))["payload"]
        self.logger.debug("The payload is: %s" % str(_payload))

        # only getting data where field preselect is True.
        _payload = list(filter(lambda x: x["preselect"] is True, _payload))
        self.logger.debug("The filtered payload is: %s" % str(_payload))
        if not _payload:
            self.logger.warning(
                "There is no layout configed for this delivery.")
            return None, None

        # Ordering by sequence is required, so convert the list of dicts to a list of tuples to sort the data.
        # Otherwise, the following code will not work correctly.
        # _payload_list should look like: [(1, 'Alert', 'ALERT_ID', 'ALERT ID', None), (), ...]
        # Note: please don't change the order of the fields below when generating _payload_list.
        _payload_list = [(ele["sequence"], ele["dimension_type"], ele["Field"],
                          ele["display_name"], ele["sort_by"])
                         for ele in _payload]
        self.logger.debug("Converted payload is: %s" % str(_payload_list))

        # Fields should be listed by sequence. If the sequence is None, those columns are put last.
        # If multiple elements have a None sequence, their relative order is arbitrary.
        _payload_list.sort(
            key=lambda x: x[0]
            if x[0] is not None else 999)  # Assuming less than 999 columns.
        self.logger.debug("Sorted payload is: %s" % str(_payload_list))

        # display names like ["alert type", ...] shown on the delivery file.
        display_fields_lst = []
        # fields formatted like [alert."alert_type" AS "alert type", ...]
        columns_with_alias = []
        # required table list, e.g. ("alert", "store", ...)
        table_list = set()
        for columns in _payload_list:
            self.logger.debug(
                "Current display name is: \"%s\" for column: %s" %
                (columns[3], columns[2]))
            table_list.add(str(columns[1]).lower())
            if columns[3] is None:  # display name could be None
                display_fields_lst.append(columns[2])
                columns_with_alias.append(columns[1] + '."' + columns[2] +
                                          '" AS "' + columns[2] + '"')
            else:
                display_fields_lst.append(columns[3])
                columns_with_alias.append(columns[1] + '."' + columns[2] +
                                          '" AS "' + columns[3] + '"')

        # if there is no "Alert Date" enabled, then manually added this field to the beginning.
        if "Alert Date".lower() not in list(
                map(lambda x: str(x).lower(), display_fields_lst)):
            display_fields_lst.insert(0, "Alert Date")
            columns_with_alias.insert(0, 'ALERT.period_key AS "Alert Date"')

        _display_fields_str = ",".join([
            '"' + column + '"' for column in display_fields_lst
        ])  # combine them with comma(,)
        _required_columns_prefix_tmp = ",".join(
            [column for column in columns_with_alias])  # combine them
        # Replace the mandatory field alert_type with intervention_name, since it is not a column name in the fact table.
        _required_columns_prefix = re.sub(
            '(?i)' + re.escape('ALERT."ALERT_TYPE"'), 'type.intervention_name',
            _required_columns_prefix_tmp)

        # Keep the columns where sort_by is True (sorting enabled); _payload_list is already ordered by sequence.
        sort_by_list = [ele for ele in _payload_list if ele[4] is True]
        if sort_by_list:
            _sort_columns = ",".join([
                ele[1] + '."' + ele[2] + '" DESC NULLS LAST'
                for ele in sort_by_list
            ])
        else:
            _sort_columns = '1'
        self.logger.debug(_sort_columns)
        _sort_columns = re.sub('(?i)' + re.escape('ALERT."ALERT_TYPE"'),
                               'type.intervention_name', _sort_columns)

        _fdbk_required = "feedback" in table_list
        _item_required = "product" in table_list
        _store_required = True  # mandatory

        # Generating the initial query.
        _init_sql = """
        SELECT {columns}, ROW_NUMBER() OVER(ORDER BY store.STORE_ID, {sortedFields} ) AS rn
        FROM {schema}.FACT_PROCESSED_ALERT alert 
        INNER JOIN {cmnSchema}.alt_meta_intervention type 
        ON alert.InterventionKey = type.Intervention_Key """\
            .format(columns=_required_columns_prefix,
                    sortedFields=_sort_columns,
                    schema=self.dumper_context["schema"],
                    cmnSchema=self.cmn_schema)

        if _fdbk_required:
            _init_sql += """
            LEFT JOIN {schema}.FACT_FEEDBACK Feedback 
            ON alert.alert_id = Feedback.alert_id"""\
                .format(schema=self.dumper_context["schema"])

        if _item_required:
            _init_sql += """
            INNER JOIN {cmnSchema}.dim_product Product 
            ON alert.item_key = Product.item_key
            AND alert.vendor_key = Product.vendor_key 
            AND alert.retailer_key = Product.retailer_key"""\
                .format(cmnSchema=self.cmn_schema)

        if _store_required:
            _init_sql += """
            INNER JOIN {cmnSchema}.dim_store Store 
            ON alert.store_key = store.store_key
            AND alert.vendor_key = store.vendor_key 
            AND alert.retailer_key = store.retailer_key"""\
                .format(cmnSchema=self.cmn_schema)

        # always apply the period_key filter for given vendor/retailer
        _init_sql += """
        WHERE 1=1 AND alert.IssuanceId = 0
        AND (alert.vendor_key, alert.retailer_key, alert.period_key) IN 
            ( SELECT vendor_key, retailer_key, alert_day FROM TMP_RAW_ALERTS_INFO )
            """

        return _display_fields_str, _init_sql

    def _process(self, afm_params):
        try:
            _cycle_key = afm_params.get("cycleKey")

            # Read the configuration from the meta tables in IRIS MSSQL.
            # Get all owners (covering both SVR and RETAILER rules) for the given cycle_key;
            # the delivery file is dumped per owner.
            sql = """
            SELECT d.ID AS DELIVERY_KEY, d.CYCLE_KEY, d.RETAILER_KEY, d.DELIVERY_NAME, d.FILTERS, 
                   d.DELIMITER, d.OWNER, ep.SERVER, ep.EXTRACTION_FOLDER, ep.USERNAME, ep.PASSWORD,
                   ep.MAIL_SUBJECT, ep.MAIL_BODY, ep.MAIL_RECPSCC, ep.MAIL_RECPSTO, ep.DELIVERY_TYPE 
            FROM AP_META_DELIVERIES d 
            INNER JOIN AP_META_ENDPOINTS ep
            ON d.ENDPOINT_ID = ep.ID 
            WHERE d.cycle_key = {0}
            AND d.ENABLED = 'T' AND ep.ENABLED = 'T' 
            """.format(_cycle_key)
            self.logger.info(sql)
            meta_rows = self.app_conn.query(sql)
            self.logger.debug("The meta data is: %s" % str(meta_rows))
            if not meta_rows:
                raise Warning(
                    "There is no endpoint or delivery configured. Please check the meta tables!"
                )

            # There can be multiple owners for the given cycle, each with different filters.
            # Per PM requirements, a separate file is generated for every returned row.
            for meta_data in meta_rows:
                # 1, Getting the initial source query
                _delivery_key = meta_data.DELIVERY_KEY
                required_columns, _init_src_query = self._gen_query(
                    delivery_key=_delivery_key)
                if required_columns is None and _init_src_query is None:
                    self.logger.warning(
                        "No layout appears to be configured for delivery key: %s" %
                        _delivery_key)
                    continue
                self.logger.info("The initial source query is: %s" %
                                 _init_src_query)

                delivery_type = meta_data.DELIVERY_TYPE
                if str.lower(delivery_type) == 'customer':
                    meta_data = meta_data._replace(
                        EXTRACTION_FOLDER=self.rsi_folder,
                        USERNAME=self.rsi_username)

                if meta_data.USERNAME is None:
                    self.logger.warning(
                        "There is no username configured for delivery key: %s" %
                        _delivery_key)
                    continue

                _pmp_pwd = get_password(username=meta_data.USERNAME,
                                        meta=self.meta)
                if _pmp_pwd:
                    meta_data = meta_data._replace(PASSWORD=_pmp_pwd)

                self.logger.info("Start to dump & delivery for meta: %s" %
                                 str(meta_data))
                _src_query = _init_src_query

                # 2, Check whether any filters apply (e.g. alert_type, category).
                # Users may want to dump only certain alert types, so this is configurable.
                # So far, two filter types are supported: alert_type and category.
                # TODO: confirm the filter format with the UI team. Filters are currently configured as JSON,
                # e.g. {"alert_type": "d-void,phantom", "category": "cat1,cat2"}
                _filters_raw = meta_data.FILTERS
                if not _filters_raw:
                    self.logger.info("No filters applied.")
                else:
                    self.logger.info("The filters are: %s" % _filters_raw)
                    _filters = json.loads(str(_filters_raw).lower().strip())
                    alert_type_str = _filters.get(
                        "alert_type", None)  # e.g. phantom,d-void,shelf oos
                    if alert_type_str is not None and str(
                            alert_type_str).strip() != '':
                        alert_type = ','.join(
                            "'" + str(ele).strip() + "'"
                            for ele in str(alert_type_str).split(','))
                        _src_query += " AND type.intervention_name IN ({type})".format(
                            type=alert_type)

                    category_str = _filters.get("category", None)
                    if category_str is not None and str(
                            category_str).strip() != '':
                        category_type = ','.join(
                            "'" + str(ele).strip() + "'"
                            for ele in str(category_str).split(','))
                        _src_query += " AND Product.OSM_CATEGORY IN ({cat_type})".format(
                            cat_type=category_type)

                # The owner format should be: owner1 or owner1,owner2,...
                _owners = meta_data.OWNER
                if not _owners:
                    # owner is a mandatory filter for every delivery.
                    raise ValueError(
                        "There is no owner configured in the delivery meta table")

                _owner_in_str = ",".join("'" + ele.strip() + "'"
                                         for ele in str(_owners).split(","))
                _src_query += " AND alert.owner IN ({owner}) ".format(
                    owner=_owner_in_str)

                _final_src_query = """
                SELECT {columns} FROM ({query}) x ORDER BY rn
                """.format(columns=required_columns, query=_src_query)

                self.logger.info("The final source sql is: %s" %
                                 _final_src_query)

                # delivery file name should be: <delivery_name>_<YYYYMMDD>.<fileExt>. e.g. <delivery_name>_20180101.txt
                curr_folder = os.path.dirname(os.path.realpath(__file__))
                target_filename = meta_data.DELIVERY_NAME + "_" + datetime.datetime.now(
                ).strftime('%Y%m%d')

                # delivery file will be dumped to "<curr_dir>/data" folder temporarily.
                abs_target_filename = curr_folder + os.sep + "data" + os.sep + target_filename + '.' + self.file_ext
                zip_filename = curr_folder + os.sep + "data" + os.sep + target_filename + '.zip'

                # Getting data delimiter. e.g. ','
                delimiter = str(meta_data.DELIMITER).strip()
                if len(delimiter) != 1:
                    raise ValueError("delimiter should be 1 char")

                # start to dump data
                self.dumper = dd.DumpData(context=self.dumper_context)

                # dump data from source db
                self.logger.info("Dumping data into file: %s" %
                                 abs_target_filename)
                _dump_flag = self.dumper.dump_data(
                    src_sql=_final_src_query,
                    output_file=abs_target_filename,
                    delimiter=delimiter)
                self.logger.debug("The dump flag is: %s" % _dump_flag)

                # dump alerts succeeded.
                if _dump_flag is True:
                    self.logger.info("Dumping data is done!")

                    # check the zip flag
                    if self.zip_flag:
                        _flat_file_size = round(
                            os.path.getsize(abs_target_filename) / 1024 / 1024)
                        self.logger.debug("The flat file size is: %s" %
                                          _flat_file_size)
                        self.logger.info("zipping file: %s" %
                                         abs_target_filename)
                        with zipfile.ZipFile(zip_filename, 'w') as z:
                            z.write(abs_target_filename,
                                    os.path.basename(abs_target_filename))

                        abs_target_filename = zip_filename
                        self.logger.info("The zip file name is: %s" %
                                         abs_target_filename)

                    # start to send data file
                    self.logger.info(
                        "Start uploading the delivery file to the destination folder.")
                    self.sender = sd.SendData(context=self.dumper_context)
                    self.sender.delivery_file(meta_data=meta_data,
                                              src_file=abs_target_filename)

                else:
                    self.logger.warning(
                        "No data was returned or the dump failed. "
                        "Refer to the previous log entries for the related source query."
                    )

            self.logger.info("Alert delivery process is done")

        except Warning:
            raise

        except Exception:
            raise

        finally:
            if self.vertica_conn:
                self.vertica_conn.close_connection()
            if self.app_conn:
                self.app_conn.close_connection()
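
The layout handling in _gen_query above sorts the configured fields by sequence, pushes fields with no sequence to the end, and falls back to the raw field name when no display name is set. A minimal sketch of that transformation, using invented payload entries (none of these field names come from the original):

# Sequence-sort and alias-building sketch; the payload entries are invented for illustration.
payload = [
    (2, "Alert", "ALERT_TYPE", "Alert Type", True),
    (None, "Store", "STORE_ID", None, False),   # no sequence -> pushed to the end
    (1, "Alert", "ALERT_ID", "Alert Id", False),
]
payload.sort(key=lambda x: x[0] if x[0] is not None else 999)

display, aliased = [], []
for seq, table, field, display_name, sort_by in payload:
    name = display_name if display_name is not None else field  # fall back to the field name
    display.append(name)
    aliased.append('%s."%s" AS "%s"' % (table, field, name))

print(display)  # ['Alert Id', 'Alert Type', 'STORE_ID']
print(aliased)  # ['Alert."ALERT_ID" AS "Alert Id"', 'Alert."ALERT_TYPE" AS "Alert Type"', 'Store."STORE_ID" AS "STORE_ID"']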
Example No. 17
0
    SEP = os.path.sep
    cwd = os.path.dirname(os.path.realpath(__file__))
    generic_main_file = cwd + SEP + '..' + SEP + 'main.py'
    CONFIG_FILE = cwd + SEP + '..' + SEP + '..' + SEP + 'config' + SEP + 'config.properties'
    exec(open(generic_main_file).read())

    _common_schema = "common"
    _dim_product = "DIM_PRODUCT"
    _dim_store = "DIM_STORE"
    _dim_calendar = "DIM_CALENDAR"
    _meta_intvn = "ALT_META_INTERVENTION"
    _dim_base_dir = meta.get("azure_storage_dim_root")
    _prefix_name = "IRIS_"

    _dw_conn = DWOperation(meta=meta)
    # Getting retailer list from dim table
    retailer_list_in_table = []
    _retailer_sql = "SELECT DISTINCT retailer_key FROM {schema}.{table} where retailer_key in (6, 5240)"\
        .format(schema=_common_schema,
                table=_dim_store)
    res = _dw_conn.query(_retailer_sql)
    for ele in res:
        retailer_list_in_table.append(ele.retailer_key)
    print(retailer_list_in_table)

    capacity = Capacity(meta=meta)
    azure_uploader = UploadToBlobStorage(meta=meta)
    # sql_for_item = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_product)
    # sql_for_store = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_store)
    # sql_for_cal = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_calendar)
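
The commented-out dimension queries above would presumably be handed to the UploadToBlobStorage class shown in the next example; a hedged sketch of that hand-off (the sub-directory naming is an assumption, not taken from the original):

# Hypothetical hand-off of a dimension query to UploadToBlobStorage (defined in the next example).
# The sub-directory naming below is assumed for illustration only.
sql_for_item = "SELECT * FROM {schema}.{table}".format(schema=_common_schema, table=_dim_product)
azure_uploader.upload_azure_main(parq_filename=_prefix_name + _dim_product,
                                 sql=sql_for_item,
                                 azure_base_dir=_dim_base_dir,
                                 azure_sub_dir=_prefix_name + _dim_product)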
Example No. 18
0
class UploadToBlobStorage(object):
    def __init__(self, meta, logger=None, chuck_size=5000000, time_out=3600):
        """
        :param meta:  Parameters from config file
        :param logger:  logger handler
        :param chuck_size: the chunk size (rows) used when reading data with pandas
        :param time_out: timeout in seconds when uploading data to Azure Storage
        """

        self.meta = meta
        self.account_name = self.meta.get("azure_storage_account_name")
        self.container_name = self.meta.get("azure_storage_blob_container")
        self.logger = logger if logger else Logger(log_level="info",
                                                   target="console",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   sql_conn=None)

        self.account_key = get_password(username=self.account_name,
                                        meta=self.meta)
        self.blob_service = BlockBlobService(self.account_name,
                                             self.account_key)
        self.dw_conn = DWOperation(meta=self.meta)

        self.sql = ""
        self.parq_filename = ""
        self.local_path = ""
        self.chuck_size = chuck_size  # 5,000,000 rows per chunk by default
        self.time_out = time_out  # timeout in seconds

    def upload_azure_main(self, parq_filename, sql, azure_base_dir,
                          azure_sub_dir):
        self.parq_filename = parq_filename
        self._prepare_work()
        self.dump_data_to_parquet(sql)
        self.upload_to_blob_storage(azure_base_dir=azure_base_dir,
                                    azure_sub_dir=azure_sub_dir)

    def _prepare_work(self):
        """
        Check if the local dest folder exists: The local dest path should be like: <currdir>/data/20190403/IRIS_DIM_PRODUCT/
        1, if exists, then remove existing dir.
           Meaning this is not the first time run this script for given table & day. That case, we should remove the existing files.
        2, not exists. Then create the folder.
        :return:
        """
        _day = datetime.datetime.now().strftime('%Y%m%d')
        self.local_path = os.path.join(os.path.realpath(os.curdir), "data",
                                       _day, self.parq_filename)
        if os.path.exists(self.local_path):
            shutil.rmtree(self.local_path)
            # Give the filesystem a moment when many files were deleted,
            # otherwise the makedirs call below might fail.
            time.sleep(0.5)

        os.makedirs(self.local_path)

    def dump_data_to_parquet(self, sql):
        self.logger.info(
            "Loading data into pandas DataFrames from the database using sql: %s" %
            sql)
        file_index = 0
        for df in pd.read_sql(sql=sql,
                              con=self.dw_conn.get_connection(),
                              chunksize=self.chuck_size):
            final_parq_name = "%s-PART-%04d.parquet" % (self.parq_filename,
                                                        file_index)
            abs_parq_file = os.path.join(self.local_path, final_parq_name)
            self.logger.info("Dumping DataFrame data into parquet file: %s" %
                             abs_parq_file)
            df.to_parquet(abs_parq_file)
            self.logger.info("Done dumping data into file: %s" % abs_parq_file)
            # csv_file = "%s/test-PART-%04d.csv" % (self.local_path, file_index)
            # df.to_csv(csv_file, index=False)
            file_index += 1

    def clear_blobs(self, folder=None):
        """
        Remove all existing files from azure folder.
        :param folder:
        :return:
        """
        blobs = self.blob_service.list_blobs(
            container_name=self.container_name, prefix=folder)
        for blob in blobs:
            self.logger.info("Deleting blob file: %s from Azure blob storage" %
                             blob.name)
            self.blob_service.delete_blob(container_name=self.container_name,
                                          blob_name=blob.name)

    def upload_to_blob_storage(self,
                               azure_base_dir="DIM_DATA",
                               azure_sub_dir=""):
        _azure_dir = azure_base_dir + "/" + azure_sub_dir
        self.logger.info(
            "Start clearing all existing blobs from Azure under the path: %s"
            % _azure_dir)
        self.clear_blobs(folder=_azure_dir)
        self.logger.info("Done clearing all existing blobs.")

        self.logger.info(
            "Start uploading parquet files to Azure Blob Storage")
        for files in glob.glob(os.path.join(self.local_path, '*.parquet')):
            self.logger.info("Uploading blob file: %s" % files)
            self.blob_service.create_blob_from_path(
                container_name=self.container_name,
                blob_name="{0}/{1}".format(
                    _azure_dir,
                    os.path.basename(files)),  # the sep in Azure is always '/'
                file_path=files,
                timeout=self.time_out)

        self.logger.info("Done uploaded into Azure Blob Storage")
Example No. 19
0
                % (str(i + 1), vendor_key, retailer_key, schema_name))
            body_sqls = []

        i += 1
        current_date_dt = start_date_dt + datetime.timedelta(days=i)

    if body_sqls:
        sql = head_sql + ' UNION '.join(body_sqls)
        dw.execute(sql)
    logger.info(
        'In total %s records for vendor_key=%s, retailer_key=%s have been inserted into %s.DIM_INCIDENT_ID_START'
        % (str(delta), vendor_key, retailer_key, schema_name))
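
The fragment above appears to follow a batch-and-flush pattern: per-record SELECTs are accumulated in body_sqls, combined with UNION and executed once a batch fills up, and the remainder is flushed at the end (Scd.save_to_db further below uses the same idea). A minimal sketch of the pattern; generate_row_selects() and BATCH_SIZE are hypothetical stand-ins for the parts not visible in the fragment:

BATCH_SIZE = 100
body_sqls = []
for row_sql in generate_row_selects():          # hypothetical: yields one "SELECT ..." fragment per record
    body_sqls.append(row_sql)
    if len(body_sqls) == BATCH_SIZE:
        dw.execute(head_sql + ' UNION '.join(body_sqls))
        body_sqls = []
if body_sqls:                                   # flush the remainder, as the fragment above does
    dw.execute(head_sql + ' UNION '.join(body_sqls))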


try:
    dw = DWOperation(meta=meta, logger=logger)

    try:
        check_sql = 'SELECT COUNT(*) FROM %s.DIM_VENDOR_RETAILER WHERE VENDOR_KEY=%s AND RETAILER_KEY=%s' % (
            schema_name, str(options.vendor_key), str(options.retailer_key))
        count = dw.query_scalar(check_sql)
        if count == 0:
            insert_sql = 'INSERT INTO %s.DIM_VENDOR_RETAILER (VENDOR_KEY, RETAILER_KEY) VALUES(%s, %s)' % (
                schema_name, str(options.vendor_key), str(
                    options.retailer_key))
            dw.execute(insert_sql)
    except pyodbc.ProgrammingError:
        logger.info("%s.DIM_VENDOR_RETAILER doesn't exist; "
                    "run updateSQL before this update" %
                    schema_name)
    logger.info(
Example No. 20
0
class Scd(object):
    def __init__(self,
                 sql,
                 actioner,
                 log_detail=True,
                 batch_size=100,
                 meta={},
                 db_type=None,
                 table_schema=None,
                 logger=None):
        self.meta = meta
        self.actioner = actioner
        self.log_detail = log_detail
        self.batch_size = batch_size
        self.logger = logger if logger else Logger(log_level="info",
                                                   vendor_key=-1,
                                                   retailer_key=-1,
                                                   module_name="scd")
        self.sql = sql
        self.raw_sql_splitted = self.sql.split(' ')
        self.raw_sql_splitted_trimed = [
            s for s in self.raw_sql_splitted if s != ''
        ]
        self.db_type = db_type
        self.table_schema = table_schema
        self.logger.info('[SQL] %s' % self.sql)
        self.logger.info('[SQL splitted] %s' % self.raw_sql_splitted)
        self.mssql = MSOperation(meta=self.meta)
        self.vtc = None
        self.saved_records_count = 0

    def set_insert_delete_update(self):
        cmd = self.raw_sql_splitted_trimed[0].upper()
        if cmd in ('DELETE', 'UPDATE', 'INSERT'):
            self.cmd = cmd
        else:
            raise RuntimeError('%s is not supported.' % cmd)

    def locate_keyword(self, keyword, sql_splitted=None):
        # Find the location where the keyword first appears as an actual keyword in the sql.
        # e.g. locating WHERE in:
        #   UPDATE TEST SET COL = ' WHERE ' WHERE ID=1
        # the first ' WHERE ' is inside a string literal, so the index of the 2nd WHERE is returned.
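        # Illustrative trace for that example (token values shown for clarity):
        #   tokens = ['UPDATE', 'TEST', 'SET', 'COL', '=', "'", 'WHERE', "'", 'WHERE', 'ID=1']
        #   the WHERE at index 6 has one "'" before it (odd count)  -> inside a string literal, skipped
        #   the WHERE at index 8 has two "'" before it (even count) -> its index is returned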
        keyword = keyword.upper()
        # Locate the keyword in sql_splitted when it is provided, otherwise in raw_sql_splitted.
        base_seq_splitted = sql_splitted if sql_splitted else self.raw_sql_splitted
        index_keyword = [
            k for k, v in enumerate(base_seq_splitted) if keyword in v.upper()
        ]
        for ind in index_keyword:
            count_of_single_quote_ahead_of_where = sum([
                base_seq_splitted[i].count("'") for i in range(0, ind)
                if "'" in base_seq_splitted[i]
            ])
            i = base_seq_splitted[ind].upper().index(keyword)
            count_of_single_quote_ahead_of_where += base_seq_splitted[ind][
                0:i].count("'")
            if count_of_single_quote_ahead_of_where % 2 == 0:  # the KEYWORD we want to capture is the one has n*2 "'" ahead of it
                return ind
        return -1  # KEYWORD not found

    def locate_keyword_combination(self, keywords):
        # Mainly for parsing MERGE statements; may need to be updated.
        # Find the location of the first keyword and check whether the 2nd, 3rd, ... keywords
        # immediately follow it in the SQL.
        # If all keywords match, return the index of the first keyword; otherwise return -1.
        keywords_list = [
            w.upper().strip() for w in keywords.split(' ') if w.strip() != ''
        ]
        sql_splitted = [*self.raw_sql_splitted]
        base_ind = 0
        self.logger.info('Locating %s' % keywords_list)
        while True:
            self.logger.info('Locating 1st keyword "%s"' % keywords_list[0])
            ind = self.locate_keyword(keywords_list[0], sql_splitted)
            if ind == -1:
                self.logger.warning('Keyword not found.')
                return -1
            following = [
                v.upper().strip() for v in sql_splitted[ind + 1:]
                if v.strip() != ''
            ]
            self.logger.info('Getting following %s', following)
            if following[0:len(keywords_list) - 1] == keywords_list[1:]:
                self.logger.info('Keywords matched!')
                return base_ind + ind
            else:
                self.logger.warning('Keywords not matched.')
                sql_splitted = sql_splitted[ind + 1:]
                base_ind += ind

    def get_merge_where(self):
        # TODO: not done yet. The "UPDATE ... FROM" grammar differs between Vertica and SQL Server,
        # so this needs more work.
        index_using = self.locate_keyword('USING')
        index_on = self.locate_keyword('ON')
        index_when_match = self.locate_keyword_combination('WHEN MATCHED THEN')
        index_when_not_match = self.locate_keyword_combination(
            'WHEN NOT MATCHED THEN')

    def get_where_condition(self):
        index_where = self.locate_keyword('WHERE')
        return '' if index_where == -1 else ' '.join(
            self.raw_sql_splitted[index_where:])

    def get_table_name(self):
        self.set_insert_delete_update()
        if self.cmd == 'UPDATE':
            return self.raw_sql_splitted_trimed[1].upper()
        return self.raw_sql_splitted_trimed[2].upper()

    def get_raw_table_name(self, table_name):
        return table_name.split('.')[-1]

    def get_table_def(self, raw_table_name):
        sql = "SELECT ID, TABLE_NAME, PK, DB_TYPE, TABLE_SCHEMA FROM META_SCD_TABLE_DEF WHERE TABLE_NAME='%s'" % raw_table_name
        if self.db_type:
            sql += " AND DB_TYPE='%s'" % self.db_type
        if self.table_schema:
            sql += " AND TABLE_SCHEMA='%s'" % self.table_schema
        rs = self.mssql.query(sql)
        if len(rs) == 0:
            raise ValueError(
                '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] The table is not found.'
                % (raw_table_name, self.db_type, self.table_schema))
        elif len(rs) > 1:
            self.logger.info('|ID|TABLE_NAME|DB_TYPE|TABLE_SCHEMA|')
            for r in rs:
                self.logger.info(
                    '|%s|%s|%s|%s|' %
                    (r.ID, r.TABLE_NAME, r.DB_TYPE, r.TABLE_SCHEMA))
            raise ValueError(
                '[TABLE_NAME:%s, DB_TYPE:%s, TABLE_SCHEMA:%s] More than one records returned for the table.'
                % (raw_table_name, self.db_type, self.table_schema))
        else:
            if rs[0].DB_TYPE == 'MSSQL':
                self.target_conn = self.mssql
            else:
                self.vtc = DWOperation(meta=self.meta)
                self.target_conn = self.vtc
            return rs[0]

    def get_mssql_table_ddl_def(self, db_name, table_name, table_schema):
        ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE
            FROM %s.INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_NAME = N'%s'
            AND TABLE_SCHEMA = N'%s'
            ORDER BY ORDINAL_POSITION
        ''' % (db_name, table_name, table_schema)
        rs = self.mssql.query(ddl_sql)
        #return dict([('"%s"'%r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs])
        return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs])

    def get_vertica_table_ddl_def(self, table_name, table_schema):
        ddl_sql = '''SELECT COLUMN_NAME, DATA_TYPE
            FROM COLUMNS
            WHERE TABLE_NAME = '%s'
            AND TABLE_SCHEMA = '%s'
            ORDER BY ORDINAL_POSITION
        ''' % (table_name, table_schema)
        rs = self.vtc.query(ddl_sql)
        #return dict([('"%s"'%r.COLUMN_NAME if ' ' in r.COLUMN_NAME else r.COLUMN_NAME, r.DATA_TYPE) for r in rs])
        return dict([(r.COLUMN_NAME, r.DATA_TYPE) for r in rs])

    def save_to_db(self, table_def, recs):
        pk = json.loads(table_def.PK)
        batch_sqls = []
        columns = 'TABLE_DEF_ID, PK_VALUE, ACTION, RECORD, UPDATE_TIME, ACTIONER' if self.log_detail else 'TABLE_DEF_ID, PK_VALUE, ACTION, UPDATE_TIME, ACTIONER'
        counter = 0
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for rec in recs:
            pk_dict = collections.OrderedDict()
            for k in pk:
                pk_dict[k] = rec[k]
            pk_value = json.dumps(pk_dict).replace("'", "''")
            if self.log_detail:
                batch_sqls.append(
                    '''SELECT %s, '%s', '%s', '%s', '%s', '%s' ''' %
                    (table_def.ID, pk_value, self.cmd, json.dumps(rec).replace(
                        "'", "''"), now, self.actioner))
            else:
                batch_sqls.append(
                    '''SELECT %s, '%s', '%s', '%s', '%s' ''' %
                    (table_def.ID, pk_value, self.cmd, now, self.actioner))
            counter += 1
            if len(batch_sqls) % self.batch_size == 0:
                self.mssql.execute(
                    '''INSERT INTO META_SCD_CHANGE_LOG(%s) %s''' %
                    (columns, ' UNION ALL '.join(batch_sqls)))
                batch_sqls = []
                self.logger.info('%s has been logged.' % counter)
        if len(batch_sqls) > 0:
            self.mssql.execute('''INSERT INTO META_SCD_CHANGE_LOG(%s) %s''' %
                               (columns, ' UNION ALL '.join(batch_sqls)))
            self.logger.info('%s has been logged.' % counter)

    @Timer()
    def save_data_to_be_changed(self, table_def, where_condition):
        if table_def.DB_TYPE == 'MSSQL':
            table_schema = 'dbo' if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA
            table_ddl_def = self.get_mssql_table_ddl_def(
                self.meta['db_conn_mssql_dbname'], table_def.TABLE_NAME,
                table_schema)
        else:  # vertica
            table_schema = self.meta[
                'db_conn_vertica_common_schema'] if table_def.TABLE_SCHEMA == '(COMMON)' else table_def.TABLE_SCHEMA
            table_ddl_def = self.get_vertica_table_ddl_def(
                table_def.TABLE_NAME, table_schema)
        if not self.log_detail:  # only dump primary key columns
            pk = json.loads(table_def.PK)
            table_ddl_def = dict([(c, table_ddl_def[c]) for c in table_ddl_def
                                  if c in pk])
        self.logger.info(table_ddl_def)
        query_columns = ','.join([
            '"%s"' % c if ' ' in c else c for c in list(table_ddl_def.keys())
        ])
        pull_data_sql = 'SELECT %s FROM %s.%s %s' % (
            query_columns, table_schema, table_def.TABLE_NAME, where_condition)
        self.logger.info(pull_data_sql)
        rs = self.target_conn.query(pull_data_sql)
        recs = []
        for r in rs:
            rec = {}
            for c in list(table_ddl_def.keys()):
                #rec[c] = str(eval("r."+c)) if 'TIME' not in table_ddl_def[c].upper() and 'DATE' not in table_ddl_def[c].upper() else eval("r."+c).strftime("%Y-%m-%d %H:%M:%S")
                rec[c] = str(getattr(r, c))  # fetch the column value by attribute name (avoids eval)
            recs.append(rec.copy())
        self.saved_records_count = len(recs)
        self.logger.info('%s records will be saved.' %
                         self.saved_records_count)
        self.save_to_db(table_def, recs)

    @Timer()
    def execute_sql(self):
        self.target_conn.execute(self.sql)

    def main(self):
        try:
            try:
                table_name = self.get_table_name()
                raw_table_name = self.get_raw_table_name(table_name)
                table_def = self.get_table_def(raw_table_name)
                saved = False
                if self.cmd != 'INSERT':
                    where_condition = self.get_where_condition()
                    self.save_data_to_be_changed(table_def, where_condition)
                    saved = True
            except Exception:
                self.logger.warning(traceback.format_exc())
                self.logger.warning(
                    'Failed to save the to-be-changed data; proceeding to execute the sql.'
                )
            self.execute_sql()
            return '(%(db_type)s)%(table_schema)s.%(table_name)s - impacted %(row_count)s rows, saved %(status)s' % {
                'db_type': table_def.DB_TYPE,
                'table_schema': table_def.TABLE_SCHEMA,
                'table_name': table_def.TABLE_NAME,
                'row_count': self.saved_records_count,
                'status': 'N/A' if self.cmd == 'INSERT' else
                          ('successfully' if saved else 'unsuccessfully'),
            }
        finally:
            if self.mssql:
                self.mssql.close_connection()
            if self.vtc:
                self.vtc.close_connection()
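
A hedged usage sketch for the Scd class above; the SQL text, actioner, and meta dict are placeholders, and the db_type / table_schema values are assumptions about what META_SCD_TABLE_DEF would hold:

# Hypothetical usage: log the rows an UPDATE will touch, then execute the statement.
# sql, actioner, meta, db_type and table_schema below are placeholder values.
scd = Scd(sql="UPDATE common.DIM_STORE SET STORE_NAME = 'X' WHERE STORE_KEY = 1",
          actioner="some_user",
          meta=meta,
          db_type="VERTICA",
          table_schema="common")
print(scd.main())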