Example #1
    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.fileUtilities.EmptyFolderContents(
                self.localTempDirectory +
                "/QC/")  #delete and recreate the folder
            hqc = HindsightQC(self.logger, self.fileUtilities,
                              self.bcpUtilities, self.localTempDirectory)
            hqc.Get_sql_server_rowcounts("pre")  #Get pre-ETL rowcounts

            #Execute the pre-etl queries
            for sqlFile in self.job["bcpParameters"].get("preETLQueries", []):
                RedshiftUtilities.PSqlExecute(
                    self.fileUtilities.GetApplicationDirectory("Hindsight") +
                    sqlFile, logger)

            for subJob in self.job["bcpParameters"]["subJobs"]:
                if subJob.get("destinationSchema") is None:
                    subJob["destinationSchema"] = self.job["bcpParameters"][
                        "destinationSchema"]
                # Process every sub-job, using the default schema set above when it was missing
                self.ProcessSubJob(subJob)

            #Get SQL Server rowcounts
            hqc.Get_sql_server_rowcounts("post")

            #Execute the post-etl queries to prepare the data post-ETL prior to loading into the production tables
            for sqlFile in self.job["bcpParameters"].get("postETLQueries", []):
                RedshiftUtilities.PSqlExecute(
                    self.fileUtilities.GetApplicationDirectory("Hindsight") +
                    sqlFile, logger)

            #Get Redshift rowcounts
            hqc.Get_redshift_rowcounts("post")

            #Execute the post-etl qc queries
            status = hqc.ValidateETL()

            #Check whether the ETL passed the QC
            #Check 1: inter-version counts. Are the differences beyond a particular threshold
            #Check 2: pre-sql v/s post-redshift. Are the differences beyond a particular threshold
            #If the ETL doesn't pass the QC, do not update/insert the prod tables
            #If the ETL passed the QC, insert into production tables (data, attributes, history)
            if status:
                self.logger.info("ETL good to go")
                for sqlFile in self.job["bcpParameters"].get(
                        "FinalLoadQueries", []):
                    #===========================================================
                    # add a process to backup data/attributes history tables
                    # Download to S3
                    #===========================================================
                    RedshiftUtilities.PSqlExecute(
                        self.fileUtilities.GetApplicationDirectory("Hindsight")
                        + sqlFile, logger)
            else:
                self.logger.warning("Bad ETL. No go!")

            print(hqc.TimeElaspsed())
        except:
            logger.exception(moduleName + " - Exception!")
            raise
Example #2
    def LoadEIANetGen(self, tableSuffix):
        '''
        Performs the final step: load the gzipped CSV file located in S3 into the final table in Redshift.
        '''
        rsConnect = None
        try:
            file_name = self.job["ExcelSheets"][0]["Name"].split(
                '.')[0] + '.csv.gz'
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"] + file_name

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] + tableSuffix,
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
Example #3
    def LoadErcotTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into the final table in Redshift.
        '''
        rsConnect = None
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] + "DAM",
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
Example #4
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        pulls data from each table in the catalog
        '''
        try:
            self.logger.debug(self.moduleName + " -- ProcessCatalogs for  " +
                              catalog["name"] + " starting")
            for tables in catalog["tables"]:
                fname = self.fileUtilities.CreateTableSql(
                    tables, self.fileUtilities.sqlFolder)
                RedshiftUtilities.PSqlExecute(fname, self.logger)

            for tables in catalog["tables"]:
                if tables["type"] == 'working':
                    self.ProcessTables(dbCommon, tables)
            ###
            #  now go back and update any tables that have special characters in them
            ###
            for tables in catalog["tables"]:
                if "specialCharacterScript" in tables:
                    self.ResetSpecialChars(dbCommon, tables)

            for tables in catalog["tables"]:
                if "updateSection" in tables:
                    updScriptName = self.CreateUpdateScript(dbCommon, tables)
                    RedshiftUtilities.PSqlExecute(updScriptName, self.logger)

            self.logger.debug(self.moduleName + " -- ProcessCatalogs for  " +
                              catalog["name"] + " finished")
        except:
            self.logger.exception(
                self.moduleName +
                " - we had an error in ProcessCatalogs for " + catalog["name"])
            raise
Example #5
 def LoadIntoRedshift(self, job):
     '''
     Does the actual loading of data into Redshift
     '''
     self.logger.info("Loading {} into Redshift".format(job["Name"]))
     try:
         rsConnect = RedshiftUtilities.Connect(
             dbname=self.awsParams.redshift['Database'],
             host=self.awsParams.redshift['Hostname'],
             port=self.awsParams.redshift['Port'],
             user=self.awsParams.redshiftCredential['Username'],
             password=self.awsParams.redshiftCredential['Password'])
         RedshiftUtilities.LoadFileIntoRedshift(
             rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
             self.localTempDirectory + "/cleaned/" + job["Name"] + ".CSV",
             self.job["destinationSchema"],
             self.job["tableName"].lower().replace("f1_", "") + job["Name"],
             self.job["fileFormat"], self.job["dateFormat"],
             self.job["outputDelimiter"])
         rsConnect.close()
     except Exception as e:
         self.logger.exception(
             "we had an error in FoxPro.LoadIntoRedshift() while loading data into Redshift"
         )
         self.logger.exception("{}".format(str(e)))
Example #6
    def LoadCSVFile(self, localFilePath, loadName):
        '''
        For each file we need to process, provide the data loader the s3 key
        and destination table name
        '''
        self.logger.info("Loading data into Redshift")
        rsConnect = None
        try:
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                localFilePath, self.job["destinationSchema"],
                self.job["tableName"] + loadName["redshiftTableSuffix"],
                self.job["fileFormat"], self.job["dateFormat"],
                self.job["delimiter"])
        except Exception:
            self.logger.exception("Exception in PGCREIA860.LoadCSVFile")
            self.logger.exception(
                "Error while uploading to table:{}, filePath:{}".format(
                    self.job["tableName"] + loadName["redshiftTableSuffix"],
                    localFilePath))
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
            self.logger.info(self.moduleName +
                             " - Finished Processing S3 file: " +
                             loadName["redshiftTableSuffix"])
Example #7
    def LoadAllFromS3(self, s3Source, tableName):
        '''
        Load the data under the given S3 source into the specified Redshift table.
        '''
        try:
            s3DataSource = "s3://" + self.job["bucketName"] + s3Source

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": tableName,
                    "s3Filename": s3DataSource,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
Example #8
    def ProcessS3File(self, srcFileParameter):
        '''
        For each file we need to process, provide the data loader the s3 key
        and destination table name
        '''
        jobParams = dict(self.job)
        jobParams["s3Filename"] = "s3://" + self.job["bucketName"] + "/" + \
            self.job["s3SrcDirectory"] + "/" + srcFileParameter["s3Filename"]
        jobParams["tableName"] = self.job["tableName"] + \
            srcFileParameter["redshiftTableSuffix"]
        self.logger.info(self.moduleName + " - Processing S3 file: " +
                         jobParams["s3Filename"])

        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                         jobParams, self.logger)
        rsConnect.close()
        self.logger.info(self.moduleName + " - Finished Processing S3 file: " +
                         jobParams["s3Filename"])
Example #9
    def LoadAllFromS3(self):
        '''
        Load all the files from the configured S3 folder into the Redshift table defined in the job.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
Example #10
    def ProcessS3File(self, srcFileParameter):
        '''
        Process each file
        '''
        self.logger.debug(self.moduleName + " -- " + "ProcessS3File" +
                          " starting ")
        s3Key = self.job["s3SrcDirectory"] + "/" + srcFileParameter[
            "s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.localTempDirectory + "/raw/" + fileName

        #----------------------------------------------------------------------
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"], s3Key,
                                       localGzipFilepath)

        # Remove the gz extension
        localExcelFilepath = re.sub(r'\.gz$', '', localGzipFilepath)
        self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

        self.logger.info(self.moduleName + " - Processing Excel file: " +
                         localExcelFilepath)
        self.fileUtilities.DeleteFile(localGzipFilepath)
        fileNameNoExt = fileName.split('.', 1)[0]
        outPutFileName = self.fileUtilities.csvFolder + fileNameNoExt + '.csv'
        xl = ExcelUtilities(self.logger)
        xl.Excel2CSV(localExcelFilepath,
                     srcFileParameter["excelSheetName"],
                     outPutFileName,
                     self.fileUtilities.csvFolder,
                     skiprows=srcFileParameter["skipRows"])
        self.fileUtilities.EmptyFolderContents(self.localTempDirectory +
                                               "/raw/")
        for tables in srcFileParameter["tables"]:
            fname = self.fileUtilities.CreateTableSql(
                tables, self.fileUtilities.sqlFolder)
            RedshiftUtilities.PSqlExecute(fname, self.logger)
        # -----------------------------------------------------------------------------
        self.logger.info(self.moduleName + " - Loading data into Redshift...")
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

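        # NOTE: "tables" in the call below is the last entry of srcFileParameter["tables"]
        # left over from the loop above, so the CSV is loaded into that final table definition.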
        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            outPutFileName, tables["schemaName"], tables["table"],
            self.job["fileFormat"], self.job["dateFormat"],
            self.job["delimiter"])
        # Cleanup
        rsConnect.close()
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)

        self.logger.debug(self.moduleName + " -- " +
                          "ProcessS3File for file: " + s3Key + " finished ")
Example #11
    def LoadCategory(self, srcCategory):
        '''
        Process a single category configured in the categories dictionary in the jobConfig.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " starting ")
            processingFile = self.DownloadFile(srcCategory)
            fileOut = processingFile.replace(".dbf", ".txt")

            dbfUtils = DBFUtilities(self.logger)
            dbfUtils.ConvertToCSV(processingFile, fileOut, srcCategory["delimiter"], False)

            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
            self.logger.info(self.moduleName + " - Loading file " + fileOut + "...")

            RedshiftUtilities.LoadFileIntoRedshift(rsConnect,
                                                   self.awsParams.s3,
                                                   self.logger,
                                                   self.fileUtilities,
                                                   fileOut,
                                                   self.job["destinationSchema"],
                                                   self.job["tableName"] + "_" + srcCategory["srcCategory"],
                                                   srcCategory["fileFormat"],
                                                   srcCategory["dateFormat"],
                                                   srcCategory["delimiter"])
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " finished ")
        except Exception as err:
            self.logger.error(self.moduleName + " - Error while trying to load category...")
            raise Exception(str(err))
Example #12
    def ProcessRequest(self):
        '''
        Process steps:
        pull the files from the share and place them in the raw folder
        '''
        try:
            rawFolder = self.localTempDirectory + '/raw/'
            flist = self.DownloadFile(rawFolder)
            xl = ExcelUtilities(self.logger)

            outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
            for fl in flist:
                xl.Excel2CSV(rawFolder + fl,
                             'Sheet1',
                             outPutFileName,
                             self.fileUtilities.csvFolder)
                outputGZ = self.fileUtilities.gzipFolder + self.moduleName + '.csv.gz'

                self.fileUtilities.GzipFile(outPutFileName, outputGZ)
                self.fileUtilities.EmptyFolderContents(
                    self.fileUtilities.csvFolder)

            for tables in self.job["tables"]:
                fname = self.fileUtilities.CreateTableSql(
                    tables, self.fileUtilities.sqlFolder)
                RedshiftUtilities.PSqlExecute(fname, self.logger)
            self.BulkUploadToS3()
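            # NOTE: "tables" below is the last entry left over from the loop above,
            # so LoadData targets that final table definition.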
            self.LoadData(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessRequest")
            raise
Example #13
File: IEA.py  Project: eulertech/backup
 def AsyncBuildTables(self, proc):
     '''
     build the tables in RedShift
     '''
     try:
         ###
         ##  first create the file so we can use it
         ###
         self.logger.info(self.moduleName + " - " + proc["processname"] +
                          " - SQL tables starting.")
         sqlTableCreationScript = ApplicationBase.BuildTableCreationScriptTable(
             self,
             proc["sqlscript"],
             proc["processname"],
             templateFolder="sql",
             sqlFolder="sql")
         ###
         #  now we create the table from the file created
         ###
         RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
         self.logger.info(self.moduleName + " - " + proc["processname"] +
                          " - SQL tables created finished.")
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in AsyncBuildTables")
         raise
Example #14
def CompareResults():
    '''
    Compare results between stage and prod
    '''
    logger = None
    # The assignments below override the earlier stage-vs-prod settings; the originals are
    # kept (commented out) for reference.
    # schemas = ["eaa_dev.cfl_", "eaa_prod."]
    schemas = ["eaa_dev.cfl_", "eaa_dev."]
    print("Processing.  Compare results between schemas: " + schemas[0] + " and " + schemas[1])
    # tables = ["auto_lv_production", "auto_lv_sales", "auto_parc", "cftc", "eia_pet_imports_series_attributes", "eia_pet_imports_series_data",
    #           "eia_pet_series_attributes", "eia_pet_series_data", "eia_steo_series_attributes", "eia_steo_series_data",
    #           "totem", "opis_retail_price", "opis_retail_volume", "chemicals", "enp", "rigcount"]
    tables = ["eia_pet_imports_series_attributes", "eia_pet_imports_series_data",
              "eia_pet_series_attributes", "eia_pet_series_data", "eia_steo_series_attributes", "eia_steo_series_data"]

    awsParams = ConfigureAWS.ConfigureAWS()
    awsParams.LoadAWSConfiguration(logger)
    rsConnect = psycopg2.connect(dbname=awsParams.redshift['Database'], host=awsParams.redshift['Hostname'], port=awsParams.redshift['Port'],
                                 user=awsParams.redshiftCredential['Username'], password=awsParams.redshiftCredential['Password'])

    for table in tables:
        recCount = {}
        for schema in schemas:
            redshiftTableName = schema + table
            try:
                recCount[schema] = RedshiftUtilities.GetRecordCount(
                    rsConnect, redshiftTableName)
            except psycopg2.ProgrammingError:
                recCount[schema] = -1 # Skip this table and mark count as -1

        diff = recCount[schemas[0]] - recCount[schemas[1]]
        flag = " *** Count Different! ***" if diff != 0 else ""
        print("Table: " + table + ", " + schemas[0] + ": " + str(recCount[schemas[0]]) +
              ", " + schemas[1] + ": " + str(recCount[schemas[1]]) + ", diff: " + str(diff) + flag)

    rsConnect.close()
Example #15
    def PrepODATACat(self, paramsList, filelocs, currProcId):
        '''
        gets a list of the type of data that we are going to pull and the fields for each
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "PrepODATACat " +
                              " started ")

            processTableArray = []
            endpoint = self.job["ODATA"]["endpoint"]

            muHelper = MagellanUtilities()
            muHelper.logger = self.logger
            commonParams = {}
            commonParams["sqlFolder"] = self.fileUtilities.sqlFolder
            commonParams["csvFolder"] = self.fileUtilities.csvFolder
            commonParams["gzipFolder"] = self.fileUtilities.gzipFolder

            muHelper.commonParams = commonParams

            for catalog in self.job["ODATA"]["Catalogs"]:
                if catalog["execute"] == "Y":
                    passparms = {
                        "endpoint": endpoint,
                        "name": catalog["name"],
                        "tables": catalog["tables"]
                    }
                    processTableArray.append(passparms)
                    for tblJson in catalog["tables"]:
                        fname = self.fileUtilities.CreateTableSql(
                            tblJson, self.fileUtilities.sqlFolder)
                        RedshiftUtilities.PSqlExecute(fname, self.logger)
                    self.GatherFields(catalog["tables"])
                    self.filter, fromDate = self.CreateFilter(
                        catalog, paramsList)
                    self.logger.debug(fromDate)
                    self.PullOdata(passparms, fromDate)
                    for tblJson in catalog["tables"]:
                        if "destName" in tblJson:
                            self.MoveToS3(tblJson["s3subfolder"][:4], fromDate,
                                          tblJson["s3subfolder"])
                            self.LoadData(fromDate, tblJson["s3subfolder"],
                                          tblJson)
                    instanceParams = json.dumps({"lastrun": fromDate,
                                                 "daysback": self.job["daysback"]})
                    if self.etlUtilities.SetInstanceParameters(filelocs["tblEtl"]["table"],
                                                               currProcId,
                                                               instanceParams) is not True:
                        self.logger.info(self.moduleName +
                                         " - we could not set the instance.")
                    self.UpdateTable(filelocs["tblEtl"]["schemaName"],
                                     filelocs["tblEtl"]["table"],
                                     catalog["tables"], currProcId)

            self.logger.debug(self.moduleName + " -- " + "PrepODATACat " +
                              " finished ")
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in PrepODATACat")
            raise
Example #16
File: OPEC.py  Project: eulertech/backup
    def LoadCategory(self, srcCategory):
        '''
        Process a single category configured in the categories dictionary in the jobConfig.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" +
                              " starting ")
            processingFile = self.DownloadFile(srcCategory)
            processingCSV = self.GetCSVFile(processingFile, srcCategory)
            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

            self.logger.debug(self.moduleName + " - Loading file " +
                              processingCSV + "...")

            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                processingCSV, self.job["destinationSchema"],
                self.job["tableName"] + "_" + srcCategory["srcCategory"],
                srcCategory["fileFormat"], srcCategory["dateFormat"],
                srcCategory["delimiter"])
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" +
                              " finished ")
        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load category...")
            raise Exception(str(err))
Example #17
File: JODI.py  Project: eulertech/backup
    def ProcessCategory(self, rsConnect, srcCategory):
        '''
        Executes the processing for a single category configured...
        '''
        url = srcCategory["url"]
        self.logger.info(self.moduleName + " - Processing url: " + url)

        localFilepath = self.localTempDirectory + "/" + ntpath.basename(
            srcCategory["url"])

        fileDownload = urllib.URLopener()
        fileDownload.retrieve(url, localFilepath)

        self.fileUtilities.UnzipFile(localFilepath, self.localTempDirectory)
        localFilepath = self.localTempDirectory + "/" + srcCategory[
            "unzipFilename"]

        redshiftDestTable = self.job["tableName"] + srcCategory[
            "redshiftTableSuffixOrigin"]

        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            localFilepath, self.job["destinationSchema"], redshiftDestTable,
            self.job["fileFormat"], srcCategory["dateFormat"],
            self.job["delimiter"])
Example #18
File: IEA.py  Project: eulertech/backup
    def AsyncLoadFilesToRedShift(self, proc):
        '''
        load files into RedShift
        '''
        try:
            for pFile in proc["processfile"]:
                pFileNoPath = pFile.replace('/', '_')
                self.logger.debug(self.moduleName + " -- " +
                                  "AsyncLoadFilesToRedShift for " + pFile +
                                  " starting ")
                rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
                outputfileName = self.localTempDirectory + '/scrubbed/' + 'scrub_' + pFileNoPath
                rsTable = 'working_' + proc["processname"]

                RedshiftUtilities.LoadFileIntoRedshift(
                    rsConnect, self.awsParams.s3, self.logger,
                    self.fileUtilities, outputfileName,
                    self.job["destinationSchema"], rsTable,
                    self.job["fileFormat"], self.job["dateFormat"],
                    self.job["delimiter"])
                rsConnect.close()
                self.logger.debug(self.moduleName + " -- " +
                                  "AsyncLoadFilesToRedShift for " + pFile +
                                  " finished ")
        except:
            self.logger.exception(
                self.moduleName +
                "- we had an error in AsyncLoadFilesToRedShift ")
            raise
Example #19
    def LoadData(self, folderName, subFolder, tblJson):
        '''
        load the data from s3 into RedShift
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "LoadData" +
                              " starting ")
            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

            s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"] +\
                        "/" + folderName + '/' + subFolder

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": tblJson["table"],
                    "s3Filename": s3folder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")
            self.logger.info(self.moduleName + " - Finished loading s3 data to " +\
                             self.job["destinationSchema"] + '.' +  tblJson["table"])
            rsConnect.close()
            self.logger.debug(self.moduleName + " -- " + "LoadData" +
                              " finished ")
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in LoadData")
            raise
Example #20
    def ExecuteCreateTableScript(self, serieT, tableT):
        '''
        Executes the federated time-series table creation script
        '''
        tableName = self.job['federatedTables'][serieT][tableT]['name']

        tb = {}
        tb['fields'] = []

        tb['schemaName'] = self.job['destinationSchema']
        tb['table'] = tableName
        tb["new"] = self.job['federatedTables'][serieT][tableT]['new']
        tb['sortkey'] = self.job['federatedTables'][serieT]['sortkey']

        if 'distkey' in self.job['federatedTables'][serieT]:
            if self.job['federatedTables'][serieT]['distkey'] != '':
                tb['distkey'] = self.job['federatedTables'][serieT]['distkey']

        for field in self.job['federatedTables'][serieT]['fields']:
            if 'exclusiveTo' in field:
                if tableT in field['exclusiveTo']:
                    tb['fields'].append(field)
            else:
                tb['fields'].append(field)

        fname = self.fileUtilities.CreateTableSql(tb, self.localTempDirectory)
        RedshiftUtilities.PSqlExecute(fname, self.logger)
Example #21
 def LoadData(self, iso, localFilePath, fp):
     '''
     Method to load ISO data into Redshift
     '''
     self.logger.info("Loading ISO data into Redshift")
     rsConnect = RedshiftUtilities.Connect(
         dbname=self.awsParams.redshift['Database'],
         host=self.awsParams.redshift['Hostname'],
         port=self.awsParams.redshift['Port'],
         user=self.awsParams.redshiftCredential['Username'],
         password=self.awsParams.redshiftCredential['Password'])
     RedshiftUtilities.LoadFileIntoRedshift(
         rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
         localFilePath + fp, self.job["destinationSchema"],
         self.job["tableName"] + iso["Name"], self.job["fileFormat"],
         self.job["dateFormat"], self.job["delimiter"])
Example #22
File: IEA.py  Project: eulertech/backup
 def AsyncBuildCXR(self, proc, basesql):
     '''
     actually build the cross reference tables
     '''
     try:
         self.logger.debug(self.moduleName + " -- " + "AsyncBuildCXR for " +
                           proc["name"] + " starting ")
         ###
         ##  first create the file so we can use it
         ###
         sqlTableCreationScript = ApplicationBase.BuildTableCreationScriptTable(
             self,
             basesql,
             proc["name"],
             templateFolder="sql",
             sqlFolder="sql")
         ###
         ##  execute DDL so we now have the blank table
         ###
         RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
         self.logger.debug(self.moduleName + " -- " + "AsyncBuildCXR for " +
                           proc["name"] + " finished ")
     except:
         self.logger.exception(self.moduleName +
                               "- we had an error in AsyncBuildCXR")
         raise
Example #23
    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage S3 bucket into Redshift
        '''
        rsConnect = None

        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3, {
                        "destinationSchema":
                        self.job["destinationSchema"],
                        "tableName":
                        self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename":
                        s3DataFolder + "/" + dsScript["tableSuffix"] +
                        ".CSV.GZ",
                        "fileFormat":
                        self.job["fileFormat"],
                        "dateFormat":
                        self.job["dateFormat"],
                        "delimiter":
                        self.job["delimiter"]
                    }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
Example #24
    def LoadAllData(self):
        '''
        Process:
        1)  push Attribute and data gz files to S3
        2)  load data into Redshift from S3
        '''
        self.CreateFolders("N")  #  this just sets the variable we will need
        self.fileUtilities = FileUtilities(self.logger)

        rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                              host=self.awsParams.redshift['Hostname'],
                                              port=self.awsParams.redshift['Port'],
                                              user=self.awsParams.redshiftCredential['Username'],
                                              password=self.awsParams.redshiftCredential['Password'])

        for table in self.job["tables"]:
            ###
            #  first create zip files for all we want to send to S3
            ###
            s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"]
            if table["type"] == "attributes":
                sourceFolder = self.gzipFolder + "attr"
                destFolder = s3folder + "/attribute"
            else:  # data types
                sourceFolder = self.gzipFolder + "data"
                destFolder = s3folder + "/data"

            S3Utilities.CopyItemsAWSCli(sourceFolder,
                                        destFolder,
                                        '''--recursive --quiet --include "*.gz"''')

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": table["name"],
                                                 "s3Filename": destFolder,
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")

#            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(destFolder,
#                                                        '''--recursive --quiet --include "*.gz"''')

        rsConnect.close()
Example #25
    def Start(self, logger, moduleName, filelocs):
        '''
        main routine
        '''
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)
            ###
            #  set up to run create folder
            ###
            self.fileUtilities.moduleName = self.moduleName
            self.fileUtilities.localBaseDirectory = self.localTempDirectory
            self.fileUtilities.CreateFolders(self.job["folders"])
            self.fromDate = self.GetFromDate()
            ###
            localFilepath = self.GetMostRecentFile(self.job["foldertoscan"])
            #            localFilepath = r'C:\tmp\IHS Markit Outlook for Global Oil Market Fundamentals - September 2017.xlsx'
            for tables in self.job["tables"]:
                fname = self.fileUtilities.CreateTableSql(
                    tables, self.fileUtilities.sqlFolder)
                RedshiftUtilities.PSqlExecute(fname, self.logger)
                outPutFileName = self.fileUtilities.csvFolder +\
                                 self.fromDate +\
                                 "_" + tables["table"]  + '.csv'
                outputGZ = self.fileUtilities.gzipFolder + self.fromDate +\
                           "_" + tables["table"]  + '.csv.gz'
                tableJson = tables
            xl = ExcelUtilities(logger)
            if sys.version[0] == '3':
                csvfile = open(outPutFileName, 'w', newline='')
            elif sys.version[0] == '2':
                csvfile = open(outPutFileName, 'wb')
            csvWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)

            if localFilepath is not None:
                self.ProcessFile(xl, localFilepath, csvWriter)

            csvfile.close()

            self.fileUtilities.GzipFile(outPutFileName, outputGZ)
            self.BulkUploadToS3()
            self.LoadData(tableJson)

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"],\
                                             currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            raise Exception(str(err))
Example #26
    def CreatePostgresTables(self, psConnect):
        '''
        Create the tables in the local Postgres database. Existing data will be removed.
        '''
        sqlTableCreationScript = self.BuildTableCreationScript(self.job['psSqlScript'])

        # The following code will recreate all the tables.  EXISTING DATA WILL BE DELETED
        RedshiftUtilities.ExecuteSQLScript(psConnect, sqlTableCreationScript, self.logger)
        self.logger.info(self.moduleName + " - SQL tables created.")
Example #27
    def ExecuteCreateTable(self):
        '''
        Checks whether the table needs to be created
        '''
        tb = self.job['ddl']
        tb['schemaName'] = self.job['destinationSchema']

        fname = self.fileUtilities.CreateTableSql(tb, self.localTempDirectory)
        RedshiftUtilities.PSqlExecute(fname, self.logger)
Example #28
    def UnloadData(self, s3TempFolder):
        '''
        Unloads only the names or series keys to s3
        '''
        unloadQuery = "select distinct name from " + self.job['destinationSchema'] + "." + \
                        self.job['federatedTables']['seriesAttributes']['history']['name']

        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.UnloadDataToS3(rsConnect, self.awsParams.s3, {
            "query": unloadQuery,
            "s3Folder": s3TempFolder
        }, self.logger)
Example #29
 def PostLoadETL(self):
     '''
     Create the L2 tables post data load
     '''
     postLoadScriptTemplate = self.job.get("PostLoadScript")
     if postLoadScriptTemplate is not None:
         sqlTableCreationScript = super(PGCRERCOT, self).BuildTableCreationScript(postLoadScriptTemplate)
         RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
         self.logger.info(self.moduleName + " - SQL tables created.")
Example #30
    def CreateTables(self, sqlTemplateFilename):
        '''
        Create the actual tables
        '''
        sqlTableCreationScript = self.BuildTableCreationScript(
            sqlTemplateFilename)

        # The following code will recreate all the tables.  EXISTING DATA WILL BE DELETED
        RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
        self.logger.info(self.moduleName + " - SQL tables created.")
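
Most of the snippets above repeat the same sequence with minor variations: open a connection with RedshiftUtilities.Connect, run RedshiftUtilities.LoadDataFromS3 (or LoadFileIntoRedshift) against values from self.job, then close the connection. The following is a minimal, hypothetical consolidation of that pattern, written only against the RedshiftUtilities calls and self.job / self.awsParams attributes seen in the examples; the method name LoadS3FolderIntoRedshift is illustrative and not part of the original codebase.

    def LoadS3FolderIntoRedshift(self, s3DataFolder, tableName):
        '''
        Illustrative helper (not from the original repo): connect, COPY from S3,
        and always close the connection.
        '''
        rsConnect = None  # initialise so the finally block is safe if Connect() raises
        try:
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": tableName,
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")
        except Exception:
            self.logger.exception(self.moduleName +
                                  " - error while loading " + tableName +
                                  " from " + s3DataFolder)
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()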