Example #1
class Vantage(ApplicationBase):
    '''
    This class is used to get the Vantage data from the IHS Vantage database, transform it and load it into Redshift.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(Vantage, self).__init__()

        self.awsParams = ""
        self.packedFolder = None
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkExtractAll(self):
        '''
        Controls the flow through the different data sets coming from the Vantage DB.
        '''
        try:
            for dsScript in self.job["extractingScripts"]:
                self.logger.info(self.moduleName + " Starts extracting " +
                                 dsScript["tableSuffix"] + " data...")

                self.bcpUtilities.RunBCPJob(
                    self.job["mssqlLoginInfo"],
                    self.job["bcpUtilityDirOnLinux"],
                    self.fileUtilities.LoadSQLQuery(self.location +
                                                    dsScript["scriptFile"]),
                    self.localTempDirectory + "/Raw/" +
                    dsScript["tableSuffix"] + ".CSV", self.job["delimiter"])
        except Exception as err:
            self.logger.error(
                "Error while trying to Bulk Extract all. Message: " +
                err.message)
            raise

    def TransformAndPackAll(self):
        '''
        Compresses the CSV files created by the extraction step.
        '''
        rawFiles = self.fileUtilities.ScanFolder(self.rawFolder, None, "CSV")

        try:
            for rFile in rawFiles:
                rFileFull = self.rawFolder + "/" + rFile

                self.logger.info(self.moduleName +
                                 " started compressing file: " + rFile)

                self.fileUtilities.GzipFile(
                    rFileFull, self.packedFolder + "/" + rFile + ".GZ")

                self.fileUtilities.RemoveFileIfItExists(rFileFull)
        except Exception as err:
            self.logger.error(self.moduleName +
                              " Error while compressing raw files. Message: " +
                              err.message)
            raise

    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage S3 bucket into Redshift
        '''
        rsConnect = None

        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3, {
                        "destinationSchema":
                        self.job["destinationSchema"],
                        "tableName":
                        self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename":
                        s3DataFolder + "/" + dsScript["tableSuffix"] +
                        ".CSV.GZ",
                        "fileFormat":
                        self.job["fileFormat"],
                        "dateFormat":
                        self.job["dateFormat"],
                        "delimiter":
                        self.job["delimiter"]
                    }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()

    def BulkUploadToS3(self):
        '''
        Uploads all GZIP files created to S3 so they can be loaded into Redshift later.
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawFolder = self.localTempDirectory + "/Raw"

            self.fileUtilities.RemoveFolder(self.packedFolder)
            self.fileUtilities.RemoveFolder(self.rawFolder)

            self.fileUtilities.CreateFolder(self.packedFolder)
            self.fileUtilities.CreateFolder(self.rawFolder)

            self.BulkExtractAll()
            self.TransformAndPackAll()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            raise Exception(err.message)
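
# A minimal driver sketch (an assumption, not part of the original module): the
# framework presumably instantiates the class and calls Start with a logger, the
# module name and a file-locations dictionary whose exact shape is not shown here.
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("Vantage")
    filelocs = {}  # hypothetical placeholder for the framework's file-location config
    Vantage().Start(logger, "Vantage", filelocs)
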
Example #2
class PGCRAirMarkets(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarkets, self).__init__()

        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []

        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")

        S3Utilities.SyncFolderAWSCli(
            "s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
            self.rawDataFolder, True)

    def CleanUpAndPack(self):
        '''
        Main control loop: iterates through the folders, cleaning the files and packing them for upload to S3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)

        for rawFolderName in rawFolders:
            self.toPackFiles = []

            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compresses the files for a given folder; currently only the emissions file is packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")

        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(
                csvFile) + ".gz"

            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up for the emissions files, replacing bad characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        fileList = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] in fileName
        ]
        fileListToDel = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] not in fileName
        ]

        self.logger.info("Cleaning up files for folder " + rawFolderName +
                         "...")

        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            toPackFileName = self.tempFolder + "/" + self.job[
                "srcFileNamePrefix"] + "_" + rawFolderName + ".csv"

            self.fileUtilities.ReplaceIterativelyInFile(
                fullFileName, toPackFileName, [{
                    r"[^\x00-\x76]+": ""
                }, {
                    "'": "`"
                }])

            self.logger.info("toPackFileName" + toPackFileName)
            fc = self.fileUtilities.GetFileContents(toPackFileName)
            if fc is not None and len(fc) > 2:
                self.logger.info("2nd line from the file")
                self.logger.info(fc[1].strip())
            else:
                self.logger.info("File is empty")

            lineCount = self.fileUtilities.GetLineCount(toPackFileName)
            if lineCount > 0:
                self.fileUtilities.RemoveLines(toPackFileName,
                                               self.job["removeLines"])
                self.toPackFiles.append(toPackFileName)
            else:
                self.logger.warning("Empty file: " + toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" +
                                          airMarketFile)
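
        # Sketch of what the two-pass replacement above amounts to (an assumption
        # about the in-house ReplaceIterativelyInFile helper), applied per line:
        #   import re
        #   line = re.sub(r"[^\x00-\x76]+", "", line)   # drop characters outside \x00-\x76
        #   line = line.replace("'", "`")               # swap single quotes for backticks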

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file...
        '''
        filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job[
            "inputZipFileName"]

        try:
            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:
            self.logger.info("Unable to decompress file: " + filePath +
                             " Error: " + err.message)

    def UploadPackedToS3(self):
        '''
        Uploads all packed files to S3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step: loads the files located in S3 into the target table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema":
                    self.job["destinationSchema"],
                    "tableName":
                    self.job["tableName"] + self.job["srcFileNamePrefix"],
                    "s3Filename":
                    s3DataFolder,
                    "fileFormat":
                    self.job["fileFormat"],
                    "dateFormat":
                    self.job["dateFormat"],
                    "delimiter":
                    self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.tempFolder = self.localTempDirectory + "/Temp"
            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawDataFolder = self.localTempDirectory + "/RawData"

            self.CleanWorkingFolders()
            self.SynchronizeSourceFolder()
            self.CleanUpAndPack()
            self.UploadPackedToS3()
            self.LoadAirMarketsTables()
        except Exception:
            logger.exception(moduleName + " - Exception!")
            raise
Example #3
class Consensus(ApplicationBase):
    '''
    This class is used to control the data load process from different OPEC file sources.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(Consensus, self).__init__()

        self.awsParams = ""
        self.rawFolder = None
        self.csvFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(
            self.job["srcSharedFolder"])

        self.logger.info(self.moduleName +
                         " - Downloading files from shared folder...")

        for fileName in sharedFiles:
            if (fileName[:2] == self.job["fileNamePrefix"]
                ) and os.path.splitext(fileName)[1] in self.job["validExts"]:
                shutil.copyfile(
                    os.path.join(self.job["srcSharedFolder"], fileName),
                    self.rawFolder + "/" + fileName)

    def DfCleanUp(self, df, surveyDateVal):
        '''
        Cleans up the DataFrame read from the Excel worksheet and reshapes it for loading.
        '''
        bankNameColumnIn = "Unnamed: 0"
        surveyDateColName = "surveyDate"

        for colName in self.job["columnsToDrop"]:
            df = df.drop(colName, 1)

        df = df.drop(self.job["dropAfterHeader"], 0)

        for colName in df.head(0):
            if not isinstance(colName,
                              datetime) and colName != bankNameColumnIn:
                df = df.drop(colName, 1)

        df = df.assign(surveyDate=surveyDateVal)

        newOrder = [surveyDateColName]

        for colName in df.head(0):
            if colName != surveyDateColName:
                newOrder.append(colName)

        df = df[newOrder]
        df = df.melt(id_vars=[surveyDateColName, bankNameColumnIn])

        return df
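
    # Shape illustration for the melt step above (a standalone sketch; the column
    # names and values below are made up, and the real frame carries one datetime
    # column per forecast month):
    #   wide = pandas.DataFrame({"surveyDate": ["2017-06-01"],
    #                            "Unnamed: 0": ["Bank A"],
    #                            datetime(2018, 1, 1): [52.3]})
    #   wide.melt(id_vars=["surveyDate", "Unnamed: 0"])
    #   # -> one row per (surveyDate, bank, forecast month) with "variable"/"value" columns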

    def LoadAllFromS3(self):
        '''
        Loads the CSV data from the S3 folder into Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise

    def GetData(self, rawFileName, mode=None):
        '''
        Returns the data frame or survey date
        '''

        if mode == "getSurveyDate":
            skipRows = 0
        else:
            skipRows = self.job["skipRows"]

        df = pandas.read_excel(rawFileName,
                               sheetname=self.job["worksheetName"],
                               index_col=None,
                               na_values=["na"],
                               skiprows=skipRows,
                               skip_footer=self.job["skipFooter"])

        if mode == "getSurveyDate":
            valRerturn = df.iloc[self.job["surveyDateRow"] - 2][0]
        else:
            valRerturn = df

        return valRerturn

    @staticmethod
    def FormatSurveyDate(emFile):
        '''
        Returns the date based on the file's name
        '''

        surveyDateColValue = os.path.splitext(emFile)[0]
        surveyDateColValue = surveyDateColValue[2:]
        surveyDateColValue = surveyDateColValue.replace("CF", "")
        surveyDateColValue = str(surveyDateColValue[3:]) + "-" + str(
            list(calendar.month_abbr).index(surveyDateColValue[:3])) + "-01"

        return surveyDateColValue
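
    # Worked example (assuming a raw file named like "EMJan2017CF.xlsx", where the
    # first two characters are the configured file-name prefix):
    #   os.path.splitext -> "EMJan2017CF"; [2:] -> "Jan2017CF"; drop "CF" -> "Jan2017"
    #   result: "2017" + "-" + str(index of "Jan" in calendar.month_abbr) + "-01" -> "2017-1-01"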

    def ProcessFiles(self):
        '''
        Controls the workflow for converting, cleaning up and packing the input files.
        '''
        filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)

        for emFile in filesToProcess:
            self.logger.info(self.moduleName + " - Processing file: " + emFile)

            rawFileName = self.rawFolder + "/" + emFile
            csvFilename = self.csvFolder + "/" + os.path.splitext(
                emFile)[0] + ".csv"

            try:
                surveyDatedt = self.GetData(rawFileName, "getSurveyDate")

                if isinstance(surveyDatedt, float):
                    surveyDatedt = self.FormatSurveyDate(emFile)

                df = self.GetData(rawFileName)
                df = self.DfCleanUp(df, surveyDatedt)

                df.to_csv(csvFilename,
                          header=False,
                          sep=str(self.job["delimiter"]),
                          encoding='utf-8',
                          index=False)

                self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
                self.fileUtilities.DeleteFile(csvFilename)
            except XLRDError:
                self.logger.info(self.moduleName + " - No tab named '" +
                                 self.job["worksheetName"] + "' in " + emFile)
            except Exception:
                self.logger.error(self.moduleName +
                                  " - Error while trying to process " + emFile)
                raise
            finally:
                FileUtilities.RemoveFileIfItExists(rawFileName)

    def CheckWorkingFolders(self):
        '''
        Removes the working folders if they exist and re-creates them.
        '''
        self.logger.info(self.moduleName + " - Checking on working folders...")

        FileUtilities.RemoveFolder(self.rawFolder)
        FileUtilities.RemoveFolder(self.csvFolder)
        FileUtilities.CreateFolder(self.rawFolder)
        FileUtilities.CreateFolder(self.csvFolder)

    def BulkUploadToS3(self):
        '''
        Uploads all GZIP files created to S3 so they can be loaded into Redshift later.
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(
            self.csvFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)

            self.rawFolder = self.localTempDirectory + "/" + "Raw"
            self.csvFolder = self.localTempDirectory + "/" + "CSV"

            self.CheckWorkingFolders()
            self.BulkDownload()
            self.ProcessFiles()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)
            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"],\
                                             currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            raise Exception(err.message)
Example #4
class AutoInsight(ApplicationBase):
    '''
    This class is used to control the data load process for Auto Insight.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(AutoInsight, self).__init__()

        self.awsParams = ""
        self.processingFile = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(self.job["srcSharedFolder"])

        self.logger.info(self.moduleName + " - Downloading files from shared folder...")

        for fileName in sharedFiles:
            if fileName == self.job["fileName"]:
                self.processingFile = fileName
                shutil.copyfile(os.path.join(self.job["srcSharedFolder"], fileName), self.localTempDirectory + "/" + self.processingFile)

    def LoadAllFromS3(self):
        '''
        Loads the CSV data from the S3 folder into Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                                  host=self.awsParams.redshift['Hostname'],
                                                  port=self.awsParams.redshift['Port'],
                                                  user=self.awsParams.redshiftCredential['Username'],
                                                  password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": self.job["ddl"]["table"],
                                                 "s3Filename": s3DataFolder,
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(self.moduleName + " - Error while trying to save into Redshift from s3 folder.")
            raise

    @staticmethod
    def FormatColNameDate(dtText):
        '''
        Converts the abbreviated date to YYYY-MM-01 format
        '''
        # Catch the exception class (not an instance); StandardError only exists in Python 2.
        if sys.version[0] == '2':
            customException = StandardError
        else:
            customException = Exception

        try:
            textFixed = str(dtText[4:]) + "-" + str(list(calendar.month_abbr).index(dtText[:3])) + "-01"
        except customException:
            textFixed = dtText

        return textFixed
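
    # Worked example (assuming column headers look like "Jan 2018"; the exact header
    # format is not shown in the job config):
    #   dtText[4:] -> "2018", index of dtText[:3] ("Jan") in calendar.month_abbr -> 1
    #   FormatColNameDate("Jan 2018") -> "2018-1-01"
    # A header that is not an abbreviated date raises ValueError in .index() and is
    # returned unchanged.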

    def ProcessFiles(self):
        '''
        Controls the workflow for converting, cleaning up and packing the input file.
        '''
        self.logger.info(self.moduleName + " - Processing file: " + self.processingFile)

        rawFileName = self.localTempDirectory + "/" + self.processingFile
        csvFilename = self.localTempDirectory + "/" + self.processingFile.split(".")[0] + ".csv"

        try:
            columnNames = []

            df = pandas.read_excel(rawFileName,
                                   sheetname=self.job["worksheetName"],
                                   index_col=None,
                                   na_values=None,
                                   skiprows=self.job["skipRows"],
                                   skip_footer=self.job["skipFooter"])

            for colName in df.head(0):
                if colName not in self.job["columns_no_melt"]:
                    columnNames.append(self.FormatColNameDate(colName))
                else:
                    columnNames.append(colName)

            df.columns = columnNames
            df = df.melt(id_vars=self.job["columns_no_melt"])

            df.to_csv(csvFilename,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)

            self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
            self.fileUtilities.DeleteFile(csvFilename)
        except XLRDError:
            self.logger.info(self.moduleName + " - No tab named '" + self.job["worksheetName"] + "' in " + self.processingFile)
        except Exception:
            self.logger.error(self.moduleName + " - Error while trying to process file " +  self.processingFile)
            raise
        finally:
            FileUtilities.RemoveFileIfItExists(rawFileName)

    def BulkUploadToS3(self):
        '''
        Uploads the GZIP file created to S3 so it can be loaded into Redshift later.
        '''
        self.logger.info(self.moduleName + " - Uploading GZIP files to s3 folder...")

        fileName = self.processingFile.split(".")[0] + ".csv.gz"
        S3Utilities.CopyItemsAWSCli(self.localTempDirectory + "/" + fileName,
                                    "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"], "--quiet")

    def ExecuteCreateTable(self):
        '''
        Checks if the table needs to be created
        '''
        tb = self.job['ddl']
        tb['schemaName'] = self.job['destinationSchema']

        fname = self.fileUtilities.CreateTableSql(tb, self.localTempDirectory)
        RedshiftUtilities.PSqlExecute(fname, self.logger)

    def Start(self, logger, moduleName, filelocs):
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"], self.moduleName)

            self.ExecuteCreateTable()
            self.BulkDownload()
            self.ProcessFiles()
            self.BulkUploadToS3()
            self.LoadAllFromS3()

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " + err.message)

            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                self.logger.info(self.moduleName + " - we could not Complete Instance.")

            raise Exception(err.message)
Example #5
class PGCRAirMarketsAthenaSpark(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarketsAthenaSpark, self).__init__()

        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []

        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")

        S3Utilities.SyncFolderAWSCli("s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
                                     self.rawDataFolder,
                                     True)

    def CleanUpAndPack(self):
        '''
        Main control loop: iterates through the folders, cleaning the files and packing them for upload to S3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)

        for rawFolderName in rawFolders:
            self.toPackFiles = []

            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compresses the files for a given folder; currently only the emissions file is packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")

        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(csvFile) + ".gz"

            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up for the emissions files, replacing bad characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        fileList = [fileName for fileName in allFiles if self.job["srcFileNamePrefix"] in fileName]
        fileListToDel = [fileName for fileName in allFiles if self.job["srcFileNamePrefix"] not in fileName]

        self.logger.info("Cleaning up files for folder " + rawFolderName + "...")

        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            toPackFileName = self.tempFolder + "/" + self.job["srcFileNamePrefix"] + "_" + rawFolderName + ".csv"

            self.fileUtilities.ReplaceIterativelyInFile(fullFileName,
                                                        toPackFileName,
                                                        [{r"[^\x00-\x76]+":""}, {"'":"`"}])

            self.fileUtilities.RemoveLines(toPackFileName, self.job["removeLines"])
            self.toPackFiles.append(toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" + airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file...
        '''
        filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job["inputZipFileName"]

        try:
            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:
            self.logger.info("Unable to decompress file: " + filePath + " Error: " + err.message)

    def UploadPackedToS3(self):
        '''
        Uploads all packed files to S3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(self.packedFolder,
                                    "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
                                    "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step: loads the files located in S3 into the target table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                                  host=self.awsParams.redshift['Hostname'],
                                                  port=self.awsParams.redshift['Port'],
                                                  user=self.awsParams.redshiftCredential['Username'],
                                                  password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": self.job["tableName"] + self.job["srcFileNamePrefix"],
                                                 "s3Filename": s3DataFolder,
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error("Error while trying to save into Redshift from s3 folder.")
            raise

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def ProcessTable(self, table):
        '''
        Processes the data for the given table configuration entry.
        '''

        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)

        local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName

        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, local7zipFilePath)

        localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
        localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)

        self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
        fileToBeLoaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)

        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true', delimiter=self.job["delimiter"],
                       ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeLoaded)
              )

        #df.show()
        self.logger.info(
            self.moduleName + " -- " + "Done reading " + str(df.count()) + " rows.  Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(table,self.fileUtilities.parquet)
        self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"], self.moduleName)

            for table in self.job["tables"]:
                self.ProcessTable(table)

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished ")

        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " + err.message)
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"], \
                                                  currProcId, 'F') is not True:
                self.logger.info(self.moduleName + " - we could not Complete Instance.")
            raise Exception(err.message)
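
# A standalone sketch of the CSV-to-Parquet step in ProcessTable using plain PySpark
# (an illustration only: the schema fields and paths below are made up, and the real
# job builds its schema via SparkUtilities.BuildSparkSchema(table)):
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.appName("airmarkets_csv_to_parquet").getOrCreate()

schema = StructType([
    StructField("facility_name", StringType(), True),  # hypothetical column
    StructField("so2_tons", DoubleType(), True),        # hypothetical column
])

# Read the extracted CSV with an explicit schema, then persist it as Parquet.
df = (spark.read
      .option("header", "true")
      .option("delimiter", ",")
      .option("ignoreTrailingWhiteSpace", "true")
      .schema(schema)
      .csv("emission_05-11-2017.csv"))

df.write.mode("overwrite").parquet("parquet_out/")
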
Example #6
class ECRConnect(ApplicationBase):
    '''
    This class is used to get the Risk data from IHS Connect, transform it and load it into Redshift.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(ECRConnect, self).__init__()

        self.awsParams = ""
        self.csvFile = None
        self.csvFileHistory = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def TransformToCsv(self, jData):
        '''
        Transforms the JSON payload into CSV files (latest values and history).
        '''
        try:
            # Gets the latest version
            df = json_normalize(jData, 'Risks', ['Country'])
            df['ClassName'] = ''
            df['ClassAvg'] = ''
            df = df[[
                'Country', 'Name', 'Value', 'Description', 'ClassName',
                'ClassAvg', 'UpdatedOn'
            ]]

            df.to_csv(self.csvFile,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)

            self.fileUtilities.GzipFile(self.csvFile, self.csvFile + ".gz")
            self.fileUtilities.RemoveFileIfItExists(self.csvFile)

            # Gets the history
            df = json_normalize(jData, ['Risks', 'History'],
                                ['Country', ['Risks', 'Name']])
            df = df[['Country', 'Risks.Name', 'Value', 'UpdatedOn']]

            df.to_csv(self.csvFileHistory,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)

            self.fileUtilities.GzipFile(self.csvFileHistory,
                                        self.csvFileHistory + ".gz")
            self.fileUtilities.RemoveFileIfItExists(self.csvFileHistory)
        except Exception as err:
            self.logger.error(
                "Error while trying to transform json to csv. Error:" +
                err.message)
            raise
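
    # Flattening illustration for the two json_normalize calls above (a standalone
    # sketch; the sample values are made up, the field names mirror the real payload):
    #   data = [{"Country": "Albania",
    #            "Risks": [{"Name": "Political", "Value": 2.5, "Description": "...",
    #                       "UpdatedOn": "2017-05-01",
    #                       "History": [{"Value": 2.4, "UpdatedOn": "2017-04-01"}]}]}]
    #   json_normalize(data, 'Risks', ['Country'])
    #     -> one row per risk, with the parent 'Country' repeated on each row
    #   json_normalize(data, ['Risks', 'History'], ['Country', ['Risks', 'Name']])
    #     -> one row per history entry, keyed by 'Country' and 'Risks.Name'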

    def GetAndTransform(self):
        '''
        Calls the IHS Connect risk service and transforms the response.
        '''
        try:
            request = urllib2.Request(self.job["connectAPI"]["baseurl"] +
                                      self.job["connectAPI"]["riskService"])
            base64string = base64.b64encode(
                '%s:%s' % (self.job["connectAPI"]["username"],
                           self.job["connectAPI"]["password"]))
            request.add_header("Authorization", "Basic %s" % base64string)

            response = urllib2.urlopen(request)
            jData = json.load(response)

            self.TransformToCsv(jData)
        except Exception as err:
            self.logger.error(
                "Error while trying to get and transform from IHS Connect API service. Error:"
                + err.message)
            raise

    def LoadAllFromS3(self, s3Source, tableName):
        '''
        Loads the given S3 source into the given Redshift table.
        '''
        try:
            s3DataSource = "s3://" + self.job["bucketName"] + s3Source

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": tableName,
                    "s3Filename": s3DataSource,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise

    def UploadToS3(self):
        '''
        Uploads the GZIP files created to S3 so they can be loaded into Redshift later.
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        fileName = self.job["fileNameOut"] + ".gz"
        fileNameHistory = self.job["fileNameOutHistory"] + ".gz"

        S3Utilities.CopyItemsAWSCli(
            self.localTempDirectory + "/" + fileName,
            's3://' + self.job["bucketName"] + self.job["s3ToDirectory"] +
            '/' + fileName)

        S3Utilities.CopyItemsAWSCli(
            self.localTempDirectory + "/" + fileNameHistory,
            's3://' + self.job["bucketName"] + self.job["s3ToDirectory"] +
            '/' + fileNameHistory)

    def ExecutePostETL(self):
        '''
        Executes the post-load SQL script.
        '''
        try:
            sqlTemplate = self.location + "/" + self.job["postSQLScript"]
            sqlScript = self.localTempDirectory + "/" + self.job[
                "postSQLScript"]

            self.fileUtilities.CreateActualFileFromTemplate(
                sqlTemplate, sqlScript, self.job["destinationSchema"],
                self.job["tableName"])
            RedshiftUtilities.PSqlExecute(sqlScript, self.logger)
        except Exception as err:
            self.logger.error(
                self.moduleName +
                " - Error while updating the countries codes. Message: " +
                err.message)
            raise

    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.csvFile = self.localTempDirectory + "/" + self.job[
                "fileNameOut"]
            self.csvFileHistory = self.localTempDirectory + "/" + self.job[
                "fileNameOutHistory"]

            self.GetAndTransform()
            self.UploadToS3()
            self.LoadAllFromS3(
                self.job["s3ToDirectory"] + '/' + self.job["fileNameOut"] +
                '.gz', self.job["tableName"])
            self.LoadAllFromS3(
                self.job["s3ToDirectory"] + '/' +
                self.job["fileNameOutHistory"] + '.gz',
                self.job["tableName"] + '_history')
            self.LoadAllFromS3(
                self.job["xReference"]["s3DataDirectory"],
                self.job["tableName"] + self.job["xReference"]["tableNameSfx"])
            self.ExecutePostETL()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            raise Exception(err.message)