Example #1
 def ProcessTables(self, dbCommon, tables):
     '''
     process steps:
     pulls the file from the share and places it in the raw folder
     '''
     try:
         spark = SparkUtilities.GetCreateSparkSession(self.logger)
         self.fileUtilities.EmptyFolderContents(
             self.fileUtilities.csvFolder)
         if "srcS3DataFolder" in tables:
             self.DownloadFilesFromS3(tables)
         df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"],
                                         False,
                                         self.fileUtilities.csvFolder,
                                         self.logger)
         if "adjustFormat" in tables:
             for fld in tables["adjustFormat"]:
                 df = SparkUtilities.FormatColumn(df, fld["name"],
                                                  fld["inputFormat"])
         #  remove any null records
         df = df.dropna(how='all')
         SparkUtilities.SaveParquet(df, self.fileUtilities)
         self.UploadFilesCreateAthenaTablesAndSqlScripts(
             tables, self.fileUtilities.parquet, None)
         if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
             self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in ProcessRequest")
         raise
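
The optional keys checked above ("srcS3DataFolder", "delimiter", "adjustFormat", "loadToRedshift") come from the tables job definition passed into the method. A minimal sketch of what such a definition might contain, inferred only from the lookups in this example (the real job schema may differ):

# Hypothetical tables definition; keys and values are inferred from the code above.
tables = {
    "schemaName": "example_schema",        # assumed
    "table": "example_table",              # assumed
    "srcS3DataFolder": "source/data/",     # presence triggers DownloadFilesFromS3
    "delimiter": ",",                      # passed to SparkUtilities.ReadCSVFile
    "adjustFormat": [                      # optional per-column format fixes
        {"name": "report_date", "inputFormat": "yyyyMMdd"}
    ],
    "loadToRedshift": "Y"                  # "Y" enables the Redshift load step
}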
Example #2
    def ProcessTables(self, dbCommon, tables):
        '''
        gets the last partition value and uses it as the date from which to pull data,
        then puts that data into Athena
        '''
        try:
            outputCSV = self.fileUtilities.csvFolder + self.moduleName + ".CSV"
            fieldTerminator = self.job["fieldTerminator"]
            if "fieldTerminator" in tables:
                fieldTerminator = tables["fieldTerminator"]            
            rowTerminator = None  # Not used; stick with the default of CR/LF rather than self.job["rowTerminator"]
 
            if "pullTemplate" in tables:
                sqlPullDataScript = self.CreatePullScript(tables)
                self.bcpUtilities.BulkExtract(self.fileUtilities.LoadSQLQuery(sqlPullDataScript),
                                              outputCSV, dbCommon, tables, fieldTerminator, rowTerminator,
                                              self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
                
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                            self.fileUtilities.csvFolder, self.logger)
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

        except:
            self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
            raise 
Example #3
    def ProcessTables(self, dbCommon, tables):
        '''
        process steps:
        pulls the file from the share and places it in the raw folder
        '''
        try:
            self.DownloadFilesFromS3(tables)
            for fileName in self.fileUtilities.ScanFolder(
                    self.localTempDirectory + '/raw/'):
                outPutFileName = self.GenerateCSVFromSpreadSheet(
                    tables, self.localTempDirectory + '/raw/', fileName)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"],
                                            False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise
Example #4
 def ProcessTables(self, dbCommon, tables):
     '''
     process steps:
     pulls the file from the share and places it in the raw folder
     '''
     try:
         self.rawFolder = self.localTempDirectory + "/" + "Raw"
         self.BulkDownload()
         self.ProcessFiles()
         spark = SparkUtilities.GetCreateSparkSession(self.logger)
         df = SparkUtilities.ReadCSVFile(spark, tables,
                                         self.job["delimiter"], False,
                                         self.fileUtilities.csvFolder,
                                         self.logger)
         SparkUtilities.SaveParquet(df, self.fileUtilities)
         self.UploadFilesCreateAthenaTablesAndSqlScripts(
             tables, self.fileUtilities.parquet)
         self.fileUtilities.EmptyFolderContents(
             self.fileUtilities.csvFolder)
         if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
             self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in ProcessRequest")
         raise
Example #5
    def ProcessTables(self, dbCommon, tables):
        '''
        pulls data from different sheets and puts that information into a csv file
        '''
        try:
            xl = ExcelUtilities(self.logger)
            localFilepath = self.fileUtilities.FindMostCurrentFile(
                self.job["foldertoscan"])
            csvfile = self.CreateCsvFile(tables)
            csvWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)

            if localFilepath is not None:
                self.ProcessFile(xl, localFilepath, csvWriter)

            csvfile.close()
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables,
                                            self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise
Example #6
 def IncrementalLoad(self, dbCommon, tables):
     self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
     try:
         # This is where we last ended.  Start at 1 + this end
         athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
         chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"],
                                                tables["incrementalconditions"]["keyfield"], self.logger))
     except ValueError:
         chunkEnd = 0 # Table does not exist yet
     except:
         raise
             
     #chunkEnd = 2960000000
     #maxValue = 3708000000 # 2249000000 3708000000
     maxValue = BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, self.logger)
         
     chunkSize = tables["incrementalconditions"]["chunksize"]
     chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
 
     fieldTerminator = self.job["fieldTerminator"]
     rowTerminator = None  # Not used; stick with the default of CR/LF rather than self.job["rowTerminator"]
 
     chunkStartData = chunkStart
     # Each ETL gets the same date so that we can do a smart insert based on ETL and chunkStartData
     partitionValue = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
     while chunkStart <= maxValue:
         sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                           self.logger, self.fileUtilities, self.location)
         # Construct a file name that is meaningful, i.e. it carries the start and end IDs
         fileBaseName = tables["incrementalconditions"]["keyfield"] + "-" + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd)
         outputCSV = self.fileUtilities.csvFolder + fileBaseName + ".csv"
         self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
         self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldTerminator, rowTerminator,
                                       self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
     
         # Process the data using Spark and save as Parquet
         spark = SparkUtilities.GetCreateSparkSession(self.logger)
         df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                         self.fileUtilities.csvFolder, self.logger)
         SparkUtilities.SaveParquet(df, self.fileUtilities, fileBaseName)
         self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
         
         tables["new"] = "N" # Do not recreate
         if chunkSize < 0:
             break  # Done with the single load
         chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
     
     # Load only the data that we processed into Redshift.  We cannot use the run ETL date partition value
     # since we are loading the data based on record IDs
     customWhereCondition = tables["incrementalconditions"]["keyfield"] + " >= " + str(chunkStartData)
     self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables, customWhereCondition)
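
The loop above relies on UpdateChunkStartEnd to advance the key range after each extract; that helper is not shown here. A minimal sketch of the arithmetic the loop appears to assume (start one past the previous end, cap the end at the maximum key, treat a negative chunk size as a single full load); the project's actual method may differ:

def UpdateChunkStartEnd(previousEnd, chunkSize, maxValue):
    '''Hypothetical illustration of the chunk bookkeeping used by IncrementalLoad.'''
    chunkStart = previousEnd + 1
    if chunkSize < 0:
        # Negative chunk size: take everything remaining in one pass.
        return chunkStart, maxValue
    chunkEnd = min(chunkStart + chunkSize - 1, maxValue)
    return chunkStart, chunkEnd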
Example #7
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the current table to load it up
        '''
        self.logger.debug(self.moduleName + " -- ProcessTables for " +
                          tables["table"] + " starting")
        self.ProcessFiles(dbCommon)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, dbCommon["delimiter"],
                                        False, self.fileUtilities.csvFolder,
                                        self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
        self.logger.debug(self.moduleName + " -- ProcessTables for " +
                          tables["table"] + " Done.")
Example #8
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process the liquids balance catalog.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " +
                              "Processing data for catalog: " +
                              catalog["name"])

            self.GetLastLiquidsBalanceFileInfo(dbCommon)
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            dfConsolidated = None

            for tableDDL in catalog["tables"]:
                if tableDDL["type"] == "raw":
                    csvInfo = self.GetSheetDataToCsv(dbCommon, tableDDL)
                    df = SparkUtilities.ReadCSVFile(spark, csvInfo["tableDDL"],
                                                    self.job["delimiter"],
                                                    True,
                                                    csvInfo["csvFileName"],
                                                    self.logger)

                    if dfConsolidated is None:
                        dfConsolidated = self.MeltDataFrame(
                            catalog["applyCategoryCol"], tableDDL, df)
                    else:
                        # unionAll returns a new DataFrame, so keep the result
                        dfConsolidated = dfConsolidated.unionAll(
                            self.MeltDataFrame(catalog["applyCategoryCol"],
                                               tableDDL, df))

            for tableDDL in catalog["tables"]:
                if tableDDL["type"] == "destination":
                    SparkUtilities.SaveParquet(dfConsolidated,
                                               self.fileUtilities)
                    self.UploadFilesCreateAthenaTablesAndSqlScripts(
                        tableDDL, self.fileUtilities.parquet)
                    self.LoadDataFromAthenaIntoRedShiftLocalScripts(tableDDL)
                    break

        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load table. Error: " +
                              str(err))
            raise
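
MeltDataFrame is not shown in this example; a wide-to-long melt in PySpark is commonly written with explode over an array of structs. A self-contained sketch of that general pattern with invented column names (not the project's actual implementation):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("melt-sketch").getOrCreate()

# Wide frame: one id column plus several value columns (invented data).
dfWide = spark.createDataFrame(
    [("2017-01", 1.0, 2.0), ("2017-02", 3.0, 4.0)],
    ["period", "supply", "demand"])

valueColumns = ["supply", "demand"]
# Turn every value column into a (variable, value) struct, then explode into rows.
dfLong = (dfWide
          .withColumn("kv", F.explode(F.array(
              *[F.struct(F.lit(c).alias("variable"), F.col(c).alias("value"))
                for c in valueColumns])))
          .select("period", "kv.variable", "kv.value"))

dfLong.show()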
Example #9
    def ProcessTables(self, dbCommon, tables):
        '''
        process steps:
        pulls the file from the share and places it in the raw folder
        '''
        try:
            rawFolder = self.localTempDirectory + '/raw/'
            if "srcS3DataFolder" in tables:
                self.DownloadFilesFromS3(tables)
                xl = ExcelUtilities(self.logger)
                outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
                xl.Excel2CSV(rawFolder + tables["srcFile"],\
                            None,\
                            outPutFileName,\
                            self.fileUtilities.csvFolder,\
                            defDateFormat='%Y-%m-%d',\
                            skiprows=tables["skipRows"])
            else:
                self.ProcessWebCall(tables, rawFolder)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"],
                                            False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])

            self.fileUtilities.EmptyFolderContents(rawFolder)
            #  remove any null records
            df = df.dropna(how='all')
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessRequest")
            raise
Example #10
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the data for the table
        '''
        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)
        
        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.S3Copy(s3Key, localGzipFilepath)

        # Unzip the file rather than reading the gzip as Spark is faster with csv
        localCSVFilepath = self.fileUtilities.csvFolder + "/" + fileName + ".csv"
        self.fileUtilities.GunzipFile(localGzipFilepath, localCSVFilepath)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], True, self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
        self.logger.info(self.moduleName + " -- ProcessTables finished")
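
The comment above notes that the gzip file is decompressed before the Spark read because a single gzip stream is not splittable, so plain CSV parallelises better. A minimal sketch of what a helper like FileUtilities.GunzipFile presumably does, using only the standard library (the project's real implementation may differ):

import gzip
import shutil

def GunzipFile(gzipFilePath, outputFilePath):
    '''Decompress a .gz file to a plain file on the local disk.'''
    with gzip.open(gzipFilePath, "rb") as source, open(outputFilePath, "wb") as target:
        shutil.copyfileobj(source, target)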
Example #11
    def ProcessTables(self, dbCommon, tables):
        '''
        process steps:
        pulls the file from the share and places it in the raw folder
        '''
        try:
            rawFolder = self.localTempDirectory + '/raw/'
            flist = self.DownloadFile(rawFolder)
            xl = ExcelUtilities(self.logger)

            outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
            for fl in flist:
                xl.Excel2CSV(rawFolder + fl,\
                            'Sheet1',\
                            outPutFileName,\
                            self.fileUtilities.csvFolder)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables,
                                            self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise
Example #12
    def LoadClassRefDF(self, spark):
        '''
        Loads the class reference data
        '''
        xReferencesDF = {}

        for catalog in self.job["catalogs"]:
            if catalog["name"] == "xReferences":
                for xrefTable in catalog["tables"]:
                    if self.xRefPulled is False:
                        S3Utilities.CopyItemsAWSCli(
                            "s3://" + self.job["bucketName"] +
                            xrefTable["s3SourceFolder"] +
                            xrefTable["sourceFileName"],
                            self.fileUtilities.csvFolder, "--quiet")

                    xReferencesDF[
                        xrefTable["table"]] = SparkUtilities.ReadCSVFile(
                            spark, xrefTable, self.job["delimiter"], False,
                            self.fileUtilities.csvFolder + "/" +
                            xrefTable["sourceFileName"], self.logger)

        self.xRefPulled = True
        return xReferencesDF
Example #13
    def ProcessTables(self, dbCommonNotUsed, table):
        '''
        the actual process starts here
        '''
        try:
            strDateTodayMinus1 = datetime.datetime.strftime(
                datetime.date.today() - datetime.timedelta(days=1), "%Y-%m-%d")
            latestValuationDateInAthena = self.GetLatestValuationDateInAthena(
                table)
            if (latestValuationDateInAthena == strDateTodayMinus1):
                self.logger.debug(
                    self.moduleName + " -- " +
                    "*** Totem data is already up-to-date as of: " +
                    latestValuationDateInAthena + " ***")
                return

            self.SetGlobalVariables()

            yearMonthsToProcess = self.GetYearMonthsToProcess(
                latestValuationDateInAthena)
            #yearMonthsToProcess = self.GetYearMonthsToProcess("2017-11-10") # For debugging
            for yearMonth in yearMonthsToProcess:
                self.logger.debug(self.moduleName + " -- " +
                                  "Processing Year-Month: " + yearMonth)

                strDateToday = datetime.datetime.strftime(
                    datetime.date.today(), "%Y-%m-%d")
                self.fileUtilities.EmptyFolderContents(
                    self.fileUtilities.csvFolder)
                fileListForYearMonth = self.GetFileListForYearMonth(yearMonth)
                for fileName in fileListForYearMonth:
                    self.GetFile(yearMonth, fileName)

                spark = SparkUtilities.GetCreateSparkSession(self.logger)
                df = SparkUtilities.ReadCSVFile(spark, table,
                                                self.job["delimiter"], False,
                                                self.fileUtilities.csvFolder,
                                                self.logger)

                # The data frame contains a number of valuation dates.  Get the distinct valuation dates
                # and create a partition for each valuation date
                distinctValuationDates = sorted(
                    df.select(df.valuationdate).distinct().collect())
                for item in distinctValuationDates:
                    # Process new days only.  Skip today so that we don't get partials.  Otherwise we will have
                    # to delete data from Athena/RedShift to avoid duplicates
                    if item.valuationdate <= latestValuationDateInAthena or item.valuationdate == strDateToday:
                        continue

                    self.logger.debug(self.moduleName +
                                      " - Processing Valuation Date: " +
                                      item.valuationdate)
                    dfValuationDate = df.filter(
                        df.valuationdate == item.valuationdate)
                    fileBaseName = "ValuationDate-" + item.valuationdate
                    SparkUtilities.SaveParquet(dfValuationDate,
                                               self.fileUtilities,
                                               fileBaseName)
                    self.UploadFilesCreateAthenaTablesAndSqlScripts(
                        table, self.fileUtilities.parquet, item.valuationdate)

                    if "loadToRedshift" in table and table[
                            "loadToRedshift"] == "Y":
                        self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)
            self.logger.info(self.moduleName + " - Finished processing.")
        except:
            self.logger.exception(
                self.moduleName +
                " - we had an error in ProcessDatabase for " + table["table"])
            raise
Example #14
    def ProcessDataSource(self, srcCategory, tables):
        '''
        Process each category
        '''
        self.logger.debug(self.moduleName + " -- " + "ProcessCategory " +
                          " starting ")
        try:
            for year in srcCategory["years"]:
                url = srcCategory["urlPrefix"] + year + "." + srcCategory[
                    "urlExt"]
                self.logger.info(self.moduleName + " - Processing url: " + url)

                localFilePath = self.localTempDirectory + "/raw/" +\
                                ntpath.basename(srcCategory["urlPrefix"]) +\
                                year + "." + srcCategory["urlExt"]
                scrubbedFilepath = self.localTempDirectory + "/scrub/" +\
                                ntpath.basename(srcCategory["urlPrefix"]) +\
                                year + "." + srcCategory["urlExt"]

                FileUtilities.DownloadFromURL(url, localFilePath)

                if srcCategory[
                        "urlExt"] == "zip":  # Unzip the file if we receive a zip format
                    unzipFilelocation = self.localTempDirectory + "/raw/"
                    self.fileUtilities.UnzipFile(localFilePath,
                                                 unzipFilelocation)
                    localFilePath = unzipFilelocation + srcCategory[
                        "unzipFilename"]
                    scrubbedFilepath = self.localTempDirectory + "/scrub/" + year + "_" + srcCategory[
                        "unzipFilename"]

                # Need to clean up the file and add the tags
                tag = srcCategory["srcCategory"] + "," + srcCategory[
                    "srcDescription"] + ","
                replacements = {
                    '^': tag,
                    '" ': '"',
                    '#VALUE!': '',
                    r'\.,': ' ,'
                }
                self.fileUtilities.ReplaceStringInFile(localFilePath,
                                                       scrubbedFilepath,
                                                       replacements)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(
                spark, tables, self.job["delimiter"], True,
                self.localTempDirectory + "/scrub/", self.logger)
            if "adjustFormat" in srcCategory:
                for fld in srcCategory["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            df.write.parquet(self.fileUtilities.parquet, mode="append")
            self.fileUtilities.EmptyFolderContents(self.localTempDirectory +
                                                   "/scrub/")
            self.fileUtilities.EmptyFolderContents(self.localTempDirectory +
                                                   "/raw/")

            self.logger.debug(self.moduleName + " -- " +
                              "ProcessCategory for " +
                              srcCategory["srcCategory"] + " finished ")
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessCategory")
            raise
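
ReplaceStringInFile applies the replacements map while copying the raw file to the scrub folder; the '^' key suggests the patterns are treated as regular expressions, so the category tag is prepended to every row. A minimal sketch of that behaviour under those assumptions (the project's helper may differ):

import re

def ReplaceStringInFile(inputFilePath, outputFilePath, replacements):
    '''Apply each pattern -> replacement pair as a regex substitution, line by line.'''
    with open(inputFilePath, "r") as source, open(outputFilePath, "w") as target:
        for line in source:
            for pattern, newValue in replacements.items():
                line = re.sub(pattern, newValue, line)
            target.write(line)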