Example #1
0
    def ProcessTables(self, dbCommon, tables):
        '''
        process steps:
        pulls the file from the share and places it in the raw folder
        '''
        try:

            processingFile = self.DownloadFile(self.job["srcCategories"])
            self.CreateCSVFile(processingFile, self.job["srcCategories"])

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schemaAllString = SparkUtilities.BuildSparkSchema(tables, True)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read
                    .format("com.databricks.spark.csv")
                    .options(header=False, delimiter=self.job["srcCategories"]["delimiter"])
                    .option("ignoreTrailingWhiteSpace", "true")
                    .option("ignoreLeadingWhiteSpace", "true")            
                    .schema(schemaAllString)
                    .load(self.fileUtilities.csvFolder)
                )
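            # Strip the UTF-8 en-dash byte sequence, then cast the all-string frame to the typed schema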
            df = SparkUtilities.ReplaceAll(df, "\xE2\x80\x93", "")
            df2 = SparkUtilities.ConvertTypesToSchema(df, schema)            
            SparkUtilities.SaveParquet(df2, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
            raise
Example #2
0
    def ProcessTables(self, dbCommon, tables):
        '''
        pulls data from the different sheets and puts that information into a CSV file
        '''
        try:
            xl = ExcelUtilities(self.logger)
            localFilepath = self.fileUtilities.FindMostCurrentFile(
                self.job["foldertoscan"])
            csvfile = self.CreateCsvFile(tables)
            csvWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)

            if localFilepath is not None:
                self.ProcessFile(xl, localFilepath, csvWriter)

            csvfile.close()
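            # With the consolidated CSV written, read it back with Spark and apply any adjustFormat column conversions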
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables,
                                            self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise
Example #3
0
 def ProcessTables(self, dbCommon, tables):
     '''
     process steps:
     pulls the file from the share and places it in the raw folder
     '''
     try:
         spark = SparkUtilities.GetCreateSparkSession(self.logger)
         self.fileUtilities.EmptyFolderContents(
             self.fileUtilities.csvFolder)
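         # When a source S3 data folder is configured, stage those files locally before the Spark read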
         if "srcS3DataFolder" in tables:
             self.DownloadFilesFromS3(tables)
         df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"],
                                         False,
                                         self.fileUtilities.csvFolder,
                                         self.logger)
         if "adjustFormat" in tables:
             for fld in tables["adjustFormat"]:
                 df = SparkUtilities.FormatColumn(df, fld["name"],
                                                  fld["inputFormat"])
         #  remove any null records
         df = df.dropna(how='all')
         SparkUtilities.SaveParquet(df, self.fileUtilities)
         self.UploadFilesCreateAthenaTablesAndSqlScripts(
             tables, self.fileUtilities.parquet, None)
         if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
             self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in ProcessTables")
         raise
Example #4
0
    def ProcessTables(self, dbCommon, tables):
        '''
        process steps:
        pulls the file from the share and places it in the raw folder
        '''
        try:
            self.DownloadFilesFromS3(tables)
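            # Convert every spreadsheet found under raw/ to CSV for the Spark load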
            for fileName in self.fileUtilities.ScanFolder(
                    self.localTempDirectory + '/raw/'):
                outPutFileName = self.GenerateCSVFromSpreadSheet(
                    tables, self.localTempDirectory + '/raw/', fileName)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"],
                                            False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise
Example #5
0
    def ProcessTables(self, dbCommon, tables):
        '''
        get the last partition value and use that as the date to pull data
        then put that data into Athena
        '''
        try:
            outputCSV = self.fileUtilities.csvFolder + self.moduleName + ".CSV"
            fieldTerminator = self.job["fieldTerminator"]
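            # Allow a table-level fieldTerminator to override the job-wide default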
            if "fieldTerminator" in tables:
                fieldTerminator = tables["fieldTerminator"]            
            rowTerminator = None # Not using this. Stick with the default of CR/LF.  self.job["rowTerminator"]
 
            if "pullTemplate" in tables:
                sqlPullDataScript = self.CreatePullScript(tables)
                self.bcpUtilities.BulkExtract(self.fileUtilities.LoadSQLQuery(sqlPullDataScript),
                                              outputCSV, dbCommon, tables, fieldTerminator, rowTerminator,
                                              self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
                
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                            self.fileUtilities.csvFolder, self.logger)
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

        except:
            self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
            raise 
Example #6
0
    def CreateParquetFilesAndLoad(self, catalog, partitionValue):
        '''
        Creates the parquet files
        '''
        try:
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            for tables in catalog["tables"]:
                if tables["type"] == "attributes":
                    srcFolder = self.fileUtilities.csvFolder + '/attribute/'
                else:
                    srcFolder = self.fileUtilities.csvFolder + '/data/'
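                # Build the Spark schema for this table and read its CSVs from the matching source folder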
                tableSchema = SparkUtilities.BuildSparkSchema(tables)

                df = (spark.read.format("com.databricks.spark.csv").options(
                    header=False, delimiter=self.job["delimiter"]).schema(
                        tableSchema).load(srcFolder))
                SparkUtilities.SaveParquet(df, self.fileUtilities)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(
                    tables, self.fileUtilities.parquet, partitionValue)
                self.fileUtilities.EmptyFolderContents(
                    self.fileUtilities.parquet)
                self.fileUtilities.EmptyFolderContents(srcFolder)

        except Exception as ex:
            self.logger.exception(
                self.moduleName +
                " - we had an error in CreateParquetFilesAndLoad " +
                ex.message)
            raise
Example #7
0
    def ProcessTables(self, dbCommon, tables):
        '''
        Process each Vantage table.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " +
                              "Processing data for table:" + tables["table"])
            fileName = self.BulkExtract(tables["table"], tables["scriptFile"],
                                        dbCommon)
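            # BulkExtract writes the Vantage table to a local delimited file, which Spark reads using the table schema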

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read.format("com.databricks.spark.csv").options(
                header='false',
                delimiter=self.job["delimiter"]).schema(schema).load(fileName))

            self.logger.info(self.moduleName + " -- " + "Done reading " +
                             str(df.count()) +
                             " rows.  Now saving as parquet file...")
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load table. Error: " +
                              err.message)
            raise
Example #8
0
 def ProcessTables(self, dbCommon, tables):
     '''
     process steps:
     pulls the file from the share and places it in the raw folder
     '''
     try:
         self.rawFolder = self.localTempDirectory + "/" + "Raw"
         self.BulkDownload()
         self.ProcessFiles()
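         # Read the CSVs produced by the download and processing steps and convert them to Parquet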
         spark = SparkUtilities.GetCreateSparkSession(self.logger)
         df = SparkUtilities.ReadCSVFile(spark, tables,
                                         self.job["delimiter"], False,
                                         self.fileUtilities.csvFolder,
                                         self.logger)
         SparkUtilities.SaveParquet(df, self.fileUtilities)
         self.UploadFilesCreateAthenaTablesAndSqlScripts(
             tables, self.fileUtilities.parquet)
         self.fileUtilities.EmptyFolderContents(
             self.fileUtilities.csvFolder)
         if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
             self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in ProcessTables")
         raise
Example #9
0
    def ProcessTables(self, dbCommon, tables):
        '''
        Will load the ENP Yearly Table
        '''
        try:
            outputfileName = self.fileUtilities.csvFolder + '/ENPdata.csv'

            conn = self.EstablishConnection(dbCommon)
            cur = conn.cursor()
            sqlline = self.FixSQLStatement(dbCommon)
            cur.execute(sqlline)

            self.ConvertToCSV(cur, outputfileName)
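            # The query results are written to a local CSV, which Spark reads using the table schema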

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read.format("com.databricks.spark.csv").options(
                header='false', delimiter=self.job["delimiter"]).schema(
                    schema).load(outputfileName))
            self.logger.info(self.moduleName + " -- " + "Done reading " +
                             str(df.count()) +
                             " rows.  Now saving as parquet file...")
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load table. Error:" +
                              err.message)
            raise
Example #10
0
    def ProcessTables(self, dbCommon, tables):
        '''
        Process a single category configured in the categories dictionary in the jobConfig.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" +
                              " starting ")
            processingFile = self.DownloadFile()
            fileOut = processingFile.replace(".dbf", ".txt")

            dbfUtils = DBFUtilities(self.logger)
            dbfUtils.ConvertToCSV(processingFile, fileOut,
                                  self.job["delimiter"], False)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read.format("com.databricks.spark.csv").options(
                header='false',
                delimiter=self.job["delimiter"]).schema(schema).load(fileOut))
            self.logger.info(self.moduleName + " -- " + "Done reading " +
                             str(df.count()) +
                             " rows.  Now saving as parquet file...")
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" +
                              " finished ")
        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load category...")
            raise Exception(err.message)
Example #11
0
    def ProcessTable(self, table):
        '''
        Process the data for the table
        '''
        s3Key = table["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.CreateFolders(table["table"])

        fileName = ntpath.basename(s3Key)
        localTxtFilepath = self.fileUtilities.csvFolder + "/" + fileName
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"], s3Key,
                                       localTxtFilepath)
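        # Read the downloaded file with the table schema, save it as Parquet, and upload with the table's partition value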

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)
        df = (spark.read.format("com.databricks.spark.csv").options(
            header='false', delimiter=self.job["delimiter"]).schema(
                schema).load(localTxtFilepath))
        self.logger.info(self.moduleName + " -- " + "Done reading " +
                         str(df.count()) +
                         " rows.  Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            table, self.fileUtilities.parquet, table["partitionValue"])
        self.logger.info(self.moduleName + " -- " + "ProcessTable " +
                         " finished ")
Example #12
0
 def FlushAndFillUsingJDBC(self, dbCommon, tables):
     '''
     Simple flush and fill.  Get the data from JDBC and load into Athena
     '''
     spark = SparkUtilities.GetCreateSparkSession(self.logger)
     url, driver = SparkUtilities.GetSqlServerConnectionParams(dbCommon)
     df = SparkUtilities.ReadTableUsingJDBC(spark, url, driver, tables, self.logger)
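     # No intermediate CSV here: the table is read directly over JDBC and written straight to Parquet, Athena and Redshift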
     SparkUtilities.SaveParquet(df, self.fileUtilities)
     self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
     self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
Example #13
0
 def IncrementalLoad(self, dbCommon, tables):
     self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
     try:
         # This is where we last ended.  Start at 1 + this end
         athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
         chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"],
                                                tables["incrementalconditions"]["keyfield"], self.logger))
     except ValueError:
         chunkEnd = 0 # Table does not exist yet
     except:
         raise
             
     #chunkEnd = 2960000000
     #maxValue = 3708000000 # 2249000000 3708000000
     maxValue = BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, self.logger)
         
     chunkSize = tables["incrementalconditions"]["chunksize"]
     chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
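     # Walk the key range in chunks; each chunk is extracted with BCP and saved as its own Parquet file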
 
     fieldTerminator = self.job["fieldTerminator"]
     rowTerminator = None # Not using this. Stick with the default of CR/LF.  self.job["rowTerminator"]
 
     chunkStartData = chunkStart
     # Each ETL gets the same date so that we can do a smart insert based on ETL and chunkStartData
     partitionValue = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
     while chunkStart <= maxValue:
         sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                           self.logger, self.fileUtilities, self.location)
         # Construct a file name that is meaningful.  That is, it has the start and end IDs
         fileBaseName = tables["incrementalconditions"]["keyfield"] + "-" + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd)
         outputCSV = self.fileUtilities.csvFolder + fileBaseName + ".csv"
         self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
         self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldTerminator, rowTerminator,
                                       self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
     
         # Process the data using Spark and save as Parquet
         spark = SparkUtilities.GetCreateSparkSession(self.logger)
         df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                         self.fileUtilities.csvFolder, self.logger)
         SparkUtilities.SaveParquet(df, self.fileUtilities, fileBaseName)
         self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
         
         tables["new"] = "N" # Do not recreate
         if chunkSize < 0:
             break;  # Done with the single load
         chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
     
     # Load only the data that we processed into Redshift.  We cannot use the run ETL date partition value
     # since we are loading the data based on record IDs
     customWhereCondition = tables["incrementalconditions"]["keyfield"] + " >= " + str(chunkStartData)
     self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables, customWhereCondition)
Example #14
0
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process each file
        '''
        # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
        s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        FileUtilities.EmptyFolderContents(
            self.fileUtilities.gzipFolder
        )  # Clear the folder from the previous run
        FileUtilities.EmptyFolderContents(
            self.fileUtilities.csvFolder
        )  # Clear the folder from the previous run
        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.S3Copy(s3Key, localGzipFilepath)

        localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
        # Remove the gz extension
        localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
        self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

        # Don't have a raw excel reader for Spark so use Pandas
        self.logger.info(self.moduleName + " - Processing Excel file: " +
                         localExcelFilepath)
        pandasDf = pd.read_excel(localExcelFilepath,
                                 catalog["excelSheetName"],
                                 index_col=None,
                                 na_values=['NaN'],
                                 skiprows=catalog["skipRows"])
        pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)
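        # Hand the normalized Pandas frame to Spark, using the schema of the catalog's single table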

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        table = catalog["tables"][0]  # There is only one table in a catalog
        schema = SparkUtilities.BuildSparkSchema(table)
        df = spark.createDataFrame(pandasDf, schema)
        df = SparkUtilities.ConvertNanToNull(df)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            table, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
        self.logger.debug(self.moduleName + " -- " +
                          "ProcessS3File for file: " + s3Key +
                          " finished.\n\n")
Example #15
0
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the current table to load it up
        '''
        self.logger.debug(self.moduleName + " -- ProcessTables for " +
                          tables["table"] + " starting")
        self.ProcessFiles(dbCommon)
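        # ProcessFiles stages the source CSVs, which Spark reads using the delimiter from dbCommon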

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, dbCommon["delimiter"],
                                        False, self.fileUtilities.csvFolder,
                                        self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
        self.logger.debug(self.moduleName + " -- ProcessTables for " +
                          tables["table"] + " Done.")
Example #16
0
    def ProcessTable(self, table):
        '''
        Process data for the table
        :param table:
        :return:
        '''

        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)

        local7zipFilePath = self.fileUtilities.gzipFolder+ "/" +fileName

        S3Utilities.DownloadFileFromS3(self.awsParams.s3,self.job["bucketName"],
                                       s3Key,local7zipFilePath)

        localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
        localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)


        self.fileUtilities.UnzipUsing7z(local7zipFilePath,localCsvFilepath)
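        # The archive expands into a folder; load the emissions CSV found inside it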
        fileToBeloaded = localCsvFilepath+'/'+'emission_05-11-2017.csv'

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)

        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true', delimiter=self.job["delimiter"],ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeloaded)
              )

        #df.show()
        self.logger.info(
            self.moduleName + " -- " + "Done reading " + str(df.count()) + " rows.  Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(table,self.fileUtilities.parquet)
        self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")
Example #17
0
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process the liquids balance catalog.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " +
                              "Processing data for catalog: " +
                              catalog["name"])

            self.GetLastLiquidsBalanceFileInfo(dbCommon)
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            dfConsolidated = None
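            # Melt each raw sheet into long format and union the results into one consolidated DataFrame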

            for tableDDL in catalog["tables"]:
                if tableDDL["type"] == "raw":
                    csvInfo = self.GetSheetDataToCsv(dbCommon, tableDDL)
                    df = SparkUtilities.ReadCSVFile(spark, csvInfo["tableDDL"],
                                                    self.job["delimiter"],
                                                    True,
                                                    csvInfo["csvFileName"],
                                                    self.logger)

                    if dfConsolidated is None:
                        dfConsolidated = self.MeltDataFrame(
                            catalog["applyCategoryCol"], tableDDL, df)
                    else:
                        dfConsolidated = dfConsolidated.unionAll(
                            self.MeltDataFrame(catalog["applyCategoryCol"],
                                               tableDDL, df))

            for tableDDL in catalog["tables"]:
                if tableDDL["type"] == "destination":
                    SparkUtilities.SaveParquet(dfConsolidated,
                                               self.fileUtilities)
                    self.UploadFilesCreateAthenaTablesAndSqlScripts(
                        tableDDL, self.fileUtilities.parquet)
                    self.LoadDataFromAthenaIntoRedShiftLocalScripts(tableDDL)
                    break

        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load table. Error: " +
                              err.message)
            raise
Example #18
0
    def ProcessTables(self, dbCommon, tables):
        '''
        process steps:
        pulls the file from the share and places it in the raw folder
        '''
        try:
            rawFolder = self.localTempDirectory + '/raw/'
            if "srcS3DataFolder" in tables:
                self.DownloadFilesFromS3(tables)
                xl = ExcelUtilities(self.logger)
                outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
                xl.Excel2CSV(rawFolder + tables["srcFile"],\
                            None,\
                            outPutFileName,\
                            self.fileUtilities.csvFolder,\
                            defDateFormat='%Y-%m-%d',\
                            skiprows=tables["skipRows"])
            else:
                self.ProcessWebCall(tables, rawFolder)
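            # Excel sources are flattened to a single CSV; other sources are fetched via a web call into the raw folder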

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"],
                                            False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])

            self.fileUtilities.EmptyFolderContents(rawFolder)
            #  remove any null records
            df = df.dropna(how='all')
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise
Example #19
0
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process each risks table.
        '''
        try:
            last_update_date = self.GetLastUpdateDate(catalog["tables"][0])

            for i in range(dbCommon["daysback"]):
                last_update_date += datetime.timedelta(days=1)
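                # Advance one day at a time, processing only dates strictly before today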

                if last_update_date < datetime.date.today():
                    self.logger.info(self.moduleName + "- Processing Date: " +
                                     str(last_update_date))

                    self.fileUtilities.EmptyFolderContents(
                        self.localTempDirectory + "/raw")
                    self.PullOData(dbCommon, catalog["tables"][0],
                                   last_update_date)
                    spark = SparkUtilities.GetCreateSparkSession(self.logger)

                    for tableDDL in catalog["tables"]:
                        fileBaseName = "last_update_date-" + str(
                            last_update_date)
                        df = spark.read.json(self.localTempDirectory +
                                             "/raw/magellan_" +
                                             tableDDL["type"] + ".json")

                        SparkUtilities.SaveParquet(df, self.fileUtilities,
                                                   fileBaseName)
                        self.UploadFilesCreateAthenaTablesAndSqlScripts(
                            tableDDL, self.fileUtilities.parquet,
                            str(last_update_date))
                else:
                    self.logger.info(self.moduleName +
                                     "- Already up to date. " + str(i))
        except Exception as err:
            self.logger.error(
                self.moduleName +
                " - Error while trying to process catalog. Error: " +
                err.message)
            raise
Example #20
0
    def ProcessTables(self, dbCommon, tables):
        '''
        Process each file
        '''
        self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " starting")
        FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run
        self.ProcessFiles(tables)
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        
        # We will compute "period_type" later
        schemaWithoutPeriodType = SparkUtilities.BuildSparkSchema(tables, excludeComputed=True)
        df = (spark.read
                .format("com.databricks.spark.csv")
                .options(header=False, delimiter=self.job['delimiter'],
                         ignoreTrailingWhiteSpace=True, ignoreLeadingWhiteSpace=True)
                .schema(schemaWithoutPeriodType)
                .load(self.fileUtilities.csvFolder)
            )        

        if "filterData" in tables:
            df = df.filter(tables["filterData"])
        
        # Replace "NEW" with blank.  E.g. DEC1990NEW to DEC1990
        from pyspark.sql import functions as F  #@UnresolvedImport
        df = SparkUtilities.RenameColumnsInList(df, [("period", "period_old")]) # Rename column since we cannot edit in place
        df = df.withColumn("period", F.regexp_replace(df["period_old"], "NEW", ""))

        # Compute "period_type".  Following simple rules have been applied
        #    MAY2013 - 7 characters so assumed to be 'M'
        #    Q12017  - 6 characters so assumed to be 'Q'
        #    2017    - 4 characters so assumed to be 'Y'
        df = df.withColumn("period_type", F.when(F.length(df.period)==7, "M").when(F.length(df.period)==6, "Q").when(F.length(df.period)==4, "Y").otherwise(""))
        
        # Reorder the columns based on the input column order
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = df.select(schema.names)
        
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
        self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " finished")
Example #21
0
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the data for the table
        '''
        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)
        
        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.S3Copy(s3Key, localGzipFilepath)

        # Unzip the file rather than reading the gzip as Spark is faster with csv
        localCSVFilepath = self.fileUtilities.csvFolder + "/" + fileName + ".csv"
        self.fileUtilities.GunzipFile(localGzipFilepath, localCSVFilepath)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], True, self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
        self.logger.info(self.moduleName + " -- " + "ProcessTable " + " finished ")
Example #22
0
    def ProcessTables(self, dbCommon, tables):
        '''
        process steps:
        pulls the file from the share and places it in the raw folder
        '''
        try:
            rawFolder = self.localTempDirectory + '/raw/'
            flist = self.DownloadFile(rawFolder)
            xl = ExcelUtilities(self.logger)

            outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
            for fl in flist:
                xl.Excel2CSV(rawFolder + fl,\
                            'Sheet1',\
                            outPutFileName,\
                            self.fileUtilities.csvFolder)
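            # Each downloaded workbook's Sheet1 is converted into the same output CSV before the Spark read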

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables,
                                            self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise
Example #23
0
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process each risks table.
        '''
        try:
            if catalog["name"] == "Risks":
                self.logger.debug(self.moduleName + " -- " +
                                  "Processing data for catalog: " +
                                  catalog["name"])
                self.DownloadData(dbCommon)

                for tableDDL in catalog["tables"]:
                    dfFixed = None

                    spark = SparkUtilities.GetCreateSparkSession(self.logger)
                    xReferencesDF = self.LoadClassRefDF(spark)

                    xReferencesDF["class_xref"].createOrReplaceTempView(
                        "class_xref")
                    xReferencesDF["iso3166_xref"].createOrReplaceTempView(
                        "iso3166_xref")

                    dfMaster = spark.read.json(self.localTempDirectory +
                                               "/raw/ecr_risks_" +
                                               tableDDL["type"] + ".json")

                    if tableDDL["type"] == "current":
                        dfMaster.createOrReplaceTempView("risks")
                        dfFixed = spark.sql("""
                                                SELECT iso3166_xref.countryname AS country, clsRef.risk_desc AS risk_name,
                                                       CAST(risks.risk_value AS DOUBLE) AS risk_value, risks.risk_description, clsRef.class_name AS risk_class,
                                                       avgs.class_avg AS risk_class_avg, risks.updated_on
                                                FROM risks 
                                                    inner join iso3166_xref on iso3166_xref.iso3166 = risks.country
                                                    inner join class_xref clsRef on clsRef.risk_name = risks.risk_name
                                                    inner join (SELECT country, risk_name, risk_class, AVG(risk_value) 
                                                                OVER(PARTITION BY country, risk_class) AS class_avg FROM risks) avgs
                                                        ON avgs.country = risks.country
                                                           AND avgs.risk_name = risks.risk_name
                                                           AND avgs.risk_class = risks.risk_class
                                                """)
                    else:
                        dfMaster.createOrReplaceTempView("risksHistory")
                        dfFixed = spark.sql("""
                                                SELECT iso3166_xref.countryname AS country, clsRef.risk_desc AS risk_name,
                                                       CAST(risksHistory.risk_value AS DOUBLE) AS risk_value, risksHistory.updated_on
                                                FROM risksHistory 
                                                    inner join iso3166_xref on iso3166_xref.iso3166 = risksHistory.country
                                                    inner join class_xref clsRef on clsRef.risk_name = risksHistory.risk_name
                                                """)

                    self.logger.info(self.moduleName + " -- " +
                                     "Done reading " + str(dfFixed.count()) +
                                     " rows.  Now saving as parquet file...")
                    SparkUtilities.SaveParquet(dfFixed, self.fileUtilities)
                    self.UploadFilesCreateAthenaTablesAndSqlScripts(
                        tableDDL, self.fileUtilities.parquet)
                    self.LoadDataFromAthenaIntoRedShiftLocalScripts(tableDDL)

                    spark.catalog.dropTempView("class_xref")
                    spark.catalog.dropTempView("iso3166_xref")
                    spark.catalog.dropTempView("risks")
        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load table. Error: " +
                              err.message)
            raise
Example #24
0
    def ProcessDataSource(self, srcCategory, tables):
        '''
        Process each category
        '''
        self.logger.debug(self.moduleName + " -- " + "ProcessCategory " +
                          " starting ")
        try:
            for year in srcCategory["years"]:
                url = srcCategory["urlPrefix"] + year + "." + srcCategory[
                    "urlExt"]
                self.logger.info(self.moduleName + " - Processing url: " + url)

                localFilePath = self.localTempDirectory + "/raw/" +\
                                ntpath.basename(srcCategory["urlPrefix"]) +\
                                year + "." + srcCategory["urlExt"]
                scrubbedFilepath = self.localTempDirectory + "/scrub/" +\
                                ntpath.basename(srcCategory["urlPrefix"]) +\
                                year + "." + srcCategory["urlExt"]

                FileUtilities.DownloadFromURL(url, localFilePath)

                if srcCategory[
                        "urlExt"] == "zip":  # Unzip the file if we receive a zip format
                    unzipFilelocation = self.localTempDirectory + "/raw/"
                    self.fileUtilities.UnzipFile(localFilePath,
                                                 unzipFilelocation)
                    localFilePath = unzipFilelocation + srcCategory[
                        "unzipFilename"]
                    scrubbedFilepath = self.localTempDirectory + "/scrub/" + year + "_" + srcCategory[
                        "unzipFilename"]

                # Need to clean up the file and add the tags
                tag = srcCategory["srcCategory"] + "," + srcCategory[
                    "srcDescription"] + ","
                replacements = {
                    '^': tag,
                    '" ': '"',
                    '#VALUE!': '',
                    r'\.,': ' ,'
                }
                self.fileUtilities.ReplaceStringInFile(localFilePath,
                                                       scrubbedFilepath,
                                                       replacements)
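            # After all years are scrubbed, read them in one pass and append the result to the Parquet output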

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(
                spark, tables, self.job["delimiter"], True,
                self.localTempDirectory + "/scrub/", self.logger)
            if "adjustFormat" in srcCategory:
                for fld in srcCategory["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            df.write.parquet(self.fileUtilities.parquet, mode="append")
            self.fileUtilities.EmptyFolderContents(self.localTempDirectory +
                                                   "/scrub/")
            self.fileUtilities.EmptyFolderContents(self.localTempDirectory +
                                                   "/raw/")

            self.logger.debug(self.moduleName + " -- " +
                              "ProcessCategory for " +
                              srcCategory["srcCategory"] + " finished ")
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessCategory")
            raise
Example #25
0
    def ProcessTables(self, dbCommon, tables):
        '''
        get the last partition value and use that as the date to pull data
        then put that data into Athena
        '''
        try:
            outputCSV = self.fileUtilities.csvFolder + self.moduleName + ".CSV"
            fieldTerminator = self.job["fieldTerminator"]
            if "fieldTerminator" in tables:
                fieldTerminator = tables["fieldTerminator"]
            rawFolder = self.localTempDirectory + "/raw/"
            rowTerminator = None  # Not using this. Stick with the default of CR/LF.  self.job["rowTerminator"]

            if "pullTemplate" in tables:
                lastRunDate = self.GetParameters(tables)
                formattedLastRunDate = lastRunDate[4:6] + '/' + lastRunDate[
                    6:8] + '/' + lastRunDate[:4]
                sqlPullDataScript = self.CreatePullScript(
                    tables, formattedLastRunDate)
                self.bcpUtilities.BulkExtract(
                    self.fileUtilities.LoadSQLQuery(sqlPullDataScript),
                    outputCSV, dbCommon, tables, fieldTerminator,
                    rowTerminator, self.job["bcpUtilityDirOnLinux"],
                    self.fileUtilities, self.logger)

                self.masterSchema = SparkUtilities.BuildSparkSchema(tables)
                self.fileUtilities.MoveFilesFromOneFolderToAnother(self.fileUtilities.csvFolder,\
                                                                   rawFolder,\
                                                                   '*.csv')
                return
            ###
            #  load data frame from CSV file
            ###
            partitionValue = self.GetPartitionValue()
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = (spark.read.format("com.databricks.spark.csv").options(
                header=False, delimiter=fieldTerminator).schema(
                    self.masterSchema).load(rawFolder))
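            # Build the column list, excluding fields marked athenaOnly = "Y"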
            cols = []
            for field in tables["fields"]:
                if "athenaOnly" in field:
                    if field["athenaOnly"] != "Y":
                        cols.append(field["name"])
                else:
                    cols.append(field["name"])
            if tables["type"] == "attributes":
                dfAttributes = df.select(cols).distinct()
                if dfAttributes.count() == 0:
                    self.logger.debug(
                        self.moduleName +
                        " - no records to process for Attribute data")
                    return
                SparkUtilities.SaveParquet(dfAttributes, self.fileUtilities)
            elif tables["type"] == "series":
                dfSeries = df.select(cols)
                if "adjustFormat" in tables:
                    for fld in tables["adjustFormat"]:
                        dfSeries = SparkUtilities.FormatColumn(
                            dfSeries, fld["name"], fld["inputFormat"])
                if dfSeries.count() == 0:
                    self.logger.debug(
                        self.moduleName +
                        " - no records to process for Series data")
                    return
                SparkUtilities.SaveParquet(dfSeries, self.fileUtilities)

            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet, partitionValue)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise
Example #26
0
    def ProcessTables(self, dbCommonNotUsed, table):
        '''
        the actual process starts here
        '''
        try:
            strDateTodayMinus1 = datetime.datetime.strftime(
                datetime.date.today() - datetime.timedelta(days=1), "%Y-%m-%d")
            latestValuationDateInAthena = self.GetLatestValuationDateInAthena(
                table)
            if (latestValuationDateInAthena == strDateTodayMinus1):
                self.logger.debug(
                    self.moduleName + " -- " +
                    "*** Totem data is already up-to-date as of: " +
                    latestValuationDateInAthena + " ***")
                return

            self.SetGlobalVariables()

            yearMonthsToProcess = self.GetYearMonthsToProcess(
                latestValuationDateInAthena)
            #yearMonthsToProcess = self.GetYearMonthsToProcess("2017-11-10") # For debugging
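            # Stage each year-month into an emptied csv folder and read all of its files as one DataFrame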
            for yearMonth in yearMonthsToProcess:
                self.logger.debug(self.moduleName + " -- " +
                                  "Processing Year-Month: " + yearMonth)

                strDateToday = datetime.datetime.strftime(
                    datetime.date.today(), "%Y-%m-%d")
                self.fileUtilities.EmptyFolderContents(
                    self.fileUtilities.csvFolder)
                fileListForYearMonth = self.GetFileListForYearMonth(yearMonth)
                for fileName in fileListForYearMonth:
                    self.GetFile(yearMonth, fileName)

                spark = SparkUtilities.GetCreateSparkSession(self.logger)
                df = SparkUtilities.ReadCSVFile(spark, table,
                                                self.job["delimiter"], False,
                                                self.fileUtilities.csvFolder,
                                                self.logger)

                # The data frame contains a number of valuation dates.  Get the distinct valuation dates
                # and create a partition for each valuation date
                distinctValuationDates = sorted(
                    df.select(df.valuationdate).distinct().collect())
                for item in distinctValuationDates:
                    # Process new days only.  Skip today so that we don't get partials.  Otherwise we will have
                    # to delete data from Athena/RedShift to avoid duplicates
                    if item.valuationdate <= latestValuationDateInAthena or item.valuationdate == strDateToday:
                        continue

                    self.logger.debug(self.moduleName +
                                      " - Processing Valuation Date: " +
                                      item.valuationdate)
                    dfValuationDate = df.filter(
                        df.valuationdate == item.valuationdate)
                    fileBaseName = "ValuationDate-" + item.valuationdate
                    SparkUtilities.SaveParquet(dfValuationDate,
                                               self.fileUtilities,
                                               fileBaseName)
                    self.UploadFilesCreateAthenaTablesAndSqlScripts(
                        table, self.fileUtilities.parquet, item.valuationdate)

                    if "loadToRedshift" in table and table[
                            "loadToRedshift"] == "Y":
                        self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)
            self.logger.info(self.moduleName + " - Finished processing.")
        except:
            self.logger.exception(
                self.moduleName +
                " - we had an error in ProcessDatabase for " + table["table"])
            raise
Example #27
0
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process the current table to load it up
        '''
        try:
            FileUtilities.EmptyFolderContents(
                self.fileUtilities.gzipFolder
            )  # Clear the folder from the previous run
            FileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder
            )  # Clear the folder from the previous run
            url = dbCommon["urlPrefix"] + catalog["name"] + "." + dbCommon[
                "urlExt"]
            self.logger.info(self.moduleName + " - Processing url: " + url)

            localZipFilepath = self.fileUtilities.gzipFolder + "/" + \
                catalog["name"] + "." + dbCommon["urlExt"]

            self.fileUtilities.DownloadFromURL(url, localZipFilepath)

            self.fileUtilities.UnzipFile(localZipFilepath,
                                         self.fileUtilities.csvFolder)
            localFilepath = self.fileUtilities.csvFolder + "/" + catalog[
                "name"] + ".txt"

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            dfMaster = spark.read.json(localFilepath)
            dfMaster = dfMaster.filter(dfMaster.series_id != "")
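            # Drop records without a series_id, then split the master JSON into the catalog's attribute and data tables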
            for table in catalog["tables"]:

                self.logger.info(self.moduleName + " -- " +
                                 "Processing table: " + table["table"])
                # The column names being used in the source may be different from the ones in the final
                # database.  Select columns based on source and then rename to destination
                schemaSrc = SparkUtilities.BuildSparkSchema(table,
                                                            useValidation=True)
                if table["dataSet"] == "attributes":
                    df = dfMaster.select(schemaSrc.names)
                elif table["dataSet"] == "data":
                    print(
                        dfMaster.rdd.take(5)
                    )  # There is some instability we need to monitor.  Print seems to slow down and stabilize the run???
                    df = dfMaster.rdd.flatMap(lambda row: EIAAthenaSpark.
                                              ProcessDataRecords(row)).toDF(
                                                  schemaSrc.names)
                else:
                    raise ValueError("Undefined dataSet type")

                schemaDst = SparkUtilities.BuildSparkSchema(table)
                df = SparkUtilities.RenameColumnsToSchema(df, schemaDst)
                df = SparkUtilities.ConvertTypesToSchema(df, schemaDst)
                self.logger.info(self.moduleName + " -- " + "Done reading " +
                                 str(df.count()) +
                                 " rows.  Now saving as parquet file...")

                FileUtilities.EmptyFolderContents(
                    self.fileUtilities.sqlFolder
                )  # Clear the folder from the previous run
                SparkUtilities.SaveParquet(df, self.fileUtilities)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(
                    table, self.fileUtilities.parquet)
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)

            self.logger.debug(self.moduleName + " -- " +
                              "ProcessS3File for: " + url + " finished.\n\n")
        except:
            self.logger.exception("we had an error in EIA on ProcessS3File")
            raise
Example #28
0
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the current table to load it up
        '''
        try:
            self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " starting")
            
            # Cleanup first (TODO - Need a more generic way to do this)
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
            
            # Variables used for handling chunks.  -1 for full load
            chunkStart = chunkEnd = maxValue = chunkSize = -1
            
            if "incrementalconditions" in tables:
                incrementalConditions = tables["incrementalconditions"]
                if "startID" in incrementalConditions:
                    chunkEnd = incrementalConditions["startID"] - 1
                else:
                    athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
                    try:
                        # This is where we last ended.  Start at 1 + this end
                        chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"], tables["distkey"], self.logger))
                    except ValueError:
                        chunkEnd = 0 # Table does not exist yet
                    except:
                        raise

                if "endID" in incrementalConditions:
                    maxValue = incrementalConditions["endID"]
                else:
                    # TODO - Fix this.  Also, we should start at the source min value not 0.
                    maxValue = 2000000000 #BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, chunkStart)
                    
                chunkSize = tables["incrementalconditions"]["chunksize"]
                chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
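            # At this point chunkStart/chunkEnd bracket the first slice; the defaults of -1 mean a single full load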
                    
            fieldDelimiter = self.job["delimiter"]
            if "delimiter" in tables:
                fieldDelimiter = tables["delimiter"]
            
            while chunkStart <= maxValue:
                partitionValue = self.GetPartitionValue(tables, chunkStart)
                sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                                  self.logger, self.fileUtilities, self.location)
                # Construct a file name that is meaningful.  That is, it has the start and end IDs
                outputCSV = self.fileUtilities.csvFolder + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd) + ".csv"
                self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
                self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldDelimiter,
                                              self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
                # Process the data using Spark and save as Parquet
                spark = SparkUtilities.GetCreateSparkSession(self.logger)
                schema = SparkUtilities.BuildSparkSchema(tables)
                df = (spark.read
                         .format("com.databricks.spark.csv")
                         .options(header='false', delimiter=fieldDelimiter)
                         .schema(schema)
                         .load(self.fileUtilities.csvFolder)
                         )
                df.printSchema()
                df.show()
                df = SparkUtilities.ProcessSpecialCharsIfAny(df, tables)
            
                self.logger.info(self.moduleName + " -- " + "DONE READING " + str(df.count()) + " ROWS.  Now saving as parquet file...")
                self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
                SparkUtilities.SaveParquet(df, self.fileUtilities)
            
                # Need to load the data and clear the local space
                self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
                
                tables["new"] = "N" # Do not recreate
                if chunkSize < 0:
                    break;  # Done with the single load
                chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
            
            # TODO - Need to make sure we don't end up with duplicate data if we run the code
            # Twice on the same day
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

            self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " finished")
        except:
            self.logger.exception(self.moduleName + " - we had an error in ProcessTables for " + tables["table"])
            raise