def ProcessTables(self, dbCommon, tables):
    '''
    Will load the ENP Yearly Table
    '''
    try:
        outputfileName = self.fileUtilities.csvFolder + '/ENPdata.csv'
        conn = self.EstablishConnection(dbCommon)
        cur = conn.cursor()
        sqlline = self.FixSQLStatement(dbCommon)
        cur.execute(sqlline)
        self.ConvertToCSV(cur, outputfileName)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read.format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(outputfileName))
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + err.message)
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Pulls data from the different sheets and puts that information into a CSV file
    '''
    try:
        xl = ExcelUtilities(self.logger)
        localFilepath = self.fileUtilities.FindMostCurrentFile(self.job["foldertoscan"])
        csvfile = self.CreateCsvFile(tables)
        csvWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        if localFilepath is not None:
            self.ProcessFile(xl, localFilepath, csvWriter)
        csvfile.close()

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the files from the share and place them in the raw folder
    '''
    try:
        self.DownloadFilesFromS3(tables)
        for fileName in self.fileUtilities.ScanFolder(self.localTempDirectory + '/raw/'):
            outPutFileName = self.GenerateCSVFromSpreadSheet(tables, self.localTempDirectory + '/raw/', fileName)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " starting ")
        processingFile = self.DownloadFile()
        fileOut = processingFile.replace(".dbf", ".txt")
        dbfUtils = DBFUtilities(self.logger)
        dbfUtils.ConvertToCSV(processingFile, fileOut, self.job["delimiter"], False)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read.format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(fileOut))
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " finished ")
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load category...")
        raise Exception(err.message)
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share and place it in the raw folder
    '''
    try:
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "srcS3DataFolder" in tables:
            self.DownloadFilesFromS3(tables)
        df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        # Remove any null records
        df = df.dropna(how='all')
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, None)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessRequest")
        raise
def CreateParquetFilesAndLoad(self, catalog, partitionValue):
    '''
    Creates the parquet files
    '''
    try:
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        for tables in catalog["tables"]:
            if tables["type"] == "attributes":
                srcFolder = self.fileUtilities.csvFolder + '/attribute/'
            else:
                srcFolder = self.fileUtilities.csvFolder + '/data/'
            tableSchema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read.format("com.databricks.spark.csv")
                  .options(header=False, delimiter=self.job["delimiter"])
                  .schema(tableSchema)
                  .load(srcFolder))
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(srcFolder)
    except Exception as ex:
        self.logger.exception(self.moduleName +
                              " - we had an error in CreateParquetFilesAndLoad " + ex.message)
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process each Vantage table.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "Processing data for table:" + tables["table"])
        fileName = self.BulkExtract(tables["table"], tables["scriptFile"], dbCommon)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read.format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(fileName))
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + err.message)
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the files from the share and place them in the raw folder
    '''
    try:
        self.rawFolder = self.localTempDirectory + "/" + "Raw"
        self.BulkDownload()
        self.ProcessFiles()

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessRequest")
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Get the last partition value and use that as the date to pull data,
    then put that data into Athena
    '''
    try:
        outputCSV = self.fileUtilities.csvFolder + self.moduleName + ".CSV"
        fieldTerminator = self.job["fieldTerminator"]
        if "fieldTerminator" in tables:
            fieldTerminator = tables["fieldTerminator"]
        rowTerminator = None  # Not using this. Stick with the default of CR/LF.  self.job["rowTerminator"]

        if "pullTemplate" in tables:
            sqlPullDataScript = self.CreatePullScript(tables)
            self.bcpUtilities.BulkExtract(self.fileUtilities.LoadSQLQuery(sqlPullDataScript),
                                          outputCSV, dbCommon, tables, fieldTerminator,
                                          rowTerminator, self.job["bcpUtilityDirOnLinux"],
                                          self.fileUtilities, self.logger)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["fieldTerminator"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessTable(self, table):
    '''
    Process the data for the table
    '''
    s3Key = table["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)
    self.CreateFolders(table["table"])
    fileName = ntpath.basename(s3Key)
    localTxtFilepath = self.fileUtilities.csvFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"], s3Key, localTxtFilepath)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read.format("com.databricks.spark.csv")
          .options(header='false', delimiter=self.job["delimiter"])
          .schema(schema)
          .load(localTxtFilepath))
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet, table["partitionValue"])
    self.logger.info(self.moduleName + " -- " + "ProcessTable " + " finished ")
def FlushAndFillUsingJDBC(self, dbCommon, tables):
    '''
    Simple flush and fill. Get the data from JDBC and load into Athena
    '''
    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    url, driver = SparkUtilities.GetSqlServerConnectionParams(dbCommon)
    df = SparkUtilities.ReadTableUsingJDBC(spark, url, driver, tables, self.logger)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
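# --- Illustrative sketch (not part of the module above) ---
# FlushAndFillUsingJDBC delegates the read to SparkUtilities.ReadTableUsingJDBC, whose body is
# not shown here.  The snippet below is a hedged guess at the underlying Spark JDBC call it
# presumably wraps; the host, database, table and credentials are placeholders only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = (spark.read.format("jdbc")
      .option("url", "jdbc:sqlserver://example-host:1433;databaseName=exampledb")  # placeholder
      .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
      .option("dbtable", "dbo.example_table")                                      # placeholder
      .option("user", "example_user")
      .option("password", "example_password")
      .load())
# The resulting dataframe would then be saved as Parquet and registered in Athena, as above.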
def IncrementalLoad(self, dbCommon, tables):
    self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
    try:
        # This is where we last ended.  Start at 1 + this end
        athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
        chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"],
                                                   tables["incrementalconditions"]["keyfield"], self.logger))
    except ValueError:
        chunkEnd = 0  # Table does not exist yet
    except:
        raise

    #chunkEnd = 2960000000
    #maxValue = 3708000000 # 2249000000  3708000000
    maxValue = BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, self.logger)
    chunkSize = tables["incrementalconditions"]["chunksize"]
    chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

    fieldTerminator = self.job["fieldTerminator"]
    rowTerminator = None  # Not using this. Stick with the default of CR/LF.  self.job["rowTerminator"]

    chunkStartData = chunkStart
    # Each ETL gets the same date so that we can do a smart insert based on ETL and chunkStartData
    partitionValue = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
    while chunkStart <= maxValue:
        sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                          self.logger, self.fileUtilities, self.location)
        # Construct a file name that is meaningful.  That is, it has the start and end IDs
        fileBaseName = tables["incrementalconditions"]["keyfield"] + "-" + \
            BCPUtilities.ComponseRangeString(chunkStart, chunkEnd)
        outputCSV = self.fileUtilities.csvFolder + fileBaseName + ".csv"
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables,
                                      fieldTerminator, rowTerminator,
                                      self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)

        # Process the data using Spark and save as Parquet
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                        self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities, fileBaseName)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
        tables["new"] = "N"  # Do not recreate

        if chunkSize < 0:
            break  # Done with the single load
        chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

    # Load only the data that we processed into Redshift.  We cannot use the run ETL date partition value
    # since we are loading the data based on record IDs
    customWhereCondition = tables["incrementalconditions"]["keyfield"] + " >= " + str(chunkStartData)
    self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables, customWhereCondition)
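# --- Illustrative sketch (not part of the module above) ---
# UpdateChunkStartEnd is an existing helper whose body is not shown here.  This hypothetical
# version only illustrates the window arithmetic IncrementalLoad relies on: resume one key past
# the last value already loaded and advance by chunkSize without overshooting maxValue.
def UpdateChunkStartEndSketch(previousEnd, chunkSize, maxValue):
    chunkStart = previousEnd + 1      # resume just after the last key already in Athena
    if chunkSize < 0:                 # negative chunk size means a single full pull
        return chunkStart, maxValue
    return chunkStart, min(previousEnd + chunkSize, maxValue)

# Example: last loaded key 1000, chunks of 500, source max key 1800
# -> windows (1001, 1500) then (1501, 1800); the next chunkStart (1801) exceeds maxValue and ends the loop.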
def MeltDataFrame(applyCategoryCol, tableDDL, df):
    '''
    Will return a transposed dataframe
    '''
    dfMelted = None
    stackSentence = "stack(" + str(len(df.columns) - 2) + ","
    for col in df.columns:
        if col not in ["region", "country"]:
            stackSentence = stackSentence + "'" + col.replace("y", "") + "'," + col + ","
    stackSentence = stackSentence[:len(stackSentence) - 1] + ")"

    if applyCategoryCol == "Y":
        dfMelted = df.selectExpr("'" + tableDDL["sheetSrc"]["subSetName"] + "' AS category",
                                 "region", "country", stackSentence).where("region is not null")
    else:
        dfMelted = df.selectExpr("region", "country", stackSentence).where("region is not null")
    dfMelted = SparkUtilities.RenameColumns(dfMelted, ["col0", "col1"], ["year", "value"])
    return dfMelted
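# --- Illustrative sketch (not part of the module above) ---
# MeltDataFrame builds a Spark SQL stack() expression to turn wide year columns into
# (year, value) rows.  The column names below ("y2015", "y2016") are assumed for illustration;
# stack() exposes the generated pair as col0/col1, which MeltDataFrame then renames to year/value.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
dfWide = spark.createDataFrame([("Europe", "France", 1.0, 2.0)],
                               ["region", "country", "y2015", "y2016"])
dfLong = dfWide.selectExpr("region", "country",
                           "stack(2, '2015', y2015, '2016', y2016)")
dfLong.show()  # -> (Europe, France, 2015, 1.0) and (Europe, France, 2016, 2.0)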
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process each file
    '''
    # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
    s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)  # Clear the folder from the previous run
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run
    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
    # Remove the gz extension
    localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    # Don't have a raw Excel reader for Spark so use Pandas
    self.logger.info(self.moduleName + " - Processing Excel file: " + localExcelFilepath)
    pandasDf = pd.read_excel(localExcelFilepath, catalog["excelSheetName"],
                             index_col=None, na_values=['NaN'], skiprows=catalog["skipRows"])
    pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    table = catalog["tables"][0]  # There is only one table in a catalog
    schema = SparkUtilities.BuildSparkSchema(table)
    df = spark.createDataFrame(pandasDf, schema)
    df = SparkUtilities.ConvertNanToNull(df)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File for file: " + s3Key + " finished.\n\n")
def ProcessTables(self, dbCommon, tables):
    '''
    Process the current table to load it up
    '''
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
    self.ProcessFiles(dbCommon)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    df = SparkUtilities.ReadCSVFile(spark, tables, dbCommon["delimiter"], False,
                                    self.fileUtilities.csvFolder, self.logger)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " Done.")
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process the liquids balance catalog.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "Processing data for catalog: " + catalog["name"])
        self.GetLastLiquidsBalanceFileInfo(dbCommon)
        spark = SparkUtilities.GetCreateSparkSession(self.logger)

        dfConsolidated = None
        for tableDDL in catalog["tables"]:
            if tableDDL["type"] == "raw":
                csvInfo = self.GetSheetDataToCsv(dbCommon, tableDDL)
                df = SparkUtilities.ReadCSVFile(spark, csvInfo["tableDDL"], self.job["delimiter"],
                                                True, csvInfo["csvFileName"], self.logger)
                if dfConsolidated is None:
                    dfConsolidated = self.MeltDataFrame(catalog["applyCategoryCol"], tableDDL, df)
                else:
                    # unionAll returns a new dataframe, so keep the result
                    dfConsolidated = dfConsolidated.unionAll(
                        self.MeltDataFrame(catalog["applyCategoryCol"], tableDDL, df))

        for tableDDL in catalog["tables"]:
            if tableDDL["type"] == "destination":
                SparkUtilities.SaveParquet(dfConsolidated, self.fileUtilities)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(tableDDL, self.fileUtilities.parquet)
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tableDDL)
                break
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + err.message)
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share and place it in the raw folder
    '''
    try:
        rawFolder = self.localTempDirectory + '/raw/'
        if "srcS3DataFolder" in tables:
            self.DownloadFilesFromS3(tables)
            xl = ExcelUtilities(self.logger)
            outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
            xl.Excel2CSV(rawFolder + tables["srcFile"],
                         None,
                         outPutFileName,
                         self.fileUtilities.csvFolder,
                         defDateFormat='%Y-%m-%d',
                         skiprows=tables["skipRows"])
        else:
            self.ProcessWebCall(tables, rawFolder)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        self.fileUtilities.EmptyFolderContents(rawFolder)
        # Remove any null records
        df = df.dropna(how='all')
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessRequest")
        raise
def ProcessTable(self, table):
    '''
    Process data for the table
    :param table:
    :return:
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    self.fileUtilities.moduleName = self.moduleName
    self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
    self.fileUtilities.CreateFolders(self.job["folders"])

    fileName = ntpath.basename(s3Key)
    local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"], s3Key, local7zipFilePath)

    localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
    localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)
    self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
    fileToBeloaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header='true', delimiter=self.job["delimiter"], ignoreTrailingWhiteSpace='true')
          .schema(schema)
          .load(fileToBeloaded))
    #df.show()
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")
def ProcessTables(self, dbCommon, tables):
    '''
    Process each file
    '''
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)  # Clear the folder from the previous run
    self.ProcessFiles(tables)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    # We will compute "period_type" later
    schemaWithoutPeriodType = SparkUtilities.BuildSparkSchema(tables, excludeComputed=True)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header=False, delimiter=self.job['delimiter'],
                   ignoreTrailingWhiteSpace=True, ignoreLeadingWhiteSpace=True)
          .schema(schemaWithoutPeriodType)
          .load(self.fileUtilities.csvFolder))

    if "filterData" in tables:
        df = df.filter(tables["filterData"])

    # Replace "NEW" with blank.  E.g. DEC1990NEW to DEC1990
    from pyspark.sql import functions as F  # @UnresolvedImport
    df = SparkUtilities.RenameColumnsInList(df, [("period", "period_old")])  # Rename column since we cannot edit in place
    df = df.withColumn("period", F.regexp_replace(df["period_old"], "NEW", ""))

    # Compute "period_type".  The following simple rules have been applied:
    #   MAY2013 - 7 characters so assumed to be 'M'
    #   Q12017  - 6 characters so assumed to be 'Q'
    #   2017    - 4 characters so assumed to be 'Y'
    df = df.withColumn("period_type",
                       F.when(F.length(df.period) == 7, "M")
                        .when(F.length(df.period) == 6, "Q")
                        .when(F.length(df.period) == 4, "Y")
                        .otherwise(""))

    # Reorder the columns based on the input column order
    schema = SparkUtilities.BuildSparkSchema(tables)
    df = df.select(schema.names)

    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " finished")
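# --- Illustrative sketch (not part of the module above) ---
# A minimal, standalone demonstration of the length-based period_type rule applied above,
# using the sample values from the comments (MAY2013 -> M, Q12017 -> Q, 2017 -> Y).
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
dfPeriods = spark.createDataFrame([("MAY2013",), ("Q12017",), ("2017",)], ["period"])
dfPeriods = dfPeriods.withColumn("period_type",
                                 F.when(F.length("period") == 7, "M")
                                  .when(F.length("period") == 6, "Q")
                                  .when(F.length("period") == 4, "Y")
                                  .otherwise(""))
dfPeriods.show()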
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process each risks table.
    '''
    try:
        last_update_date = self.GetLastUpdateDate(catalog["tables"][0])
        for i in range(dbCommon["daysback"]):
            last_update_date += datetime.timedelta(days=1)
            if last_update_date < datetime.date.today():
                self.logger.info(self.moduleName + "- Processing Date: " + str(last_update_date))
                self.fileUtilities.EmptyFolderContents(self.localTempDirectory + "/raw")
                self.PullOData(dbCommon, catalog["tables"][0], last_update_date)
                spark = SparkUtilities.GetCreateSparkSession(self.logger)
                for tableDDL in catalog["tables"]:
                    fileBaseName = "last_update_date-" + str(last_update_date)
                    df = spark.read.json(self.localTempDirectory + "/raw/magellan_" + tableDDL["type"] + ".json")
                    SparkUtilities.SaveParquet(df, self.fileUtilities, fileBaseName)
                    self.UploadFilesCreateAthenaTablesAndSqlScripts(tableDDL, self.fileUtilities.parquet,
                                                                    str(last_update_date))
            else:
                self.logger.info(self.moduleName + "- Already up to date. " + str(i))
    except Exception as err:
        self.logger.error(self.moduleName +
                          " - Error while trying to process catalog. Error: " + err.message)
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process the data for the table
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)
    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    # Unzip the file rather than reading the gzip as Spark is faster with csv
    localCSVFilepath = self.fileUtilities.csvFolder + "/" + fileName + ".csv"
    self.fileUtilities.GunzipFile(localGzipFilepath, localCSVFilepath)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], True,
                                    self.fileUtilities.csvFolder, self.logger)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.info(self.moduleName + " -- " + "ProcessTable " + " finished ")
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share and place it in the raw folder
    '''
    try:
        processingFile = self.DownloadFile(self.job["srcCategories"])
        self.CreateCSVFile(processingFile, self.job["srcCategories"])

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schemaAllString = SparkUtilities.BuildSparkSchema(tables, True)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header=False, delimiter=self.job["srcCategories"]["delimiter"])
              .option("ignoreTrailingWhiteSpace", "true")
              .option("ignoreLeadingWhiteSpace", "true")
              .schema(schemaAllString)
              .load(self.fileUtilities.csvFolder))
        # "\xE2\x80\x93" is the UTF-8 byte sequence for an en dash; strip it from the data
        df = SparkUtilities.ReplaceAll(df, "\xE2\x80\x93", "")
        df2 = SparkUtilities.ConvertTypesToSchema(df, schema)
        SparkUtilities.SaveParquet(df2, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the files from the share and place them in the raw folder
    '''
    try:
        rawFolder = self.localTempDirectory + '/raw/'
        flist = self.DownloadFile(rawFolder)
        xl = ExcelUtilities(self.logger)
        outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
        for fl in flist:
            xl.Excel2CSV(rawFolder + fl,
                         'Sheet1',
                         outPutFileName,
                         self.fileUtilities.csvFolder)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def LoadClassRefDF(self, spark):
    '''
    Loads the class reference data
    '''
    xReferencesDF = {}
    for catalog in self.job["catalogs"]:
        if catalog["name"] == "xReferences":
            for xrefTable in catalog["tables"]:
                if self.xRefPulled is False:
                    S3Utilities.CopyItemsAWSCli(
                        "s3://" + self.job["bucketName"] + xrefTable["s3SourceFolder"] + xrefTable["sourceFileName"],
                        self.fileUtilities.csvFolder,
                        "--quiet")
                xReferencesDF[xrefTable["table"]] = SparkUtilities.ReadCSVFile(
                    spark, xrefTable, self.job["delimiter"], False,
                    self.fileUtilities.csvFolder + "/" + xrefTable["sourceFileName"],
                    self.logger)
    self.xRefPulled = True
    return xReferencesDF
def ProcessTables(self, dbCommon, tables):
    '''
    Get the last partition value and use that as the date to pull data,
    then put that data into Athena
    '''
    try:
        outputCSV = self.fileUtilities.csvFolder + self.moduleName + ".CSV"
        fieldTerminator = self.job["fieldTerminator"]
        if "fieldTerminator" in tables:
            fieldTerminator = tables["fieldTerminator"]
        rawFolder = self.localTempDirectory + "/raw/"
        rowTerminator = None  # Not using this. Stick with the default of CR/LF.  self.job["rowTerminator"]

        if "pullTemplate" in tables:
            lastRunDate = self.GetParameters(tables)
            formattedLastRunDate = lastRunDate[4:6] + '/' + lastRunDate[6:8] + '/' + lastRunDate[:4]
            sqlPullDataScript = self.CreatePullScript(tables, formattedLastRunDate)
            self.bcpUtilities.BulkExtract(self.fileUtilities.LoadSQLQuery(sqlPullDataScript),
                                          outputCSV, dbCommon, tables, fieldTerminator, rowTerminator,
                                          self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
            self.masterSchema = SparkUtilities.BuildSparkSchema(tables)
            self.fileUtilities.MoveFilesFromOneFolderToAnother(self.fileUtilities.csvFolder,
                                                               rawFolder,
                                                               '*.csv')
            return

        ###
        #  Load the data frame from the CSV files
        ###
        partitionValue = self.GetPartitionValue()
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = (spark.read.format("com.databricks.spark.csv")
              .options(header=False, delimiter=fieldTerminator)
              .schema(self.masterSchema)
              .load(rawFolder))

        cols = []
        for field in tables["fields"]:
            if "athenaOnly" in field:
                if field["athenaOnly"] != "Y":
                    cols.append(field["name"])
            else:
                cols.append(field["name"])

        if tables["type"] == "attributes":
            dfAttributes = df.select(cols).distinct()
            if dfAttributes.count() == 0:
                self.logger.debug(self.moduleName + " - no records to process for Attribute data")
                return
            SparkUtilities.SaveParquet(dfAttributes, self.fileUtilities)
        elif tables["type"] == "series":
            dfSeries = df.select(cols)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    dfSeries = SparkUtilities.FormatColumn(dfSeries, fld["name"], fld["inputFormat"])
            if dfSeries.count() == 0:
                self.logger.debug(self.moduleName + " - no records to process for Series data")
                return
            SparkUtilities.SaveParquet(dfSeries, self.fileUtilities)

        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process each risks table.
    '''
    try:
        if catalog["name"] == "Risks":
            self.logger.debug(self.moduleName + " -- " + "Processing data for catalog: " + catalog["name"])
            self.DownloadData(dbCommon)
            for tableDDL in catalog["tables"]:
                dfFixed = None
                spark = SparkUtilities.GetCreateSparkSession(self.logger)
                xReferencesDF = self.LoadClassRefDF(spark)
                xReferencesDF["class_xref"].createOrReplaceTempView("class_xref")
                xReferencesDF["iso3166_xref"].createOrReplaceTempView("iso3166_xref")
                dfMaster = spark.read.json(self.localTempDirectory + "/raw/ecr_risks_" + tableDDL["type"] + ".json")
                if tableDDL["type"] == "current":
                    dfMaster.createOrReplaceTempView("risks")
                    dfFixed = spark.sql("""
                        SELECT iso3166_xref.countryname AS country,
                               clsRef.risk_desc AS risk_name,
                               CAST(risks.risk_value AS DOUBLE) AS risk_value,
                               risks.risk_description,
                               clsRef.class_name AS risk_class,
                               avgs.class_avg AS risk_class_avg,
                               risks.updated_on
                        FROM risks
                        inner join iso3166_xref on iso3166_xref.iso3166 = risks.country
                        inner join class_xref clsRef on clsRef.risk_name = risks.risk_name
                        inner join (SELECT country, risk_name, risk_class,
                                           AVG(risk_value) OVER(PARTITION BY country, risk_class) AS class_avg
                                    FROM risks) avgs
                            ON avgs.country = risks.country
                            AND avgs.risk_name = risks.risk_name
                            AND avgs.risk_class = risks.risk_class
                        """)
                else:
                    dfMaster.createOrReplaceTempView("risksHistory")
                    dfFixed = spark.sql("""
                        SELECT iso3166_xref.countryname AS country,
                               clsRef.risk_desc AS risk_name,
                               CAST(risksHistory.risk_value AS DOUBLE) AS risk_value,
                               risksHistory.updated_on
                        FROM risksHistory
                        inner join iso3166_xref on iso3166_xref.iso3166 = risksHistory.country
                        inner join class_xref clsRef on clsRef.risk_name = risksHistory.risk_name
                        """)
                self.logger.info(self.moduleName + " -- " + "Done reading " + str(dfFixed.count()) +
                                 " rows. Now saving as parquet file...")
                SparkUtilities.SaveParquet(dfFixed, self.fileUtilities)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(tableDDL, self.fileUtilities.parquet)
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tableDDL)
                spark.catalog.dropTempView("class_xref")
                spark.catalog.dropTempView("iso3166_xref")
                spark.catalog.dropTempView("risks")
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + err.message)
        raise
        {
            "name": "Contract_Units",
            "type": "VARCHAR",
            "size": "200"
        },
        {
            "name": "CFTC_Contract_Market_Code_Quotes",
            "type": "VARCHAR",
            "size": "30"
        },
        {
            "name": "CFTC_Market_Code_Quotes",
            "type": "VARCHAR",
            "size": "30"
        },
        {
            "name": "CFTC_Commodity_Code_Quotes",
            "type": "VARCHAR",
            "size": "30"
        },
        {
            "name": "CFTC_SubGroup_Code",
            "type": "VARCHAR",
            "size": "30"
        },
        {
            "name": "FutOnly_or_Combined",
            "type": "VARCHAR",
            "size": "40"
        }
    ]
}'''
table = json.loads(tableString)

logger = FileUtilities.CreateLogger("log", 10)
os.environ["SPARK_HOME"] = "C:/WorkSpaceEclipse36/SparkWindows/spark"
os.environ["HADOOP_HOME"] = "C:/WorkSpaceEclipse36/SparkWindows/hadoop"
sc, sqlContext = SparkUtilities.CreateSparkContext(logger)

samplejson = '''{
    "fields": [
        {"metadata": {}, "nullable": true, "name": "sourceset", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "sourcesetdesc", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "market_and_exchange_names", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "as_of_date_in_form_yymmdd", "type": "integer"},
        {"metadata": {}, "nullable": true, "name": "report_date_as_yyyy_mm_dd", "type": "string"}
    ],
    "type": "struct"
}'''
#schemaJson = json.loads(samplejson)
#from pyspark.sql.types import StructType  #@UnresolvedImport
#schema = StructType.fromJson(schemaJson)
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process the current table to load it up
    '''
    try:
        FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)  # Clear the folder from the previous run
        FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run
        url = dbCommon["urlPrefix"] + catalog["name"] + "." + dbCommon["urlExt"]
        self.logger.info(self.moduleName + " - Processing url: " + url)

        localZipFilepath = self.fileUtilities.gzipFolder + "/" + \
            catalog["name"] + "." + dbCommon["urlExt"]
        self.fileUtilities.DownloadFromURL(url, localZipFilepath)
        self.fileUtilities.UnzipFile(localZipFilepath, self.fileUtilities.csvFolder)
        localFilepath = self.fileUtilities.csvFolder + "/" + catalog["name"] + ".txt"

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        dfMaster = spark.read.json(localFilepath)
        dfMaster = dfMaster.filter(dfMaster.series_id != "")
        for table in catalog["tables"]:
            self.logger.info(self.moduleName + " -- " + "Processing table: " + table["table"])
            # The column names being used in the source may be different from the ones in the final
            # database.  Select columns based on the source and then rename to the destination
            schemaSrc = SparkUtilities.BuildSparkSchema(table, useValidation=True)
            if table["dataSet"] == "attributes":
                df = dfMaster.select(schemaSrc.names)
            elif table["dataSet"] == "data":
                # There is some instability we need to monitor.  Print seems to slow down and stabilize the run???
                print(dfMaster.rdd.take(5))
                df = dfMaster.rdd.flatMap(lambda row: EIAAthenaSpark.ProcessDataRecords(row)).toDF(schemaSrc.names)
            else:
                raise ValueError("Undefined dataSet type")

            schemaDst = SparkUtilities.BuildSparkSchema(table)
            df = SparkUtilities.RenameColumnsToSchema(df, schemaDst)
            df = SparkUtilities.ConvertTypesToSchema(df, schemaDst)
            self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                             " rows. Now saving as parquet file...")

            FileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)  # Clear the folder from the previous run
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)
        self.logger.debug(self.moduleName + " -- " + "ProcessS3File for: " + url + " finished.\n\n")
    except:
        self.logger.exception("we had an error in EIA on ProcessS3File")
        raise
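# --- Illustrative sketch (not part of the module above) ---
# EIAAthenaSpark.ProcessDataRecords is not shown here.  This hedged example assumes each source
# row carries a series_id plus a "data" list of (period, value) pairs, which is how the EIA bulk
# files are commonly laid out, and shows the flatMap pattern used above to emit one flat record
# per observation.  The series id and values are placeholders.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
raw = spark.createDataFrame([Row(series_id="EXAMPLE.SERIES.A",
                                 data=[["2016", "4076.7"], ["2017", "4034.3"]])])

def explodeSeriesSketch(row):
    # One output record per (period, value) observation in the series
    for period, value in row.data:
        yield (row.series_id, period, value)

flat = raw.rdd.flatMap(explodeSeriesSketch).toDF(["series_id", "period", "value"])
flat.show()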
def ProcessTables(self, dbCommon, tables):
    '''
    Process the current table to load it up
    '''
    try:
        self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
        # Cleanup first (TODO - Need a more generic way to do this)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)

        # Variables used for handling chunks.  -1 for full load
        chunkStart = chunkEnd = maxValue = chunkSize = -1
        if "incrementalconditions" in tables:
            incrementalConditions = tables["incrementalconditions"]
            if "startID" in incrementalConditions:
                chunkEnd = incrementalConditions["startID"] - 1
            else:
                athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
                try:
                    # This is where we last ended.  Start at 1 + this end
                    chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                                               tables["table"], tables["distkey"],
                                                               self.logger))
                except ValueError:
                    chunkEnd = 0  # Table does not exist yet
                except:
                    raise

            if "endID" in incrementalConditions:
                maxValue = incrementalConditions["endID"]
            else:
                # TODO - Fix this.  Also, we should start at the source min value not 0.
                maxValue = 2000000000  # BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, chunkStart)

            chunkSize = tables["incrementalconditions"]["chunksize"]
            chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

        fieldDelimiter = self.job["delimiter"]
        if "delimiter" in tables:
            fieldDelimiter = tables["delimiter"]

        while chunkStart <= maxValue:
            partitionValue = self.GetPartitionValue(tables, chunkStart)
            sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                              self.logger, self.fileUtilities, self.location)
            # Construct a file name that is meaningful.  That is, it has the start and end IDs
            outputCSV = self.fileUtilities.csvFolder + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd) + ".csv"
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
            self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables,
                                          fieldDelimiter, self.job["bcpUtilityDirOnLinux"],
                                          self.fileUtilities, self.logger)

            # Process the data using Spark and save as Parquet
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read
                  .format("com.databricks.spark.csv")
                  .options(header='false', delimiter=fieldDelimiter)
                  .schema(schema)
                  .load(self.fileUtilities.csvFolder))
            df.printSchema()
            df.show()
            df = SparkUtilities.ProcessSpecialCharsIfAny(df, tables)
            self.logger.info(self.moduleName + " -- " + "DONE READING " + str(df.count()) +
                             " ROWS. Now saving as parquet file...")
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
            SparkUtilities.SaveParquet(df, self.fileUtilities)

            # Need to load the data and clear the local space
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
            tables["new"] = "N"  # Do not recreate

            if chunkSize < 0:
                break  # Done with the single load
            chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

        # TODO - Need to make sure we don't end up with duplicate data if we run the code
        # twice on the same day
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

        self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " finished")
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessDatabase for " + tables["table"])
        raise
def ProcessTables(self, dbCommonNotUsed, table):
    '''
    The actual process starts here
    '''
    try:
        strDateTodayMinus1 = datetime.datetime.strftime(datetime.date.today() - datetime.timedelta(days=1),
                                                        "%Y-%m-%d")
        latestValuationDateInAthena = self.GetLatestValuationDateInAthena(table)
        if latestValuationDateInAthena == strDateTodayMinus1:
            self.logger.debug(self.moduleName + " -- " +
                              "*** Totem data is already up-to-date as of: " +
                              latestValuationDateInAthena + " ***")
            return

        self.SetGlobalVariables()
        yearMonthsToProcess = self.GetYearMonthsToProcess(latestValuationDateInAthena)
        #yearMonthsToProcess = self.GetYearMonthsToProcess("2017-11-10") # For debugging
        for yearMonth in yearMonthsToProcess:
            self.logger.debug(self.moduleName + " -- " + "Processing Year-Month: " + yearMonth)
            strDateToday = datetime.datetime.strftime(datetime.date.today(), "%Y-%m-%d")

            self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
            fileListForYearMonth = self.GetFileListForYearMonth(yearMonth)
            for fileName in fileListForYearMonth:
                self.GetFile(yearMonth, fileName)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, table, self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder, self.logger)

            # The data frame contains a number of valuation dates.  Get the distinct valuation dates
            # and create a partition for each valuation date
            distinctValuationDates = sorted(df.select(df.valuationdate).distinct().collect())
            for item in distinctValuationDates:
                # Process new days only.  Skip today so that we don't get partials.  Otherwise we will have
                # to delete data from Athena/RedShift to avoid duplicates
                if item.valuationdate <= latestValuationDateInAthena or item.valuationdate == strDateToday:
                    continue

                self.logger.debug(self.moduleName + " - Processing Valuation Date: " + item.valuationdate)
                dfValuationDate = df.filter(df.valuationdate == item.valuationdate)
                fileBaseName = "ValuationDate-" + item.valuationdate
                SparkUtilities.SaveParquet(dfValuationDate, self.fileUtilities, fileBaseName)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet,
                                                                item.valuationdate)
                if "loadToRedshift" in table and table["loadToRedshift"] == "Y":
                    self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)

        self.logger.info(self.moduleName + " - Finished processing.")
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessDatabase for " + table["table"])
        raise