def test_DateString_Ymd(self):
    self.assertEqual(
        wd.dateCleanup('20200205').strftime("%Y%m%d%H%M%S"),
        '20200205000000')

def test_DateWithColonTimeNoSeparationFormat(self):
    self.assertEqual(
        wd.dateCleanup('2020:02:2812:30:00').strftime("%Y%m%d%H%M%S"),
        '20200228123000')

def test_DayLessThanTwelve(self):
    self.assertEqual(
        wd.dateCleanup('2020-02-05').strftime("%Y%m%d%H%M%S"),
        '20200205000000')

def test_alphaDateShortFormat(self):
    self.assertEqual(
        wd.dateCleanup('MAR 25 2020').strftime("%Y%m%d%H%M%S"),
        '20200325000000')

def test_DateWithTimeFormat(self):
    self.assertEqual(
        wd.dateCleanup('2020-02-28 12:30:00').strftime("%Y%m%d%H%M%S"),
        '20200228123000')

def test_germanDateFormatWithTime(self):
    self.assertEqual(
        wd.dateCleanup('07.04.2020 12:12:12').strftime("%Y%m%d%H%M%S"),
        '20200407121212')

def test_alphaDateFormat(self):
    self.assertEqual(
        wd.dateCleanup('march 25, 2020').strftime("%Y%m%d%H%M%S"),
        '20200325000000')

def test_germanDateFormat(self):
    self.assertEqual(
        wd.dateCleanup('07.04.2020').strftime("%Y%m%d%H%M%S"),
        '20200407000000')

def test_TZFormat(self):
    self.assertEqual(
        wd.dateCleanup('2015-03-26T10:58:51Z').strftime("%Y%m%d%H%M%S"),
        '20150326105851')

def test_epochFormat(self):
    self.assertEqual(
        wd.dateCleanup(1571824800000, epoch=True).strftime("%Y%m%d%H%M%S"),
        '20191023100000')
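
# ---------------------------------------------------------------------------
# Illustrative only: wd.dateCleanup itself is defined elsewhere. The sketch
# below is a hypothetical stand-in with comparable behavior, assuming
# python-dateutil is available and that epoch inputs are milliseconds since
# 1970-01-01 UTC (as test_epochFormat suggests). The compact colon form
# ('2020:02:2812:30:00') would need extra normalization not shown here.
# ---------------------------------------------------------------------------
from datetime import datetime, timezone
from dateutil import parser as dateparser


def date_cleanup_sketch(val, epoch=False):
    """Hypothetical sketch: parse mixed-format input into a naive datetime."""
    if epoch:
        # epoch inputs arrive as milliseconds since 1970-01-01 UTC
        return datetime.fromtimestamp(int(val) / 1000,
                                      tz=timezone.utc).replace(tzinfo=None)
    # dayfirst covers the German dd.mm.yyyy inputs; ISO, alphabetic and
    # 'Z'-suffixed forms parse directly
    return dateparser.parse(str(val), dayfirst=True).replace(tzinfo=None)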
def dc(val):
    result = wd.dateCleanup(val, epoch=True)
    return result
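
# Illustrative check of dc(), mirroring test_epochFormat above; this assumes
# wd.dateCleanup treats epoch values as milliseconds since 1970-01-01 UTC:
#   dc(1571824800000).strftime("%Y%m%d%H%M%S")  ->  '20191023100000'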
def __init__(self, spark):
    # ---------------------------------------------------------------------
    # set config attributes
    # ---------------------------------------------------------------------
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as dependencyFile:
            conf = json.load(dependencyFile)
        for x in conf:
            setattr(parent, x, conf.get(x, ""))

    # ---------------------------------------------------------------------
    # set the process state
    # ---------------------------------------------------------------------
    processStart = wd.dateCleanup(
        datetime.utcnow().strftime("%Y%m%d%H%M%S"))

    # ---------------------------------------------------------------------
    # create udf dateCleanup
    # ---------------------------------------------------------------------
    def dc(val):
        result = wd.dateCleanup(val, epoch=True)
        return result

    dateCleanup = F.udf(dc, TimestampType())

    # ---------------------------------------------------------------------
    # set audit
    # ---------------------------------------------------------------------
    tableId = self.name
    columnAudit = "tableid,ins_gmt_ts,process_timestamp"
    namespaceAudit = "ea_sc_kif"
    tableAudit = "batch_process_times"
    cfAudit = "o"
    dfAudit = DataFrame(
        spark.sparkContext._jvm.com.hpe.hbase.HbaseManager.getDF(
            columnAudit, namespaceAudit, tableAudit, cfAudit),
        spark).where(F.col("tableid") == tableId)

    # ---------------------------------------------------------------------
    # get last processed time
    # ---------------------------------------------------------------------
    col = "ins_gmt_ts"
    lastProcessedDate = dfAudit.select(col).collect()
    if len(lastProcessedDate) > 0:
        DTS = str([ele[col] for ele in lastProcessedDate][0])
        deltaFilter = (F.col('epoch_ts') > DTS)
    else:
        DTS = ""
        deltaFilter = ""

    # ---------------------------------------------------------------------
    # get mapper attributes
    # ---------------------------------------------------------------------
    with open(self.app["mapper-properties"]) as mapperFile:
        topMap = json.load(mapperFile)

    # ---------------------------------------------------------------------
    # set hbase table properties
    # ---------------------------------------------------------------------
    hbase = topMap["hbase"]
    table = hbase.get("table", "none")
    cf = hbase.get("cf", "none")
    namespace = hbase.get("namespace", "none")
    column = hbase.get("column", "none")
    latestVersionMapped = hbase.get("latestVersion", "none")
    if latestVersionMapped == "True":
        latestVersion = True
    else:
        latestVersion = False

    # ---------------------------------------------------------------------
    # create data frame
    # ---------------------------------------------------------------------
    df = spark.sparkContext \
        ._jvm.com.hpe.hbase \
        .HbaseManager.getDF(column, namespace, table, cf, latestVersion)
    pyDF = DataFrame(df, spark)
    pyDF = pyDF.withColumn(
        "epoch_ts",
        F.to_timestamp(dateCleanup(pyDF["ts"]), 'yyyy-MM-dd HH:mm:ss'))
    sqlDF = pyDF
    if type(deltaFilter) is not str:
        sqlDF = sqlDF.where(deltaFilter)

    # ---------------------------------------------------------------------
    # max date
    # ---------------------------------------------------------------------
    maxDate = dfColumnToString(
        sqlDF.agg(F.max("epoch_ts")).select("max(epoch_ts)"),
        "max(epoch_ts)")

    # ---------------------------------------------------------------------
    # if no delta results available then end, else process delta
    # ---------------------------------------------------------------------
    if maxDate:
        printInc = ("Found delta maxdate of '{}'."
                    " Starting incremental processing of delta records.")
        print(printInc.format(maxDate))

        # -----------------------------------------------------------------
        # set checkpoint dir
        # -----------------------------------------------------------------
        spark \
            .sparkContext \
            .setCheckpointDir(self.app["checkpoint"])
        sqlDF.persist(StorageLevel.MEMORY_AND_DISK)
        sqlDF.take(1)

        # -----------------------------------------------------------------
        # set recursive table properties
        # -----------------------------------------------------------------
        recursive = topMap["recursive"]
        id = str(recursive.get("id", "none"))
        parentid = str(recursive.get("parentid", "none"))
        parentLookupCol = str(recursive.get("parentLookupCol", "none"))
        lookupCol = str(recursive.get("lookupCol", "none"))
        levels = recursive.get("levels", "2")

        # -----------------------------------------------------------------
        # set output DF columns
        # -----------------------------------------------------------------
        outCols = stringToList(column)
        outCols.insert(0, id)
        outCols.append(lookupCol)

        # -----------------------------------------------------------------
        # run recursiveLookup
        # -----------------------------------------------------------------
        recursiveDF = recursiveLookup(sqlDF, id, parentid,
                                      parentLookupCol, lookupCol,
                                      levels).select(outCols)
        recursiveDF = recursiveDF.withColumn(
            "parentlevel", F.col("parentlevel").cast(StringType()))
        recursiveDF = recursiveDF.withColumn("key", F.col(id))

        # -----------------------------------------------------------------
        # store df to hbase
        # -----------------------------------------------------------------
        outputCols = appendString("key", "parentlevel", lookupCol)
        print("writing recursive output to Hbase . . .")
        spark.sparkContext \
            ._jvm.com.hpe.hbase \
            .HbaseManager.setDF(recursiveDF._jdf, outputCols, "key",
                                namespace, table, cf)

        # -----------------------------------------------------------------
        # close process state
        # -----------------------------------------------------------------
        processEnd = wd.dateCleanup(
            datetime.utcnow().strftime("%Y%m%d%H%M%S"))
        dfAuditWrite = sqlDF.agg(F.max("epoch_ts").alias("ins_gmt_ts")) \
            .select(F.col("ins_gmt_ts").cast(StringType())) \
            .withColumn("tableid", F.lit(tableId)) \
            .withColumn("process_start_ts",
                        F.lit(processStart).cast(StringType())) \
            .withColumn("process_end_ts",
                        F.lit(processEnd).cast(StringType()))
        auditWriteCols = "tableid,ins_gmt_ts,process_start_ts" \
            + ",process_end_ts"

        # -----------------------------------------------------------------
        # write to the audit log
        # -----------------------------------------------------------------
        print("updating audit log . . .")
        spark.sparkContext \
            ._jvm.com.hpe.hbase \
            .HbaseManager \
            .setDF(dfAuditWrite._jdf, auditWriteCols, "tableid",
                   namespaceAudit, tableAudit, cfAudit)

        # -----------------------------------------------------------------
        # add final df to class (if run outside of __main__)
        # -----------------------------------------------------------------
        self.df = recursiveDF
    else:
        # -----------------------------------------------------------------
        # No delta results
        # -----------------------------------------------------------------
        print("No delta available.")