def save_offsets(rdd):
    # Persist the latest Kafka offsets to ZooKeeper so the stream can resume
    # from them after a restart (exactly-once semantics).
    print("Saving offset | Exactly Once Semantics")
    zk = PipelineUtils.getZookeeperInstance()
    for offset in rdd.offsetRanges():
        path = f"/consumers/{offset.topic}/{offset.partition}"
        zk.ensure_path(path)
        zk.set(path, str(offset.untilOffset).encode())
def voidFlatObs(encounter_ids):
    # Delete previously sunk flat_obs_orders rows for the given encounters so that
    # reprocessed encounters do not leave stale records behind.
    try:
        db = PipelineUtils.getConfig()['storage']['db']
        encounter_ids = ','.join(map(str, encounter_ids))
        if db == "delta":
            deltaTable = DeltaUtils.getDeltaTable("flat_obs_orders")
            deltaTable.delete("encounter_id IN ({0})".format(encounter_ids))
        elif db == "cassandra":
            CassandraUtils.deleteFromCassandra("flat_obs_orders", encounter_ids)
    except Exception as e:
        print("An unexpected error occurred while voiding FlatObs records", e)
        raise
def read_offsets(topics):
    # Restore the last committed offsets from ZooKeeper; returns a
    # {TopicAndPartition: offset} map suitable for createDirectStream's fromOffsets.
    try:
        zk = PipelineUtils.getZookeeperInstance()
        from_offsets = {}
        for topic in topics:
            for partition in zk.get_children(f'/consumers/{topic}'):
                topic_partition = TopicAndPartition(topic, int(partition))
                offset = int(zk.get(f'/consumers/{topic}/{partition}')[0])
                from_offsets[topic_partition] = offset
        print("Previous offsets -->", from_offsets)
        return from_offsets
    except Exception as e:
        print("An unexpected error occurred while reading offsets", e)
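# Illustrative sketch only (not from the pipeline source): one way the two offset
# helpers above could be wired into a direct Kafka stream for exactly-once processing.
# The Kafka parameters, the processing body, and the function name are assumptions.
from pyspark.streaming.kafka import KafkaUtils

def startExactlyOnceStream(ssc, topics, kafka_params):
    # Resume from the offsets previously committed to ZooKeeper, if any
    from_offsets = read_offsets(topics)
    stream = KafkaUtils.createDirectStream(ssc, topics, kafka_params,
                                           fromOffsets=from_offsets or {})

    def process(rdd):
        if not rdd.isEmpty():
            # ... transform and sink the micro-batch here ...
            save_offsets(rdd)  # commit offsets only after the batch succeeded

    stream.foreachRDD(process)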
def sinkFlatObs(microbatch, batchId):
    try:
        db = PipelineUtils.getConfig()['storage']['db']
        if db == "delta":
            patient_id = microbatch.select("patient_id").rdd.flatMap(lambda x: x).collect()
            whereClause = "table.patient_id IN ({0}) AND table.encounter_id = updates.encounter_id"\
                .format(','.join(map(str, patient_id)))
            print(whereClause)
            DeltaUtils.upsertMicroBatchToDelta("flat_obs_orders",  # delta tablename
                                               microbatch,         # microbatch
                                               whereClause)        # where clause condition
        elif db == "cassandra":
            CassandraUtils.sinkToCassandra(microbatch, "flat_obs_orders", mode="append")
    except Exception as e:
        print("An unexpected error occurred while sinking FlatObs microbatch", e)
        raise
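# Usage sketch (assumption, not from the source): sinkFlatObs has the (DataFrame, batchId)
# signature that Structured Streaming's foreachBatch expects, so a streaming flat_obs query
# could be attached like this. 'flatObsStreamDF' and the checkpoint path are hypothetical.
query = flatObsStreamDF.writeStream \
    .foreachBatch(sinkFlatObs) \
    .option("checkpointLocation", "/tmp/checkpoints/flat_obs_orders") \
    .start()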
from common.utils import PipelineUtils

PipelineUtils.getSpark()
from delta.tables import *  # ignore pylint error


class DeltaUtils:

    @staticmethod
    def getDeltaTable(table):
        deltaConfig = PipelineUtils.getConfig()['storage']
        path = deltaConfig['tables'][table]["path"]
        spark = PipelineUtils.getSpark()
        return DeltaTable.forPath(spark, path)

    # static method for merging incremental updates into Delta tables
    @staticmethod
    def upsertMicroBatchToDelta(tableName, microBatchOutputDF,
                                whereClause="table.id = updates.id"):
        deltaTable = DeltaUtils.getDeltaTable(tableName)
        return deltaTable.alias("table").merge(microBatchOutputDF.alias("updates"), whereClause)\
            .whenMatchedUpdateAll()\
            .whenNotMatchedInsertAll()\
            .execute()
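# Minimal usage sketch with hypothetical column names and values: merge a small batch of
# updates into the configured "flat_obs_orders" Delta table, matching rows on encounter_id.
spark = PipelineUtils.getSpark()
updates = spark.createDataFrame(
    [(101, 7, "WEIGHT", 72.5)],
    ["encounter_id", "patient_id", "concept", "value"])
DeltaUtils.upsertMicroBatchToDelta(
    "flat_obs_orders",
    updates,
    whereClause="table.encounter_id = updates.encounter_id")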
def sourceFromCassandra(table):
    return PipelineUtils.getSpark().read\
        .format("org.apache.spark.sql.cassandra")\
        .options(table=table, keyspace="elt")\
        .load()
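# Usage sketch (assumption): pull the flat_obs_orders table back out of Cassandra,
# e.g. for a backfill or reconciliation job; the patient_id filter is illustrative.
obs_df = sourceFromCassandra("flat_obs_orders")
obs_df.filter(obs_df.patient_id == 7).show()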