Example #1
def save_offsets(rdd):
    # Persist the ending offset of every partition in this batch to
    # ZooKeeper so the stream can resume exactly where it left off.
    print("Saving offset | Exactly Once Semantics")
    zk = PipelineUtils.getZookeeperInstance()
    for offset in rdd.offsetRanges():
        path = f"/consumers/{offset.topic}/{offset.partition}"
        zk.ensure_path(path)
        zk.set(path, str(offset.untilOffset).encode())
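save_offsets is meant to run once per batch on a direct Kafka DStream, whose RDDs carry offsetRanges(). A minimal wiring sketch, assuming the legacy pyspark.streaming.kafka API (removed in Spark 3.x) and a hypothetical PipelineUtils.getStreamingContext() helper; the topic name and broker address are made up:

from pyspark.streaming.kafka import KafkaUtils

ssc = PipelineUtils.getStreamingContext()  # hypothetical helper
stream = KafkaUtils.createDirectStream(
    ssc, ["encounters"], {"metadata.broker.list": "localhost:9092"})
# offsetRanges() is only defined on the RDDs produced directly by the
# stream, so commit offsets before any shuffling transformation.
stream.foreachRDD(save_offsets)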
Example #2
def voidFlatObs(encounter_ids):
    try:
        db = PipelineUtils.getConfig()['storage']['db']
        # Build a comma-separated ID list for the IN clause.
        encounter_ids = ','.join(map(str, encounter_ids))
        if db == "delta":
            deltaTable = DeltaUtils.getDeltaTable("flat_obs_orders")
            deltaTable.delete("encounter_id IN ({0})".format(encounter_ids))
        elif db == "cassandra":
            CassandraUtils.deleteFromCassandra("flat_obs_orders", encounter_ids)
    except Exception as e:
        print("An unexpected error occurred while voiding FlatObs rows", e)
        raise
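Because the IN clause is built by string interpolation, the function assumes numeric encounter IDs. A usage sketch with made-up IDs:

# Void previously flattened rows before re-materialising them.
voidFlatObs([10045, 10046, 10051])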
Example #3
from pyspark.streaming.kafka import TopicAndPartition

def read_offsets(topics):
    try:
        zk = PipelineUtils.getZookeeperInstance()
        from_offsets = {}
        for topic in topics:
            for partition in zk.get_children(f'/consumers/{topic}'):
                topic_partition = TopicAndPartition(topic, int(partition))
                offset = int(zk.get(f'/consumers/{topic}/{partition}')[0])
                from_offsets[topic_partition] = offset
        print("Previous offset -->", from_offsets)
        return from_offsets
    except Exception as e:
        # On the very first run the znodes do not exist yet; return None
        # so the stream starts from the default offsets.
        print("An unexpected error occurred while reading offset", e)
        return None
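Together with save_offsets, this gives resume-from-ZooKeeper semantics: read the stored offsets at startup and hand them to the direct stream. A sketch under the same legacy-API assumptions as above (topic and broker are made up):

from pyspark.streaming.kafka import KafkaUtils

ssc = PipelineUtils.getStreamingContext()  # hypothetical helper, as above
from_offsets = read_offsets(["encounters"])  # None on the very first run
stream = KafkaUtils.createDirectStream(
    ssc, ["encounters"], {"metadata.broker.list": "localhost:9092"},
    fromOffsets=from_offsets or {})
stream.foreachRDD(save_offsets)  # commit the new offsets after each batch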
Example #4
def sinkFlatObs(microbatch, batchId):
    try:
        db = PipelineUtils.getConfig()['storage']['db']
        if db == "delta":
            # Restrict the merge to the patients present in this batch so
            # Delta only rewrites the matching data.
            patient_id = microbatch.select("patient_id").rdd.flatMap(lambda x: x).collect()
            whereClause = "table.patient_id IN ({0}) AND table.encounter_id = updates.encounter_id"\
                .format(','.join(map(str, patient_id)))
            print(whereClause)
            DeltaUtils.upsertMicroBatchToDelta("flat_obs_orders",  # Delta table name
                                               microbatch,         # micro-batch DataFrame
                                               whereClause)        # merge condition
        elif db == "cassandra":
            CassandraUtils.sinkToCassandra(microbatch, "flat_obs_orders", mode="append")
    except Exception as e:
        print("An unexpected error occurred while sinking FlatObs microbatch", e)
        raise
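The (microbatch, batchId) signature matches Structured Streaming's foreachBatch contract, so the function plugs straight into a streaming write. A minimal sketch, assuming a streaming DataFrame flat_obs_df and a checkpoint path chosen for illustration:

query = (flat_obs_df.writeStream
         .foreachBatch(sinkFlatObs)
         .option("checkpointLocation", "/tmp/checkpoints/flat_obs_orders")
         .start())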
Example #5
def getDeltaTable(table):
    # Resolve the table's storage path from config and bind a DeltaTable.
    deltaConfig = PipelineUtils.getConfig()['storage']
    path = deltaConfig['tables'][table]["path"]
    spark = PipelineUtils.getSpark()
    return DeltaTable.forPath(spark, path)
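getDeltaTable only reads two nested keys from the pipeline config. A hypothetical sketch of the shape it expects (path and names are made up):

config = {
    "storage": {
        "db": "delta",
        "tables": {
            "flat_obs_orders": {"path": "/data/delta/flat_obs_orders"},
        },
    },
}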
Example #6
from common.utils import PipelineUtils

# Create the Spark session before importing delta.tables (required when
# Delta is loaded via Spark packages rather than pip).
PipelineUtils.getSpark()
from delta.tables import *  # ignore pylint error (wildcard import)

class DeltaUtils:

    @staticmethod
    def getDeltaTable(table):
        # Resolve the table's storage path from config and bind a DeltaTable.
        deltaConfig = PipelineUtils.getConfig()['storage']
        path = deltaConfig['tables'][table]["path"]
        spark = PipelineUtils.getSpark()
        return DeltaTable.forPath(spark, path)

    # Static method for merging incremental updates into a Delta table.
    @staticmethod
    def upsertMicroBatchToDelta(tableName, microBatchOutputDF, whereClause="table.id = updates.id"):
        deltaTable = DeltaUtils.getDeltaTable(tableName)
        return deltaTable.alias("table").merge(microBatchOutputDF.alias("updates"), whereClause)\
            .whenMatchedUpdateAll()\
            .whenNotMatchedInsertAll()\
            .execute()
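A usage sketch for the default merge key, assuming a hypothetical table named some_table registered in the config; rows with matching ids are overwritten, unmatched ids are inserted:

spark = PipelineUtils.getSpark()
updates = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
DeltaUtils.upsertMicroBatchToDelta("some_table", updates)  # default "table.id = updates.id"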
Example #7
def sourceFromCassandra(table):
    # Read a full Cassandra table from the 'elt' keyspace as a DataFrame.
    return PipelineUtils.getSpark().read\
        .format("org.apache.spark.sql.cassandra")\
        .options(table=table, keyspace="elt")\
        .load()
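A usage sketch, assuming the session was started with the spark-cassandra-connector package on the classpath; the filter value is made up:

obs = sourceFromCassandra("flat_obs_orders")
obs.printSchema()
obs.filter("encounter_id = 12345").show()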