from datetime import datetime, timedelta

from delta.tables import DeltaTable
from pyspark.sql import DataFrame, functions as f


def aggregateSalesRevenueDeltaBacked(updatesDF, epochId):
    # Sum up the new incoming keys, bucketing timestamps to the minute.
    incomingSalesAggregateDF = (
        updatesDF
        .withColumn("timestamp", f.date_trunc("minute", "timestamp"))
        .groupBy(f.col("timestamp"), f.col("item_id"))
        .agg(f.sum("sales").alias("sales")))

    targetTable = DeltaTable.forName(spark, "sw_db.delta_backed_state")

    # Merge the new sales into the already existing sales.
    # We simulate a watermark by only retrieving records with a timestamp
    # greater than the maximum seen timestamp minus 5 minutes.
    # Note that it is even better to partition the state by date if you have
    # days' worth of data, so that entire partitions can be skipped when the
    # predicate is pushed down.
    mostRecentTimestamp = (targetTable.toDF()
                           .select(f.max("timestamp").alias("max_timestamp"))
                           .head().max_timestamp)
    watermarkTime = (mostRecentTimestamp - timedelta(minutes=5)
                     if mostRecentTimestamp else datetime.min)

    (targetTable.alias("target")
     .merge(
         incomingSalesAggregateDF.alias("source"),
         f"""
         target.item_id = source.item_id AND
         target.timestamp = source.timestamp AND
         target.timestamp > cast('{watermarkTime}' AS TIMESTAMP) AND
         source.timestamp > cast('{watermarkTime}' AS TIMESTAMP)
         """)
     .whenMatchedUpdate(
         set={"sales": f.col("source.sales") + f.col("target.sales")})
     .whenNotMatchedInsertAll()
     .execute())
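# A minimal sketch of how the function above might be wired into a streaming
# query via foreachBatch. `salesStreamDF` and the checkpoint path are assumed
# names for illustration, not part of the original snippet.
query = (salesStreamDF.writeStream
         .foreachBatch(aggregateSalesRevenueDeltaBacked)
         .option("checkpointLocation", "/tmp/checkpoints/sales_state")
         .start())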
def _get_delta_table(spark, delta_path, delta_table):
    # At least one of delta_path or delta_table must be provided;
    # delta_path takes precedence when both are given.
    if [delta_path, delta_table].count(None) == 2:
        raise ValueError("Either delta_path or delta_table must be provided")
    if delta_path is not None:
        delta_table = DeltaTable.forPath(spark, delta_path)
    else:
        delta_table = DeltaTable.forName(spark, delta_table)
    return delta_table
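# Hypothetical usage of the helper above; the path and table name are assumed
# examples, not part of the original snippet.
by_path = _get_delta_table(spark, delta_path="/delta/events", delta_table=None)
by_name = _get_delta_table(spark, delta_path=None, delta_table="sw_db.events")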
from pyspark.sql.utils import AnalysisException


def _get_delta_table(self, spark, table_or_path, update_delta_table):
    # Try to resolve the argument as a path first, then fall back to a table
    # name. We catch AnalysisException (raised by forPath when the location
    # is not a Delta table) rather than a bare `except:`, which would also
    # swallow unrelated errors.
    try:
        deltaTable = DeltaTable.forPath(spark, table_or_path)
    except AnalysisException:
        deltaTable = DeltaTable.forName(spark, table_or_path)
    if update_delta_table:
        return deltaTable
    return deltaTable.toDF()
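# Hypothetical usage; `helper` is an assumed instance of the class that
# defines the method above. The flag controls whether a DeltaTable handle
# (for updates) or a plain DataFrame (for reads) is returned.
delta_table = helper._get_delta_table(spark, "sw_db.events", update_delta_table=True)
events_df = helper._get_delta_table(spark, "sw_db.events", update_delta_table=False)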
def merge(self,
          df: DataFrame,
          location: str,
          condition: str,  # Only supports SQL-like string conditions
          match_update_dict: dict,  # {"target_column": "expression"}
          insert_when_not_matched: bool = False,  # Set to True for upsert
          save_mode: str = 'table'):
    '''Merge a dataframe into a target table or path.

    This merge operation can represent both update and upsert operations.
    The source and target tables are aliased as 'SRC' and 'TGT' by default;
    these aliases can be used in the condition string and in the
    update/insert expressions.

    Args:
        df (DataFrame): The source dataframe to write.
        location (str): The table name or path to merge into.
        condition (str): The merge condition as a SQL-like string.
        match_update_dict (dict): Maps "target_column" to "expression".
            This gives the updated value when a row matches. When
            insert_when_not_matched is True, the same expressions are reused
            as the inserted values for unmatched rows; columns not specified
            will be null.
            NOTE: target column names are given without an alias prefix
            ('SRC' or 'TGT').
        insert_when_not_matched (bool): If True, unmatched rows are inserted
            (upsert); otherwise they are ignored.
        save_mode (str): 'table' or 'path', controlling how location is
            interpreted.
    '''
    super(DeltaDataSource, self).merge(
        df, condition, match_update_dict,
        insert_when_not_matched=insert_when_not_matched)
    save_mode = save_mode.lower()
    if save_mode == "table":
        target_table = DeltaTable.forName(self.spark, location)
    elif save_mode == "path":
        target_table = DeltaTable.forPath(self.spark, location)
    else:
        raise ValueError("save_mode should be 'path' or 'table'.")

    merger = target_table.alias("TGT").merge(df.alias("SRC"), condition)
    merger = merger.whenMatchedUpdate(set=match_update_dict)
    if insert_when_not_matched:
        merger = merger.whenNotMatchedInsert(values=match_update_dict)
    merger.execute()
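# Hypothetical usage of the merge method above; `source` (a DeltaDataSource
# instance), `updates_df`, and the table name sw_db.item_sales are assumed
# names, not part of the original snippet.
source.merge(
    df=updates_df,
    location="sw_db.item_sales",
    condition="TGT.item_id = SRC.item_id",
    match_update_dict={"sales": "TGT.sales + SRC.sales"},
    insert_when_not_matched=True,
    save_mode="table")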
def test_forName(self) -> None:
    self.__writeAsTable([('a', 1), ('b', 2), ('c', 3)], "test")
    df = DeltaTable.forName(self.spark, "test").toDF()
    self.__checkAnswer(df, [('a', 1), ('b', 2), ('c', 3)])