Example #1
from datetime import datetime, timedelta

from delta.tables import DeltaTable
from pyspark.sql import functions as f


# Assumes an active SparkSession bound to the name `spark`.
def aggregateSalesRevenueDeltaBacked(updatesDF, epochId):

    # Sum up the new incoming keys
    incomingSalesAggregateDF = (updatesDF.withColumn(
        "timestamp", f.date_trunc("minute", "timestamp")).groupBy(
            f.col("timestamp"),
            f.col("item_id")).agg(f.sum("sales").alias("sales")))

    targetTable = DeltaTable.forName(spark, "sw_db.delta_backed_state")
    # Merge the new sales into the already existing sales.
    # We simulate a watermark by only retrieving records whose timestamp is
    # greater than the max seen timestamp minus 5 minutes.
    # If you have days' worth of data, it is even better to partition the
    # state by date so entire partitions can be skipped when the predicate
    # is pushed down.
    mostRecentTimestamp = targetTable.toDF().select(
        f.max("timestamp").alias("max_timestamp")).head().max_timestamp
    watermarkTime = mostRecentTimestamp - timedelta(
        minutes=5) if mostRecentTimestamp else datetime.min

    (targetTable.alias("target").merge(
        incomingSalesAggregateDF.alias("source"), f"""
       target.item_id = source.item_id AND 
       target.timestamp = source.timestamp AND
       target.timestamp > cast('{watermarkTime}' AS TIMESTAMP) AND 
       source.timestamp > cast('{watermarkTime}' AS TIMESTAMP)
       """).whenMatchedUpdate(
            set={
                "sales": f.col("source.sales") + f.col("target.sales")
            }).whenNotMatchedInsertAll().execute())
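
A minimal sketch of how this function could be wired into a Structured Streaming query via foreachBatch (the epochId parameter matches the foreachBatch callback signature); the source path, checkpoint location, and output mode below are assumptions for illustration.

# Hypothetical wiring: fold each micro-batch of sales events into the
# Delta-backed state using the merge function above. Paths are placeholders.
salesStream = (spark.readStream
               .format("delta")
               .load("/data/sales_events"))

(salesStream.writeStream
 .foreachBatch(aggregateSalesRevenueDeltaBacked)
 .option("checkpointLocation", "/chk/sales_agg")
 .outputMode("update")
 .start())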
Example #2
    def _get_delta_table(spark, delta_path, delta_table):
        # At least one of delta_path / delta_table must be provided;
        # delta_path takes precedence when both are given.
        if [delta_path, delta_table].count(None) == 2:
            raise ValueError("either delta_path or delta_table must be passed")

        if delta_path is not None:
            delta_table = DeltaTable.forPath(spark, delta_path)
        else:
            delta_table = DeltaTable.forName(spark, delta_table)

        return delta_table
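
A hedged usage sketch, assuming the helper is in scope along with an active SparkSession named spark; the path and table name are placeholders.

dt_by_path = _get_delta_table(spark, "/data/events", None)
dt_by_name = _get_delta_table(spark, None, "analytics.events")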
Example #3
    def _get_delta_table(self, spark, table_or_path, update_delta_table):
        # Resolve the argument as a filesystem path first; if that fails,
        # fall back to treating it as a metastore table name.
        try:
            deltaTable = DeltaTable.forPath(spark, table_or_path)
        except Exception:
            deltaTable = DeltaTable.forName(spark, table_or_path)

        # Return the DeltaTable handle when the caller wants to update it;
        # otherwise return its contents as a DataFrame.
        if update_delta_table:
            return deltaTable

        return deltaTable.toDF()
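
Note that because the path lookup is attempted first, an argument that happens to resolve as both a path and a table name is treated as a path; callers that rely on name resolution should pass an identifier that cannot be mistaken for a path.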
Example #4
    def merge(
            self,
            df: DataFrame,
            location: str,
            condition: str,  # Only SQL-like string conditions are supported
            match_update_dict: dict,  # "target_column": "expression"
            insert_when_not_matched: bool = False,  # Set to True for upsert
            save_mode: str = 'table'):
        '''Merge a dataframe into a target table or path.

        This merge operation can represent both an update and an upsert.
        The source and target are aliased as 'SRC' and 'TGT' by default;
        these aliases can be used in the condition string and in the
        update/insert expressions.

        Args:
            df (DataFrame): The source dataframe to write.
            save_mode (str): 'table' or 'path', controlling how location
                is resolved.
            location (str): The table name or path to merge into.
            condition (str): The merge condition as a SQL-like string.
            match_update_dict (dict): Maps "target_column" to "expression".
                These expressions produce the updated values on a match.
                When insert_when_not_matched is True, the same mapping
                supplies the insert values for unmatched rows; columns
                not listed are left null.
                NOTE: target columns are given without an alias prefix
                ("SRC" or "TGT").
            insert_when_not_matched (bool): If True, insert unmatched
                source rows (upsert); if False, only update matched rows.
        '''
        super(DeltaDataSource,
              self).merge(df,
                          condition,
                          match_update_dict,
                          insert_when_not_matched=insert_when_not_matched)
        save_mode = save_mode.lower()
        if save_mode == "table":
            target_table = DeltaTable.forName(self.spark, location)
        elif save_mode == "path":
            target_table = DeltaTable.forPath(self.spark, location)
        else:
            raise ValueError("save_mode should be 'path' or 'table'.")

        merger = target_table.alias("TGT").merge(df.alias("SRC"), condition)
        merger = merger.whenMatchedUpdate(set=match_update_dict)

        if insert_when_not_matched:
            merger = merger.whenNotMatchedInsert(values=match_update_dict)

        merger.execute()
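
A hedged usage sketch; the DeltaDataSource construction, table name, and column names are assumptions for illustration. Because the same dict supplies both update and insert values, its expressions reference only SRC columns so the insert branch stays valid.

# Hypothetical upsert: overwrite matched rows and insert unmatched ones.
ds = DeltaDataSource(spark)
ds.merge(
    df=incoming_df,
    location="sw_db.daily_sales",
    condition="TGT.item_id = SRC.item_id AND TGT.ds = SRC.ds",
    match_update_dict={
        "item_id": "SRC.item_id",
        "ds": "SRC.ds",
        "sales": "SRC.sales",
    },
    insert_when_not_matched=True,
)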
Example #5
    def test_forName(self) -> None:
        self.__writeAsTable([('a', 1), ('b', 2), ('c', 3)], "test")
        df = DeltaTable.forName(self.spark, "test").toDF()
        self.__checkAnswer(df, [('a', 1), ('b', 2), ('c', 3)])