Example #1
 def test_isDeltaTable(self) -> None:
     df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
     df.write.format("parquet").save(self.tempFile)
     tempFile2 = self.tempFile + '_2'
     df.write.format("delta").save(tempFile2)
     self.assertEqual(DeltaTable.isDeltaTable(self.spark, self.tempFile), False)
     self.assertEqual(DeltaTable.isDeltaTable(self.spark, tempFile2), True)
Example #2
    def test_restore_to_timestamp(self) -> None:
        self.__writeDeltaTable([('a', 1), ('b', 2)])
        timestampToRestore = DeltaTable.forPath(self.spark, self.tempFile) \
            .history() \
            .head() \
            .timestamp \
            .strftime('%Y-%m-%d %H:%M:%S.%f')

        self.__overwriteDeltaTable([('a', 3), ('b', 2)],
                                   schema=["key_new", "value_new"],
                                   overwriteSchema='true')

        overwritten = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            overwritten,
            [Row(key_new='a', value_new=3),
             Row(key_new='b', value_new=2)])

        DeltaTable.forPath(
            self.spark, self.tempFile).restoreToTimestamp(timestampToRestore)

        restored = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(restored,
                           [Row(key='a', value=1),
                            Row(key='b', value=2)])
Example #3
    def test_convertToDelta(self) -> None:
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % self.tempFile)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile),
            [('a', 1), ('b', 2), ('c', 3)])

        # test if convert to delta with partition columns work
        tempFile2 = self.tempFile + "_2"
        df.write.partitionBy("value").format("parquet").save(tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(
            self.spark,
            "parquet.`%s`" % tempFile2,
            schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile2),
            [('a', 1), ('b', 2), ('c', 3)])
        self.assertEqual(type(dt), DeltaTable)

        # convert to delta with partition column provided as a string
        tempFile3 = self.tempFile + "_3"
        df.write.partitionBy("value").format("parquet").save(tempFile3)
        dt = DeltaTable.convertToDelta(
            self.spark,
            "parquet.`%s`" % tempFile3,
            "value int")
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile3),
            [('a', 1), ('b', 2), ('c', 3)])
        self.assertEqual(type(dt), DeltaTable)
Example #4
    def merge_write(logger, df_dict: Dict[str, DataFrame],
                    rules: Dict[str, str], output_path: str,
                    spark: SparkSession):
        """
        Write data if the dataset doesn't exist, or merge it into the existing dataset.
        Args:
            logger: Logger instance used to log events
            df_dict: Dictionary of the datasets with the structure {Name: Dataframe}
            rules: Matching rules used to merge
            output_path: Path to write the data
            spark: Spark instance

        Returns:
            None
        """
        try:
            from delta.tables import DeltaTable
            for df_name, df in df_dict.items():
                file_path = path.join(output_path, df_name)
                if DeltaTable.isDeltaTable(spark, file_path):
                    delta_table = DeltaTable.forPath(spark, file_path)
                    delta_table.alias("old").merge(
                        df.alias("new"), rules.get(df_name)
                    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
                else:
                    df.write.format("delta").save(file_path)

        except Exception as e:
            logger.error(
                "Writing sanitized data couldn't be performed: {}".format(e),
                traceback.format_exc())
            raise e
        else:
            logger.info("Sanitized dataframes written in {} folder".format(
                output_path))
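
For orientation, a minimal way to exercise merge_write could look like the sketch below. The session, logger name, dataset name, output path, and join rule are hypothetical placeholders, and it assumes merge_write is reachable as a plain function on a Delta-enabled Spark session.

import logging

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sanitize-demo").getOrCreate()
logger = logging.getLogger("sanitize-demo")

# Hypothetical inputs: one dataframe per dataset name, plus one merge condition per name.
customers = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])
df_dict = {"customers": customers}
rules = {"customers": "old.id = new.id"}

merge_write(logger, df_dict, rules, "/tmp/sanitized", spark)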
Example #5
    def test_restore_to_timestamp(self) -> None:
        self.__writeDeltaTable([('a', 1), ('b', 2)])
        timestampToRestore = DeltaTable.forPath(self.spark, self.tempFile) \
            .history() \
            .head() \
            .timestamp \
            .strftime('%Y-%m-%d %H:%M:%S.%f')

        self.__overwriteDeltaTable([('a', 3), ('b', 2)],
                                   schema=["key_new", "value_new"],
                                   overwriteSchema='true')

        overwritten = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            overwritten,
            [Row(key_new='a', value_new=3),
             Row(key_new='b', value_new=2)])

        DeltaTable.forPath(
            self.spark, self.tempFile).restoreToTimestamp(timestampToRestore)

        restored = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(restored,
                           [Row(key='a', value=1),
                            Row(key='b', value=2)])

        # We cannot test that restore-to-timestamp actually works here, but we
        # can at least make sure the API is being called.
        def runRestore() -> None:
            DeltaTable.forPath(self.spark,
                               self.tempFile).restoreToTimestamp('05/04/1999')

        self.__intercept(
            runRestore, "The provided timestamp ('05/04/1999') "
            "cannot be converted to a valid timestamp")
Example #6
    def test_create_table_with_existing_schema(self) -> None:
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        deltaTable = DeltaTable.create(self.spark).tableName("test") \
            .addColumns(df.schema) \
            .addColumn("value2", dataType="int")\
            .partitionedBy(["value2", "value"])\
            .execute()
        self.__verify_table_schema("test",
                                   deltaTable.toDF().schema,
                                   ["key", "value", "value2"],
                                   [StringType(), LongType(), IntegerType()],
                                   nullables={"key", "value", "value2"},
                                   partitioningColumns=["value", "value2"])

        # verify creating table with list of structFields
        deltaTable2 = DeltaTable.create(self.spark).tableName("test2").addColumns(
            df.schema.fields) \
            .addColumn("value2", dataType="int") \
            .partitionedBy("value2", "value")\
            .execute()
        self.__verify_table_schema("test2",
                                   deltaTable2.toDF().schema,
                                   ["key", "value", "value2"],
                                   [StringType(), LongType(), IntegerType()],
                                   nullables={"key", "value", "value2"},
                                   partitioningColumns=["value", "value2"])
Example #7
    def test_create_replace_table_with_no_spark_session_passed(self) -> None:
        # create table.
        deltaTable = DeltaTable.create().tableName("test")\
            .addColumn("value", dataType="int").execute()
        self.__verify_table_schema("test",
                                   deltaTable.toDF().schema, ["value"],
                                   [IntegerType()],
                                   nullables={"value"})

        # ignore existence with createIfNotExists
        deltaTable = DeltaTable.createIfNotExists().tableName("test") \
            .addColumn("value2", dataType="int").execute()
        self.__verify_table_schema("test",
                                   deltaTable.toDF().schema, ["value"],
                                   [IntegerType()],
                                   nullables={"value"})

        # replace table with replace
        deltaTable = DeltaTable.replace().tableName("test") \
            .addColumn("key", dataType="int").execute()
        self.__verify_table_schema("test",
                                   deltaTable.toDF().schema, ["key"],
                                   [IntegerType()],
                                   nullables={"key"})

        # replace the table again with createOrReplace and a new column
        deltaTable = DeltaTable.createOrReplace().tableName("test") \
            .addColumn("col1", dataType="int").execute()

        self.__verify_table_schema("test",
                                   deltaTable.toDF().schema, ["col1"],
                                   [IntegerType()],
                                   nullables={"col1"})
Example #8
    def _merge_into_table(self, df, destination_path, checkpoints_path,
                          condition):
        """ Merges data from the given dataframe into the delta table at the specified destination_path, based on the given condition.
            If no delta table exists at the specified destination_path, a new delta table is created and the data from the given dataframe is inserted.
            e.g. merge_into_table(df_lookup, np_destination_path, source_path + '/_checkpoints/delta_np', "current.id_pseudonym = updates.id_pseudonym")
        """
        if DeltaTable.isDeltaTable(spark, destination_path):
            dt = DeltaTable.forPath(spark, destination_path)

            def upsert(batch_df, batchId):
                dt.alias("current").merge(
                    batch_df.alias("updates"), condition).whenMatchedUpdateAll(
                    ).whenNotMatchedInsertAll().execute()

            query = df.writeStream.format("delta").foreachBatch(
                upsert).outputMode("update").trigger(once=True).option(
                    "checkpointLocation", checkpoints_path)
        else:
            logger.info(
                f'Delta table does not yet exist at {destination_path} - creating one now and inserting initial data.'
            )
            query = df.writeStream.format("delta").outputMode(
                "append").trigger(once=True).option("checkpointLocation",
                                                    checkpoints_path)
        query = query.start(destination_path)
        # Block until the query terminates, either via stop() or with an error;
        # a StreamingQueryException is thrown if an exception occurs.
        query.awaitTermination()
        logger.info(query.lastProgress)
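
A hedged sketch of how this helper might be driven: read a Delta source as a stream and upsert it into a target path. The source/destination paths, the pipeline object owning the method, and the join condition are assumptions for illustration.

streaming_df = spark.readStream.format("delta").load("/data/source_delta")

pipeline._merge_into_table(
    streaming_df,
    destination_path="/data/target_delta",
    checkpoints_path="/data/source_delta/_checkpoints/delta_target",
    condition="current.id_pseudonym = updates.id_pseudonym")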
Example #9
 def __verify_generated_column(self, tableName: str,
                               deltaTable: DeltaTable) -> None:
     cmd = "INSERT INTO {table} (col1, col2) VALUES (1, 11)".format(
         table=tableName)
     self.spark.sql(cmd)
     deltaTable.update(expr("col2 = 11"), {"col1": expr("2")})
     self.__checkAnswer(deltaTable.toDF(), [(2, 12)],
                        schema=["col1", "col2"])
Example #10
 def __replace_table(self, orCreate, tableName=None, location=None):
     builder = DeltaTable.createOrReplace(self.spark) if orCreate \
         else DeltaTable.replace(self.spark)
     if tableName:
         builder = builder.tableName(tableName)
     if location:
         builder = builder.location(location)
     return self.__build_delta_table(builder)
Example #11
 def __create_table(self, ifNotExists, tableName=None, location=None):
     builder = DeltaTable.createIfNotExists(self.spark) if ifNotExists \
         else DeltaTable.create(self.spark)
     if tableName:
         builder = builder.tableName(tableName)
     if location:
         builder = builder.location(location)
     return self.__build_delta_table(builder)
Example #12
    def _get_delta_table(spark, delta_path, delta_table):

        if [delta_path, delta_table].count(None) == 2:
            raise ValueError("delta_path ou delta_table deve ser passado")

        if delta_path is not None:
            delta_table = DeltaTable.forPath(spark, delta_path)

        else:
            delta_table = DeltaTable.forName(spark, delta_table)

        return delta_table
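
A small usage sketch, assuming an active SparkSession and that exactly one of the two arguments is set; the path and table name are placeholders.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Resolve the table by filesystem path...
dt_by_path = _get_delta_table(spark, delta_path="/data/events_delta", delta_table=None)

# ...or by catalog table name.
dt_by_name = _get_delta_table(spark, delta_path=None, delta_table="analytics.events")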
Example #13
 def get_current_data(self) -> DeltaTable:
     try:
         delta_output = DeltaTable.forPath(
             self.spark_configuration.spark_session,
             self.delta_src + self.current_data_table_name)
     except AnalysisException:
         print("Delta Table not exists -> creating")
         self._init_current_data()
         delta_output = DeltaTable.forPath(
             self.spark_configuration.spark_session,
             self.delta_src + self.current_data_table_name)
     return delta_output
Example #14
    def _get_delta_table(self, spark, table_or_path, update_delta_table):

        try:
            deltaTable = DeltaTable.forPath(spark, table_or_path)
        except Exception:  # path lookup failed, fall back to resolving by table name
            try:
                deltaTable = DeltaTable.forName(spark, table_or_path)
            except AssertionError as E:
                raise E

        if update_delta_table:
            return deltaTable

        return deltaTable.toDF()
Example #15
def createDeltaBackedState(tableName, overwrite=False):

    from delta.tables import DeltaTable
    import pyspark.sql.types as T

    db_location = "dbfs:/home/[email protected]/streamingWorkshop/db"
    db_table_name = "sw_db." + tableName
    checkpoint_location = db_location + "/checkpointTables/" + db_table_name

    delta_schema = (T.StructType([
        T.StructField("item_id", T.LongType()),
        T.StructField("timestamp", T.TimestampType()),
        T.StructField("sales", T.LongType())
    ]))

    # Create an empty Delta table if it does not exist. This is required for the MERGE to work in the first mini batch.
    if overwrite or not DeltaTable.isDeltaTable(
            spark, db_location + "/" + db_table_name):
        (spark.createDataFrame(
            [], delta_schema).write.mode("overwrite").option(
                "overwriteSchema",
                "true").format("delta").saveAsTable(db_table_name))
        spark.sql(
            f"ALTER TABLE {db_table_name} SET TBLPROPERTIES (delta.autoOptimize.optimizeWrite = true, delta.autoOptimize.autoCompact = false)"
        )
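
For reference, the helper would typically be called once before the streaming MERGE starts; the table name below is only an example, reused from the merge function further down this page.

from delta.tables import DeltaTable

# Create (or reset) the empty Delta-backed state table before the first micro-batch runs.
createDeltaBackedState("delta_backed_state", overwrite=True)

# The streaming MERGE can then safely look the state table up by name.
target = DeltaTable.forName(spark, "sw_db.delta_backed_state")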
Example #16
def get_delta_table(
        spark: SparkSession,
        schema: StructType,
        delta_library_jar: str,
        delta_path: str):
    # load delta library jar, so we can use delta module
    spark.sparkContext.addPyFile(delta_library_jar)
    from delta.tables import DeltaTable

    # check existence of delta table
    if not DeltaTable.isDeltaTable(spark, delta_path):
        print(f">>> Delta table: {delta_path} is not initialized, performing initialization..")
        df = spark.createDataFrame([], schema=schema)
        df.write.format("delta").save(delta_path)

    return DeltaTable.forPath(spark, delta_path)
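
A usage sketch under the assumption of a Delta-enabled session; the jar path, table location, and schema fields are placeholders.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType

spark = SparkSession.builder.getOrCreate()

schema = StructType([
    StructField("id", LongType()),
    StructField("name", StringType()),
])

# The first call initializes the path with an empty dataframe; later calls just return the handle.
dt = get_delta_table(spark, schema, "/jars/delta-core_2.12.jar", "/data/people_delta")
dt.toDF().printSchema()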
Example #17
def upsert_table(spark,
                 updatesDF,
                 condition,
                 output_file,
                 partition_columns=None):
    '''
    Update/insert transformed immigration data into the fact table.
    In case of a duplicate id, overwrite the old value with the new value; otherwise append it to the dataframe.
    '''

    from delta.tables import DeltaTable

    if not os.path.exists(output_file):

        if partition_columns is None:
            updatesDF.write.format('delta').save(output_file)
        else:
            updatesDF.write.format('delta').partitionBy(
                *partition_columns).save(output_file)

    else:

        deltaTable = DeltaTable.forPath(spark, output_file)

        deltaTable.alias("source").merge(
            source = updatesDF.alias("update"),
            condition = condition) \
          .whenMatchedUpdateAll() \
          .whenNotMatchedInsertAll() \
          .execute()
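
A hedged example of calling upsert_table. The condition follows the aliases used inside the function ('source' for the existing table, 'update' for the incoming rows); the columns, path, and partitioning are placeholders.

updates = spark.createDataFrame([(1, "2016-04-01"), (2, "2016-04-02")],
                                ["cicid", "arrival_date"])

upsert_table(spark,
             updates,
             condition="source.cicid = update.cicid",
             output_file="/data/fact_immigration",
             partition_columns=["arrival_date"])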
Example #18
def aggregateSalesRevenueDeltaBacked(updatesDF, epochId):

    # Sum up the new incoming keys
    incomingSalesAggregateDF = (updatesDF.withColumn(
        "timestamp", f.date_trunc("minute", "timestamp")).groupBy(
            f.col("timestamp"),
            f.col("item_id")).agg(f.sum("sales").alias("sales")))

    targetTable = DeltaTable.forName(spark, "sw_db.delta_backed_state")
    # We merge the new sales with the already existing sales.
    # We simulate a watermark by only retrieving timestamp records greater than max seen timestamp - 5 minutes
    # Note that it is even better to partition the state by date if you have days worth of data, to skip over entire partitions,
    # when pushing down the predicate.
    mostRecentTimestamp = targetTable.toDF().select(
        f.max("timestamp").alias("max_timestamp")).head().max_timestamp
    watermarkTime = mostRecentTimestamp - timedelta(
        minutes=5) if mostRecentTimestamp else datetime.min

    (targetTable.alias("target").merge(
        incomingSalesAggregateDF.alias("source"), f"""
       target.item_id = source.item_id AND 
       target.timestamp = source.timestamp AND
       target.timestamp > cast('{watermarkTime}' AS TIMESTAMP) AND 
       source.timestamp > cast('{watermarkTime}' AS TIMESTAMP)
       """).whenMatchedUpdate(
            set={
                "sales": f.col("source.sales") + f.col("target.sales")
            }).whenNotMatchedInsertAll().execute())
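
Since this function has the (DataFrame, epochId) shape expected by foreachBatch, it could be attached to a stream roughly as follows; the source path, checkpoint location, and trigger are assumptions.

query = (spark.readStream.format("delta").load("/data/sales_events")
         .writeStream
         .foreachBatch(aggregateSalesRevenueDeltaBacked)
         .outputMode("update")
         .option("checkpointLocation", "/data/sales_events/_checkpoints/delta_state")
         .trigger(once=True)
         .start())
query.awaitTermination()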
Example #19
def run() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        required=True,
        help='input file path',
    )
    parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='output delta table path',
    )
    parser.add_argument(
        '--action',
        dest='action',
        default='show',
        help='action to apply',
    )
    args = parser.parse_args()

    spark = SparkSession.builder.appName('taxipy') \
        .config('spark.jars.packages', 'io.delta:delta-core_2.12:0.7.0') \
        .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension') \
        .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog') \
        .getOrCreate()

    from delta.tables import DeltaTable
    taxi_ride_table = DeltaTable.forPath(spark, args.output)
    taxi_ride_table.update(condition=expr('vendor_id == 2'),
                           set={'total_amount': '123'})
    show_records(spark, args.output)
    spark.stop()
Example #20
    def test_optimize_zorder_by(self) -> None:
        # write an unoptimized delta table
        self.spark.createDataFrame([i for i in range(0, 100)], IntegerType()) \
            .withColumn("col1", floor(col("value") % 7)) \
            .withColumn("col2", floor(col("value") % 27)) \
            .withColumn("p", floor(col("value") % 10)) \
            .repartition(4).write.partitionBy("p").format("delta").save(self.tempFile)

        # create DeltaTable
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # execute Z-Order Optimization
        optimizer = dt.optimize()
        result = optimizer.executeZOrderBy(["col1", "col2"])
        metrics = result.select("metrics.*").head()

        self.assertTrue(metrics.numFilesAdded == 10)
        self.assertTrue(metrics.numFilesRemoved == 37)
        self.assertTrue(metrics.totalFilesSkipped == 0)
        self.assertTrue(metrics.totalConsideredFiles == 37)
        self.assertTrue(metrics.zOrderStats.strategyName == 'all')
        self.assertTrue(metrics.zOrderStats.numOutputCubes == 10)

        # negative test: Z-Order on partition column
        def optimize() -> None:
            dt.optimize().where("p = 1").executeZOrderBy(["p"])

        self.__intercept(
            optimize, "p is a partition column. "
            "Z-Ordering can only be performed on data columns")
Example #21
    def test_optimize(self) -> None:
        # write an unoptimized delta table
        df = self.spark.createDataFrame([("a", 1), ("a", 2)],
                                        ["key", "value"]).repartition(1)
        df.write.format("delta").save(self.tempFile)
        df = self.spark.createDataFrame([("a", 3), ("a", 4)],
                                        ["key", "value"]).repartition(1)
        df.write.format("delta").save(self.tempFile, mode="append")
        df = self.spark.createDataFrame([("b", 1), ("b", 2)],
                                        ["key", "value"]).repartition(1)
        df.write.format("delta").save(self.tempFile, mode="append")

        # create DeltaTable
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # execute bin compaction
        optimizer = dt.optimize()
        res = optimizer.executeCompaction()
        op_params = dt.history().first().operationParameters

        # assertions
        self.assertTrue(isinstance(optimizer, DeltaOptimizeBuilder))
        self.assertTrue(isinstance(res, DataFrame))
        self.assertEqual(1, res.first().metrics.numFilesAdded)
        self.assertEqual(3, res.first().metrics.numFilesRemoved)
        self.assertEqual('[]', op_params['predicate'])

        # test non-partition column
        def optimize() -> None:
            dt.optimize().where("key = 'a'").executeCompaction()

        self.__intercept(
            optimize, "Predicate references non-partition column 'key'. "
            "Only the partition columns may be referenced: []")
Example #22
    def test_optimize_zorder_by_w_partition_filter(self) -> None:
        # write an unoptimized delta table
        writer = self.spark.createDataFrame([i for i in range(0, 100)], IntegerType()) \
            .withColumn("col1", floor(col("value") % 7)) \
            .withColumn("col2", floor(col("value") % 27)) \
            .withColumn("p", floor(col("value") % 10)) \
            .repartition(4).write.partitionBy("p")

        writer.format("delta").save(self.tempFile)

        # create DeltaTable
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # execute Z-OrderBy
        optimizer = dt.optimize().where("p = 2")
        result = optimizer.executeZOrderBy(["col1", "col2"])
        metrics = result.select("metrics.*").head()

        # assertions (partition 'p = 2' has four files)
        self.assertTrue(metrics.numFilesAdded == 1)
        self.assertTrue(metrics.numFilesRemoved == 4)
        self.assertTrue(metrics.totalFilesSkipped == 0)
        self.assertTrue(metrics.totalConsideredFiles == 4)
        self.assertTrue(metrics.zOrderStats.strategyName == 'all')
        self.assertTrue(metrics.zOrderStats.numOutputCubes == 1)
Example #23
    def merge(
            self,
            df: DataFrame,
            location: str,
            condition: str,  # Only supports SQL-like string condition
            match_update_dict: dict,  # "target_column": "expression"
            insert_when_not_matched: bool = False,  # Set to True for upsert
            save_mode: str = 'table'):
        '''Merge a dataframe into a target table or path.

        This merge operation can represent both update and upsert operations.
        The source and target tables are aliased as 'SRC' and 'TGT' by default; these aliases can be used in the condition string and in the update/insert expressions.
        Args:
            df (DataFrame): The source dataframe to write.
            save_mode (str): 'table' or 'path'.
            location (str): The table name or path to merge into.
            condition (str): The merge condition as a SQL-like string.
            match_update_dict (dict): Contains {"target_column": "expression"}.
                This represents the updated values when a row is matched.
                NOTE: "target_column"s come without an alias prefix ("SRC" or "TGT").
            insert_when_not_matched (bool): When True, rows with no match are inserted
                using the same {"target_column": "expression"} mapping; columns that
                are not specified will be null.
        '''
        super(DeltaDataSource,
              self).merge(df,
                          condition,
                          match_update_dict,
                          insert_when_not_matched=insert_when_not_matched)
        save_mode = save_mode.lower()
        if save_mode == "table":
            target_table = DeltaTable.forName(self.spark, location)
        elif save_mode == "path":
            target_table = DeltaTable.forPath(self.spark, location)
        else:
            raise ValueError("save_mode should be 'path' or 'table'.")

        merger = target_table.alias("TGT").merge(df.alias("SRC"), condition)
        merger = merger.whenMatchedUpdate(set=match_update_dict)

        if insert_when_not_matched:
            merger = merger.whenNotMatchedInsert(values=match_update_dict)

        merger.execute()
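
A usage sketch for this wrapper, assuming a DeltaDataSource-like instance named delta_ds; the table name, condition, and column mapping are placeholders that follow the SRC/TGT aliases described in the docstring.

incoming = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])

delta_ds.merge(
    df=incoming,
    location="analytics.dim_customer",   # a table name, since save_mode='table'
    condition="TGT.id = SRC.id",
    match_update_dict={"name": "SRC.name"},
    insert_when_not_matched=True,        # upsert: reuse the same mapping for inserts
    save_mode="table")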
Example #24
    def test_restore_to_version(self) -> None:
        self.__writeDeltaTable([('a', 1), ('b', 2)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2)],
                                   schema=["key_new", "value_new"],
                                   overwriteSchema='true')

        overwritten = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            overwritten,
            [Row(key_new='a', value_new=3),
             Row(key_new='b', value_new=2)])

        DeltaTable.forPath(self.spark, self.tempFile).restoreToVersion(0)
        restored = DeltaTable.forPath(self.spark, self.tempFile).toDF()

        self.__checkAnswer(restored,
                           [Row(key='a', value=1),
                            Row(key='b', value=2)])
Example #25
def get_delta_table(path):
    try:
        dt = DeltaTable.forPath(spark, path)
    except AnalysisException as e:
        if ('doesn\'t exist;' in str(e).lower() or 'is not a delta table.' in str(e).lower()):
            print("Error occurred due to: " + str(e))
            return None
        else:
            raise e
    return dt
Example #26
    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        self.tempFile2 = self.tempFile + "_"
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`" + self.tempFile + "`")
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile), [('a', 1),
                                                                  ('b', 2),
                                                                  ('c', 3)])

        # test if convert to delta with partition columns work
        df.write.partitionBy("value").format("parquet").save(self.tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`" + self.tempFile2 + "`",
                                       schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile2), [('a', 1),
                                                                   ('b', 2),
                                                                   ('c', 3)])
Example #27
    def test_history(self) -> None:
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations, [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode, [Row("Overwrite")],
            StructType(
                [StructField("operationParameters.mode", StringType(), True)]))
Example #28
def merge(spark, update, tableName, cols, key):
    """
    将DataFrame和delta表进行merge操作,insert操作要求DataFrame必须包含delta表所有的列(0.5版本)
    当我们使用merge操作更新/插入delta表其中几列时,指定在DataFrame中不存在的列的值为null。

    注:DataFrame中要写入delta表的列要和delta表一样
    :param spark,SparkSession实例
    :param update,spark DataFrame
    :param tableName,要更新的delta表
    """
    # If there is no dt column, add one with the current date
    if "dt" not in cols:
        update = update.withColumn("dt", f.current_date())
        cols.append("dt")

    # 1. Build the merge condition
    mergeExpr = f"origin.{key}=update.{key}"
    print(f"merge expression:{mergeExpr}")

    # 2. Build the update expressions
    updateExpr = {}
    for c in cols:
        updateExpr[c] = f"update.{c}"

    print(f"update expression:{updateExpr}")

    origin = DeltaTable.forPath(spark, tableName)
    origin_cols = origin.toDF().columns

    # 3. Build the insert expressions
    insertExpr = {}
    for origin_col in origin_cols:
        if origin_col in cols:
            insertExpr[origin_col] = f"update.{origin_col}"
        else:
            # Not present in the update: insert a null value (SQL NULL, not the string)
            insertExpr[origin_col] = "null"

    print(f"insert expression:{insertExpr}")

    # for origin_col in origin_cols:
    #     if origin_col not in cols:
    #         update=update.withColumn(origin_col,f.lit(None))

    origin.alias("origin") \
        .merge(update.alias("update"), mergeExpr) \
        .whenMatchedUpdate(set=updateExpr) \
        .whenNotMatchedInsert(values=insertExpr) \
        .execute()
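
A brief, hypothetical call to this merge helper; the path, column list, and key are placeholders.

updates = spark.createDataFrame([(1, 10.5), (2, 3.2)], ["id", "amount"])

# Upsert by id; Delta-table columns missing from cols are inserted as NULL.
merge(spark, updates, "/data/orders_delta", cols=["id", "amount"], key="id")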
Example #29
    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertEqual(True, self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files as default retention is used.

        self.assertEqual(True, self.__checkFileExists('bac.txt'))
        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertEqual(False, self.__checkFileExists('bac.txt'))
        self.assertEqual(False, self.__checkFileExists('abc.txt'))
Example #30
 def update_sum_count_or_insert(self, new_data: DataFrame, table: str, id_col: str):
     try:
         delta_table = DeltaTable.forPath(self.spark_configuration.spark_session, self.delta_src + table)
     except AnalysisException:
         # If the Delta table does not exist, just create it
         new_data.write \
             .format("delta") \
             .save(self.delta_src + table)
         return
     delta_table.alias("current_data").merge(
         new_data.alias("updates"),
         "current_data.{0} = updates.{0}".format(id_col))\
         .whenMatchedUpdate(set={
             "count": "current_data.count + updates.count"
         }) \
         .whenNotMatchedInsertAll() \
         .execute()
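
A sketch of how this counter-style upsert might be invoked; the owning instance (here called store), the table name, the id column, and the data are all assumptions.

counts = spark.createDataFrame([("page_a", 3), ("page_b", 1)], ["page", "count"])

# Adds the new counts to existing rows by page id and inserts rows for unseen pages.
store.update_sum_count_or_insert(counts, table="page_counts", id_col="page")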