Example #1
class DeltaSqlTests(PySparkTestCase):
    def setUp(self):
        super(DeltaSqlTests, self).setUp()
        spark = SparkSession(self.sc)
        if self.sc.version < "3.":
            # Manually activate "DeltaSparkSessionExtension" in PySpark 2.4 in a cloned session
            # because "spark.sql.extensions" is not picked up. (See SPARK-25003).
            self.sc._jvm.io.delta.sql.DeltaSparkSessionExtension() \
                .apply(spark._jsparkSession.extensions())
            self.spark = SparkSession(self.sc,
                                      spark._jsparkSession.cloneSession())
        else:
            self.spark = spark
        self.temp_path = tempfile.mkdtemp()
        self.temp_file = os.path.join(self.temp_path, "delta_sql_test_table")
        # Create a simple Delta table inside the temp directory to test SQL commands.
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("delta").save(self.temp_file)
        df.write.mode("overwrite").format("delta").save(self.temp_file)

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.temp_path)
        super(DeltaSqlTests, self).tearDown()

    def test_vacuum(self):
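        # Disable the retention duration check so VACUUM can run with a 0-hour retention below.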
        self.spark.sql(
            "set spark.databricks.delta.retentionDurationCheck.enabled = false"
        )
        try:
            deleted_files = self.spark.sql("VACUUM '%s' RETAIN 0 HOURS" %
                                           self.temp_file).collect()
            # Verify `VACUUM` did delete some data files
            self.assertTrue(self.temp_file in deleted_files[0][0])
        finally:
            self.spark.sql(
                "set spark.databricks.delta.retentionDurationCheck.enabled = true"
            )

    def test_describe_history(self):
        assert (len(
            self.spark.sql("desc history delta.`%s`" %
                           (self.temp_file)).collect()) > 0)

    def test_generate(self):
        # create a delta table
        temp_path = tempfile.mkdtemp()
        temp_file = os.path.join(temp_path, "delta_sql_test_table")
        numFiles = 10
        self.spark.range(100).repartition(numFiles).write.format("delta").save(
            temp_file)

        # Generate the symlink format manifest
        self.spark.sql(
            "GENERATE SYMLINK_FORMAT_MANIFEST FOR TABLE delta.`{}`".format(
                temp_file))

        # check the contents of the manifest
        # NOTE: this is not a correctness test; correctness is tested in the Scala suite
        manifestPath = os.path.join(
            temp_file, os.path.join("_symlink_format_manifest", "manifest"))
        files = []
        with open(manifestPath) as f:
            files = f.readlines()

        shutil.rmtree(temp_path)
        # the number of files we write should equal the number of lines in the manifest
        assert (len(files) == numFiles)

    def test_convert(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        temp_path2 = tempfile.mkdtemp()
        temp_path3 = tempfile.mkdtemp()
        temp_file2 = os.path.join(temp_path2, "delta_sql_test2")
        temp_file3 = os.path.join(temp_path3, "delta_sql_test3")

        df.write.format("parquet").save(temp_file2)
        self.spark.sql("CONVERT TO DELTA parquet.`" + temp_file2 + "`")
        self.__checkAnswer(
            self.spark.read.format("delta").load(temp_file2), [('a', 1),
                                                               ('b', 2),
                                                               ('c', 3)])

        # test if convert to delta with partition columns work
        df.write.partitionBy("value").format("parquet").save(temp_file3)
        self.spark.sql("CONVERT TO DELTA parquet.`" + temp_file3 +
                       "` PARTITIONED BY (value INT)")
        self.__checkAnswer(
            self.spark.read.format("delta").load(temp_file3), [('a', 1),
                                                               ('b', 2),
                                                               ('c', 3)])

        shutil.rmtree(temp_path2)
        shutil.rmtree(temp_path3)

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        self.assertEqual(df.count(), expectedDF.count())
        self.assertEqual(len(df.columns), len(expectedDF.columns))
        self.assertEqual([], df.subtract(expectedDF).take(1))
        self.assertEqual([], expectedDF.subtract(df).take(1))
Example #2
class TestShowResults(PySparkTestCase):

    def setUp(self):
        super().setUp()
        self.spark = SparkSession(self.sc)

        c1 = DenseVector(np.array([1.0, 1.0]))
        c2 = DenseVector(np.array([5.0, 5.0]))
        c3 = DenseVector(np.array([30.0, 30.0]))
        df = pd.DataFrame(
            {'prediction': [0, 0, 0, 0, 0, 0, 1, 2, 1, 1],
             'point_col': [DenseVector(np.array([1.0, 2.0])),
                           DenseVector(np.array([2.0, 1.0])),
                           DenseVector(np.array([0.0, 1.0])),
                           DenseVector(np.array([1.0, 0.0])),
                           DenseVector(np.array([1.0, -1.0])),
                           DenseVector(np.array([4.0, 5.0])),
                           DenseVector(np.array([5.0, 6.0])),
                           DenseVector(np.array([20.0, 30.0])),
                           DenseVector(np.array([5.0, 7.0])),
                           DenseVector(np.array([5.0, 10.0]))],
             'centers': [c1, c1, c1, c1, c1, c1, c2, c3, c2, c2]},
            columns=['prediction', 'point_col', 'centers'])

        self.dataframe = self.spark.createDataFrame(df)

    # probably not needed!!!
    def test_add_row_index(self):

        computed_dataframe = ShowResults._add_row_index(self.dataframe)
        self.assertIn(('rowId', 'bigint'), computed_dataframe.dtypes)

        computed_dataframe = ShowResults._add_row_index(self.dataframe, rowId='roow')
        self.assertIn(('roow', 'bigint'), computed_dataframe.dtypes)

    def test_add_distances(self):
        from math import sqrt
        computed_dataframe = ShowResults._add_distances(self.dataframe, point_col='point_col')
        self.assertIn(('distance', 'double'), computed_dataframe.dtypes)

        p_computed_dataframe = computed_dataframe.toPandas()
        actual_distances = [sqrt(1.0), sqrt(1.0), sqrt(1.0), sqrt(1.0), sqrt(4.0),
                            sqrt(9.0+16.0), sqrt(1.0), sqrt(100.0), sqrt(4.0), sqrt(25.0)]
        for idx, val in enumerate(actual_distances):
            self.assertEqual(val, p_computed_dataframe['distance'][idx])
        print('add_distance \n', p_computed_dataframe)

    def test_add_outliers(self):
        computed_dataframe = ShowResults._add_distances(self.dataframe, point_col='point_col')
        computed_pdf = ShowResults._add_outliers(computed_dataframe).toPandas()

        # Outlier boundary for prediction 0 is precomputed as mean + 2 * stddev of the distances
        actual_values = [False]*5+[True]+4*[False]
        self.assertListEqual(list(computed_pdf['is_outlier']), actual_values)
        print('add_outliers \n', computed_pdf)

    @unittest.skip('Reason: Compute summary not in play.')
    def test_compute_summary(self):
        computed_dataframe = ShowResults._add_distances(self.dataframe, point_col='point_col')
        computed_df = ShowResults._add_outliers(computed_dataframe)
        summary_pdf = ShowResults.compute_summary(computed_df).toPandas()

        # counts from predictionCol
        actual_count_prediction = [6, 3, 1]
        # counts from outliers in distance
        actual_count_outliers = [1, 0, 0]
        # percentage from actual_count_outliers / actual_count_prediction
        actual_count_percentage = list(map(float, ['%.f' % elem for elem in
                                                   [out/pre*100 for out, pre in
                                                    zip(actual_count_outliers, actual_count_prediction)]]))

        self.assertEqual(list(summary_pdf['count']), actual_count_prediction)
        self.assertEqual(list(summary_pdf['outlier_count']), actual_count_outliers)
        self.assertEqual(list(summary_pdf['outlier percentage']), actual_count_percentage)
        print('compute_summary \n', summary_pdf)

    def test_prepare_table_data(self):
        # not thoroughly tested; just prints the output for inspection
        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        print('prepare_table_data \n', table_df)

    @unittest.skip('reason: Not implemented')
    def test_cluster_graph(self):
        # not thoroughly tested; just prints the output for inspection

        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        grouped = table_df.groupby('prediction')
        for prediction_value in table_df.prediction.unique():
            group_i = grouped.get_group(prediction_value)
            table_json = ShowResults.cluster_graph(group_i)
            print('cluster_graph \n', table_json)

    @unittest.skip('Reason: JSON_histogram not applicable')
    def test_json_histogram(self):
        # not thoroughly tested; just prints the output for inspection

        table_df = ShowResults.prepare_table_data(self.dataframe, point_col='point_col').toPandas()
        hist_json = ShowResults.json_histogram(table_df)
        print('json_hist \n', hist_json)

    def test_arrange_output(self):
        # Preamble: setup data
        features = ['a', 'b']
        id = 'id'
        prediction = 'Prediction'
        # Test 1: Are all columns there?

        shows = ShowResults(id=id, list_features=features, list_labels=['k'], predictionCol=prediction, k=10)
        d_point = 'dp'
        df = self._generate_data()
        arranged_df = shows.arrange_output(self.sc, df, data_point_name=d_point)
        expected_cols = [prediction, d_point, 'amount', 'percentage_outlier', 'amount_outlier', 'buckets']
        self.assertListEqual(sorted(expected_cols), sorted(arranged_df.columns))

    def _generate_data(self):
        df = self.spark.range(10)
        output = (df
                  .select(
                   'id', F.rand(42).alias('a'), F.randn(1).alias('b'),
                   F.round(10 * F.rand(42)).alias('Prediction'),
                   F.rand().alias('distance'))
                  .withColumn('is_outlier', F.when(F.col('distance') >= 0.7, 1.0).otherwise(0.))
                  .withColumn('computed_boundary', F.randn())
                  )
        return output

    def test_create_linspace(self):  # data, min, max, buckets, boundary):
        data = ShowResults.prepare_table_data(self.dataframe, point_col='point_col')
        data.show()



    # @unittest.skip('Test not created')
    def test_create_buckets(self):
        # Preamble: setup data
        df = self._generate_data()
        features = ['a', 'b']
        id = 'id'
        prediction = 'Prediction'

        shows = ShowResults(id=id, list_features=features, list_labels=['k'], predictionCol=prediction, k=10)
        buckets = shows.create_buckets(sc=self.sc, dataframe=df, buckets=20, prediction_col=prediction)

        print(buckets.rdd.take(1)[0]['buckets'])
        self.fail()
Example #3
# Assumed imports for this snippet
from pyspark import SparkContext
from pyspark.sql import Row, SparkSession

# Row(**x) requires string keys, so use string keys in the dict
data = {'first': 'test', 'second': 'test2'}
sc = SparkContext()
spark = SparkSession(sc)

rdd = sc.parallelize([data])  # parallelize() lives on SparkContext, not SparkSession
# toDF() needs a SparkSession; each element must be a Row before conversion to a DataFrame
rdd.map(lambda x: Row(**x)).toDF().show()  # **x unpacks the dict into Row keyword arguments
# OR using createDataFrame
rdd = rdd.map(lambda x: Row(**x))
spark.createDataFrame(rdd).show()


## APPENDING NEW DATA
#--------------------------------------------------------
firstDF = spark.range(3).toDF("myCol")
newRow = spark.createDataFrame([[20]])
appended = firstDF.union(newRow)
appended.show()  # display(appended) works only in Databricks notebooks



## EXPLORATORY
#--------------------------------------------------------
# df is assumed to be defined earlier in the original notebook
df.describe()         # returns a summary DataFrame lazily; use df.dtypes for the column datatypes
df.describe().show()  # show count, mean, stddev, min, max per numeric column



## COLUMNS
#--------------------------------------------------------
Example #4
    .appName("utilities") \
    .master("local[*]") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.sources.parallelPartitionDiscovery.parallelism", "8") \
    .getOrCreate()

# Apache Spark 2.4.x has a known issue (SPARK-25003) that requires explicit activation
# of the extension and cloning of the session. This is unnecessary in Apache Spark 3.x.
if spark.sparkContext.version < "3.":
    spark.sparkContext._jvm.io.delta.sql.DeltaSparkSessionExtension() \
        .apply(spark._jsparkSession.extensions())
    spark = SparkSession(spark.sparkContext, spark._jsparkSession.cloneSession())

# Create a table
print("########### Create a Parquet table ##############")
data = spark.range(0, 5)
data.write.format("parquet").save("/tmp/delta-table")

# Convert to delta
print("########### Convert to Delta ###########")
DeltaTable.convertToDelta(spark, "parquet.`/tmp/delta-table`")

# Read the table
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")
print("######## Vacuum the table ########")
deltaTable.vacuum()

print("######## Describe history for the table ######")
Example #5
class DeltaTableTests(PySparkTestCase):
    def setUp(self):
        super(DeltaTableTests, self).setUp()
        self.sqlContext = SQLContext(self.sc)
        self.spark = SparkSession(self.sc)
        self.tempPath = tempfile.mkdtemp()
        self.tempFile = os.path.join(self.tempPath, "tempFile")

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.tempPath)
        super(DeltaTableTests, self).tearDown()

    def test_forPath(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(dt, [('a', 1), ('b', 2), ('c', 3)])

    def test_alias_and_toDF(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            dt.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_delete(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # delete with condition as str
        dt.delete("key = 'a'")
        self.__checkAnswer(dt.toDF(), [('b', 2), ('c', 3), ('d', 4)])

        # delete with condition as Column
        dt.delete(col("key") == lit("b"))
        self.__checkAnswer(dt.toDF(), [('c', 3), ('d', 4)])

        # delete without condition
        dt.delete()
        self.__checkAnswer(dt.toDF(), [])

        # bad args
        with self.assertRaises(TypeError):
            dt.delete(condition=1)

    def test_generate(self):
        # create a delta table
        numFiles = 10
        self.spark.range(100).repartition(numFiles).write.format("delta").save(
            self.tempFile)
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # Generate the symlink format manifest
        dt.generate("symlink_format_manifest")

        # check the contents of the manifest
        # NOTE: this is not a correctness test; correctness is tested in the Scala suite
        manifestPath = os.path.join(
            self.tempFile, os.path.join("_symlink_format_manifest",
                                        "manifest"))
        files = []
        with open(manifestPath) as f:
            files = f.readlines()

        # the number of files we write should equal the number of lines in the manifest
        assert (len(files) == numFiles)

    def test_update(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # update with condition as str and with set exprs as str
        dt.update("key = 'a' or key = 'b'", {"value": "1"})
        self.__checkAnswer(dt.toDF(), [('a', 1), ('b', 1), ('c', 3), ('d', 4)])

        # update with condition as Column and with set exprs as Columns
        dt.update(expr("key = 'a' or key = 'b'"), {"value": expr("0")})
        self.__checkAnswer(dt.toDF(), [('a', 0), ('b', 0), ('c', 3), ('d', 4)])

        # update without condition
        dt.update(set={"value": "200"})
        self.__checkAnswer(dt.toDF(), [('a', 200), ('b', 200), ('c', 200),
                                       ('d', 200)])

        # bad args
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.update({"value": "200"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.update(condition='a')

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.update(set=1)

        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.update(1, {})

        with self.assertRaisesRegex(TypeError,
                                    "Values of dict in .* must contain only"):
            dt.update(set={"value": 1})

        with self.assertRaisesRegex(TypeError,
                                    "Keys of dict in .* must contain only"):
            dt.update(set={1: ""})

        with self.assertRaises(TypeError):
            dt.update(set=1)

    def test_merge(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        source = self.spark.createDataFrame([('a', -1), ('b', 0), ('e', -5),
                                             ('f', -6)], ["k", "v"])

        def reset_table():
            self.__overwriteDeltaTable([('a', 1), ('b', 2), ('c', 3),
                                        ('d', 4)])

        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # ============== Test basic syntax ==============

        # String expressions in merge condition and dicts
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate(set={"value": "v + 0"}) \
            .whenNotMatchedInsert(values={"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 0), ('c', 3),
                                        ('d', 4), ('e', -5), ('f', -6)]))

        # Column expressions in merge condition and dicts
        reset_table()
        dt.merge(source, expr("key = k")) \
            .whenMatchedUpdate(set={"value": col("v") + 0}) \
            .whenNotMatchedInsert(values={"key": "k", "value": col("v") + 0}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 0), ('c', 3),
                                        ('d', 4), ('e', -5), ('f', -6)]))

        # ============== Test clause conditions ==============

        # String expressions in all conditions and dicts
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate(condition="k = 'a'", set={"value": "v + 0"}) \
            .whenMatchedDelete(condition="k = 'b'") \
            .whenNotMatchedInsert(condition="k = 'e'", values={"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4),
                                        ('e', -5)]))

        # Column expressions in all conditions and dicts
        reset_table()
        dt.merge(source, expr("key = k")) \
            .whenMatchedUpdate(
                condition=expr("k = 'a'"),
                set={"value": col("v") + 0}) \
            .whenMatchedDelete(condition=expr("k = 'b'")) \
            .whenNotMatchedInsert(
                condition=expr("k = 'e'"),
                values={"key": "k", "value": col("v") + 0}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4),
                                        ('e', -5)]))

        # Positional arguments
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate("k = 'a'", {"value": "v + 0"}) \
            .whenMatchedDelete("k = 'b'") \
            .whenNotMatchedInsert("k = 'e'", {"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4),
                                        ('e', -5)]))

        # ============== Test updateAll/insertAll ==============

        # No clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), expr("t.key = s.key")) \
            .whenMatchedUpdateAll() \
            .whenNotMatchedInsertAll() \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 0), ('c', 3),
                                        ('d', 4), ('e', -5), ('f', -6)]))

        # String expressions in all clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), "s.key = t.key") \
            .whenMatchedUpdateAll("s.key = 'a'") \
            .whenNotMatchedInsertAll("s.key = 'e'") \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 2), ('c', 3),
                                        ('d', 4), ('e', -5)]))

        # Column expressions in all clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), expr("t.key = s.key")) \
            .whenMatchedUpdateAll(expr("s.key = 'a'")) \
            .whenNotMatchedInsertAll(expr("s.key = 'e'")) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('b', 2), ('c', 3),
                                        ('d', 4), ('e', -5)]))

        # ============== Test bad args ==============
        # ---- bad args in merge()
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            dt.merge(1, "key = k")

        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.merge(source, 1)

        # ---- bad args in whenMatchedUpdate()
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate({"value": "v"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate(1)

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source,
                     "key = k").whenMatchedUpdate(condition="key = 'a'")

        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenMatchedUpdate(1, {"value": "v"})

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.merge(source, "key = k").whenMatchedUpdate("k = 'a'", 1)

        with self.assertRaisesRegex(TypeError,
                                    "Values of dict in .* must contain only"):
            dt.merge(source, "key = k").whenMatchedUpdate(set={"value": 1})

        with self.assertRaisesRegex(TypeError,
                                    "Keys of dict in .* must contain only"):
            dt.merge(source, "key = k").whenMatchedUpdate(set={1: ""})

        with self.assertRaises(TypeError):
            dt.merge(source,
                     "key = k").whenMatchedUpdate(set="k = 'a'",
                                                  condition={"value": 1})

        # bad args in whenMatchedDelete()
        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenMatchedDelete(1)

        # ---- bad args in whenNotMatchedInsert()
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert({"value": "v"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert(1)

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source,
                     "key = k").whenNotMatchedInsert(condition="key = 'a'")

        with self.assertRaisesRegex(TypeError,
                                    "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenNotMatchedInsert(1, {"value": "v"})

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.merge(source, "key = k").whenNotMatchedInsert("k = 'a'", 1)

        with self.assertRaisesRegex(TypeError,
                                    "Values of dict in .* must contain only"):
            dt.merge(source,
                     "key = k").whenNotMatchedInsert(values={"value": 1})

        with self.assertRaisesRegex(TypeError,
                                    "Keys of dict in .* must contain only"):
            dt.merge(source,
                     "key = k").whenNotMatchedInsert(values={1: "value"})

        with self.assertRaises(TypeError):
            dt.merge(source,
                     "key = k").whenNotMatchedInsert(values="k = 'a'",
                                                     condition={"value": 1})

    def test_history(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations, [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

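        # history(1) limits the result to the most recent commit.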
        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode, [Row("Overwrite")],
            StructType(
                [StructField("operationParameters.mode", StringType(), True)]))

    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertEqual(True, self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files as default retention is used.
        dt.vacuum(1000)  # test whether integers work

        self.assertEqual(True, self.__checkFileExists('bac.txt'))
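        # Delta refuses retention periods shorter than the default (168 hours) unless this check is disabled.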
        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertEqual(False, self.__checkFileExists('bac.txt'))
        self.assertEqual(False, self.__checkFileExists('abc.txt'))

    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        dt = DeltaTable.convertToDelta(self.spark,
                                       "parquet.`%s`" % self.tempFile)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile), [('a', 1),
                                                                  ('b', 2),
                                                                  ('c', 3)])

        # test if convert to delta with partition columns work
        tempFile2 = self.tempFile + "_2"
        df.write.partitionBy("value").format("parquet").save(tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % tempFile2,
                                       schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile2),
            [('a', 1), ('b', 2), ('c', 3)])

        # convert to delta with partition column provided as a string
        tempFile3 = self.tempFile + "_3"
        df.write.partitionBy("value").format("parquet").save(tempFile3)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % tempFile3,
                                       "value int")
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile3),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_isDeltaTable(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)],
                                        ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        tempFile2 = self.tempFile + '_2'
        df.write.format("delta").save(tempFile2)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, self.tempFile),
                         False)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, tempFile2), True)

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        try:
            self.assertEqual(df.count(), expectedDF.count())
            self.assertEqual(len(df.columns), len(expectedDF.columns))
            self.assertEqual([], df.subtract(expectedDF).take(1))
            self.assertEqual([], expectedDF.subtract(df).take(1))
        except AssertionError:
            print("Expected:")
            expectedDF.show()
            print("Found:")
            df.show()
            raise

    def __writeDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").save(self.tempFile)

    def __overwriteDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").mode("overwrite").save(self.tempFile)

    def __createFile(self, fileName, content):
        with open(os.path.join(self.tempFile, fileName), 'w') as f:
            f.write(content)

    def __checkFileExists(self, fileName):
        return os.path.exists(os.path.join(self.tempFile, fileName))
Example #6
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, col

# _we can use spark in either local mode or cluster mode. Below is the configuration for local mode.

sc = SparkContext("local", "Hello World")
sc.setLogLevel('ERROR')

# _start a Spark session from the context
spark = SparkSession(sc)
numbers_df = spark.range(100).toDF('numbers')
numbers_df.show(5)

# _select only the values which are divisible by 3
div_by_3 = numbers_df.where("numbers % 3 == 0")
div_by_3.show(5)

# _read csv file
flightdata = spark.read.option('inferSchema',
                               'true').option('header',
                                              'true').csv('2015-summary.csv')
flightdata.show(5)
flightdata.printSchema()

# _sorted data descending order
sorted_flightdata = flightdata.sort(desc('count'))
sorted_flightdata.show(5)

# _filter data using sql
flightdata.createOrReplaceTempView('flightTable')
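# The original snippet ends right after registering the view. A minimal, hedged sketch of
# the SQL filter the comment above sets up (column names follow the 2015-summary.csv data):
spark.sql("SELECT DEST_COUNTRY_NAME, `count` FROM flightTable WHERE `count` > 20").show(5)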
Example #7
# Assumed imports for this snippet; spark_context and spark are created earlier in the original notebook.
import datetime
import random
# spark_context.stop()

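# Monte Carlo estimate of pi: the fraction of random points in the unit square that
# fall inside the unit quarter-circle approaches pi / 4.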
NUM_SAMPLES = 100000000
def inside(p):
    x, y = random.random(), random.random()
    return x*x + y*y < 1

count = spark_context.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()
pi = 4 * count / NUM_SAMPLES
f"Pi is roughly {pi}"

datetime.datetime.now().isoformat()

local_pth = "/tmp/delta-table-local"
data = spark.range(20, 30)
data.write.format("delta").mode("overwrite").save(local_pth)
data2 = spark.range(5_000, 10_000)
data2.write.format("delta").mode("overwrite").save(local_pth)

df = spark.read.format("delta").load(local_pth)
spark.read.format("delta").load(local_pth).count()

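# Delta time travel: "versionAsOf" reads the table as of that commit. Versions 2 and 3
# below exist only if this path has accumulated more than two commits (e.g. from earlier runs).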
spark.read.format("delta").option("versionAsOf", 0).load(local_pth).count()
spark.read.format("delta").option("versionAsOf", 1).load(local_pth).count()
spark.read.format("delta").option("versionAsOf", 2).load(local_pth).count()
spark.read.format("delta").option("versionAsOf", 3).load(local_pth).count()

df1 = spark.read.format(
    "delta"
).option(
Example #8
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, lit, unix_timestamp, to_timestamp

sc = SparkContext()
spark = SparkSession(sc)

dateDF = spark.range(10)\
    .withColumn("today", current_date())\
    .withColumn("now", current_timestamp())

dateDF.createTempView("dateTable")

dateDF.printSchema()

#1 subtract or add days
print("1")
dateDF.select(
    date_sub(col("today"), 5),
    date_add(col("today"), 5))\
.show(1)

#2 date difference
print("2")
dateDF\
    .withColumn("week_ago", date_sub(col("today"), 7))\
    .select(datediff(col("week_ago"), col("today")))\
    .show(1)

#3 months between two dates
print("3")
Example #9
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

conf = SparkConf().setAppName('count_connections').setMaster('local[*]')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# generate a synthetic graph by taking 1% of all
# possible connections between 1000 vertices

x = spark.range(1000)
x_1 = x.select(col('id').alias('id_1'))
x_2 = x.select(col('id').alias('id_2'))

y = (x_1.crossJoin(x_2).filter(col('id_1') < col('id_2')).sample(False, 0.01))

connections = y
connections.cache()
connections.show(20)
connections.createOrReplaceTempView('connections')

# perform analytics on the graph

connection_counts = spark.sql('''

SELECT id,
       COUNT(*) AS n_connections

FROM (
    SELECT id_1 AS id
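# The query above is truncated in the original snippet. A hedged sketch of how such a
# per-vertex connection count could be completed, assuming each undirected edge counts
# for both of its endpoints:
connection_counts = spark.sql('''
SELECT id,
       COUNT(*) AS n_connections
FROM (
    SELECT id_1 AS id FROM connections
    UNION ALL
    SELECT id_2 AS id FROM connections
) AS edges
GROUP BY id
ORDER BY n_connections DESC
''')
connection_counts.show(20)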
Example #10
# Assumed imports for this snippet; df and spark are defined earlier in the original script.
from pyspark.sql.functions import (col, current_date, datediff, lit,
                                   regexp_replace, to_timestamp, when)
# [replace] here we are using the base df and replace a country name
replaced_df = df.withColumn(
    "Country", regexp_replace(col("Country"), "United States", "USA"))
# replaced_df.show()

# ------------------------------------------- CASE - WHEN ------------------------------------------------------------
# [case when] create new column with case when
df_with_case = df.withColumn("Case_When",
                        when(col("Price") < 1000, lit("Small"))\
                        .when(col("Price").between(1000, 1200), lit("Medium"))\
                        .when(col("Price").between(1201, 3600), lit("Big"))\
                        .when(col("Price") > 3600, lit("Huge"))\
                        .otherwise("Else")
                    )
#df_with_case.show()

# ------------------------------------------- Date functions ----------------------------------------------------------
# [current date]
df_curr_date = spark.range(1).select(current_date().alias("Current_Date"))

# [to timestamp] Use a two-digit "yy" in "MM/dd/yy" because the source dates have no century
df_to_timestamp = df.withColumn(
    "Account_Create", to_timestamp(col("Account_Created"), "MM/dd/yy HH:mm"))
# df_to_timestamp.show()
# [date difference]
df_date_diff = df_to_timestamp.select(
    datediff(current_date(), col("Account_Create")).alias("Date_Difference"))

# df_date_diff.show()
Example #11
# Assumed imports for this snippet
from difflib import ndiff
from time import perf_counter_ns

from pyspark import SparkContext
from pyspark.sql import SparkSession, functions
class Chapter2(object):
    def __init__(self):
        self.sc = SparkContext("local", "chapter2_examples")
        self.spark = SparkSession(self.sc)
        self.spark.conf.set("spark.sql.shuffle.partitions", "1000")

        self.flight_data_2015 = self.spark.read.option("inferSchema", "true").option("header", "true")\
         .csv("/Spark-The-Definitive-Guide/data/flight-data/csv/2015-summary.csv")
        self.table_name = 'flight_data_2015'

    def exercise1(self):
        test_range = self.spark.range(100000).toDF("number")
        test_where = test_range.where(
            "number % 2 == 0 and 10 < number and number < 20")
        print(f"number of the elements: {test_where.count()}")
        test_where.show()

    def exercise2(self):
        self.flight_data_2015.show()
        print(f"take 3: {self.flight_data_2015.take(3)}")

        self.flight_data_2015_sorted = self.flight_data_2015.sort(
            "count", ascending=False)
        self.flight_data_2015_sorted.show()

    def exercise3(self):
        self.flight_data_2015.createOrReplaceTempView(self.table_name)

        # dataframes vs sql
        sql_way = self.spark.sql(f"""
        SELECT DEST_COUNTRY_NAME, count(1)
        FROM {self.table_name}
        GROUP BY DEST_COUNTRY_NAME
        """)
        data_frame_way = self.flight_data_2015.groupBy(
            "DEST_COUNTRY_NAME").count()

        diff = (ndiff(
            sql_way._jdf.queryExecution().toString().splitlines(keepends=True),
            data_frame_way._jdf.queryExecution().toString().splitlines(
                keepends=True)))
        print(''.join(diff), end="")

        print(
            f'take 3: {self.flight_data_2015.select(functions.max("count")).take(1)}'
        )

    def exercise4(self):
        sql_start = perf_counter_ns()

        max_sql = self.spark.sql(f"""
        SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
        FROM {self.table_name}
        GROUP BY DEST_COUNTRY_NAME
        ORDER BY sum(count) DESC
        LIMIT 10
        """)
        max_sql.show()

        sql_stop = perf_counter_ns()

        df_start = perf_counter_ns()

        self.flight_data_2015\
         .groupBy("DEST_COUNTRY_NAME")\
         .sum("count")\
         .withColumnRenamed("sum(count)", "destination_total")\
         .sort(functions.desc("destination_total"))\
         .limit(10)\
         .show()

        df_stop = perf_counter_ns()

        print(
            f'df execution time: {df_stop-df_start} vs sql execution time: {sql_stop-sql_start}'
        )
Example #12
# Assumed imports for this snippet; spark_write_one_csv is the function under test,
# defined elsewhere in the original module.
import argparse
import logging
import os
import sys
def main(argv):
    # Define SparkSession
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession

    # The Console Progress Bar output only makes sense in an interactive session.
    # In the log file it interferes with program log messages.
    conf = SparkConf().set('spark.ui.showConsoleProgress', 'False')
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc).builder.getOrCreate()
    sc.setLogLevel('OFF')

    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--path',
                        help='Directory path for test output file',
                        default=os.path.join(os.environ['DSX_PROJECT_DIR'],
                                             'datasets'))
    parser.add_argument('-f',
                        '--filename',
                        help='Test output filename',
                        default='testfile')
    parser.add_argument('-e',
                        '--extension',
                        help='Test output filename extension',
                        default='.csv.gz',
                        choices=['.csv', '.csv.gz'])
    parser.add_argument('-c',
                        '--compress',
                        help='Compression type',
                        default='gzip',
                        choices=['gzip', 'None'])
    parser.add_argument('-n',
                        '--noheader',
                        help='Suppress the header row',
                        action='store_true')
    parser.add_argument('-r',
                        '--rows',
                        help='Number of rows in test DataFrame',
                        default=1000000,
                        type=int)
    parser.add_argument('-l',
                        '--loglevel',
                        help='Verbosity of output',
                        default='INFO',
                        choices=['DEBUG', 'INFO', 'WARN'])

    args = parser.parse_args()
    compress = args.compress if args.compress.lower() != 'none' else None
    header = not args.noheader
    if args.loglevel == 'DEBUG':
        level = logging.DEBUG
    elif args.loglevel == 'INFO':
        level = logging.INFO
    else:
        level = logging.WARN

    # Get ready for debugging
    logger = logging.getLogger('spark1csv')
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    handler.setLevel(logging.DEBUG)
    logger.setLevel(level)
    logger.addHandler(handler)

    # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

    # Generate a sample DataFrame.
    # Write a million rows populated with random numeric data. You can change the number of rows.
    from pyspark.sql.functions import rand

    rows = args.rows
    df_all = (spark.range(
        0, rows, numPartitions=10).withColumnRenamed('id', 'ID').withColumn(
            'COLUMN_ONE',
            (rand(seed=123) * rows + 1 * rows).cast('integer')).withColumn(
                'COLUMN_TWO',
                (rand(seed=456) * rows + 2 * rows).cast('integer')).withColumn(
                    'COLUMN_THREE', (rand(seed=789) * rows +
                                     3 * rows).cast('integer'))).cache()

    # Use logging, not print, to make it part of the same stream (stderr) as output from the function.
    # Otherwise, output from the main (stdout) and the function come out in the wrong order
    logger.info('Number of rows:       %7d', df_all.count())
    logger.info('Number of partitions: %7d', df_all.rdd.getNumPartitions())

    outfile = os.path.join(args.path, args.filename + args.extension)

    # Experiment with different values, especially for mode
    # First write a new file
    if os.path.exists(outfile):
        os.remove(outfile)

    logger.info('--- File does not exist, mode %s ---', 'overwrite')
    try:  # File does not exist, mode overwrite: should just work
        spark_write_one_csv(df_all,
                            outfile,
                            mode='overwrite',
                            compression=compress,
                            header=header)
    except:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise

    # File exists, mode append: should raise exception because append is not supported
    logger.info('--- File exists, mode %s ---', 'append')
    try:
        spark_write_one_csv(df_all,
                            outfile,
                            mode='append',
                            compression=compress,
                            header=header)
    except ValueError as err:
        logger.info('Expected ValueError properly caught: {}'.format(err))
    except:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise

    # File exists, mode ignore: silently skips writing the file
    logger.info('--- File exists, mode %s ---', 'ignore')
    try:
        spark_write_one_csv(df_all,
                            outfile,
                            mode='ignore',
                            compression=compress,
                            header=header)
    except:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise

    # File exists, mode error: should raise exception
    logger.info('--- File exists, mode %s ---', 'error')
    try:
        spark_write_one_csv(df_all,
                            outfile,
                            mode='error',
                            compression=compress,
                            header=header)
    except FileExistsError as err:
        logger.info('Expected FileExistsError properly caught: {}'.format(err))
    except:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise

    # File exists, mode overwrite: should just work
    logger.info('--- File exists, mode %s ---', 'overwrite')
    try:
        spark_write_one_csv(df_all,
                            outfile,
                            mode='overwrite',
                            compression=compress,
                            header=header)
    except:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise
Example #13
from pyspark.sql import SparkSession
from pyspark import SparkContext

sc = SparkContext()
spark = SparkSession(sc)

myRange = spark.range(1000).toDF("number")
division = myRange.where("number % 2 = 0")

count = division.count()

print(count)