class DeltaSqlTests(PySparkTestCase):

    def setUp(self):
        super(DeltaSqlTests, self).setUp()
        spark = SparkSession(self.sc)
        if self.sc.version < "3.":
            # Manually activate "DeltaSparkSessionExtension" in PySpark 2.4 in a cloned
            # session because "spark.sql.extensions" is not picked up. (See SPARK-25003.)
            self.sc._jvm.io.delta.sql.DeltaSparkSessionExtension() \
                .apply(spark._jsparkSession.extensions())
            self.spark = SparkSession(self.sc, spark._jsparkSession.cloneSession())
        else:
            self.spark = spark
        self.temp_path = tempfile.mkdtemp()
        self.temp_file = os.path.join(self.temp_path, "delta_sql_test_table")
        # Create a simple Delta table inside the temp directory to test SQL commands.
        # The second write overwrites the first, leaving stale data files behind for
        # the VACUUM test to delete.
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("delta").save(self.temp_file)
        df.write.mode("overwrite").format("delta").save(self.temp_file)

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.temp_path)
        super(DeltaSqlTests, self).tearDown()

    def test_vacuum(self):
        self.spark.sql(
            "set spark.databricks.delta.retentionDurationCheck.enabled = false")
        try:
            deleted_files = self.spark.sql(
                "VACUUM '%s' RETAIN 0 HOURS" % self.temp_file).collect()
            # Verify `VACUUM` did delete some data files.
            self.assertTrue(self.temp_file in deleted_files[0][0])
        finally:
            self.spark.sql(
                "set spark.databricks.delta.retentionDurationCheck.enabled = true")

    def test_describe_history(self):
        assert len(
            self.spark.sql("desc history delta.`%s`" % self.temp_file).collect()) > 0

    def test_generate(self):
        # create a delta table
        temp_path = tempfile.mkdtemp()
        temp_file = os.path.join(temp_path, "delta_sql_test_table")
        numFiles = 10
        self.spark.range(100).repartition(numFiles).write.format("delta").save(temp_file)

        # Generate the symlink format manifest
        self.spark.sql(
            "GENERATE SYMLINK_FORMAT_MANIFEST FOR TABLE delta.`{}`".format(temp_file))

        # check the contents of the manifest
        # NOTE: this is not a correctness test; correctness is tested in the Scala suite
        manifestPath = os.path.join(
            temp_file, os.path.join("_symlink_format_manifest", "manifest"))
        files = []
        with open(manifestPath) as f:
            files = f.readlines()
        shutil.rmtree(temp_path)

        # the number of files we wrote should equal the number of lines in the manifest
        assert len(files) == numFiles

    def test_convert(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        temp_path2 = tempfile.mkdtemp()
        temp_path3 = tempfile.mkdtemp()
        temp_file2 = os.path.join(temp_path2, "delta_sql_test2")
        temp_file3 = os.path.join(temp_path3, "delta_sql_test3")

        df.write.format("parquet").save(temp_file2)
        self.spark.sql("CONVERT TO DELTA parquet.`" + temp_file2 + "`")
        self.__checkAnswer(
            self.spark.read.format("delta").load(temp_file2),
            [('a', 1), ('b', 2), ('c', 3)])

        # test whether CONVERT TO DELTA with partition columns works
        df.write.partitionBy("value").format("parquet").save(temp_file3)
        self.spark.sql(
            "CONVERT TO DELTA parquet.`" + temp_file3 + "` PARTITIONED BY (value INT)")
        self.__checkAnswer(
            self.spark.read.format("delta").load(temp_file3),
            [('a', 1), ('b', 2), ('c', 3)])

        shutil.rmtree(temp_path2)
        shutil.rmtree(temp_path3)

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        self.assertEqual(df.count(), expectedDF.count())
        self.assertEqual(len(df.columns), len(expectedDF.columns))
        self.assertEqual([], df.subtract(expectedDF).take(1))
        self.assertEqual([], expectedDF.subtract(df).take(1))
class TestShowResults(PySparkTestCase):

    def setUp(self):
        super().setUp()
        self.spark = SparkSession(self.sc)
        c1 = DenseVector(np.array([1.0, 1.0]))
        c2 = DenseVector(np.array([5.0, 5.0]))
        c3 = DenseVector(np.array([30.0, 30.0]))
        df = pd.DataFrame(
            {'prediction': [0, 0, 0, 0, 0, 0, 1, 2, 1, 1],
             'point_col': [DenseVector(np.array([1.0, 2.0])),
                           DenseVector(np.array([2.0, 1.0])),
                           DenseVector(np.array([0.0, 1.0])),
                           DenseVector(np.array([1.0, 0.0])),
                           DenseVector(np.array([1.0, -1.0])),
                           DenseVector(np.array([4.0, 5.0])),
                           DenseVector(np.array([5.0, 6.0])),
                           DenseVector(np.array([20.0, 30.0])),
                           DenseVector(np.array([5.0, 7.0])),
                           DenseVector(np.array([5.0, 10.0]))],
             'centers': [c1, c1, c1, c1, c1, c1, c2, c3, c2, c2]},
            columns=['prediction', 'point_col', 'centers'])
        self.dataframe = self.spark.createDataFrame(df)  # probably not needed

    def test_add_row_index(self):
        computed_dataframe = ShowResults._add_row_index(self.dataframe)
        self.assertIn(('rowId', 'bigint'), computed_dataframe.dtypes)
        computed_dataframe = ShowResults._add_row_index(self.dataframe, rowId='roow')
        self.assertIn(('roow', 'bigint'), computed_dataframe.dtypes)

    def test_add_distances(self):
        from math import sqrt
        computed_dataframe = ShowResults._add_distances(self.dataframe,
                                                        point_col='point_col')
        self.assertIn(('distance', 'double'), computed_dataframe.dtypes)
        p_computed_dataframe = computed_dataframe.toPandas()
        actual_distances = [sqrt(1.0), sqrt(1.0), sqrt(1.0), sqrt(1.0), sqrt(4.0),
                            sqrt(9.0 + 16.0), sqrt(1.0), sqrt(100.0), sqrt(4.0),
                            sqrt(25.0)]
        for idx, val in enumerate(actual_distances):
            self.assertEqual(val, p_computed_dataframe['distance'][idx])
        print('add_distance \n', p_computed_dataframe)

    def test_add_outliers(self):
        computed_dataframe = ShowResults._add_distances(self.dataframe,
                                                        point_col='point_col')
        computed_pdf = ShowResults._add_outliers(computed_dataframe).toPandas()
        # Boundary pre-calculated for prediction 0: mean + 2 * stddev
        actual_values = [False] * 5 + [True] + 4 * [False]
        self.assertListEqual(list(computed_pdf['is_outlier']), actual_values)
        print('add_outliers \n', computed_pdf)

    @unittest.skip('Reason: Compute summary not in play.')
    def test_compute_summary(self):
        computed_dataframe = ShowResults._add_distances(self.dataframe,
                                                        point_col='point_col')
        computed_df = ShowResults._add_outliers(computed_dataframe)
        summary_pdf = ShowResults.compute_summary(computed_df).toPandas()
        # counts from predictionCol
        actual_count_prediction = [6, 3, 1]
        # counts from outliers in distance
        actual_count_outliers = [1, 0, 0]
        # percentage: actual_count_outliers / actual_count_prediction
        actual_count_percentage = list(map(float, [
            '%.f' % elem for elem in
            [out / pre * 100 for out, pre in
             zip(actual_count_outliers, actual_count_prediction)]]))
        self.assertEqual(list(summary_pdf['count']), actual_count_prediction)
        self.assertEqual(list(summary_pdf['outlier_count']), actual_count_outliers)
        self.assertEqual(list(summary_pdf['outlier percentage']),
                         actual_count_percentage)
        print('compute_summary \n', summary_pdf)

    def test_prepare_table_data(self):
        # not tested thoroughly
        table_df = ShowResults.prepare_table_data(
            self.dataframe, point_col='point_col').toPandas()
        print('prepare_table_data \n', table_df)

    @unittest.skip('Reason: Not implemented')
    def test_cluster_graph(self):
        # not tested thoroughly
        table_df = ShowResults.prepare_table_data(
            self.dataframe, point_col='point_col').toPandas()
        grouped = table_df.groupby('prediction')
        for i in range(1, len(table_df.prediction.unique()) + 1):
            group_i = grouped.get_group(i)
            table_json = ShowResults.cluster_graph(group_i)
            print('cluster_graph \n', table_json)

    @unittest.skip('Reason: JSON_histogram not applicable')
    def test_json_histogram(self):
        # not tested thoroughly
        table_df = ShowResults.prepare_table_data(
            self.dataframe, point_col='point_col').toPandas()
        hist_json = ShowResults.json_histogram(table_df)
        print('json_hist \n', hist_json)

    def test_arrange_output(self):
        # Preamble: set up data
        features = ['a', 'b']
        id = 'id'
        prediction = 'Prediction'

        # Test 1: Are all columns there?
        shows = ShowResults(id=id, list_features=features, list_labels=['k'],
                            predictionCol=prediction, k=10)
        d_point = 'dp'
        df = self._generate_data()
        arranged_df = shows.arrange_output(self.sc, df, data_point_name=d_point)
        expected_cols = [prediction, d_point, 'amount', 'percentage_outlier',
                         'amount_outlier', 'buckets']
        self.assertListEqual(sorted(expected_cols), sorted(arranged_df.columns))

    def _generate_data(self):
        df = self.spark.range(10)
        output = (df
                  .select(
                      'id',
                      F.rand(42).alias('a'),
                      F.randn(1).alias('b'),
                      F.round(10 * F.rand(42)).alias('Prediction'),
                      F.rand().alias('distance'))
                  .withColumn('is_outlier',
                              F.when(F.col('distance') >= 0.7, 1.0).otherwise(0.0))
                  .withColumn('computed_boundary', F.randn()))
        return output

    def test_create_linspace(self):
        # signature under test: (data, min, max, buckets, boundary)
        data = ShowResults.prepare_table_data(self.dataframe, point_col='point_col')
        data.show()

    # @unittest.skip('Test not created')
    def test_create_buckets(self):
        # Preamble: set up data
        df = self._generate_data()
        features = ['a', 'b']
        id = 'id'
        prediction = 'Prediction'
        shows = ShowResults.prepare_table_data(self.dataframe, point_col='point_col')
        # ShowResults(id=id, list_features=features, list_labels=['k'],
        #             predictionCol=prediction, k=10)
        buckets = shows.create_buckets(sc=self.sc, dataframe=df, buckets=20,
                                       prediction_col=prediction)
        print(buckets.rdd.take(1)[0]['buckets'])
        self.fail()
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row

# Keys must be strings: Row(**x) unpacks the dict as keyword arguments,
# which fails for integer keys.
d = {'1': 'test', '2': 'test2'}
sc = SparkContext()
spark = SparkSession(sc)
rdd = sc.parallelize([d])  # parallelize() lives on SparkContext, not SparkSession

# toDF() needs a SparkSession, and each element must be a Row before conversion to a df
rdd.map(lambda x: Row(**x)).toDF().show()  # **x unpacks the dict into a Row

# OR using createDataFrame
rdd = rdd.map(lambda x: Row(**x))
spark.createDataFrame(rdd).show()

## APPENDING NEW DATA
# --------------------------------------------------------
firstDF = spark.range(3).toDF("myCol")
newRow = spark.createDataFrame([[20]])
appended = firstDF.union(newRow)
appended.show()  # use display(appended) in Databricks notebooks

## EXPLORATORY
# --------------------------------------------------------
df.describe()         # returns a summary DataFrame lazily; nothing is shown yet
df.describe().show()  # shows count, mean, stddev, min, max

## COLUMNS
# --------------------------------------------------------
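# A shorter route, for reference: createDataFrame() also accepts a list of dicts
# directly and infers the schema (recent PySpark versions may warn that schema
# inference from dicts is deprecated in favor of Row). A minimal sketch:
spark.createDataFrame([{'1': 'test', '2': 'test2'}]).show()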
.appName("utilities") \ .master("local[*]") \ .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \ .config("spark.sql.sources.parallelPartitionDiscovery.parallelism", "8") \ .getOrCreate() # Apache Spark 2.4.x has a known issue (SPARK-25003) that requires explicit activation # of the extension and cloning of the session. This will unnecessary in Apache Spark 3.x. if spark.sparkContext.version < "3.": spark.sparkContext._jvm.io.delta.sql.DeltaSparkSessionExtension() \ .apply(spark._jsparkSession.extensions()) spark = SparkSession(spark.sparkContext, spark._jsparkSession.cloneSession()) # Create a table print("########### Create a Parquet table ##############") data = spark.range(0, 5) data.write.format("parquet").save("/tmp/delta-table") # Convert to delta print("########### Convert to Delta ###########") DeltaTable.convertToDelta(spark, "parquet.`/tmp/delta-table`") # Read the table df = spark.read.format("delta").load("/tmp/delta-table") df.show() deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table") print("######## Vacuum the table ########") deltaTable.vacuum() print("######## Describe history for the table ######")
class DeltaTableTests(PySparkTestCase):

    def setUp(self):
        super(DeltaTableTests, self).setUp()
        self.sqlContext = SQLContext(self.sc)
        self.spark = SparkSession(self.sc)
        self.tempPath = tempfile.mkdtemp()
        self.tempFile = os.path.join(self.tempPath, "tempFile")

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.tempPath)
        super(DeltaTableTests, self).tearDown()

    def test_forPath(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(dt, [('a', 1), ('b', 2), ('c', 3)])

    def test_alias_and_toDF(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            dt.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_delete(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # delete with condition as str
        dt.delete("key = 'a'")
        self.__checkAnswer(dt.toDF(), [('b', 2), ('c', 3), ('d', 4)])

        # delete with condition as Column
        dt.delete(col("key") == lit("b"))
        self.__checkAnswer(dt.toDF(), [('c', 3), ('d', 4)])

        # delete without condition
        dt.delete()
        self.__checkAnswer(dt.toDF(), [])

        # bad args
        with self.assertRaises(TypeError):
            dt.delete(condition=1)

    def test_generate(self):
        # create a delta table
        numFiles = 10
        self.spark.range(100).repartition(numFiles).write.format("delta").save(
            self.tempFile)
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # Generate the symlink format manifest
        dt.generate("symlink_format_manifest")

        # check the contents of the manifest
        # NOTE: this is not a correctness test; correctness is tested in the Scala suite
        manifestPath = os.path.join(
            self.tempFile, os.path.join("_symlink_format_manifest", "manifest"))
        files = []
        with open(manifestPath) as f:
            files = f.readlines()

        # the number of files we wrote should equal the number of lines in the manifest
        assert len(files) == numFiles

    def test_update(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # update with condition as str and with set exprs as str
        dt.update("key = 'a' or key = 'b'", {"value": "1"})
        self.__checkAnswer(dt.toDF(), [('a', 1), ('b', 1), ('c', 3), ('d', 4)])

        # update with condition as Column and with set exprs as Columns
        dt.update(expr("key = 'a' or key = 'b'"), {"value": expr("0")})
        self.__checkAnswer(dt.toDF(), [('a', 0), ('b', 0), ('c', 3), ('d', 4)])

        # update without condition
        dt.update(set={"value": "200"})
        self.__checkAnswer(dt.toDF(), [('a', 200), ('b', 200), ('c', 200), ('d', 200)])

        # bad args
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.update({"value": "200"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.update(condition='a')

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.update(set=1)

        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.update(1, {})

        with self.assertRaisesRegex(TypeError, "Values of dict in .* must contain only"):
            dt.update(set={"value": 1})

        with self.assertRaisesRegex(TypeError, "Keys of dict in .* must contain only"):
            dt.update(set={1: ""})

        with self.assertRaises(TypeError):
            dt.update(set=1)

    def test_merge(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        source = self.spark.createDataFrame(
            [('a', -1), ('b', 0), ('e', -5), ('f', -6)], ["k", "v"])

        def reset_table():
            self.__overwriteDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])

        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # ============== Test basic syntax ==============

        # String expressions in merge condition and dicts
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate(set={"value": "v + 0"}) \
            .whenNotMatchedInsert(values={"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(
            dt.toDF(),
            ([('a', -1), ('b', 0), ('c', 3), ('d', 4), ('e', -5), ('f', -6)]))

        # Column expressions in merge condition and dicts
        reset_table()
        dt.merge(source, expr("key = k")) \
            .whenMatchedUpdate(set={"value": col("v") + 0}) \
            .whenNotMatchedInsert(values={"key": "k", "value": col("v") + 0}) \
            .execute()
        self.__checkAnswer(
            dt.toDF(),
            ([('a', -1), ('b', 0), ('c', 3), ('d', 4), ('e', -5), ('f', -6)]))

        # ============== Test clause conditions ==============

        # String expressions in all conditions and dicts
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate(condition="k = 'a'", set={"value": "v + 0"}) \
            .whenMatchedDelete(condition="k = 'b'") \
            .whenNotMatchedInsert(condition="k = 'e'",
                                  values={"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4), ('e', -5)]))

        # Column expressions in all conditions and dicts
        reset_table()
        dt.merge(source, expr("key = k")) \
            .whenMatchedUpdate(
                condition=expr("k = 'a'"),
                set={"value": col("v") + 0}) \
            .whenMatchedDelete(condition=expr("k = 'b'")) \
            .whenNotMatchedInsert(
                condition=expr("k = 'e'"),
                values={"key": "k", "value": col("v") + 0}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4), ('e', -5)]))

        # Positional arguments
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate("k = 'a'", {"value": "v + 0"}) \
            .whenMatchedDelete("k = 'b'") \
            .whenNotMatchedInsert("k = 'e'", {"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4), ('e', -5)]))

        # ============== Test updateAll/insertAll ==============

        # No clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), expr("t.key = s.key")) \
            .whenMatchedUpdateAll() \
            .whenNotMatchedInsertAll() \
            .execute()
        self.__checkAnswer(
            dt.toDF(),
            ([('a', -1), ('b', 0), ('c', 3), ('d', 4), ('e', -5), ('f', -6)]))

        # String expressions in all clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), "s.key = t.key") \
            .whenMatchedUpdateAll("s.key = 'a'") \
            .whenNotMatchedInsertAll("s.key = 'e'") \
            .execute()
        self.__checkAnswer(
            dt.toDF(),
            ([('a', -1), ('b', 2), ('c', 3), ('d', 4), ('e', -5)]))

        # Column expressions in all clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), expr("t.key = s.key")) \
            .whenMatchedUpdateAll(expr("s.key = 'a'")) \
            .whenNotMatchedInsertAll(expr("s.key = 'e'")) \
            .execute()
        self.__checkAnswer(
            dt.toDF(),
            ([('a', -1), ('b', 2), ('c', 3), ('d', 4), ('e', -5)]))

        # ============== Test bad args ==============
        # ---- bad args in merge()
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            dt.merge(1, "key = k")

        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.merge(source, 1)

        # ---- bad args in whenMatchedUpdate()
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate({"value": "v"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate(1)

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate(condition="key = 'a'")

        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenMatchedUpdate(1, {"value": "v"})

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.merge(source, "key = k").whenMatchedUpdate("k = 'a'", 1)

        with self.assertRaisesRegex(TypeError, "Values of dict in .* must contain only"):
            dt.merge(source, "key = k").whenMatchedUpdate(set={"value": 1})

        with self.assertRaisesRegex(TypeError, "Keys of dict in .* must contain only"):
            dt.merge(source, "key = k").whenMatchedUpdate(set={1: ""})

        with self.assertRaises(TypeError):
            dt.merge(source, "key = k").whenMatchedUpdate(
                set="k = 'a'", condition={"value": 1})

        # ---- bad args in whenMatchedDelete()
        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenMatchedDelete(1)

        # ---- bad args in whenNotMatchedInsert()
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert({"value": "v"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert(1)

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert(condition="key = 'a'")

        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenNotMatchedInsert(1, {"value": "v"})

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.merge(source, "key = k").whenNotMatchedInsert("k = 'a'", 1)

        with self.assertRaisesRegex(TypeError, "Values of dict in .* must contain only"):
            dt.merge(source, "key = k").whenNotMatchedInsert(values={"value": 1})

        with self.assertRaisesRegex(TypeError, "Keys of dict in .* must contain only"):
            dt.merge(source, "key = k").whenNotMatchedInsert(values={1: "value"})

        with self.assertRaises(TypeError):
            dt.merge(source, "key = k").whenNotMatchedInsert(
                values="k = 'a'", condition={"value": 1})

    def test_history(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations,
            [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode,
            [Row("Overwrite")],
            StructType([StructField("operationParameters.mode", StringType(), True)]))

    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertEqual(True, self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files, as the default retention is used
        dt.vacuum(1000)  # test whether integers work
        self.assertEqual(True, self.__checkFileExists('bac.txt'))

        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertEqual(False, self.__checkFileExists('bac.txt'))
        self.assertEqual(False, self.__checkFileExists('abc.txt'))

    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % self.tempFile)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile),
            [('a', 1), ('b', 2), ('c', 3)])

        # test whether convert to delta with partition columns works
        tempFile2 = self.tempFile + "_2"
        df.write.partitionBy("value").format("parquet").save(tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % tempFile2, schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile2),
            [('a', 1), ('b', 2), ('c', 3)])

        # convert to delta with the partition column provided as a string
        tempFile3 = self.tempFile + "_3"
        df.write.partitionBy("value").format("parquet").save(tempFile3)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % tempFile3,
                                       "value int")
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile3),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_isDeltaTable(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        tempFile2 = self.tempFile + '_2'
        df.write.format("delta").save(tempFile2)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, self.tempFile), False)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, tempFile2), True)

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        try:
            self.assertEqual(df.count(), expectedDF.count())
            self.assertEqual(len(df.columns), len(expectedDF.columns))
            self.assertEqual([], df.subtract(expectedDF).take(1))
            self.assertEqual([], expectedDF.subtract(df).take(1))
        except AssertionError:
            print("Expected:")
            expectedDF.show()
            print("Found:")
            df.show()
            raise

    def __writeDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").save(self.tempFile)

    def __overwriteDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").mode("overwrite").save(self.tempFile)

    def __createFile(self, fileName, content):
        with open(os.path.join(self.tempFile, fileName), 'w') as f:
            f.write(content)

    def __checkFileExists(self, fileName):
        return os.path.exists(os.path.join(self.tempFile, fileName))
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, col

# _we can use Spark in either local mode or cluster mode. Below is the configuration for local mode.
sc = SparkContext("local", "Hello World")
sc.setLogLevel('ERROR')

# _start a Spark session from the context
spark = SparkSession(sc)

# _renamed from `range` to avoid shadowing the Python builtin
numbers = spark.range(100).toDF('numbers')
numbers.show(5)

# _select only the values which are divisible by 3
div_by_3 = numbers.where("numbers % 3 == 0")
div_by_3.show(5)

# _read a csv file
flightdata = spark.read.option('inferSchema', 'true').option('header', 'true').csv('2015-summary.csv')
flightdata.show(5)
flightdata.printSchema()

# _sort the data in descending order
sorted_flightdata = flightdata.sort(desc('count'))
sorted_flightdata.show(5)

# _filter data using sql
flightdata.createOrReplaceTempView('flightTable')
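# _to actually filter through SQL once the temp view exists, something like the
# _following works (column names assumed from the standard 2015-summary.csv
# _flight dataset):
spark.sql("""
    SELECT DEST_COUNTRY_NAME, count
    FROM flightTable
    WHERE count > 10
    ORDER BY count DESC
""").show(5)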
# spark_context.stop()

import random
import datetime

NUM_SAMPLES = 100000000

def inside(p):
    x, y = random.random(), random.random()
    return x * x + y * y < 1

count = spark_context.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()
pi = 4 * count / NUM_SAMPLES
f"Pi is roughly {pi}"

datetime.datetime.now().isoformat()

local_pth = "/tmp/delta-table-local"

data = spark.range(20, 30)
data.write.format("delta").mode("overwrite").save(local_pth)

data2 = spark.range(5_000, 10_000)
data2.write.format("delta").mode("overwrite").save(local_pth)

df = spark.read.format("delta").load(local_pth)

spark.read.format("delta").load(local_pth).count()
spark.read.format("delta").option("versionAsOf", 0).load(local_pth).count()
spark.read.format("delta").option("versionAsOf", 1).load(local_pth).count()
spark.read.format("delta").option("versionAsOf", 2).load(local_pth).count()
spark.read.format("delta").option("versionAsOf", 3).load(local_pth).count()

df1 = spark.read.format(
    "delta"
).option(
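# (the statement above is truncated in the source; assuming it repeats the
#  versionAsOf time-travel pattern used just before, it would continue roughly as:)
#     "versionAsOf", 0
# ).load(local_pth)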
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, current_date, current_timestamp, date_add,
                                   date_sub, datediff, months_between, to_date, lit,
                                   unix_timestamp, to_timestamp)

sc = SparkContext()
spark = SparkSession(sc)

dateDF = spark.range(10)\
    .withColumn("today", current_date())\
    .withColumn("now", current_timestamp())
dateDF.createTempView("dateTable")
dateDF.printSchema()

# 1: subtract or add days
print("1")
dateDF.select(
    date_sub(col("today"), 5),
    date_add(col("today"), 5))\
    .show(1)

# 2: date difference
print("2")
dateDF\
    .withColumn("week_ago", date_sub(col("today"), 7))\
    .select(datediff(col("week_ago"), col("today")))\
    .show(1)

# 3: months between two dates
print("3")
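# Step 3 stops short in the source; a minimal sketch of months_between(), using
# two literal dates (the exact dates here are illustrative):
dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
    .select(months_between(col("start"), col("end")))\
    .show(1)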
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

conf = SparkConf().setAppName('count_connections').setMaster('local[*]')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# generate a synthetic graph by taking 1% of all
# possible connections between 1000 vertices
x = spark.range(1000)
x_1 = x.select(col('id').alias('id_1'))
x_2 = x.select(col('id').alias('id_2'))
y = (x_1
     .crossJoin(x_2)
     .filter(col('id_1') < col('id_2'))
     .sample(False, 0.01))

connections = y
connections.cache()
connections.show(20)
connections.createOrReplaceTempView('connections')

# perform analytics on the graph
connection_counts = spark.sql('''
    SELECT id, COUNT(*) AS n_connections
    FROM (
        SELECT id_1 AS id
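# (the query above is truncated in the source; a plausible completion, assuming
#  the usual "count both endpoints" idiom for undirected edges, is sketched here:)
#         FROM connections
#         UNION ALL
#         SELECT id_2 AS id
#         FROM connections
#     ) AS endpoints
#     GROUP BY id
#     ORDER BY n_connections DESC
# ''')
# connection_counts.show(20)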
# [replace] here we are using the base df and replacing a country name
replaced_df = df.withColumn(
    "Country",
    regexp_replace(col("Country"), "United States", "USA"))
# replaced_df.show()

# ------------------------------------------- CASE - WHEN ------------------------------------------------------------
# [case when] create a new column with case/when
df_with_case = df.withColumn(
    "Case_When",
    when(col("Price") < 1000, lit("Small"))
    .when(col("Price").between(1000, 1200), lit("Medium"))
    .when(col("Price").between(1201, 3600), lit("Big"))
    .when(col("Price") > 3600, lit("Huge"))
    .otherwise("Else"))
# df_with_case.show()

# ------------------------------------------- Date functions ----------------------------------------------------------
# [current date] alias belongs on the column, not on the DataFrame
df_curr_date = spark.range(1).select(current_date().alias("Current_Date"))

# [to timestamp] only two-digit "yy" in "MM/dd/yy" because the source dates have no century
df_to_timestamp = df.withColumn(
    "Account_Create",
    to_timestamp(col("Account_Created"), "MM/dd/yy HH:mm"))
# df_to_timestamp.show()

# [date difference]
df_date_diff = df_to_timestamp.select(
    datediff(current_date(), col("Account_Create")).alias("Date_Difference"))
# df_date_diff.show()
from difflib import ndiff
from time import perf_counter_ns

from pyspark import SparkContext
from pyspark.sql import SparkSession, functions


class Chapter2(object):

    def __init__(self):
        self.sc = SparkContext("local", "chapter2_examples")
        self.spark = SparkSession(self.sc)
        self.spark.conf.set("spark.sql.shuffle.partitions", "1000")
        self.flight_data_2015 = self.spark.read\
            .option("inferSchema", "true")\
            .option("header", "true")\
            .csv("/Spark-The-Definitive-Guide/data/flight-data/csv/2015-summary.csv")
        self.table_name = 'flight_data_2015'

    def exercise1(self):
        test_range = self.spark.range(100000).toDF("number")
        test_where = test_range.where(
            "number % 2 == 0 and 10 < number and number < 20")
        print(f"number of the elements: {test_where.count()}")
        test_where.show()

    def exercise2(self):
        self.flight_data_2015.show()
        print(f"take 3: {self.flight_data_2015.take(3)}")
        self.flight_data_2015_sorted = self.flight_data_2015.sort(
            "count", ascending=False)
        self.flight_data_2015_sorted.show()

    def exercise3(self):
        self.flight_data_2015.createOrReplaceTempView(self.table_name)
        # DataFrames vs SQL: both should compile to the same query plan
        sql_way = self.spark.sql(f"""
            SELECT DEST_COUNTRY_NAME, count(1)
            FROM {self.table_name}
            GROUP BY DEST_COUNTRY_NAME
        """)
        data_frame_way = self.flight_data_2015.groupBy(
            "DEST_COUNTRY_NAME").count()
        diff = (ndiff(
            sql_way._jdf.queryExecution().toString().splitlines(keepends=True),
            data_frame_way._jdf.queryExecution().toString().splitlines(
                keepends=True)))
        print(''.join(diff), end="")
        print(
            f'take 3: {self.flight_data_2015.select(functions.max("count")).take(1)}'
        )

    def exercise4(self):
        sql_start = perf_counter_ns()
        max_sql = self.spark.sql(f"""
            SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
            FROM {self.table_name}
            GROUP BY DEST_COUNTRY_NAME
            ORDER BY sum(count) DESC
            LIMIT 10
        """)
        max_sql.show()
        sql_stop = perf_counter_ns()

        df_start = perf_counter_ns()
        self.flight_data_2015\
            .groupBy("DEST_COUNTRY_NAME")\
            .sum("count")\
            .withColumnRenamed("sum(count)", "destination_total")\
            .sort(functions.desc("destination_total"))\
            .limit(10)\
            .show()
        df_stop = perf_counter_ns()
        print(
            f'df execution time: {df_stop - df_start} vs sql execution time: {sql_stop - sql_start}'
        )
def main(argv):
    # Define SparkSession
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession

    # The Console Progress Bar output only makes sense in an interactive session.
    # In a log file it interferes with program log messages.
    conf = SparkConf().set('spark.ui.showConsoleProgress', 'False')
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc).builder.getOrCreate()
    sc.setLogLevel('OFF')

    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--path', help='Directory path for test output file',
                        default=os.path.join(os.environ['DSX_PROJECT_DIR'], 'datasets'))
    parser.add_argument('-f', '--filename', help='Test output filename',
                        default='testfile')
    parser.add_argument('-e', '--extension', help='Test output filename extension',
                        default='.csv.gz', choices=['.csv', '.csv.gz'])
    parser.add_argument('-c', '--compress', help='Compression type',
                        default='gzip', choices=['gzip', 'None'])
    parser.add_argument('-n', '--noheader', help='Suppress the header row',
                        action='store_true')
    parser.add_argument('-r', '--rows', help='Number of rows in test DataFrame',
                        default=1000000, type=int)
    parser.add_argument('-l', '--loglevel', help='Verbosity of output',
                        default='INFO', choices=['DEBUG', 'INFO', 'WARN'])
    args = parser.parse_args()

    compress = args.compress if args.compress.lower() != 'none' else None
    header = not args.noheader

    if args.loglevel == 'DEBUG':
        level = logging.DEBUG
    elif args.loglevel == 'INFO':
        level = logging.INFO
    else:
        level = logging.WARN

    # Get ready for debugging
    logger = logging.getLogger('spark1csv')
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    handler.setLevel(logging.DEBUG)
    logger.setLevel(level)
    logger.addHandler(handler)
    # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

    # Generate a sample DataFrame.
    # Write a million rows populated with random numeric data. You can change the number of rows.
    from pyspark.sql.functions import rand
    rows = args.rows
    df_all = (spark.range(0, rows, numPartitions=10)
              .withColumnRenamed('id', 'ID')
              .withColumn('COLUMN_ONE',
                          (rand(seed=123) * rows + 1 * rows).cast('integer'))
              .withColumn('COLUMN_TWO',
                          (rand(seed=456) * rows + 2 * rows).cast('integer'))
              .withColumn('COLUMN_THREE',
                          (rand(seed=789) * rows + 3 * rows).cast('integer'))
              ).cache()

    # Use logging, not print, to make this part of the same stream (stderr) as output
    # from the function; otherwise output from main (stdout) and from the function
    # come out in the wrong order.
    logger.info('Number of rows:       %7d', df_all.count())
    logger.info('Number of partitions: %7d', df_all.rdd.getNumPartitions())

    outfile = os.path.join(args.path, args.filename + args.extension)

    # Experiment with different values, especially for mode.
    # First write a new file.
    if os.path.exists(outfile):
        os.remove(outfile)
    logger.info('--- File does not exist, mode %s ---', 'overwrite')
    try:
        # File does not exist, mode overwrite: should just work
        spark_write_one_csv(df_all, outfile, mode='overwrite',
                            compression=compress, header=header)
    except Exception:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise

    # File exists, mode append: should raise an exception because append is not supported
    logger.info('--- File exists, mode %s ---', 'append')
    try:
        spark_write_one_csv(df_all, outfile, mode='append',
                            compression=compress, header=header)
    except ValueError as err:
        logger.info('Expected ValueError properly caught: {}'.format(err))
    except Exception:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise

    # File exists, mode ignore: silently skips writing the file
    logger.info('--- File exists, mode %s ---', 'ignore')
    try:
        spark_write_one_csv(df_all, outfile, mode='ignore',
                            compression=compress, header=header)
    except Exception:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise

    # File exists, mode error: should raise an exception
    logger.info('--- File exists, mode %s ---', 'error')
    try:
        spark_write_one_csv(df_all, outfile, mode='error',
                            compression=compress, header=header)
    except FileExistsError as err:
        logger.info('Expected FileExistsError properly caught: {}'.format(err))
    except Exception:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise

    # File exists, mode overwrite: should just work
    logger.info('--- File exists, mode %s ---', 'overwrite')
    try:
        spark_write_one_csv(df_all, outfile, mode='overwrite',
                            compression=compress, header=header)
    except Exception:
        logger.error('Unexpected error: %s', sys.exc_info()[0])
        raise
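# spark_write_one_csv() is not defined in this file. Below is a minimal sketch
# consistent with how main() drives it (single-file CSV output; 'append' rejected
# with ValueError, 'error' with FileExistsError, 'ignore' skipped, 'overwrite'
# allowed). This is an assumed implementation, not the original:
import glob
import os
import shutil
import tempfile


def spark_write_one_csv(df, path, mode='error', compression=None, header=True):
    if mode == 'append':
        raise ValueError('append mode is not supported for single-file CSV output')
    if os.path.exists(path):
        if mode == 'ignore':
            return
        if mode == 'error':
            raise FileExistsError(path)
    # Write to a temporary directory with a single partition, then move the one
    # part file to the requested path.
    tmpdir = tempfile.mkdtemp()
    try:
        target = os.path.join(tmpdir, 'out')
        writer = df.coalesce(1).write.mode('overwrite').option('header', header)
        if compression:
            writer = writer.option('compression', compression)
        writer.csv(target)
        part = glob.glob(os.path.join(target, 'part-*'))[0]
        shutil.move(part, path)
    finally:
        shutil.rmtree(tmpdir)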
from pyspark.sql import SparkSession
from pyspark import SparkContext

sc = SparkContext()
spark = SparkSession(sc)

myRange = spark.range(1000).toDF("number")
division = myRange.where("number % 2 = 0")
count = division.count()
print(count)