def test_histogram_others(self, spark_session, meta_conf):
    """Top-50 string histogram: only the 49 most frequent values keep their
    own bucket; everything else is folded into a single "_others" bucket."""
    # Build 1 copy of "str-1", 2 copies of "str-2", ..., 100 copies of "str-100".
    strings = []
    for i in range(1, 101):
        value = "str-{}".format(i)  # renamed from `str`, which shadowed the builtin
        strings.extend([value] * i)
    strings = [(s,) for s in strings]
    df = spark_session.createDataFrame(strings, ["string_column"])
    value_meta = SparkDataFrameValueType().get_value_meta(df, meta_conf)

    histogram = value_meta.histograms["string_column"]
    # 49 individual buckets + 1 "_others" bucket == 50 entries, most frequent first.
    assert len(histogram[0]) == 50 and len(histogram[1]) == 50
    assert histogram[0][0] == 100 and histogram[1][0] == "str-100"
    assert histogram[0][10] == 90 and histogram[1][10] == "str-90"
    assert histogram[0][-2] == 52 and histogram[1][-2] == "str-52"
    # Values with frequencies 1..51 are aggregated into "_others".
    assert histogram[0][-1] == sum(range(1, 52)) and histogram[1][-1] == "_others"

    stats = value_meta.descriptive_stats["string_column"]
    # Total row count is 1 + 2 + ... + 100 == 5050; histogram counts must cover it.
    assert stats["count"] == 5050 == sum(histogram[0])
    assert stats["non-null"] == 5050
    assert stats["null-count"] == 0
    assert stats["distinct"] == 100
    assert stats["type"] == "string"
def test_float_column(self, spark_session, meta_conf, numbers):
    """A float column gets a numerical histogram/stats and reports the
    Spark "double" type."""
    rows = [(float(n),) if n else (None,) for n in numbers]
    df = spark_session.createDataFrame(rows, ["numerical_column"])

    value_meta = SparkDataFrameValueType().get_value_meta(df, meta_conf)
    self.validate_numerical_histogram_and_stats(value_meta, "numerical_column")

    stats = value_meta.descriptive_stats["numerical_column"]
    assert stats["type"] == "double"
def test_null_str_column(self, spark_session, meta_conf):
    """An all-null string column produces a single histogram bucket
    counting the None values."""
    column_name = "null_column"
    schema = StructType([StructField(column_name, StringType(), True)])
    rows = [(None,)] * 20
    null_df = spark_session.createDataFrame(rows, schema=schema)

    value_meta = SparkDataFrameValueType().get_value_meta(null_df, meta_conf)

    # One bucket: 20 occurrences of None.
    assert value_meta.histograms[column_name] == ([20], [None])
    stats = value_meta.descriptive_stats[column_name]
    assert stats["type"] == "string"
def test_complex_column(self, spark_session, meta_conf, numbers):
    """Columns with complex (array) types are skipped: only the plain
    numerical column gets a histogram and descriptive stats."""
    # renamed from `complex`, which shadowed the builtin
    rows = [(n, [str(n), str(n + 1)]) if n else [None] * 2 for n in numbers]
    df = spark_session.createDataFrame(rows, ["numerical_column", "complex_column"])

    value_meta = SparkDataFrameValueType().get_value_meta(df, meta_conf)

    # The array column must not appear in either result mapping.
    assert list(value_meta.histograms.keys()) == ["numerical_column"]
    assert list(value_meta.descriptive_stats.keys()) == ["numerical_column"]
    self.validate_numerical_histogram_and_stats(value_meta, "numerical_column")
def test_multiple_columns(self, spark_session, meta_conf, numbers):
    """Histograms/stats are computed per column, and two identical string
    columns produce identical histograms."""
    rows = [
        (n, float(n), str(n), str(n)) if n else [None] * 4
        for n in numbers
    ]
    df = spark_session.createDataFrame(rows, ["ints", "floats", "str1", "str2"])

    value_meta = SparkDataFrameValueType().get_value_meta(df, meta_conf)

    for numeric_column in ("ints", "floats"):
        self.validate_numerical_histogram_and_stats(value_meta, numeric_column)

    first_histogram = value_meta.histograms["str1"]
    second_histogram = value_meta.histograms["str2"]
    assert first_histogram[0] == [4, 3, 2, 1]
    assert first_histogram[1] == ["1", "5", None, "3"]
    assert first_histogram == second_histogram
def test_boolean_histogram(self, spark_session, meta_conf):
    """Boolean histogram buckets are ordered by descending frequency and
    include a None bucket for nulls."""
    flags = [True] * 10 + [None] * 10 + [False] * 20 + [True] * 20
    boolean_df = spark_session.createDataFrame(
        [(flag,) for flag in flags], ["boolean_column"]
    )

    value_meta = SparkDataFrameValueType().get_value_meta(boolean_df, meta_conf)

    counts, values = value_meta.histograms["boolean_column"]
    assert counts == [30, 20, 10]
    assert values == [True, False, None]

    stats = value_meta.descriptive_stats["boolean_column"]
    assert stats["count"] == 60
    assert stats["type"] == "boolean"