import pandas as pd
from unittest import mock

from great_expectations.dataset import SparkDFDataset  # legacy Great Expectations dataset API


def test_sparkdfdataset_persist(spark_session):
    df = pd.DataFrame({"a": [1, 2, 3]})
    sdf = spark_session.createDataFrame(df)
    sdf.persist = mock.MagicMock()
    _ = SparkDFDataset(sdf, persist=True)
    sdf.persist.assert_called_once()

    sdf = spark_session.createDataFrame(df)
    sdf.persist = mock.MagicMock()
    _ = SparkDFDataset(sdf, persist=False)
    sdf.persist.assert_not_called()

    sdf = spark_session.createDataFrame(df)
    sdf.persist = mock.MagicMock()
    _ = SparkDFDataset(sdf)
    sdf.persist.assert_called_once()
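For reference, a minimal non-mocked sketch of the behaviour this test asserts. An active SparkSession named spark is assumed; it is not part of the snippet above.

# Sketch only: persist defaults to True, so the wrapped DataFrame is cached
# unless persist=False is passed explicitly. `spark` is an assumed SparkSession.
sdf = spark.createDataFrame([(1,), (2,), (3,)], ["a"])
SparkDFDataset(sdf)                 # calls sdf.persist() (default persist=True)
SparkDFDataset(sdf, persist=True)   # calls sdf.persist()
SparkDFDataset(sdf, persist=False)  # leaves the DataFrame uncached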


def test_dataframe(spark_session):
    from pyspark.sql.types import IntegerType, StringType, StructField, StructType

    schema = StructType([
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField(
            "address",
            StructType([
                StructField("street", StringType(), True),
                StructField("city", StringType(), True),
                StructField("house_number", IntegerType(), True),
            ]),
            False,
        ),
        StructField("name_duplicate", StringType(), True),
        StructField("non.nested", StringType(), True),
    ])
    rows = [
        ("Alice", 1, ("Street 1", "Alabama", 10), "Alice", "a"),
        ("Bob", 2, ("Street 2", "Brooklyn", 11), "Bob", "b"),
        ("Charlie", 3, ("Street 3", "Alabama", 12), "Charlie", "c"),
    ]

    rdd = spark_session.sparkContext.parallelize(rows)

    df = spark_session.createDataFrame(rdd, schema)
    return SparkDFDataset(df, persist=True)
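A hedged follow-on sketch of exercising the nested-schema dataset built above, assuming the function is invoked directly with a live spark_session; the expectation names come from the standard Great Expectations Dataset API.

# Illustrative usage of the dataset defined above (sketch only).
dataset = test_dataframe(spark_session)
assert dataset.expect_column_to_exist("name").success
assert dataset.expect_column_values_to_be_unique("age").success
assert dataset.expect_column_values_to_be_between("age", min_value=1, max_value=3).success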
Example #3
    def _get_data_asset(self, batch_kwargs, expectation_suite, caching=True, **kwargs):
        """class-private implementation of get_data_asset"""
        if self.spark is None:
            logger.error("No spark session available")
            return None

        batch_kwargs.update(kwargs)
        reader_options = batch_kwargs.copy()
        if "path" in batch_kwargs:
            path = reader_options.pop("path")  # We remove this so it is not used as a reader option
            reader_options.pop("timestamp", "")    # ditto timestamp (but missing ok)
            reader_method = reader_options.pop("reader_method", None)
            if reader_method is None:
                reader_method = self._guess_reader_method_from_path(path)
                if reader_method is None:
                    raise BatchKwargsError("Unable to determine reader for path: %s" % path, batch_kwargs)
            else:
                try:
                    reader_method = ReaderMethods[reader_method]
                except KeyError:
                    raise BatchKwargsError("Unknown reader method: %s" % reader_method, batch_kwargs)

            reader = self.spark.read

            for option in reader_options.items():
                reader = reader.option(*option)

            if reader_method == ReaderMethods.CSV:
                df = reader.csv(path)
            elif reader_method == ReaderMethods.parquet:
                df = reader.parquet(path)
            else:
                raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)
            
        elif "query" in batch_kwargs:
            df = self.spark.sql(batch_kwargs["query"])

        elif "df" in batch_kwargs and isinstance(batch_kwargs["df"], (DataFrame, SparkDFDataset)):
            df = batch_kwargs.pop("df")  # We don't want to store the actual DataFrame in kwargs
            if isinstance(df, SparkDFDataset):
                # Grab just the spark_df reference, since we want to override everything else
                df = df.spark_df
            batch_kwargs["SparkDFRef"] = True
        else:
            raise BatchKwargsError("Unrecognized batch_kwargs for spark_source", batch_kwargs)

        return SparkDFDataset(df,
                              expectation_suite=expectation_suite,
                              data_context=self._data_context,
                              batch_kwargs=batch_kwargs,
                              caching=caching)
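To make the three branches above concrete, a hypothetical call sketch: the datasource instance source, the expectation suite suite, and the DataFrame existing_df are all assumed names, not defined in the snippet.

# Hypothetical calls covering the three batch_kwargs branches handled above.
asset = source._get_data_asset({"path": "/data/events.csv"}, expectation_suite=suite)       # reader guessed from the file extension
asset = source._get_data_asset({"query": "SELECT * FROM events"}, expectation_suite=suite)  # run Spark SQL
asset = source._get_data_asset({"df": existing_df}, expectation_suite=suite)                # wrap an existing DataFrame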
Example #4
    def _get_data_asset(self, batch_kwargs, expectation_suite, caching=True, **kwargs):
        """class-private implementation of get_data_asset"""
        if self.spark is None:
            logger.error("No spark session available")
            return None

        batch_kwargs.update(kwargs)
        reader_options = batch_kwargs.copy()
        if "path" in batch_kwargs:
            path = reader_options.pop("path")  # We remove this so it is not used as a reader option
            reader_options.pop("timestamp", "")    # ditto timestamp (but missing ok)
            reader_method = reader_options.pop("reader_method", None)
            if reader_method is None:
                reader_method = self._guess_reader_method_from_path(path)
                if reader_method is None:
                    raise BatchKwargsError("Unable to determine reader for path: %s" % path, batch_kwargs)
            else:
                try:
                    reader_method = ReaderMethods[reader_method]
                except KeyError:
                    raise BatchKwargsError("Unknown reader method: %s" % reader_method, batch_kwargs)

            reader = self.spark.read

            for option in reader_options.items():
                reader = reader.option(*option)

            if reader_method == ReaderMethods.CSV:
                df = reader.csv(path)
            elif reader_method == ReaderMethods.parquet:
                df = reader.parquet(path)
            else:
                raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)
            
        elif "query" in batch_kwargs:
            df = self.spark.sql(batch_kwargs.query)

        return SparkDFDataset(df,
                              expectation_suite=expectation_suite,
                              data_context=self._data_context,
                              batch_kwargs=batch_kwargs,
                              caching=caching)


def test_expect_column_values_to_be_json_parseable(spark_session):
    d1 = json.dumps({"i": [1, 2, 3], "j": 35, "k": {"x": "five", "y": 5, "z": "101"}})
    d2 = json.dumps({"i": 1, "j": 2, "k": [3, 4, 5]})
    d3 = json.dumps({"i": "a", "j": "b", "k": "c"})
    d4 = json.dumps(
        {"i": [4, 5], "j": [6, 7], "k": [8, 9], "l": {4: "x", 5: "y", 6: "z"}}
    )
    inner = {
        "json_col": [d1, d2, d3, d4],
        "not_json": [4, 5, 6, 7],
        "py_dict": [
            {"a": 1, "out": 1},
            {"b": 2, "out": 4},
            {"c": 3, "out": 9},
            {"d": 4, "out": 16},
        ],
        "most": [d1, d2, d3, "d4"],
    }

    data_reshaped = list(zip(*[v for _, v in inner.items()]))
    df = spark_session.createDataFrame(
        data_reshaped, ["json_col", "not_json", "py_dict", "most"]
    )
    D = SparkDFDataset(df)
    D.set_default_expectation_argument("result_format", "COMPLETE")

    T = [
        {
            "in": {"column": "json_col"},
            "out": {
                "success": True,
                "unexpected_list": [],
            },
        },
        {
            "in": {"column": "not_json"},
            "out": {
                "success": False,
                "unexpected_list": [4, 5, 6, 7],
            },
        },
        {
            "in": {"column": "py_dict"},
            "out": {
                "success": False,
                "unexpected_list": [
                    {"a": 1, "out": 1},
                    {"b": 2, "out": 4},
                    {"c": 3, "out": 9},
                    {"d": 4, "out": 16},
                ],
            },
        },
        {
            "in": {"column": "most"},
            "out": {
                "success": False,
                "unexpected_list": ["d4"],
            },
        },
        {
            "in": {"column": "most", "mostly": 0.75},
            "out": {
                "success": True,
                "unexpected_index_list": [3],
                "unexpected_list": ["d4"],
            },
        },
    ]

    for t in T:
        out = D.expect_column_values_to_be_json_parseable(**t["in"])
        assert t["out"]["success"] == out.success
        assert t["out"]["unexpected_list"] == out.result["unexpected_list"]