Example 1
def test_get_compute_domain_with_unmeetable_row_condition(spark_session):
    pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in pd_df.to_records(index=False)
        ],
        pd_df.columns.tolist(),
    )
    expected_df = df.filter(F.col("b") > 24)

    engine = SparkDFExecutionEngine()
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"row_condition": "b > 24", "condition_parser": "spark"},
        domain_type=MetricDomainTypes.TABLE,
    )
    # Ensuring data has been properly queried
    assert data.schema == expected_df.schema
    assert data.collect() == expected_df.collect()

    # Ensuring compute kwargs have not been modified
    assert "row_condition" in compute_kwargs.keys()
    assert accessor_kwargs == {}
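
The list comprehension above converts each pandas record into a plain tuple and maps float NaN values to None so that Spark treats them as proper nulls. A minimal standalone sketch of that conversion (pandas and numpy only, no Spark session required):

import numpy as np
import pandas as pd

pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})

# pandas stores the missing value in "b" as NaN, which createDataFrame would not accept as a null
records = [
    tuple(
        None if isinstance(x, (float, int)) and np.isnan(x) else x
        for x in record.tolist()
    )
    for record in pd_df.to_records(index=False)
]

print(records)  # [(1, 2.0), (2, 3.0), (3, 4.0), (4, None)]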
Example 2
def test_split_on_multi_column_values_and_sample_using_random(test_sparkdf):
    returned_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_multi_column_values",
            splitter_kwargs={
                "column_names": ["y", "m", "d"],
                "partition_definition": {
                    "y": 2020,
                    "m": 1,
                    "d": 5,
                },
            },
            sampling_method="_sample_using_random",
            sampling_kwargs={
                "p": 0.5,
            },
        )
    )

    # The test dataframe contains 10 columns and 120 rows.
    assert len(returned_df.columns) == 10
    # The partition selected by "partition_definition" above contains 4 rows (before sampling).
    assert 0 <= returned_df.count() <= 4
    # The sampling probability "p" used in "SparkDFExecutionEngine._sample_using_random()" is 0.5 (the equivalent of a
    # fair coin with a 50% chance of coming up as "heads").  Hence, on average we should get 50% of the rows, which is
    # 2; however, for such a small sample (of 4 rows), the number of rows returned by an individual run can deviate from
    # this average.  Still, in the majority of trials, the number of rows should be no fewer than 2 and no greater than
    # 3 (a quick probability check follows this example).  The assertion in the next line, supporting this reasoning, is
    # commented out to ensure zero failures.  Developers are encouraged to uncomment it whenever the
    # "_sample_using_random" feature is the main focus of a given effort.
    # assert 2 <= returned_df.count() <= 3

    for val in returned_df.collect():
        assert val.date == datetime.date(2020, 1, 5)
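
The commented-out assertion relies on the fact that, for a Binomial(4, 0.5) draw, counts of 2 or 3 cover the majority of outcomes. A quick check of that arithmetic with the standard library (a sketch, not part of the test suite):

from math import comb

n, p = 4, 0.5
probability = sum(comb(n, k) * p**k * (1 - p) ** (n - k) for k in (2, 3))
print(probability)  # 0.625, i.e. roughly 5 out of every 8 runs land in the [2, 3] range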
Example 3
def test_get_batch_with_split_on_whole_table_s3(spark_session):
    def mocked_get_reader_function(*args, **kwargs):
        def mocked_reader_function(*args, **kwargs):
            pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
            df = spark_session.createDataFrame(
                [
                    tuple(
                        None if isinstance(x, (float, int)) and np.isnan(x) else x
                        for x in record.tolist()
                    )
                    for record in pd_df.to_records(index=False)
                ],
                pd_df.columns.tolist(),
            )
            return df

        return mocked_reader_function

    spark_engine = SparkDFExecutionEngine()
    spark_engine._get_reader_fn = mocked_get_reader_function

    test_sparkdf = spark_engine.get_batch_data(
        S3BatchSpec(
            s3="s3://bucket/test/test.csv",
            reader_method="csv",
            reader_options={"header": True},
            splitter_method="_split_on_whole_table",
        )
    )
    assert test_sparkdf.count() == 4
    assert len(test_sparkdf.columns) == 2
Example 4
def test_get_batch_with_split_on_whole_table(test_sparkdf):
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf, splitter_method="_split_on_whole_table"
        )
    )
    assert test_sparkdf.count() == 120
    assert len(test_sparkdf.columns) == 10
Example 5
def test_reader_fn(spark_session):
    engine = SparkDFExecutionEngine()
    # Testing that the engine can recognize a basic csv file
    fn = engine._get_reader_fn(reader=spark_session.read, path="myfile.csv")
    assert "<bound method DataFrameReader.csv" in str(fn)

    # Ensuring that the other way around works as well - reader_method should always override the path
    fn_new = engine._get_reader_fn(reader=spark_session.read, reader_method="csv")
    assert "<bound method DataFrameReader.csv" in str(fn_new)
Example 6
def test_get_batch_with_split_on_whole_table_filesystem(
    test_folder_connection_path_csv,
):
    # reader_method is not configured because Spark will configure its own reader by default
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            splitter_method="_split_on_whole_table",
        ))
    assert test_sparkdf.count() == 6
    assert len(test_sparkdf.columns) == 2
Example 7
def test_sample_using_random(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(batch_data=test_sparkdf,
                             sampling_method="_sample_using_random"))
    # The test dataframe contains 10 columns and 120 rows.
    assert len(sampled_df.columns) == 10
    assert 0 <= sampled_df.count() <= 120
    # The sampling probability "p" used in "SparkDFExecutionEngine._sample_using_random()" is 0.1 (the equivalent of an
    # unfair coin with a 10% chance of coming up as "heads").  Hence, we should practically never get as many as 20% of
    # the rows (a rough probability check follows this example).
    assert sampled_df.count() < 25
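
With p = 0.1 over 120 rows, the expected sample size is 12, and reaching the 25-row cutoff used in the assertion is extremely unlikely. A rough standard-library check of that tail probability (a sketch, not part of the test):

from math import comb

n, p = 120, 0.1
# Probability of keeping 25 or more of the 120 rows when each row is kept with probability 0.1
tail = sum(comb(n, k) * p**k * (1 - p) ** (n - k) for k in range(25, n + 1))
print(tail)  # well below 0.1%, so the "< 25" assertion is safe in practice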
Example 8
def test_get_batch_empty_splitter_parquet(test_folder_connection_path_parquet):
    # Note: reader_method and reader_options are not needed, because
    # SparkDFExecutionEngine automatically determines the file type as well as the schema of the Parquet file.
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_parquet, "test.parquet"),
            splitter_method=None,
        )
    )
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
Example 9
def test_get_batch_empty_splitter(test_folder_connection_path_csv):
    # reader_method is not configured because Spark will configure its own reader by default
    # reader_options are needed to specify that the first line of the test file is the header
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            reader_options={"header": True},
            splitter_method=None,
        ))
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
Example 10
def test_dataframe_property_given_loaded_batch(spark_session):

    engine = SparkDFExecutionEngine()

    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    df = spark_session.createDataFrame(df)

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # Ensuring data is not distorted
    assert engine.dataframe == df
Example 11
def test_sample_using_a_list(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            sampling_method="_sample_using_a_list",
            sampling_kwargs={
                "column_name": "id",
                "value_list": [3, 5, 7, 11],
            },
        ))
    assert sampled_df.count() == 4
    assert len(sampled_df.columns) == 10
Example 12
def test_get_batch_empty_splitter_tsv(test_folder_connection_path_tsv):
    # reader_method is not configured because Spark will configure its own reader by default
    # reader_options are needed to specify that the first line of the test file is the header
    # reader_options are also needed to specify the separator (otherwise, a comma will be used as the default separator)
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_tsv, "test.tsv"),
            reader_options={"header": True, "sep": "\t"},
            splitter_method=None,
        )
    )
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2
Example 13
def test_sample_using_mod(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            sampling_method="_sample_using_mod",
            sampling_kwargs={
                "column_name": "id",
                "mod": 5,
                "value": 4,
            },
        ))
    assert sampled_df.count() == 24
    assert len(sampled_df.columns) == 10
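
The expected count of 24 follows directly from the modulus arithmetic, assuming the fixture's "id" column holds the values 0 through 119 (an assumption inferred from the 120-row test dataframe, not stated here):

# ids whose remainder modulo 5 equals 4: 4, 9, 14, ..., 119
matching_ids = [i for i in range(120) if i % 5 == 4]
assert len(matching_ids) == 24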
Example 14
def test_get_batch_with_split_on_converted_datetime(test_sparkdf):
    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_converted_datetime",
            splitter_kwargs={
                "column_name": "timestamp",
                "partition_definition": {"timestamp": "2020-01-03"},
            },
        )
    )
    assert split_df.count() == 2
    assert len(split_df.columns) == 10
Example 15
def test_dataframe_property_given_loaded_batch():
    from pyspark.sql import SparkSession

    engine = SparkDFExecutionEngine()

    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(df)

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # Ensuring data is not distorted
    assert engine.dataframe == df
Example 16
def _build_spark_engine(df, spark_session):
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in df.to_records(index=False)
        ],
        df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine()
    engine.load_batch_data("my_id", SparkDFBatchData(engine, df))
    return engine
Example 17
def test_get_batch_with_split_on_hashed_column(test_sparkdf):
    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_hashed_column",
            splitter_kwargs={
                "column_name": "favorite_color",
                "hash_digits": 1,
                "hash_function_name": "sha256",
                "partition_definition": {
                    "hash_value": "a",
                },
            },
        ))
    assert split_df.count() == 8
    assert len(split_df.columns) == 10
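
A minimal sketch of the kind of matching this splitter performs, assuming (this is an assumption about the implementation, not confirmed by this example) that a row is kept when the trailing hash_digits characters of the column value's hexdigest equal the requested hash_value:

import hashlib

def matches_partition(value, hash_value="a", hash_digits=1, hash_function_name="sha256"):
    # Hypothetical helper: hash the stringified value and compare the trailing hex digits
    hash_fn = getattr(hashlib, hash_function_name)
    return hash_fn(str(value).encode("utf-8")).hexdigest()[-hash_digits:] == hash_value

print(matches_partition("red"))  # True or False, depending on the sha256 digest of "red"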
Example 18
    def _spark(
        cls,
        execution_engine: SparkDFExecutionEngine,
        metric_domain_kwargs: dict,
        metric_value_kwargs: dict,
        metrics: Dict[str, Any],
        runtime_configuration: dict,
    ) -> List[pyspark_sql_Row]:
        query: Optional[str] = metric_value_kwargs.get(
            "query"
        ) or cls.default_kwarg_values.get("query")

        df: pyspark_sql_DataFrame
        df, _, _ = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
        )

        df.createOrReplaceTempView("tmp_view")
        column: str = metric_value_kwargs.get("column")
        query = query.format(col=column, active_batch="tmp_view")

        engine: pyspark_sql_SparkSession = execution_engine.spark
        result: List[pyspark_sql_Row] = engine.sql(query).collect()

        return result
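
The query template is expanded with str.format against the named placeholders {col} and {active_batch} before being handed to spark.sql. A self-contained illustration of that substitution using a hypothetical template (the string below is an assumption, not the class's actual default_kwarg_values):

# Hypothetical template; the real default lives in the metric's default_kwarg_values
query_template = "SELECT {col} FROM {active_batch} WHERE {col} IS NOT NULL"

rendered = query_template.format(col="passenger_count", active_batch="tmp_view")
print(rendered)  # SELECT passenger_count FROM tmp_view WHERE passenger_count IS NOT NULL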
Example 19
def test_get_aggregate_count_aware_metric_dependencies(spark_session):
    mp = ColumnValuesNonNull()
    metric = MetricConfiguration(
        "column_values.nonnull.unexpected_count", dict(), dict()
    )
    dependencies = mp.get_evaluation_dependencies(
        metric, execution_engine=PandasExecutionEngine()
    )
    assert (
        dependencies["unexpected_condition"].id[0] == "column_values.nonnull.condition"
    )

    metric = MetricConfiguration(
        "column_values.nonnull.unexpected_count", dict(), dict()
    )
    dependencies = mp.get_evaluation_dependencies(
        metric, execution_engine=SparkDFExecutionEngine()
    )
    assert (
        dependencies["metric_partial_fn"].id[0]
        == "column_values.nonnull.unexpected_count.aggregate_fn"
    )

    metric = MetricConfiguration(
        "column_values.nonnull.unexpected_count.aggregate_fn", dict(), dict()
    )
    dependencies = mp.get_evaluation_dependencies(metric)
    assert (
        dependencies["unexpected_condition"].id[0] == "column_values.nonnull.condition"
    )
Example 20
def test_for_self_check_using_InferredAssetFilesystemDataConnector_SparkDFExecutionEngine(
    spark_session, tmp_path_factory
):
    base_directory = str(
        tmp_path_factory.mktemp(
            "basic_data_connector_inferred_asset_filesystem_data_connector"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20201010_1000.csv",
            "abe_202011111_2000.csv",
            "will_20201212_3000.csv",
        ],
    )
    my_data_connector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        base_directory=base_directory,
        glob_directive="*.csv",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=SparkDFExecutionEngine(),
        default_regex={
            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
            "group_names": ["data_asset_name", "timestamp", "size"],
        },
    )
    self_check_results = my_data_connector.self_check()
    assert self_check_results["data_asset_count"] == 3
    assert self_check_results["example_data_reference"]["n_rows"] == 3
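
The default_regex above maps each file name to a data asset name plus timestamp and size groups. A quick standard-library check of how the pattern decomposes one of the generated file names (this snippet exercises only the regex, not the data connector):

import re

pattern = re.compile(r"(.+)_(\d+)_(\d+)\.csv")
match = pattern.match("alex_20201010_1000.csv")
print(match.groups())  # ('alex', '20201010', '1000')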
Example 21
def test_sample_using_md5(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            sampling_method="_sample_using_hash",
            sampling_kwargs={
                "column_name": "date",
                "hash_function_name": "md5",
            },
        )
    )
    assert sampled_df.count() == 10
    assert len(sampled_df.columns) == 10

    collected = sampled_df.collect()
    for val in collected:
        assert val.date in [datetime.date(2020, 1, 15), datetime.date(2020, 1, 29)]
Example 22
def test_get_batch_with_split_on_divided_integer(test_sparkdf):
    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_divided_integer",
            splitter_kwargs={
                "column_name": "id",
                "divisor": 10,
                "partition_definition": {"id": 5},
            },
        )
    )
    assert split_df.count() == 10
    assert len(split_df.columns) == 10
    max_result = split_df.select([F.max("id")])
    assert max_result.collect()[0]["max(id)"] == 59
    min_result = split_df.select([F.min("id")])
    assert min_result.collect()[0]["min(id)"] == 50
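
The assertions on count, min, and max follow from integer division, again assuming the fixture's "id" column runs from 0 to 119 (an assumption inferred from the 120-row test dataframe):

# ids whose integer quotient by 10 equals 5, i.e. the partition {"id": 5}
bucket = [i for i in range(120) if i // 10 == 5]
assert bucket == list(range(50, 60))  # 10 rows, min 50, max 59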
Example 23
def test_get_compute_domain_with_column_domain(spark_session):
    pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    df = spark_session.createDataFrame(
        [
            tuple(None if isinstance(x, (float, int)) and np.isnan(x) else x
                  for x in record.tolist())
            for record in pd_df.to_records(index=False)
        ],
        pd_df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine()
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.COLUMN)
    assert compute_kwargs is not None, "Compute domain kwargs should exist"
    assert accessor_kwargs == {"column": "a"}
    assert data.schema == df.schema
    assert data.collect() == df.collect()
Example 24
def test_get_batch_with_split_on_multi_column_values(test_sparkdf):
    split_df = (
        SparkDFExecutionEngine()
        .get_batch_data(
            RuntimeDataBatchSpec(
                batch_data=test_sparkdf,
                splitter_method="_split_on_multi_column_values",
                splitter_kwargs={
                    "column_names": ["y", "m", "d"],
                    "partition_definition": {
                        "y": 2020,
                        "m": 1,
                        "d": 5,
                    },
                },
            )
        )
        .dataframe
    )
    assert split_df.count() == 4
    assert len(split_df.columns) == 10
    collected = split_df.collect()
    for val in collected:
        assert val.date == datetime.date(2020, 1, 5)

    with pytest.raises(ValueError):
        split_df = (
            SparkDFExecutionEngine()
            .get_batch_data(
                RuntimeDataBatchSpec(
                    batch_data=test_sparkdf,
                    splitter_method="_split_on_multi_column_values",
                    splitter_kwargs={
                        "column_names": ["I", "dont", "exist"],
                        "partition_definition": {
                            "y": 2020,
                            "m": 1,
                            "d": 5,
                        },
                    },
                )
            )
            .dataframe
        )
Example 25
    def _spark(
        cls,
        execution_engine: SparkDFExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        min_value = metric_value_kwargs.get("min_value")
        max_value = metric_value_kwargs.get("max_value")
        strict_min = metric_value_kwargs.get("strict_min")
        strict_max = metric_value_kwargs.get("strict_max")
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")

        (
            df,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = df[accessor_domain_kwargs["column"]]

        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")

        if min_value is None:
            if strict_max:
                condition = column < max_value
            else:
                condition = column <= max_value

        elif max_value is None:
            if strict_min:
                condition = column > min_value
            else:
                condition = column >= min_value

        else:
            if strict_min and strict_max:
                condition = (column > min_value) & (column < max_value)
            elif strict_min:
                condition = (column > min_value) & (column <= max_value)
            elif strict_max:
                condition = (column >= min_value) & (column < max_value)
            else:
                condition = (column >= min_value) & (column <= max_value)

        return df.filter(condition).count()
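
The branching above merely selects between inclusive and exclusive bounds. A compact pandas sketch of the same boundary logic for one concrete case (pandas stands in for the Spark column here; this is an illustration, not the metric itself):

import pandas as pd

column = pd.Series([1, 5, 22, 3, 5, 10])
min_value, max_value = 3, 10

inclusive = (column >= min_value) & (column <= max_value)   # strict_min=False, strict_max=False
exclusive = (column > min_value) & (column < max_value)     # strict_min=True, strict_max=True

print(int(inclusive.sum()), int(exclusive.sum()))  # 4 inclusive matches vs. 2 exclusive matches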
Example 26
def test_get_batch_empty_sampler(test_sparkdf):
    sampled_df = (
        SparkDFExecutionEngine()
        .get_batch_data(
            RuntimeDataBatchSpec(batch_data=test_sparkdf, sampling_method=None)
        )
        .dataframe
    )
    assert sampled_df.count() == 120
    assert len(sampled_df.columns) == 10
Example 27
def test_get_batch_data(test_sparkdf):
    test_sparkdf = (
        SparkDFExecutionEngine()
        .get_batch_data(
            RuntimeDataBatchSpec(batch_data=test_sparkdf, data_asset_name="DATA_ASSET")
        )
        .dataframe
    )
    assert test_sparkdf.count() == 120
    assert len(test_sparkdf.columns) == 10
Example 28
def _build_spark_engine(spark_session, df):
    df = spark_session.createDataFrame(
        [
            tuple(None if isinstance(x, (float, int)) and np.isnan(x) else x
                  for x in record.tolist())
            for record in df.to_records(index=False)
        ],
        df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine(batch_data_dict={"temp_id": df})
    return engine
Example 29
def test_sample_using_md5_wrong_hash_function_name(test_sparkdf):
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        sampled_df = SparkDFExecutionEngine().get_batch_data(
            RuntimeDataBatchSpec(
                batch_data=test_sparkdf,
                sampling_method="_sample_using_hash",
                sampling_kwargs={
                    "column_name": "date",
                    "hash_function_name": "I_wont_work",
                },
            ))
Example 30
    def _spark(
        cls,
        execution_engine: SparkDFExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        df, _, _ = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
        )
        return _get_spark_column_metadata(
            df.schema, include_nested=metric_value_kwargs["include_nested"]
        )