def test_column_pairs_greater_metric_pd():
    """Resolve the a>b column-pair condition metric on a pandas batch."""
    frame = pd.DataFrame(
        {
            "a": [2, 3, 4, None, 3, None],
            "b": [1, 2, 3, None, 3, 5],
        }
    )
    pd_engine = PandasExecutionEngine(batch_data_dict={"my_id": frame})
    condition_metric = MetricConfiguration(
        metric_name="column_pair_values.a_greater_than_b.condition",
        metric_domain_kwargs={"column_A": "a", "column_B": "b"},
        metric_value_kwargs={
            "or_equal": True,
            "ignore_row_if": "either_value_is_missing",
        },
    )
    resolved = pd_engine.resolve_metrics(metrics_to_resolve=(condition_metric,))
    # Rows where either column is missing are dropped before the comparison,
    # leaving four rows that all satisfy a >= b.
    observed = resolved[condition_metric.id][0].reset_index(drop=True)
    assert observed.equals(pd.Series([True, True, True, True]))
# ---- Example 2 ----
def test_get_batch_with_split_on_converted_datetime(test_df):
    """Splitting on a converted datetime keeps only rows for the given date."""
    spec = RuntimeDataBatchSpec(
        batch_data=test_df,
        splitter_method="_split_on_converted_datetime",
        splitter_kwargs={
            "column_name": "timestamp",
            "batch_identifiers": {"timestamp": "2020-01-30"},
        },
    )
    batch = PandasExecutionEngine().get_batch_data(spec)
    # Three rows of the fixture fall on 2020-01-30.
    assert batch.dataframe.shape == (3, 10)
# ---- Example 3 ----
def test_sample_using_mod(test_df):
    """Sampling rows where id % 5 == 4 yields the expected row count."""
    spec = RuntimeDataBatchSpec(
        batch_data=test_df,
        sampling_method="_sample_using_mod",
        sampling_kwargs={
            "column_name": "id",
            "mod": 5,
            "value": 4,
        },
    )
    sampled = PandasExecutionEngine().get_batch_data(spec)
    assert sampled.dataframe.shape == (24, 10)
# ---- Example 4 ----
def test_return_all_batch_definitions_too_many_sorters(
        mock_gcs_conn, mock_list_keys, mock_emit,
        empty_data_context_stats_enabled):
    """A sorter configured for a group name the regex does not capture is rejected.

    The regex pattern below captures only `name`, but sorters are also
    declared for `timestamp` and `price`, so instantiating the connector
    must raise DataConnectorError.
    """
    my_data_connector_yaml = yaml.load(
        f"""
       class_name: ConfiguredAssetGCSDataConnector
       datasource_name: test_environment
       bucket_or_name: my_bucket
       prefix: ""
       assets:
           TestFiles:
       default_regex:
           pattern: (.+)_.+_.+\\.csv
           group_names:
               - name
       sorters:
           - orderby: asc
             class_name: LexicographicSorter
             name: name
           - datetime_format: "%Y%m%d"
             orderby: desc
             class_name: DateTimeSorter
             name: timestamp
           - orderby: desc
             class_name: NumericSorter
             name: price
   """, )

    # Fake GCS listing; every file name matches the `(.+)_.+_.+\.csv` pattern.
    mock_list_keys.return_value = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]

    # Raises error due to a non-existent sorter being specified in `FilePathDataConnector._validate_sorters_configuration()`
    with pytest.raises(ge_exceptions.DataConnectorError):
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_gcs_data_connector",
                "execution_engine": PandasExecutionEngine(),
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
def test_add_column_row_condition_with_unsupported_conditions():
    """Unsupported inputs to add_column_row_condition raise the right errors."""
    engine = PandasExecutionEngine()

    # The base implementation cannot filter NaN values.
    with pytest.raises(GreatExpectationsError):
        engine.add_column_row_condition({}, "a", filter_nan=True)

    # A pre-existing row condition must not be overwritten.
    with pytest.raises(GreatExpectationsError):
        engine.add_column_row_condition(
            {"column": "a", "row_condition": "col(a) == 2"}
        )

    # A column must be supplied either positionally or in the domain kwargs.
    with pytest.raises(AssertionError):
        engine.add_column_row_condition({})
def test_add_column_row_condition():
    """add_column_row_condition injects a notnull row condition by default."""
    e = PandasExecutionEngine()

    # Passing the column positionally adds a notnull condition.
    new_domain_kwargs = e.add_column_row_condition({}, "a")
    assert new_domain_kwargs == {
        "condition_parser": "great_expectations__experimental__",
        "row_condition": 'col("a").notnull()',
    }

    # The column may equivalently come from the domain kwargs.
    new_domain_kwargs = e.add_column_row_condition({"column": "a"})
    assert new_domain_kwargs == {
        "column": "a",
        "condition_parser": "great_expectations__experimental__",
        "row_condition": 'col("a").notnull()',
    }

    # An explicit row_condition of None is replaced by the notnull condition.
    # BUG FIX: the original expected dict listed "row_condition" twice
    # (first None, then the notnull string); the duplicate None entry was
    # dead code because the later key silently won. It is removed here.
    new_domain_kwargs = e.add_column_row_condition(
        {"column": "a", "row_condition": None}
    )
    assert new_domain_kwargs == {
        "column": "a",
        "condition_parser": "great_expectations__experimental__",
        "row_condition": 'col("a").notnull()',
    }

    # Identity case: with null filtering disabled nothing is added.
    new_domain_kwargs = e.add_column_row_condition({}, "a", filter_null=False)
    assert new_domain_kwargs == {}
    def _pandas(
        cls,
        execution_engine: PandasExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        """Count the column values that lie between min_value and max_value.

        `strict_min` / `strict_max` toggle exclusive bounds; at least one of
        `min_value` / `max_value` must be given. Returns the number of rows
        satisfying the (possibly one-sided) range condition.
        """
        min_value = metric_value_kwargs.get("min_value")
        max_value = metric_value_kwargs.get("max_value")
        strict_min = metric_value_kwargs.get("strict_min")
        strict_max = metric_value_kwargs.get("strict_max")
        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")

        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        (
            df,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        val = df[accessor_domain_kwargs["column"]]

        if min_value is not None and max_value is not None:
            # BUG FIX: the original combined the two Series comparisons with
            # the boolean `and`, which raises "The truth value of a Series is
            # ambiguous" for any column longer than one row. Element-wise
            # combination requires the bitwise `&` operator.
            if strict_min and strict_max:
                series = (min_value < val) & (val < max_value)
            elif strict_min:
                series = (min_value < val) & (val <= max_value)
            elif strict_max:
                series = (min_value <= val) & (val < max_value)
            else:
                series = (min_value <= val) & (val <= max_value)

        elif min_value is None and max_value is not None:
            if strict_max:
                series = val < max_value
            else:
                series = val <= max_value

        elif min_value is not None and max_value is None:
            if strict_min:
                series = min_value < val
            else:
                series = min_value <= val
        else:
            raise ValueError("unable to parse domain and value kwargs")

        return np.count_nonzero(series)
# ---- Example 8 ----
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
    test_s3_files, test_df_small
):
    """Load a whole-table batch through a configured S3 data connector.

    A batch for an existing key yields a dataframe of the expected shape;
    a batch for a missing key raises ExecutionEngineError.
    """
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    execution_engine: ExecutionEngine = PandasExecutionEngine()

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        bucket=bucket,
        execution_engine=execution_engine,
        prefix="",
        assets={"alpha": {}},
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
    )
    # index=1 corresponds to an object present in the mocked bucket.
    batch_def: BatchDefinition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = execution_engine.get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(batch_definition=batch_def)
    )
    assert test_df.dataframe.shape == expected_df.shape

    # if key does not exist
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine.get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key
            )
        )
# ---- Example 9 ----
def test_test_yaml_config(mock_gcs_conn, mock_list_keys, mock_emit,
                          empty_data_context_stats_enabled):
    """test_yaml_config on an InferredAssetGCSDataConnector reports its assets.

    The year/month/name regex below infers two assets (alpha, beta) from the
    mocked GCS listing; the returned report object is checked in full.
    """
    context: DataContext = empty_data_context_stats_enabled

    report_object = context.test_yaml_config(
        """
module_name: great_expectations.datasource.data_connector
class_name: InferredAssetGCSDataConnector
datasource_name: FAKE_DATASOURCE
name: TEST_DATA_CONNECTOR
bucket_or_name: test_bucket
prefix: ""
default_regex:
    pattern: (\\d{4})/(\\d{2})/(.*)-.*\\.csv
    group_names:
        - year_dir
        - month_dir
        - data_asset_name
    """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetGCSDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["alpha", "beta"],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "2020/01/alpha-*.csv",
                    "2020/02/alpha-*.csv",
                    "2020/03/alpha-*.csv",
                ],
                "batch_definition_count":
                3,
            },
            "beta": {
                "example_data_references": [
                    "2020/01/beta-*.csv",
                    "2020/02/beta-*.csv",
                    "2020/03/beta-*.csv",
                ],
                "batch_definition_count":
                4,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
    }
# ---- Example 10 ----
def test_graph_validate(basic_datasource):
    """graph_validate resolves the full metric graph for a z-score expectation.

    Column "b" has one NaN (-> missing_count 1) and all non-missing z-scores
    fall under the threshold, so the expectation succeeds.
    """
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})
    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "b",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )

    # Build a runtime batch carrying the in-memory dataframe.
    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name":
                "my_datasource",
                "data_connector_name":
                "test_runtime_data_connector",
                "batch_data":
                df,
                "partition_request":
                PartitionRequest(
                    **{
                        "partition_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }),
            }))

    result = Validator(execution_engine=PandasExecutionEngine(),
                       batches=[batch]).graph_validate(
                           configurations=[expectationConfiguration])
    # expectation_config/exception_info are None in the expected result
    # because graph_validate is compared on the computed result payload only.
    assert result == [
        ExpectationValidationResult(
            success=True,
            expectation_config=None,
            meta={},
            result={
                "element_count": 6,
                "unexpected_count": 0,
                "unexpected_percent": 0.0,
                "partial_unexpected_list": [],
                "missing_count": 1,
                "missing_percent": 16.666666666666664,
                "unexpected_percent_nonmissing": 0.0,
            },
            exception_info=None,
        )
    ]
# ---- Example 11 ----
def test_resolve_metrics_with_extraneous_value_key():
    """An unused metric_value_kwargs entry must not affect the computation."""
    engine = PandasExecutionEngine(
        batch_data_dict={"my_id": pd.DataFrame({"a": [1, 2, 3, None]})}
    )
    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={},
        metric_dependencies={},
    )
    # value_set is meaningless for a standard deviation; it must be ignored,
    # though it still becomes part of the metric's id.
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3, 4, 5]},
    )
    metrics = engine.resolve_metrics(metrics_to_resolve=(mean, stdev))

    stdev_id = ("column.standard_deviation", "column=a",
                "value_set=[1, 2, 3, 4, 5]")
    assert metrics[stdev_id] == 1.0
def test_get_table_metric_provider_metric_dependencies(empty_sqlite_db):
    """ColumnMax declares a partial-fn dependency only for SQL engines."""
    sql_engine = SqlAlchemyExecutionEngine(engine=empty_sqlite_db)
    deps = ColumnMax().get_evaluation_dependencies(
        MetricConfiguration("column.max", {}, {}),
        execution_engine=sql_engine,
    )
    # SQL execution resolves the max through an aggregate partial function.
    assert deps["metric_partial_fn"].id[0] == "column.max.aggregate_fn"

    deps = ColumnMax().get_evaluation_dependencies(
        MetricConfiguration("column.max", {}, {}),
        execution_engine=PandasExecutionEngine(),
    )
    # Pandas computes the max directly and needs no dependencies.
    assert deps == {}
# ---- Example 13 ----
def test_nested_directory_data_asset_name_in_folder(mock_gcs_client,
                                                    mock_list_keys, mock_emit,
                                                    empty_data_context):
    """Asset names inferred from nested folders (A/A-1.csv etc.) are reported.

    The single-letter folder name becomes the data_asset_name; four assets
    are found, and the report lists three example names with their references.
    """
    context = empty_data_context

    report_object = context.test_yaml_config(
        """
    module_name: great_expectations.datasource.data_connector
    class_name: InferredAssetGCSDataConnector
    datasource_name: FAKE_DATASOURCE
    name: TEST_DATA_CONNECTOR
    bucket_or_name: test_bucket
    prefix: ""
    default_regex:
        group_names:
            - data_asset_name
            - letter
            - number
        pattern: (\\w{1})\\/(\\w{1})-(\\d{1})\\.csv
        """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetGCSDataConnector",
        "data_asset_count": 4,
        "example_data_asset_names": ["A", "B", "C"],
        "data_assets": {
            "A": {
                "batch_definition_count": 3,
                "example_data_references":
                ["A/A-1.csv", "A/A-2.csv", "A/A-3.csv"],
            },
            "B": {
                "batch_definition_count": 3,
                "example_data_references":
                ["B/B-1.csv", "B/B-2.csv", "B/B-3.csv"],
            },
            "C": {
                "batch_definition_count": 3,
                "example_data_references":
                ["C/C-1.csv", "C/C-2.csv", "C/C-3.csv"],
            },
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
    }
def test_self_check(tmp_path_factory):
    """self_check() reports the assets inferred from files on disk."""
    base_directory = str(tmp_path_factory.mktemp("test_self_check"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=["A-100.csv", "A-101.csv", "B-1.csv", "B-2.csv"],
    )

    connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                "pattern": r"(.+)-(\d+)\.csv",
                "group_names": ["data_asset_name", "number"],
            },
            glob_directive="*",
            base_directory=base_directory,
        )
    )

    # Populate the reference cache before asking for the report.
    # noinspection PyProtectedMember
    connector._refresh_data_references_cache()

    report = connector.self_check()
    assert report == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
# ---- Example 15 ----
 def _pandas(
     cls,
     execution_engine: PandasExecutionEngine,
     metric_domain_kwargs: Dict,
     metric_value_kwargs: Dict,
     metrics: Dict[Tuple, Any],
     runtime_configuration: Dict,
 ):
     """Return the table head: the first n_rows, or everything with fetch_all."""
     df, _, _ = execution_engine.get_compute_domain(
         metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)
     # fetch_all (defaulted from the class's kwarg defaults) short-circuits
     # head() and returns the whole frame.
     fetch_all = metric_value_kwargs.get("fetch_all",
                                         cls.default_kwarg_values["fetch_all"])
     return df if fetch_all else df.head(metric_value_kwargs["n_rows"])
# ---- Example 16 ----
def test_validator_progress_bar_config_disabled(mock_tqdm,
                                                mock_validation_graph,
                                                mock_data_context):
    """With metric_calculations=False the tqdm bar is created but disabled."""
    context = mock_data_context()
    context.progress_bars = ProgressBarsConfig(metric_calculations=False)
    engine = PandasExecutionEngine()
    validator = Validator(engine, data_context=context)

    # Give the mocked graph a concrete edge count so resolution proceeds.
    mock_validation_graph.edges.__len__ = lambda _: 3
    validator.resolve_validation_graph(mock_validation_graph, {})

    # tqdm is still constructed, but disable=True keeps it invisible.
    assert mock_tqdm.called is True
    assert mock_tqdm.call_args[1]["disable"] is True
 def _pandas(
     cls,
     execution_engine: PandasExecutionEngine,
     metric_domain_kwargs: Dict,
     metric_value_kwargs: Dict,
     metrics: Dict[Tuple, Any],
     runtime_configuration: Dict,
 ):
     """Describe the table columns as [{"name": ..., "type": ...}, ...]."""
     df, _, _ = execution_engine.get_compute_domain(
         metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)
     descriptions = []
     for col, col_dtype in zip(df.columns, df.dtypes):
         descriptions.append({"name": col, "type": col_dtype})
     return descriptions
# ---- Example 18 ----
def test_validator_progress_bar_config_enabled(mock_tqdm,
                                               mock_validation_graph,
                                               mock_data_context):
    """With the default config the tqdm progress bar is created enabled."""
    context = mock_data_context()
    engine = PandasExecutionEngine()
    validator = Validator(engine, data_context=context)

    # Give the mocked graph a concrete edge count so resolution proceeds.
    mock_validation_graph.edges.__len__ = lambda _: 3
    validator.resolve_validation_graph(mock_validation_graph, {})

    # Still invoked but doesn't actually do anything due to `disabled`
    assert mock_tqdm.called is True
    assert mock_tqdm.call_args[1]["disable"] is False
def test_basic_instantiation():
    """Inferred S3 connector finds all keys and rejects mismatched requests."""
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Seed the mocked bucket with four CSV objects in two prefixes.
    csv_payload = test_df.to_csv(index=False).encode("utf-8")
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(Bucket=bucket, Body=csv_payload, Key=key)

    connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket=bucket,
        prefix="",
    )

    # noinspection PyProtectedMember
    connector._refresh_data_references_cache()

    assert connector.get_data_reference_list_count() == 4
    assert connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        print(
            connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="something",
                    data_connector_name="my_data_connector",
                    data_asset_name="something",
                )
            )
        )
 def _pandas(
     cls,
     execution_engine: PandasExecutionEngine,
     metric_domain_kwargs: Dict,
     metric_value_kwargs: Dict,
     metrics: Dict[str, Any],
     runtime_configuration: Dict,
 ):
     """Histogram counts for the domain column over the requested bins."""
     df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
         domain_kwargs=metric_domain_kwargs,
         domain_type=MetricDomainTypes.COLUMN)
     values = df[accessor_domain_kwargs["column"]]
     # density=False yields raw counts; the bin edges are discarded.
     counts, _ = np.histogram(values, metric_value_kwargs["bins"],
                              density=False)
     return list(counts)
    def _pandas_columns(
        self,
        batches: Dict[str, Batch],
        execution_engine: PandasExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict,
        runtime_configuration: dict = None,
    ):
        """Return the list of column names of the domain dataframe."""
        frame = execution_engine.get_domain_dataframe(
            domain_kwargs=metric_domain_kwargs, batches=batches)
        return frame.columns.tolist()
# ---- Example 22 ----
def test_get_batch_with_split_on_mod_integer(test_df):
    """Splitting on id % 10 == 5 keeps exactly the congruent rows."""
    spec = RuntimeDataBatchSpec(
        batch_data=test_df,
        splitter_method="_split_on_mod_integer",
        splitter_kwargs={
            "column_name": "id",
            "mod": 10,
            "batch_identifiers": {"id": 5},
        },
    )
    batch = PandasExecutionEngine().get_batch_data(spec)
    frame = batch.dataframe
    assert frame.shape == (12, 10)
    # The surviving ids run from 5 up to 115 in steps of 10.
    assert frame.id.min() == 5
    assert frame.id.max() == 115
# ---- Example 23 ----
def test_instantiation_from_a_config_with_nonmatching_regex_creates_unmatched_references(
        mock_gcs_conn, mock_list_keys, mock_emit,
        empty_data_context_stats_enabled):
    """Keys that do not match the regex end up as unmatched references.

    The pattern expects `beta-*.csv` while the mocked listing contains
    `alpha-*.csv`, so the asset has zero batch definitions and all three
    keys are reported as unmatched.
    """
    context: DataContext = empty_data_context_stats_enabled

    report_object = context.test_yaml_config(
        f"""
        module_name: great_expectations.datasource.data_connector
        class_name: ConfiguredAssetGCSDataConnector
        datasource_name: FAKE_DATASOURCE
        name: TEST_DATA_CONNECTOR
        default_regex:
            pattern: beta-(.*)\\.csv
            group_names:
                - index
        bucket_or_name: my_bucket
        prefix: ""
        assets:
            alpha:
    """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name":
        "ConfiguredAssetGCSDataConnector",
        "data_asset_count":
        1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [],
                "batch_definition_count": 0
            },
        },
        "example_unmatched_data_references": [
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
        "unmatched_data_reference_count":
        3,
    }
    def _pandas_column_a_greater_than_b(
        self,
        batches: Dict[str, Batch],
        execution_engine: PandasExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict,
        runtime_configuration: dict = None,
    ):
        """Element-wise boolean Series: column_A > column_B (>= with or_equal).

        Optional value kwargs:
          - allow_cross_type_comparisons: compare as strings so mixed types
            do not raise
          - parse_strings_as_datetimes: parse both columns with `parse`
            before comparing
          - or_equal: use >= instead of >
        """
        df = execution_engine.get_domain_dataframe(
            domain_kwargs=metric_domain_kwargs, batches=batches)

        column_A = df[metric_value_kwargs["column_A"]]
        column_B = df[metric_value_kwargs["column_B"]]

        # BUG FIX: the original initialized these variables to None and then
        # tested `if <variable> in metric_value_kwargs:` — i.e. `None in dict`
        # — instead of the option names, so none of the kwargs were ever
        # read. Look each option up by its key instead.
        allow_cross_type_comparisons = metric_value_kwargs.get(
            "allow_cross_type_comparisons")
        parse_strings_as_datetimes = metric_value_kwargs.get(
            "parse_strings_as_datetimes")
        or_equal = metric_value_kwargs.get("or_equal")

        if allow_cross_type_comparisons:
            column_A = column_A.apply(str)
            column_B = column_B.apply(str)

        if parse_strings_as_datetimes:
            temp_column_A = column_A.map(parse)
            temp_column_B = column_B.map(parse)
        else:
            temp_column_A = column_A
            temp_column_B = column_B

        if or_equal:
            return temp_column_A >= temp_column_B
        else:
            return temp_column_A > temp_column_B
# ---- Example 25 ----
    def _pandas_equal_columns(
        self,
        batches: Dict[str, Batch],
        execution_engine: PandasExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict,
        runtime_configuration: dict = None,
    ):
        """Return True if column_A equals column_B in at least one row.

        NOTE(review): the original docstring ("returns all columns in a
        dataframe") was a copy-paste error and has been corrected. Also,
        for an "equal columns" metric `.all()` may have been intended
        rather than `.any()` — confirm against the calling expectation
        before changing behavior.
        """
        df = execution_engine.get_domain_dataframe(
            domain_kwargs=metric_domain_kwargs, batches=batches)
        column_A = df[metric_value_kwargs["column_A"]]
        column_B = df[metric_value_kwargs["column_B"]]

        # .any(): a single matching row makes the result True.
        return (column_A == column_B).any()
# ---- Example 26 ----
def test_expect_column_value_z_scores_to_be_less_than_impl():
    """Validate the z-score expectation end to end on a small frame."""
    frame = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    config = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(config)
    engine = PandasExecutionEngine(batch_data_dict={"my_id": frame})

    # All |z| values stay under 4, so the expectation succeeds.
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(success=True)
# ---- Example 27 ----
def test_get_definition_list_from_batch_request_with_empty_args_raises_error(
        mock_gcs_conn, mock_list_keys, mock_emit,
        empty_data_context_stats_enabled):
    """Omitting the required batch_request argument raises TypeError."""
    my_data_connector_yaml = yaml.load(
        f"""
           class_name: ConfiguredAssetGCSDataConnector
           datasource_name: test_environment
           bucket_or_name: my_bucket
           prefix: ""
           assets:
               TestFiles:
           default_regex:
               pattern: (.+)_(.+)_(.+)\\.csv
               group_names:
                   - name
                   - timestamp
                   - price
       """, )

    # BUG FIX: the mocked listing was accidentally wrapped in a one-element
    # tuple by a stray trailing comma — unlike every sibling test, which
    # returns a plain list of keys. Use the plain list.
    mock_list_keys.return_value = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]

    my_data_connector: ConfiguredAssetGCSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_gcs_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )

    # Raises error in `FilePathDataConnector.get_batch_definition_list_from_batch_request()` due to missing a `batch_request` arg
    with pytest.raises(TypeError):
        # noinspection PyArgumentList
        my_data_connector.get_batch_definition_list_from_batch_request()
# ---- Example 28 ----
def test_resolve_metrics_with_aggregates_and_column_map():
    """Resolve a chain of metrics: aggregates -> z-score map -> condition -> count.

    Each stage feeds the already-resolved `metrics` dict into the next
    resolve_metrics call as dependencies.
    """
    # Testing resolve metric function for a variety of cases - test from test_core used
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})
    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metrics = (mean, stdev)
    metrics = engine.resolve_metrics(metrics_to_resolve=desired_metrics)

    # The z-score map depends on the mean and standard deviation aggregates.
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
        },
    )
    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric, ),
                                     metrics=metrics)
    metrics.update(results)
    # The condition checks |z| < 2 for each of the three non-null values.
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "double_sided": True,
            "threshold": 2
        },
        metric_dependencies={"column_values.z_score.map": desired_metric},
    )
    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric, ),
                                     metrics=metrics)
    assert list(results[desired_metric.id][0]) == [False, False, False]
    metrics.update(results)
    # No row exceeds the threshold, so the unexpected count is zero.
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={
            "double_sided": True,
            "threshold": 2
        },
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric, ),
                                     metrics=metrics)
    assert results[desired_metric.id] == 0
def test_basic_instantiation(tmp_path_factory):
    """A ConfiguredAssetFilesystemDataConnector can be built from kwargs."""
    base_directory = str(
        tmp_path_factory.mktemp("basic_data_connector__filesystem_data_connector")
    )

    # Construction succeeding without an exception is the whole assertion.
    # noinspection PyUnusedLocal
    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=PandasExecutionEngine(),
        base_directory=base_directory,
        glob_directive="*.csv",
        default_regex={"pattern": "(.*)", "group_names": ["file_name"]},
        assets={"my_asset_name": {}},
    )
# ---- Example 30 ----
def test_instantiation_without_args(mock_gcs_conn, mock_list_keys,
                                    expected_config_dict):
    """Connector built with only required args matches the expected report."""
    connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )
    assert connector.self_check() == expected_config_dict

    # All four mocked keys match the regex; nothing is left unmatched.
    connector._refresh_data_references_cache()
    assert connector.get_data_reference_list_count() == 4
    assert connector.get_unmatched_data_references() == []