def test_column_pairs_greater_metric_pd():
    df = pd.DataFrame({
        "a": [2, 3, 4, None, 3, None],
        "b": [1, 2, 3, None, 3, 5],
    })
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})
    desired_metric = MetricConfiguration(
        metric_name="column_pair_values.a_greater_than_b.condition",
        metric_domain_kwargs={"column_A": "a", "column_B": "b"},
        metric_value_kwargs={
            "or_equal": True,
            "ignore_row_if": "either_value_is_missing",
        },
    )
    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric,))
    assert results[desired_metric.id][0].reset_index(drop=True).equals(
        pd.Series([True, True, True, True])
    )
def test_get_batch_with_split_on_converted_datetime(test_df):
    split_df = PandasExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_df,
            splitter_method="_split_on_converted_datetime",
            splitter_kwargs={
                "column_name": "timestamp",
                "batch_identifiers": {"timestamp": "2020-01-30"},
            },
        )
    )
    assert split_df.dataframe.shape == (3, 10)
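# A minimal pandas sketch of roughly what `_split_on_converted_datetime`
# computes: stringify the datetime column and keep rows that match the
# requested batch identifier. The three-row frame is a hypothetical stand-in
# for the `test_df` fixture (which has 10 columns).
def test_split_on_converted_datetime_sketch():
    import pandas as pd

    df = pd.DataFrame(
        {"timestamp": pd.to_datetime(["2020-01-30", "2020-01-30", "2020-01-31"])}
    )
    mask = df["timestamp"].dt.strftime("%Y-%m-%d") == "2020-01-30"
    assert len(df[mask]) == 2  # rows whose date renders as "2020-01-30"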
def test_sample_using_mod(test_df):
    sampled_df = PandasExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_df,
            sampling_method="_sample_using_mod",
            sampling_kwargs={
                "column_name": "id",
                "mod": 5,
                "value": 4,
            },
        )
    )
    assert sampled_df.dataframe.shape == (24, 10)
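# A minimal sketch (not GE's implementation) of the mod-based sampling above,
# assuming the `test_df` fixture is a 120-row frame with an integer "id"
# column; the frame below is a hypothetical stand-in.
def test_sample_using_mod_sketch():
    import pandas as pd

    df = pd.DataFrame({"id": range(120)})
    sampled = df[df["id"].map(lambda x: x % 5 == 4)]  # ids 4, 9, ..., 119
    assert len(sampled) == 24  # matches the (24, 10) shape asserted above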
def test_return_all_batch_definitions_too_many_sorters(
    mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled
):
    my_data_connector_yaml = yaml.load(
        """
        class_name: ConfiguredAssetGCSDataConnector
        datasource_name: test_environment
        bucket_or_name: my_bucket
        prefix: ""
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_.+_.+\\.csv
            group_names:
                - name
        sorters:
            - orderby: asc
              class_name: LexicographicSorter
              name: name
            - datetime_format: "%Y%m%d"
              orderby: desc
              class_name: DateTimeSorter
              name: timestamp
            - orderby: desc
              class_name: NumericSorter
              name: price
        """,
    )
    mock_list_keys.return_value = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]

    # Raises an error because more sorters are configured than regex group names
    # (only `name` is captured); validated in
    # `FilePathDataConnector._validate_sorters_configuration()`
    with pytest.raises(ge_exceptions.DataConnectorError):
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_gcs_data_connector",
                "execution_engine": PandasExecutionEngine(),
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
def test_add_column_row_condition_with_unsupported_conditions():
    e = PandasExecutionEngine()

    # An attempt to filter NaNs within the base class yields an error
    with pytest.raises(GreatExpectationsError):
        e.add_column_row_condition({}, "a", filter_nan=True)

    # A pre-existing row condition results in an error, since it must not be overwritten
    with pytest.raises(GreatExpectationsError):
        e.add_column_row_condition({
            "column": "a",
            "row_condition": "col(a) == 2",
        })

    # An error is raised when no column is given
    with pytest.raises(AssertionError):
        e.add_column_row_condition({})
def test_add_column_row_condition():
    e = PandasExecutionEngine()

    # Checking that adding a simple column row condition is functional
    new_domain_kwargs = e.add_column_row_condition({}, "a")
    assert new_domain_kwargs == {
        "condition_parser": "great_expectations__experimental__",
        "row_condition": 'col("a").notnull()',
    }

    # Ensuring that this also works when the column is passed in the domain kwargs
    new_domain_kwargs = e.add_column_row_condition({"column": "a"})
    assert new_domain_kwargs == {
        "column": "a",
        "condition_parser": "great_expectations__experimental__",
        "row_condition": 'col("a").notnull()',
    }

    # Ensuring that everything still works if a row condition of None is given;
    # the None condition is replaced by the generated one
    new_domain_kwargs = e.add_column_row_condition(
        {"column": "a", "row_condition": None}
    )
    assert new_domain_kwargs == {
        "column": "a",
        "condition_parser": "great_expectations__experimental__",
        "row_condition": 'col("a").notnull()',
    }

    # Identity case
    new_domain_kwargs = e.add_column_row_condition({}, "a", filter_null=False)
    assert new_domain_kwargs == {}
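# On a raw frame, the generated row condition 'col("a").notnull()' (evaluated
# later by the experimental condition parser) amounts to a plain notnull filter:
def test_row_condition_notnull_sketch():
    import pandas as pd

    df = pd.DataFrame({"a": [1, None, 3]})
    assert len(df[df["a"].notnull()]) == 2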
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    min_value = metric_value_kwargs.get("min_value")
    max_value = metric_value_kwargs.get("max_value")
    strict_min = metric_value_kwargs.get("strict_min")
    strict_max = metric_value_kwargs.get("strict_max")
    if min_value is None and max_value is None:
        raise ValueError("min_value and max_value cannot both be None")
    if min_value is not None and max_value is not None and min_value > max_value:
        raise ValueError("min_value cannot be greater than max_value")

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )
    val = df[accessor_domain_kwargs["column"]]

    # Elementwise bounds checks require `&`; the boolean `and` would raise
    # "truth value of a Series is ambiguous" on pandas Series operands.
    if min_value is not None and max_value is not None:
        if strict_min and strict_max:
            series = (min_value < val) & (val < max_value)
        elif strict_min:
            series = (min_value < val) & (val <= max_value)
        elif strict_max:
            series = (min_value <= val) & (val < max_value)
        else:
            series = (min_value <= val) & (val <= max_value)
    elif min_value is None and max_value is not None:
        if strict_max:
            series = val < max_value
        else:
            series = val <= max_value
    elif min_value is not None and max_value is None:
        if strict_min:
            series = min_value < val
        else:
            series = min_value <= val
    else:
        raise ValueError("unable to parse domain and value kwargs")

    return np.count_nonzero(series)
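# A quick illustration of why `&` (elementwise) is required above, using plain
# pandas: `and` asks a Series for a single truth value and raises.
def _sketch_series_and_vs_ampersand():
    import pandas as pd

    val = pd.Series([1, 2, 3])
    try:
        (1 < val) and (val < 3)
    except ValueError:
        pass  # "The truth value of a Series is ambiguous..."
    assert list((1 < val) & (val < 3)) == [False, True, False]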
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
    test_s3_files, test_df_small
):
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    execution_engine: ExecutionEngine = PandasExecutionEngine()

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        bucket=bucket,
        execution_engine=execution_engine,
        prefix="",
        assets={"alpha": {}},
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
    )
    batch_def: BatchDefinition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = execution_engine.get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(batch_definition=batch_def)
    )
    assert test_df.dataframe.shape == expected_df.shape

    # if key does not exist
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine.get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key
            )
        )
def test_test_yaml_config(
    mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled
):
    context: DataContext = empty_data_context_stats_enabled
    report_object = context.test_yaml_config(
        """
        module_name: great_expectations.datasource.data_connector
        class_name: InferredAssetGCSDataConnector
        datasource_name: FAKE_DATASOURCE
        name: TEST_DATA_CONNECTOR
        bucket_or_name: test_bucket
        prefix: ""
        default_regex:
            pattern: (\\d{4})/(\\d{2})/(.*)-.*\\.csv
            group_names:
                - year_dir
                - month_dir
                - data_asset_name
        """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetGCSDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["alpha", "beta"],
        "data_assets": {
            "alpha": {
                "example_data_references": [
                    "2020/01/alpha-*.csv",
                    "2020/02/alpha-*.csv",
                    "2020/03/alpha-*.csv",
                ],
                "batch_definition_count": 3,
            },
            "beta": {
                "example_data_references": [
                    "2020/01/beta-*.csv",
                    "2020/02/beta-*.csv",
                    "2020/03/beta-*.csv",
                ],
                "batch_definition_count": 4,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
    }
def test_graph_validate(basic_datasource):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "b",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )

    batch = basic_datasource.get_single_batch_from_batch_request(
        BatchRequest(
            **{
                "datasource_name": "my_datasource",
                "data_connector_name": "test_runtime_data_connector",
                "batch_data": df,
                "partition_request": PartitionRequest(
                    **{
                        "partition_identifiers": {
                            "pipeline_stage_name": 0,
                            "airflow_run_id": 0,
                            "custom_key_0": 0,
                        }
                    }
                ),
            }
        )
    )

    result = Validator(
        execution_engine=PandasExecutionEngine(), batches=[batch]
    ).graph_validate(configurations=[expectationConfiguration])
    assert result == [
        ExpectationValidationResult(
            success=True,
            expectation_config=None,
            meta={},
            result={
                "element_count": 6,
                "unexpected_count": 0,
                "unexpected_percent": 0.0,
                "partial_unexpected_list": [],
                "missing_count": 1,
                "missing_percent": 16.666666666666664,
                "unexpected_percent_nonmissing": 0.0,
            },
            exception_info=None,
        )
    ]
def test_resolve_metrics_with_extraneous_value_key():
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={},
    )

    # Ensuring that an unused value key will not mess up computation
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3, 4, 5]},
    )
    desired_metrics = (mean, stdev)
    metrics = engine.resolve_metrics(metrics_to_resolve=desired_metrics)

    # Ensuring the extraneous value key did not change the computation
    assert (
        metrics[
            ("column.standard_deviation", "column=a", "value_set=[1, 2, 3, 4, 5]")
        ]
        == 1.0
    )
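# Sanity check of the expected value above: pandas uses the sample standard
# deviation (ddof=1) and skips NaN, so std([1, 2, 3]) is exactly 1.0.
def test_stdev_expected_value_sketch():
    import pandas as pd

    assert pd.Series([1, 2, 3, None]).std() == 1.0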
def test_get_table_metric_provider_metric_dependencies(empty_sqlite_db):
    mp = ColumnMax()
    metric = MetricConfiguration("column.max", dict(), dict())
    dependencies = mp.get_evaluation_dependencies(
        metric, execution_engine=SqlAlchemyExecutionEngine(engine=empty_sqlite_db)
    )
    assert dependencies["metric_partial_fn"].id[0] == "column.max.aggregate_fn"

    mp = ColumnMax()
    metric = MetricConfiguration("column.max", dict(), dict())
    dependencies = mp.get_evaluation_dependencies(
        metric, execution_engine=PandasExecutionEngine()
    )
    assert dependencies == dict()
def test_nested_directory_data_asset_name_in_folder(
    mock_gcs_client, mock_list_keys, mock_emit, empty_data_context
):
    context = empty_data_context
    report_object = context.test_yaml_config(
        """
        module_name: great_expectations.datasource.data_connector
        class_name: InferredAssetGCSDataConnector
        datasource_name: FAKE_DATASOURCE
        name: TEST_DATA_CONNECTOR
        bucket_or_name: test_bucket
        prefix: ""
        default_regex:
            group_names:
                - data_asset_name
                - letter
                - number
            pattern: (\\w{1})\\/(\\w{1})-(\\d{1})\\.csv
        """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "InferredAssetGCSDataConnector",
        "data_asset_count": 4,
        "example_data_asset_names": ["A", "B", "C"],
        "data_assets": {
            "A": {
                "batch_definition_count": 3,
                "example_data_references": ["A/A-1.csv", "A/A-2.csv", "A/A-3.csv"],
            },
            "B": {
                "batch_definition_count": 3,
                "example_data_references": ["B/B-1.csv", "B/B-2.csv", "B/B-3.csv"],
            },
            "C": {
                "batch_definition_count": 3,
                "example_data_references": ["C/C-1.csv", "C/C-2.csv", "C/C-3.csv"],
            },
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
    }
def test_self_check(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_self_check"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "A-100.csv",
            "A-101.csv",
            "B-1.csv",
            "B-2.csv",
        ],
    )
    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                "pattern": r"(.+)-(\d+)\.csv",
                "group_names": ["data_asset_name", "number"],
            },
            glob_directive="*",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    self_check_report_object = my_data_connector.self_check()

    assert self_check_report_object == {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
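# How the connector derives asset names from file names: the first capture
# group of the default_regex pattern becomes `data_asset_name` and the second
# becomes the `number` batch identifier.
def test_default_regex_grouping_sketch():
    import re

    match = re.match(r"(.+)-(\d+)\.csv", "A-100.csv")
    assert match.groups() == ("A", "100")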
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )

    # Return the whole table when fetch_all is requested; otherwise only n_rows
    if metric_value_kwargs.get("fetch_all", cls.default_kwarg_values["fetch_all"]):
        return df
    return df.head(metric_value_kwargs["n_rows"])
def test_validator_progress_bar_config_disabled(
    mock_tqdm, mock_validation_graph, mock_data_context
):
    data_context = mock_data_context()
    data_context.progress_bars = ProgressBarsConfig(metric_calculations=False)
    engine = PandasExecutionEngine()
    validator = Validator(engine, data_context=data_context)

    # The graph needs at least 3 edges for the tqdm progress bar to be invoked
    mock_validation_graph.edges.__len__ = lambda _: 3
    validator.resolve_validation_graph(mock_validation_graph, {})

    # tqdm is still constructed, but the config disables it
    assert mock_tqdm.called is True
    assert mock_tqdm.call_args[1]["disable"] is True
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    return [
        {"name": name, "type": dtype}
        for (name, dtype) in zip(df.columns, df.dtypes)
    ]
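# The metric above yields one dict per column; the same expression on a plain
# frame makes the output shape concrete:
def _sketch_column_types_output():
    import pandas as pd

    df = pd.DataFrame({"a": [1], "b": ["x"]})
    schema = [{"name": n, "type": t} for n, t in zip(df.columns, df.dtypes)]
    # -> [{'name': 'a', 'type': dtype('int64')}, {'name': 'b', 'type': dtype('O')}]
    assert [d["name"] for d in schema] == ["a", "b"]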
def test_validator_progress_bar_config_enabled(
    mock_tqdm, mock_validation_graph, mock_data_context
):
    data_context = mock_data_context()
    engine = PandasExecutionEngine()
    validator = Validator(engine, data_context=data_context)

    # The graph needs at least 3 edges for the tqdm progress bar to be invoked
    mock_validation_graph.edges.__len__ = lambda _: 3
    validator.resolve_validation_graph(mock_validation_graph, {})

    # Progress bars are enabled by default, so tqdm is invoked and active
    assert mock_tqdm.called is True
    assert mock_tqdm.call_args[1]["disable"] is False
def test_basic_instantiation():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket=bucket,
        prefix="",
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []

    # Illegal execution environment name
    with pytest.raises(ValueError):
        print(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="something",
                    data_connector_name="my_data_connector",
                    data_asset_name="something",
                )
            )
        )
def _pandas(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )
    column = accessor_domain_kwargs["column"]
    bins = metric_value_kwargs["bins"]

    # np.histogram returns (counts, bin_edges); only the counts are kept
    hist, bin_edges = np.histogram(df[column], bins, density=False)
    return list(hist)
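# Behavior of the histogram call above on a small sample: with explicit edges
# [1, 2, 3], values fall into the bins [1, 2) and [2, 3] (last bin inclusive).
def _sketch_np_histogram():
    import numpy as np

    hist, bin_edges = np.histogram([1, 2, 2, 3], bins=[1, 2, 3], density=False)
    assert list(hist) == [1, 3]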
def _pandas_columns(
    self,
    batches: Dict[str, Batch],
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict,
    runtime_configuration: dict = None,
):
    """Metric which returns all columns in a dataframe"""
    df = execution_engine.get_domain_dataframe(
        domain_kwargs=metric_domain_kwargs, batches=batches
    )

    cols = df.columns
    return cols.tolist()
def test_get_batch_with_split_on_mod_integer(test_df):
    split_df = PandasExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_df,
            splitter_method="_split_on_mod_integer",
            splitter_kwargs={
                "column_name": "id",
                "mod": 10,
                "batch_identifiers": {"id": 5},
            },
        )
    )
    assert split_df.dataframe.shape == (12, 10)
    assert split_df.dataframe.id.min() == 5
    assert split_df.dataframe.id.max() == 115
def test_instantiation_from_a_config_with_nonmatching_regex_creates_unmatched_references(
    mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled
):
    context: DataContext = empty_data_context_stats_enabled

    report_object = context.test_yaml_config(
        """
        module_name: great_expectations.datasource.data_connector
        class_name: ConfiguredAssetGCSDataConnector
        datasource_name: FAKE_DATASOURCE
        name: TEST_DATA_CONNECTOR
        default_regex:
            pattern: beta-(.*)\\.csv
            group_names:
                - index
        bucket_or_name: my_bucket
        prefix: ""
        assets:
            alpha:
        """,
        runtime_environment={
            "execution_engine": PandasExecutionEngine(),
        },
        return_mode="report_object",
    )

    assert report_object == {
        "class_name": "ConfiguredAssetGCSDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "alpha",
        ],
        "data_assets": {
            "alpha": {
                "example_data_references": [],
                "batch_definition_count": 0,
            },
        },
        "example_unmatched_data_references": [
            "alpha-1.csv",
            "alpha-2.csv",
            "alpha-3.csv",
        ],
        "unmatched_data_reference_count": 3,
    }
def _pandas_column_a_greater_than_b(
    self,
    batches: Dict[str, Batch],
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict,
    runtime_configuration: dict = None,
):
    """Metric which returns the row-wise truth of column A being greater than column B"""
    df = execution_engine.get_domain_dataframe(
        domain_kwargs=metric_domain_kwargs, batches=batches
    )

    column_A = df[metric_value_kwargs["column_A"]]
    column_B = df[metric_value_kwargs["column_B"]]

    # Value kwargs that can impact the outcome; each defaults to None when absent
    allow_cross_type_comparisons = metric_value_kwargs.get(
        "allow_cross_type_comparisons"
    )
    parse_strings_as_datetimes = metric_value_kwargs.get("parse_strings_as_datetimes")
    or_equal = metric_value_kwargs.get("or_equal")

    if allow_cross_type_comparisons:
        column_A = column_A.apply(str)
        column_B = column_B.apply(str)

    if parse_strings_as_datetimes:
        # `parse` is dateutil.parser.parse, imported at module scope
        temp_column_A = column_A.map(parse)
        temp_column_B = column_B.map(parse)
    else:
        temp_column_A = column_A
        temp_column_B = column_B

    if or_equal:
        return temp_column_A >= temp_column_B
    else:
        return temp_column_A > temp_column_B
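# A small check of the parse_strings_as_datetimes path above: comparing parsed
# datetimes handles mixed string formats that a plain lexical comparison would
# get wrong.
def _sketch_parse_strings_as_datetimes():
    import pandas as pd
    from dateutil.parser import parse

    a = pd.Series(["2020-02-01"]).map(parse)
    b = pd.Series(["2020-1-15"]).map(parse)
    assert bool((a > b).all())  # lexically, "2020-02-01" < "2020-1-15"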
def _pandas_equal_columns(
    self,
    batches: Dict[str, Batch],
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict,
    runtime_configuration: dict = None,
):
    """Metric which returns whether columns A and B are equal in every row"""
    df = execution_engine.get_domain_dataframe(
        domain_kwargs=metric_domain_kwargs, batches=batches
    )

    column_A = df[metric_value_kwargs["column_A"]]
    column_B = df[metric_value_kwargs["column_B"]]

    return (column_A == column_B).all()
def test_expect_column_value_z_scores_to_be_less_than_impl():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})

    expectationConfiguration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectationConfiguration)
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(success=True)
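# A quick verification that the expectation should succeed: the largest
# |z-score| in the column belongs to 22 (about 1.88), well under threshold 4.
def test_z_score_expected_success_sketch():
    import pandas as pd

    s = pd.Series([1, 5, 22, 3, 5, 10])
    z = (s - s.mean()) / s.std()
    assert (z.abs() < 4).all()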
def test_get_definition_list_from_batch_request_with_empty_args_raises_error(
    mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled
):
    my_data_connector_yaml = yaml.load(
        """
        class_name: ConfiguredAssetGCSDataConnector
        datasource_name: test_environment
        bucket_or_name: my_bucket
        prefix: ""
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
        """,
    )

    mock_list_keys.return_value = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]

    my_data_connector: ConfiguredAssetGCSDataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_gcs_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )

    # Raises a TypeError in `FilePathDataConnector.get_batch_definition_list_from_batch_request()`
    # because the required `batch_request` arg is missing
    with pytest.raises(TypeError):
        # noinspection PyArgumentList
        my_data_connector.get_batch_definition_list_from_batch_request()
def test_resolve_metrics_with_aggregates_and_column_map():
    # Testing the resolve_metrics function for a variety of cases (adapted from test_core)
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metrics = (mean, stdev)
    metrics = engine.resolve_metrics(metrics_to_resolve=desired_metrics)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"column_values.z_score.map": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert list(results[desired_metric.id][0]) == [False, False, False]
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
def test_basic_instantiation(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("basic_data_connector__filesystem_data_connector")
    )

    # noinspection PyUnusedLocal
    my_data_connector = ConfiguredAssetFilesystemDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=PandasExecutionEngine(),
        base_directory=base_directory,
        glob_directive="*.csv",
        default_regex={
            "pattern": "(.*)",
            "group_names": ["file_name"],
        },
        assets={"my_asset_name": {}},
    )
def test_instantiation_without_args(mock_gcs_conn, mock_list_keys, expected_config_dict):
    my_data_connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )
    assert my_data_connector.self_check() == expected_config_dict

    my_data_connector._refresh_data_references_cache()
    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []