def test_resolve_metrics_with_incomplete_metric_input():
    engine = PandasExecutionEngine()

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
        },
    )

    # Resolving a metric whose declared dependencies are missing should raise a GreatExpectationsError
    with pytest.raises(GreatExpectationsError):
        engine.resolve_metrics(metrics_to_resolve=(desired_metric,), metrics={})
def test_resolve_metrics_with_aggregates_and_column_map():
    # Testing the resolve_metrics function for a variety of cases (adapted from test_core)
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    desired_metrics = (mean, stdev)
    metrics = engine.resolve_metrics(metrics_to_resolve=desired_metrics)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"column_values.z_score.map": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert list(results[desired_metric.id][0]) == [False, False, False]
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0
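# A minimal pure-pandas sketch of the arithmetic the z-score assertions above rely on.
# The column "a", the double-sided threshold of 2, and the expected results come from
# the test; the helper name and the plain-pandas math are illustrative only and are
# not the Great Expectations metric implementation.
def _z_score_under_threshold_sketch():
    a = pd.Series([1, 2, 3, None])
    # pandas skips NaN for mean/std (ddof=1): mean=2.0, std=1.0
    z = (a - a.mean()) / a.std()
    # Double-sided threshold of 2: |z| for the non-null rows is [1, 0, 1], so the
    # "unexpected" condition is False everywhere and the unexpected count is 0.
    exceeds_threshold = z.abs().dropna() > 2
    assert list(exceeds_threshold) == [False, False, False]
    assert int(exceeds_threshold.sum()) == 0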
def test_basic_metric():
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    batch = Batch(data=df)
    engine = PandasExecutionEngine(batch_data_dict={batch.id: batch.data})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)
    assert results == {desired_metric.id: 3}
def test_max_metric_pd_column_does_not_exist():
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    batch = Batch(data=df)
    engine = PandasExecutionEngine(batch_data_dict={batch.id: batch.data})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "non_existent_column"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError) as eee:
        # noinspection PyUnusedLocal
        results = engine.resolve_metrics(
            metrics_to_resolve=(desired_metric,), metrics=metrics
        )
        metrics.update(results)
    assert (
        str(eee.value)
        == 'Error: The column "non_existent_column" in BatchData does not exist.'
    )
def test_table_metric_pd():
    df = pd.DataFrame({"a": [1, 2, 3, 3, None], "b": [1, 2, 3, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    desired_metric = MetricConfiguration(
        metric_name="table.row_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric,))
    assert results == {desired_metric.id: 5}
def test_column_pairs_equal_metric_pd():
    df = pd.DataFrame({"a": [1, 2, 3, 3], "b": [1, 2, 3, 3]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    desired_metric = MetricConfiguration(
        metric_name="column_pair_values.equal.condition",
        metric_domain_kwargs={"column_A": "a", "column_B": "b"},
        metric_value_kwargs=dict(),
    )
    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric,))
    assert results[desired_metric.id][0].equals(pd.Series([True, True, True, True]))
def test_column_max():
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    batch = Batch(data=df)
    engine = PandasExecutionEngine(batch_data_dict={batch.id: batch.data})

    desired_metric = MetricConfiguration(
        metric_name="column.max",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
    )
    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric,))
    assert results == {desired_metric.id: 3}
def test_column_pairs_in_set_metric_pd():
    df = pd.DataFrame({"a": [10, 3, 4, None, 3, None], "b": [1, 2, 3, None, 3, 5]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    desired_metric = MetricConfiguration(
        metric_name="column_pair_values.in_set.condition",
        metric_domain_kwargs={"column_A": "a", "column_B": "b"},
        metric_value_kwargs={
            "value_pairs_set": [(2, 1), (3, 2), (4, 3), (3, 3)],
            "ignore_row_if": "either_value_is_missing",
        },
    )
    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric,))
    assert (
        results[desired_metric.id][0]
        .reset_index(drop=True)
        .equals(pd.Series([False, True, True, True]))
    )
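# A minimal pure-pandas sketch of what the assertion above encodes: with
# ignore_row_if="either_value_is_missing", rows with a null in either column are
# dropped before checking (a, b) membership in value_pairs_set. Plain pandas for
# illustration only, not the metric provider's implementation.
def _column_pair_in_set_sketch():
    df = pd.DataFrame({"a": [10, 3, 4, None, 3, None], "b": [1, 2, 3, None, 3, 5]})
    value_pairs_set = {(2, 1), (3, 2), (4, 3), (3, 3)}
    kept = df.dropna(subset=["a", "b"])  # drops the rows at index 3 and 5
    in_set = [(a, b) in value_pairs_set for a, b in zip(kept["a"], kept["b"])]
    # (10, 1) is not in the set; the remaining pairs (3, 2), (4, 3), (3, 3) are.
    assert in_set == [False, True, True, True]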
def test_resolve_metrics_with_extraneous_value_key():
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    # Ensuring that an unused value key will not mess up computation
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"value_set": [1, 2, 3, 4, 5]},
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    # Ensuring extraneous value key did not change computation
    assert (
        metrics[("column.standard_deviation", "column=a", "value_set=[1, 2, 3, 4, 5]")]
        == 1.0
    )
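# Note on the lookup key in the final assert above: resolve_metrics keys its results
# by each metric's id, a (metric_name, domain-kwargs id, value-kwargs id) tuple, which
# is why the extraneous "value_set" kwarg appears in the key but leaves the computed
# standard deviation unchanged. Equivalently (assuming the id structure shown in that
# assert), the lookup could be written inside the test as:
#     assert metrics[stdev.id] == 1.0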
def test_z_score_under_threshold_pd():
    df = pd.DataFrame({"a": [1, 2, 3, None]})
    engine = PandasExecutionEngine(batch_data_dict={"my_id": df})

    metrics: dict = {}

    table_columns_metric: MetricConfiguration
    results: dict

    table_columns_metric, results = get_table_columns_metric(engine=engine)
    metrics.update(results)

    mean = MetricConfiguration(
        metric_name="column.mean",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    stdev = MetricConfiguration(
        metric_name="column.standard_deviation",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "table.columns": table_columns_metric,
        },
    )
    desired_metrics = (mean, stdev)
    results = engine.resolve_metrics(
        metrics_to_resolve=desired_metrics, metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.map",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs=dict(),
        metric_dependencies={
            "column.standard_deviation": stdev,
            "column.mean": mean,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.condition",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={
            "column_values.z_score.map": desired_metric,
            "table.columns": table_columns_metric,
        },
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert list(results[desired_metric.id][0]) == [False, False, False]
    metrics.update(results)

    desired_metric = MetricConfiguration(
        metric_name="column_values.z_score.under_threshold.unexpected_count",
        metric_domain_kwargs={"column": "a"},
        metric_value_kwargs={"double_sided": True, "threshold": 2},
        metric_dependencies={"unexpected_condition": desired_metric},
    )
    results = engine.resolve_metrics(
        metrics_to_resolve=(desired_metric,), metrics=metrics
    )
    assert results[desired_metric.id] == 0