def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
    test_s3_files, test_df_small
):
    """End-to-end check of ConfiguredAssetS3DataConnector with a whole-table splitter.

    Loads an existing S3 object through the connector and compares its shape to
    the expected DataFrame, then verifies that a batch definition pointing at a
    nonexistent key raises ``ExecutionEngineError``.
    """
    bucket, _keys = test_s3_files
    expected_df = test_df_small
    engine: ExecutionEngine = PandasExecutionEngine()

    connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        bucket=bucket,
        execution_engine=engine,
        prefix="",
        assets={"alpha": {}},
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
    )

    existing_key_definition: BatchDefinition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    batch_data = engine.get_batch_data(
        batch_spec=connector.build_batch_spec(batch_definition=existing_key_definition)
    )
    assert batch_data.dataframe.shape == expected_df.shape

    # A key that does not exist in the bucket must surface as an ExecutionEngineError.
    missing_key_definition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        engine.get_batch_data(
            batch_spec=connector.build_batch_spec(
                batch_definition=missing_key_definition
            )
        )
def from_object(cls, validation_result):
    """Construct a ValidationResultIdentifier from a validation result object.

    The batch identifier is derived from the result's ``batch_kwargs`` metadata,
    which may already be an ``IDDict`` or a plain ``dict``; anything else is an
    error.
    """
    raw_batch_kwargs = validation_result.meta.get("batch_kwargs", {})
    if isinstance(raw_batch_kwargs, IDDict):
        batch_identifier = raw_batch_kwargs.to_id()
    elif isinstance(raw_batch_kwargs, dict):
        batch_identifier = IDDict(raw_batch_kwargs).to_id()
    else:
        raise DataContextError(
            "Unable to construct ValidationResultIdentifier from provided object."
        )

    suite_name = validation_result.meta["expectation_suite_name"]
    return cls(
        expectation_suite_identifier=ExpectationSuiteIdentifier(suite_name),
        run_id=validation_result.meta.get("run_id"),
        batch_identifier=batch_identifier,
    )
def test_populate_dependencies_with_incorrect_metric_name():
    """Building a dependency graph for an unregistered metric must raise MetricProviderError.

    Fix: the original caught the expected exception, rebound the unrelated name
    ``graph`` to the exception instance, and then isinstance-asserted on it —
    an error-prone idiom that also silently passes nothing if the exception
    type changes.  ``pytest.raises`` states the expectation directly.
    """
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()

    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        # noinspection PyUnusedLocal
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )
        # "column_values.not_a_metric" is not a registered metric, so graph
        # construction must fail with MetricProviderError.
        with pytest.raises(MetricProviderError):
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph,
                MetricConfiguration("column_values.not_a_metric", IDDict()),
                configuration,
                execution_engine=engine,
            )
def id(self) -> str:
    """Return a stable identifier derived from this object's JSON representation."""
    serialized = self.to_json_dict()
    return IDDict(serialized).to_id()
def resolve_metric_bundle(
    self,
    metric_fn_bundle: Iterable[Tuple[MetricConfiguration, Any, dict, dict]],
) -> Dict[Tuple[str, str, str], Any]:
    """For every metric in a set of Metrics to resolve, obtains necessary metric
    keyword arguments and builds bundles of the metrics into one large query
    dictionary so that they are all executed simultaneously. Will fail if
    bundling the metrics together is not possible.

    Args:
        metric_fn_bundle (Iterable[Tuple[MetricConfiguration, Callable, dict]): \
            A Dictionary containing a MetricProvider's MetricConfiguration (its unique identifier),
            its metric provider function (the function that actually executes the metric),
            and the arguments to pass to the metric provider function.
            A dictionary of metrics defined in the registry and corresponding arguments

    Returns:
        A dictionary of metric names and their corresponding now-queried values.
    """
    resolved_metrics = {}

    # We need a different query for each domain (where clause).
    # Keyed by the ID of the compute-domain kwargs, so all metrics that share
    # a domain are computed in a single SELECT.
    queries: Dict[Tuple, dict] = {}
    for (
        metric_to_resolve,
        engine_fn,
        compute_domain_kwargs,
        accessor_domain_kwargs,  # unused here; carried through the bundle tuple
        metric_provider_kwargs,  # unused here; carried through the bundle tuple
    ) in metric_fn_bundle:
        if not isinstance(compute_domain_kwargs, IDDict):
            compute_domain_kwargs = IDDict(compute_domain_kwargs)
        domain_id = compute_domain_kwargs.to_id()
        if domain_id not in queries:
            queries[domain_id] = {
                "select": [],
                "ids": [],
                "domain_kwargs": compute_domain_kwargs,
            }
        if self.engine.dialect.name == "clickhouse":
            # NOTE(review): str.join here wraps the metric name with one random
            # lowercase character on each side (e.g. "x<metric_name>y") —
            # presumably to keep otherwise-duplicate column labels unique on
            # ClickHouse; confirm intent before changing.
            queries[domain_id]["select"].append(
                engine_fn.label(
                    metric_to_resolve.metric_name.join(
                        random.choices(string.ascii_lowercase, k=2)
                    )
                )
            )
        else:
            queries[domain_id]["select"].append(
                engine_fn.label(metric_to_resolve.metric_name)
            )
        queries[domain_id]["ids"].append(metric_to_resolve.id)

    for query in queries.values():
        domain_kwargs = query["domain_kwargs"]
        selectable = self.get_domain_records(
            domain_kwargs=domain_kwargs,
        )
        # One selected column per metric id — they are zipped back together below.
        assert len(query["select"]) == len(query["ids"])
        try:
            """
            If a custom query is passed, selectable will be TextClause and not
            formatted as a subquery wrapped in "(subquery) alias". TextClause
            must first be converted to TextualSelect using sa.columns() before
            it can be converted to type Subquery
            """
            # NOTE(review): the truthiness guard suggests TextClause may be
            # None when the sqlalchemy import is unavailable — confirm.
            if TextClause and isinstance(selectable, TextClause):
                res = self.engine.execute(
                    sa.select(query["select"]).select_from(
                        selectable.columns().subquery()
                    )
                ).fetchall()
            else:
                res = self.engine.execute(
                    sa.select(query["select"]).select_from(selectable)
                ).fetchall()
            logger.debug(
                f"SqlAlchemyExecutionEngine computed {len(res[0])} metrics on domain_id {IDDict(domain_kwargs).to_id()}"
            )
        except OperationalError as oe:
            # Translate low-level SQL failures into the engine's own error type,
            # preserving the original message and traceback for diagnostics.
            exception_message: str = "An SQL execution Exception occurred. "
            exception_traceback: str = traceback.format_exc()
            exception_message += f'{type(oe).__name__}: "{str(oe)}". Traceback: "{exception_traceback}".'
            logger.error(exception_message)
            raise ExecutionEngineError(message=exception_message)
        # Bundled metrics are aggregates: the query must yield exactly one row.
        assert (
            len(res) == 1
        ), "all bundle-computed metrics must be single-value statistics"
        assert len(query["ids"]) == len(
            res[0]
        ), "unexpected number of metrics returned"
        for idx, id in enumerate(query["ids"]):
            resolved_metrics[id] = convert_to_json_serializable(res[0][idx])
    return resolved_metrics
def resolve_metric_bundle(
    self,
    metric_fn_bundle: Iterable[Tuple[MetricConfiguration, Any, dict, dict]],
) -> dict:
    """For every metric in a set of Metrics to resolve, obtains necessary metric
    keyword arguments and bundles the metrics into one large query dictionary
    so that they are all executed simultaneously. Will fail if bundling the
    metrics together is not possible.

    Args:
        metric_fn_bundle (Iterable[Tuple[MetricConfiguration, Callable, dict]): \
            A Dictionary containing a MetricProvider's MetricConfiguration (its unique identifier),
            its metric provider function (the function that actually executes the metric),
            and the arguments to pass to the metric provider function.
        metrics (Dict[Tuple, Any]): \
            A dictionary of metrics defined in the registry and corresponding arguments

    Returns:
        A dictionary of metric names and their corresponding now-queried values.
    """
    resolved_metrics = dict()

    # We need a different query for each domain (where clause).
    # Keyed by the ID of the compute-domain kwargs, so all metrics that share
    # a domain are computed in a single SELECT.
    queries: Dict[Tuple, dict] = dict()
    for (
        metric_to_resolve,
        engine_fn,
        compute_domain_kwargs,
        accessor_domain_kwargs,  # unused here; carried through the bundle tuple
        metric_provider_kwargs,  # unused here; carried through the bundle tuple
    ) in metric_fn_bundle:
        if not isinstance(compute_domain_kwargs, IDDict):
            compute_domain_kwargs = IDDict(compute_domain_kwargs)
        domain_id = compute_domain_kwargs.to_id()
        if domain_id not in queries:
            queries[domain_id] = {
                "select": [],
                "ids": [],
                "domain_kwargs": compute_domain_kwargs,
            }
        queries[domain_id]["select"].append(
            engine_fn.label(metric_to_resolve.metric_name)
        )
        queries[domain_id]["ids"].append(metric_to_resolve.id)

    for query in queries.values():
        # Resolve the domain kwargs to a selectable (table/subquery) to run
        # the bundled SELECT against.
        selectable, compute_domain_kwargs, _ = self.get_compute_domain(
            query["domain_kwargs"], domain_type="identity"
        )
        # One selected column per metric id — they are zipped back together below.
        assert len(query["select"]) == len(query["ids"])
        res = self.engine.execute(
            sa.select(query["select"]).select_from(selectable)
        ).fetchall()
        logger.debug(
            f"SqlAlchemyExecutionEngine computed {len(res[0])} metrics on domain_id {IDDict(compute_domain_kwargs).to_id()}"
        )
        # Bundled metrics are aggregates: the query must yield exactly one row.
        assert (
            len(res) == 1
        ), "all bundle-computed metrics must be single-value statistics"
        assert len(query["ids"]) == len(
            res[0]
        ), "unexpected number of metrics returned"
        for idx, id in enumerate(query["ids"]):
            resolved_metrics[id] = convert_to_json_serializable(
                res[0][idx]
            )  # Convert metrics to be serializable
    return resolved_metrics
data_connector_name=data_connector_name, data_asset_name=data_asset_name, ) batch_definition_list: List[ BatchDefinition] = data_connector.get_batch_definition_list_from_batch_request( batch_request) assert len( batch_definition_list) == test_case.num_expected_batch_definitions expected_batch_definition_list: List[BatchDefinition] = [ BatchDefinition( datasource_name=datasource_name, data_connector_name=data_connector_name, data_asset_name=data_asset_name, batch_identifiers=IDDict({column_name: pickup_datetime}), ) for pickup_datetime in test_case.expected_pickup_datetimes ] assert set(batch_definition_list) == set( expected_batch_definition_list ), f"BatchDefinition lists don't match\n\nbatch_definition_list:\n{batch_definition_list}\n\nexpected_batch_definition_list:\n{expected_batch_definition_list}" # 4. Check that loaded data is as expected # Use expected_batch_definition_list since it is sorted, and we already # asserted that it contains the same items as batch_definition_list batch_spec: SqlAlchemyDatasourceBatchSpec = data_connector.build_batch_spec( expected_batch_definition_list[0]) batch_data: SqlAlchemyBatchData = context.datasources[
def test_return_all_batch_definitions_returns_specified_partition(
    mock_gcs_conn, mock_list_keys, mock_emit, empty_data_context_stats_enabled
):
    """A fully-specified data_connector_query must select exactly one batch.

    Configures a ConfiguredAssetGCSDataConnector over ten mocked keys, then
    requests the single partition (james, 20200713, 1567) and verifies that
    only the matching BatchDefinition is returned.
    """
    connector_config = yaml.load(
        f"""
class_name: ConfiguredAssetGCSDataConnector
datasource_name: test_environment
bucket_or_name: my_bucket
prefix: ""
assets:
    TestFiles:
default_regex:
    pattern: (.+)_(.+)_(.+)\\.csv
    group_names:
        - name
        - timestamp
        - price
sorters:
    - orderby: asc
      class_name: LexicographicSorter
      name: name
    - datetime_format: "%Y%m%d"
      orderby: desc
      class_name: DateTimeSorter
      name: timestamp
    - orderby: desc
      class_name: NumericSorter
      name: price
""",
    )
    mock_list_keys.return_value = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]

    connector: ConfiguredAssetGCSDataConnector = instantiate_class_from_config(
        config=connector_config,
        runtime_environment={
            "name": "general_gcs_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )

    # Sanity-check the connector sees every mocked key before querying.
    report = connector.self_check()
    assert report["class_name"] == "ConfiguredAssetGCSDataConnector"
    assert report["data_asset_count"] == 1
    assert report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert report["unmatched_data_reference_count"] == 0

    batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_gcs_data_connector",
        data_asset_name="TestFiles",
        data_connector_query=IDDict(
            **{
                "batch_filter_parameters": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )
    matched_definitions = connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )
    assert len(matched_definitions) == 1

    expected_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_gcs_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(
            **{
                "name": "james",
                "timestamp": "20200713",
                "price": "1567",
            }
        ),
    )
    assert matched_definitions[0] == expected_definition
def expected_batch_definitions_unsorted():
    """
    Used to validate `get_batch_definition_list_from_batch_request()` outputs.
    Input and output should maintain the same order (hence "unsorted").
    """
    # (name, timestamp, price) triples in the original, unsorted order.
    identifier_triples = [
        ("alex", "20200809", "1000"),
        ("eugene", "20200809", "1500"),
        ("james", "20200811", "1009"),
        ("abe", "20200809", "1040"),
        ("will", "20200809", "1002"),
        ("james", "20200713", "1567"),
        ("eugene", "20201129", "1900"),
        ("will", "20200810", "1001"),
        ("james", "20200810", "1003"),
        ("alex", "20200819", "1300"),
    ]
    return [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": name, "timestamp": timestamp, "price": price}
            ),
        )
        for name, timestamp, price in identifier_triples
    ]
def expected_batch_definitions_sorted():
    """
    Used to validate `get_batch_definition_list_from_batch_request()` outputs.
    Input should be sorted based on some criteria, resulting in some change
    between input and output.
    """
    # (name, timestamp, price) triples in the expected sorted order
    # (name asc, then timestamp desc, then price desc).
    identifier_triples = [
        ("abe", "20200809", "1040"),
        ("alex", "20200819", "1300"),
        ("alex", "20200809", "1000"),
        ("eugene", "20201129", "1900"),
        ("eugene", "20200809", "1500"),
        ("james", "20200811", "1009"),
        ("james", "20200810", "1003"),
        ("james", "20200713", "1567"),
        ("will", "20200810", "1001"),
        ("will", "20200809", "1002"),
    ]
    return [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_gcs_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": name, "timestamp": timestamp, "price": price}
            ),
        )
        for name, timestamp, price in identifier_triples
    ]