def test_instantiation_with_test_yaml_config(
    mock_emit, caplog, empty_data_context_stats_enabled
):
    """test_yaml_config() on a store config should emit exactly one valid
    ``data_context.test_yaml_config`` usage-stats event with the store's
    parent class and anonymized backend info.
    """
    empty_data_context_stats_enabled.test_yaml_config(
        yaml_config="""
module_name: great_expectations.data_context.store
class_name: EvaluationParameterStore
"""
    )
    assert mock_emit.call_count == 1
    # Substitute current anonymized name since it changes for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"
    ]
    assert mock_emit.call_args_list == [
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "anonymized_name": anonymized_name,
                    "parent_class": "EvaluationParameterStore",
                    "anonymized_store_backend": {
                        "parent_class": "InMemoryStoreBackend"
                    },
                },
                "success": True,
            }
        ),
    ]
    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
def test_test_yaml_config_usage_stats_substitution_error(
    mock_emit, caplog, empty_data_context_stats_enabled
):
    """When config-variable substitution fails (``$IDONTEXIST`` is undefined),
    test_yaml_config() should raise MissingConfigVariableError and still emit a
    single failure event carrying the ``__substitution_error__`` diagnostic.
    """
    with pytest.raises(ge_exceptions.MissingConfigVariableError):
        _ = empty_data_context_stats_enabled.test_yaml_config(
            yaml_config="""
module_name: great_expectations.data_context.store.expectations_store
class_name: ExpectationsStore
store_backend:
    module_name: "great_expectations.data_context.store.store_backend"
    class_name: InMemoryStoreBackend
error_on_substitution: $IDONTEXIST
"""
        )
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "diagnostic_info": ["__substitution_error__"]
                },
                "success": False,
            }
        ),
    ]
    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
def test_test_yaml_config_usage_stats_custom_type_not_ge_subclass(
    mock_emit, caplog, empty_data_context_stats_enabled
):
    """
    What does this test and why?
    We should be able to discern the GE parent class for a custom type and
    construct a useful usage stats event message.
    """
    context: DataContext = empty_data_context_stats_enabled
    yaml_config = """
module_name: tests.data_context.fixtures.plugins
class_name: MyCustomNonCoreGeClass
"""
    _ = context.test_yaml_config(yaml_config=yaml_config)

    # A class outside the GE hierarchy is reported via diagnostic info only.
    expected_call = mock.call(
        {
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "diagnostic_info": ["__custom_subclass_not_core_ge__"]
            },
            "success": True,
        }
    )
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [expected_call]

    # Logs must be free of usage-stats exceptions and invalid messages.
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
def test_test_yaml_config_usage_stats_custom_config_class_name_not_provided(
    mock_emit, caplog, empty_data_context_stats_enabled
):
    """
    What does this test and why?
    If a class_name is not provided, and we have run into an error state in
    test_yaml_config() (likely because of the missing class_name) then we
    should report descriptive diagnostic info.
    This should be the case even if we are passing in a custom config.
    """
    data_context: DataContext = empty_data_context_stats_enabled
    with pytest.raises(Exception):
        _ = data_context.test_yaml_config(
            yaml_config="""
module_name: tests.data_context.fixtures.plugins.my_custom_expectations_store
store_backend:
    module_name: great_expectations.data_context.store.store_backend
    class_name: InMemoryStoreBackend
"""
        )
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "diagnostic_info": ["__class_name_not_provided__"],
                },
                "success": False,
            }
        ),
    ]
    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
def test_test_yaml_config_usage_stats_class_name_not_provided(
    mock_emit, caplog, empty_data_context_stats_enabled
):
    """
    What does this test and why?
    If a class_name is not provided, and we have run into an error state in
    test_yaml_config() (likely because of the missing class_name) then we
    should report descriptive diagnostic info.
    """
    yaml_config = """
module_name: great_expectations.data_context.store.expectations_store
"""
    with pytest.raises(Exception):
        # noinspection PyUnusedLocal
        my_expectation_store = empty_data_context_stats_enabled.test_yaml_config(
            yaml_config=yaml_config
        )

    expected_call = mock.call(
        {
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "diagnostic_info": ["__class_name_not_provided__"]
            },
            "success": False,
        }
    )
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [expected_call]

    # Logs must be free of usage-stats exceptions and invalid messages.
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
def test_usage_statistics_handler_validate_message_success(
    caplog,
    in_memory_data_context_config_usage_stats_enabled,
    sample_partial_message,
):
    """A message built via build_envelope() from a sample partial message must
    validate against the anonymized usage-statistics record schema, with no
    invalid-message entries in the debug log.
    """
    # caplog default is WARNING and above, we want to see DEBUG level messages for this test
    caplog.set_level(
        level=logging.DEBUG,
        logger="great_expectations.core.usage_statistics.usage_statistics",
    )

    context: BaseDataContext = BaseDataContext(
        in_memory_data_context_config_usage_stats_enabled
    )

    usage_statistics_handler = UsageStatisticsHandler(
        data_context=context,
        data_context_id=in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics.data_context_id,
        usage_statistics_url=in_memory_data_context_config_usage_stats_enabled.anonymous_usage_statistics.usage_statistics_url,
    )

    # The handler must pick up the fixture's fixed data_context_id verbatim.
    assert (
        usage_statistics_handler._data_context_id
        == "00000000-0000-0000-0000-000000000001"
    )

    envelope = usage_statistics_handler.build_envelope(sample_partial_message)
    validated_message = usage_statistics_handler.validate_message(
        envelope, anonymized_usage_statistics_record_schema
    )
    assert validated_message
    assert not usage_stats_invalid_messages_exist(caplog.messages)
def test_rule_based_profiler_emits_valid_usage_stats(
    mock_emit, caplog, empty_data_context_stats_enabled, test_df, tmp_path_factory
):
    """test_yaml_config() on a RuleBasedProfiler config should emit a single
    valid event whose parent_class is RuleBasedProfiler.
    """
    context = empty_data_context_stats_enabled
    yaml_config = """
name: my_profiler
class_name: RuleBasedProfiler
module_name: great_expectations.rule_based_profiler
config_version: 1.0

variables:
  integer_type: INTEGER
  timestamp_type: TIMESTAMP
  max_user_id: 999999999999
  min_timestamp: 2004-10-19 10:23:54

rules:
  my_rule_for_user_ids:
    domain_builder:
      class_name: TableDomainBuilder
    expectation_configuration_builders:
      - expectation_type: expect_column_values_to_be_of_type
        class_name: DefaultExpectationConfigurationBuilder
"""
    # class_name="Profiler" overrides the attribute used for event routing
    # (distinct from the class_name inside the YAML itself).
    context.test_yaml_config(
        yaml_config=yaml_config, name="my_profiler", class_name="Profiler"
    )

    # Substitute anonymized name since it changes for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"
    ]

    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "anonymized_name": anonymized_name,
                    "parent_class": "RuleBasedProfiler",
                },
                "success": True,
            }
        )
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
def test_test_yaml_config_usage_stats_custom_type(
    mock_emit, caplog, empty_data_context_stats_enabled
):
    """
    What does this test and why?
    We should be able to discern the GE parent class for a custom type and
    construct a useful usage stats event message.
    """
    data_context: DataContext = empty_data_context_stats_enabled
    _ = data_context.test_yaml_config(
        yaml_config="""
module_name: tests.data_context.fixtures.plugins
class_name: MyCustomExpectationsStore
store_backend:
    module_name: great_expectations.data_context.store.store_backend
    class_name: InMemoryStoreBackend
"""
    )
    assert mock_emit.call_count == 1
    # Substitute anonymized name & class since it changes for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"
    ]
    anonymized_class = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_class"
    ]
    assert mock_emit.call_args_list == [
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "anonymized_name": anonymized_name,
                    # Custom subclass resolves to its core-GE parent class.
                    "parent_class": "ExpectationsStore",
                    "anonymized_class": anonymized_class,
                    "anonymized_store_backend": {
                        "parent_class": "InMemoryStoreBackend"
                    },
                },
                "success": True,
            }
        ),
    ]
    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
def test_usage_stats_invalid_messages_exist(test_input, test_output):
    # Parametrized check: the detector's verdict must match the expected flag.
    actual = usage_stats_invalid_messages_exist(messages=test_input)
    assert actual == test_output
def test_test_yaml_config_usage_stats_simple_sqlalchemy_datasource_subclass(
    mock_emit, caplog, sa, test_backends, empty_data_context_stats_enabled
):
    """
    What does this test and why?
    We should be able to discern the GE parent class for a custom type and
    construct a useful usage stats event message. This should be true for
    SimpleSqlalchemyDatasources.
    """
    if "postgresql" not in test_backends:
        pytest.skip(
            "test_test_yaml_config_usage_stats_simple_sqlalchemy_datasource_subclass requires postgresql"
        )
    data_context: DataContext = empty_data_context_stats_enabled
    _ = data_context.test_yaml_config(
        yaml_config="""
module_name: tests.data_context.fixtures.plugins.my_custom_simple_sqlalchemy_datasource_class
class_name: MyCustomSimpleSqlalchemyDatasource
name: some_name
introspection:
  whole_table:
    data_asset_name_suffix: __whole_table
credentials:
  drivername: postgresql
  host: localhost
  port: '5432'
  username: postgres
  password: ''
  database: postgres
"""
    )
    assert mock_emit.call_count == 1
    # Substitute anonymized name & class since it changes for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"
    ]
    anonymized_class = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_class"
    ]
    anonymized_data_connector_name = mock_emit.call_args_list[0][0][0][
        "event_payload"
    ]["anonymized_data_connectors"][0]["anonymized_name"]
    assert mock_emit.call_args_list == [
        mock.call(
            {
                "event": "data_context.test_yaml_config",
                "event_payload": {
                    "anonymized_name": anonymized_name,
                    "parent_class": "SimpleSqlalchemyDatasource",
                    "anonymized_class": anonymized_class,
                    "anonymized_execution_engine": {
                        "parent_class": "SqlAlchemyExecutionEngine"
                    },
                    "anonymized_data_connectors": [
                        {
                            "anonymized_name": anonymized_data_connector_name,
                            "parent_class": "InferredAssetSqlDataConnector",
                        }
                    ],
                },
                "success": True,
            }
        ),
    ]
    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
def test_common_usage_stats_are_sent_no_mocking(
    caplog, in_memory_data_context_config_usage_stats_enabled, monkeypatch
):
    """
    What does this test and why?
    Our usage stats events are tested elsewhere in several ways (sending example events,
    validating sample events, throughout other tests ensuring the right events are sent,
    anonymization, opt-out, etc). This specific test is to ensure that there are no errors
    with the machinery to send the events in the UsageStatisticsHandler by running code
    that emits events and checking for errors in the log. This test purposely does not
    mock any part of the usage stats system to ensure the full code path is run, and sends
    events to the QA endpoint. This test uses both methods decorated with
    usage_statistics_enabled_method and those that send events directly.
    """
    # caplog default is WARNING and above, we want to see DEBUG level messages for this test
    caplog.set_level(
        level=logging.DEBUG,
        logger="great_expectations.core.usage_statistics.usage_statistics",
    )

    # Make sure usage stats are enabled
    monkeypatch.delenv(
        "GE_USAGE_STATS", raising=False
    )  # Undo the project-wide test default
    assert os.getenv("GE_USAGE_STATS") is None

    context: BaseDataContext = BaseDataContext(
        in_memory_data_context_config_usage_stats_enabled
    )

    # Note, we lose the `data_context.__init__` event because it was emitted before closing the worker
    context._usage_statistics_handler._close_worker()

    # Make sure usage stats are enabled
    assert not context._check_global_usage_statistics_opt_out()
    assert context.anonymous_usage_statistics.enabled
    assert context.anonymous_usage_statistics.data_context_id == DATA_CONTEXT_ID

    # Note module_name fields are omitted purposely to ensure we are still able to send events
    datasource_yaml = f"""
name: example_datasource
class_name: Datasource
module_name: great_expectations.datasource
execution_engine:
  # module_name: great_expectations.execution_engine
  class_name: PandasExecutionEngine
data_connectors:
    default_runtime_data_connector_name:
        class_name: RuntimeDataConnector
        # module_name: great_expectations.datasource.data_connector
        batch_identifiers:
            - default_identifier_name
"""

    # context.test_yaml_config() uses send_usage_message()
    context.test_yaml_config(yaml_config=datasource_yaml)
    expected_events: List[str] = ["data_context.test_yaml_config"]

    context.add_datasource(**yaml.load(datasource_yaml))
    expected_events.append("data_context.add_datasource")
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    batch_request = RuntimeBatchRequest(
        datasource_name="example_datasource",
        data_connector_name="default_runtime_data_connector_name",
        data_asset_name="my_data_asset",  # This can be anything that identifies this data_asset for you
        runtime_parameters={"batch_data": df},  # df is your dataframe
        batch_identifiers={"default_identifier_name": "default_identifier"},
    )
    context.create_expectation_suite(
        expectation_suite_name="test_suite", overwrite_existing=True
    )
    validator = context.get_validator(
        batch_request=batch_request, expectation_suite_name="test_suite"
    )
    expected_events.append("data_context.get_batch_list")
    validator.expect_table_row_count_to_equal(value=2)
    validator.save_expectation_suite()
    expected_events.append("data_context.save_expectation_suite")

    checkpoint_yaml = """
name: my_checkpoint
config_version: 1
class_name: SimpleCheckpoint
validations:
  - batch_request:
      datasource_name: example_datasource
      data_connector_name: default_runtime_data_connector_name
      data_asset_name: my_data_asset
    expectation_suite_name: test_suite
"""
    context.test_yaml_config(yaml_config=checkpoint_yaml)
    expected_events.append("data_context.test_yaml_config")

    # Note: add_checkpoint is not instrumented as of 20211215
    context.add_checkpoint(**yaml.safe_load(checkpoint_yaml))

    context.run_checkpoint(
        checkpoint_name="my_checkpoint",
        batch_request={
            "runtime_parameters": {"batch_data": df},
            "batch_identifiers": {"default_identifier_name": "my_simple_df"},
        },
    )
    # run_checkpoint fans out into several instrumented calls, in this order:
    expected_events.append("data_context.get_batch_list")
    expected_events.append("data_asset.validate")
    expected_events.append("data_context.build_data_docs")
    expected_events.append("checkpoint.run")
    expected_events.append("data_context.run_checkpoint")

    assert not usage_stats_exceptions_exist(messages=caplog.messages)

    message_queue = context._usage_statistics_handler._message_queue.queue
    events = [event["event"] for event in message_queue]

    # Note: expected events does not contain the `data_context.__init__` event
    assert events == expected_events
    assert not usage_stats_invalid_messages_exist(caplog.messages)