Example #1
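# These excerpts come from Great Expectations' usage statistics test suite and
# are shown without their imports. A best-effort reconstruction follows; the
# exact module paths are assumptions, not verbatim from the source file.
import logging
import os
from typing import List
from unittest import mock

import pandas as pd
import pytest
import yaml

import great_expectations.exceptions as ge_exceptions
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.core.usage_statistics.usage_statistics import (
    UsageStatisticsHandler,
)
from great_expectations.data_context import BaseDataContext, DataContext

# usage_stats_exceptions_exist, usage_stats_invalid_messages_exist,
# anonymized_usage_statistics_record_schema, DATA_CONTEXT_ID, and the fixtures
# (mock_emit, caplog, empty_data_context_stats_enabled, etc.) come from the
# repository's test utilities and conftest files.
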
def test_instantiation_with_test_yaml_config(mock_emit, caplog,
                                             empty_data_context_stats_enabled):
    empty_data_context_stats_enabled.test_yaml_config(yaml_config="""
module_name: great_expectations.data_context.store
class_name: EvaluationParameterStore
""")
    assert mock_emit.call_count == 1
    # Substitute current anonymized name since it changes for each run
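    # (call_args_list[0][0][0] = first call -> positional args tuple -> first arg)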
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"]
    assert mock_emit.call_args_list == [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "anonymized_name": anonymized_name,
                "parent_class": "EvaluationParameterStore",
                "anonymized_store_backend": {
                    "parent_class": "InMemoryStoreBackend"
                },
            },
            "success": True,
        }),
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
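
# All of these tests lean on a mock_emit fixture that is not shown in this
# excerpt. A minimal sketch of such a fixture, assuming the handler's emit
# method is patched at this import path (the exact target is an assumption):
@pytest.fixture
def mock_emit_sketch():
    # Patch emit so tests can assert on the exact event payloads without
    # sending anything over the network.
    with mock.patch(
        "great_expectations.core.usage_statistics.usage_statistics"
        ".UsageStatisticsHandler.emit"
    ) as mocked:
        yield mocked
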
def test_test_yaml_config_usage_stats_substitution_error(
        mock_emit, caplog, empty_data_context_stats_enabled):
    with pytest.raises(ge_exceptions.MissingConfigVariableError):
        _ = empty_data_context_stats_enabled.test_yaml_config(yaml_config="""
module_name: great_expectations.data_context.store.expectations_store
class_name: ExpectationsStore
store_backend:
    module_name: "great_expectations.data_context.store.store_backend"
    class_name: InMemoryStoreBackend
    error_on_substitution: $IDONTEXIST
    """)
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "diagnostic_info": ["__substitution_error__"]
            },
            "success": False,
        }),
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
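
# For context: $IDONTEXIST above is a config-variable reference that GE tries
# to resolve at load time (from config_variables.yml or the environment), and
# an unresolvable name raises MissingConfigVariableError. A rough sketch of
# the idea, not GE's actual implementation:
import re


def substitute_config_variable_sketch(value: str) -> str:
    # Resolve "$NAME" or "${NAME}" from the environment; an undefined name
    # raises, mirroring the substitution-error path exercised above.
    match = re.fullmatch(r"\$\{?(\w+)\}?", value)
    if match is None:
        return value
    resolved = os.getenv(match.group(1))
    if resolved is None:
        raise LookupError(f"Unable to substitute config variable: {match.group(1)}")
    return resolved
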
def test_test_yaml_config_usage_stats_custom_type_not_ge_subclass(
        mock_emit, caplog, empty_data_context_stats_enabled):
    """
    What does this test and why?
    We should be able to discern the GE parent class for a custom type and construct
    a useful usage stats event message.
    """
    data_context: DataContext = empty_data_context_stats_enabled
    _ = data_context.test_yaml_config(yaml_config="""
module_name: tests.data_context.fixtures.plugins
class_name: MyCustomNonCoreGeClass
""")
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "diagnostic_info": ["__custom_subclass_not_core_ge__"]
            },
            "success": True,
        }),
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
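
# The plugin fixture referenced above is not shown; its essential property is
# that it does not inherit from any core GE class. A hypothetical minimal shape:
class MyCustomNonCoreGeClassSketch:
    # No core GE base class, so test_yaml_config cannot determine a
    # parent_class and reports __custom_subclass_not_core_ge__ instead.
    def __init__(self, **kwargs):
        pass
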
def test_test_yaml_config_usage_stats_custom_config_class_name_not_provided(
        mock_emit, caplog, empty_data_context_stats_enabled):
    """
    What does this test and why?
    If a class_name is not provided, and we have run into an error state in test_yaml_config() (likely because of the missing class_name) then we should report descriptive diagnostic info.
    This should be the case even if we are passing in a custom config.
    """
    data_context: DataContext = empty_data_context_stats_enabled
    with pytest.raises(Exception):
        _ = data_context.test_yaml_config(yaml_config="""
module_name: tests.data_context.fixtures.plugins.my_custom_expectations_store
store_backend:
    module_name: great_expectations.data_context.store.store_backend
    class_name: InMemoryStoreBackend
""")
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "diagnostic_info": ["__class_name_not_provided__"],
            },
            "success": False,
        }),
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)


def test_test_yaml_config_usage_stats_class_name_not_provided(
        mock_emit, caplog, empty_data_context_stats_enabled):
    """
    What does this test and why?
    If a class_name is not provided, and we have run into an error state in test_yaml_config() (likely because of the missing class_name) then we should report descriptive diagnostic info.
    """
    with pytest.raises(Exception):
        # noinspection PyUnusedLocal
        my_expectation_store = empty_data_context_stats_enabled.test_yaml_config(
            yaml_config="""
module_name: great_expectations.data_context.store.expectations_store

    """)
    assert mock_emit.call_count == 1
    assert mock_emit.call_args_list == [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "diagnostic_info": ["__class_name_not_provided__"]
            },
            "success": False,
        }),
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)


def test_usage_statistics_handler_validate_message_success(
        caplog, in_memory_data_context_config_usage_stats_enabled,
        sample_partial_message):

    # caplog default is WARNING and above, we want to see DEBUG level messages for this test
    caplog.set_level(
        level=logging.DEBUG,
        logger="great_expectations.core.usage_statistics.usage_statistics",
    )

    config = in_memory_data_context_config_usage_stats_enabled
    context: BaseDataContext = BaseDataContext(config)

    usage_statistics_handler = UsageStatisticsHandler(
        data_context=context,
        data_context_id=config.anonymous_usage_statistics.data_context_id,
        usage_statistics_url=config.anonymous_usage_statistics.usage_statistics_url,
    )

    assert (usage_statistics_handler._data_context_id ==
            "00000000-0000-0000-0000-000000000001")

    envelope = usage_statistics_handler.build_envelope(sample_partial_message)
    validated_message = usage_statistics_handler.validate_message(
        envelope, anonymized_usage_statistics_record_schema)

    assert validated_message
    assert not usage_stats_invalid_messages_exist(caplog.messages)
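
# validate_message checks the built envelope against the usage statistics JSON
# schema; conceptually it is a thin wrapper over jsonschema validation.
# A sketch of that behavior, assuming the jsonschema package:
import jsonschema


def validate_message_sketch(envelope: dict, schema: dict) -> bool:
    # Return True when the envelope conforms to the schema, False on a
    # violation (rather than raising), matching the assertion style above.
    try:
        jsonschema.validate(instance=envelope, schema=schema)
        return True
    except jsonschema.exceptions.ValidationError:
        return False
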
def test_rule_based_profiler_emits_valid_usage_stats(
        mock_emit, caplog, empty_data_context_stats_enabled, test_df,
        tmp_path_factory):
    context = empty_data_context_stats_enabled
    yaml_config = """
    name: my_profiler
    class_name: RuleBasedProfiler
    module_name: great_expectations.rule_based_profiler
    config_version: 1.0
    variables:
      integer_type: INTEGER
      timestamp_type: TIMESTAMP
      max_user_id: 999999999999
      min_timestamp: 2004-10-19 10:23:54
    rules:
      my_rule_for_user_ids:
        domain_builder:
          class_name: TableDomainBuilder
        expectation_configuration_builders:
          - expectation_type: expect_column_values_to_be_of_type
            class_name: DefaultExpectationConfigurationBuilder
    """
    context.test_yaml_config(yaml_config=yaml_config,
                             name="my_profiler",
                             class_name="Profiler")

    assert mock_emit.call_count == 1
    # Substitute anonymized name since it changes for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"]
    assert mock_emit.call_args_list == [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "anonymized_name": anonymized_name,
                "parent_class": "RuleBasedProfiler",
            },
            "success": True,
        })
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
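
# The anonymized_name fields substituted throughout are salted one-way hashes.
# GE's Anonymizer is roughly equivalent to the following sketch (the exact
# salting scheme is an assumption):
import hashlib


def anonymize_sketch(salt: str, string_to_anonymize: str) -> str:
    # md5 digest of salt + name: stable within one data context (same salt)
    # but not reversible to the original name.
    return hashlib.md5((salt + string_to_anonymize).encode("utf-8")).hexdigest()
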
def test_test_yaml_config_usage_stats_custom_type(
        mock_emit, caplog, empty_data_context_stats_enabled):
    """
    What does this test and why?
    We should be able to discern the GE parent class for a custom type and construct
    a useful usage stats event message.
    """
    data_context: DataContext = empty_data_context_stats_enabled
    _ = data_context.test_yaml_config(yaml_config="""
module_name: tests.data_context.fixtures.plugins
class_name: MyCustomExpectationsStore
store_backend:
    module_name: great_expectations.data_context.store.store_backend
    class_name: InMemoryStoreBackend
""")
    assert mock_emit.call_count == 1
    # Substitute anonymized name & class since they change for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"]
    anonymized_class = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_class"]
    assert mock_emit.call_args_list == [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "anonymized_name": anonymized_name,
                "parent_class": "ExpectationsStore",
                "anonymized_class": anonymized_class,
                "anonymized_store_backend": {
                    "parent_class": "InMemoryStoreBackend"
                },
            },
            "success": True,
        }),
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)
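
# Unlike the non-core case earlier, this fixture subclasses a core GE store,
# so usage stats can report the GE parent_class alongside an anonymized class
# hash. A hypothetical minimal shape:
from great_expectations.data_context.store import ExpectationsStore


class MyCustomExpectationsStoreSketch(ExpectationsStore):
    # Subclassing a core GE type lets the event report
    # parent_class="ExpectationsStore" plus an anonymized_class hash.
    pass
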
Example #9
def test_usage_stats_invalid_messages_exist(test_input, test_output):
    assert usage_stats_invalid_messages_exist(messages=test_input) == test_output
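
# The test above is driven by @pytest.mark.parametrize in the source
# repository; the parameter sets are not shown in this excerpt.
# usage_stats_invalid_messages_exist itself is a test utility that scans
# captured log lines for the validator's invalid-message marker. Roughly
# (the marker string here is illustrative, not the repository's exact text):
def usage_stats_invalid_messages_exist_sketch(messages) -> bool:
    return any("invalid message" in message.lower() for message in messages)
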
def test_test_yaml_config_usage_stats_simple_sqlalchemy_datasource_subclass(
        mock_emit, caplog, sa, test_backends,
        empty_data_context_stats_enabled):
    """
    What does this test and why?
    We should be able to discern the GE parent class for a custom type and construct
    a useful usage stats event message. This should be true for SimpleSqlalchemyDatasources.
    """

    if "postgresql" not in test_backends:
        pytest.skip(
            "test_test_yaml_config_usage_stats_simple_sqlalchemy_datasource_subclass requires postgresql"
        )

    data_context: DataContext = empty_data_context_stats_enabled
    _ = data_context.test_yaml_config(yaml_config="""
module_name: tests.data_context.fixtures.plugins.my_custom_simple_sqlalchemy_datasource_class
class_name: MyCustomSimpleSqlalchemyDatasource
name: some_name
introspection:
  whole_table:
    data_asset_name_suffix: __whole_table
credentials:
  drivername: postgresql
  host: localhost
  port: '5432'
  username: postgres
  password: ''
  database: postgres
""")
    assert mock_emit.call_count == 1
    # Substitute anonymized name & class since they change for each run
    anonymized_name = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_name"]
    anonymized_class = mock_emit.call_args_list[0][0][0]["event_payload"][
        "anonymized_class"]
    anonymized_data_connector_name = mock_emit.call_args_list[0][0][0][
        "event_payload"]["anonymized_data_connectors"][0]["anonymized_name"]
    assert mock_emit.call_args_list == [
        mock.call({
            "event": "data_context.test_yaml_config",
            "event_payload": {
                "anonymized_name": anonymized_name,
                "parent_class": "SimpleSqlalchemyDatasource",
                "anonymized_class": anonymized_class,
                "anonymized_execution_engine": {
                    "parent_class": "SqlAlchemyExecutionEngine"
                },
                "anonymized_data_connectors": [
                    {
                        "anonymized_name": anonymized_data_connector_name,
                        "parent_class": "InferredAssetSqlDataConnector",
                    }
                ],
            },
            "success": True,
        }),
    ]

    # Confirm that logs do not contain any exceptions or invalid messages
    assert not usage_stats_exceptions_exist(messages=caplog.messages)
    assert not usage_stats_invalid_messages_exist(messages=caplog.messages)


def test_common_usage_stats_are_sent_no_mocking(
        caplog, in_memory_data_context_config_usage_stats_enabled,
        monkeypatch):
    """
    What does this test and why?
    Our usage stats events are tested elsewhere in several ways (sending example events, validating sample events, throughout other tests ensuring the right events are sent, anonymization, opt-out, etc). This specific test is to ensure that there are no errors with the machinery to send the events in the UsageStatisticsHandler by running code that emits events and checking for errors in the log. This test purposely does not mock any part of the usage stats system to ensure the full code path is run, and sends events to the QA endpoint. This test uses both methods decorated with usage_statistics_enabled_method and those that send events directly.
    """

    # caplog default is WARNING and above, we want to see DEBUG level messages for this test
    caplog.set_level(
        level=logging.DEBUG,
        logger="great_expectations.core.usage_statistics.usage_statistics",
    )

    # Make sure usage stats are enabled
    monkeypatch.delenv("GE_USAGE_STATS",
                       raising=False)  # Undo the project-wide test default
    assert os.getenv("GE_USAGE_STATS") is None

    context: BaseDataContext = BaseDataContext(
        in_memory_data_context_config_usage_stats_enabled)

    # Note: we lose the `data_context.__init__` event because it was emitted
    # before the worker is closed here
    context._usage_statistics_handler._close_worker()

    # Make sure usage stats are enabled
    assert not context._check_global_usage_statistics_opt_out()
    assert context.anonymous_usage_statistics.enabled
    assert context.anonymous_usage_statistics.data_context_id == DATA_CONTEXT_ID

    # Note: the nested module_name fields are commented out purposely to ensure
    # we are still able to send events without them
    datasource_yaml = f"""
    name: example_datasource
    class_name: Datasource
    module_name: great_expectations.datasource
    execution_engine:
      # module_name: great_expectations.execution_engine
      class_name: PandasExecutionEngine
    data_connectors:
        default_runtime_data_connector_name:
            class_name: RuntimeDataConnector
            # module_name: great_expectations.datasource.data_connector
            batch_identifiers:
                - default_identifier_name
    """

    # context.test_yaml_config() uses send_usage_message()
    context.test_yaml_config(yaml_config=datasource_yaml)
    expected_events: List[str] = ["data_context.test_yaml_config"]

    context.add_datasource(**yaml.safe_load(datasource_yaml))
    expected_events.append("data_context.add_datasource")

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    batch_request = RuntimeBatchRequest(
        datasource_name="example_datasource",
        data_connector_name="default_runtime_data_connector_name",
        # data_asset_name can be anything that identifies this data asset for you
        data_asset_name="my_data_asset",
        runtime_parameters={"batch_data": df},  # df is your dataframe
        batch_identifiers={"default_identifier_name": "default_identifier"},
    )

    context.create_expectation_suite(expectation_suite_name="test_suite",
                                     overwrite_existing=True)
    validator = context.get_validator(batch_request=batch_request,
                                      expectation_suite_name="test_suite")
    expected_events.append("data_context.get_batch_list")
    validator.expect_table_row_count_to_equal(value=2)
    validator.save_expectation_suite()
    expected_events.append("data_context.save_expectation_suite")

    checkpoint_yaml = """
    name: my_checkpoint
    config_version: 1
    class_name: SimpleCheckpoint
    validations:
      - batch_request:
            datasource_name: example_datasource
            data_connector_name: default_runtime_data_connector_name
            data_asset_name: my_data_asset
        expectation_suite_name: test_suite

    """
    context.test_yaml_config(yaml_config=checkpoint_yaml)
    expected_events.append("data_context.test_yaml_config")

    # Note: add_checkpoint is not instrumented as of 20211215
    context.add_checkpoint(**yaml.safe_load(checkpoint_yaml))

    context.run_checkpoint(
        checkpoint_name="my_checkpoint",
        batch_request={
            "runtime_parameters": {
                "batch_data": df
            },
            "batch_identifiers": {
                "default_identifier_name": "my_simple_df"
            },
        },
    )

    expected_events.append("data_context.get_batch_list")
    expected_events.append("data_asset.validate")
    expected_events.append("data_context.build_data_docs")
    expected_events.append("checkpoint.run")
    expected_events.append("data_context.run_checkpoint")

    assert not usage_stats_exceptions_exist(messages=caplog.messages)

    message_queue = context._usage_statistics_handler._message_queue.queue
    events = [event["event"] for event in message_queue]

    # Note: expected_events does not contain the `data_context.__init__` event
    assert events == expected_events

    assert not usage_stats_invalid_messages_exist(caplog.messages)