def run_validation_operator_usage_statistics(
    data_context: "DataContext",  # noqa: F821
    validation_operator_name: str,
    assets_to_validate: list,
    **kwargs,
) -> dict:
    """Build the usage-statistics payload for a validation operator run.

    Args:
        data_context: Object whose ``data_context_id`` (if present) selects the
            cached anonymizer and whose ``_usage_statistics_handler`` (if
            truthy) supplies the anonymizer used for the batches.
        validation_operator_name: Operator name to anonymize.
        assets_to_validate: Batches to anonymize one by one.
        **kwargs: Accepted but not used here.

    Returns:
        A dict that may contain ``anonymized_operator_name`` and
        ``anonymized_batches``; a key is left out when producing it failed.
    """
    context_id = getattr(data_context, "data_context_id", None)

    # Reuse one anonymizer per data context id from the shared mapping.
    name_anonymizer = _anonymizers.get(context_id, None)
    if name_anonymizer is None:
        name_anonymizer = Anonymizer(context_id)
        _anonymizers[context_id] = name_anonymizer

    payload = {}
    try:
        payload["anonymized_operator_name"] = name_anonymizer.anonymize(
            obj=validation_operator_name
        )
    except TypeError as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, run_validation_operator_usage_statistics: Unable to create validation_operator_name hash"
        )

    if not data_context._usage_statistics_handler:
        return payload

    # noinspection PyBroadException
    try:
        batch_anonymizer = data_context._usage_statistics_handler.anonymizer
        anonymized_batches = [
            batch_anonymizer.anonymize(obj=batch) for batch in assets_to_validate
        ]
        payload["anonymized_batches"] = anonymized_batches
    except Exception as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, run_validation_operator_usage_statistics: Unable to create anonymized_batches payload field"
        )
    return payload
def save_expectation_suite_usage_statistics(
    data_context,  # self
    expectation_suite,
    expectation_suite_name=None,
):
    """Build the usage-statistics payload for saving an expectation suite.

    When ``expectation_suite_name`` is None the name is read from
    ``expectation_suite.expectation_suite_name``. The returned dict contains
    ``anonymized_expectation_suite_name`` unless anonymization raised, in which
    case a debug message is logged and the key is omitted.
    """
    context_id = getattr(data_context, "data_context_id", None)

    # Reuse one anonymizer per data context id from the shared mapping.
    suite_anonymizer = _anonymizers.get(context_id, None)
    if suite_anonymizer is None:
        suite_anonymizer = Anonymizer(context_id)
        _anonymizers[context_id] = suite_anonymizer

    suite_name = (
        expectation_suite.expectation_suite_name
        if expectation_suite_name is None
        else expectation_suite_name
    )

    payload = {}
    try:
        payload["anonymized_expectation_suite_name"] = suite_anonymizer.anonymize(
            suite_name
        )
    except Exception:
        logger.debug(
            "save_expectation_suite_usage_statistics: Unable to create anonymized_expectation_suite_name payload field"
        )
    return payload
def edit_expectation_suite_usage_statistics(
    data_context: "DataContext",  # noqa: F821
    expectation_suite_name: str,
    interactive_mode: Optional[CLISuiteInteractiveFlagCombinations] = None,
) -> dict:
    """Build the usage-statistics payload for editing an expectation suite.

    The payload starts as a deep copy of ``interactive_mode.value`` (or an
    empty dict when ``interactive_mode`` is None) and gains
    ``anonymized_expectation_suite_name`` unless anonymization raised.
    """
    context_id = getattr(data_context, "data_context_id", None)

    # Reuse one anonymizer per data context id from the shared mapping.
    suite_anonymizer = _anonymizers.get(context_id, None)
    if suite_anonymizer is None:
        suite_anonymizer = Anonymizer(context_id)
        _anonymizers[context_id] = suite_anonymizer

    payload = {} if interactive_mode is None else copy.deepcopy(interactive_mode.value)

    # noinspection PyBroadException
    try:
        anonymized_name = suite_anonymizer.anonymize(obj=expectation_suite_name)
        payload["anonymized_expectation_suite_name"] = anonymized_name
    except Exception as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, edit_expectation_suite_usage_statistics: Unable to create anonymized_expectation_suite_name payload field"
        )
    return payload
def get_batch_list_usage_statistics(
    data_context: "DataContext", *args, **kwargs  # noqa: F821
) -> dict:
    """Build the usage-statistics payload for a ``get_batch_list`` call.

    Returns the result of anonymizing ``*args``/``**kwargs`` with the usage
    statistics handler's anonymizer, or an empty dict when the data context
    has no handler or anonymization raised.
    """
    context_id = getattr(data_context, "data_context_id", None)

    # Make sure the shared mapping holds an anonymizer for this context id,
    # even though the handler's own anonymizer is the one used below.
    if _anonymizers.get(context_id, None) is None:
        _anonymizers[context_id] = Anonymizer(context_id)

    payload = {}
    if not data_context._usage_statistics_handler:
        return payload

    # noinspection PyBroadException
    try:
        handler_anonymizer: Anonymizer = (  # noqa: F821
            data_context._usage_statistics_handler.anonymizer
        )
        payload = handler_anonymizer.anonymize(*args, **kwargs)
    except Exception as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, get_batch_list_usage_statistics: Unable to create anonymized_batch_request payload field"
        )
    return payload
def run_validation_operator_usage_statistics(
    data_context,  # self
    validation_operator_name,
    assets_to_validate,
    run_id=None,
    **kwargs,
):
    """Build the usage-statistics payload for a validation operator run.

    The operator name is anonymized with the per-context anonymizer and each
    batch with the handler's ``_batch_anonymizer``. A key is left out of the
    returned dict when producing it failed. ``run_id`` and extra keyword
    arguments are accepted but not used here.
    """
    context_id = getattr(data_context, "data_context_id", None)

    # Reuse one anonymizer per data context id from the shared mapping.
    name_anonymizer = _anonymizers.get(context_id, None)
    if name_anonymizer is None:
        name_anonymizer = Anonymizer(context_id)
        _anonymizers[context_id] = name_anonymizer

    payload = {}
    try:
        payload["anonymized_operator_name"] = name_anonymizer.anonymize(
            validation_operator_name
        )
    except TypeError:
        logger.debug(
            "run_validation_operator_usage_statistics: Unable to create validation_operator_name hash"
        )

    try:
        handler = data_context._usage_statistics_handler
        batch_anonymizer = handler._batch_anonymizer
        anonymized_batches = [
            batch_anonymizer.anonymize_batch_info(batch)
            for batch in assets_to_validate
        ]
        payload["anonymized_batches"] = anonymized_batches
    except Exception:
        logger.debug(
            "run_validation_operator_usage_statistics: Unable to create anonymized_batches payload field"
        )
    return payload
def save_expectation_suite_usage_statistics(
    data_context: "DataContext",  # noqa: F821
    expectation_suite: ExpectationSuite,
    expectation_suite_name: Optional[str] = None,
    **kwargs,
) -> dict:
    """Build the usage-statistics payload for saving an expectation suite.

    When ``expectation_suite_name`` is None it is taken from
    ``expectation_suite`` (attribute for an ``ExpectationSuite``, key lookup
    for a dict). The returned dict contains
    ``anonymized_expectation_suite_name`` unless anonymization raised.
    """
    context_id = getattr(data_context, "data_context_id", None)

    # Reuse one anonymizer per data context id from the shared mapping.
    suite_anonymizer = _anonymizers.get(context_id, None)
    if suite_anonymizer is None:
        suite_anonymizer = Anonymizer(context_id)
        _anonymizers[context_id] = suite_anonymizer

    if expectation_suite_name is None:
        if isinstance(expectation_suite, ExpectationSuite):
            expectation_suite_name = expectation_suite.expectation_suite_name
        elif isinstance(expectation_suite, dict):
            expectation_suite_name = expectation_suite.get("expectation_suite_name")

    payload = {}
    # noinspection PyBroadException
    try:
        anonymized_name = suite_anonymizer.anonymize(obj=expectation_suite_name)
        payload["anonymized_expectation_suite_name"] = anonymized_name
    except Exception as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, save_expectation_suite_usage_statistics: Unable to create anonymized_expectation_suite_name payload field"
        )
    return payload
def __init__(
    self,
    data_context: "DataContext",  # noqa: F821
    data_context_id: str,
    usage_statistics_url: str,
) -> None:
    """Store context details, start the worker thread and hook process exit.

    Args:
        data_context: Data context; its ``instance_id`` is recorded.
        data_context_id: Identifier stored and used as the anonymizer argument.
        usage_statistics_url: URL stored for the worker to use.
    """

    def _install_teardown(signal_number):
        # Returns the previously installed handler, or None when
        # signal.signal raises ValueError (we are not the main thread, so
        # we don't get to ask for signal handling).
        try:
            return signal.signal(signal_number, self._teardown)
        except ValueError:
            return None

    self._url = usage_statistics_url
    self._data_context = data_context
    self._data_context_id = data_context_id
    self._data_context_instance_id = data_context.instance_id
    self._ge_version = ge_version

    # The queue must exist before the worker starts.
    self._message_queue = Queue()
    self._worker = threading.Thread(target=self._requests_worker, daemon=True)
    self._worker.start()

    self._anonymizer = Anonymizer(data_context_id)

    self._sigterm_handler = _install_teardown(signal.SIGTERM)
    self._sigint_handler = _install_teardown(signal.SIGINT)

    atexit.register(self._close_worker)
def test_anonymize_object_info_with_missing_args_raises_error(
    anonymizer_with_consistent_salt: Anonymizer,
):
    """Passing no object, class or config must raise an AssertionError."""
    empty_arguments = {
        "anonymized_info_dict": {},
        "object_": None,
        "object_class": None,
        "object_config": None,
    }
    with pytest.raises(AssertionError) as raised:
        anonymizer_with_consistent_salt._anonymize_object_info(**empty_arguments)

    error_text = str(raised.value)
    assert "Must pass either" in error_text
def profiler_anonymizer() -> ProfilerAnonymizer:
    """Return a ProfilerAnonymizer built on a fixed salt."""
    # A fixed salt keeps the tests deterministic.
    fixed_salt: str = "00000000-0000-0000-0000-00000000a004"
    return ProfilerAnonymizer(
        salt=fixed_salt,
        aggregate_anonymizer=Anonymizer(salt=fixed_salt),
    )
def datasource_anonymizer() -> DatasourceAnonymizer:
    """Return a DatasourceAnonymizer built on a fixed salt."""
    # A fixed salt keeps the tests deterministic.
    fixed_salt: str = "00000000-0000-0000-0000-00000000a004"
    return DatasourceAnonymizer(
        salt=fixed_salt,
        aggregate_anonymizer=Anonymizer(salt=fixed_salt),
    )
def add_datasource_usage_statistics(
    data_context: "DataContext", name: str, **kwargs  # noqa: F821
) -> dict:
    """Build the usage-statistics payload for adding a datasource.

    Returns an empty dict when the data context has no usage statistics
    handler or when anonymization raised; otherwise returns whatever
    ``DatasourceAnonymizer._anonymize_datasource_info(name, kwargs)`` yields.
    """
    if not data_context._usage_statistics_handler:
        return {}

    context_id = getattr(data_context, "data_context_id", None)

    from great_expectations.core.usage_statistics.anonymizers.datasource_anonymizer import (
        DatasourceAnonymizer,
    )

    info_anonymizer = DatasourceAnonymizer(
        salt=context_id,
        aggregate_anonymizer=Anonymizer(salt=context_id),
    )

    # noinspection PyBroadException
    try:
        return info_anonymizer._anonymize_datasource_info(name, kwargs)
    except Exception as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, add_datasource_usage_statistics: Unable to create add_datasource_usage_statistics payload field"
        )
    return {}
def _handle_expectation_suite_usage_statistics(
    data_context: "DataContext",  # noqa: F821
    event_arguments_payload_handler_name: str,
    expectation_suite: Optional[ExpectationSuite] = None,
    expectation_suite_name: Optional[str] = None,
    interactive_mode: Optional[CLISuiteInteractiveFlagCombinations] = None,
    **kwargs,
) -> dict:
    """
    Anonymize "expectation_suite_name" for events that carry this property.

    The payload starts as a deep copy of ``interactive_mode.value`` (empty dict
    when ``interactive_mode`` is None). A missing name is taken from
    ``expectation_suite`` (attribute for an ``ExpectationSuite``, key lookup
    for a dict). ``event_arguments_payload_handler_name`` only appears in the
    debug message logged when anonymization raises.
    """
    context_id: Optional[str] = getattr(data_context, "data_context_id", None)

    # Reuse one anonymizer per data context id from the shared mapping.
    suite_anonymizer: Anonymizer = _anonymizers.get(context_id, None)
    if suite_anonymizer is None:
        suite_anonymizer = Anonymizer(context_id)
        _anonymizers[context_id] = suite_anonymizer

    payload: dict = (
        {} if interactive_mode is None else copy.deepcopy(interactive_mode.value)
    )

    if expectation_suite_name is None:
        if isinstance(expectation_suite, ExpectationSuite):
            expectation_suite_name = expectation_suite.expectation_suite_name
        elif isinstance(expectation_suite, dict):
            expectation_suite_name = expectation_suite.get("expectation_suite_name")

    # noinspection PyBroadException
    try:
        anonymized_name = suite_anonymizer.anonymize(obj=expectation_suite_name)
        payload["anonymized_expectation_suite_name"] = anonymized_name
    except Exception as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, {event_arguments_payload_handler_name}: Unable to create anonymized_expectation_suite_name payload field."
        )
    return payload
def test_anonymize_object_info_with_core_ge_object(
    anonymizer_with_consistent_salt: Anonymizer,
):
    """An ExpectationSuite instance is reported by its class name only."""
    suite = ExpectationSuite(expectation_suite_name="my_suite")
    expected: dict = {"parent_class": "ExpectationSuite"}

    actual: dict = anonymizer_with_consistent_salt._anonymize_object_info(
        anonymized_info_dict={},
        object_=suite,
    )

    assert actual == expected
def test_anonymize_object_info_with_custom_user_defined_object_with_no_parent(
    anonymizer_with_consistent_salt: Anonymizer,
):
    """A BaseTestClass instance gets a hashed class and an unrecognized parent."""
    expected: dict = {
        "anonymized_class": "760bfe8b56356bcd56012edfd512019b",
        "parent_class": "__not_recognized__",
    }

    actual: dict = anonymizer_with_consistent_salt._anonymize_object_info(
        anonymized_info_dict={},
        object_=BaseTestClass(),
    )

    assert actual == expected
def get_profiler_run_usage_statistics(
    profiler: "RuleBasedProfiler",  # noqa: F821
    variables: Optional[dict] = None,
    rules: Optional[dict] = None,
    *args,
    **kwargs,
) -> dict:
    """Build the usage-statistics payload for a profiler run.

    Returns the anonymized form of the profiler configuration resolved from
    ``profiler``, ``variables`` and ``rules``, or an empty dict when the
    profiler has no usage statistics handler or anonymization raised.
    """
    handler: Optional[UsageStatisticsHandler] = profiler._usage_statistics_handler

    context_id: Optional[str] = handler._data_context_id if handler else None

    # Make sure the shared mapping holds an anonymizer for this context id,
    # even though the handler's own anonymizer is the one used below.
    if _anonymizers.get(context_id, None) is None:
        _anonymizers[context_id] = Anonymizer(context_id)

    payload: dict = {}
    if not handler:
        return payload

    # noinspection PyBroadException
    try:
        handler_anonymizer = handler.anonymizer
        resolved_runtime_config: "RuleBasedProfilerConfig" = (  # noqa: F821
            RuleBasedProfilerConfig.resolve_config_using_acceptable_arguments(
                profiler=profiler,
                variables=variables,
                rules=rules,
            )
        )
        payload = handler_anonymizer.anonymize(obj=resolved_runtime_config)
    except Exception as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, get_profiler_run_usage_statistics: Unable to create anonymized_profiler_run payload field"
        )
    return payload
def get_checkpoint_run_usage_statistics(
    checkpoint: "Checkpoint",  # noqa: F821
    *args,
    **kwargs,
) -> dict:
    """Build the usage-statistics payload for a checkpoint run.

    Returns the result of anonymizing the checkpoint together with its
    resolved runtime keyword arguments, or an empty dict when the checkpoint
    has no usage statistics handler or anonymization raised.
    """
    handler: Optional[UsageStatisticsHandler] = checkpoint._usage_statistics_handler

    context_id: Optional[str]
    try:
        context_id = checkpoint.data_context.data_context_id
    except AttributeError:
        context_id = None

    # Make sure the shared mapping holds an anonymizer for this context id,
    # even though the handler's own anonymizer is the one used below.
    if _anonymizers.get(context_id, None) is None:
        _anonymizers[context_id] = Anonymizer(context_id)

    payload: dict = {}
    if not handler:
        return payload

    # noinspection PyBroadException
    try:
        handler_anonymizer = handler.anonymizer  # noqa: F821
        resolved_runtime_kwargs: dict = (
            CheckpointConfig.resolve_config_using_acceptable_arguments(
                checkpoint, **kwargs
            )
        )
        payload = handler_anonymizer.anonymize(checkpoint, **resolved_runtime_kwargs)
    except Exception as exc:
        logger.debug(
            f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {exc} type: {type(exc)}, get_checkpoint_run_usage_statistics: Unable to create anonymized_checkpoint_run payload field"
        )
    return payload
def test_anonymize_object_info_with_custom_user_defined_object_with_single_parent(
    anonymizer_with_consistent_salt: Anonymizer,
):
    """A MyCustomExpectationSuite instance reports ExpectationSuite as parent."""
    custom_suite = MyCustomExpectationSuite(expectation_suite_name="my_suite")
    expected: dict = {
        "anonymized_class": "54ab2657b855f8075e5d1f28e81ca7cd",
        "parent_class": "ExpectationSuite",
    }

    actual: dict = anonymizer_with_consistent_salt._anonymize_object_info(
        anonymized_info_dict={},
        object_=custom_suite,
    )

    assert actual == expected
def test_anonymize_object_info_with_custom_user_defined_object_with_multiple_parents(
    anonymizer_with_consistent_salt: Anonymizer,
):
    """A MyCustomMultipleInheritanceClass instance reports both parents."""
    custom_object = MyCustomMultipleInheritanceClass(expectation_suite_name="my_name")
    expected: dict = {
        "anonymized_class": "1e1716661acfa73d538a191ed13efcfd",
        "parent_class": "ExpectationSuite,BatchRequest",
    }

    actual: dict = anonymizer_with_consistent_salt._anonymize_object_info(
        anonymized_info_dict={},
        object_=custom_object,
    )

    assert actual == expected
def test_datasource_anonymizer(datasource_anonymizer: DatasourceAnonymizer):
    """Check datasource anonymization for a core class and a custom class."""
    datasource_name = "test_datasource"

    def custom_config() -> dict:
        # Build a fresh dict for every call so no call sees another's config.
        return {
            "name": "test_datasource",
            "class_name": "CustomDatasource",
            "module_name": "tests.datasource.test_datasource_anonymizer",
        }

    core_result = datasource_anonymizer._anonymize_datasource_info(
        name=datasource_name,
        config={
            "name": "test_datasource",
            "class_name": "PandasDatasource",
            "module_name": "great_expectations.datasource",
        },
    )
    assert core_result == {
        "anonymized_name": "04bf89e1fb7495b0904bbd5ae478fbe0",
        "parent_class": "PandasDatasource",
    }

    custom_result = datasource_anonymizer._anonymize_datasource_info(
        name=datasource_name, config=custom_config()
    )

    other_anonymizer = DatasourceAnonymizer(aggregate_anonymizer=Anonymizer())
    other_result = other_anonymizer._anonymize_datasource_info(
        name=datasource_name, config=custom_config()
    )

    assert custom_result["parent_class"] == "PandasDatasource"
    assert other_result["parent_class"] == "PandasDatasource"

    print(other_result)
    assert len(other_result["anonymized_class"]) == 32
    assert custom_result["anonymized_class"] != other_result["anonymized_class"]

    # Same anonymizer *does* produce the same result
    repeated_result = datasource_anonymizer._anonymize_datasource_info(
        name=datasource_name, config=custom_config()
    )
    assert repeated_result["anonymized_class"] == custom_result["anonymized_class"]
def test_anonymizer_consistent_salt():
    """Two anonymizers built with the same salt hash a name identically."""
    shared_salt = str(uuid.uuid4())
    test_name = "i_am_a_name"

    first_hash = Anonymizer(shared_salt).anonymize(test_name)
    second_hash = Anonymizer(shared_salt).anonymize(test_name)

    assert first_hash == second_hash
    for hashed in (first_hash, second_hash):
        assert len(hashed) == 32
def anonymizer_with_consistent_salt() -> Anonymizer:
    """Return an Anonymizer built on a fixed salt."""
    fixed_salt = "00000000-0000-0000-0000-00000000a004"
    return Anonymizer(salt=fixed_salt)
def test_anonymizer_no_salt():
    """Anonymizers with no salt, or with different salts, hash differently."""
    test_name = "i_am_a_name"

    def assert_distinct_hashes(first: Anonymizer, second: Anonymizer) -> None:
        first_hash = first.anonymize(test_name)
        second_hash = second.anonymize(test_name)
        assert first_hash != second_hash
        assert len(first_hash) == 32
        assert len(second_hash) == 32

    # No salt will generate a random one.
    assert_distinct_hashes(Anonymizer(), Anonymizer())

    # Provided different salt will produce different results
    assert_distinct_hashes(Anonymizer("hello, friend"), Anonymizer("hello, enemy"))
def test_anonymizer__is_parent_class_recognized():
    """
    What does this test and why?
    Anonymizer._is_parent_class_recognized() should return the name of the
    parent class when the object is, or subclasses, one of classes_to_check,
    and None otherwise. That must hold whichever parameter carries the object
    definition (object_class, object_, object_config). When several entries of
    classes_to_check match, the first one wins.
    """
    anonymizer = Anonymizer()
    recognize = anonymizer._is_parent_class_recognized

    def config_for(class_name: str) -> dict:
        return {
            "class_name": class_name,
            "module_name": "tests.core.usage_statistics.test_anonymizer",
        }

    # keyword -> (custom subclass, TestClass itself, unrelated class), each
    # expressed in the form that keyword expects.
    scenarios = {
        "object_class": (MyCustomTestClass, TestClass, SomeOtherClass),
        "object_": (MyCustomTestClass(), TestClass(), SomeOtherClass()),
        "object_config": (
            config_for("MyCustomTestClass"),
            config_for("TestClass"),
            config_for("SomeOtherClass"),
        ),
    }

    for keyword, (custom, plain, unrelated) in scenarios.items():
        # classes_to_check in order of inheritance hierarchy
        assert (
            recognize(classes_to_check=[TestClass, BaseTestClass], **{keyword: custom})
            == "TestClass"
        )
        assert (
            recognize(
                classes_to_check=[TestClass, BaseTestClass], **{keyword: unrelated}
            )
            is None
        )
        assert (
            recognize(classes_to_check=[BaseTestClass], **{keyword: plain})
            == "BaseTestClass"
        )
class UsageStatisticsHandler:
    """Queue usage-statistics messages and post them from a daemon thread.

    Messages passed to ``emit`` are wrapped in an envelope, validated against
    ``anonymized_usage_statistics_record_schema`` and put on an internal queue.
    A worker thread takes them off the queue and posts them as JSON to the
    configured URL. Errors in this path are logged at debug level.
    """

    def __init__(
        self,
        data_context: "DataContext",  # noqa: F821
        data_context_id: str,
        usage_statistics_url: str,
    ) -> None:
        """Store context details, start the worker thread and hook process exit.

        Args:
            data_context: Data context; its ``instance_id`` is recorded.
            data_context_id: Identifier added to every envelope and passed to
                the ``Anonymizer``.
            usage_statistics_url: URL the worker posts messages to.
        """
        self._url = usage_statistics_url

        self._data_context_id = data_context_id
        self._data_context_instance_id = data_context.instance_id
        self._data_context = data_context
        self._ge_version = ge_version

        # The queue must exist before the worker starts.
        self._message_queue = Queue()
        self._worker = threading.Thread(target=self._requests_worker, daemon=True)
        self._worker.start()

        self._anonymizer = Anonymizer(data_context_id)

        # signal.signal returns the previously installed handler; keep it so
        # _teardown can chain to it.
        try:
            self._sigterm_handler = signal.signal(signal.SIGTERM, self._teardown)
        except ValueError:
            # if we are not the main thread, we don't get to ask for signal handling.
            self._sigterm_handler = None
        try:
            self._sigint_handler = signal.signal(signal.SIGINT, self._teardown)
        except ValueError:
            # if we are not the main thread, we don't get to ask for signal handling.
            self._sigint_handler = None

        atexit.register(self._close_worker)

    @property
    def anonymizer(self) -> Anonymizer:
        """The ``Anonymizer`` created for this handler's data context id."""
        return self._anonymizer

    @staticmethod
    def _invoke_previous_handler(handler, signum: int, frame) -> None:
        """Call a previously installed signal handler if it is callable.

        ``signal.signal`` may return ``signal.SIG_IGN``, ``signal.SIG_DFL`` or
        ``None`` as the previous handler. None of these can be called, and
        ``SIG_IGN`` is truthy, so a plain truthiness test is not enough.
        """
        if callable(handler):
            handler(signum, frame)

    def _teardown(self, signum: int, frame: Optional[FrameType]) -> None:
        """Signal handler: stop the worker, then chain to the previous handler."""
        self._close_worker()
        if signum == signal.SIGTERM:
            self._invoke_previous_handler(self._sigterm_handler, signum, frame)
        if signum == signal.SIGINT:
            self._invoke_previous_handler(self._sigint_handler, signum, frame)

    def _close_worker(self) -> None:
        """Ask the worker to stop and wait for it to finish."""
        self._message_queue.put(STOP_SIGNAL)
        self._worker.join()

    def _requests_worker(self) -> None:
        """Worker loop: post queued messages until ``STOP_SIGNAL`` is received."""
        session = requests.Session()
        while True:
            message = self._message_queue.get()
            if message == STOP_SIGNAL:
                self._message_queue.task_done()
                return
            try:
                res = session.post(self._url, json=message, timeout=2)
                logger.debug("Posted usage stats: message status %s", res.status_code)
                if res.status_code != 201:
                    # The message needs a %s placeholder; without one, logging
                    # reports a formatting error instead of the message.
                    logger.debug(
                        "Server rejected message: %s", json.dumps(message, indent=2)
                    )
            except requests.exceptions.Timeout:
                logger.debug("Timeout while sending usage stats message.")
            except Exception as e:
                logger.debug("Unexpected error posting message: %s", e)
            finally:
                self._message_queue.task_done()

    def build_init_payload(self) -> dict:
        """Adds information that may be available only after full data context construction, but is useful to
        calculate only one time (for example, anonymization)."""
        expectation_suites: List[ExpectationSuite] = [
            self._data_context.get_expectation_suite(expectation_suite_name)
            for expectation_suite_name in self._data_context.list_expectation_suite_names()
        ]
        init_payload = {
            "platform.system": platform.system(),
            "platform.release": platform.release(),
            "version_info": str(sys.version_info),
            "datasources": self._data_context.project_config_with_variables_substituted.datasources,
            "stores": self._data_context.stores,
            "validation_operators": self._data_context.validation_operators,
            "data_docs_sites": self._data_context.project_config_with_variables_substituted.data_docs_sites,
            "expectation_suites": expectation_suites,
            "dependencies": self._get_serialized_dependencies(),
        }
        anonymized_init_payload = self._anonymizer.anonymize_init_payload(
            init_payload=init_payload
        )
        return anonymized_init_payload

    @staticmethod
    def _get_serialized_dependencies() -> List[dict]:
        """Get the serialized dependencies from the GEExecutionEnvironment."""
        ge_execution_environment: GEExecutionEnvironment = GEExecutionEnvironment()
        dependencies: List[PackageInfo] = ge_execution_environment.dependencies

        schema: PackageInfoSchema = PackageInfoSchema()
        serialized_dependencies: List[dict] = [
            schema.dump(package_info) for package_info in dependencies
        ]
        return serialized_dependencies

    def build_envelope(self, message: dict) -> dict:
        """Add version, context identifiers and a UTC timestamp to ``message``.

        ``message`` is modified in place and also returned. It must contain an
        ``"event"`` key. If this handler has an attribute named after the event
        (dots replaced by underscores, suffixed ``_duration``), its value is
        added as ``"event_duration"``.
        """
        message["version"] = "1.0.0"
        message["ge_version"] = self._ge_version

        message["data_context_id"] = self._data_context_id
        message["data_context_instance_id"] = self._data_context_instance_id

        # Millisecond precision: drop the last three microsecond digits.
        message["event_time"] = (
            datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y-%m-%dT%H:%M:%S.%f"
            )[:-3]
            + "Z"
        )

        event_duration_property_name: str = f'{message["event"]}.duration'.replace(
            ".", "_"
        )
        if hasattr(self, event_duration_property_name):
            delta_t: int = getattr(self, event_duration_property_name)
            message["event_duration"] = delta_t

        return message

    @staticmethod
    def validate_message(message: dict, schema: dict) -> bool:
        """Return True if ``message`` validates against ``schema``, else False.

        A ``jsonschema.ValidationError`` is logged at debug level and reported
        as False; other exceptions propagate.
        """
        try:
            jsonschema.validate(message, schema=schema)
            return True
        except jsonschema.ValidationError as e:
            logger.debug(
                f"{UsageStatsExceptionPrefix.INVALID_MESSAGE.value} invalid message: "
                + str(e)
            )
            return False

    def send_usage_message(
        self,
        event: str,
        event_payload: Optional[dict] = None,
        success: Optional[bool] = None,
    ) -> None:
        """send a usage statistics message.

        Builds the message dict from the arguments and hands it to ``emit``.
        Never raises: any exception is logged at debug level and dropped.
        """
        # noinspection PyBroadException
        try:
            message: dict = {
                "event": event,
                "event_payload": event_payload or {},
                "success": success,
            }
            self.emit(message)
        except Exception as e:
            # Usage statistics are best-effort; log the failure and carry on.
            logger.debug(
                f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {e} type: {type(e)}"
            )

    def emit(self, message: dict) -> None:
        """
        Emit a message.

        For the ``data_context.__init__`` event the payload is replaced by the
        anonymized init payload. The message is then wrapped in an envelope
        and queued only if it validates against the record schema.
        """
        try:
            if message["event"] == "data_context.__init__":
                message["event_payload"] = self.build_init_payload()
            message = self.build_envelope(message=message)
            if not self.validate_message(
                message, schema=anonymized_usage_statistics_record_schema
            ):
                return
            self._message_queue.put(message)
        # noinspection PyBroadException
        except Exception as e:
            # We *always* tolerate *any* error in usage statistics
            log_message: str = (
                f"{UsageStatsExceptionPrefix.EMIT_EXCEPTION.value}: {e} type: {type(e)}"
            )
            logger.debug(log_message)