def remove_all_whitelisted_outliers(self):
    """Remove the outlier information from documents whose outliers are all whitelisted.

    Every document returned by ``self.scan`` for the ``tags: outlier`` match
    clause is inspected. One ``Outlier`` is built per entry of the document's
    ``outliers`` arrays (``type``, ``reason``, ``summary``) and checked against
    the whitelist, with the whole document passed as additional values.

    A document is only rewritten when *all* of its outliers are whitelisted;
    it is then passed through ``remove_outliers_from_document`` and replaced
    through a delete followed by a create on ``self.conn``.

    :return: the number of documents that were rewritten
    """
    # TODO: fix this - the import lives here to avoid issues with singletons & circular requirements
    from helpers.outlier import Outlier

    tagged_as_outlier = {"must": [{"match": {"tags": "outlier"}}]}
    rewritten_docs = 0

    for doc in self.scan(bool_clause=tagged_as_outlier):
        outlier_info = doc["_source"]["outliers"]
        total_outliers = int(outlier_info["total_outliers"])

        # Build one outlier object per stored array position and count the whitelisted ones
        whitelisted_count = 0
        for position in range(total_outliers):
            candidate = Outlier(
                type=outlier_info["type"][position],
                reason=outlier_info["reason"][position],
                summary=outlier_info["summary"][position],
            )
            if candidate.is_whitelisted(additional_dict_values_to_check=doc):
                whitelisted_count += 1

        # Outliers are stored as array elements, so the whitelisted ones cannot be removed
        # individually: the remaining elements potentially contain observations that would
        # have to be removed, too. Unless ALL outliers are whitelisted, leave the document alone.
        if whitelisted_count != total_outliers:
            continue

        rewritten_docs += 1
        doc = remove_outliers_from_document(doc)

        location = {"index": doc["_index"], "doc_type": doc["_type"], "id": doc["_id"]}
        self.conn.delete(refresh=True, **location)
        self.conn.create(body=doc["_source"], refresh=True, **location)

    return rewritten_docs
def test_test_osquery_ticket_1933_single_regexp_should_not_match(self):
    """The dummy outlier must not be whitelisted under whitelist_tests_09_ticket_1933.conf."""
    document = copy.deepcopy(doc_with_outlier_test_file)
    outlier_under_test = Outlier(
        outlier_type="dummy type",
        outlier_reason="dummy reason",
        outlier_summary="dummy summary",
        doc=document,
    )

    # The configuration is switched only after the outlier has been created
    config_path = "/app/tests/unit_tests/files/whitelist_tests_09_ticket_1933.conf"
    self.test_settings.change_configuration_path(config_path)

    whitelisted = outlier_under_test.is_whitelisted()
    self.assertFalse(whitelisted)
def test_single_regex_not_to_match_in_doc_with_outlier(self):
    """The dummy outlier must not be whitelisted under whitelist_tests_07_with_general.conf."""
    # Here the configuration is switched before the outlier is created
    config_path = "/app/tests/unit_tests/files/whitelist_tests_07_with_general.conf"
    self.test_settings.change_configuration_path(config_path)

    document = copy.deepcopy(doc_with_outlier_test_file)
    outlier_under_test = Outlier(
        outlier_type="dummy type",
        outlier_reason="dummy reason",
        outlier_summary="dummy summary",
        doc=document,
    )

    self.assertFalse(outlier_under_test.is_whitelisted())
def test_whitelist_config_wipe_all_bug(self):
    """The dummy outlier must not be whitelisted under whitelist_tests_10_issue_462.conf."""
    document = copy.deepcopy(doc_with_outlier_test_file)
    outlier_under_test = Outlier(
        outlier_type="dummy type",
        outlier_reason="dummy reason",
        outlier_summary="dummy summary",
        doc=document,
    )

    # The configuration is switched only after the outlier has been created
    config_path = "/app/tests/unit_tests/files/whitelist_tests_10_issue_462.conf"
    self.test_settings.change_configuration_path(config_path)

    whitelisted = outlier_under_test.is_whitelisted()
    self.assertFalse(whitelisted)
def test_whitelist_config_file_multi_item_match(self):
    """The dummy outlier must be whitelisted under whitelist_tests_01_with_general.conf."""
    document = copy.deepcopy(doc_with_outlier_test_file)
    outlier_under_test = Outlier(
        outlier_type="dummy type",
        outlier_reason="dummy reason",
        outlier_summary="dummy summary",
        doc=document,
    )

    # The configuration is switched only after the outlier has been created
    config_path = "/app/tests/unit_tests/files/whitelist_tests_01_with_general.conf"
    self.test_settings.change_configuration_path(config_path)

    whitelisted = outlier_under_test.is_whitelisted()
    self.assertTrue(whitelisted)
def test_whitelist_config_file_multi_item_mismatch_with_three_fields_and_whitespace(self):
    """The dummy outlier must not be whitelisted under whitelist_tests_05.conf."""
    document = copy.deepcopy(doc_with_outlier_test_file)
    outlier_under_test = Outlier(
        outlier_type="dummy type",
        outlier_reason="dummy reason",
        outlier_summary="dummy summary",
        doc=document,
    )

    # The configuration is switched only after the outlier has been created
    config_path = "/app/tests/unit_tests/files/whitelist_tests_05.conf"
    self.test_settings.change_configuration_path(config_path)

    whitelisted = outlier_under_test.is_whitelisted()
    self.assertFalse(whitelisted)
def test_single_literal_not_to_match_in_doc_with_outlier(self):
    """The dummy outlier must not be whitelisted under whitelist_tests_03.conf.

    The outlier is created without a document; the copied test document is
    handed to ``is_whitelisted`` as additional values to check instead.
    """
    document = copy.deepcopy(doc_with_outlier_test_file)
    outlier_under_test = Outlier(
        outlier_type="dummy type",
        outlier_reason="dummy reason",
        outlier_summary="dummy summary",
    )

    config_path = "/app/tests/unit_tests/files/whitelist_tests_03.conf"
    settings.process_configuration_files(config_path)

    whitelisted = outlier_under_test.is_whitelisted(additional_dict_values_to_check=document)
    self.assertFalse(whitelisted)
def test_whitelist_config_file_multi_item_mismatch_with_three_fields_and_whitespace(self):
    """The dummy outlier must not be whitelisted under whitelist_tests_05.conf.

    The outlier is created without a document; the copied test document is
    handed to ``is_whitelisted`` as additional values to check instead.
    """
    document = copy.deepcopy(doc_with_outlier_test_file)
    outlier_under_test = Outlier(
        outlier_type="dummy type",
        outlier_reason="dummy reason",
        outlier_summary="dummy summary",
    )

    config_path = "/app/tests/unit_tests/files/whitelist_tests_05.conf"
    settings.process_configuration_files(config_path)

    whitelisted = outlier_under_test.is_whitelisted(additional_dict_values_to_check=document)
    self.assertFalse(whitelisted)
def remove_all_whitelisted_outliers(self):
    """Remove the outlier information from documents whose outliers are all whitelisted.

    The index pattern is read from the ``general`` / ``es_index_pattern``
    setting. The number of documents tagged ``outlier`` is counted and logged,
    after which every such document returned by ``self.scan`` is inspected.
    One ``Outlier`` is built per entry of the document's ``outliers`` arrays
    (``type``, ``reason``, ``summary``) and checked against the whitelist,
    with the whole document passed as additional values.

    A document is only rewritten when *all* of its outliers are whitelisted;
    it is then passed through ``remove_outliers_from_document`` and replaced
    through a delete followed by a create on ``self.conn``.

    :return: the number of documents that were rewritten
    """
    # TODO: fix this - the import lives here to avoid issues with singletons & circular requirements
    from helpers.outlier import Outlier

    outliers_filter_query = {"filter": [{"term": {"tags": "outlier"}}]}
    rewritten_docs = 0

    idx = self.settings.config.get("general", "es_index_pattern")
    total_nr_outliers = self.count_documents(index=idx, bool_clause=outliers_filter_query)
    self.logging.logger.info(
        "going to analyze %s outliers and remove all whitelisted items",
        format(total_nr_outliers, ","),
    )

    for doc in self.scan(index=idx, bool_clause=outliers_filter_query):
        outlier_info = doc["_source"]["outliers"]
        total_outliers = int(outlier_info["total_outliers"])

        # Build one outlier object per stored array position and count the whitelisted ones
        whitelisted_count = 0
        for position in range(total_outliers):
            candidate = Outlier(
                outlier_type=outlier_info["type"][position],
                outlier_reason=outlier_info["reason"][position],
                outlier_summary=outlier_info["summary"][position],
            )
            if candidate.is_whitelisted(additional_dict_values_to_check=doc):
                whitelisted_count += 1

        # Outliers are stored as array elements, so the whitelisted ones cannot be removed
        # individually: the remaining elements potentially contain observations that would
        # have to be removed, too. Unless ALL outliers are whitelisted, leave the document alone.
        if whitelisted_count != total_outliers:
            continue

        rewritten_docs += 1
        doc = remove_outliers_from_document(doc)

        location = {"index": doc["_index"], "doc_type": doc["_type"], "id": doc["_id"]}
        self.conn.delete(refresh=True, **location)
        self.conn.create(body=doc["_source"], refresh=True, **location)

    return rewritten_docs
def remove_all_whitelisted_outliers(self, dict_with_analyzer):
    """
    Remove the outlier information from all stored documents whose outliers are all whitelisted.
    According to the original note, this method is normally only called by housekeeping.

    For each outlier of a document, the analyzer is looked up in ``dict_with_analyzer`` under
    the key ``<model_type>_<model_name>``; its ``model_whitelist_literals`` and
    ``model_whitelist_regexps`` are passed to the whitelist check as extra values. When no
    analyzer is found for an outlier, the remaining outliers of that document are not checked
    and the document is left untouched.

    :param dict_with_analyzer: mapping from ``<model_type>_<model_name>`` to an analyzer object
    :return: the number of documents for which a remove-outlier bulk action was queued
    """
    outliers_filter_query = {"filter": [{"term": {"tags": "outlier"}}]}
    whitelisted_docs = 0

    idx = self.settings.config.get("general", "es_index_pattern")
    total_nr_outliers, documents = self.count_and_scan_documents(
        index=idx, bool_clause=outliers_filter_query)

    # NOTE(review): the source this was reconstructed from had lost its indentation; the final
    # flush is assumed to belong to the "there are outliers" branch - confirm.
    if not total_nr_outliers > 0:
        return whitelisted_docs

    self.logging.logger.info(
        "going to analyze %s outliers and remove all whitelisted items",
        format(total_nr_outliers, ","),
    )
    start_time = dt.datetime.today().timestamp()

    for processed, doc in enumerate(documents, start=1):
        outlier_info = doc["_source"]["outliers"]
        total_outliers_in_doc = int(outlier_info["total_outliers"])

        # Build one outlier object per stored array position and count the whitelisted ones
        whitelisted_in_doc = 0
        for position in range(total_outliers_in_doc):
            outlier_type = outlier_info["type"][position]
            outlier_reason = outlier_info["reason"][position]
            outlier_summary = outlier_info["summary"][position]

            # The analyzer linked to this outlier is keyed by "<model_type>_<model_name>"
            model_name = outlier_info["model_name"][position]
            model_type = outlier_info["model_type"][position]
            config_section_name = model_type + "_" + model_name

            if config_section_name not in dict_with_analyzer:
                not_found_message = "Outlier '" + config_section_name + "' " + \
                    " was not found in configuration, could not check whitelist"
                self.logging.logger.debug(not_found_message)
                # One outlier that cannot be whitelisted is enough to keep all outliers
                break

            analyzer = dict_with_analyzer[config_section_name]
            candidate = Outlier(
                outlier_type=outlier_type,
                outlier_reason=outlier_reason,
                outlier_summary=outlier_summary,
                doc=doc,
            )
            if candidate.is_whitelisted(
                    extra_literals_whitelist_value=analyzer.model_whitelist_literals,
                    extra_regexps_whitelist_value=analyzer.model_whitelist_regexps):
                whitelisted_in_doc += 1

        # Outliers are stored as array elements, so the whitelisted ones cannot be removed
        # individually: the remaining elements potentially contain observations that would
        # have to be removed, too. Unless ALL outliers are whitelisted, leave the document alone.
        if whitelisted_in_doc == total_outliers_in_doc:
            whitelisted_docs += 1
            doc = remove_outliers_from_document(doc)
            self.add_remove_outlier_bulk_action(doc)

        # The ticker of the logger singleton is deliberately not used: this is called from the
        # housekeeping thread, and sharing one ticker between threads would produce strange
        # progress logging. Part of the singleton's functionality is duplicated here instead.
        verbosity = self.logging.verbosity
        log_now = (
            verbosity >= 5
            or processed % max(1, int(math.pow(10, (6 - verbosity)))) == 0
            or processed == total_nr_outliers
        )
        if not log_now:
            continue

        # Clamp the elapsed time to at least one second to avoid a division by zero
        elapsed = max(float(1), float(dt.datetime.today().timestamp() - start_time))
        ticks_per_second = "{:,}".format(round(float(processed) / elapsed))
        percent_done = '{:.2f}'.format(
            round(float(processed) / float(total_nr_outliers) * 100, 2))

        self.logging.logger.info("".join((
            "whitelisting historical outliers ", " [", ticks_per_second, " eps.", " - ",
            percent_done, "% done", " - ", "{:,}".format(whitelisted_docs),
            " outliers whitelisted]",
        )))

    self.flush_bulk_actions()
    return whitelisted_docs
def remove_all_whitelisted_outliers(self):
    """Remove the outlier information from documents whose outliers are all whitelisted.

    The index pattern is read from the ``general`` / ``es_index_pattern``
    setting. The number of documents tagged ``outlier`` is counted and logged;
    when it is greater than zero, every such document returned by ``self.scan``
    is inspected. One ``Outlier`` is built per entry of the document's
    ``outliers`` arrays (``type``, ``reason``, ``summary``) together with the
    document itself, and checked against the whitelist.

    A document is only updated when *all* of its outliers are whitelisted; it
    is then passed through ``remove_outliers_from_document`` and handed to
    ``self._update_es``. Progress is logged while the documents are processed.

    :return: the number of documents that were updated
    """
    outliers_filter_query = {"filter": [{"term": {"tags": "outlier"}}]}
    whitelisted_docs = 0

    idx = self.settings.config.get("general", "es_index_pattern")
    total_nr_outliers = self.count_documents(index=idx, bool_clause=outliers_filter_query)
    self.logging.logger.info(
        "going to analyze %s outliers and remove all whitelisted items",
        format(total_nr_outliers, ","),
    )

    if not total_nr_outliers > 0:
        return whitelisted_docs

    start_time = dt.datetime.today().timestamp()

    for processed, doc in enumerate(self.scan(index=idx, bool_clause=outliers_filter_query), start=1):
        outlier_info = doc["_source"]["outliers"]
        total_outliers_in_doc = int(outlier_info["total_outliers"])

        # Build one outlier object per stored array position and count the whitelisted ones
        whitelisted_in_doc = 0
        for position in range(total_outliers_in_doc):
            candidate = Outlier(
                outlier_type=outlier_info["type"][position],
                outlier_reason=outlier_info["reason"][position],
                outlier_summary=outlier_info["summary"][position],
                doc=doc,
            )
            if candidate.is_whitelisted():
                whitelisted_in_doc += 1

        # Outliers are stored as array elements, so the whitelisted ones cannot be removed
        # individually: the remaining elements potentially contain observations that would
        # have to be removed, too. Unless ALL outliers are whitelisted, leave the document alone.
        if whitelisted_in_doc == total_outliers_in_doc:
            whitelisted_docs += 1
            doc = remove_outliers_from_document(doc)
            self._update_es(doc)

        # The ticker of the logger singleton is deliberately not used: this is called from the
        # housekeeping thread, and sharing one ticker between threads would produce strange
        # progress logging. Part of the singleton's functionality is duplicated here instead.
        verbosity = self.logging.verbosity
        log_now = (
            verbosity >= 5
            or processed % max(1, int(math.pow(10, (5 - verbosity)))) == 0
            or processed == total_nr_outliers
        )
        if not log_now:
            continue

        # Clamp the elapsed time to at least one second to avoid a division by zero
        elapsed = max(float(1), float(dt.datetime.today().timestamp() - start_time))
        ticks_per_second = "{:,}".format(round(float(processed) / elapsed))
        percent_done = '{:.2f}'.format(
            round(float(processed) / float(total_nr_outliers) * 100, 2))

        self.logging.logger.info("".join((
            "whitelisting historical outliers ", " [", ticks_per_second, " eps.", " - ",
            percent_done, "% done", " - ", str(whitelisted_docs),
            " outliers whitelisted]",
        )))

    return whitelisted_docs