def test_update_es_correctly_work(self):
    dictionary_value = self._get_example_dictionary_key_value_and_expected()[0]
    self.test_es.add_data(dictionary_value)

    result = [elem for elem in es.scan()][0]
    result["_source"]["key"]["test"] = "update_value"
    es._update_es(result)

    new_result = [elem for elem in es.scan()][0]
    self.assertEqual(new_result, result)

def train_model(self):
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    train_data = list()

    self.total_events = es.count_documents(search_query=search_query)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    logging.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)

    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing SVM training set")
    for doc in es.scan(search_query=search_query):
        if len(train_data) < total_training_events:
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            train_data.append(fields)
        else:
            # We have collected sufficient training data
            break

    # Now, train the model
    if len(train_data) > 0:
        pass  # Train!!
    else:
        logging.logger.warning("no sentences to train model on. Are you sure the sentence configuration is correctly defined?")

def train_model(self):
    w2v_model = word2vec.Word2Vec(name=self.model_name)

    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    sentences = list()

    self.total_events = es.count_documents(search_query=search_query)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    logging.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)

    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing word2vec training set")
    for doc in es.scan(search_query=search_query):
        if len(sentences) < total_training_events:
            logging.tick()
            fields = es.extract_fields_from_document(doc)

            if set(self.model_settings["sentence_format"]).issubset(fields.keys()):
                new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                for sentence in new_sentences:
                    sentences.append(tuple(sentence))

                # Remove all duplicates from sentences for training - REMOVED FOR TESTING
                # sentences = list(sentences)
        else:
            # We have collected sufficient training data
            break

    # Now, train the model
    if len(sentences) > 0:
        w2v_model.train_model(sentences)
    else:
        logging.logger.warning("no sentences to train model on. Are you sure the sentence configuration is correctly defined?")

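# The training and evaluation code above and below leans on helpers.utils.flatten_fields_into_sentences
# to turn an event's fields into word2vec/terms "sentences". That helper is not part of this section;
# the sketch below is a minimal stand-in illustrating the assumed behaviour: each field named in
# sentence_format contributes one value per sentence, and list-valued fields fan out into one sentence
# per combination (cartesian product). The name and semantics are assumptions, not the project's code.
import itertools


def flatten_fields_into_sentences_sketch(fields, sentence_format):
    # Wrap scalar values in a list so every field can be treated uniformly
    value_lists = [fields[field_name] if isinstance(fields[field_name], list) else [fields[field_name]]
                   for field_name in sentence_format]
    # One sentence per combination of field values; missing fields raise KeyError, as the callers expect
    return [list(combination) for combination in itertools.product(*value_lists)]


# Example: {"user": ["alice", "bob"], "host": "srv01"} with sentence_format ["user", "host"]
# would yield [["alice", "srv01"], ["bob", "srv01"]].
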
def evaluate_model(self):
    self.extract_extra_model_settings()

    # Train the model
    if self.model_settings["train_model"]:
        self.train_model()
        return

    w2v_model = word2vec.Word2Vec(name=self.model_name)
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    if not w2v_model.is_trained():
        logging.logger.warning("model was not trained! Skipping analysis.")
    else:
        # Check if we need to run the test data instead of real data
        if w2v_model.use_test_data:
            logging.print_generic_intro("using test data instead of live data to evaluate model " + self.model_name)
            self.evaluate_test_sentences(w2v_model=w2v_model)
            return

        self.total_events = es.count_documents(search_query=search_query)
        logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
        logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating word2vec model")

        raw_docs = list()
        eval_sentences = list()

        for doc in es.scan(search_query=search_query):
            logging.tick()
            fields = es.extract_fields_from_document(doc)

            try:
                new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                eval_sentences.extend(new_sentences)
            except KeyError:
                logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
                continue

            for _ in new_sentences:
                raw_docs.append(doc)

            # Evaluate batch of events against the model
            if logging.current_step == self.total_events or len(eval_sentences) >= settings.config.getint("machine_learning", "word2vec_batch_eval_size"):
                logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
                outliers = self.evaluate_batch_for_outliers(w2v_model=w2v_model, eval_sentences=eval_sentences, raw_docs=raw_docs)

                if len(outliers) > 0:
                    unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                    logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")

                # Reset data structures for next batch
                raw_docs = list()
                eval_sentences = list()

def test_simple_process_outlier_save_es(self):
    self.test_settings.change_configuration_path("/app/tests/unit_tests/files/analyzer_test_01.conf")
    analyzer = TestStubAnalyzer("analyzer_dummy_test")

    doc_without_outlier = copy.deepcopy(doc_without_outlier_test_file)
    doc_with_outlier = copy.deepcopy(doc_with_outlier_test_file)

    doc_fields = doc_without_outlier["_source"]
    analyzer.process_outlier(doc_fields, doc_without_outlier)

    result = [elem for elem in es.scan()][0]
    self.assertEqual(result, doc_with_outlier)

def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating beaconing model")

    eval_terms_array = defaultdict()
    total_terms_added = 0
    outlier_batches_trend = 0

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("Skipping event which does not contain the target and aggregator fields we are processing. - [" + model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_terms_added >= settings.config.getint("beaconing", "beaconing_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = evaluate_batch_for_outliers(terms=eval_terms_array, model_settings=model_settings)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0

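# Several models in this section group flattened target values per aggregator via add_term_to_batch
# before each batch evaluation. The helper itself is not shown here; the sketch below illustrates the
# assumed shape of eval_terms_array (a dict keyed by aggregator value, holding parallel lists of
# targets, observations and raw documents). Treat it as an illustration, not the project's actual code.
from collections import defaultdict


def add_term_to_batch_sketch(eval_terms_array, aggregator_value, target_value, observations, doc):
    if aggregator_value not in eval_terms_array:
        eval_terms_array[aggregator_value] = defaultdict(list)

    # Keep the three lists index-aligned so a batch evaluator can map a target back to its document
    eval_terms_array[aggregator_value]["targets"].append(target_value)
    eval_terms_array[aggregator_value]["observations"].append(observations)
    eval_terms_array[aggregator_value]["raw_docs"].append(doc)

    return eval_terms_array
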
def test_one_doc_outlier_correctly_add(self):
    doc_without_outlier = copy.deepcopy(doc_without_outlier_test_file)
    doc_with_outlier = copy.deepcopy(doc_with_outlier_test_file)

    # Insert value
    self.test_es.add_doc(doc_without_outlier)

    # Run the analysis (assuming all documents match the query)
    self._get_simplequery_analyzer("/app/tests/unit_tests/files/simplequery_test_01.conf", "simplequery_dummy_test").evaluate_model()

    # Fetch the result to check that it is correct
    result = [elem for elem in es.scan()][0]
    self.assertEqual(result, doc_with_outlier)

def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        # Add your model logic here
        logging.logger.info(json.dumps(fields, indent=4))

def perform_analysis():
    for name in settings.config.sections():
        if name.startswith("terms_"):
            param, model_name = name.split("terms_", 1)

            # Note: the original assignments had these two names swapped relative to the config keys
            should_run_model = settings.config.getboolean("general", "run_models") and settings.config.getboolean(name, "run_model")
            should_test_model = settings.config.getboolean("general", "test_models") and settings.config.getboolean(name, "test_model")

            if should_test_model or should_run_model:
                model_settings = extract_model_settings(name)

                if "*" in model_settings["target"]:
                    original_model_name = model_name
                    logging.logger.warning("running terms model in brute force mode, could take a long time!")

                    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
                    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

                    total_events = es.count_documents(lucene_query=lucene_query)
                    logging.init_ticker(total_steps=min(total_events, batch_size), desc=model_name + " - extracting brute force fields")

                    field_names = set()
                    num_docs_processed = 0

                    for doc in es.scan(lucene_query=lucene_query):
                        logging.tick()
                        fields = es.extract_fields_from_document(doc)
                        fields = helpers.utils.flatten_dict(fields)

                        # skip all fields that are related to outliers, we don't want to brute force them
                        for field_name in list(fields.keys()):  # create a list instead of an iterator so we can mutate the dictionary while iterating
                            if field_name.startswith('outliers.'):
                                logging.logger.debug("not brute forcing outliers field " + str(field_name))
                                fields.pop(field_name)

                        field_names.update(fields.keys())

                        if num_docs_processed == batch_size:
                            break
                        else:
                            num_docs_processed += 1

                    logging.logger.info("going to brute force " + str(len(field_names)) + " fields")

                    for field_name in field_names:
                        model_name = original_model_name + " [" + field_name + "]"

                        # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
                        if "." in field_name:
                            model_settings["target"] = list([field_name])
                            model_settings["brute_forced_field"] = field_name  # so it can be added to the outlier events automatically
                            evaluate_model(model_name=model_name, model_settings=model_settings, brute_force=True)
                else:
                    evaluate_model(model_name=model_name, model_settings=model_settings)

def evaluate_model(self):
    self.total_events = es.count_documents(search_query=self.search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.config_section_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating " + self.model_type + " model")

    for doc in es.scan(search_query=self.search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)
        self.process_outlier(fields, doc)

    self.print_analysis_summary()

def test_terms_evaluate_coeff_of_variation_like_expected_document(self):
    self.test_settings.change_configuration_path("/app/tests/unit_tests/files/terms_test_01.conf")
    analyzer = TermsAnalyzer("terms_dummy_test_no_bucket")

    doc_without_outlier = copy.deepcopy(doc_without_outlier_test_file)
    expected_doc = copy.deepcopy(doc_with_terms_outlier_coeff_of_variation_no_score_sort)

    # Add the document to the database
    self.test_es.add_doc(doc_without_outlier)

    # Run the analysis (assuming all documents match the query)
    analyzer.evaluate_model()

    result = [elem for elem in es.scan()][0]
    self.assertEqual(result, expected_doc)

def test_flush_bulk_actions_using_one_save_outlier(self):
    doc_with_outlier_with_derived_timestamp = copy.deepcopy(doc_with_outlier_with_derived_timestamp_test_file)
    doc_with_outlier_with_derived_timestamp.pop('sort')  # field added by ES
    doc_with_outlier_with_derived_timestamp.pop('_score')  # field added by ES

    doc_without_outlier = copy.deepcopy(doc_without_outlier_test_file)
    test_outlier = Outlier(outlier_type="dummy type", outlier_reason="dummy reason", outlier_summary="dummy summary", doc=doc_without_outlier)
    test_outlier.outlier_dict["observation"] = "dummy observation"

    es.save_outlier(doc_without_outlier, test_outlier)

    result = [elem for elem in es.scan()][0]
    self.assertEqual(result, doc_with_outlier_with_derived_timestamp)

def evaluate_model(self):
    model_filter = {
        "bool": {
            "filter": [
                {"term": {"outliers.model_name.raw": {"value": self.model_name}}},
                {"term": {"outliers.model_type.raw": {"value": "simplequery"}}}
            ]
        }
    }

    exclude_hits_filter = {"bool": {"must_not": model_filter}}

    query = self.search_query
    if "filter" in query:
        query["filter"].append(exclude_hits_filter)
    else:
        query["filter"] = [exclude_hits_filter]

    self.total_events = es.count_documents(index=self.es_index, search_query=query)
    logging.print_analysis_intro(event_type="evaluating " + self.config_section_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating " + self.model_type + " model")

    for doc in es.scan(index=self.es_index, search_query=query):
        logging.tick()
        fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])
        self.process_outlier(fields, doc)

    self.print_analysis_summary()

def _calculate_target_fields_to_brute_force(self):
    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

    self.total_events = es.count_documents(index=self.es_index, search_query=self.search_query, model_settings=self.model_settings)
    logging.init_ticker(total_steps=min(self.total_events, batch_size), desc=self.model_name + " - extracting brute force fields")

    field_names_to_brute_force = set()
    if self.total_events > 0:
        num_docs_processed = 0
        for doc in es.scan(index=self.es_index, search_query=self.search_query, model_settings=self.model_settings):
            logging.tick()
            fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])
            fields = helpers.utils.flatten_dict(fields)

            # create a list instead of an iterator so we can mutate the dictionary while iterating
            for field_name in list(fields.keys()):
                # skip all fields that are related to outliers, we don't want to brute force them
                if field_name.startswith('outliers.'):
                    logging.logger.debug("not brute forcing outliers field " + str(field_name))
                    continue

                # only brute force nested fields, so not the top level fields such as timestamp,
                # deployment name, etc.
                if "." in field_name:
                    field_names_to_brute_force.add(field_name)

            # only process a single batch of events in order to decide which fields to brute force
            if num_docs_processed == batch_size:
                break
            else:
                num_docs_processed += 1

    logging.logger.info("going to brute force " + str(len(field_names_to_brute_force)) + " fields")
    return field_names_to_brute_force

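# _calculate_target_fields_to_brute_force (and perform_analysis above) rely on helpers.utils.flatten_dict
# to turn nested document fields into dotted keys, which is what makes the `"." in field_name` check
# meaningful. The helper is not shown in this section; this is a minimal sketch of the assumed
# behaviour, not the project's actual code.
def flatten_dict_sketch(d, parent_key="", separator="."):
    items = {}
    for key, value in d.items():
        new_key = parent_key + separator + key if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested dictionaries, prefixing child keys with the parent key
            items.update(flatten_dict_sketch(value, new_key, separator))
        else:
            items[new_key] = value
    return items


# Example: {"meta": {"user": "alice"}, "timestamp": 1} => {"meta.user": "alice", "timestamp": 1}
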
def test_terms_generated_document_coeff_of_variation_respect_min(self):
    self.test_settings.change_configuration_path("/app/tests/unit_tests/files/terms_test_01.conf")
    analyzer = TermsAnalyzer("terms_dummy_test_no_bucket")

    doc_generator = DummyDocumentsGenerate()
    nbr_val = 24  # e.g. 24 hours
    max_trigger_sensitivity = analyzer.model_settings["trigger_sensitivity"]
    default_value = 5  # by default, 5 documents are created per hour (arbitrarily)
    max_difference = 3  # maximum difference in the number of documents (so between 2 and 8, inclusive)
    all_doc = doc_generator.create_doc_uniq_target_variable_at_most_specific_coef_variation(nbr_val, max_trigger_sensitivity, max_difference, default_value)
    self.test_es.add_multiple_docs(all_doc)

    analyzer.evaluate_model()

    nbr_outliers = 0
    for doc in es.scan():
        if "outliers" in doc['_source']:
            nbr_outliers += 1
    self.assertEqual(nbr_outliers, len(all_doc))

def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")

    outliers = list()

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        outlier_summary = replace_placeholder_string_with_fields(model_settings["outlier_summary"], fields)
        outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
        outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)

        if len(outlier_assets) > 0:
            outlier.add_observation("assets", outlier_assets)

        outliers.append(outlier)
        es.process_outliers(doc=doc, outliers=[outlier], should_notify=model_settings["should_notify"])

    if len(outliers) > 0:
        unique_summaries = len(set(o.get_observation("summary") for o in outliers))
        logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique]")

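# The summary template above is filled in by replace_placeholder_string_with_fields, which is not
# included in this section. Below is a minimal sketch of the assumed behaviour, substituting
# {field_name} placeholders with the matching field values; the exact placeholder syntax is an
# assumption, not confirmed by this section.
def replace_placeholder_string_with_fields_sketch(placeholder, fields):
    result = placeholder
    for field_name, field_value in fields.items():
        result = result.replace("{" + field_name + "}", str(field_value))
    return result


# Example: replace_placeholder_string_with_fields_sketch("outlier for {user}", {"user": "alice"})
# => "outlier for alice"
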
def evaluate_target(self, target, search_query, brute_force=False):
    self.total_events = es.count_documents(index=self.es_index, search_query=search_query)

    logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

    if brute_force:
        logging.logger.info("brute forcing field %s", str(target[0]))

    eval_terms_array = defaultdict()
    total_terms_added = 0
    outlier_batches_trend = 0

    for doc in es.scan(index=self.es_index, search_query=search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=target)
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("Skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            if brute_force:
                observations["brute_forced_field"] = self.model_settings["brute_forced_field"]

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = self.add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == self.total_events)
        if last_batch or total_terms_added >= settings.config.getint("terms", "terms_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = self.evaluate_batch_for_outliers(terms=eval_terms_array)

            if len(outliers) > 0:
                unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            if outlier_batches_trend == -3 and brute_force:
                logging.logger.info("too many batches without outliers, we are not going to continue brute forcing")
                break

            if outlier_batches_trend == 3 and brute_force:
                logging.logger.info("too many batches with outliers, we are not going to continue brute forcing")
                break

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0

    self.print_analysis_summary()

def test_remove_outliers_give_empty_list(self):
    nbr_generate = 5
    self._generate_documents(nbr_generate)

    es.remove_all_outliers()

    result = [elem for elem in es.scan()]
    self.assertEqual(len(result), 0)

def test_generate_data_count_number_results_of_scan(self):
    nbr_generate = 5
    self._generate_documents(nbr_generate)

    result = [elem for elem in es.scan()]
    self.assertEqual(len(result), nbr_generate)

def test_no_data_scan_return_empty_list(self):
    self.assertEqual([elem for elem in es.scan()], [])

def test_add_one_data_correctly_encode(self):
    dictionary_value, expected_result = self._get_example_dictionary_key_value_and_expected()
    self.test_es.add_data(dictionary_value)

    self.assertEqual([elem for elem in es.scan()], expected_result)

def evaluate_model(self):
    self.extract_additional_model_settings()

    eval_metrics = defaultdict()
    total_metrics_added = 0

    self.total_events = es.count_documents(search_query=self.search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.config_section_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating " + self.model_type + " model")

    for doc in es.scan(search_query=self.search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc, extract_derived_fields=self.model_settings["use_derived_fields"])

        try:
            target_value = helpers.utils.flatten_sentence(helpers.utils.get_dotkey_value(fields, self.model_settings["target"], case_sensitive=True))
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["aggregator"])
        except (KeyError, TypeError):
            logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
            continue

        metric, observations = self.calculate_metric(self.model_settings["metric"], target_value)

        if metric is not None:  # explicitly check for none, since "0" can be OK as a metric!
            total_metrics_added += 1
            for aggregator_sentence in aggregator_sentences:
                flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                eval_metrics = self.add_metric_to_batch(eval_metrics, flattened_aggregator_sentence, target_value, metric, observations, doc)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == self.total_events)
        if last_batch or total_metrics_added >= settings.config.getint("metrics", "metrics_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_metrics_added) + " metrics [" + "{:,}".format(logging.current_step) + " events processed]")
            outliers, remaining_metrics = self.evaluate_batch_for_outliers(metrics=eval_metrics, model_settings=self.model_settings, last_batch=last_batch)

            if len(outliers) > 0:
                unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
            else:
                logging.logger.info("no outliers detected in batch")

            # Reset data structures for next batch
            eval_metrics = remaining_metrics.copy()
            total_metrics_added = 0

    self.print_analysis_summary()

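# The metrics models read their target via helpers.utils.get_dotkey_value, i.e. a lookup of a dotted
# key such as "meta.command_line" in a nested dict. The helper is not included in this section; below
# is a minimal sketch of the assumed behaviour (the case_sensitive handling is omitted for brevity):
def get_dotkey_value_sketch(dictionary, dot_key):
    value = dictionary
    for key_part in dot_key.split("."):
        value = value[key_part]  # raises KeyError if the path does not exist, as the callers expect
    return value


# Example: get_dotkey_value_sketch({"meta": {"command_line": "whoami"}}, "meta.command_line")
# => "whoami"
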
def evaluate_model(self):
    self.extract_additional_model_settings()

    if "*" in self.model_settings["target"]:
        brute_force = False
        logging.logger.warning("running terms model in brute force mode, could take a long time!")

        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])
        batch_size = settings.config.getint("terms", "terms_batch_eval_size")

        self.total_events = es.count_documents(search_query=search_query)
        logging.init_ticker(total_steps=min(self.total_events, batch_size), desc=self.model_name + " - extracting brute force fields")

        field_names = set()
        num_docs_processed = 0

        for doc in es.scan(search_query=search_query):
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            fields = helpers.utils.flatten_dict(fields)

            # skip all fields that are related to outliers, we don't want to brute force them
            for field_name in list(fields.keys()):  # create a list instead of an iterator so we can mutate the dictionary while iterating
                if field_name.startswith('outliers.'):
                    logging.logger.debug("not brute forcing outliers field " + str(field_name))
                    fields.pop(field_name)

            field_names.update(fields.keys())

            # only process a single batch of events in order to decide which fields to brute force
            if num_docs_processed == batch_size:
                break
            else:
                num_docs_processed += 1

        logging.logger.info("going to brute force " + str(len(field_names)) + " fields")

        for field_name in field_names:
            # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
            if "." in field_name:
                self.model_settings["target"] = list([field_name])
                self.model_settings["brute_forced_field"] = field_name  # so it can be added to the outlier events automatically
                brute_force = True
    else:
        brute_force = False

    if brute_force:
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"] + " AND _exists_:" + self.model_settings["brute_forced_field"])
    else:
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    self.total_events = es.count_documents(search_query=search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

    eval_terms_array = defaultdict()
    total_terms_added = 0
    outlier_batches_trend = 0

    for doc in es.scan(search_query=search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("Skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            if brute_force:
                observations["brute_forced_field"] = self.model_settings["brute_forced_field"]

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = self.add_term_to_batch(eval_terms_array, flattened_aggregator_sentence, flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == self.total_events)
        if last_batch or total_terms_added >= settings.config.getint("terms", "terms_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = self.evaluate_batch_for_outliers(terms=eval_terms_array)

            if len(outliers) > 0:
                unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            if outlier_batches_trend == -3 and brute_force:
                logging.logger.info("too many batches without outliers, we are not going to continue brute forcing")
                break

            if outlier_batches_trend == 3 and brute_force:
                logging.logger.info("too many batches with outliers, we are not going to continue brute forcing")
                break

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0

    self.print_analysis_summary()

def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating metrics model")

    eval_metrics = defaultdict()
    total_metrics_added = 0

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        will_process_doc = False
        try:
            target_value = helpers.utils.flatten_sentence(helpers.utils.get_dotkey_value(fields, model_settings["target"], case_sensitive=True))
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + model_name + "]")

        if will_process_doc:
            observations = dict()
            metric = None

            # ------------------------------------
            # METRIC: Calculate numerical value
            # ------------------------------------
            # Example: numerical_value("2") => 2
            if model_settings["metric"] == "numerical_value":
                try:
                    metric = float(target_value)
                    total_metrics_added = total_metrics_added + 1
                except ValueError:
                    # number cannot be cast to a float, just continue
                    pass

            # ------------------------------------
            # METRIC: Calculate length of a string
            # ------------------------------------
            # Example: length("outliers") => 8
            if model_settings["metric"] == "length":
                metric = len(target_value)
                total_metrics_added = total_metrics_added + 1

            # -------------------------------------
            # METRIC: Calculate entropy of a string
            # -------------------------------------
            # Example: entropy("houston") => 2.5216406363433186
            if model_settings["metric"] == "entropy":
                metric = helpers.utils.shannon_entropy(target_value)
                total_metrics_added = total_metrics_added + 1

            # ------------------------------------------------------------------------------------
            # METRIC: Calculate total length of hexadecimal encoded substrings embedded in string
            # ------------------------------------------------------------------------------------
            if model_settings["metric"] == "hex_encoded_length":
                hex_encoded_words = list()
                target_value_words = re.split("[^a-fA-F0-9+]", str(target_value))

                # at least length 10 to have 5 encoded characters
                for word in target_value_words:
                    if len(word) > 10 and helpers.utils.is_hex_encoded(word):  # let's match at least 5 characters, meaning 10 hex digits
                        hex_encoded_words.append(word)

                if len(hex_encoded_words) > 0:
                    sorted_hex_encoded_words = sorted(hex_encoded_words, key=len)
                    observations["max_hex_encoded_length"] = len(sorted_hex_encoded_words[-1])
                    observations["max_hex_encoded_word"] = sorted_hex_encoded_words[-1]
                    metric = len(sorted_hex_encoded_words[-1])
                else:
                    metric = 0

                total_metrics_added = total_metrics_added + 1

            # ------------------------------------------------------------------------------------
            # METRIC: Calculate total length of base64 encoded substrings embedded in string
            # ------------------------------------------------------------------------------------
            # Example: base64_encoded_length("houston we have a cHJvYmxlbQ==") => base64_decoded_string: problem, base64_encoded_length: 7
            if model_settings["metric"] == "base64_encoded_length":
                base64_decoded_words = list()

                # Split all non-Base64 characters, so we can try to convert them to Base64 decoded strings
                target_value_words = re.split("[^A-Za-z0-9+/=]", str(target_value))

                for word in target_value_words:
                    decoded_word = helpers.utils.is_base64_encoded(word)
                    if decoded_word and len(decoded_word) >= 5:  # let's match at least 5 characters, meaning 10 base64 digits
                        base64_decoded_words.append(decoded_word)

                if len(base64_decoded_words) > 0:
                    sorted_base64_decoded_words = sorted(base64_decoded_words, key=len)
                    observations["max_base64_decoded_length"] = len(sorted_base64_decoded_words[-1])
                    observations["max_base64_decoded_word"] = sorted_base64_decoded_words[-1]
                    metric = len(sorted_base64_decoded_words[-1])
                else:
                    metric = 0

                total_metrics_added = total_metrics_added + 1

            # ---------------------------------------------------------
            # METRIC: Calculate total length of URLs embedded in string
            # ---------------------------------------------------------
            # Example: url_length("why don't we go http://www.dance.com") => extracted_urls_length: 20, extracted_urls: http://www.dance.com
            if model_settings["metric"] == "url_length":
                extracted_urls_length = 0
                extracted_urls = []

                # if the target value is a list of strings, convert it into a single list of strings
                # splits on whitespace by default, and on quotes, since we most likely will apply this to parameter arguments
                target_value_words = target_value.replace('"', ' ').split()

                for word in target_value_words:
                    is_url = helpers.utils.is_url(word)
                    if is_url:
                        extracted_urls_length += len(word)
                        extracted_urls.append(word)

                if extracted_urls_length > 0:
                    observations["extracted_urls_length"] = extracted_urls_length
                    observations["extracted_urls"] = ','.join(extracted_urls)

                metric = extracted_urls_length
                total_metrics_added = total_metrics_added + 1

            if metric is not None:  # explicitly check for none, since "0" can be OK as a metric!
                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_metrics = add_metric_to_batch(eval_metrics, flattened_aggregator_sentence, target_value, metric, observations, doc)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_metrics_added >= settings.config.getint("metrics", "metrics_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_metrics_added) + " metrics")
            outliers, remaining_metrics = evaluate_batch_for_outliers(metrics=eval_metrics, model_settings=model_settings, last_batch=last_batch)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")
            else:
                logging.logger.info("no outliers detected in batch")

            # Reset data structures for next batch
            eval_metrics = remaining_metrics.copy()
            total_metrics_added = 0

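# The "entropy" metric above delegates to helpers.utils.shannon_entropy, which is not included in
# this section. Below is a minimal sketch of the usual definition (sum of -p*log2(p) over character
# frequencies), assuming that is what the helper computes:
import math
from collections import Counter


def shannon_entropy_sketch(data):
    if not data:
        return 0.0
    frequencies = Counter(data)
    length = len(data)
    # -sum(p * log2(p)) over the relative frequency p of each distinct character
    return -sum((count / length) * math.log2(count / length) for count in frequencies.values())


# Example: shannon_entropy_sketch("houston") => 2.5216406363433186, matching the comment above.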