def train_model(self):
    """Collect a sample of matching documents as the SVM training set.

    The number of documents collected is ``training_data_size_pct`` percent
    (read from the ``machine_learning`` configuration section) of the number
    of documents matching the model's ``es_query_filter``. ``self.total_events``
    is updated with that document count.

    The training step itself is still a placeholder; a warning is logged when
    no training data could be collected.
    """
    query = es.filter_by_query_string(self.model_settings["es_query_filter"])
    samples = []

    self.total_events = es.count_documents(search_query=query)
    size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    wanted_size = self.total_events / 100 * size_pct

    logging.print_analysis_intro(event_type="training " + self.model_name,
                                 total_events=self.total_events)
    sample_limit = int(min(wanted_size, self.total_events))
    logging.init_ticker(total_steps=sample_limit,
                        desc=self.model_name + " - preparing SVM training set")

    for doc in es.scan(search_query=query):
        if len(samples) >= sample_limit:
            # We have collected sufficient training data
            break
        logging.tick()
        samples.append(es.extract_fields_from_document(doc))

    # Now, train the model
    if not samples:
        logging.logger.warning(
            "no sentences to train model on. Are you sure the sentence configuration is correctly defined?")
        return

    # Train!! (not implemented yet)
def train_model(self):
    """Build a sentence training set from Elasticsearch and train word2vec.

    Documents matching the model's ``es_query_filter`` are scanned until the
    number of collected sentences reaches ``training_data_size_pct`` percent
    (``machine_learning`` configuration section) of the matching document
    count. Only documents containing every field named in ``sentence_format``
    contribute sentences. ``self.total_events`` is updated with the document
    count.

    A warning is logged instead of training when no sentence was collected.
    """
    w2v_model = word2vec.Word2Vec(name=self.model_name)
    query = es.filter_by_query_string(self.model_settings["es_query_filter"])
    training_sentences = []

    self.total_events = es.count_documents(search_query=query)
    size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    wanted_size = self.total_events / 100 * size_pct

    logging.print_analysis_intro(event_type="training " + self.model_name,
                                 total_events=self.total_events)
    sentence_limit = int(min(wanted_size, self.total_events))
    logging.init_ticker(total_steps=sentence_limit,
                        desc=self.model_name + " - preparing word2vec training set")

    for doc in es.scan(search_query=query):
        if len(training_sentences) >= sentence_limit:
            # We have collected sufficient training data
            break

        logging.tick()
        fields = es.extract_fields_from_document(doc)

        sentence_format = self.model_settings["sentence_format"]
        if not set(sentence_format).issubset(fields.keys()):
            continue

        flattened = helpers.utils.flatten_fields_into_sentences(
            fields=fields, sentence_format=sentence_format)
        # NOTE: duplicate sentences are kept on purpose; de-duplication was
        # disabled for testing.
        training_sentences.extend(tuple(sentence) for sentence in flattened)

    # Now, train the model
    if not training_sentences:
        logging.logger.warning(
            "no sentences to train model on. Are you sure the sentence configuration is correctly defined?")
        return

    w2v_model.train_model(training_sentences)
def evaluate_model(self):
    """Train the word2vec model, or evaluate events against the trained model.

    When the ``train_model`` setting is truthy, only training is performed.
    Otherwise matching documents are scanned, flattened into sentences and
    evaluated for outliers in batches of ``word2vec_batch_eval_size``
    sentences (``machine_learning`` configuration section). If the model
    reports ``use_test_data``, the test sentences are evaluated instead of
    live data.

    Documents that lack the configured sentence fields are skipped, but they
    no longer bypass the batch check: previously a ``continue`` meant that a
    skipped *last* document left the final pending batch unevaluated.
    """
    self.extract_extra_model_settings()

    # Train the model
    if self.model_settings["train_model"]:
        self.train_model()
        return

    w2v_model = word2vec.Word2Vec(name=self.model_name)
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    if not w2v_model.is_trained():
        logging.logger.warning("model was not trained! Skipping analysis.")
        return

    # Check if we need to run the test data instead of real data
    if w2v_model.use_test_data:
        logging.print_generic_intro(
            "using test data instead of live data to evaluate model " + self.model_name)
        self.evaluate_test_sentences(w2v_model=w2v_model)
        return

    self.total_events = es.count_documents(search_query=search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.model_name,
                                 total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating word2vec model")

    # Loop-invariant, so read it once instead of once per document
    batch_size = settings.config.getint("machine_learning", "word2vec_batch_eval_size")

    raw_docs = list()
    eval_sentences = list()

    for doc in es.scan(search_query=search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            new_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=self.model_settings["sentence_format"])
        except KeyError:
            logging.logger.debug(
                "skipping event which does not contain the target and aggregator fields we are processing. - ["
                + self.model_name + "]")
        else:
            eval_sentences.extend(new_sentences)
            # One raw document entry per sentence, so both lists stay aligned
            raw_docs.extend(doc for _ in new_sentences)

        # Evaluate batch of events against the model. This check runs for
        # skipped documents too, so the last batch is always flushed.
        if logging.current_step == self.total_events or len(eval_sentences) >= batch_size:
            logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
            outliers = self.evaluate_batch_for_outliers(
                w2v_model=w2v_model, eval_sentences=eval_sentences, raw_docs=raw_docs)

            if len(outliers) > 0:
                unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers))
                                    + " [" + str(unique_summaries) + " unique summaries]")

            # Reset data structures for next batch
            raw_docs = list()
            eval_sentences = list()
def evaluate_model(model_name=None, model_settings=None):
    """Evaluate the beaconing model against all matching documents.

    Documents matching ``model_settings["es_query_filter"]`` are scanned; for
    every document the ``target`` and ``aggregator`` fields are flattened into
    sentences and each (aggregator, target) pair is added to the current
    batch. A batch is evaluated for outliers once it holds at least
    ``beaconing_batch_eval_size`` terms (``beaconing`` configuration section)
    or when the last document has been ticked.

    Args:
        model_name: Name of the model, used in log and progress messages.
        model_settings: Settings dict; reads ``es_query_filter``, ``target``
            and ``aggregator`` and is passed on to the batch evaluation.
    """
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating beaconing model")

    # Loop-invariant, so read it once instead of once per document
    batch_size = settings.config.getint("beaconing", "beaconing_batch_eval_size")

    eval_terms_array = defaultdict()
    total_terms_added = 0

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=model_settings["aggregator"])
        except (KeyError, TypeError):
            logging.logger.debug(
                "Skipping event which does not contain the target and aggregator fields we are processing. - ["
                + model_name + "]")
        else:
            observations = dict()

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = add_term_to_batch(
                        eval_terms_array, flattened_aggregator_sentence,
                        flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_terms_added >= batch_size:
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = evaluate_batch_for_outliers(terms=eval_terms_array, model_settings=model_settings)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers))
                                    + " [" + str(unique_summaries) + " unique summaries]")
            else:
                logging.logger.info("no outliers detected in batch")

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0
def evaluate_model(model_name=None, model_settings=None):
    """Template evaluation loop: log the fields of every matching document.

    Args:
        model_name: Name of the model, used in log and progress messages.
        model_settings: Settings dict; ``es_query_filter`` selects the
            documents to scan.
    """
    query = es.filter_by_query_string(model_settings["es_query_filter"])
    event_count = es.count_documents(lucene_query=query)

    logging.print_analysis_intro(event_type="evaluating " + model_name,
                                 total_events=event_count)
    logging.init_ticker(total_steps=event_count,
                        desc=model_name + " - evaluating simplequery model")

    for doc in es.scan(lucene_query=query):
        logging.tick()
        extracted_fields = es.extract_fields_from_document(doc)

        # Add your model logic here
        pretty_fields = json.dumps(extracted_fields, indent=4)
        logging.logger.info(pretty_fields)
def perform_analysis():
    """Run every enabled ``terms_*`` model found in the configuration.

    A model runs when it is enabled for running (``general.run_models`` and
    the section's ``run_model``) or for testing (``general.test_models`` and
    the section's ``test_model``).

    When the model's ``target`` contains ``"*"`` the model is brute forced:
    one batch of documents (``terms.terms_batch_eval_size``) is sampled to
    discover field names, and the model is then evaluated once for every
    nested (dotted) field, skipping ``outliers.*`` fields.
    """
    for name in settings.config.sections():
        if not name.startswith("terms_"):
            continue

        _, model_name = name.split("terms_", 1)

        # The two flag names used to be bound to each other's settings; only
        # their disjunction is used, so the fix is purely a naming one.
        should_run_model = (settings.config.getboolean("general", "run_models")
                            and settings.config.getboolean(name, "run_model"))
        should_test_model = (settings.config.getboolean("general", "test_models")
                             and settings.config.getboolean(name, "test_model"))
        if not (should_run_model or should_test_model):
            continue

        model_settings = extract_model_settings(name)

        if "*" not in model_settings["target"]:
            evaluate_model(model_name=model_name, model_settings=model_settings)
            continue

        original_model_name = model_name
        logging.logger.warning("running terms model in brute force mode, could take a long time!")

        lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
        batch_size = settings.config.getint("terms", "terms_batch_eval_size")
        total_events = es.count_documents(lucene_query=lucene_query)
        logging.init_ticker(total_steps=min(total_events, batch_size),
                            desc=model_name + " - extracting brute force fields")

        field_names = set()
        for num_docs_processed, doc in enumerate(es.scan(lucene_query=lucene_query)):
            # Sample exactly one batch, matching the ticker's total_steps
            # (the previous check ran after processing, so batch_size + 1
            # documents were sampled).
            if num_docs_processed >= batch_size:
                break

            logging.tick()
            fields = helpers.utils.flatten_dict(es.extract_fields_from_document(doc))

            for field_name in fields:
                # skip all fields that are related to outliers, we don't want to brute force them
                if field_name.startswith('outliers.'):
                    logging.logger.debug("not brute forcing outliers field " + str(field_name))
                else:
                    field_names.add(field_name)

        logging.logger.info("going to brute force " + str(len(field_names)) + " fields")

        for field_name in field_names:
            # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
            if "." not in field_name:
                continue

            model_name = original_model_name + " [" + field_name + "]"
            model_settings["target"] = [field_name]
            # so it can be added to the outlier events automatically
            model_settings["brute_forced_field"] = field_name
            evaluate_model(model_name=model_name, model_settings=model_settings, brute_force=True)
def evaluate_model(self):
    """Evaluate the terms model, brute forcing the target when configured.

    With ``brute_force_target`` enabled, every field returned by
    ``calculate_target_fields_to_brute_force`` is evaluated as the target in
    turn, restricted to documents in which that field exists. Otherwise the
    configured ``target`` is evaluated once.
    """
    self.extract_additional_model_settings()
    settings_dict = self.model_settings

    if not settings_dict["brute_force_target"]:
        configured_target = settings_dict["target"]
        plain_query = es.filter_by_query_string(settings_dict["es_query_filter"])
        self.evaluate_target(target=configured_target,
                             search_query=plain_query,
                             brute_force=False)
        return

    logging.logger.warning("running terms model in brute force mode, could take a long time!")

    for candidate_field in self.calculate_target_fields_to_brute_force():
        settings_dict["brute_forced_field"] = candidate_field

        # Only consider documents that actually contain the field
        query_string = (settings_dict["es_query_filter"]
                        + " AND _exists_:"
                        + settings_dict["brute_forced_field"])
        field_query = es.filter_by_query_string(query_string)

        self.evaluate_target(target=[settings_dict["brute_forced_field"]],
                             search_query=field_query,
                             brute_force=True)
def calculate_target_fields_to_brute_force(self):
    """Sample one batch of documents and return the fields to brute force.

    At most ``terms.terms_batch_eval_size`` documents matching the model's
    ``es_query_filter`` are scanned. ``self.total_events`` is updated with
    the number of matching documents.

    Returns:
        set: Names of the nested (dotted) fields seen in the sample, excluding
        every field starting with ``outliers.``.
    """
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])
    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

    self.total_events = es.count_documents(index=self.es_index, search_query=search_query)
    logging.init_ticker(total_steps=min(self.total_events, batch_size),
                        desc=self.model_name + " - extracting brute force fields")

    field_names_to_brute_force = set()
    scanned_docs = es.scan(index=self.es_index, search_query=search_query)

    for num_docs_processed, doc in enumerate(scanned_docs):
        # only process a single batch of events in order to decide which fields to brute force.
        # Checking before processing keeps the sample at exactly batch_size documents, in line
        # with the ticker (the previous post-processing check sampled batch_size + 1).
        if num_docs_processed >= batch_size:
            break

        logging.tick()
        fields = es.extract_fields_from_document(
            doc, extract_derived_fields=self.model_settings["use_derived_fields"])
        fields = helpers.utils.flatten_dict(fields)

        for field_name in fields:
            # skip all fields that are related to outliers, we don't want to brute force them
            if field_name.startswith('outliers.'):
                logging.logger.debug("not brute forcing outliers field " + str(field_name))
                continue

            # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
            if "." in field_name:
                field_names_to_brute_force.add(field_name)

    logging.logger.info("going to brute force " + str(len(field_names_to_brute_force)) + " fields")
    return field_names_to_brute_force
def __init__(self, config_section_name):
    """Initialise the analyzer from its configuration section.

    Args:
        config_section_name: Name of the configuration file section for the
            use case, for example ``simplequery_test_model``.
    """
    self.config_section_name = config_section_name

    # everything before the first underscore is the model type ("simplequery"),
    # everything after it is the model name ("test_model")
    model_type, _, model_name = self.config_section_name.partition("_")
    self.model_type = model_type
    self.model_name = model_name

    # extract all settings for this use case
    self.model_settings = self._extract_model_settings()

    # When both filters are configured, the DSL filter is applied last and
    # therefore determines the final search query.
    query_filter = self.model_settings["es_query_filter"]
    if query_filter:
        self.search_query = es.filter_by_query_string(query_filter)

    dsl_filter = self.model_settings["es_dsl_filter"]
    if dsl_filter:
        self.search_query = es.filter_by_dsl_query(dsl_filter)

    self.total_events = 0
    self.outliers = []
def evaluate_model(model_name=None, model_settings=None):
    """Flag every document matching the simplequery model as an outlier.

    For each matching document an ``Outlier`` is built from the configured
    type, reason and summary (the summary has its placeholders replaced with
    the document's fields), asset information is attached when any was
    extracted, and the outlier is handed to ``es.process_outliers``.

    Args:
        model_name: Name of the model, used in log and progress messages.
        model_settings: Settings dict; reads ``es_query_filter``,
            ``outlier_summary``, ``outlier_type``, ``outlier_reason`` and
            ``should_notify``.
    """
    query = es.filter_by_query_string(model_settings["es_query_filter"])
    event_count = es.count_documents(lucene_query=query)

    logging.print_analysis_intro(event_type="evaluating " + model_name,
                                 total_events=event_count)
    logging.init_ticker(total_steps=event_count,
                        desc=model_name + " - evaluating simplequery model")

    found_outliers = []
    for doc in es.scan(lucene_query=query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        summary = replace_placeholder_string_with_fields(model_settings["outlier_summary"], fields)
        assets = helpers.utils.extract_outlier_asset_information(fields, settings)

        new_outlier = Outlier(type=model_settings["outlier_type"],
                              reason=model_settings["outlier_reason"],
                              summary=summary)
        if len(assets) > 0:
            new_outlier.add_observation("assets", assets)
        found_outliers.append(new_outlier)

        es.process_outliers(doc=doc, outliers=[new_outlier],
                            should_notify=model_settings["should_notify"])

    if not found_outliers:
        return

    distinct_summaries = {o.get_observation("summary") for o in found_outliers}
    logging.logger.info("total outliers in batch processed: {} [{} unique]".format(
        len(found_outliers), len(distinct_summaries)))
def _extract_model_settings(self):
    """Read the settings shared by all models from the configuration.

    Most options are looked up in the model's own section first; several fall
    back to the ``general`` section or to a fixed default when the option is
    missing. ``outlier_reason``, ``outlier_type`` and ``outlier_summary`` have
    no fallback.

    Besides building the returned dict, this sets ``self.search_query`` (when
    a query or DSL filter is configured; the DSL filter wins if both are),
    ``self.es_index``, ``self.should_run_model`` and ``self.should_test_model``.

    Returns:
        dict: The extracted model settings.
    """
    section = self.config_section_name
    model_settings = dict()

    # NOTE(review): the comment that used to stand here said documents are NOT
    # processed chronologically by default (because of the performance impact
    # when scanning in Elasticsearch), yet the value has always been True.
    # The value is left untouched - confirm which of the two is intended.
    model_settings["process_documents_chronologically"] = True

    try:
        model_settings["timestamp_field"] = settings.config.get(section, "timestamp_field")
    except NoOptionError:
        model_settings["timestamp_field"] = settings.config.get(
            "general", "timestamp_field", fallback="timestamp")

    # Section-specific value first, otherwise the one from [general]
    for option in ("history_window_days", "history_window_hours"):
        try:
            model_settings[option] = settings.config.getint(section, option)
        except NoOptionError:
            model_settings[option] = settings.config.getint("general", option)

    try:
        model_settings["es_query_filter"] = settings.config.get(section, "es_query_filter")
        self.search_query = es.filter_by_query_string(model_settings["es_query_filter"])
    except NoOptionError:
        model_settings["es_query_filter"] = None

    try:
        model_settings["es_dsl_filter"] = settings.config.get(section, "es_dsl_filter")
        self.search_query = es.filter_by_dsl_query(model_settings["es_dsl_filter"])
    except NoOptionError:
        model_settings["es_dsl_filter"] = None

    # Notify only when the e-mail notifier is enabled and the model asks for
    # it. (This lookup used to be repeated verbatim further down; one copy is
    # enough.)
    try:
        model_settings["should_notify"] = (
            settings.config.getboolean("notifier", "email_notifier")
            and settings.config.getboolean(section, "should_notify"))
    except NoOptionError:
        model_settings["should_notify"] = False

    try:
        model_settings["use_derived_fields"] = settings.config.getboolean(section, "use_derived_fields")
    except NoOptionError:
        model_settings["use_derived_fields"] = False

    try:
        self.es_index = settings.config.get(section, "es_index")
    except NoOptionError:
        self.es_index = settings.config.get("general", "es_index_pattern")

    for option in ("outlier_reason", "outlier_type", "outlier_summary"):
        model_settings[option] = settings.config.get(section, option)

    self.should_run_model = (settings.config.getboolean("general", "run_models")
                             and settings.config.getboolean(section, "run_model"))
    self.should_test_model = (settings.config.getboolean("general", "test_models")
                              and settings.config.getboolean(section, "test_model"))

    return model_settings
def evaluate_model(self):
    """Evaluate the terms model, brute forcing the target on a wildcard.

    When the configured ``target`` contains ``"*"``, one batch of documents is
    sampled to discover nested field names and the model is evaluated once
    for EVERY such field, restricted to documents in which the field exists.
    Previously the loop over the discovered fields only overwrote
    ``target``, so just the last-iterated field was ever evaluated.

    Without a wildcard the configured target is evaluated once. The analysis
    summary is printed once at the end in both cases.
    """
    self.extract_additional_model_settings()

    if "*" in self.model_settings["target"]:
        logging.logger.warning("running terms model in brute force mode, could take a long time!")

        for field_name in self._collect_brute_force_field_names():
            self.model_settings["target"] = [field_name]
            # so it can be added to the outlier events automatically
            self.model_settings["brute_forced_field"] = field_name

            search_query = es.filter_by_query_string(
                self.model_settings["es_query_filter"] + " AND _exists_:" + field_name)
            self._evaluate_terms_for_query(search_query=search_query, brute_force=True)
    else:
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])
        self._evaluate_terms_for_query(search_query=search_query, brute_force=False)

    self.print_analysis_summary()


def _collect_brute_force_field_names(self):
    """Sample one batch of documents and return the nested field names seen in it."""
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])
    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

    self.total_events = es.count_documents(search_query=search_query)
    logging.init_ticker(total_steps=min(self.total_events, batch_size),
                        desc=self.model_name + " - extracting brute force fields")

    field_names = set()
    for num_docs_processed, doc in enumerate(es.scan(search_query=search_query)):
        # only process a single batch of events in order to decide which fields to brute force
        # (checked up front so exactly batch_size documents are sampled, matching the ticker)
        if num_docs_processed >= batch_size:
            break

        logging.tick()
        fields = helpers.utils.flatten_dict(es.extract_fields_from_document(doc))

        for field_name in fields:
            # skip all fields that are related to outliers, we don't want to brute force them
            if field_name.startswith('outliers.'):
                logging.logger.debug("not brute forcing outliers field " + str(field_name))
                continue

            # only brute force nested fields, so not the top level fields such as timestamp, deployment name, etc.
            if "." in field_name:
                field_names.add(field_name)

    logging.logger.info("going to brute force " + str(len(field_names)) + " fields")
    return field_names


def _evaluate_terms_for_query(self, search_query, brute_force):
    """Scan the documents matching search_query and evaluate their terms in batches."""
    self.total_events = es.count_documents(search_query=search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.model_name,
                                 total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating terms model")

    # Loop-invariant, so read it once instead of once per document
    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

    eval_terms_array = defaultdict()
    total_terms_added = 0
    # +1 for every batch with outliers, -1 for every batch without
    outlier_batches_trend = 0

    for doc in es.scan(search_query=search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=self.model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=self.model_settings["aggregator"])
        except (KeyError, TypeError):
            logging.logger.debug(
                "Skipping event which does not contain the target and aggregator fields we are processing. - ["
                + self.model_name + "]")
        else:
            observations = dict()
            if brute_force:
                observations["brute_forced_field"] = self.model_settings["brute_forced_field"]

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = self.add_term_to_batch(
                        eval_terms_array, flattened_aggregator_sentence,
                        flattened_target_sentence, observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == self.total_events)
        if not (last_batch or total_terms_added >= batch_size):
            continue

        logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
        outliers = self.evaluate_batch_for_outliers(terms=eval_terms_array)

        if len(outliers) > 0:
            unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
            logging.logger.info("total outliers in batch processed: " + str(len(outliers))
                                + " [" + str(unique_summaries) + " unique summaries]")
            outlier_batches_trend += 1
        else:
            logging.logger.info("no outliers detected in batch")
            outlier_batches_trend -= 1

        # While brute forcing, give up on a field once the trend is clear
        if outlier_batches_trend == -3 and brute_force:
            logging.logger.info("too many batches without outliers, we are not going to continue brute forcing")
            break

        if outlier_batches_trend == 3 and brute_force:
            logging.logger.info("too many batches with outliers, we are not going to continue brute forcing")
            break

        # Reset data structures for next batch
        eval_terms_array = defaultdict()
        total_terms_added = 0
def evaluate_model(model_name=None, model_settings=None):
    """Evaluate the metrics model against all matching documents.

    For every document the ``target`` value is turned into a number according
    to ``model_settings["metric"]`` (``numerical_value``, ``length``,
    ``entropy``, ``hex_encoded_length``, ``base64_encoded_length`` or
    ``url_length``) and added to the current batch once per aggregator
    sentence. A batch is evaluated for outliers once it holds at least
    ``metrics_batch_eval_size`` metrics (``metrics`` configuration section) or
    when the last document has been ticked; metrics the batch evaluation
    returns as remaining are carried over into the next batch.

    Args:
        model_name: Name of the model, used in log and progress messages.
        model_settings: Settings dict; reads ``es_query_filter``, ``target``,
            ``aggregator`` and ``metric`` and is passed on to the batch
            evaluation.
    """
    query = es.filter_by_query_string(model_settings["es_query_filter"])
    event_count = es.count_documents(lucene_query=query)

    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=event_count)
    logging.init_ticker(total_steps=event_count, desc=model_name + " - evaluating metrics model")

    batch_metrics = defaultdict()
    metrics_in_batch = 0

    for doc in es.scan(lucene_query=query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            raw_target = helpers.utils.get_dotkey_value(fields, model_settings["target"], case_sensitive=True)
            target_value = helpers.utils.flatten_sentence(raw_target)
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=model_settings["aggregator"])
        except (KeyError, TypeError):
            logging.logger.debug(
                "skipping event which does not contain the target and aggregator fields we are processing. - ["
                + model_name + "]")
        else:
            observations = {}
            metric = None
            metric_name = model_settings["metric"]

            if metric_name == "numerical_value":
                # Example: numerical_value("2") => 2
                try:
                    metric = float(target_value)
                    metrics_in_batch += 1
                except ValueError:
                    # number can not be casted to a Float, just continue
                    pass

            elif metric_name == "length":
                # Example: length("outliers") => 8
                metric = len(target_value)
                metrics_in_batch += 1

            elif metric_name == "entropy":
                # Example: entropy("houston") => 2.5216406363433186
                metric = helpers.utils.shannon_entropy(target_value)
                metrics_in_batch += 1

            elif metric_name == "hex_encoded_length":
                # Length of the longest hexadecimal encoded word embedded in the string;
                # only words longer than 10 characters are considered.
                candidate_words = re.split("[^a-fA-F0-9+]", str(target_value))
                hex_words = [word for word in candidate_words
                             if len(word) > 10 and helpers.utils.is_hex_encoded(word)]

                if hex_words:
                    # stable sort + last element: among equally long words the last one wins
                    longest_hex_word = sorted(hex_words, key=len)[-1]
                    observations["max_hex_encoded_length"] = len(longest_hex_word)
                    observations["max_hex_encoded_word"] = longest_hex_word
                    metric = len(longest_hex_word)
                else:
                    metric = 0
                metrics_in_batch += 1

            elif metric_name == "base64_encoded_length":
                # Example: base64_encoded_length("houston we have a cHJvYmxlbQ==")
                #   => base64_decoded_string: problem, base64_encoded_length: 7
                decoded_words = []
                # Split on all non-Base64 characters, so we can try to decode every word
                for word in re.split("[^A-Za-z0-9+/=]", str(target_value)):
                    decoded = helpers.utils.is_base64_encoded(word)
                    # keep only decoded words of at least 5 characters
                    if decoded and len(decoded) >= 5:
                        decoded_words.append(decoded)

                if decoded_words:
                    # stable sort + last element: among equally long words the last one wins
                    longest_decoded_word = sorted(decoded_words, key=len)[-1]
                    observations["max_base64_decoded_length"] = len(longest_decoded_word)
                    observations["max_base64_decoded_word"] = longest_decoded_word
                    metric = len(longest_decoded_word)
                else:
                    metric = 0
                metrics_in_batch += 1

            elif metric_name == "url_length":
                # Example: url_length("why don't we go http://www.dance.com")
                #   => extracted_urls_length: 20, extracted_urls: http://www.dance.com
                # Split on whitespace and on double quotes, since this is most likely applied
                # to parameter arguments.
                words = target_value.replace('"', ' ').split()
                urls = [word for word in words if helpers.utils.is_url(word)]
                urls_length = sum(len(url) for url in urls)

                if urls_length > 0:
                    observations["extracted_urls_length"] = urls_length
                    observations["extracted_urls"] = ','.join(urls)

                metric = urls_length
                metrics_in_batch += 1

            if metric is not None:  # explicitly check for none, since "0" can be OK as a metric!
                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator = helpers.utils.flatten_sentence(aggregator_sentence)
                    batch_metrics = add_metric_to_batch(
                        batch_metrics, flattened_aggregator, target_value, metric, observations, doc)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == event_count)
        if last_batch or metrics_in_batch >= settings.config.getint("metrics", "metrics_batch_eval_size"):
            logging.logger.info("evaluating batch of {:,} metrics".format(metrics_in_batch))
            outliers, remaining_metrics = evaluate_batch_for_outliers(
                metrics=batch_metrics, model_settings=model_settings, last_batch=last_batch)

            if len(outliers) > 0:
                distinct_summaries = {o.get_observation("summary") for o in outliers}
                logging.logger.info("total outliers in batch processed: {} [{} unique summaries]".format(
                    len(outliers), len(distinct_summaries)))
            else:
                logging.logger.info("no outliers detected in batch")

            # Reset data structures for next batch, carrying over what was not evaluated yet
            batch_metrics = remaining_metrics.copy()
            metrics_in_batch = 0