def _compute_aggregator_and_target_value(self, doc):
    """
    Compute the target value and the aggregator sentences.
    Return the two values, or two None values if either of them could not be computed.

    :param doc: the document for which the calculations must be made
    :return: target_value (could be None), aggregator_sentences (could be None)
    """
    fields = es.extract_fields_from_document(
        doc, extract_derived_fields=self.model_settings["use_derived_fields"])
    try:
        target_value = helpers.utils.flatten_sentence(
            helpers.utils.get_dotkey_value(fields, self.model_settings["target"], case_sensitive=True))
        aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
            fields=fields, sentence_format=self.model_settings["aggregator"])
    except (KeyError, TypeError):
        logging.logger.debug("skipping event which does not contain the target and aggregator " +
                             "fields we are processing. - [" + self.model_name + "]")
        return None, None
    return target_value, aggregator_sentences

def _create_outlier(self, non_outlier_values, term_value_count, aggregator_value, term_value,
                    decision_frontier, terms, ii):
    # list() because random.sample no longer accepts sets on Python 3.11+
    non_outlier_values_sample = ",".join(
        random.sample(list(non_outlier_values), min(3, len(non_outlier_values))))

    observations = dict()
    observations["non_outlier_values_sample"] = non_outlier_values_sample
    observations["term_count"] = term_value_count
    observations["aggregator"] = aggregator_value
    observations["term"] = term_value
    observations["decision_frontier"] = decision_frontier
    observations["trigger_method"] = str(self.model_settings["trigger_method"])

    calculated_observations = terms[observations["aggregator"]]["observations"][ii]
    calculated_observations.update(observations)

    raw_doc = terms[observations["aggregator"]]["raw_docs"][ii]
    fields = es.extract_fields_from_document(
        raw_doc, extract_derived_fields=self.model_settings["use_derived_fields"])
    return self.process_outlier(fields, raw_doc, extra_outlier_information=calculated_observations)

def _create_outlier(self, raw_doc):
    """
    Create an outlier from a raw document

    :param raw_doc: raw document representing one hit event from an Elasticsearch request
    :return: the created outlier
    """
    extra_outlier_information = dict()
    if self.model_settings["highlight_match"]:
        extra_outlier_information["matched_fields"] = raw_doc["highlight"]

        matched_values = dict()
        for key, fields in raw_doc["highlight"].items():
            matched_values[key] = list()
            for field in fields:
                # Find all values between the tags <value> and </value>
                values = re.findall("<value>(.*?)</value>", field, flags=re.DOTALL)
                # Extend instead of assign, so matches from every highlighted field are kept
                matched_values[key].extend(values)
        extra_outlier_information["matched_values"] = str(matched_values)

    fields = es.extract_fields_from_document(
        raw_doc, extract_derived_fields=self.model_settings["use_derived_fields"])
    return self.create_outlier(fields, raw_doc, extra_outlier_information=extra_outlier_information)

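# A minimal, self-contained demonstration of the highlight extraction above, using a
# made-up Elasticsearch "highlight" payload; the <value>/</value> pre/post tags are
# assumed to match the ones configured for the highlighter.
import re

sample_highlight = {"command_line": ["powershell -enc <value>SQBFAFgA</value> <value>dABlAHMAdAA=</value>"]}

matched_values = dict()
for key, highlighted_fields in sample_highlight.items():
    matched_values[key] = list()
    for highlighted_field in highlighted_fields:
        matched_values[key].extend(re.findall("<value>(.*?)</value>", highlighted_field, flags=re.DOTALL))

print(matched_values)  # {'command_line': ['SQBFAFgA', 'dABlAHMAdAA=']}
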
def evaluate_batch_for_outliers(self, w2v_model=None, eval_sentences=None, raw_docs=None):
    # Initialize
    outliers = list()

    # sentence_probs contains, for each evaluated sentence, the probability of that sentence
    # according to the trained word2vec model
    sentence_probs = w2v_model.evaluate_sentences(eval_sentences)

    # The set of unique probabilities and the decision frontier are identical for the whole
    # batch, so compute them once instead of once per sentence
    unique_probs = list(set(sentence_probs))
    decision_frontier = helpers.utils.get_decision_frontier(self.model_settings["trigger_method"],
                                                            unique_probs,
                                                            self.model_settings["trigger_sensitivity"],
                                                            self.model_settings["trigger_on"])

    for i, single_sentence_prob in enumerate(sentence_probs):
        # If the probability is NaN, the sentence could not be evaluated and we can't reason
        # about it. This happens, for example, whenever the sentence is made up entirely of
        # words that are unknown to the trained model.
        if np.isnan(single_sentence_prob):
            continue

        is_outlier = helpers.utils.is_outlier(single_sentence_prob, decision_frontier,
                                              self.model_settings["trigger_on"])

        if is_outlier:
            fields = es.extract_fields_from_document(raw_docs[i])
            outliers.append(self.process_outlier(fields, raw_docs[i], extra_outlier_information=None))
        elif w2v_model.use_test_data:
            logging.logger.info("Not an outlier: " + str(eval_sentences[i]) + " - " +
                                str(single_sentence_prob))

    return outliers

def process_outlier(decision_frontier, non_outlier_values_sample, term_value_count, terms,
                    aggregator_value, ii, term_value, model_settings):
    # Extract fields from raw document
    fields = es.extract_fields_from_document(terms[aggregator_value]["raw_docs"][ii])

    observations = terms[aggregator_value]["observations"][ii]
    observations["non_outlier_values_sample"] = non_outlier_values_sample
    observations["aggregator"] = aggregator_value
    observations["term"] = term_value
    observations["term_count"] = term_value_count
    observations["decision_frontier"] = decision_frontier
    observations["trigger_method"] = str(model_settings["trigger_method"])
    observations["confidence"] = np.abs(decision_frontier - term_value_count)

    merged_fields_and_observations = helpers.utils.merge_two_dicts(fields, observations)
    outlier_summary = helpers.utils.replace_placeholder_fields_with_values(
        model_settings["outlier_summary"], merged_fields_and_observations)

    outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
    if len(outlier_assets) > 0:
        observations["assets"] = outlier_assets

    outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"],
                      summary=outlier_summary)

    for k, v in observations.items():
        outlier.add_observation(k, v)

    es.process_outliers(doc=terms[aggregator_value]["raw_docs"][ii], outliers=[outlier],
                        should_notify=model_settings["should_notify"])
    return outlier

def train_model(self):
    w2v_model = word2vec.Word2Vec(name=self.model_name)
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    sentences = list()

    self.total_events = es.count_documents(search_query=search_query)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    logging.print_analysis_intro(event_type="training " + self.model_name,
                                 total_events=self.total_events)
    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events,
                        desc=self.model_name + " - preparing word2vec training set")
    for doc in es.scan(search_query=search_query):
        if len(sentences) < total_training_events:
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            if set(self.model_settings["sentence_format"]).issubset(fields.keys()):
                new_sentences = helpers.utils.flatten_fields_into_sentences(
                    fields=fields, sentence_format=self.model_settings["sentence_format"])
                for sentence in new_sentences:
                    sentences.append(tuple(sentence))

                # Remove all duplicates from sentences for training - REMOVED FOR TESTING
                # sentences = list(set(sentences))
        else:
            # We have collected sufficient training data
            break

    # Now, train the model
    if len(sentences) > 0:
        w2v_model.train_model(sentences)
    else:
        logging.logger.warning("no sentences to train model on. Are you sure the sentence " +
                               "configuration is correctly defined?")

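# A quick worked example of the training-set sizing above (plain arithmetic, no project
# code involved): with 10,000 matching events and a hypothetical
# training_data_size_pct of 25, the model trains on at most 2,500 events.
total_events = 10_000
training_data_size_pct = 25  # hypothetical config value

training_data_size = total_events / 100 * training_data_size_pct
total_training_events = int(min(training_data_size, total_events))
print(total_training_events)  # 2500
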
def train_model(self):
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    train_data = list()

    self.total_events = es.count_documents(search_query=search_query)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    logging.print_analysis_intro(event_type="training " + self.model_name,
                                 total_events=self.total_events)
    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events,
                        desc=self.model_name + " - preparing SVM training set")
    for doc in es.scan(search_query=search_query):
        if len(train_data) < total_training_events:
            logging.tick()
            fields = es.extract_fields_from_document(doc)
            train_data.append(fields)
        else:
            # We have collected sufficient training data
            break

    # Now, train the model
    if len(train_data) > 0:
        pass  # Train!!
    else:
        logging.logger.warning("no training data to train model on. Are you sure the model " +
                               "configuration is correctly defined?")

def test_extract_outlier_asset_information_case_insensitive_value(self):
    from helpers.singletons import settings, es

    # test case for case insensitive asset matching
    orig_doc = copy.deepcopy(doc_with_outlier_test_file)
    fields = es.extract_fields_from_document(orig_doc, extract_derived_fields=False)
    outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)

    self.assertIn("ip: 192.168.67.175", outlier_assets)

def find_sudden_appearance(self, start_slide_win, end_slide_win):
    """
    Find sudden appearances of a term field, defined by self.model_settings["target"], within the
    aggregation defined by self.model_settings["aggregator"], in events inside the time window
    bounded by start_slide_win and end_slide_win, and create outliers for them.
    An event is considered an outlier when its term field appears for the first time after
    (end_slide_win - self.jump_win).

    :param start_slide_win: start time of the time window
    :param end_slide_win: end time of the time window
    """
    aggregator_buckets = es.scan_first_occur_documents(search_query=self.search_query,
                                                       start_time=start_slide_win,
                                                       end_time=end_slide_win,
                                                       model_settings=self.model_settings)
    # Loop over the aggregations
    for aggregator_bucket in aggregator_buckets:
        target_buckets = aggregator_bucket["target"]["buckets"]
        # Loop over the documents in the aggregation
        for doc in target_buckets:
            self.num_event_proc += doc["doc_count"]
            raw_doc = doc["top_doc"]["hits"]["hits"][0]
            fields = es.extract_fields_from_document(
                raw_doc, extract_derived_fields=self.model_settings["use_derived_fields"])
            # Convert the event timestamp to the right format
            event_timestamp = dateutil.parser.parse(fields[self.model_settings["timestamp_field"]],
                                                    ignoretz=True)

            if event_timestamp > (end_slide_win - self.jump_win):
                # Retrieve extra information
                extra_outlier_information = dict()
                extra_outlier_information["size_time_window"] = str(self.delta_slide_win)
                extra_outlier_information["start_time_window"] = str(start_slide_win)
                extra_outlier_information["end_time_window"] = str(end_slide_win)
                extra_outlier_information["aggregator"] = self.model_settings["aggregator"]
                extra_outlier_information["aggregator_value"] = aggregator_bucket["key"]
                extra_outlier_information["target"] = self.model_settings["target"]
                extra_outlier_information["target_value"] = doc["key"]
                extra_outlier_information["num_target_value_in_window"] = doc["doc_count"]

                outlier = self.create_outlier(fields, raw_doc,
                                              extra_outlier_information=extra_outlier_information)
                self.process_outlier(outlier)

                summary = "In aggregator '%s: %s', the field(s) '%s: %s' appear(s) " \
                          "suddenly at %s within the time window of size %s." % \
                          (", ".join(self.model_settings["aggregator"]),
                           aggregator_bucket["key"],
                           ", ".join(self.model_settings["target"]),
                           doc["key"],
                           str(event_timestamp),
                           self.delta_slide_win)
                logging.logger.debug(summary)

    logging.tick(self.num_event_proc)

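# A minimal, self-contained illustration of the window check above: an event is only
# flagged when its first occurrence falls inside the last `jump_win` slice of the
# sliding window. All values here are made up for the example.
import datetime
import dateutil.parser

end_slide_win = datetime.datetime(2020, 1, 10, 12, 0, 0)
jump_win = datetime.timedelta(hours=1)

event_timestamp = dateutil.parser.parse("2020-01-10T11:30:00", ignoretz=True)
print(event_timestamp > (end_slide_win - jump_win))  # True -> first seen inside the jump window
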
def test_extract_outlier_asset_information_simple_matching(self):
    from helpers.singletons import settings, es

    orig_doc = copy.deepcopy(doc_with_outlier_test_file)
    fields = es.extract_fields_from_document(orig_doc, extract_derived_fields=False)

    # test case for simple asset matching
    outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
    self.assertIn("user: dummyuser", outlier_assets)
    self.assertIn("host: DUMMY-PC", outlier_assets)

def test_extract_outlier_asset_information_list_values(self):
    from helpers.singletons import settings, es

    orig_doc = copy.deepcopy(doc_with_asset_edgecases)
    fields = es.extract_fields_from_document(orig_doc, extract_derived_fields=False)

    # test case for asset fields containing multiple values in an array
    outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
    self.assertIn("user: dummyuser1", outlier_assets)  # test case for array assets
    self.assertIn("user: dummyuser2", outlier_assets)  # test case for array assets

    # blank asset fields, such as the PC name in the JSON file, should not be extracted as assets
    self.assertEqual(len(outlier_assets), 3)

def evaluate_model(self):
    self.extract_extra_model_settings()

    # Train the model
    if self.model_settings["train_model"]:
        self.train_model()
        return

    w2v_model = word2vec.Word2Vec(name=self.model_name)
    search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

    if not w2v_model.is_trained():
        logging.logger.warning("model was not trained! Skipping analysis.")
    else:
        # Check if we need to run the test data instead of real data
        if w2v_model.use_test_data:
            logging.print_generic_intro("using test data instead of live data to evaluate model " +
                                        self.model_name)
            self.evaluate_test_sentences(w2v_model=w2v_model)
            return

        self.total_events = es.count_documents(search_query=search_query)
        logging.print_analysis_intro(event_type="evaluating " + self.model_name,
                                     total_events=self.total_events)
        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating word2vec model")

        raw_docs = list()
        eval_sentences = list()

        for doc in es.scan(search_query=search_query):
            logging.tick()
            fields = es.extract_fields_from_document(doc)

            try:
                new_sentences = helpers.utils.flatten_fields_into_sentences(
                    fields=fields, sentence_format=self.model_settings["sentence_format"])
                eval_sentences.extend(new_sentences)
            except KeyError:
                logging.logger.debug("skipping event which does not contain the target and " +
                                     "aggregator fields we are processing. - [" +
                                     self.model_name + "]")
                continue

            for _ in new_sentences:
                raw_docs.append(doc)

            # Evaluate batch of events against the model
            if logging.current_step == self.total_events or \
                    len(eval_sentences) >= settings.config.getint("machine_learning",
                                                                  "word2vec_batch_eval_size"):
                logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
                outliers = self.evaluate_batch_for_outliers(w2v_model=w2v_model,
                                                            eval_sentences=eval_sentences,
                                                            raw_docs=raw_docs)

                if len(outliers) > 0:
                    unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                    logging.logger.info("total outliers in batch processed: " + str(len(outliers)) +
                                        " [" + str(unique_summaries) + " unique summaries]")

                # Reset data structures for next batch
                raw_docs = list()
                eval_sentences = list()

def is_document_whitelisted(self, document, extract_field=True):
    document_to_check = copy.deepcopy(document)
    if extract_field:
        fields = es.extract_fields_from_document(
            document_to_check, extract_derived_fields=self.model_settings["use_derived_fields"])
    else:
        fields = document

    outlier_param = self._prepare_outlier_parameters(dict(), fields)
    document_to_check['__whitelist_extra'] = outlier_param
    return Outlier.is_whitelisted_doc(document_to_check)

def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating beaconing model")

    eval_terms_array = defaultdict()
    total_terms_added = 0
    outlier_batches_trend = 0

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=model_settings["target"])
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("Skipping event which does not contain the target and aggregator " +
                                 "fields we are processing. - [" + model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = add_term_to_batch(eval_terms_array,
                                                         flattened_aggregator_sentence,
                                                         flattened_target_sentence,
                                                         observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == total_events)
        if last_batch or total_terms_added >= settings.config.getint("beaconing",
                                                                     "beaconing_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = evaluate_batch_for_outliers(terms=eval_terms_array,
                                                   model_settings=model_settings)

            if len(outliers) > 0:
                unique_summaries = len(set(o.get_observation("summary") for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) +
                                    " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0

def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")

    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        # Add your model logic here
        logging.logger.info(json.dumps(fields, indent=4))

def evaluate_model(self):
    model_filter = {
        "bool": {
            "filter": [
                {"term": {"outliers.model_name.keyword": {"value": self.model_name}}},
                {"term": {"outliers.model_type.keyword": {"value": "simplequery"}}}
            ]
        }
    }
    exclude_hits_filter = {"bool": {"must_not": model_filter}}

    query = self.search_query
    if "filter" in query:
        query["filter"].append(exclude_hits_filter)
    else:
        query["filter"] = [exclude_hits_filter]

    self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                               search_query=query,
                                                               model_settings=self.model_settings)
    self.print_analysis_intro(event_type="evaluating " + self.model_type + "_" + self.model_name,
                              total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating " + self.model_type + " model")

    if self.total_events > 0:
        for doc in documents:
            logging.tick()
            fields = es.extract_fields_from_document(
                doc, extract_derived_fields=self.model_settings["use_derived_fields"])
            outlier = self.create_outlier(fields, doc)
            self.process_outlier(outlier)

    self.print_analysis_summary()

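# A small standalone illustration of how the must_not clause above ends up in the search
# query: documents already tagged by this model are excluded from re-evaluation. The
# model name and base query are made up for the example.
import json

model_filter = {
    "bool": {
        "filter": [
            {"term": {"outliers.model_name.keyword": {"value": "suspicious_process"}}},
            {"term": {"outliers.model_type.keyword": {"value": "simplequery"}}}
        ]
    }
}
exclude_hits_filter = {"bool": {"must_not": model_filter}}

search_query = {"filter": [{"query_string": {"query": "tags:endpoint"}}]}
search_query["filter"].append(exclude_hits_filter)
print(json.dumps(search_query, indent=2))
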
def perform_analysis():
    for name in settings.config.sections():
        if name.startswith("terms_"):
            _, model_name = name.split("terms_", 1)
            should_run_model = settings.config.getboolean("general", "run_models") and \
                settings.config.getboolean(name, "run_model")
            should_test_model = settings.config.getboolean("general", "test_models") and \
                settings.config.getboolean(name, "test_model")

            if should_test_model or should_run_model:
                model_settings = extract_model_settings(name)

                if "*" in model_settings["target"]:
                    original_model_name = model_name
                    logging.logger.warning("running terms model in brute force mode, " +
                                           "could take a long time!")

                    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
                    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

                    total_events = es.count_documents(lucene_query=lucene_query)
                    logging.init_ticker(total_steps=min(total_events, batch_size),
                                        desc=model_name + " - extracting brute force fields")

                    field_names = set()
                    num_docs_processed = 0
                    for doc in es.scan(lucene_query=lucene_query):
                        logging.tick()
                        fields = es.extract_fields_from_document(doc)
                        fields = helpers.utils.flatten_dict(fields)

                        # skip all fields that are related to outliers, we don't want to brute
                        # force them. Create a list instead of an iterator so we can mutate the
                        # dictionary while iterating over it.
                        for field_name in list(fields.keys()):
                            if field_name.startswith('outliers.'):
                                logging.logger.debug("not brute forcing outliers field " +
                                                     str(field_name))
                                fields.pop(field_name)

                        field_names.update(fields.keys())

                        if num_docs_processed == batch_size:
                            break
                        else:
                            num_docs_processed += 1

                    logging.logger.info("going to brute force " + str(len(field_names)) + " fields")
                    for field_name in field_names:
                        model_name = original_model_name + " [" + field_name + "]"
                        # only brute force nested fields, so not the top-level fields such as
                        # timestamp, deployment name, etc.
                        if "." in field_name:
                            model_settings["target"] = [field_name]
                            # so it can be added to the outlier events automatically
                            model_settings["brute_forced_field"] = field_name
                            evaluate_model(model_name=model_name, model_settings=model_settings,
                                           brute_force=True)
                else:
                    evaluate_model(model_name=model_name, model_settings=model_settings)

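# The brute-force discovery above relies on helpers.utils.flatten_dict turning nested
# documents into dotted field names. A minimal sketch of that behavior, written from
# scratch here as an assumption about the helper, not its actual implementation:
def flatten_dict(d, parent_key="", sep="."):
    items = {}
    for key, value in d.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(value, dict):
            # recurse into nested dictionaries, prefixing their keys
            items.update(flatten_dict(value, new_key, sep=sep))
        else:
            items[new_key] = value
    return items

doc_fields = {"timestamp": "2020-01-01",
              "process": {"name": "cmd.exe", "parent": {"name": "explorer.exe"}}}
print(flatten_dict(doc_fields))
# {'timestamp': '2020-01-01', 'process.name': 'cmd.exe', 'process.parent.name': 'explorer.exe'}
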
def evaluate_batch_for_outliers(metrics=None, model_settings=None, last_batch=False):
    # Initialize
    outliers = list()
    remaining_metrics = metrics.copy()

    for aggregator_value in metrics:
        # Check if we have sufficient data. If not, continue. Else, evaluate for outliers.
        if len(metrics[aggregator_value]["metrics"]) < 100 and last_batch is False:
            continue
        else:
            # Remove from remaining metrics, as we will be handling it in a second
            del remaining_metrics[aggregator_value]

        # Calculate the decision frontier
        decision_frontier = helpers.utils.get_decision_frontier(model_settings["trigger_method"],
                                                                metrics[aggregator_value]["metrics"],
                                                                model_settings["trigger_sensitivity"],
                                                                model_settings["trigger_on"])
        logging.logger.debug("using decision frontier " + str(decision_frontier) +
                             " for aggregator " + str(aggregator_value) + " - " +
                             model_settings["metric"])
        logging.logger.debug("example metric from batch for " +
                             metrics[aggregator_value]["observations"][0]["target"] + ": " +
                             str(metrics[aggregator_value]["metrics"][0]))

        # Calculate all outliers in array
        for ii, metric_value in enumerate(metrics[aggregator_value]["metrics"]):
            is_outlier = helpers.utils.is_outlier(metric_value, decision_frontier,
                                                  model_settings["trigger_on"])

            if is_outlier:
                confidence = np.abs(decision_frontier - metric_value)

                # Extract fields from raw document
                fields = es.extract_fields_from_document(metrics[aggregator_value]["raw_docs"][ii])

                observations = metrics[aggregator_value]["observations"][ii]
                merged_fields_and_observations = helpers.utils.merge_two_dicts(fields, observations)
                outlier_summary = helpers.utils.replace_placeholder_fields_with_values(
                    model_settings["outlier_summary"], merged_fields_and_observations)

                outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
                if len(outlier_assets) > 0:
                    observations["assets"] = outlier_assets

                outlier = Outlier(type=model_settings["outlier_type"],
                                  reason=model_settings["outlier_reason"],
                                  summary=outlier_summary)
                outlier.add_observation("metric", metric_value)
                outlier.add_observation("decision_frontier", decision_frontier)
                outlier.add_observation("confidence", confidence)

                for k, v in observations.items():
                    outlier.add_observation(k, v)

                outliers.append(outlier)
                es.process_outliers(doc=metrics[aggregator_value]["raw_docs"][ii],
                                    outliers=[outlier],
                                    should_notify=model_settings["should_notify"])

    return outliers, remaining_metrics

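# Many snippets in this file lean on helpers.utils.get_decision_frontier and
# helpers.utils.is_outlier. The sketch below shows one plausible MAD-based (median
# absolute deviation) variant of such a frontier, written independently here for
# illustration; it is an assumption, not the project's actual implementation.
import numpy as np

def mad_decision_frontier(values, sensitivity, trigger_on):
    values = np.asarray(values, dtype=float)
    median = np.median(values)
    mad = np.median(np.abs(values - median))  # median absolute deviation
    if trigger_on == "high":
        return median + sensitivity * mad
    return median - sensitivity * mad

def is_outlier(value, decision_frontier, trigger_on):
    return value > decision_frontier if trigger_on == "high" else value < decision_frontier

metrics_batch = [10, 12, 11, 13, 12, 95]
frontier = mad_decision_frontier(metrics_batch, sensitivity=3, trigger_on="high")  # 12 + 3*1 = 15
print([v for v in metrics_batch if is_outlier(v, frontier, "high")])  # [95]
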
def evaluate_model(self):
    self.total_events = es.count_documents(search_query=self.search_query)
    logging.print_analysis_intro(event_type="evaluating " + self.config_section_name,
                                 total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating " + self.model_type + " model")

    for doc in es.scan(search_query=self.search_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)
        self.process_outlier(fields, doc)

    self.print_analysis_summary()

def _calculate_target_fields_to_brute_force(self):
    batch_size = settings.config.getint("terms", "terms_batch_eval_size")

    self.total_events = es.count_documents(index=self.es_index, search_query=self.search_query,
                                           model_settings=self.model_settings)
    logging.init_ticker(total_steps=min(self.total_events, batch_size),
                        desc=self.model_name + " - extracting brute force fields")

    field_names_to_brute_force = set()
    if self.total_events > 0:
        num_docs_processed = 0
        for doc in es.scan(index=self.es_index, search_query=self.search_query,
                           model_settings=self.model_settings):
            logging.tick()
            fields = es.extract_fields_from_document(
                doc, extract_derived_fields=self.model_settings["use_derived_fields"])
            fields = helpers.utils.flatten_dict(fields)

            # create a list instead of an iterator so we can mutate the dictionary while iterating
            for field_name in list(fields.keys()):
                # skip all fields that are related to outliers, we don't want to brute force them
                if field_name.startswith('outliers.'):
                    logging.logger.debug("not brute forcing outliers field " + str(field_name))
                    continue
                # only brute force nested fields, so not the top-level fields such as timestamp,
                # deployment name, etc.
                if "." in field_name:
                    field_names_to_brute_force.add(field_name)

            # only process a single batch of events in order to decide which fields to brute force
            if num_docs_processed == batch_size:
                break
            else:
                num_docs_processed += 1

    logging.logger.info("going to brute force " + str(len(field_names_to_brute_force)) + " fields")
    return field_names_to_brute_force

def test_flush_bulk_actions_using_one_save_outlier(self):
    doc_with_outlier_with_derived_timestamp = copy.deepcopy(
        doc_with_outlier_with_derived_timestamp_test_file)
    doc_without_outlier = copy.deepcopy(doc_without_outlier_test_file)
    doc_without_outlier["_source"] = es.extract_fields_from_document(doc_without_outlier,
                                                                     extract_derived_fields=True)
    self.test_es.add_doc(doc_without_outlier)

    test_outlier = Outlier(outlier_type="dummy type", outlier_reason="dummy reason",
                           outlier_summary="dummy summary", doc=doc_without_outlier)
    test_outlier.outlier_dict["observation"] = "dummy observation"

    es.save_outlier(test_outlier, extract_derived_fields=True)

    result = [elem for elem in es._scan()][0]
    self.assertEqual(result, doc_with_outlier_with_derived_timestamp)

def _compute_aggregator_and_target_value(self, doc, target):
    """
    Extract the target and aggregator sentences from a document

    :param doc: document from which the data needs to be extracted
    :param target: target key name
    :return: list of target sentences and list of aggregator sentences
    """
    fields = es.extract_fields_from_document(
        doc, extract_derived_fields=self.model_settings["use_derived_fields"])
    try:
        target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields,
                                                                       sentence_format=target)
        aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
            fields=fields, sentence_format=self.model_settings["aggregator"])
    except (KeyError, TypeError):
        logging.logger.debug("Skipping event which does not contain the target and aggregator " +
                             "fields we are processing. - [" + self.model_name + "]")
        return None, None
    return target_sentences, aggregator_sentences

def _create_outlier(self, non_outlier_values, term_value_count, aggregator_value, term_value,
                    decision_frontier, batch, ii):
    """
    Create an outlier with the given parameters

    :param non_outlier_values: list of term values that were not flagged as outliers
    :param term_value_count: number of terms
    :param aggregator_value: aggregator value
    :param term_value: term value
    :param decision_frontier: value of the decision frontier
    :param batch: batch
    :param ii: index of the document linked to this outlier
    :return: the created outlier
    """
    observations = dict()

    if non_outlier_values:
        non_outlier_values_sample = ",".join(
            random.sample(non_outlier_values, min(3, len(non_outlier_values))))
        observations["non_outlier_values_sample"] = non_outlier_values_sample
    else:
        observations["non_outlier_values_sample"] = []

    observations["term_count"] = term_value_count
    observations["aggregator"] = aggregator_value
    observations["term"] = term_value
    observations["decision_frontier"] = decision_frontier
    observations["trigger_method"] = str(self.model_settings["trigger_method"])

    calculated_observations = batch[observations["aggregator"]]["observations"][ii]
    calculated_observations.update(observations)

    raw_doc = batch[observations["aggregator"]]["raw_docs"][ii]
    fields = es.extract_fields_from_document(
        raw_doc, extract_derived_fields=self.model_settings["use_derived_fields"])
    return self.create_outlier(fields, raw_doc, extra_outlier_information=calculated_observations)

def prepare_and_process_outlier(self, decision_frontier, term_value_count, terms, aggregator_value,
                                term_counter):
    # Extract fields from raw document
    fields = es.extract_fields_from_document(terms[aggregator_value]["raw_docs"][term_counter])

    observations = terms[aggregator_value]["observations"][term_counter]
    observations["aggregator"] = aggregator_value
    observations["term"] = terms[aggregator_value]["targets"][term_counter]
    observations["term_count"] = term_value_count
    observations["decision_frontier"] = decision_frontier
    observations["confidence"] = np.abs(decision_frontier - term_value_count)

    return self.process_outlier(fields, terms[aggregator_value]["raw_docs"][term_counter],
                                extra_outlier_information=observations)

def evaluate_batch_for_outliers(w2v_model=None, eval_sentences=None, raw_docs=None,
                                model_settings=None):
    # Initialize
    outliers = list()

    # sentence_probs contains, for each evaluated sentence, the probability of that sentence
    # according to the trained word2vec model
    sentence_probs = w2v_model.evaluate_sentences(eval_sentences)

    # The set of unique probabilities is identical for the whole batch, so compute it once
    unique_probs = list(set(sentence_probs))

    for i, single_sentence_prob in enumerate(sentence_probs):
        # If the probability is NaN, the sentence could not be evaluated and we can't reason
        # about it. This happens, for example, whenever the sentence is made up entirely of
        # words that are unknown to the trained model.
        if np.isnan(single_sentence_prob):
            continue

        # if is_outlier_cutoff_percentage(single_sentence_prob, cutoff=0.005):
        # if is_outlier_std(single_sentence_prob, unique_probs, model_settings):
        if is_outlier_mad(single_sentence_prob, unique_probs, model_settings):
            outlier_summary = model_settings["outlier_summary"]

            # Extract fields from raw document
            fields = es.extract_fields_from_document(raw_docs[i])
            outlier_summary = replace_placeholder_string_with_fields(outlier_summary, fields)

            outlier = Outlier(type=model_settings["outlier_type"],
                              reason=model_settings["outlier_reason"],
                              summary=outlier_summary)
            outlier.add_observation("probability", str(single_sentence_prob))
            outliers.append(outlier)

            es.process_outliers(doc=raw_docs[i], outliers=[outlier],
                                should_notify=model_settings["should_notify"])
        elif w2v_model.use_test_data:
            logging.logger.info("Not an outlier: " + str(eval_sentences[i]) + " - " +
                                str(single_sentence_prob))

    return outliers

def _compute_fields_observation_and_create_outlier(self, non_outlier_values, metrics_aggregator_value,
                                                   ii, decision_frontier, metric_value):
    """
    Extract fields from the document, compute the different elements that will be placed in the
    observations, and create the outlier

    :param non_outlier_values: list of metric values that were not flagged as outliers
    :param metrics_aggregator_value: value of the metrics aggregator
    :param ii: index of the document that has been detected as an outlier
    :param decision_frontier: the value of the decision frontier
    :param metric_value: the metric value
    :return: the created outlier
    """
    observations = metrics_aggregator_value["observations"][ii]

    if non_outlier_values:
        non_outlier_values_sample = ",".join(
            random.sample(non_outlier_values, min(3, len(non_outlier_values))))
        observations["non_outlier_values_sample"] = non_outlier_values_sample
    else:
        observations["non_outlier_values_sample"] = []

    observations["metric"] = metric_value
    observations["decision_frontier"] = decision_frontier

    confidence = np.abs(decision_frontier - metric_value)
    observations["confidence"] = confidence

    # Extract fields from raw document
    fields = es.extract_fields_from_document(
        metrics_aggregator_value["raw_docs"][ii],
        extract_derived_fields=self.model_settings["use_derived_fields"])

    outlier = self.create_outlier(fields, metrics_aggregator_value["raw_docs"][ii],
                                  extra_outlier_information=observations)
    return outlier

def evaluate_model(model_name=None, model_settings=None):
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])

    total_events = es.count_documents(lucene_query=lucene_query)
    logging.print_analysis_intro(event_type="evaluating " + model_name, total_events=total_events)
    logging.init_ticker(total_steps=total_events, desc=model_name + " - evaluating simplequery model")

    outliers = list()
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        outlier_summary = replace_placeholder_string_with_fields(model_settings["outlier_summary"],
                                                                 fields)
        outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
        outlier = Outlier(type=model_settings["outlier_type"],
                          reason=model_settings["outlier_reason"],
                          summary=outlier_summary)

        if len(outlier_assets) > 0:
            outlier.add_observation("assets", outlier_assets)

        outliers.append(outlier)
        es.process_outliers(doc=doc, outliers=[outlier],
                            should_notify=model_settings["should_notify"])

    if len(outliers) > 0:
        unique_summaries = len(set(o.get_observation("summary") for o in outliers))
        logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" +
                            str(unique_summaries) + " unique]")

def train_model(self):
    train_data = list()

    self.total_events, documents = es.count_and_scan_documents(
        index=self.model_settings["es_index"], search_query=self.search_query,
        model_settings=self.model_settings)
    training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
    training_data_size = self.total_events / 100 * training_data_size_pct

    self.print_analysis_intro(event_type="training " + self.model_name,
                              total_events=self.total_events)
    total_training_events = int(min(training_data_size, self.total_events))

    logging.init_ticker(total_steps=total_training_events,
                        desc=self.model_name + " - preparing training set")
    if self.total_events > 0:
        for doc in documents:
            if len(train_data) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                train_data.append(fields)
            else:
                # We have collected sufficient training data
                break

    # Now, train the model
    if train_data:
        pass  # Train!!
    else:
        logging.logger.warning("no training data to train model on. Are you sure the model " +
                               "configuration is correctly defined?")

def evaluate_target(self, target, search_query, brute_force=False):
    self.total_events = es.count_documents(index=self.es_index, search_query=search_query)

    logging.print_analysis_intro(event_type="evaluating " + self.model_name,
                                 total_events=self.total_events)
    logging.init_ticker(total_steps=self.total_events,
                        desc=self.model_name + " - evaluating terms model")

    if brute_force:
        logging.logger.info("brute forcing field %s", str(target[0]))

    eval_terms_array = defaultdict()
    total_terms_added = 0
    outlier_batches_trend = 0

    for doc in es.scan(index=self.es_index, search_query=search_query):
        logging.tick()
        fields = es.extract_fields_from_document(
            doc, extract_derived_fields=self.model_settings["use_derived_fields"])

        try:
            target_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields,
                                                                           sentence_format=target)
            aggregator_sentences = helpers.utils.flatten_fields_into_sentences(
                fields=fields, sentence_format=self.model_settings["aggregator"])
            will_process_doc = True
        except (KeyError, TypeError):
            logging.logger.debug("Skipping event which does not contain the target and aggregator " +
                                 "fields we are processing. - [" + self.model_name + "]")
            will_process_doc = False

        if will_process_doc:
            observations = dict()

            if brute_force:
                observations["brute_forced_field"] = self.model_settings["brute_forced_field"]

            for target_sentence in target_sentences:
                flattened_target_sentence = helpers.utils.flatten_sentence(target_sentence)

                for aggregator_sentence in aggregator_sentences:
                    flattened_aggregator_sentence = helpers.utils.flatten_sentence(aggregator_sentence)
                    eval_terms_array = self.add_term_to_batch(eval_terms_array,
                                                              flattened_aggregator_sentence,
                                                              flattened_target_sentence,
                                                              observations, doc)

            total_terms_added += len(target_sentences)

        # Evaluate batch of events against the model
        last_batch = (logging.current_step == self.total_events)
        if last_batch or total_terms_added >= settings.config.getint("terms",
                                                                     "terms_batch_eval_size"):
            logging.logger.info("evaluating batch of " + "{:,}".format(total_terms_added) + " terms")
            outliers = self.evaluate_batch_for_outliers(terms=eval_terms_array)

            if len(outliers) > 0:
                unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                logging.logger.info("total outliers in batch processed: " + str(len(outliers)) +
                                    " [" + str(unique_summaries) + " unique summaries]")
                outlier_batches_trend += 1
            else:
                logging.logger.info("no outliers detected in batch")
                outlier_batches_trend -= 1

            if outlier_batches_trend == -3 and brute_force:
                logging.logger.info("too many batches without outliers, we are not going to " +
                                    "continue brute forcing")
                break

            if outlier_batches_trend == 3 and brute_force:
                logging.logger.info("too many batches with outliers, we are not going to " +
                                    "continue brute forcing")
                break

            # Reset data structures for next batch
            eval_terms_array = defaultdict()
            total_terms_added = 0

    self.print_analysis_summary()

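# Several snippets above build batches via add_term_to_batch and then index into
# terms[aggregator]["targets"], ["observations"] and ["raw_docs"]. The sketch below
# reconstructs that batch structure purely from how it is used; it is an assumption
# for illustration, not the project's actual helper.
from collections import defaultdict

def add_term_to_batch(eval_terms_array, aggregator_value, target_value, observations, doc):
    if aggregator_value not in eval_terms_array:
        eval_terms_array[aggregator_value] = {"targets": [], "observations": [], "raw_docs": []}
    eval_terms_array[aggregator_value]["targets"].append(target_value)
    eval_terms_array[aggregator_value]["observations"].append(observations)
    eval_terms_array[aggregator_value]["raw_docs"].append(doc)
    return eval_terms_array

batch = defaultdict()
batch = add_term_to_batch(batch, "smsc.exe", "A", dict(), {"_id": "doc1"})
batch = add_term_to_batch(batch, "smsc.exe", "B", dict(), {"_id": "doc2"})
print(batch["smsc.exe"]["targets"])  # ['A', 'B']
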
def evaluate_batch_for_outliers(self, terms=None):
    # Initialize
    outliers = list()

    # In case we want to count terms across different aggregators, we need to first iterate over
    # all aggregators and calculate the total number of unique terms for each aggregated value.
    # For example:
    # terms["smsc.exe"][A, B, C, D, D, E]
    # terms["abc.exe"][A, A, B]
    # is converted into:
    # unique_target_counts_across_aggregators: [5, 2]
    # (the first aggregator contains 5 unique terms, the second one contains 2)
    if self.model_settings["target_count_method"] == "across_aggregators":
        unique_target_counts_across_aggregators = list()

        # loop 0: {i=0, aggregator_value = "smsc.exe"}, loop 1: {i=1, aggregator_value = "abc.exe"}
        for i, aggregator_value in enumerate(terms):
            # unique_targets_in_aggregated_value = loop 0: {A, B, C, D, E}, loop 1: {A, B}
            # unique_target_counts_across_aggregators = loop 0: [5], loop 1: [5, 2]
            unique_targets_in_aggregated_value = set(terms[aggregator_value]["targets"])
            unique_target_counts_across_aggregators.append(len(unique_targets_in_aggregated_value))

        # Calculate the decision frontier
        # unique_target_counts_across_aggregators = [5, 2]
        decision_frontier = helpers.utils.get_decision_frontier(
            self.model_settings["trigger_method"],
            unique_target_counts_across_aggregators,
            self.model_settings["trigger_sensitivity"],
            self.model_settings["trigger_on"])
        logging.logger.debug("using " + self.model_settings["trigger_method"] +
                             " decision frontier " + str(decision_frontier) +
                             " across all aggregators")

        non_outlier_values = set()

        # loop 0: {i=0, aggregator_value = "smsc.exe"}, loop 1: {i=1, aggregator_value = "abc.exe"}
        for i, aggregator_value in enumerate(terms):
            unique_target_count_across_aggregators = unique_target_counts_across_aggregators[i]
            logging.logger.debug("unique target count for aggregator " + str(aggregator_value) +
                                 ": " + str(unique_target_count_across_aggregators) +
                                 " - decision frontier " + str(decision_frontier))

            is_outlier = helpers.utils.is_outlier(unique_target_count_across_aggregators,
                                                  decision_frontier,
                                                  self.model_settings["trigger_on"])

            if is_outlier:
                for ii, term_value in enumerate(terms[aggregator_value]["targets"]):
                    # list() because random.sample no longer accepts sets on Python 3.11+
                    non_outlier_values_sample = ",".join(
                        random.sample(list(non_outlier_values), min(3, len(non_outlier_values))))

                    observations = dict()
                    observations["non_outlier_values_sample"] = non_outlier_values_sample
                    observations["term_count"] = unique_target_count_across_aggregators
                    observations["aggregator"] = aggregator_value
                    observations["term"] = term_value
                    observations["decision_frontier"] = decision_frontier
                    observations["trigger_method"] = str(self.model_settings["trigger_method"])

                    calculated_observations = terms[observations["aggregator"]]["observations"][ii]
                    calculated_observations.update(observations)

                    raw_doc = terms[observations["aggregator"]]["raw_docs"][ii]
                    fields = es.extract_fields_from_document(
                        raw_doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                    outliers.append(self.process_outlier(
                        fields, raw_doc, extra_outlier_information=calculated_observations))
            else:
                for ii, term_value in enumerate(terms[aggregator_value]["targets"]):
                    non_outlier_values.add(term_value)

    # In case we want to count terms within an aggregator, it's a bit easier.
    # For example:
    # terms["smsc.exe"][A, B, C, D, D, E]
    # terms["abc.exe"][A, A, B]
    # is converted into:
    # First iteration: "smsc.exe" -> counted_target_values: {A: 1, B: 1, C: 1, D: 2, E: 1}
    # For each aggregator, we iterate over all terms within it:
    # term_value_count for a document with term "A" then becomes "1" in the example above.
    # We then flag an outlier if that "1" is an outlier in the array [1, 1, 1, 2, 1].
    if self.model_settings["target_count_method"] == "within_aggregator":
        for i, aggregator_value in enumerate(terms):
            # Count how often each target value occurs
            counted_targets = Counter(terms[aggregator_value]["targets"])
            counted_target_values = list(counted_targets.values())

            logging.logger.debug("terms count for aggregator value " + aggregator_value + " -> " +
                                 str(counted_targets))
            decision_frontier = helpers.utils.get_decision_frontier(
                self.model_settings["trigger_method"],
                counted_target_values,
                self.model_settings["trigger_sensitivity"],
                self.model_settings["trigger_on"])
            logging.logger.debug("using " + self.model_settings["trigger_method"] +
                                 " decision frontier " + str(decision_frontier) +
                                 " for aggregator " + str(aggregator_value))

            non_outlier_values = set()
            for ii, term_value in enumerate(terms[aggregator_value]["targets"]):
                term_value_count = counted_targets[term_value]
                is_outlier = helpers.utils.is_outlier(term_value_count, decision_frontier,
                                                      self.model_settings["trigger_on"])

                if is_outlier:
                    non_outlier_values_sample = ",".join(
                        random.sample(list(non_outlier_values), min(3, len(non_outlier_values))))

                    observations = dict()
                    observations["non_outlier_values_sample"] = non_outlier_values_sample
                    observations["term_count"] = term_value_count
                    observations["aggregator"] = aggregator_value
                    observations["term"] = term_value
                    observations["decision_frontier"] = decision_frontier
                    observations["trigger_method"] = str(self.model_settings["trigger_method"])

                    calculated_observations = terms[observations["aggregator"]]["observations"][ii]
                    calculated_observations.update(observations)

                    raw_doc = terms[observations["aggregator"]]["raw_docs"][ii]
                    fields = es.extract_fields_from_document(
                        raw_doc, extract_derived_fields=self.model_settings["use_derived_fields"])
                    outliers.append(self.process_outlier(
                        fields, raw_doc, extra_outlier_information=calculated_observations))
                else:
                    non_outlier_values.add(term_value)

    return outliers

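# A runnable version of the "within_aggregator" worked example from the comments above,
# using only the standard library:
from collections import Counter

targets = ["A", "B", "C", "D", "D", "E"]  # terms["smsc.exe"]["targets"]
counted_targets = Counter(targets)
print(counted_targets)  # Counter({'D': 2, 'A': 1, 'B': 1, 'C': 1, 'E': 1})

# Each document's term_value_count is the count of its own term; the model then checks
# whether that count is an outlier within the count array [1, 1, 1, 2, 1].
print([counted_targets[t] for t in targets])  # [1, 1, 1, 2, 2, 1]
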