예제 #1
0
def process_outlier(decision_frontier, non_outlier_values_sample, term_value_count, terms, aggregator_value, ii, term_value, model_settings):
    """Build an Outlier for a single evaluated term and hand it off to Elasticsearch.

    Enriches the stored observations for the given aggregator/term index with the
    decision-frontier context, renders the configured summary template, attaches
    asset information when present, and notifies via es.process_outliers.

    Returns the created Outlier.
    """
    raw_doc = terms[aggregator_value]["raw_docs"][ii]
    doc_fields = es.extract_fields_from_document(raw_doc)

    # Mutate the stored observations dict in place, as downstream code reads it.
    obs = terms[aggregator_value]["observations"][ii]
    obs.update({
        "non_outlier_values_sample": non_outlier_values_sample,
        "aggregator": aggregator_value,
        "term": term_value,
        "term_count": term_value_count,
        "decision_frontier": decision_frontier,
        "trigger_method": str(model_settings["trigger_method"]),
        # Confidence is the distance between the observed count and the frontier.
        "confidence": np.abs(decision_frontier - term_value_count),
    })

    # The summary template may reference both document fields and observations.
    merged = helpers.utils.merge_two_dicts(doc_fields, obs)
    summary = helpers.utils.replace_placeholder_fields_with_values(model_settings["outlier_summary"], merged)

    # Asset information is only recorded when something was actually extracted.
    assets = helpers.utils.extract_outlier_asset_information(doc_fields, settings)
    if assets:
        obs["assets"] = assets

    outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=summary)
    for key, value in obs.items():
        outlier.add_observation(key, value)

    es.process_outliers(doc=raw_doc, outliers=[outlier], should_notify=model_settings["should_notify"])
    return outlier
예제 #2
0
    def test_add_outlier_to_doc(self):
        """Adding a single outlier to a document should yield the expected fixture."""
        test_outlier = Outlier(type="dummy type",
                               reason="dummy reason",
                               summary="dummy summary")
        test_outlier.add_observation(field_name="observation",
                                     field_value="dummy observation")

        # Work on a copy so the shared module-level fixture is never mutated,
        # mirroring the deepcopy approach used by test_add_two_outliers_to_doc.
        doc = copy.deepcopy(doc_without_outlier_test_file)
        doc_with_outlier = helpers.es.add_outlier_to_document(doc, test_outlier)
        self.assertDictEqual(doc_with_outlier_test_file, doc_with_outlier)
예제 #3
0
def evaluate_batch_for_outliers(w2v_model=None,
                                eval_sentences=None,
                                raw_docs=None,
                                model_settings=None):
    """Score a batch of sentences with a word2vec model and flag outliers.

    sentence_probs[i] is the model probability of eval_sentences[i]; sentences
    whose probability is an outlier (per MAD) are turned into Outlier objects,
    processed through Elasticsearch, and returned as a list.
    """
    # Initialize
    outliers = list()

    # sentence_probs contains, per sentence, the probability assigned by the
    # model, in the context of the other words of that sentence.
    sentence_probs = w2v_model.evaluate_sentences(eval_sentences)

    # The probability population is the same for every sentence in the batch,
    # so build it once instead of once per loop iteration.
    unique_probs = list(set(sentence_probs))

    for i, single_sentence_prob in enumerate(sentence_probs):
        # NaN means the sentence could not be evaluated, and we can't reason
        # about it. This happens for example whenever the sentence is made up
        # entirely of words that aren't known to the trained model.
        # Use np.isnan rather than an identity check: a computed NaN is not
        # the same object as the np.nan singleton, so "is np.nan" misses it.
        if np.isnan(single_sentence_prob):
            continue

        # NOTE: alternative triggers exist (is_outlier_cutoff_percentage,
        # is_outlier_std); MAD is the one in use here.
        if is_outlier_mad(single_sentence_prob, unique_probs, model_settings):
            outlier_summary = model_settings["outlier_summary"]

            # Extract fields from raw document
            fields = es.extract_fields_from_document(raw_docs[i])
            outlier_summary = replace_placeholder_string_with_fields(
                outlier_summary, fields)

            outlier = Outlier(type=model_settings["outlier_type"],
                              reason=model_settings["outlier_reason"],
                              summary=outlier_summary)
            outlier.add_observation("probability", str(single_sentence_prob))

            outliers.append(outlier)
            es.process_outliers(doc=raw_docs[i],
                                outliers=[outlier],
                                should_notify=model_settings["should_notify"])
        else:
            if w2v_model.use_test_data:
                logging.logger.info("Not an outlier: " +
                                    str(eval_sentences[i]) + " - " +
                                    str(single_sentence_prob))
    return outliers
예제 #4
0
    def test_add_two_outliers_to_doc(self):
        """Appending two outliers in sequence should match the two-outlier fixture."""
        first_outlier = Outlier(type="dummy type",
                                reason="dummy reason",
                                summary="dummy summary")
        first_outlier.add_observation(field_name="observation",
                                      field_value="dummy observation")

        second_outlier = Outlier(type="dummy type 2",
                                 reason="dummy reason 2",
                                 summary="dummy summary 2")
        second_outlier.add_observation(field_name="observation_2",
                                       field_value="dummy observation 2")

        # Start from a private copy so the shared fixture stays untouched,
        # then attach both outliers one after the other.
        doc = copy.deepcopy(doc_without_outlier_test_file)
        for outlier in (first_outlier, second_outlier):
            doc = helpers.es.add_outlier_to_document(doc, outlier)

        self.assertDictEqual(doc, doc_with_two_outliers_test_file)
예제 #5
0
def evaluate_batch_for_outliers(metrics=None, model_settings=None, last_batch=False):
    """Evaluate batched metric aggregations for outliers.

    For each aggregator with enough data (>= 100 metric values, or any amount
    on the last batch), a decision frontier is computed and every metric value
    beyond it is turned into an Outlier and processed through Elasticsearch.

    Returns a tuple (outliers, remaining_metrics) where remaining_metrics holds
    the aggregators withheld for lack of data, to be retried with a later batch.
    """
    # Initialize
    outliers = list()
    remaining_metrics = metrics.copy()

    for aggregator_value in metrics:
        # Require sufficient data before evaluating; the final batch must
        # process everything that is left regardless of volume.
        if len(metrics[aggregator_value]["metrics"]) < 100 and last_batch is False:
            continue

        # This aggregator is handled now, so it is no longer "remaining".
        del remaining_metrics[aggregator_value]

        # Calculate the decision frontier
        decision_frontier = helpers.utils.get_decision_frontier(model_settings["trigger_method"], metrics[aggregator_value]["metrics"], model_settings["trigger_sensitivity"], model_settings["trigger_on"])
        logging.logger.debug("using decision frontier " + str(decision_frontier) + " for aggregator " + str(aggregator_value) + " - " + model_settings["metric"])
        logging.logger.debug("example metric from batch for " + metrics[aggregator_value]["observations"][0]["target"] + ": " + str(metrics[aggregator_value]["metrics"][0]))

        # Calculate all outliers in array
        for ii, metric_value in enumerate(metrics[aggregator_value]["metrics"]):
            is_outlier = helpers.utils.is_outlier(metric_value, decision_frontier, model_settings["trigger_on"])

            if is_outlier:
                # Confidence is the distance between value and frontier.
                confidence = np.abs(decision_frontier - metric_value)

                # Extract fields from raw document
                fields = es.extract_fields_from_document(metrics[aggregator_value]["raw_docs"][ii])

                # The summary template may reference both document fields
                # and the stored observations.
                observations = metrics[aggregator_value]["observations"][ii]
                merged_fields_and_observations = helpers.utils.merge_two_dicts(fields, observations)
                outlier_summary = helpers.utils.replace_placeholder_fields_with_values(model_settings["outlier_summary"], merged_fields_and_observations)

                outlier_assets = helpers.utils.extract_outlier_asset_information(fields, settings)
                if len(outlier_assets) > 0:
                    observations["assets"] = outlier_assets

                outlier = Outlier(type=model_settings["outlier_type"], reason=model_settings["outlier_reason"], summary=outlier_summary)

                outlier.add_observation("metric", metric_value)
                outlier.add_observation("decision_frontier", decision_frontier)
                outlier.add_observation("confidence", confidence)

                for k, v in observations.items():
                    outlier.add_observation(k, v)

                outliers.append(outlier)
                es.process_outliers(doc=metrics[aggregator_value]["raw_docs"][ii], outliers=[outlier], should_notify=model_settings["should_notify"])

    return outliers, remaining_metrics
예제 #6
0
def evaluate_model(model_name=None, model_settings=None):
    """Evaluate the simplequery model: every document matching the configured
    Elasticsearch query filter is flagged as an outlier.

    model_name: model name, used only for logging/progress output.
    model_settings: dict read here for es_query_filter, outlier_summary,
        outlier_type, outlier_reason and should_notify.
    """
    lucene_query = es.filter_by_query_string(model_settings["es_query_filter"])
    total_events = es.count_documents(lucene_query=lucene_query)

    logging.print_analysis_intro(event_type="evaluating " + model_name,
                                 total_events=total_events)
    logging.init_ticker(total_steps=total_events,
                        desc=model_name + " - evaluating simplequery model")

    outliers = list()
    for doc in es.scan(lucene_query=lucene_query):
        logging.tick()
        fields = es.extract_fields_from_document(doc)

        # Render the summary template from the document's own fields.
        outlier_summary = replace_placeholder_string_with_fields(
            model_settings["outlier_summary"], fields)
        outlier_assets = helpers.utils.extract_outlier_asset_information(
            fields, settings)
        outlier = Outlier(type=model_settings["outlier_type"],
                          reason=model_settings["outlier_reason"],
                          summary=outlier_summary)

        # Asset information is only recorded when something was extracted.
        if len(outlier_assets) > 0:
            outlier.add_observation("assets", outlier_assets)

        outliers.append(outlier)

        es.process_outliers(doc=doc,
                            outliers=[outlier],
                            should_notify=model_settings["should_notify"])

    # Report how many outliers were found and how many distinct summaries
    # they collapse to, for batch-level visibility.
    if len(outliers) > 0:
        unique_summaries = len(
            set(o.get_observation("summary") for o in outliers))
        logging.logger.info("total outliers in batch processed: " +
                            str(len(outliers)) + " [" + str(unique_summaries) +
                            " unique]")