Example #1
0
    def evaluate_model(self):
        """Evaluate the terms model against all events matching the search query.

        Events are grouped into batches of roughly ``terms_batch_eval_size``
        (target, aggregator) combinations; each batch is evaluated for
        outliers, and whatever ``_evaluate_batch_for_outliers`` returns as
        leftover targets is carried into the next batch.
        """
        self.total_events, documents = es.count_and_scan_documents(index=self.model_settings["es_index"],
                                                                   search_query=self.search_query,
                                                                   model_settings=self.model_settings)

        self.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)
        logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating terms model")

        if self.total_events > 0:
            # NOTE: defaultdict() without a factory behaves like a plain dict;
            # missing keys still raise KeyError
            current_batch = defaultdict()
            targets_for_next_batch = defaultdict()
            total_targets_in_batch = 0

            for doc in documents:
                logging.tick()
                target_sentences, aggregator_sentences = self._compute_aggregator_and_target_value(
                    doc, self.model_settings["target"])

                # Skip events where either the target or aggregator could not be extracted
                if target_sentences is not None and aggregator_sentences is not None:
                    # Add current document to current_batch
                    current_batch = self._add_document_to_batch(current_batch, target_sentences,
                                                                aggregator_sentences, doc)

                    # Each (target sentence, aggregator sentence) pair counts as one term
                    total_targets_in_batch += len(target_sentences) * len(aggregator_sentences)

                # Evaluate batch of events against the model
                is_last_batch = (logging.current_step == self.total_events)  # Check if it is the last batch
                # Run if it is the last batch OR if the batch size is large enough

                if is_last_batch or total_targets_in_batch >= self.terms_batch_eval_size:

                    # Display log message
                    log_message = "evaluating batch of " + "{:,}".format(total_targets_in_batch) + " terms "
                    if len(targets_for_next_batch) > 0:
                        log_message += "(+ " + "{:,}".format(len(targets_for_next_batch)) + " terms from last batch) "
                    log_message += "[" + "{:,}".format(logging.current_step) + " events processed]"
                    logging.logger.info(log_message)

                    # evaluate the current batch; targets the evaluator defers
                    # are returned and re-queued below
                    outliers_in_batch, targets_for_next_batch = self._evaluate_batch_for_outliers(batch=current_batch)

                    if outliers_in_batch:
                        unique_summaries_in_batch = len(set(o.outlier_dict["summary"] for o in outliers_in_batch))
                        logging.logger.info("processing " + "{:,}".format(len(outliers_in_batch)) +
                                            " outliers in batch [" + "{:,}".format(unique_summaries_in_batch) +
                                            " unique summaries]")

                        for outlier in outliers_in_batch:
                            self.process_outlier(outlier)

                    else:
                        logging.logger.info("no outliers processed in batch")

                    # Reset data structures for next batch, seeding it with the
                    # deferred targets from this batch
                    current_batch = targets_for_next_batch
                    total_targets_in_batch = 0

        self.print_analysis_summary()
Example #2
0
    def evaluate_model(self):
        """Flag every event matching the search query as a simplequery outlier.

        Events already tagged as an outlier by this exact model (same model
        name and model type) are excluded from the scan so they are not
        processed twice.
        """
        # Matches events that already carry an outlier from this model
        model_filter = {
            "bool": {
                "filter": [{
                    "term": {
                        "outliers.model_name.keyword": {
                            "value": self.model_name
                        }
                    }
                }, {
                    "term": {
                        "outliers.model_type.keyword": {
                            "value": "simplequery"
                        }
                    }
                }]
            }
        }

        exclude_hits_filter = {"bool": {"must_not": model_filter}}

        # BUG FIX: work on copies of the query dict and its filter list. The
        # original code did `query = self.search_query` and then appended to
        # query["filter"] in place, mutating the shared search query and
        # stacking a duplicate exclusion filter on every call.
        query = dict(self.search_query)
        if "filter" in query:
            query["filter"] = list(query["filter"]) + [exclude_hits_filter]
        else:
            query["filter"] = [exclude_hits_filter]

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=query,
            model_settings=self.model_settings)
        self.print_analysis_intro(event_type="evaluating " + self.model_type +
                                  "_" + self.model_name,
                                  total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating " +
                            self.model_type + " model")
        if self.total_events > 0:
            for doc in documents:
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc,
                    extract_derived_fields=self.model_settings["use_derived_fields"])
                # Every matching event is an outlier by definition for this model
                outlier = self.create_outlier(fields, doc)
                self.process_outlier(outlier)

        self.print_analysis_summary()
Example #3
0
    def train_model(self):
        """Gather training sentences from Elasticsearch and train word2vec.

        Only ``training_data_size_pct`` percent of the matching events are
        used; events missing any field of the configured sentence format are
        skipped.
        """
        w2v_model = word2vec.Word2Vec(name=self.model_name)

        training_sentences = []

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=self.search_query,
            model_settings=self.model_settings)
        # Only a configurable percentage of all matching events is used for training
        pct = settings.config.getint("machine_learning",
                                     "training_data_size_pct")
        desired_training_size = self.total_events / 100 * pct

        self.print_analysis_intro(event_type="training " + self.model_name,
                                  total_events=self.total_events)
        total_training_events = int(min(desired_training_size,
                                        self.total_events))

        logging.init_ticker(total_steps=total_training_events,
                            desc=self.model_name +
                            " - preparing word2vec training set")
        if self.total_events > 0:
            required_fields = set(self.model_settings["sentence_format"])
            for doc in documents:
                # Stop as soon as sufficient training data has been collected
                if len(training_sentences) >= total_training_events:
                    break
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc,
                    extract_derived_fields=self.model_settings["use_derived_fields"])
                if not required_fields.issubset(fields.keys()):
                    continue
                flattened = helpers.utils.flatten_fields_into_sentences(
                    fields=fields,
                    sentence_format=self.model_settings["sentence_format"])
                training_sentences.extend(tuple(sentence)
                                          for sentence in flattened)
                # NOTE: duplicate sentences are intentionally kept for now
                # (the dedup step was removed for testing)

        # Now, train the model
        if training_sentences:
            w2v_model.train_model(training_sentences)
        else:
            logging.logger.warning(
                "no sentences to train model on. Are you sure the sentence configuration is "
                + "correctly defined?")
Example #4
0
    def train_model(self):
        """Collect a training set of extracted event fields for this model.

        Fetches events matching the search query and keeps the extracted
        fields of up to ``training_data_size_pct`` percent of all events as
        the training set. Actual training is not implemented yet.
        """
        train_data = list()

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=self.search_query,
            model_settings=self.model_settings)
        # Only a configurable percentage of the matching events is used
        training_data_size_pct = settings.config.getint(
            "machine_learning", "training_data_size_pct")
        training_data_size = self.total_events / 100 * training_data_size_pct

        self.print_analysis_intro(event_type="training " + self.model_name,
                                  total_events=self.total_events)
        total_training_events = int(min(training_data_size, self.total_events))

        logging.init_ticker(total_steps=total_training_events,
                            desc=self.model_name + " - preparing training set")
        if self.total_events > 0:
            for doc in documents:
                if len(train_data) < total_training_events:
                    logging.tick()
                    fields = es.extract_fields_from_document(
                        doc,
                        extract_derived_fields=self.model_settings["use_derived_fields"])
                    train_data.append(fields)
                else:
                    # We have collected sufficient training data
                    break

        # Now, train the model
        if train_data:
            pass  # TODO: actual training is not implemented yet
        else:
            # BUG FIX: the warning was copy-pasted from the sentence-based
            # model and referred to "sentences" / "sentence configuration",
            # which do not apply to this field-based training set.
            logging.logger.warning(
                "no training data to train model on. Are you sure the model "
                + "configuration is correctly defined?")
Example #5
0
    def evaluate_model(self):
        """Evaluate events against the trained word2vec model in batches.

        When ``train_model`` is set, only training is performed. Otherwise
        matching events are flattened into sentences and evaluated in batches
        of ``word2vec_batch_eval_size`` sentences. If the model carries test
        data, that is evaluated instead of live data.
        """
        # Train the model
        if self.model_settings["train_model"]:
            self.train_model()
            return

        w2v_model = word2vec.Word2Vec(name=self.model_name)
        search_query = self.search_query

        if not w2v_model.is_trained():
            logging.logger.warning("model was not trained! Skipping analysis.")
            return

        # Check if we need to run the test data instead of real data
        if w2v_model.use_test_data:
            logging.print_generic_intro(
                "using test data instead of live data to evaluate model " +
                self.model_name)
            self.evaluate_test_sentences(w2v_model=w2v_model)
            return

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=search_query,
            model_settings=self.model_settings)
        self.print_analysis_intro(event_type="evaluating " +
                                  self.model_name,
                                  total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name +
                            " - evaluating word2vec model")

        if self.total_events > 0:
            raw_docs = list()
            eval_sentences = list()
            # Hoisted out of the loop: the batch size is a constant setting
            batch_eval_size = settings.config.getint(
                "machine_learning", "word2vec_batch_eval_size")

            for doc in documents:
                logging.tick()
                fields = es.extract_fields_from_document(
                    doc,
                    extract_derived_fields=self.model_settings["use_derived_fields"])

                try:
                    new_sentences = helpers.utils.flatten_fields_into_sentences(
                        fields=fields,
                        sentence_format=self.model_settings["sentence_format"])
                except KeyError:
                    logging.logger.debug(
                        "skipping event which does not contain the target and aggregator fields "
                        + "we are processing. - [" + self.model_name + "]")
                else:
                    eval_sentences.extend(new_sentences)
                    # Keep one raw document per generated sentence so both
                    # lists stay index-aligned
                    raw_docs.extend([doc] * len(new_sentences))

                # BUG FIX: this check previously sat after a `continue`, so a
                # skipped final event jumped past it and the remaining
                # buffered sentences were silently dropped. It now runs for
                # every event, skipped or not.
                if logging.current_step == self.total_events or \
                        len(eval_sentences) >= batch_eval_size:
                    if eval_sentences:
                        logging.logger.info("evaluating batch of " +
                                            str(len(eval_sentences)) +
                                            " sentences")
                        outliers = self.evaluate_batch_for_outliers(
                            w2v_model=w2v_model,
                            eval_sentences=eval_sentences,
                            raw_docs=raw_docs)

                        if len(outliers) > 0:
                            unique_summaries = len(
                                set(o.outlier_dict["summary"]
                                    for o in outliers))
                            logging.logger.info(
                                "total outliers in batch processed: " +
                                "{:,}".format(len(outliers)) + " [" +
                                "{:,}".format(unique_summaries) +
                                " unique summaries]")
                        # NOTE(review): outliers are only logged here, never
                        # passed to process_outlier() — confirm the batch
                        # evaluator persists them itself.

                    # Reset data structures for next batch
                    raw_docs = list()
                    eval_sentences = list()
Example #6
0
    def evaluate_model(self):
        """Evaluate the metrics model over all matching events, in batches.

        Metrics are accumulated until roughly ``metrics_batch_eval_size`` of
        them are buffered (or the event stream ends), then evaluated for
        outliers; metrics the evaluator defers are carried into the next
        batch.
        """
        current_batch = defaultdict()  # Metrics gathered for the batch being built
        carryover_metrics = defaultdict()  # Metrics deferred to the next batch
        num_metrics_in_batch = 0

        self.total_events, documents = es.count_and_scan_documents(
            index=self.model_settings["es_index"],
            search_query=self.search_query,
            model_settings=self.model_settings)

        self.print_analysis_intro(event_type="evaluating " + self.model_name,
                                  total_events=self.total_events)

        logging.init_ticker(total_steps=self.total_events,
                            desc=self.model_name + " - evaluating " +
                            self.model_type + " model")

        if self.total_events > 0:
            for doc in documents:
                logging.tick()

                # Extract target and aggregator values for this event
                target_value, aggregator_sentences = \
                    self._compute_aggregator_and_target_value(doc)

                # Only buffer the event when both values could be extracted
                if target_value is not None and aggregator_sentences is not None:
                    current_batch, metric_added = self._add_document_to_batch(
                        doc, current_batch, target_value, aggregator_sentences)

                    # Metrics have exactly one target field (unlike terms), so
                    # the number of metrics added equals the number of
                    # aggregator sentences processed for this document
                    if metric_added:
                        num_metrics_in_batch += len(aggregator_sentences)

                # Evaluate when this is the last event OR the batch is full
                is_last_batch = logging.current_step == self.total_events
                if is_last_batch or num_metrics_in_batch >= self.metrics_batch_eval_size:
                    log_message = f"evaluating batch of {num_metrics_in_batch:,} metrics "
                    if carryover_metrics:
                        log_message += f"(+ {len(carryover_metrics):,} metrics from last batch) "
                    log_message += f"[{logging.current_step:,} events processed]"
                    logging.logger.info(log_message)

                    outliers_in_batch, carryover_metrics = \
                        self._evaluate_batch_for_outliers(batch=current_batch,
                                                          is_last_batch=is_last_batch)

                    # For each result, save it in batch and in ES
                    if outliers_in_batch:
                        unique_summaries = len({o.outlier_dict["summary"]
                                                for o in outliers_in_batch})
                        logging.logger.info(
                            f"processing {len(outliers_in_batch):,}"
                            f" outliers in batch ["
                            f"{unique_summaries:,}"
                            f" unique summaries]")

                        for outlier in outliers_in_batch:
                            self.process_outlier(outlier)
                    else:
                        logging.logger.info("no outliers detected in batch")

                    # Reset data structures for next batch, seeding with the
                    # deferred metrics
                    current_batch = carryover_metrics
                    num_metrics_in_batch = 0

        self.print_analysis_summary()