Exemplo n.º 1
0
    def train_model(self):
        w2v_model = word2vec.Word2Vec(name=self.model_name)
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

        sentences = list()

        self.total_events = es.count_documents(search_query=search_query)
        training_data_size_pct = settings.config.getint("machine_learning", "training_data_size_pct")
        training_data_size = self.total_events / 100 * training_data_size_pct

        logging.print_analysis_intro(event_type="training " + self.model_name, total_events=self.total_events)
        total_training_events = int(min(training_data_size, self.total_events))

        logging.init_ticker(total_steps=total_training_events, desc=self.model_name + " - preparing word2vec training set")
        for doc in es.scan(search_query=search_query):
            if len(sentences) < total_training_events:
                logging.tick()
                fields = es.extract_fields_from_document(doc)
                if set(self.model_settings["sentence_format"]).issubset(fields.keys()):
                    new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                    for sentence in new_sentences:
                        sentences.append(tuple(sentence))

                    # Remove all duplicates from sentences for training - REMOVED FOR TESTING
                    # sentences = list(sentences)
            else:
                # We have collected sufficient training data
                break

        # Now, train the model
        if len(sentences) > 0:
            w2v_model.train_model(sentences)
        else:
            logging.logger.warning("no sentences to train model on. Are you sure the sentence configuration is correctly defined?")
Exemplo n.º 2
0
    def evaluate_model(self):
        self.extract_extra_model_settings()

        # Train the model
        if self.model_settings["train_model"]:
            self.train_model()
            return

        w2v_model = word2vec.Word2Vec(name=self.model_name)
        search_query = es.filter_by_query_string(self.model_settings["es_query_filter"])

        if not w2v_model.is_trained():
            logging.logger.warning("model was not trained! Skipping analysis.")
        else:
            # Check if we need to run the test data instead of real data
            if w2v_model.use_test_data:
                logging.print_generic_intro("using test data instead of live data to evaluate model " + self.model_name)
                self.evaluate_test_sentences(w2v_model=w2v_model)
                return

            self.total_events = es.count_documents(search_query=search_query)
            logging.print_analysis_intro(event_type="evaluating " + self.model_name, total_events=self.total_events)

            logging.init_ticker(total_steps=self.total_events, desc=self.model_name + " - evaluating word2vec model")

            raw_docs = list()
            eval_sentences = list()

            for doc in es.scan(search_query=search_query):
                logging.tick()
                fields = es.extract_fields_from_document(doc)

                try:
                    new_sentences = helpers.utils.flatten_fields_into_sentences(fields=fields, sentence_format=self.model_settings["sentence_format"])
                    eval_sentences.extend(new_sentences)
                except KeyError:
                    logging.logger.debug("skipping event which does not contain the target and aggregator fields we are processing. - [" + self.model_name + "]")
                    continue

                for _ in new_sentences:
                    raw_docs.append(doc)

                # Evaluate batch of events against the model
                if logging.current_step == self.total_events or len(eval_sentences) >= settings.config.getint("machine_learning", "word2vec_batch_eval_size"):
                    logging.logger.info("evaluating batch of " + str(len(eval_sentences)) + " sentences")
                    outliers = self.evaluate_batch_for_outliers(w2v_model=w2v_model, eval_sentences=eval_sentences, raw_docs=raw_docs)

                    if len(outliers) > 0:
                        unique_summaries = len(set(o.outlier_dict["summary"] for o in outliers))
                        logging.logger.info("total outliers in batch processed: " + str(len(outliers)) + " [" + str(unique_summaries) + " unique summaries]")

                    # Reset data structures for next batch
                    raw_docs = list()
                    eval_sentences = list()