示例#1
0
    def for_topics(cls, topics_as_topn_terms, **kwargs):
        """Build a CoherenceModel whose probabilities are estimated over every given topic.

        Args:
            topics_as_topn_terms (list of lists): One entry per model; each entry is that
                model's list of topics, where a topic is a list of its top-N words.
        """
        # Guard against a completely empty input and against any model with no topics.
        if not topics_as_topn_terms:
            raise ValueError("len(topics) must be > 0.")
        if any(len(model_topics) == 0 for model_topics in topics_as_topn_terms):
            raise ValueError("found empty topic listing in `topics`")

        # Largest top-N across every topic of every model.
        topn = max(
            (len(topic) for model_topics in topics_as_topn_terms for topic in model_topics),
            default=0)

        # An explicitly requested `topn` is honored only up to the observed maximum.
        topn = min(kwargs.pop('topn', topn), topn)
        super_topic = utils.flatten(topics_as_topn_terms)

        logging.info(
            "Number of relevant terms for all %d models: %d",
            len(topics_as_topn_terms), len(super_topic))
        # Estimate over the union of all terms, then restore the effective topn.
        model = CoherenceModel(topics=[super_topic], topn=len(super_topic), **kwargs)
        model.estimate_probabilities()
        model.topn = topn
        return model
示例#2
0
    def for_topics(cls, topics_as_topn_terms, **kwargs):
        """Create a CoherenceModel with probabilities estimated for all supplied topics.

        Args:
            topics_as_topn_terms (list of lists): Each top-level element holds the topics
                of one model; every topic is itself a list of its top-N words.
        """
        # Reject an empty collection of models up front.
        if not topics_as_topn_terms:
            raise ValueError("len(topics) must be > 0.")
        # Every model must contribute at least one topic.
        if any(len(per_model) == 0 for per_model in topics_as_topn_terms):
            raise ValueError("found empty topic listing in `topics`")

        # Find the widest topic (maximum number of top words) across all models.
        widest = 0
        for per_model in topics_as_topn_terms:
            widest = max([widest] + [len(topic) for topic in per_model])

        # A caller-supplied `topn` may shrink, but never exceed, the observed width.
        topn = min(kwargs.pop('topn', widest), widest)
        super_topic = utils.flatten(topics_as_topn_terms)

        logging.info(
            "Number of relevant terms for all %d models: %d",
            len(topics_as_topn_terms), len(super_topic))
        # Probabilities are estimated over the flattened union of terms; the
        # effective topn is re-applied afterwards.
        cm = CoherenceModel(topics=[super_topic], topn=len(super_topic), **kwargs)
        cm.estimate_probabilities()
        cm.topn = topn
        return cm
 def not_in_vocab(self, words):
     """Return the set of unique words from *words* missing from the model vocabulary."""
     vocab = self.model.vocab
     return {word for word in set(utils.flatten(words)) if word not in vocab}
示例#4
0
 def test_flatten_not_nested(self):
     """An already-flat list comes back from utils.flatten unchanged."""
     flat = [1, 2, 3, 4, 5, 6]
     self.assertEqual(utils.flatten(flat), [1, 2, 3, 4, 5, 6])
示例#5
0
 def test_flatten_nested(self):
     """Arbitrarily nested lists are flattened into a single flat list."""
     deep = [[[1, 2, 3], [4, 5]], 6]
     self.assertEqual(utils.flatten(deep), [1, 2, 3, 4, 5, 6])
示例#6
0
 def test_flatten_not_nested(self):
     """utils.flatten is a no-op on a list that has no nesting."""
     values = [1, 2, 3, 4, 5, 6]
     result = utils.flatten(values)
     self.assertEqual(result, [1, 2, 3, 4, 5, 6])
示例#7
0
 def test_flatten_nested(self):
     """Mixed-depth nesting collapses to one flat list of leaves."""
     structure = [[[1, 2, 3], [4, 5]], 6]
     result = utils.flatten(structure)
     self.assertEqual(result, [1, 2, 3, 4, 5, 6])
示例#8
0
 def not_in_vocab(self, words):
     """Collect the distinct words in *words* that the model vocabulary lacks."""
     missing = set()
     for word in set(utils.flatten(words)):
         if word not in self.model.vocab:
             missing.add(word)
     return missing