예제 #1
0
class Document(object):
    """A document stored as a list of topic-assigned word occurrences.

    Besides the words themselves the document keeps a per-document topic
    histogram, N(z|d), which is rebuilt whenever the document is parsed.
    """

    def __init__(self, num_topics):
        """Create an empty, not-yet-parsed document.

        Args:
            num_topics: number of topics; handed to the topic histogram and
                used to bound the randomly drawn initial topics.
        """
        self.num_topics = num_topics
        # Word occurrences of the document, item fmt: Word<id, topic>.
        # Stays None until one of the parse_* methods has been called.
        self.words = None
        self.doc_topic_hist = None  # N(z|d)

    def parse_from_tokens(self, doc_tokens, rand, vocabulary, model=None):
        """Parse the text document from tokens.

        Only tokens that are in the vocabulary and, when a model is given,
        known to the model are kept. Every kept token is assigned a random
        initial topic, which is also recorded in the topic histogram.

        Args:
            doc_tokens: iterable of tokens making up the document.
            rand: random source providing randint(low, high).
                NOTE(review): assumes random.Random semantics, i.e. both
                bounds inclusive -- confirm against the caller.
            vocabulary: object whose word_index(token) returns the word id,
                or -1 for a token that should be skipped.
            model: optional object providing has_word(word_index); when
                None, no model filtering is applied.
        """
        self.words = []
        self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

        for token in doc_tokens:
            word_index = vocabulary.word_index(token)
            if word_index == -1:
                continue
            # Identity check: a model with a custom __eq__ must not be
            # mistaken for "no model".
            if model is not None and not model.has_word(word_index):
                continue
            # Initialize a random topic for the current word.
            topic = rand.randint(0, self.num_topics - 1)
            self.words.append(Word(word_index, topic))
            self.doc_topic_hist.increase_topic(topic, 1)

    def serialize_to_string(self):
        """Serialize the document to a DocumentPB string.

        Returns:
            The result of DocumentPB.SerializeToString() for a message
            holding one entry (id, topic) per word, in document order.
        """
        document_pb = DocumentPB()
        for word in self.words:
            word_pb = document_pb.words.add()
            word_pb.id = word.id
            word_pb.topic = word.topic
        return document_pb.SerializeToString()

    def parse_from_string(self, document_str):
        """Parse the document from a DocumentPB serialized string.

        Replaces the current words and rebuilds the topic histogram from
        the topics stored in the message. The parsed message is kept on the
        instance as self.document_pb.

        Args:
            document_str: string produced by DocumentPB serialization.
        """
        self.words = []
        self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

        self.document_pb = DocumentPB()
        self.document_pb.ParseFromString(document_str)
        for word_pb in self.document_pb.words:
            self.words.append(Word(word_pb.id, word_pb.topic))
            self.increase_topic(word_pb.topic, 1)

    def num_words(self):
        """Returns the number of word occurrences in the document."""
        return len(self.words)

    def get_words(self):
        """Yields the word occurrences in document order."""
        for word in self.words:
            yield word

    def get_topic_count(self, topic):
        """Returns N(z|d).
        """
        return self.doc_topic_hist.count(topic)

    def increase_topic(self, topic, count=1):
        """Adds count to current topic, and returns the updated count.
        """
        return self.doc_topic_hist.increase_topic(topic, count)

    def decrease_topic(self, topic, count=1):
        """Subtracts count from current topic, and returns the updated count.
        """
        return self.doc_topic_hist.decrease_topic(topic, count)

    def __str__(self):
        """Outputs a human-readable representation of the document.

        One line per word, followed by the topic histogram.
        """
        lines = [str(word) for word in self.words]
        lines.append(str(self.doc_topic_hist))
        return '\n'.join(lines)
class OrderedSparseTopicHistogramTest(unittest.TestCase):
    """Unit tests for OrderedSparseTopicHistogram.

    The fixture is a 20-topic histogram in which topic i (for i in 0..9)
    has been given the count i + 1; topics 10..19 are left untouched.
    """

    def setUp(self):
        self.num_topics = 20
        self.ordered_sparse_topic_hist = \
                OrderedSparseTopicHistogram(self.num_topics)
        # range (not xrange) so the tests also run on Python 3.
        for i in range(10):
            self.ordered_sparse_topic_hist.increase_topic(i, i + 1)

    def _assert_counts_non_increasing(self):
        """Asserts that non_zeros is ordered by non-increasing count."""
        non_zeros = self.ordered_sparse_topic_hist.non_zeros
        for i in range(len(non_zeros) - 1):
            self.assertGreaterEqual(non_zeros[i].count,
                    non_zeros[i + 1].count)

    def test_ordered_sparse_topic_hist(self):
        """non_zeros lists topics 9..0 with counts 10..1, in that order."""
        self.assertEqual(10, len(self.ordered_sparse_topic_hist.non_zeros))
        for i in range(len(self.ordered_sparse_topic_hist.non_zeros)):
            self.assertEqual(10 - i - 1,
                    self.ordered_sparse_topic_hist.non_zeros[i].topic)
            self.assertEqual(10 - i,
                    self.ordered_sparse_topic_hist.non_zeros[i].count)

    def test_num_topics(self):
        self.assertEqual(self.num_topics,
                self.ordered_sparse_topic_hist.num_topics)

    def test_size(self):
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())

    def test_serialize_and_parse(self):
        """A serialize/parse round trip keeps size and string form."""
        blob = self.ordered_sparse_topic_hist.serialize_to_string()

        sparse_topic_hist = OrderedSparseTopicHistogram(self.num_topics)
        sparse_topic_hist.parse_from_string(blob)

        self.assertEqual(sparse_topic_hist.size(),
                self.ordered_sparse_topic_hist.size())
        self.assertEqual(str(sparse_topic_hist),
                str(self.ordered_sparse_topic_hist))

    def test_count(self):
        for i in range(10):
            self.assertEqual(i + 1, self.ordered_sparse_topic_hist.count(i))
        for i in range(10, 20):
            self.assertEqual(0, self.ordered_sparse_topic_hist.count(i))

    def test_increase_topic(self):
        """increase_topic returns the new count and keeps the ordering."""
        for i in range(20):
            # Topics 0..9 already hold i + 1 from setUp, so they double;
            # topics 10..19 start from zero.
            if i < 10:
                expected = 2 * (i + 1)
            else:
                expected = i + 1
            self.assertEqual(expected,
                    self.ordered_sparse_topic_hist.increase_topic(i, i + 1))
            self._assert_counts_non_increasing()

        self.assertEqual(2, self.ordered_sparse_topic_hist.count(0))
        self.assertEqual(12, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(11, self.ordered_sparse_topic_hist.count(10))
        self.assertEqual(16, self.ordered_sparse_topic_hist.count(15))
        self.assertEqual(20,
                self.ordered_sparse_topic_hist.increase_topic(15, 4))

    def test_decrease_topic(self):
        """decrease_topic returns the new count; a zero count drops out."""
        self.assertEqual(6, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(7, self.ordered_sparse_topic_hist.count(6))
        self.assertEqual(5,
                self.ordered_sparse_topic_hist.decrease_topic(5, 1))
        self.assertEqual(3,
                self.ordered_sparse_topic_hist.decrease_topic(6, 4))
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())
        self.assertEqual(5, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(3, self.ordered_sparse_topic_hist.count(6))
        self._assert_counts_non_increasing()

        # Taking topic 6 down to zero shrinks the histogram by one entry.
        self.assertEqual(0,
                self.ordered_sparse_topic_hist.decrease_topic(6, 3))
        self.assertEqual(9, self.ordered_sparse_topic_hist.size())
        self._assert_counts_non_increasing()
예제 #3
0
class OrderedSparseTopicHistogramTest(unittest.TestCase):
    """Tests for OrderedSparseTopicHistogram.

    setUp builds a histogram over 20 topics and gives each topic i in
    0..9 the count i + 1, leaving topics 10..19 at zero.
    """

    def setUp(self):
        self.num_topics = 20
        self.ordered_sparse_topic_hist = \
                OrderedSparseTopicHistogram(self.num_topics)
        # range instead of the Python 2-only xrange keeps this runnable
        # on Python 3 as well.
        for topic in range(10):
            self.ordered_sparse_topic_hist.increase_topic(topic, topic + 1)

    def _check_descending_counts(self):
        """Checks that adjacent non_zeros entries never increase in count."""
        entries = self.ordered_sparse_topic_hist.non_zeros
        for pos in range(len(entries) - 1):
            self.assertGreaterEqual(entries[pos].count,
                                    entries[pos + 1].count)

    def test_ordered_sparse_topic_hist(self):
        """Entries come out as topic 9 (count 10) down to topic 0 (count 1)."""
        self.assertEqual(10, len(self.ordered_sparse_topic_hist.non_zeros))
        for i, entry in enumerate(self.ordered_sparse_topic_hist.non_zeros):
            self.assertEqual(10 - i - 1, entry.topic)
            self.assertEqual(10 - i, entry.count)

    def test_num_topics(self):
        self.assertEqual(self.num_topics,
                         self.ordered_sparse_topic_hist.num_topics)

    def test_size(self):
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())

    def test_serialize_and_parse(self):
        """Parsing a serialized histogram reproduces size and str()."""
        blob = self.ordered_sparse_topic_hist.serialize_to_string()

        sparse_topic_hist = OrderedSparseTopicHistogram(self.num_topics)
        sparse_topic_hist.parse_from_string(blob)

        self.assertEqual(sparse_topic_hist.size(),
                         self.ordered_sparse_topic_hist.size())
        self.assertEqual(str(sparse_topic_hist),
                         str(self.ordered_sparse_topic_hist))

    def test_count(self):
        for topic in range(10):
            self.assertEqual(topic + 1,
                             self.ordered_sparse_topic_hist.count(topic))
        for topic in range(10, 20):
            self.assertEqual(0, self.ordered_sparse_topic_hist.count(topic))

    def test_increase_topic(self):
        """Each increase returns the updated count and preserves order."""
        for topic in range(20):
            delta = topic + 1
            # Topics below 10 were pre-filled with the same amount in
            # setUp, so their count doubles; the rest start at zero.
            expected = 2 * delta if topic < 10 else delta
            self.assertEqual(
                expected,
                self.ordered_sparse_topic_hist.increase_topic(topic, delta))
            self._check_descending_counts()

        self.assertEqual(2, self.ordered_sparse_topic_hist.count(0))
        self.assertEqual(12, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(11, self.ordered_sparse_topic_hist.count(10))
        self.assertEqual(16, self.ordered_sparse_topic_hist.count(15))
        self.assertEqual(20,
                         self.ordered_sparse_topic_hist.increase_topic(15, 4))

    def test_decrease_topic(self):
        """Each decrease returns the updated count; zero removes the entry."""
        self.assertEqual(6, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(7, self.ordered_sparse_topic_hist.count(6))
        self.assertEqual(5,
                         self.ordered_sparse_topic_hist.decrease_topic(5, 1))
        self.assertEqual(3,
                         self.ordered_sparse_topic_hist.decrease_topic(6, 4))
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())
        self.assertEqual(5, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(3, self.ordered_sparse_topic_hist.count(6))
        self._check_descending_counts()

        # Removing the last 3 from topic 6 leaves it at zero, and the
        # histogram reports one entry fewer.
        self.assertEqual(0,
                         self.ordered_sparse_topic_hist.decrease_topic(6, 3))
        self.assertEqual(9, self.ordered_sparse_topic_hist.size())
        self._check_descending_counts()