class Document(object):
    """A document stored as a list of topic-tagged word occurrences.

    Attributes:
        num_topics: number of topics, exactly as passed to the constructor.
        words: list of Word(id, topic) items, or None until one of the
            parse_* methods has been called.
        doc_topic_hist: OrderedSparseTopicHistogram holding N(z|d), or None
            until one of the parse_* methods has been called.
    """

    def __init__(self, num_topics):
        """Create an empty, not-yet-parsed document.

        Args:
            num_topics: number of topics; used to size the topic histogram
                and to bound the randomly drawn initial topics.
        """
        self.num_topics = num_topics
        # Word occurrences of the document, item fmt: Word<id, topic>.
        self.words = None
        # N(z|d)
        self.doc_topic_hist = None

    def parse_from_tokens(self, doc_tokens, rand, vocabulary, model=None):
        """Parse the text document from tokens.

        Only tokens in vocabulary and model will be considered. Any previous
        content of the document is discarded.

        Args:
            doc_tokens: iterable of tokens.
            rand: object providing randint(a, b); presumably a random.Random
                instance (both bounds inclusive) -- confirm against callers.
            vocabulary: object whose word_index(token) returns the word id,
                or -1 for a token that must be skipped.
            model: optional object providing has_word(word_index); when it is
                None no model filtering is applied.
        """
        self.words = []
        self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

        for token in doc_tokens:
            word_index = vocabulary.word_index(token)
            if word_index == -1:
                continue
            # Identity test: a model type that overrides __eq__ must not be
            # able to make itself look like "no model given".
            if model is not None and not model.has_word(word_index):
                continue
            # Initialize a random topic for the current word.
            topic = rand.randint(0, self.num_topics - 1)
            self.words.append(Word(word_index, topic))
            self.doc_topic_hist.increase_topic(topic, 1)

    def serialize_to_string(self):
        """Serialize document to DocumentPB string.

        Returns:
            The result of DocumentPB.SerializeToString() for a message that
            holds one entry (id, topic) per word, in document order.
        """
        document_pb = DocumentPB()
        for word in self.words:
            word_pb = document_pb.words.add()
            word_pb.id = word.id
            word_pb.topic = word.topic
        return document_pb.SerializeToString()

    def parse_from_string(self, document_str):
        """Parse document from DocumentPB serialized string.

        Any previous content of the document is discarded; the word list and
        the topic histogram are rebuilt from the parsed message.

        Args:
            document_str: string produced by serialize_to_string().
        """
        self.words = []
        self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

        # NOTE(review): the parsed message stays reachable as
        # self.document_pb; kept as an attribute because code outside this
        # class may read it -- confirm before turning it into a local.
        self.document_pb = DocumentPB()
        self.document_pb.ParseFromString(document_str)
        for word_pb in self.document_pb.words:
            self.words.append(Word(word_pb.id, word_pb.topic))
            self.increase_topic(word_pb.topic, 1)

    def num_words(self):
        """Returns the number of word occurrences in the document."""
        return len(self.words)

    def get_words(self):
        """Yields the word occurrences of the document in order."""
        for word in self.words:
            yield word

    def get_topic_count(self, topic):
        """Returns N(z|d).
        """
        return self.doc_topic_hist.count(topic)

    def increase_topic(self, topic, count=1):
        """Adds count to current topic, and returns the updated count.
        """
        return self.doc_topic_hist.increase_topic(topic, count)

    def decrease_topic(self, topic, count=1):
        """Subtracts count from current topic, and returns the updated count.
        """
        return self.doc_topic_hist.decrease_topic(topic, count)

    def __str__(self):
        """Outputs a human-readable representation of the document.

        One line per word, followed by the topic histogram.
        """
        document_str = [str(word) for word in self.words]
        document_str.append(str(self.doc_topic_hist))
        return '\n'.join(document_str)
class OrderedSparseTopicHistogramTest(unittest.TestCase):
    """Unit tests for OrderedSparseTopicHistogram.

    Every test starts from the fixture built in setUp: a histogram created
    with 20 topics in which topic i, for i in 0..9, has been increased by
    i + 1, while topics 10..19 have not been touched.
    """

    def setUp(self):
        """Builds the shared histogram fixture."""
        self.num_topics = 20
        self.ordered_sparse_topic_hist = \
            OrderedSparseTopicHistogram(self.num_topics)
        # range (not xrange) so the tests run on both Python 2 and Python 3.
        for i in range(10):
            self.ordered_sparse_topic_hist.increase_topic(i, i + 1)

    def _assert_sorted_by_count_desc(self):
        """Asserts that non_zeros is ordered by non-increasing count."""
        non_zeros = self.ordered_sparse_topic_hist.non_zeros
        for j in range(len(non_zeros) - 1):
            self.assertGreaterEqual(non_zeros[j].count,
                                    non_zeros[j + 1].count)

    def test_ordered_sparse_topic_hist(self):
        """The fixture lists topics 9..0 with counts 10..1."""
        non_zeros = self.ordered_sparse_topic_hist.non_zeros
        self.assertEqual(10, len(non_zeros))
        for i, item in enumerate(non_zeros):
            self.assertEqual(10 - i - 1, item.topic)
            self.assertEqual(10 - i, item.count)

    def test_num_topics(self):
        """num_topics reports the value given to the constructor."""
        self.assertEqual(self.num_topics,
                         self.ordered_sparse_topic_hist.num_topics)

    def test_size(self):
        """size() is 10 for the fixture."""
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())

    def test_serialize_and_parse(self):
        """A serialize/parse round trip preserves size and string form."""
        blob = self.ordered_sparse_topic_hist.serialize_to_string()

        sparse_topic_hist = OrderedSparseTopicHistogram(self.num_topics)
        sparse_topic_hist.parse_from_string(blob)

        self.assertEqual(sparse_topic_hist.size(),
                         self.ordered_sparse_topic_hist.size())
        self.assertEqual(str(sparse_topic_hist),
                         str(self.ordered_sparse_topic_hist))

    def test_count(self):
        """count() is i + 1 for topics 0..9 and 0 for topics 10..19."""
        hist = self.ordered_sparse_topic_hist
        for i in range(10):
            self.assertEqual(i + 1, hist.count(i))
        for i in range(10, 20):
            self.assertEqual(0, hist.count(i))

    def test_increase_topic(self):
        """increase_topic returns the new count and keeps the ordering."""
        hist = self.ordered_sparse_topic_hist
        for i in range(20):
            # Topics 0..9 already hold i + 1 from setUp, so adding i + 1
            # doubles them; topics 10..19 start from zero.
            expected = 2 * (i + 1) if i < 10 else i + 1
            self.assertEqual(expected, hist.increase_topic(i, i + 1))
            # The ordering is re-checked after every single update.
            self._assert_sorted_by_count_desc()

        self.assertEqual(2, hist.count(0))
        self.assertEqual(12, hist.count(5))
        self.assertEqual(11, hist.count(10))
        self.assertEqual(16, hist.count(15))
        self.assertEqual(20, hist.increase_topic(15, 4))

    def test_decrease_topic(self):
        """decrease_topic returns the new count and keeps the ordering."""
        hist = self.ordered_sparse_topic_hist
        self.assertEqual(6, hist.count(5))
        self.assertEqual(7, hist.count(6))
        self.assertEqual(5, hist.decrease_topic(5, 1))
        self.assertEqual(3, hist.decrease_topic(6, 4))
        self.assertEqual(10, hist.size())
        self.assertEqual(5, hist.count(5))
        self.assertEqual(3, hist.count(6))
        self._assert_sorted_by_count_desc()

        # Decreasing topic 6 to zero shrinks size() from 10 to 9.
        self.assertEqual(0, hist.decrease_topic(6, 3))
        self.assertEqual(9, hist.size())
        self._assert_sorted_by_count_desc()
class OrderedSparseTopicHistogramTest(unittest.TestCase):
    """Unit tests for OrderedSparseTopicHistogram.

    Every test starts from the fixture built in setUp: a histogram created
    with 20 topics in which topic i, for i in 0..9, has been increased by
    i + 1, while topics 10..19 have not been touched.
    """

    def setUp(self):
        """Builds the shared histogram fixture."""
        self.num_topics = 20
        self.ordered_sparse_topic_hist = \
            OrderedSparseTopicHistogram(self.num_topics)
        # range (not xrange) so the tests run on both Python 2 and Python 3.
        for i in range(10):
            self.ordered_sparse_topic_hist.increase_topic(i, i + 1)

    def _assert_sorted_by_count_desc(self):
        """Asserts that non_zeros is ordered by non-increasing count."""
        non_zeros = self.ordered_sparse_topic_hist.non_zeros
        for j in range(len(non_zeros) - 1):
            self.assertGreaterEqual(non_zeros[j].count,
                                    non_zeros[j + 1].count)

    def test_ordered_sparse_topic_hist(self):
        """The fixture lists topics 9..0 with counts 10..1."""
        non_zeros = self.ordered_sparse_topic_hist.non_zeros
        self.assertEqual(10, len(non_zeros))
        for i, item in enumerate(non_zeros):
            self.assertEqual(10 - i - 1, item.topic)
            self.assertEqual(10 - i, item.count)

    def test_num_topics(self):
        """num_topics reports the value given to the constructor."""
        self.assertEqual(self.num_topics,
                         self.ordered_sparse_topic_hist.num_topics)

    def test_size(self):
        """size() is 10 for the fixture."""
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())

    def test_serialize_and_parse(self):
        """A serialize/parse round trip preserves size and string form."""
        blob = self.ordered_sparse_topic_hist.serialize_to_string()

        sparse_topic_hist = OrderedSparseTopicHistogram(self.num_topics)
        sparse_topic_hist.parse_from_string(blob)

        self.assertEqual(sparse_topic_hist.size(),
                         self.ordered_sparse_topic_hist.size())
        self.assertEqual(str(sparse_topic_hist),
                         str(self.ordered_sparse_topic_hist))

    def test_count(self):
        """count() is i + 1 for topics 0..9 and 0 for topics 10..19."""
        hist = self.ordered_sparse_topic_hist
        for i in range(10):
            self.assertEqual(i + 1, hist.count(i))
        for i in range(10, 20):
            self.assertEqual(0, hist.count(i))

    def test_increase_topic(self):
        """increase_topic returns the new count and keeps the ordering."""
        hist = self.ordered_sparse_topic_hist
        for i in range(20):
            # Topics 0..9 already hold i + 1 from setUp, so adding i + 1
            # doubles them; topics 10..19 start from zero.
            expected = 2 * (i + 1) if i < 10 else i + 1
            self.assertEqual(expected, hist.increase_topic(i, i + 1))
            # The ordering is re-checked after every single update.
            self._assert_sorted_by_count_desc()

        self.assertEqual(2, hist.count(0))
        self.assertEqual(12, hist.count(5))
        self.assertEqual(11, hist.count(10))
        self.assertEqual(16, hist.count(15))
        self.assertEqual(20, hist.increase_topic(15, 4))

    def test_decrease_topic(self):
        """decrease_topic returns the new count and keeps the ordering."""
        hist = self.ordered_sparse_topic_hist
        self.assertEqual(6, hist.count(5))
        self.assertEqual(7, hist.count(6))
        self.assertEqual(5, hist.decrease_topic(5, 1))
        self.assertEqual(3, hist.decrease_topic(6, 4))
        self.assertEqual(10, hist.size())
        self.assertEqual(5, hist.count(5))
        self.assertEqual(3, hist.count(6))
        self._assert_sorted_by_count_desc()

        # Decreasing topic 6 to zero shrinks size() from 10 to 9.
        self.assertEqual(0, hist.decrease_topic(6, 3))
        self.assertEqual(9, hist.size())
        self._assert_sorted_by_count_desc()