示例#1
0
    def test_read_and_writer_pb(self):
        """Round-trip 20 WordTopicHistogramPB messages through record I/O.

        Writes 20 messages (word ``i``, each holding 20 non-zero entries
        with ``topic = j`` and ``count = j + 1``) via ``RecordWriter``,
        then reads them back with ``RecordReader`` and checks that every
        field and the total record count match.
        """
        path = '../testdata/recordio.dat'

        # ``with`` closes the file even when an assertion fails part-way,
        # so a failing test does not leak the handle.
        with open(path, 'wb') as fp:
            record_writer = RecordWriter(fp)
            for i in xrange(20):
                word_topic_hist = WordTopicHistogramPB()
                word_topic_hist.word = i
                for j in xrange(20):
                    non_zero = \
                        word_topic_hist.sparse_topic_hist.non_zeros.add()
                    non_zero.topic = j
                    non_zero.count = j + 1
                self.assertTrue(
                    record_writer.write(word_topic_hist.SerializeToString()))

        with open(path, 'rb') as fp:
            record_reader = RecordReader(fp)
            i = 0
            while True:
                blob = record_reader.read()
                # A ``None`` blob is treated as end of stream.
                if blob is None:
                    break
                word_topic_hist = WordTopicHistogramPB()
                word_topic_hist.ParseFromString(blob)
                self.assertEqual(i, word_topic_hist.word)
                sparse_topic_hist = word_topic_hist.sparse_topic_hist
                self.assertEqual(20, len(sparse_topic_hist.non_zeros))
                for j, non_zero in enumerate(sparse_topic_hist.non_zeros):
                    self.assertEqual(j, non_zero.topic)
                    self.assertEqual(j + 1, non_zero.count)
                i += 1
            # Exactly the 20 records written above must have been read.
            self.assertEqual(20, i)
示例#2
0
 def _save_word_topic_hist(self, filename):
     """Write ``self.word_topic_hist`` to ``filename`` as a record stream.

     Each ``(word, histogram)`` pair is copied into a
     ``WordTopicHistogramPB`` message and appended to the file through
     ``RecordWriter``.

     Args:
         filename: path of the file to create or overwrite (opened 'wb').
     """
     # ``with`` guarantees the file is closed even if serialization or a
     # write raises, instead of leaking the handle.
     with open(filename, 'wb') as fp:
         record_writer = RecordWriter(fp)
         for word, topic_hist in self.word_topic_hist.iteritems():
             word_topic_hist_pb = WordTopicHistogramPB()
             word_topic_hist_pb.word = word
             # Copy the histogram into the message by round-tripping it
             # through its serialized form.
             word_topic_hist_pb.sparse_topic_hist.ParseFromString(
                 topic_hist.serialize_to_string())
             # NOTE(review): the return value of write() is not checked.
             record_writer.write(word_topic_hist_pb.SerializeToString())
示例#3
0
 def _save_word_topic_hist(self, filename):
     """Persist every word's topic histogram to ``filename``.

     Iterates ``self.word_topic_hist``, wraps each entry in a
     ``WordTopicHistogramPB`` message and writes the serialized message
     with ``RecordWriter``.

     Args:
         filename: destination path; opened in binary write mode.
     """
     # Context manager closes the file on both success and error paths;
     # the original left it open whenever an exception was raised.
     with open(filename, 'wb') as fp:
         record_writer = RecordWriter(fp)
         histograms = self.word_topic_hist
         for word, ordered_sparse_topic_hist in histograms.iteritems():
             word_topic_hist_pb = WordTopicHistogramPB()
             word_topic_hist_pb.word = word
             # The histogram is transferred via its serialized bytes.
             serialized_hist = \
                 ordered_sparse_topic_hist.serialize_to_string()
             word_topic_hist_pb.sparse_topic_hist.ParseFromString(
                 serialized_hist)
             # NOTE(review): the result of write() is ignored here.
             record_writer.write(word_topic_hist_pb.SerializeToString())
示例#4
0
    def test_read_and_writer_pb(self):
        """Check that protobuf records survive a write/read round trip.

        Twenty ``WordTopicHistogramPB`` messages are written with
        ``RecordWriter``; each has ``word = i`` and 20 non-zero entries
        where ``topic = j`` and ``count = j + 1``. The file is then read
        back with ``RecordReader`` and every field is verified, along
        with the number of records read.
        """
        path = '../testdata/recordio.dat'

        # Files are managed with ``with`` so that a failed assertion
        # cannot leave a handle open.
        with open(path, 'wb') as fp:
            record_writer = RecordWriter(fp)
            for i in xrange(20):
                word_topic_hist = WordTopicHistogramPB()
                word_topic_hist.word = i
                non_zeros = word_topic_hist.sparse_topic_hist.non_zeros
                for j in xrange(20):
                    non_zero = non_zeros.add()
                    non_zero.topic = j
                    non_zero.count = j + 1
                self.assertTrue(
                    record_writer.write(word_topic_hist.SerializeToString()))

        with open(path, 'rb') as fp:
            record_reader = RecordReader(fp)
            i = 0
            while True:
                blob = record_reader.read()
                # The loop ends when read() yields None.
                if blob is None:
                    break
                word_topic_hist = WordTopicHistogramPB()
                word_topic_hist.ParseFromString(blob)
                self.assertEqual(i, word_topic_hist.word)
                sparse_topic_hist = word_topic_hist.sparse_topic_hist
                self.assertEqual(20, len(sparse_topic_hist.non_zeros))
                for j, non_zero in enumerate(sparse_topic_hist.non_zeros):
                    self.assertEqual(j, non_zero.topic)
                    self.assertEqual(j + 1, non_zero.count)
                i += 1
            # All 20 written records must have been read back.
            self.assertEqual(20, i)
示例#5
0
    def _load_word_topic_hist(self, filename):
        """Replace ``self.word_topic_hist`` with the records in ``filename``.

        The existing mapping is cleared first. Each record is parsed as a
        ``WordTopicHistogramPB`` and converted into an
        ``OrderedSparseTopicHistogram`` built with ``self.num_topics``,
        stored under the message's ``word``.

        Args:
            filename: path of the record file to read (opened 'rb').

        Returns:
            True if at least one entry is present after loading,
            False otherwise.
        """
        logging.info('Loading word_topic_hist matrix N(w|z).')
        self.word_topic_hist.clear()

        # ``with`` closes the file even if a record fails to parse.
        with open(filename, "rb") as fp:
            record_reader = RecordReader(fp)
            while True:
                blob = record_reader.read()
                # A ``None`` blob is treated as end of stream.
                if blob is None:
                    break

                word_topic_hist_pb = WordTopicHistogramPB()
                word_topic_hist_pb.ParseFromString(blob)

                ordered_sparse_topic_hist = OrderedSparseTopicHistogram(
                    self.num_topics)
                # The histogram is rebuilt from the serialized bytes of
                # the embedded sparse_topic_hist message.
                ordered_sparse_topic_hist.parse_from_string(
                    word_topic_hist_pb.sparse_topic_hist.SerializeToString())
                self.word_topic_hist[word_topic_hist_pb.word] = (
                    ordered_sparse_topic_hist)
        return (len(self.word_topic_hist) > 0)