def setUp(self):
    """Build a MultiChainGibbsSampler fixture from the checked-in test data."""
    lda_model = Model(20)
    lda_model.load('../testdata/lda_model')
    vocab = Vocabulary()
    vocab.load('../testdata/vocabulary.dat')
    # 10 Markov chains, 10 total iterations, 5 burn-in iterations.
    self.multi_chain_gibbs_sampler = MultiChainGibbsSampler(
        lda_model, vocab, 10, 10, 5)
def setUp(self):
    """Build a SparseLDAGibbsSampler fixture from the checked-in test data."""
    lda_model = Model(20)
    lda_model.load('../testdata/lda_model')
    vocab = Vocabulary()
    vocab.load('../testdata/vocabulary.dat')
    # 10 total iterations, 5 burn-in iterations.
    self.sparselda_gibbs_sampler = SparseLDAGibbsSampler(
        lda_model, vocab, 10, 5)
class ModelEvaluatorTest(unittest.TestCase):
    """Tests for model_evaluator.compute_loglikelihood."""

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        doc_tokens = [
            'macbook', 'ipad',     # exist in vocabulary and model
            'mac os x', 'chrome',  # only exist in vocabulary
            'nokia', 'null',       # inexistent
        ]
        document = Document(self.model.num_topics)
        rand = random.Random()
        # Fixed seed keeps the sampled topic assignments deterministic.
        rand.seed(0)
        document.parse_from_tokens(
            doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        # assertAlmostEqual instead of assertEqual: exact float equality is
        # fragile across platforms and interpreter versions; 7 decimal
        # places (the default) is ample for a log-likelihood comparison.
        self.assertAlmostEqual(
            -14.113955684239654,
            model_evaluator.compute_loglikelihood(
                self.model, self.vocabulary, documents))
class TopicWordsStatTest(unittest.TestCase):
    """Smoke tests for TopicWordsStat built from the checked-in test data."""

    def setUp(self):
        self.model = Model(20)
        # Single-quoted paths for consistency with the rest of the file.
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')
        self.topic_words_stat = TopicWordsStat(self.model, self.vocabulary)

    def test_save(self):
        # Single-argument print(...) is valid in both Python 2 and Python 3,
        # unlike the bare `print x` statement form.
        print(self.topic_words_stat.save('../testdata/topic_top_words.dat',
                                         0.8))

    def test_get_topic_top_words(self):
        print(self.topic_words_stat.get_topic_top_words(0.8))

    def test_compute_topic_word_distribution(self):
        print(self.topic_words_stat.compute_topic_word_distribution())
def main(args):
    """Infer and print topic distributions for a corpus of documents.

    Loads the model and vocabulary from args.model_dir / args.vocabulary,
    then reads args.documents (one GBK-encoded, tab-separated document per
    line) and prints each document followed by its inferred topic
    distribution.
    """
    model = Model(0)
    model.load(args.model_dir)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary)
    multi_chain_gibbs_sampler = MultiChainGibbsSampler(
        model, vocabulary, args.num_markov_chains,
        args.total_iterations, args.burn_in_iterations)
    # `with` guarantees the file is closed even if inference raises, and
    # iterating the file object streams line by line instead of loading the
    # whole corpus into memory via readlines().
    with open(args.documents, 'r') as fp:
        for doc_str in fp:
            doc_str = doc_str.decode('gbk')  # corpus is GBK-encoded
            doc_tokens = doc_str.strip().split('\t')
            topic_dist = multi_chain_gibbs_sampler.infer_topics(doc_tokens)
            print(doc_str)
            print(topic_dist)
class TopicWordsStatTest(unittest.TestCase):
    """Exercises TopicWordsStat against the checked-in test model."""

    def setUp(self):
        # Vocabulary and model load independently; TopicWordsStat needs both.
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.topic_words_stat = TopicWordsStat(self.model, self.vocabulary)

    def test_compute_topic_word_distribution(self):
        print(self.topic_words_stat.compute_topic_word_distribution())

    def test_get_topic_top_words(self):
        print(self.topic_words_stat.get_topic_top_words(0.8))

    def test_save(self):
        print(self.topic_words_stat.save('../testdata/topic_top_words.dat',
                                         0.8))
class ModelEvaluatorTest(unittest.TestCase):
    """Tests for model_evaluator.compute_loglikelihood."""

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        doc_tokens = ['macbook', 'ipad',     # exist in vocabulary and model
                      'mac os x', 'chrome',  # only exist in vocabulary
                      'nokia', 'null']       # inexistent
        document = Document(self.model.num_topics)
        rand = random.Random()
        # Fixed seed keeps the sampled topic assignments deterministic.
        rand.seed(0)
        document.parse_from_tokens(
            doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        # assertAlmostEqual instead of assertEqual: exact equality on a
        # computed float is fragile across platforms and interpreter
        # versions; the default 7-decimal-place tolerance suffices here.
        self.assertAlmostEqual(
            -14.113955684239654,
            model_evaluator.compute_loglikelihood(
                self.model, self.vocabulary, documents))