def setUp(self):
    model = Model(20)
    model.load('../testdata/lda_model')
    vocabulary = Vocabulary()
    vocabulary.load('../testdata/vocabulary.dat')
    self.multi_chain_gibbs_sampler = \
            MultiChainGibbsSampler(model, vocabulary, 10, 10, 5)
class SparseLDATrainGibbsSamplerTest(unittest.TestCase):

    def setUp(self):
        self.model = Model(20)
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')
        self.sparselda_train_gibbs_sampler = \
                SparseLDATrainGibbsSampler(self.model, self.vocabulary)

    def test_load_corpus(self):
        self.sparselda_train_gibbs_sampler.load_corpus('../testdata/corpus')
        self.assertEqual(4, len(self.sparselda_train_gibbs_sampler.documents))

    def test_gibbs_sampling(self):
        self.sparselda_train_gibbs_sampler.load_corpus('../testdata/corpus')
        rand = random.Random()
        for i in xrange(100):
            self.sparselda_train_gibbs_sampler.gibbs_sampling(rand)
            if (i + 1) % 10 == 0:
                self.sparselda_train_gibbs_sampler.save_checkpoint(
                        '../testdata/checkpoint', i + 1)
        self.sparselda_train_gibbs_sampler.save_model(
                '../testdata/train_model', 100)

    def test_load_checkpoint(self):
        cur_iteration = self.sparselda_train_gibbs_sampler.load_checkpoint(
                '../testdata/checkpoint')
        rand = random.Random()
        for i in xrange(cur_iteration, 200):
            self.sparselda_train_gibbs_sampler.gibbs_sampling(rand)
            if (i + 1) % 10 == 0:
                self.sparselda_train_gibbs_sampler.save_checkpoint(
                        '../testdata/checkpoint', i + 1)
def setUp(self):
    model = Model(20)
    model.load('../testdata/lda_model')
    vocabulary = Vocabulary()
    vocabulary.load('../testdata/vocabulary.dat')
    self.sparselda_gibbs_sampler = \
            SparseLDAGibbsSampler(model, vocabulary, 10, 5)
class ModelEvaluatorTest(unittest.TestCase):

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        doc_tokens = ['macbook', 'ipad',  # in both vocabulary and model
                      'mac os x', 'chrome',  # only in vocabulary
                      'nokia', 'null']  # in neither
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)
        document.parse_from_tokens(
                doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        # model_evaluator here is assumed to be a module-level helper
        # imported elsewhere in this test file, not an instance.
        self.assertEqual(
                -14.113955684239654,
                model_evaluator.compute_loglikelihood(
                        self.model, self.vocabulary, documents))
def setUp(self):
    self.model = Model(20)
    self.model.load('../testdata/lda_model')
    self.vocabulary = Vocabulary()
    self.vocabulary.load('../testdata/vocabulary.dat')
    self.model_evaluator = ModelEvaluator(self.model, self.vocabulary)
class TopicWordsStatTest(unittest.TestCase):

    def setUp(self):
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')
        self.topic_words_stat = TopicWordsStat(self.model, self.vocabulary)

    def test_save(self):
        print self.topic_words_stat.save('../testdata/topic_top_words.dat', 0.8)

    def test_get_topic_top_words(self):
        print self.topic_words_stat.get_topic_top_words(0.8)

    def test_compute_topic_word_distribution(self):
        print self.topic_words_stat.compute_topic_word_distribution()
def main(args):
    model = Model(0)
    model.load(args.model_dir)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary)
    multi_chain_gibbs_sampler = MultiChainGibbsSampler(
            model, vocabulary, args.num_markov_chains,
            args.total_iterations, args.burn_in_iterations)

    with open(args.documents, 'r') as fp:
        for doc_str in fp:
            doc_str = doc_str.decode('gbk')  # input documents are GBK-encoded
            doc_tokens = doc_str.strip().split('\t')
            topic_dist = multi_chain_gibbs_sampler.infer_topics(doc_tokens)
            print doc_str
            print topic_dist
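# A minimal sketch of the command-line entry point this main() expects.
# The flag names mirror the args attributes read above; the argparse wiring
# and the defaults (taken from the test values 10/10/5) are illustrative
# assumptions, not the project's actual CLI definition.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--vocabulary', required=True)
    parser.add_argument('--documents', required=True)
    parser.add_argument('--num_markov_chains', type=int, default=10)
    parser.add_argument('--total_iterations', type=int, default=10)
    parser.add_argument('--burn_in_iterations', type=int, default=5)
    main(parser.parse_args())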
def __init__(self, model_name, embed_size=200, attention_dim=300,
             encoder_dim=2048, decoder_dim=300, batch_size=8):
    self.model_name = model_name
    self.embed_size = embed_size
    self.attention_dim = attention_dim
    self.encoder_dim = encoder_dim
    self.decoder_dim = decoder_dim
    self.batch_size = batch_size
    self.vocab = Vocabulary()
def main(args):
    model = Model(args.num_topics, args.topic_prior, args.word_prior)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary_file)
    sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
            model, vocabulary)
    sparselda_train_gibbs_sampler.load_corpus(args.corpus_dir)

    rand = random.Random()
    for i in xrange(args.total_iterations):
        logging.info('sparselda trainer, gibbs sampling iteration %d.'
                % (i + 1))
        sparselda_train_gibbs_sampler.gibbs_sampling(rand)

        # Dump the lda model.
        if i == 0 or (i + 1) % args.save_model_interval == 0:
            logging.info('iteration %d start saving lda model.' % (i + 1))
            sparselda_train_gibbs_sampler.save_model(args.model_dir, i + 1)
            topic_words_stat = TopicWordsStat(model, vocabulary)
            topic_words_stat.save(
                    '%s/topic_top_words.%d' % (args.model_dir, i + 1),
                    args.topic_word_accumulated_prob_threshold)
            logging.info('iteration %d save lda model ok.' % (i + 1))

        # Dump the checkpoint.
        if i == 0 or (i + 1) % args.save_checkpoint_interval == 0:
            logging.info('iteration %d start saving checkpoint.' % (i + 1))
            sparselda_train_gibbs_sampler.save_checkpoint(
                    args.checkpoint_dir, i + 1)
            logging.info('iteration %d save checkpoint ok.' % (i + 1))

        # Compute the loglikelihood.
        if i == 0 or (i + 1) % args.compute_loglikelihood_interval == 0:
            logging.info('iteration %d start computing loglikelihood.'
                    % (i + 1))
            model_evaluator = ModelEvaluator(model, vocabulary)
            ll = model_evaluator.compute_loglikelihood(
                    sparselda_train_gibbs_sampler.documents)
            logging.info('iteration %d loglikelihood is %f.' % (i + 1, ll))
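# A minimal sketch of the trainer's entry point. The flag names come from
# the args attributes read above; the defaults (num_topics from the tests,
# symmetric priors of 0.1/0.01, the 0.8 threshold, and the intervals) are
# illustrative assumptions rather than the project's actual settings.
import argparse

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_topics', type=int, default=20)
    parser.add_argument('--topic_prior', type=float, default=0.1)
    parser.add_argument('--word_prior', type=float, default=0.01)
    parser.add_argument('--vocabulary_file', required=True)
    parser.add_argument('--corpus_dir', required=True)
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--checkpoint_dir', required=True)
    parser.add_argument('--total_iterations', type=int, default=1000)
    parser.add_argument('--save_model_interval', type=int, default=100)
    parser.add_argument('--save_checkpoint_interval', type=int, default=10)
    parser.add_argument('--compute_loglikelihood_interval', type=int,
                        default=10)
    parser.add_argument('--topic_word_accumulated_prob_threshold',
                        type=float, default=0.8)
    main(parser.parse_args())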
def train(self, data, labels, dev_data=None, dev_labels=None,
          symbol_vocabulary_file=None, tags_vocabulary_file=None,
          lm_file=None, model_file=None, save_file=None):
    """Trains the tagger on sentences :data: with labels :labels:.

    data: a list of sentences, each a sequence of tokens.
    labels: a list of tag sequences, one per sentence; each tag is a
        feature-value structure.
    :return: self, the fitted tagger.
    """
    # Build or load the character vocabulary.
    if symbol_vocabulary_file is None:
        self.symbols_ = Vocabulary(
                character=True, min_count=self.min_char_count).train(data)
    else:
        self.symbols_ = vocabulary_from_json(symbol_vocabulary_file,
                                             use_features=False)
    # Build or load the tag vocabulary.
    if tags_vocabulary_file is None:
        self.tags_ = FeatureVocabulary(character=False).train(labels)
    else:
        with open(tags_vocabulary_file, "r", encoding="utf8") as fin:
            tags_info = json.load(fin)
        self.tags_ = vocabulary_from_json(tags_info, use_features=True)
    if self.verbose > 0:
        print("{} characters, {} tags".format(self.symbols_number_,
                                              self.tags_number_))
    X_train, indexes_by_buckets = self.transform(data, labels,
                                                 buckets_number=10)
    if dev_data is not None:
        X_dev, dev_indexes_by_buckets = \
                self.transform(dev_data, dev_labels, bucket_size=BUCKET_SIZE)
    else:
        X_dev, dev_indexes_by_buckets = None, None
    self.build()
    if save_file is not None and model_file is not None:
        self.to_json(save_file, model_file, lm_file)
    self._train_on_data(X_train, indexes_by_buckets, X_dev,
                        dev_indexes_by_buckets, model_file=model_file)
    return self
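# A hypothetical call illustrating the input shapes train() expects:
# `data` is a list of tokenized sentences and `labels` a parallel list of
# tag sequences, each tag a feature-value string. `CharacterTagger` and its
# constructor arguments are placeholders for whatever class defines train()
# above; only min_char_count and verbose are attributes the method reads.
tagger = CharacterTagger(min_char_count=3, verbose=1)
data = [["The", "cat", "sleeps"], ["Dogs", "bark"]]
labels = [["DET", "NOUN,sg", "VERB,3,sg"], ["NOUN,pl", "VERB,3,pl"]]
tagger.train(data, labels, model_file="tagger.hdf5", save_file="tagger.json")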
def __init__(self, embed_size=200, attention_dim=300, encoder_dim=2048,
             decoder_dim=300, batch_size=4, sequence_length=405):
    self.embed_size = embed_size
    self.attention_dim = attention_dim
    self.encoder_dim = encoder_dim
    self.decoder_dim = decoder_dim
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.vocab = Vocabulary()
def __init__(self, sequence_length=405, batch_size=4):
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.vocab = Vocabulary()
    self.vocab_size = len(self.vocab)