def setUp(self):
    """Build a MultiChainGibbsSampler from the model and vocabulary fixtures."""
    lda_model = Model(20)
    lda_model.load('../testdata/lda_model')
    vocab = Vocabulary()
    vocab.load('../testdata/vocabulary.dat')
    # Trailing arguments are presumably (num_markov_chains, total_iterations,
    # burn_in_iterations), matching the order used by main() in this file.
    self.multi_chain_gibbs_sampler = MultiChainGibbsSampler(
        lda_model, vocab, 10, 10, 5)
示例#2
0
    def setUp(self):
        """Load the model and vocabulary fixtures and wrap them in a TopicWordsStat."""
        lda_model = self.model = Model(20)
        lda_model.load('../testdata/lda_model')

        vocab = self.vocabulary = Vocabulary()
        vocab.load('../testdata/vocabulary.dat')

        self.topic_words_stat = TopicWordsStat(lda_model, vocab)
class SparseLDATrainGibbsSamplerTest(unittest.TestCase):
    """Exercises corpus loading, sampling and checkpointing of the trainer."""

    def setUp(self):
        """Create a trainer sampler over a fresh Model(20); no model file is loaded."""
        self.model = Model(20)
        vocab = self.vocabulary = Vocabulary()
        vocab.load('../testdata/vocabulary.dat')
        self.sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
            self.model, vocab)

    def test_load_corpus(self):
        """Loading the fixture corpus must yield four documents."""
        sampler = self.sparselda_train_gibbs_sampler
        sampler.load_corpus('../testdata/corpus')
        self.assertEqual(4, len(sampler.documents))

    def test_gibbs_sampling(self):
        """Run 100 sampling iterations, checkpointing every 10th, then save the model."""
        sampler = self.sparselda_train_gibbs_sampler
        sampler.load_corpus('../testdata/corpus')
        rng = random.Random()
        # Count from 1 so the value can be passed to save_checkpoint directly.
        for iteration in xrange(1, 101):
            sampler.gibbs_sampling(rng)
            if iteration % 10 == 0:
                sampler.save_checkpoint('../testdata/checkpoint', iteration)
        sampler.save_model('../testdata/train_model', 100)

    def test_load_checkpoint(self):
        """Resume from the checkpoint directory and sample up to iteration 200."""
        sampler = self.sparselda_train_gibbs_sampler
        # NOTE(review): no corpus is loaded here and the checkpoint directory
        # is the one written by test_gibbs_sampling -- presumably this test
        # relies on that one having run before; confirm.
        resumed_from = sampler.load_checkpoint('../testdata/checkpoint')
        rng = random.Random()
        for iteration in xrange(resumed_from + 1, 201):
            sampler.gibbs_sampling(rng)
            if iteration % 10 == 0:
                sampler.save_checkpoint('../testdata/checkpoint', iteration)
示例#4
0
 def setUp(self):
     """Build a SparseLDAGibbsSampler from the model and vocabulary fixtures."""
     lda_model = Model(20)
     lda_model.load('../testdata/lda_model')
     vocab = Vocabulary()
     vocab.load('../testdata/vocabulary.dat')
     # NOTE(review): 10 and 5 are presumably the total and burn-in iteration
     # counts -- confirm against SparseLDAGibbsSampler's signature.
     self.sparselda_gibbs_sampler = SparseLDAGibbsSampler(
         lda_model, vocab, 10, 5)
示例#5
0
class SparseLDATrainGibbsSamplerTest(unittest.TestCase):
    """Tests for SparseLDATrainGibbsSampler: corpus loading, sampling, checkpoints."""

    def setUp(self):
        """Pair a fresh Model(20) with the vocabulary fixture in a trainer sampler."""
        self.model = Model(20)
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')
        self.sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
            self.model, self.vocabulary)

    def _sample_and_checkpoint(self, rng, start, stop):
        """Run iterations start..stop-1, checkpointing when (i + 1) % 10 == 0."""
        trainer = self.sparselda_train_gibbs_sampler
        for i in xrange(start, stop):
            trainer.gibbs_sampling(rng)
            completed = i + 1
            if completed % 10 == 0:
                trainer.save_checkpoint('../testdata/checkpoint', completed)

    def test_load_corpus(self):
        """The fixture corpus is expected to hold exactly four documents."""
        trainer = self.sparselda_train_gibbs_sampler
        trainer.load_corpus('../testdata/corpus')
        self.assertEqual(4, len(trainer.documents))

    def test_gibbs_sampling(self):
        """Sample 100 iterations with periodic checkpoints, then save the model."""
        trainer = self.sparselda_train_gibbs_sampler
        trainer.load_corpus('../testdata/corpus')
        self._sample_and_checkpoint(random.Random(), 0, 100)
        trainer.save_model('../testdata/train_model', 100)

    def test_load_checkpoint(self):
        """Continue sampling from the stored checkpoint up to iteration 200."""
        # NOTE(review): this reads the checkpoint written by
        # test_gibbs_sampling and never loads a corpus -- presumably the
        # checkpoint restores the documents; confirm.
        cur_iteration = self.sparselda_train_gibbs_sampler.load_checkpoint(
            '../testdata/checkpoint')
        self._sample_and_checkpoint(random.Random(), cur_iteration, 200)
示例#6
0
class ModelEvaluatorTest(unittest.TestCase):
    """Regression test for model_evaluator.compute_loglikelihood."""

    def setUp(self):
        """Load the LDA model (20 topics) and the vocabulary fixtures."""
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        """Two copies of a fixed document must give the reference log-likelihood."""
        # Each comment describes the two tokens on its own line.
        doc_tokens = [
            'macbook', 'ipad',     # exist in vocabulary and model
            'mac os x', 'chrome',  # only exist in vocabulary
            'nokia', 'null',       # inexistent
        ]
        document = Document(self.model.num_topics)
        rand = random.Random()
        # Fixed seed so whatever parse_from_tokens draws from rand is
        # reproducible between runs.
        rand.seed(0)
        document.parse_from_tokens(doc_tokens, rand, self.vocabulary,
                                   self.model)
        documents = [document, document]
        # Compare with a tolerance: exact equality on a computed float is
        # brittle across platforms and math-library versions.
        self.assertAlmostEqual(
            -14.113955684239654,
            model_evaluator.compute_loglikelihood(self.model, self.vocabulary,
                                                  documents))
示例#7
0
    def setUp(self):
        """Load the model and vocabulary fixtures and create a ModelEvaluator."""
        lda_model = self.model = Model(20)
        lda_model.load('../testdata/lda_model')

        vocab = self.vocabulary = Vocabulary()
        vocab.load('../testdata/vocabulary.dat')

        self.model_evaluator = ModelEvaluator(lda_model, vocab)
 def setUp(self):
     """Prepare a MultiChainGibbsSampler backed by the test fixtures."""
     fixture_model = Model(20)
     fixture_model.load('../testdata/lda_model')
     fixture_vocabulary = Vocabulary()
     fixture_vocabulary.load('../testdata/vocabulary.dat')
     # Trailing arguments are presumably (num_markov_chains,
     # total_iterations, burn_in_iterations), as passed by main() in this file.
     self.multi_chain_gibbs_sampler = MultiChainGibbsSampler(
         fixture_model, fixture_vocabulary, 10, 10, 5)
 def setUp(self):
     """Prepare a SparseLDAGibbsSampler backed by the test fixtures."""
     fixture_model = Model(20)
     fixture_model.load('../testdata/lda_model')
     fixture_vocabulary = Vocabulary()
     fixture_vocabulary.load('../testdata/vocabulary.dat')
     # NOTE(review): 10 and 5 are presumably the total and burn-in iteration
     # counts -- confirm against SparseLDAGibbsSampler's signature.
     self.sparselda_gibbs_sampler = SparseLDAGibbsSampler(
         fixture_model, fixture_vocabulary, 10, 5)
    def setUp(self):
        """Create the TopicWordsStat under test from the fixture model and vocabulary."""
        self.model = fixture_model = Model(20)
        fixture_model.load('../testdata/lda_model')

        self.vocabulary = fixture_vocabulary = Vocabulary()
        fixture_vocabulary.load('../testdata/vocabulary.dat')

        self.topic_words_stat = TopicWordsStat(fixture_model,
                                               fixture_vocabulary)
示例#11
0
class TopicWordsStatTest(unittest.TestCase):
    """Smoke tests for TopicWordsStat: results are printed, nothing is asserted."""

    def setUp(self):
        """Load the model and vocabulary fixtures and build the TopicWordsStat."""
        lda_model = self.model = Model(20)
        lda_model.load("../testdata/lda_model")
        vocab = self.vocabulary = Vocabulary()
        vocab.load("../testdata/vocabulary.dat")

        self.topic_words_stat = TopicWordsStat(lda_model, vocab)

    def test_save(self):
        """Save the top words using an accumulated-probability threshold of 0.8."""
        outcome = self.topic_words_stat.save(
            "../testdata/topic_top_words.dat", 0.8)
        # Single-argument parenthesised form prints the same on Python 2.
        print(outcome)

    def test_get_topic_top_words(self):
        """Print the top words selected with a threshold of 0.8."""
        top_words = self.topic_words_stat.get_topic_top_words(0.8)
        print(top_words)

    def test_compute_topic_word_distribution(self):
        """Print the computed topic-word distribution."""
        distribution = self.topic_words_stat.compute_topic_word_distribution()
        print(distribution)
示例#12
0
def main(args):
    """Infer and print a topic distribution for each line of args.documents.

    Reads args.model_dir, args.vocabulary, args.num_markov_chains,
    args.total_iterations, args.burn_in_iterations and args.documents.
    Every input line is decoded from GBK, stripped and split on tabs into
    tokens; the decoded line and the inferred distribution are printed.
    """
    model = Model(0)
    model.load(args.model_dir)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary)
    multi_chain_gibbs_sampler = MultiChainGibbsSampler(
        model, vocabulary, args.num_markov_chains, args.total_iterations,
        args.burn_in_iterations)

    # The context manager closes the file even when decoding or inference
    # raises; iterating the handle streams lines instead of reading them all
    # into memory first.
    with open(args.documents, 'r') as fp:
        for doc_str in fp:
            # Python 2: lines are byte strings and must be decoded explicitly.
            doc_str = doc_str.decode('gbk')
            doc_tokens = doc_str.strip().split('\t')
            topic_dist = multi_chain_gibbs_sampler.infer_topics(doc_tokens)
            print(doc_str)
            print(topic_dist)
 def __init__(self, model_name, embed_size=200, attention_dim=300,
              encoder_dim=2048, decoder_dim=300, batch_size=8):
     """Record the model name and size settings and create a Vocabulary.

     All arguments are stored unchanged on attributes of the same name.
     """
     self.model_name = model_name
     self.batch_size = batch_size

     # Size settings.
     self.embed_size = embed_size
     self.attention_dim = attention_dim
     self.encoder_dim = encoder_dim
     self.decoder_dim = decoder_dim

     # Created last, as in the original ordering.
     self.vocab = Vocabulary()
def main(args):
    """Print the inferred topic distribution of every document in args.documents.

    Uses args.model_dir and args.vocabulary to load the model and the
    vocabulary, and args.num_markov_chains, args.total_iterations and
    args.burn_in_iterations to configure the sampler. Input lines are
    GBK-encoded, one document per line, with tab-separated tokens.
    """
    model = Model(0)
    model.load(args.model_dir)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary)
    multi_chain_gibbs_sampler = MultiChainGibbsSampler(model, vocabulary,
                                                       args.num_markov_chains,
                                                       args.total_iterations,
                                                       args.burn_in_iterations)

    # `with` guarantees the handle is closed if infer_topics or decode raises;
    # the file is iterated lazily rather than loaded whole via readlines().
    with open(args.documents, 'r') as documents_file:
        for raw_line in documents_file:
            # Python 2: raw_line is a byte string.
            doc_str = raw_line.decode('gbk')
            doc_tokens = doc_str.strip().split('\t')
            topic_dist = multi_chain_gibbs_sampler.infer_topics(doc_tokens)
            print(doc_str)
            print(topic_dist)
示例#15
0
class TopicWordsStatTest(unittest.TestCase):
    """Print-only smoke tests for TopicWordsStat (no assertions are made)."""

    def setUp(self):
        """Build a TopicWordsStat from the fixture model and vocabulary."""
        self.model = fixture_model = Model(20)
        fixture_model.load('../testdata/lda_model')
        self.vocabulary = fixture_vocabulary = Vocabulary()
        fixture_vocabulary.load('../testdata/vocabulary.dat')

        self.topic_words_stat = TopicWordsStat(fixture_model,
                                               fixture_vocabulary)

    def test_save(self):
        """Write the top words to the testdata file with threshold 0.8."""
        stat = self.topic_words_stat
        # Single-argument parenthesised form prints the same on Python 2.
        print(stat.save('../testdata/topic_top_words.dat', 0.8))

    def test_get_topic_top_words(self):
        """Print the top words obtained with threshold 0.8."""
        stat = self.topic_words_stat
        print(stat.get_topic_top_words(0.8))

    def test_compute_topic_word_distribution(self):
        """Print the topic-word distribution."""
        stat = self.topic_words_stat
        print(stat.compute_topic_word_distribution())
示例#16
0
class ModelEvaluatorTest(unittest.TestCase):
    """Checks model_evaluator.compute_loglikelihood against a reference value."""

    def setUp(self):
        """Load the LDA model (20 topics) and the vocabulary fixtures."""
        self.model = Model(20)
        self.model.load('../testdata/lda_model')
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../testdata/vocabulary.dat')

    def test_compute_loglikelihood(self):
        """Log-likelihood of a duplicated fixed document equals the reference."""
        doc_tokens = ['macbook', 'ipad',  # exist in vocabulary and model
                      'mac os x', 'chrome',  # only exist in vocabulary
                      'nokia', 'null']  # inexistent
        document = Document(self.model.num_topics)
        rand = random.Random()
        # Seeded so that anything parse_from_tokens draws from rand is the
        # same on every run.
        rand.seed(0)
        document.parse_from_tokens(
            doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        actual = model_evaluator.compute_loglikelihood(
            self.model, self.vocabulary, documents)
        # Tolerance-based comparison: exact float equality is brittle across
        # platforms and math-library versions.
        self.assertAlmostEqual(-14.113955684239654, actual)
示例#17
0
def main(args):
    """Train a SparseLDA model by Gibbs sampling, with periodic dumps.

    Reads from args: num_topics, topic_prior, word_prior, vocabulary_file,
    corpus_dir, total_iterations, save_model_interval, model_dir,
    topic_word_accumulated_prob_threshold, save_checkpoint_interval,
    checkpoint_dir and compute_loglikelihood_interval.

    The model (plus its top-words file), the checkpoint and the
    log-likelihood are each handled on the first iteration and then whenever
    the iteration number is a multiple of the corresponding interval.
    """
    model = Model(args.num_topics, args.topic_prior, args.word_prior)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary_file)
    sampler = SparseLDATrainGibbsSampler(model, vocabulary)
    sampler.load_corpus(args.corpus_dir)

    rand = random.Random()

    # Count from 1 so the number can be logged and saved without adjustment.
    for iteration in xrange(1, args.total_iterations + 1):
        logging.info(
            'sparselda trainer, gibbs sampling iteration %d.' % iteration)
        sampler.gibbs_sampling(rand)

        is_first = iteration == 1

        # dump lda model
        if is_first or iteration % args.save_model_interval == 0:
            logging.info('iteration %d start saving lda model.' % iteration)
            sampler.save_model(args.model_dir, iteration)
            top_words_path = '%s/topic_top_words.%d' % (args.model_dir,
                                                        iteration)
            TopicWordsStat(model, vocabulary).save(
                top_words_path, args.topic_word_accumulated_prob_threshold)
            logging.info('iteration %d save lda model ok.' % iteration)

        # dump checkpoint
        if is_first or iteration % args.save_checkpoint_interval == 0:
            logging.info('iteration %d start saving checkpoint.' % iteration)
            sampler.save_checkpoint(args.checkpoint_dir, iteration)
            logging.info('iteration %d save checkpoint ok.' % iteration)

        # compute the loglikelihood
        if is_first or iteration % args.compute_loglikelihood_interval == 0:
            logging.info(
                'iteration %d start computing loglikelihood.' % iteration)
            evaluator = ModelEvaluator(model, vocabulary)
            ll = evaluator.compute_loglikelihood(sampler.documents)
            logging.info(
                'iteration %d loglikelihood is %f.' % (iteration, ll))
示例#18
0
def main(args):
    """Run SparseLDA training and periodically persist and evaluate the model.

    Configuration comes from attributes of args: num_topics, topic_prior,
    word_prior, vocabulary_file, corpus_dir, total_iterations, model_dir,
    checkpoint_dir, topic_word_accumulated_prob_threshold and the three
    *_interval settings. Each periodic action runs on the first pass and
    then every time the 1-based step is a multiple of its interval.
    """
    model = Model(args.num_topics, args.topic_prior, args.word_prior)
    vocabulary = Vocabulary()
    vocabulary.load(args.vocabulary_file)
    trainer = SparseLDATrainGibbsSampler(model, vocabulary)
    trainer.load_corpus(args.corpus_dir)

    rand = random.Random()

    for i in xrange(args.total_iterations):
        step = i + 1  # 1-based number used in log lines and file names
        first_pass = (i == 0)

        logging.info('sparselda trainer, gibbs sampling iteration %d.'
                     % step)
        trainer.gibbs_sampling(rand)

        # dump lda model
        if first_pass or step % args.save_model_interval == 0:
            logging.info('iteration %d start saving lda model.' % step)
            trainer.save_model(args.model_dir, step)
            topic_words_stat = TopicWordsStat(model, vocabulary)
            topic_words_stat.save(
                '%s/topic_top_words.%d' % (args.model_dir, step),
                args.topic_word_accumulated_prob_threshold)
            logging.info('iteration %d save lda model ok.' % step)

        # dump checkpoint
        if first_pass or step % args.save_checkpoint_interval == 0:
            logging.info('iteration %d start saving checkpoint.' % step)
            trainer.save_checkpoint(args.checkpoint_dir, step)
            logging.info('iteration %d save checkpoint ok.' % step)

        # compute the loglikelihood
        if first_pass or step % args.compute_loglikelihood_interval == 0:
            logging.info('iteration %d start computing loglikelihood.' % step)
            loglikelihood = ModelEvaluator(
                model, vocabulary).compute_loglikelihood(trainer.documents)
            logging.info('iteration %d loglikelihood is %f.'
                         % (step, loglikelihood))
    def train(self,
              data,
              labels,
              dev_data=None,
              dev_labels=None,
              symbol_vocabulary_file=None,
              tags_vocabulary_file=None,
              lm_file=None,
              model_file=None,
              save_file=None):
        """
        Fits the tagger on :data: annotated with :labels:

        data: list of lists of sequences, a list of sentences
        labels: list of lists of strs,
            a list of sequences of tags, each tag is a feature-value structure
        dev_data, dev_labels: optional development set; when dev_data is
            given both are transformed and handed to _train_on_data
        symbol_vocabulary_file: if given, passed to vocabulary_from_json
            instead of training the symbol vocabulary on data
        tags_vocabulary_file: if given, a UTF-8 JSON file whose parsed
            content is passed to vocabulary_from_json instead of training
            the tag vocabulary on labels
        lm_file, model_file, save_file: to_json(save_file, model_file,
            lm_file) is called only when both save_file and model_file are
            set; model_file is also forwarded to _train_on_data
        :return: self
        """
        # Symbol vocabulary: load it when a file is supplied, else train it.
        if symbol_vocabulary_file is not None:
            # NOTE(review): the file name itself is passed here, while the
            # tags branch below passes parsed JSON -- confirm that
            # vocabulary_from_json accepts both.
            self.symbols_ = vocabulary_from_json(symbol_vocabulary_file,
                                                 use_features=False)
        else:
            symbol_vocabulary = Vocabulary(character=True,
                                           min_count=self.min_char_count)
            self.symbols_ = symbol_vocabulary.train(data)

        # Tag vocabulary: same choice, but the JSON is parsed first.
        if tags_vocabulary_file is not None:
            with open(tags_vocabulary_file, "r", encoding="utf8") as fin:
                tags_info = json.load(fin)
            self.tags_ = vocabulary_from_json(tags_info, use_features=True)
        else:
            self.tags_ = FeatureVocabulary(character=False).train(labels)

        if self.verbose > 0:
            summary = "{} characters, {} tags".format(self.symbols_number_,
                                                      self.tags_number_)
            print(summary)

        X_train, indexes_by_buckets = self.transform(data,
                                                     labels,
                                                     buckets_number=10)
        if dev_data is None:
            X_dev = dev_indexes_by_buckets = None
        else:
            X_dev, dev_indexes_by_buckets = self.transform(
                dev_data, dev_labels, bucket_size=BUCKET_SIZE)

        self.build()
        # The configuration is written out before training starts.
        if not (save_file is None or model_file is None):
            self.to_json(save_file, model_file, lm_file)
        self._train_on_data(X_train,
                            indexes_by_buckets,
                            X_dev,
                            dev_indexes_by_buckets,
                            model_file=model_file)
        return self
示例#20
0
 def __init__(self,
              embed_size=200,
              attention_dim=300,
              encoder_dim=2048,
              decoder_dim=300,
              batch_size=4,
              sequence_length=405):
     """Store the size settings on the instance and create a Vocabulary.

     Every argument is kept unchanged on the attribute of the same name.
     """
     # Batch settings.
     self.batch_size = batch_size
     self.sequence_length = sequence_length

     # Size settings.
     self.embed_size = embed_size
     self.attention_dim = attention_dim
     self.encoder_dim = encoder_dim
     self.decoder_dim = decoder_dim

     # Created last, as in the original ordering.
     self.vocab = Vocabulary()
示例#21
0
 def setUp(self):
     """Create a trainer sampler over a fresh Model(20) and the vocabulary fixture."""
     self.model = Model(20)
     vocab = self.vocabulary = Vocabulary()
     vocab.load('../testdata/vocabulary.dat')
     self.sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
         self.model, vocab)
 def __init__(self, sequence_length=405, batch_size=4):
     """Store the batch settings and create a Vocabulary, recording its length."""
     self.batch_size = batch_size
     self.sequence_length = sequence_length
     vocab = Vocabulary()
     self.vocab = vocab
     # The vocabulary size is captured once, at construction time.
     self.vocab_size = len(vocab)
 def setUp(self):
     """Set up the model, vocabulary and trainer sampler used by the tests."""
     fresh_model = self.model = Model(20)
     loaded_vocabulary = self.vocabulary = Vocabulary()
     loaded_vocabulary.load('../testdata/vocabulary.dat')
     self.sparselda_train_gibbs_sampler = SparseLDATrainGibbsSampler(
         fresh_model, loaded_vocabulary)