Example #1
    def init_model(self, n_samples=15):
        """
        initializes the LDA2EC model

        :return:
        """

        # number of documents
        n_docs = self.doc_ids.max() + 1

        # Number of unique words in the vocabulary
        self.n_vocab = self.flattened.max() + 1

        # Get the string representation for every compact key
        self.words = self.corpus.word_list(self.vocab)[:self.n_vocab]

        # How many tokens are in each document
        doc_idx, lengths = np.unique(self.doc_ids, return_counts=True)
        self.doc_lengths = np.zeros(self.doc_ids.max() + 1, dtype='int32')
        self.doc_lengths[doc_idx] = lengths

        # Count all token frequencies
        tok_idx, freq = np.unique(self.flattened, return_counts=True)
        self.term_frequency = np.zeros(self.n_vocab, dtype='int32')
        self.term_frequency[tok_idx] = freq

        self.model = LDA2Vec(n_documents=n_docs, n_document_topics=self.n_topics,
                        n_units=self.n_units, n_vocab=self.n_vocab, counts=self.term_frequency,
                        n_samples=n_samples, power=self.power, temperature=self.temperature)
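The init_model code above (and most of the examples below) relies on one NumPy indexing pattern: np.unique with return_counts=True, scattered back into a dense zero array. A standalone toy run of that pattern, with made-up document ids and word ids, looks like this:

import numpy as np

# Hypothetical inputs: 5 tokens spread over documents 0 and 2
doc_ids = np.array([0, 0, 0, 2, 2], dtype='int32')
flattened = np.array([5, 1, 5, 5, 3], dtype='int32')

# Tokens per document; document 1 simply stays at zero
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths        # -> [3, 0, 2]

# Frequency of every compact word id
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(flattened.max() + 1, dtype='int32')
term_frequency[tok_idx] = freq        # -> [0, 1, 0, 1, 0, 3]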
Example #2
# How many tokens are in each document
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq

for key in sorted(locals().keys()):
    val = locals()[key]
    if len(str(val)) < 100 and '<' not in str(val):
        six.print_(key, val)

model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15, power=power, temperature=temperature)
if os.path.exists('lda2vec.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
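Example #2 stops right after the Adam/GradientClipping setup; the fraction variable it computes is only used inside the training loop that follows in the lda2vec example scripts. A minimal sketch of that loop is below; utils.chunks, model.fit_partial and model.prior are assumed to come from the lda2vec example code and are not defined in the snippet above.

for epoch in range(5000):
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        optimizer.zero_grads()
        # word-prediction loss for this minibatch of (document id, word id) pairs
        l = model.fit_partial(d.copy(), f.copy())
        # Dirichlet prior on the document-topic weights, scaled by `fraction`
        # so that one full pass over the corpus counts the prior once
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        j += 1
    serializers.save_hdf5("lda2vec.hdf5", model)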
Example #3
# Number of topics to fit
n_topics = 20
batchsize = 4096
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]
# How many tokens are in each document
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq

model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15)
if os.path.exists('lda2vec.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
Example #4
sty_len[sty_idx] = lengths

# How many tokens are in each author
aut_idx, lengths = np.unique(author_id, return_counts=True)
aut_len = np.zeros(aut_idx.max() + 1, dtype='int32')
aut_len[aut_idx] = lengths

# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq

model = LDA2Vec(n_stories=n_stories,
                n_story_topics=n_story_topics,
                n_authors=n_authors,
                n_author_topics=n_author_topics,
                n_units=n_units,
                n_vocab=n_vocab,
                counts=term_frequency,
                n_samples=15)
if os.path.exists('lda2vec.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
Example #5
# Number of unique words in the vocabulary
n_vocab = flattened.max() + 1
# Number of dimensions in a single word vector
n_units = 256
# 'Strength' of the Dirichlet prior; 200.0 seems to work well
clambda = 200.0
# Number of topics to fit
n_story_topics = 40
n_author_topics = 20
batchsize = 4096 * 2
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = LDA2Vec(n_stories, n_story_topics, n_authors, n_author_topics,
                n_units=n_units, n_vocab=n_vocab, counts=counts,
                n_samples=7)
if os.path.exists('lda2vec_hn.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("lda2vec_hn.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    print "Story topics"
    w = cuda.to_cpu(model.mixture_stories.weights.W.data).copy()
    f = cuda.to_cpu(model.mixture_stories.factors.W.data).copy()
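Example #5 ends after pulling the story-topic weights and factors back to the CPU. One illustrative way to inspect those factors is to rank vocabulary words by cosine similarity against each topic vector; the sketch below assumes the word vectors live in model.sampler.W (as in the pretrained-vector assignment of Example #2) and that words holds the string vocabulary built earlier in this example.

    wv = cuda.to_cpu(model.sampler.W.data).copy()          # (n_vocab, n_units) word vectors
    wv_norm = wv / (np.linalg.norm(wv, axis=1, keepdims=True) + 1e-8)
    f_norm = f / (np.linalg.norm(f, axis=1, keepdims=True) + 1e-8)
    similarity = f_norm.dot(wv_norm.T)                     # (n_story_topics, n_vocab)
    for topic, scores in enumerate(similarity):
        top = np.argsort(-scores)[:10]
        print("topic %d: %s" % (topic, ' '.join(words[i] for i in top)))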
Example #6
n_docs = doc_ids.max() + 1
# Number of unique words in the vocabulary
n_vocab = flattened.max() + 1
# Number of dimensions in a single word vector
n_units = 256
# 'Strength' of the Dirichlet prior; 200.0 seems to work well
clambda = 200.0
# Number of topics to fit
n_topics = 20
batchsize = 4096 * 8
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=counts,
                n_samples=15)
if os.path.exists('lda2vec.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.embed.W.data).copy(),
Example #7
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths

# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq

for key in sorted(locals().keys()):
    val = locals()[key]
    if len(str(val)) < 100 and '<' not in str(val):
        print(key, val)

model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15, power=power, temperature=temperature, vocab=words,
                docu_initialW=doc_weights_init)
if os.path.exists('lda2vec.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)

if pretrained:
    logger.info('Use pre-trained Google word2vec')
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
    np.nan_to_num(model.sampler.W.data, copy=False)

if gpu_id >= 0:
    model.to_gpu()
else:
    model.to_cpu()
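Example #7 only ever loads lda2vec.hdf5; for the "Reloading from saved" branch to have anything to reload on the next run, the model has to be written back at some point, for instance at the end of each training epoch. Chainer's save_hdf5 is the counterpart of the load_hdf5 call used throughout these examples:

# Hypothetical checkpoint step at the end of an epoch
serializers.save_hdf5("lda2vec.hdf5", model)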