import copy

import numpy as np


def sample(self, n_docs, doc_length, n_topics=None, n_words=None):
    (doc_topic_prior, topic_word_prior) = self._initialize(n_topics, n_words)
    n_topics = len(doc_topic_prior)
    n_words = len(topic_word_prior)

    r = np.random.RandomState(0)

    # sample topic-word distributions
    topic_word = r.dirichlet(topic_word_prior, n_topics)

    # sample doc-topic distributions
    doc_topic = r.dirichlet(doc_topic_prior, n_docs)

    # sample documents themselves
    all_topics = []
    all_words = []
    for d in range(n_docs):
        # draw this document's per-word topic counts in one multinomial,
        # then flatten the counts into a list of topic labels
        topics = flatten_counts(r.multinomial(doc_length, doc_topic[d]))
        words = []
        for topic in topics:
            # sample the word itself from this topic's word distribution
            words.append(categorical(topic_word[topic], r))
        # save this document's topics, words
        all_topics.append(topics)
        all_words.append(words)

    return {
        'topic_word': topic_word,
        'doc_topic': doc_topic,
        'word_topic': all_topics,
        'documents': all_words
    }
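
# The three helpers used in this excerpt -- categorical, flatten_counts, and
# reindex -- are not shown here. The sketches below are assumptions inferred
# from their call sites, not the original implementations.

def categorical(p, r):
    # draw a single index from the discrete distribution p, using RandomState r
    return int(r.choice(len(p), p=p))


def flatten_counts(counts):
    # expand a count vector such as [2, 0, 1] into labels [0, 0, 2]
    return list(np.repeat(np.arange(len(counts)), counts))


def reindex(documents):
    # map each distinct word to an integer id, rewrite each document as a
    # list of ids, and return the documents plus the word -> id mapping
    word_index = {}
    indexed = [
        [word_index.setdefault(word, len(word_index)) for word in doc]
        for doc in documents
    ]
    return (indexed, word_index)
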
def infer(self, documents, n_sweeps=1000, word_topic=None):
    r = np.random.RandomState(0)

    # initialize counts for each doc-topic and topic-word pair using the prior
    (doc_topic_counts, topic_word_counts, word_index) = (
        self._initialize(documents)
    )
    topic_counts = np.sum(topic_word_counts, axis=1)

    n_topics = topic_word_counts.shape[0]
    n_docs = doc_topic_counts.shape[0]
    n_words = len(word_index)

    # transform documents into lists of word indices
    (documents, word_index) = reindex(documents)

    # initialize topics for all words uniformly at random
    if word_topic is None:
        word_topic = [
            [categorical(np.ones(n_topics) / n_topics, r) for word in doc]
            for doc in documents
        ]

    # initialize doc-topic and topic-word counts
    for (d, doc) in enumerate(documents):
        for (i, word) in enumerate(doc):
            # get topic for this word
            t = word_topic[d][i]

            # increment counts
            doc_topic_counts[d, t] += 1
            topic_word_counts[t, word] += 1
            topic_counts[t] += 1

    # resample word topics
    for sweep in range(n_sweeps):
        self.logger.debug('starting sweep #%d' % (sweep,))
        for (d, doc) in enumerate(documents):
            if d % 100 == 0:
                self.logger.debug('starting document #%d' % (d,))
            for (i, word) in enumerate(doc):
                # get topic for this word in this document
                t = word_topic[d][i]

                # remove it from counts
                doc_topic_counts[d, t] -= 1
                topic_word_counts[t, word] -= 1
                topic_counts[t] -= 1

                # calculate P(t | everything else); since the counts were
                # seeded from the prior in _initialize, the Dirichlet
                # pseudo-counts are already folded into these matrices
                prob = [
                    doc_topic_counts[d, t] * topic_word_counts[t, word]
                    / topic_counts[t]
                    for t in range(n_topics)
                ]
                prob = np.array(prob) / np.sum(prob)

                # select topic
                t = categorical(prob, r)

                # increment counts
                doc_topic_counts[d, t] += 1
                topic_word_counts[t, word] += 1
                topic_counts[t] += 1

                # set topic for word
                word_topic[d][i] = t

        # sum of counts along each row
        topic_word_sums = topic_counts[:, np.newaxis]
        doc_topic_sums = np.sum(doc_topic_counts, axis=1)[:, np.newaxis]

        yield {
            'topic_word': np.copy(topic_word_counts) / topic_word_sums,
            'doc_topic': np.copy(doc_topic_counts) / doc_topic_sums,
            'word_topic': copy.deepcopy(word_topic),
            'word_index': word_index
        }
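
# A sketch of how these methods might be driven end to end. The class name
# LDA and its constructor arguments are assumptions about the surrounding
# code, which this excerpt does not show:
#
#   model = LDA(n_topics=10, n_words=1000)           # hypothetical constructor
#   data = model.sample(n_docs=100, doc_length=50)   # synthetic corpus with known parameters
#   for state in model.infer(data['documents'], n_sweeps=500):
#       last = state                                 # keep the estimate from the final sweep
#
# Since infer is a generator that yields once per sweep, last['doc_topic'] and
# last['topic_word'] hold the row-normalized estimates from the final sweep.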