def __word_proposal(self, doc, token, old_topic):
    """Run one accept/reject step driven by the word-proposal distribution.

    A candidate topic is drawn with ``self.__propose``. When it differs from
    ``old_topic`` it is accepted if a uniform draw from ``rand()`` is below

        (proportion_new * proposal_old) / (proportion_old * proposal_new)

    and rejected (falling back to ``old_topic``) otherwise.

    Args:
        doc: Either an ``LDADoc`` (paired with a ``Token``) or an ``SLDADoc``
            (paired with a ``Sentence``).
        token: The ``Token`` or ``Sentence`` whose topic is being resampled.
        old_topic: Topic currently assigned to ``token``.

    Returns:
        The accepted candidate topic, or ``old_topic`` when the candidate is
        rejected. ``None`` when ``doc``/``token`` match neither supported
        pairing.
    """
    if isinstance(doc, LDADoc) and isinstance(token, Token):
        new_topic = self.__propose(token.id)
        if new_topic == old_topic:
            return new_topic
        proposal_old = self.__word_proposal_distribution(token.id, old_topic)
        proposal_new = self.__word_proposal_distribution(token.id, new_topic)
        proportion_old = self.__proportional_function(doc, token, old_topic)
        proportion_new = self.__proportional_function(doc, token, new_topic)
        transition_prob = float((proportion_new * proposal_old) /
                                (proportion_old * proposal_new))
        # Explicit selection instead of the former `-(bool)` bit-mask trick,
        # which raises TypeError as soon as the comparison yields a NumPy
        # boolean rather than a Python bool.
        return new_topic if rand() < transition_prob else old_topic

    if isinstance(doc, SLDADoc) and isinstance(token, Sentence):
        sent = token
        new_topic = old_topic
        # NOTE(review): every iteration starts from a fresh proposal, so only
        # the outcome for the last word of the sentence is returned. This
        # mirrors the previous behaviour; confirm it is intended.
        for word_id in sent.tokens:
            new_topic = self.__propose(word_id)
            if new_topic == old_topic:
                continue
            proportion_old = self.__proportional_function(doc, sent, old_topic)
            proportion_new = self.__proportional_function(doc, sent, new_topic)
            proposal_old = self.__word_proposal_distribution(word_id, old_topic)
            proposal_new = self.__word_proposal_distribution(word_id, new_topic)
            transition_prob = float((proportion_new * proposal_old) /
                                    (proportion_old * proposal_new))
            if not rand() < transition_prob:
                # Candidate rejected: keep the current assignment.
                new_topic = old_topic
        return new_topic

    # Unsupported doc/token pairing: same result as the former implicit
    # fall-through.
    return None
def __doc_proposal(self, doc, token):
    """Run one accept/reject step driven by the doc-proposal distribution.

    The candidate topic is chosen by scaling ``rand()`` by
    ``doc.size() + alpha_sum``: a value below ``doc.size()`` copies the topic
    of the token/sentence at that (truncated) index of ``doc``; otherwise a
    topic index is drawn with ``rand_k(num_topics)``. A candidate different
    from the current topic is accepted if a uniform draw is below

        (proportion_new * proposal_old) / (proportion_old * proposal_new)

    Args:
        doc: Either an ``LDADoc`` (paired with a ``Token``) or an ``SLDADoc``
            (paired with a ``Sentence``).
        token: The ``Token`` or ``Sentence`` whose topic is being resampled;
            its current topic is read from ``token.topic``.

    Returns:
        The accepted candidate topic, or the current topic when the candidate
        is rejected. ``None`` when ``doc``/``token`` match neither supported
        pairing.
    """
    # The two supported pairings differ only in how a sibling unit is fetched
    # from the document, so pick the accessor once and share the rest.
    if isinstance(doc, LDADoc) and isinstance(token, Token):
        unit_at = doc.token
    elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
        unit_at = doc.sent
    else:
        return None

    old_topic = token.topic
    doc_size = doc.size()
    dart = rand() * (doc_size + self.__model.alpha_sum())
    if dart < doc_size:
        new_topic = unit_at(int(dart)).topic
    else:
        new_topic = rand_k(self.__model.num_topics())

    if new_topic == old_topic:
        return new_topic

    # Assumes the two helpers below only read state, so their relative call
    # order does not matter -- TODO confirm.
    proposal_old = self.__doc_proposal_distribution(doc, old_topic)
    proposal_new = self.__doc_proposal_distribution(doc, new_topic)
    proportion_old = self.__proportional_function(doc, token, old_topic)
    proportion_new = self.__proportional_function(doc, token, new_topic)
    transition_prob = float((proportion_new * proposal_old) /
                            (proportion_old * proposal_new))
    # Explicit selection instead of the former `-(bool)` bit-mask trick,
    # which raises TypeError as soon as the comparison yields a NumPy boolean
    # rather than a Python bool.
    return new_topic if rand() < transition_prob else old_topic
def __sample_sentence(self, doc, sent):
    """Sample a topic for ``sent`` from explicitly computed per-topic weights.

    For each topic ``t`` the weight is

        (doc.topic_sum(t) + alpha)
            * prod over words w of (word_topic_value(w, t) + beta)
                                   / (topic_sum(t) + beta_sum)

    For the sentence's current topic, each of the three terms is reduced by
    one when it is greater than 1. A topic is then picked by inverting the
    cumulative weights with a uniform draw.

    Args:
        doc: Document providing ``topic_sum(t)``.
        sent: Sentence providing ``topic`` and ``tokens`` (word ids).

    Returns:
        The sampled topic index in ``[0, num_topics)``.
    """
    old_topic = sent.topic
    num_topics = self.__model.num_topics()
    accum_prob = np.zeros(num_topics)
    prob = np.zeros(num_topics)
    sum_ = 0
    for t in range(num_topics):
        dt_alpha = doc.topic_sum(t) + self.__model.alpha()
        t_sum_beta_sum = self.__model.topic_sum(t) + self.__model.beta_sum()
        if t == old_topic:
            if dt_alpha > 1:
                dt_alpha -= 1
            if t_sum_beta_sum > 1:
                t_sum_beta_sum -= 1
        prob[t] = dt_alpha
        for w in sent.tokens:
            wt_beta = self.__model.word_topic_value(w, t) + self.__model.beta()
            if t == old_topic and wt_beta > 1:
                wt_beta -= 1
            # Note: if the sentence is very long, the product of many small
            # factors becomes tiny and precision is lost.
            prob[t] *= wt_beta / t_sum_beta_sum
        sum_ += prob[t]
        accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]

    # Scale by the accumulated total `sum_`; the previous code multiplied by
    # the builtin `sum`, which raises TypeError.
    dart = rand() * sum_
    if dart <= accum_prob[0]:
        return 0
    for t in range(1, num_topics):
        if accum_prob[t - 1] < dart <= accum_prob[t]:
            return t
    # Fallback when no interval matched the dart.
    return num_topics - 1
def __sample_token(self, doc, token):
    """Sample a topic for ``token`` from explicitly computed per-topic weights.

    The weight of topic ``k`` is

        (doc.topic_sum(k) + alpha) * (word_topic_value(token.id, k) + beta)
            / (topic_sum(k) + beta_sum)

    For the token's current topic, and only when the word term is greater
    than 1, the word and topic terms are reduced by one (the doc term too if
    it is greater than 1). A topic is then picked by inverting the cumulative
    weights with a uniform draw.

    Args:
        doc: Document providing ``topic_sum(k)``.
        token: Token providing ``topic`` and ``id``.

    Returns:
        The sampled topic index in ``[0, num_topics)``.
    """
    current = token.topic
    topic_count = self.__model.num_topics()
    cumulative = np.zeros(topic_count)
    weights = np.zeros(topic_count)
    total = 0
    for k in range(topic_count):
        doc_term = doc.topic_sum(k) + self.__model.alpha()
        word_term = self.__model.word_topic_value(token.id, k) + self.__model.beta()
        topic_term = self.__model.topic_sum(k) + self.__model.beta_sum()
        if k == current and word_term > 1:
            if doc_term > 1:
                doc_term -= 1
            word_term -= 1
            topic_term -= 1
        weights[k] = doc_term * word_term / topic_term
        total += weights[k]
        if k == 0:
            cumulative[k] = weights[k]
        else:
            cumulative[k] = cumulative[k - 1] + weights[k]

    dart = rand() * total
    if dart <= cumulative[0]:
        return 0
    # Walk consecutive (lower, upper] intervals of the cumulative weights.
    for k, (lower, upper) in enumerate(zip(cumulative, cumulative[1:]), start=1):
        if lower < dart <= upper:
            return k
    # Fallback when no interval matched the dart.
    return topic_count - 1
def __propose(self, word_id):
    """Draw a candidate topic for ``word_id``.

    A uniform draw is scaled by ``prob_sum[word_id] + beta_prior_sum``. If it
    lands below ``prob_sum[word_id]``, the word's own alias table is sampled
    and the result is mapped through ``topic_indexes[word_id]``; otherwise
    the shared ``beta_alias`` table is sampled.

    Args:
        word_id: Index of the word into the per-word tables.

    Returns:
        The proposed topic.
    """
    dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
    if dart < self.__prob_sum[word_id]:
        # Per-word table yields a slot that must be translated to a topic.
        slot = self.__alias_tables[word_id].generate()
        return self.__topic_indexes[word_id][slot]
    return self.__beta_alias.generate()
def generate(self):
    """Generate samples from given distribution.

    Picks a column uniformly with ``rand_k(self.size())``, then uses a second
    uniform draw compared against ``self.__prob[column]`` to decide between
    the column itself and ``self.__alias[column]``.

    Returns:
        The sampled index: either the drawn column or its alias entry.
    """
    column = rand_k(self.size())
    # Keep the draw as a float: the previous `int(rand())` truncated a
    # unit-interval value to 0, so the comparison below never depended on
    # the random draw.
    threshold = rand()
    # NOTE(review): a conventional alias table keeps `column` when the draw
    # is *below* prob[column]; the `>` here is the original direction and is
    # left unchanged because the table construction is not visible from this
    # method -- confirm against the initializer.
    return column if threshold > self.__prob[column] else self.__alias[column]