Пример #1
0
    def __doc_proposal(self, doc, token):
        """Run one accept/reject step driven by the document proposal.

        A dart is thrown over ``doc.size() + alpha_sum()``. When it lands
        inside the document part, the candidate topic is copied from the
        document unit at index ``int(dart)``; otherwise the candidate comes
        from ``rand_k(num_topics())``. A candidate that differs from the
        current topic is accepted when a fresh ``rand()`` draw is below

            (proportion_new * proposal_old) / (proportion_old * proposal_new)

        and the current topic is kept otherwise.

        Args:
            doc: An ``LDADoc`` (paired with a ``Token``) or an ``SLDADoc``
                (paired with a ``Sentence``).
            token: The token or sentence whose topic is being resampled.

        Returns:
            The candidate topic if it was accepted (or equals the current
            topic), else the current topic of ``token``. ``None`` when the
            ``doc``/``token`` types match neither supported pairing.
        """
        # The two supported pairings differ only in how a document unit is
        # looked up by index, so pick the accessor once and share the rest.
        if isinstance(doc, LDADoc) and isinstance(token, Token):
            unit_at = doc.token
        elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
            unit_at = doc.sent
        else:
            # Unsupported pairing: the original fell off the end of the
            # function here; keep that result but make it explicit.
            return None

        old_topic = token.topic
        doc_size = doc.size()
        dart = rand() * (doc_size + self.__model.alpha_sum())
        if dart < doc_size:
            new_topic = unit_at(int(dart)).topic
        else:
            new_topic = rand_k(self.__model.num_topics())

        if new_topic == old_topic:
            return new_topic

        proposal_old = self.__doc_proposal_distribution(doc, old_topic)
        proposal_new = self.__doc_proposal_distribution(doc, new_topic)
        proportion_old = self.__proportional_function(doc, token, old_topic)
        proportion_new = self.__proportional_function(doc, token, new_topic)
        transition_prob = float((proportion_new * proposal_old) /
                                (proportion_old * proposal_new))

        # A plain conditional replaces the former `-(bool)` bitmask select:
        # same outcome for integer topics, and it does not depend on unary
        # minus being defined for the comparison result.
        return new_topic if rand() < transition_prob else old_topic
Пример #2
0
    def __sample_sentence(self, doc, sent):
        """Sample a topic for ``sent`` from per-topic unnormalised weights.

        For every topic ``t`` the weight is

            (doc.topic_sum(t) + alpha)
              * prod over words w of
                (word_topic_value(w, t) + beta) / (topic_sum(t) + beta_sum)

        For the sentence's current topic each of these terms is reduced by
        one when it is greater than one. A dart scaled by the total weight is
        then located in the running cumulative sums.

        Args:
            doc: Document providing ``topic_sum(t)``.
            sent: Sentence providing ``topic`` and ``tokens``.

        Returns:
            The index of the sampled topic; ``num_topics - 1`` if the dart
            falls in no interval.
        """
        old_topic = sent.topic
        num_topics = self.__model.num_topics()
        accum_prob = np.zeros(num_topics)
        prob = np.zeros(num_topics)
        sum_ = 0
        for t in range(num_topics):
            dt_alpha = doc.topic_sum(t) + self.__model.alpha()
            t_sum_beta_sum = self.__model.topic_sum(
                t) + self.__model.beta_sum()
            if t == old_topic:
                if dt_alpha > 1:
                    dt_alpha -= 1
                if t_sum_beta_sum > 1:
                    t_sum_beta_sum -= 1
            prob[t] = dt_alpha
            for w in sent.tokens:
                wt_beta = self.__model.word_topic_value(
                    w, t) + self.__model.beta()
                if t == old_topic and wt_beta > 1:
                    wt_beta -= 1
                # NOTE: for a long sentence this product of many small
                # factors can become tiny and lose floating-point accuracy.
                prob[t] *= wt_beta / t_sum_beta_sum
            sum_ += prob[t]
            accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]

        # Scale by the accumulated total `sum_`. The previous code used the
        # builtin `sum` here, which raised TypeError on every call.
        dart = rand() * sum_
        if dart <= accum_prob[0]:
            return 0
        for t in range(1, num_topics):
            if accum_prob[t - 1] < dart <= accum_prob[t]:
                return t
        return num_topics - 1
Пример #3
0
    def __sample_token(self, doc, token):
        """Sample a topic for ``token`` from per-topic unnormalised weights.

        The weight of topic ``k`` is

            (doc.topic_sum(k) + alpha) * (word_topic_value(id, k) + beta)
              / (topic_sum(k) + beta_sum)

        For the token's current topic, and only when the word term exceeds
        one, the word term and the topic term are each reduced by one; the
        document term is reduced by one as well if it exceeds one.

        Args:
            doc: Document providing ``topic_sum(k)``.
            token: Token providing ``topic`` and ``id``.

        Returns:
            The index of the sampled topic; ``num_topics - 1`` if the dart
            falls in no cumulative interval.
        """
        model = self.__model
        current = token.topic
        topic_count = model.num_topics()
        cumulative = np.zeros(topic_count)
        weights = np.zeros(topic_count)
        total = 0
        for k in range(topic_count):
            doc_term = doc.topic_sum(k) + model.alpha()
            word_term = model.word_topic_value(token.id, k) + model.beta()
            topic_term = model.topic_sum(k) + model.beta_sum()
            if k == current and word_term > 1:
                word_term -= 1
                topic_term -= 1
                if doc_term > 1:
                    doc_term -= 1
            weights[k] = doc_term * word_term / topic_term
            total += weights[k]
            if k == 0:
                cumulative[k] = weights[k]
            else:
                cumulative[k] = cumulative[k - 1] + weights[k]

        # Locate the dart in the cumulative sums; default to the last topic.
        dart = rand() * total
        chosen = topic_count - 1
        if dart <= cumulative[0]:
            chosen = 0
        else:
            for k in range(1, topic_count):
                lower = cumulative[k - 1]
                upper = cumulative[k]
                if lower < dart and dart <= upper:
                    chosen = k
                    break
        return chosen
Пример #4
0
 def __propose(self, word_id):
     """Propose a topic for ``word_id``.

     A dart is thrown over ``__prob_sum[word_id] + __beta_prior_sum``. If it
     lands in the word's own part, an index is drawn from the word's alias
     table and mapped through ``__topic_indexes[word_id]``; otherwise the
     topic is drawn from ``__beta_alias``.

     Args:
         word_id: Key into the per-word sums, alias tables and topic indexes.

     Returns:
         The proposed topic.
     """
     word_mass = self.__prob_sum[word_id]
     dart = rand() * (word_mass + self.__beta_prior_sum)
     if dart < word_mass:
         slot = self.__alias_tables[word_id].generate()
         return self.__topic_indexes[word_id][slot]
     return self.__beta_alias.generate()
Пример #5
0
    def __word_proposal(self, doc, token, old_topic):
        """Run accept/reject steps driven by the word proposal.

        For a ``Token`` a single candidate is proposed from ``token.id``. For
        a ``Sentence`` every word id in ``sent.tokens`` proposes a candidate
        in turn; each candidate is tested against the same ``old_topic``, and
        the outcome for the last word is the result. A candidate that differs
        from ``old_topic`` is accepted when a fresh ``rand()`` draw is below

            (proportion_new * proposal_old) / (proportion_old * proposal_new)

        Args:
            doc: An ``LDADoc`` (paired with a ``Token``) or an ``SLDADoc``
                (paired with a ``Sentence``).
            token: The token or sentence whose topic is being resampled.
            old_topic: The topic the candidates are compared against.

        Returns:
            The resulting topic; ``old_topic`` for a sentence without tokens.
            ``None`` when the ``doc``/``token`` types match neither supported
            pairing.
        """
        # A token is handled as a one-word sequence so both pairings share
        # the same accept/reject loop.
        if isinstance(doc, LDADoc) and isinstance(token, Token):
            word_ids = (token.id,)
        elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
            word_ids = token.tokens
        else:
            # Unsupported pairing: the original fell off the end of the
            # function here; keep that result but make it explicit.
            return None

        new_topic = old_topic
        for word_id in word_ids:
            new_topic = self.__propose(word_id)
            if new_topic == old_topic:
                continue
            proposal_old = self.__word_proposal_distribution(
                word_id, old_topic)
            proposal_new = self.__word_proposal_distribution(
                word_id, new_topic)
            proportion_old = self.__proportional_function(
                doc, token, old_topic)
            proportion_new = self.__proportional_function(
                doc, token, new_topic)
            transition_prob = float((proportion_new * proposal_old) /
                                    (proportion_old * proposal_new))
            # A plain conditional replaces the former `-(bool)` bitmask
            # select: same outcome for integer topics, and it does not
            # depend on unary minus being defined for the comparison result.
            new_topic = new_topic if rand() < transition_prob else old_topic
        return new_topic
Пример #6
0
 def generate(self):
     """Generate a sample from the given distribution.

     A column is picked with ``rand_k(self.size())``, then a coin flip
     against ``__prob[column]`` decides between the column itself and its
     alias.

     Returns:
         Either the picked column index or ``__alias[column]``.
     """
     column = rand_k(self.size())
     # Keep the draw as a float. The previous `int(rand())` truncated it,
     # which turns the coin flip into a constant whenever rand() yields a
     # value in [0, 1).
     coin = rand()
     # NOTE(review): assumes __prob[i] holds the probability of keeping
     # column i (the usual alias-table construction), so the column is kept
     # when the coin falls below it. The previous test was `>`; confirm the
     # direction against the table-building code.
     return column if coin < self.__prob[column] else self.__alias[column]