Example #1
File: lda.py  Project: duckworthd/Topics
    def sample(self, n_docs, doc_length, n_topics=None, n_words=None):
        (doc_topic_prior,
         topic_word_prior) = self._initialize(n_topics, n_words)
        n_topics = len(doc_topic_prior)
        n_words = len(topic_word_prior)
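        # fixed seed: repeated calls to sample() return identical draws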
        r = np.random.RandomState(0)

        # sample topic-word distributions
        topic_word = r.dirichlet(topic_word_prior, n_topics)

        # sample doc-topic distributions
        doc_topic = r.dirichlet(doc_topic_prior, n_docs)

        # sample documents themselves
        all_topics = []
        all_words = []
        for d in range(n_docs):
            topics = flatten_counts(r.multinomial(doc_length, doc_topic[d]))
            words = []
            for topic in topics:
                # the topic for this word was already drawn above; here we
                # sample the word itself from that topic's word distribution
                words.append(categorical(topic_word[topic], r))

            # save this document's topics, words
            all_topics.append(topics)
            all_words.append(words)

        return {
            'topic_word': topic_word,
            'doc_topic': doc_topic,
            'word_topic': all_topics,
            'documents': all_words
        }
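
The examples on this page rely on two helpers defined elsewhere in lda.py that the excerpts do not show: flatten_counts and categorical. A minimal sketch of plausible implementations, assuming flatten_counts expands a multinomial count vector into a flat list of indices and categorical draws one index from a probability vector; the names come from the excerpts, but the bodies below are reconstructions, not the project's code:

import numpy as np

def flatten_counts(counts):
    # Expand a count vector such as [2, 0, 1] into the index list [0, 0, 2].
    return [i for (i, c) in enumerate(counts) for _ in range(int(c))]

def categorical(p, r):
    # Draw a single index from the probability vector p using RandomState r.
    return int(r.choice(len(p), p=p))

With these two helpers and numpy in scope, the sample method above runs as part of its class.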
Example #2
File: lda.py  Project: duckworthd/Topics
    def infer(self, documents, n_sweeps=1000, word_topic=None):
        r = np.random.RandomState(0)

        # initialize counts for each doc-topic and topic-word pair using the prior
        (doc_topic_counts, topic_word_counts,
         word_index) = self._initialize(documents)
        topic_counts = np.sum(topic_word_counts, axis=1)
        n_topics = topic_word_counts.shape[0]
        n_docs = doc_topic_counts.shape[0]
        n_words = len(word_index)

        # transform documents into lists of word indices
        (documents, word_index) = reindex(documents)

        # initialize topics for all words uniformly at random
        if word_topic is None:
            word_topic = [[
                categorical(np.ones(n_topics) / n_topics, r) for word in doc
            ] for doc in documents]

        # initialize doc-topic and topic-word counts
        for (d, doc) in enumerate(documents):
            for (i, word) in enumerate(doc):
                # get topic for this word
                t = word_topic[d][i]

                # increment counts
                doc_topic_counts[d, t] += 1
                topic_word_counts[t, word] += 1
                topic_counts[t] += 1

        # resample word topics
        for sweep in range(n_sweeps):
            self.logger.debug('starting sweep #%d' % (sweep, ))
            for (d, doc) in enumerate(documents):

                if d % 100 == 0:
                    self.logger.debug('starting document #%d' % (d, ))

                for (i, word) in enumerate(doc):
                    # get topic for this word in this document
                    t = word_topic[d][i]

                    # remove it from counts
                    doc_topic_counts[d, t] -= 1
                    topic_word_counts[t, word] -= 1
                    topic_counts[t] -= 1

                    # calculate P(t | everything else); the Dirichlet priors
                    # were folded into the counts by _initialize, so this is
                    # the collapsed Gibbs conditional
                    prob = [
                        doc_topic_counts[d, k] * topic_word_counts[k, word] /
                        topic_counts[k] for k in range(n_topics)
                    ]
                    prob = np.array(prob) / np.sum(prob)

                    # select topic
                    t = categorical(prob, r)

                    # increment counts
                    doc_topic_counts[d, t] += 1
                    topic_word_counts[t, word] += 1
                    topic_counts[t] += 1

                    # set topic for word
                    word_topic[d][i] = t

            # per-topic and per-document totals as column vectors, used to
            # normalize the counts into distributions
            topic_word_sums = topic_counts[:, np.newaxis]
            doc_topic_sums = np.sum(doc_topic_counts, axis=1)[:, np.newaxis]

            yield {
                'topic_word': np.copy(topic_word_counts) / topic_word_sums,
                'doc_topic': np.copy(doc_topic_counts) / doc_topic_sums,
                'word_topic': copy.deepcopy(word_topic),
                'word_index': word_index
            }
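
infer is a generator: it yields a snapshot of the estimated distributions after every sweep, so callers can watch the collapsed Gibbs sampler converge rather than waiting for all n_sweeps to finish. A minimal driver sketch; the class name LDA and its constructor arguments are assumptions for illustration, not taken from the excerpt:

docs = [['apple', 'banana', 'apple'], ['banana', 'cherry', 'cherry']]
model = LDA(n_topics=2)  # hypothetical constructor
for (sweep, state) in enumerate(model.infer(docs, n_sweeps=100)):
    if sweep % 20 == 0:
        # state['doc_topic'][d] is the current topic mixture for document d
        print(sweep, state['doc_topic'])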