Example #1
File: lda.py  Project: jperla/happynews
    def initialize(self, K):
        """Accepts K number of topics in document.
            Initializes all of the hidden variable arrays now that it knows dimensions
            of topics, vocabulary, etc.
        """
        assert self.documents is not None

        # require more documents than topics
        # so that the problem is not singular
        assert self.D > K

        self.K = K

        D = self.D
        W = self.W

        # "it suffices to fix alpha to uniform 1/K"
        # initialize to ones so that the topics are more evenly distributed
        # good for small datasets
        self.alpha = np.ones((K,)) * (3.0 / K)

        # Initialize the variational distribution q(beta|lambda)
        self.beta = topiclib.initialize_beta(K, W)

        document_Nds = self.num_words_per(self.documents)
        self.phi = [(np.ones((document_Nds[d], K))*(1.0/K)) for d in xrange(D)]

        self.gamma = np.ones((D, K)) * (1.0 / K)
        graphlib.initialize_random(self.gamma)

        self.is_initialized = True
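
`topiclib.initialize_beta` is a project-local helper whose source is not shown here, but the test in Example #2 below pins down its contract: it returns a K x W matrix whose rows are probability distributions over the vocabulary. A minimal sketch consistent with that contract (the gamma-noise choice is an assumption, not the project's actual code):

import numpy as np

def initialize_beta_sketch(K, W):
    """Hypothetical stand-in for topiclib.initialize_beta."""
    beta = np.random.gamma(1.0, 1.0, (K, W))        # positive random weights
    return beta / beta.sum(axis=1, keepdims=True)   # each row sums to 1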
Example #2
def test_initialize_beta():
    out = lm.initialize_beta(3, 4)
    assert out.shape == (3,4)

    sumrows = np.sum(out, axis=1)
    assert same(sumrows, np.ones(out.shape[0]))

    # test the log version
    out = lm.initialize_log_beta(3, 4)
    assert out.shape == (3,4)

    sumrows = lm.logsumexp(out, axis=1)
    assert same(np.exp(sumrows), np.ones(out.shape[0]))
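
`same`, `lm.initialize_log_beta`, and `lm.logsumexp` are project-local; assuming `same` is an allclose-style comparison, the invariant this test checks can be reproduced with scipy (a sketch under those assumptions, not the project's code):

import numpy as np
from scipy.special import logsumexp

def same(a, b):
    # hypothetical equivalent of the project's `same` helper
    return np.allclose(a, b)

beta = np.random.gamma(1.0, 1.0, (3, 4))
beta /= beta.sum(axis=1, keepdims=True)    # rows are distributions
log_beta = np.log(beta)

# if each row of beta sums to 1, logsumexp of each log-row is 0
assert same(logsumexp(log_beta, axis=1), np.zeros(3))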
Example #3
def zbar(z):
    """Normalizes z to be a probability of choosing each document.
        Just divide by sum.
        Returns that.
    """
    z = np.array(z)
    return 1.0 * z / np.sum(z)

# figure out the ratings of the documents
muU = [np.dot(eta, zbar(zC[d])) for d in xrange(D)]
muL = [np.dot(eta, zbar(zL[l])) for l in xrange(L)]

yU = [np.random.normal(muU[d], np.sqrt(sigma_squared)) for d in xrange(D)]
yL = [np.random.normal(muL[l], np.sqrt(sigma_squared)) for l in xrange(L)]

# and finally, generate the word distribution beta and actual words
beta = topiclib.initialize_beta(K, W)

from itertools import repeat

def itertopics(zvals):
    """Accepts a list of z values (ints: entry t is the number of times topic t is repeated).
        Yields each topic index the appropriate number of times,
        i.e. sum(zvals) values in total.
    """
    for t, n in enumerate(zvals):
        for topic in repeat(t, n):
            yield topic

def make_doc(topics, beta):
    return np.sum([np.random.multinomial(1, beta[k]) for k in itertopics(topics)], axis=0)

documents = [make_doc(zD[d], beta) for d in xrange(D)]
comments = [make_doc(zC[d], beta) for d in xrange(D)]
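
A quick sanity check of the two helpers above, with toy numbers (not from the project); it reuses the itertopics/make_doc definitions and assumes numpy is imported as np:

# toy beta: 3 topics over a 4-word vocabulary, rows sum to 1
beta_toy = np.array([[0.7, 0.1, 0.1, 0.1],
                     [0.1, 0.7, 0.1, 0.1],
                     [0.1, 0.1, 0.7, 0.1]])

assert list(itertopics([2, 0, 1])) == [0, 0, 2]  # topic 0 twice, topic 2 once

doc = make_doc([2, 0, 1], beta_toy)
assert doc.shape == (4,)   # one count per vocabulary word
assert doc.sum() == 3      # one multinomial draw per word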
Example #4
File: tlc.py  Project: jperla/happynews
    def initialize(self, Ku, Ks, Kb):
        """Accepts K number of topics in document.
            Initializes all of the hidden variable arrays now that it knows dimensions
            of topics, vocabulary, etc.
        """
        assert self.documents is not None
        assert Ku is not None
        assert Ks is not None
        assert Kb is not None

        K = Ku + Ks + Kb

        # require more documents than topics
        # so that the problem is not singular
        assert self.D > K

        self.K = K
        self.Ku = Ku
        self.Ks = Ks
        self.Kb = Kb

        self.Kc = self.Ku + self.Ks
        self.Kl = self.Ks + self.Kb

        W = self.W

        # Initialize the variational distribution q(beta|lambda)
        self.beta = topiclib.initialize_beta(K, W)

        # "it suffices to fix alpha to uniform 1/K"
        # initialize to ones so that the topics are more evenly distributed
        # good for small datasets
        self.alphaU = np.ones((Ku,)) * (1.0 / Ku)
        self.alphaS = np.ones((Ks,)) * (1.0 / Ks)
        self.alphaB = np.ones((Kb,)) * (1.0 / Kb)

        # todo: not using this yet
        #self.alphaD = ...
        
        def uniform_phi(Nds, size):
            D = len(Nds)
            return [(np.ones((Nds[d], size)) * (1.0 / size)) for d in xrange(D)]

        document_Nds = self.num_words_per(self.documents)
        self.phiD = uniform_phi(document_Nds, self.Ku)
        comment_Nds = self.num_words_per(self.comments)
        self.phiC = uniform_phi(comment_Nds, self.Kc)
        labeled_Nds = self.num_words_per(self.labeled)
        self.phiL = uniform_phi(labeled_Nds, self.Kl)
        background_Nds = self.num_words_per(self.background)
        self.phiB = uniform_phi(background_Nds, self.Kb)

        self.num_document_words = sum(document_Nds)
        self.num_comment_words = sum(comment_Nds)
        self.num_labeled_words = sum(labeled_Nds)
        self.num_background_words = sum(background_Nds)

        biggest = float(max(self.num_document_words, self.num_comment_words,
                      self.num_labeled_words, self.num_background_words))
        self.document_multiplier = biggest / self.num_document_words
        self.comment_multiplier = biggest / self.num_comment_words
        self.labeled_multiplier = biggest / self.num_labeled_words
        self.background_multiplier = biggest / self.num_background_words

        self.gammaD = np.ones((self.D, self.Ku)) * (1.0 / self.Ku)
        self.gammaC = np.ones((self.D, self.Kc)) * (1.0 / self.Kc)
        self.gammaL = np.ones((self.L, self.Kl)) * (1.0 / self.Kl)
        self.gammaB = np.ones((self.B, self.Kb)) * (1.0 / self.Kb)
        graphlib.initialize_random(self.gammaD)
        graphlib.initialize_random(self.gammaC)
        graphlib.initialize_random(self.gammaL)
        graphlib.initialize_random(self.gammaB)

        self.eta = graphlib.random_normal(0, 2.0, (Ks,))
        self.sigma_squared = 0.5

        print 'eta start: {0}'.format(self.eta)

        self.is_initialized = True
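
The four *_multiplier values rescale each corpus's contribution so that corpora of very different sizes carry comparable weight in the objective. A toy illustration of the arithmetic (the word counts are invented):

num_words = {'document': 10000, 'comment': 2500,
             'labeled': 500, 'background': 20000}
biggest = float(max(num_words.values()))

multipliers = dict((name, biggest / n) for name, n in num_words.items())
# -> {'document': 2.0, 'comment': 8.0, 'labeled': 40.0, 'background': 1.0}
# multiplier * word count equals `biggest` for every corpus, so each of
# the four corpora contributes equally despite their different sizes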