def initialize(self, K):
    """Accepts K, the number of topics per document.

    Initializes all of the hidden variable arrays now that it knows the
    dimensions of topics, vocabulary, etc.
    """
    assert self.documents is not None

    # require more documents than topics so that the problem is not singular
    assert self.D > K

    self.K = K
    D = self.D
    W = self.W

    # "it suffices to fix alpha to uniform 1/K"
    # initialize alpha to a uniform value so that the topics start out
    # evenly distributed; good for small datasets
    self.alpha = np.ones((K,)) * (3.0 / K)

    # Initialize the variational distribution q(beta|lambda)
    self.beta = topiclib.initialize_beta(K, W)

    # phi: one uniform (Nd x K) topic-assignment matrix per document
    document_Nds = self.num_words_per(self.documents)
    self.phi = [(np.ones((document_Nds[d], K)) * (1.0 / K)) for d in xrange(D)]

    # gamma: variational topic proportions, randomly initialized
    self.gamma = np.ones((D, K)) * (1.0 / K)
    graphlib.initialize_random(self.gamma)

    self.is_initialized = True
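# A self-contained illustration of the uniform phi initialization above.
# The document lengths and K below are made up; only the shape/uniformity
# logic mirrors the code.
import numpy as np

K = 4
document_Nds = [7, 3, 12]                  # words per document (assumed)
phi = [np.ones((Nd, K)) * (1.0 / K) for Nd in document_Nds]
# every word position starts with a uniform distribution over the K topics
assert all(np.allclose(p.sum(axis=1), 1.0) for p in phi)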
def test_initialize_beta():
    out = lm.initialize_beta(3, 4)
    assert out.shape == (3, 4)
    sumrows = np.sum(out, axis=1)
    assert same(sumrows, np.ones(out.shape[0]))

    # test the log version
    out = lm.initialize_log_beta(3, 4)
    assert out.shape == (3, 4)
    sumrows = lm.logsumexp(out, axis=1)
    assert same(np.exp(sumrows), np.ones(out.shape[0]))
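# A sketch of one way initialize_beta could satisfy the tests above (an
# assumption about its behavior, not necessarily how lm/topiclib implement
# it): draw strictly positive random values and normalize each topic's row
# into a distribution over the W vocabulary words.
import numpy as np

def initialize_beta_sketch(K, W):
    beta = np.random.random((K, W)) + 1e-3         # strictly positive entries
    return beta / beta.sum(axis=1)[:, np.newaxis]  # each row sums to 1

def initialize_log_beta_sketch(K, W):
    # log-space version: each row's logsumexp is 0, i.e. exp(row) sums to 1
    return np.log(initialize_beta_sketch(K, W))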
    Normalizes them to be a probability of choosing each document.
    Just divide by the sum. Returns that.
    """
    z = np.array(z)
    return 1.0 * z / np.sum(z)

# figure out the ratings of the documents
muU = [np.dot(eta, zbar(zC[d])) for d in xrange(D)]
muL = [np.dot(eta, zbar(zL[l])) for l in xrange(L)]
yU = [np.random.normal(muU[d], np.sqrt(sigma_squared)) for d in xrange(D)]
yL = [np.random.normal(muL[l], np.sqrt(sigma_squared)) for l in xrange(L)]

# and finally, generate the word distribution beta and the actual words
beta = topiclib.initialize_beta(K, W)

from itertools import repeat

def itertopics(zvals):
    """Accepts a list of z values (ints: the number of times topic t is used).

    Returns a generator which repeats each topic index t the appropriate
    number of times, so it yields sum(zvals) values in total.
    """
    for t, n in enumerate(zvals):
        for topic in repeat(t, n):
            yield topic

def make_doc(topics, beta):
    return np.sum([np.random.multinomial(1, beta[k]) for k in itertopics(topics)],
                  axis=0)

documents = [make_doc(zD[d], beta) for d in xrange(D)]
comments = [make_doc(zC[d], beta) for d in xrange(D)]
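# A tiny worked example of the expansion itertopics performs (counts are
# made up): zvals = [2, 0, 1] means topic 0 appears twice, topic 1 never,
# and topic 2 once, so make_doc would draw 3 words into a length-W count
# vector.
assert list(itertopics([2, 0, 1])) == [0, 0, 2]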
def initialize(self, Ku, Ks, Kb):
    """Accepts the sizes of the three topic blocks: Ku, Ks, and Kb.

    Initializes all of the hidden variable arrays now that it knows the
    dimensions of topics, vocabulary, etc.
    """
    assert self.documents is not None
    assert Ku is not None
    assert Ks is not None
    assert Kb is not None

    K = Ku + Ks + Kb

    # require more documents than topics so that the problem is not singular
    assert self.D > K

    self.K = K
    self.Ku = Ku
    self.Ks = Ks
    self.Kb = Kb
    self.Kc = self.Ku + self.Ks
    self.Kl = self.Ks + self.Kb

    W = self.W

    # Initialize the variational distribution q(beta|lambda)
    self.beta = topiclib.initialize_beta(K, W)

    # "it suffices to fix alpha to uniform 1/K"
    # initialize alpha uniformly so that the topics start out evenly
    # distributed; good for small datasets
    self.alphaU = np.ones((Ku,)) * (1.0 / Ku)
    self.alphaS = np.ones((Ks,)) * (1.0 / Ks)
    self.alphaB = np.ones((Kb,)) * (1.0 / Kb)
    # todo: not using this yet
    #self.alphaD = ...

    def uniform_phi(Nds, size):
        D = len(Nds)
        return [(np.ones((Nds[d], size)) * (1.0 / size)) for d in xrange(D)]

    document_Nds = self.num_words_per(self.documents)
    self.phiD = uniform_phi(document_Nds, self.Ku)

    comment_Nds = self.num_words_per(self.comments)
    self.phiC = uniform_phi(comment_Nds, self.Kc)

    labeled_Nds = self.num_words_per(self.labeled)
    self.phiL = uniform_phi(labeled_Nds, self.Kl)

    background_Nds = self.num_words_per(self.background)
    self.phiB = uniform_phi(background_Nds, self.Kb)

    self.num_document_words = sum(document_Nds)
    self.num_comment_words = sum(comment_Nds)
    self.num_labeled_words = sum(labeled_Nds)
    self.num_background_words = sum(background_Nds)

    # scale each corpus so that all four contribute comparably
    biggest = float(max(self.num_document_words,
                        self.num_comment_words,
                        self.num_labeled_words,
                        self.num_background_words))
    self.document_multiplier = biggest / self.num_document_words
    self.comment_multiplier = biggest / self.num_comment_words
    self.labeled_multiplier = biggest / self.num_labeled_words
    self.background_multiplier = biggest / self.num_background_words

    self.gammaD = np.ones((self.D, self.Ku)) * (1.0 / self.Ku)
    self.gammaC = np.ones((self.D, self.Kc)) * (1.0 / self.Kc)
    self.gammaL = np.ones((self.L, self.Kl)) * (1.0 / self.Kl)
    self.gammaB = np.ones((self.B, self.Kb)) * (1.0 / self.Kb)
    graphlib.initialize_random(self.gammaD)
    graphlib.initialize_random(self.gammaC)
    graphlib.initialize_random(self.gammaL)
    graphlib.initialize_random(self.gammaB)

    self.eta = graphlib.random_normal(0, 2.0, (Ks,))
    self.sigma_squared = 0.5

    print 'eta start: {0}'.format(self.eta)

    self.is_initialized = True
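# Worked example of the topic-block bookkeeping above (sizes are made up):
# beta holds all K = Ku + Ks + Kb topics, while comments get phi/gamma over
# Kc = Ku + Ks topics and labeled documents over Kl = Ks + Kb topics.
Ku, Ks, Kb = 5, 3, 2
K = Ku + Ks + Kb     # 10 topics total in beta
Kc = Ku + Ks         # 8 topics for comments
Kl = Ks + Kb         # 5 topics for labeled documents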