def _compute_posteriors(self, fwdlattice, bwdlattice):
    # gamma is guaranteed to be correctly normalized by logprob at
    # all frames, unless we do approximate inference using pruning.
    # So, we will normalize each frame explicitly in case we
    # pruned too aggressively.
    log_gamma = fwdlattice + bwdlattice
    log_normalize(log_gamma, axis=1)
    return np.exp(log_gamma)
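# Note: the snippets collected here rely on two different log_normalize conventions:
# an in-place, axis-wise variant (as in _compute_posteriors above) and a variant that
# returns the log-normalized array together with the log normalizer (as in the
# compute_rho / doc_e_step snippets below). A minimal sketch of both, assuming plain
# log-sum-exp normalization; the actual utilities in each repo may differ.
import numpy as np
from scipy.special import logsumexp


def log_normalize_inplace(a, axis=None):
    """Subtract the log-sum-exp along `axis` in place so that exp(a) sums to 1."""
    a -= logsumexp(a, axis=axis, keepdims=True)


def log_normalize_rows(v):
    """Return (row-wise log-normalized copy of v, per-row log normalizer)."""
    v = np.atleast_2d(v)
    log_norm = logsumexp(v, axis=1, keepdims=True)
    return v - log_norm, log_norm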
def compute_rho(self, x):
    gamma = self.prior['gamma']
    log_like_x = self.caclulate_log_lik_x(x)
    # collapsed
    E_Nc_minus_n = np.sum(self.rho, 0, keepdims=True) - self.rho
    E_Nc_minus_n_cumsum_geq = np.fliplr(
        np.cumsum(np.fliplr(E_Nc_minus_n), axis=1))
    E_Nc_minus_n_cumsum = E_Nc_minus_n_cumsum_geq - E_Nc_minus_n
    # var_not_i = np.sum(self.rho * (1 - self.rho), 0, keepdims=True) - self.rho * (1 - self.rho)
    # var_not_i_eq_k = np.zeros((self.N, self.T))
    # for t in range(self.T):
    #     if t != 0:
    #         var_not_i_eq_k[:, t] = np.sum(E_Nc_minus_n[:, :t], 1)
    # var_not_i_eq_k = var_not_i_eq_k * E_greater_i
    # rho += (np.log(1 + E_Nc_minus_n) - var_not_i / (2 * ((1 + E_Nc_minus_n) ** 2))) + (
    #     np.log(gamma + E_greater_i) - var_not_i_eq_k / (2 * ((gamma + E_greater_i) ** 2))) + np.log(
    #     1 + gamma + E_Nc_minus_n + E_greater_i)
    first_tem = np.log(1 + E_Nc_minus_n) - np.log(1 + gamma + E_Nc_minus_n_cumsum_geq)
    first_tem[:, self.T - 1] = 0
    dummy = np.log(gamma + E_Nc_minus_n_cumsum) - np.log(1 + gamma + E_Nc_minus_n_cumsum_geq)
    second_term = np.cumsum(dummy, axis=1) - dummy
    rho = log_like_x + (first_tem + second_term)
    log_rho, log_n = log_normalize(rho)
    rho = np.exp(log_rho)
    return rho
def __init__(self, T, K, D, size_vocab, eta, trsz, hdp_hyperparam):
    '''This follows the convention of the HDP paper.

    gamma: first level concentration
    alpha: second level concentration
    eta: the topic Dirichlet
    T: top level truncation level
    K: second level truncation level
    size_vocab: size of vocab
    hdp_hyperparam: the hyperparameter of hdp
    '''
    self.m_hdp_hyperparam = hdp_hyperparam

    self.m_T = T  # higher level truncation
    self.m_K = K  # for now, we assume all the same for the second level truncation
    self.m_size_vocab = size_vocab

    # print "%d %d %d" %(T, size_vocab, D)
    self.m_beta = np.random.gamma(1.0, 1.0, (T, size_vocab)) * D * 100 / (T * size_vocab)
    (log_m_beta, log_norm) = utils.log_normalize(self.m_beta)
    self.m_beta = np.exp(log_m_beta)
    self.save_topics("lambda.txt")

    self.m_eta = eta
    self.m_alpha = hdp_hyperparam.m_alpha_a / hdp_hyperparam.m_alpha_b
    self.m_gamma = hdp_hyperparam.m_gamma_a / hdp_hyperparam.m_gamma_b
    self.m_var_sticks = np.zeros((2, T - 1))
    self.m_var_sticks[0] = 1.0
    self.m_var_sticks[1] = self.m_gamma

    self.r = np.zeros((6, self.m_K))
    self.dmu = np.zeros((trsz, 6))

    # variational posterior parameters for hdp
    self.m_var_gamma_a = hdp_hyperparam.m_gamma_a
    self.m_var_gamma_b = hdp_hyperparam.m_gamma_b
def CStep(self, reg_irls):
    # cluster posterior probabilities p(c_i=g|X)
    self.h_ig = np.exp(utl.log_normalize(self.log_alphag_fg_xij))
    self.MAP()  # c_ig: the hard partition of the curves

    # Compute the optimized criterion
    cig_log_alphag_fg_xij = self.c_ig * self.log_alphag_fg_xij
    self.comp_loglik = sum(cig_log_alphag_fg_xij.sum(axis=1)) + reg_irls
def var_inf_2d(self, x, Elogsticks_1nd, ite):
    Elog_phi = self.caclulate_log_lik_x(x)
    second_max_iter = 2000 if self.second_max_iter == -1 else self.second_max_iter
    lambdas = 1
    self.init_second_params(x.shape[0], x)

    Elogsticks_2nd = self.expect_log_sticks(self.rho, self.T, self.prior['gamma'], self.prior['eta'])
    for i in range(second_max_iter):
        # compute var_theta
        if (i + 1) % (second_max_iter // 10) == 0:
            lambdas -= 0.1

        temp_var_theta = self.var_theta
        self.var_theta = self.rho.T.dot(Elog_phi) + Elogsticks_1nd
        log_var_theta, log_n = log_normalize(self.var_theta)
        self.var_theta = (1 - lambdas) * temp_var_theta + lambdas * np.exp(log_var_theta)

        temp_rho = self.rho
        self.rho = self.var_theta.dot(Elog_phi.T).T + Elogsticks_2nd
        log_rho, log_n = log_normalize(self.rho)
        self.rho = (1 - lambdas) * temp_rho + lambdas * np.exp(log_rho)

        Elogsticks_2nd = self.expect_log_sticks(self.rho, self.T, self.prior['gamma'], self.prior['eta'])

    self.temp_top_stick += np.sum(self.var_theta, 0)
    self.temp_k_ss += np.sum(self.rho.dot(self.var_theta), 0)
    dot_x = np.einsum("ijk, ilk -> ijl", x[:, :, np.newaxis], x[:, :, np.newaxis])
    t_r = np.sum(self.rho[:, :, np.newaxis, np.newaxis] * dot_x[:, np.newaxis], 0)
    self.temp_xi_ss += np.sum(self.var_theta[:, :, np.newaxis, np.newaxis] * t_r[:, np.newaxis], 0)
    return None
def gmm_sample(means, stds, log_pais, num_samples):
    samples = torch.cat([
        gaussian_sample(mean, std, num_samples)[:, np.newaxis, :]
        for mean, std in zip(means, stds)
    ], axis=1)
    # ixs = np.random.choice(k, size=num_samples, p=np.exp(log_weights))
    # weights = log_normalize(log_pais)
    # log_weights = log_normalize(log_pais)
    # print(torch.exp(log_weights))
    # print(log_weights)
    weights = torch.exp(log_normalize(log_pais))
    ixs = torch.multinomial(weights, num_samples, replacement=True)
    # ixs = np.random.choice(2, size=num_samples, p=weights.detach())
    return torch.stack([samples[i, ix, :] for i, ix in enumerate(ixs)])
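# Hedged usage sketch for gmm_sample above. gaussian_sample and log_normalize are
# assumptions standing in for the repo's own helpers (a diagonal-Gaussian sampler and
# a log-space normalizer equivalent to log-softmax); they exist only to show the
# expected shapes, not to reproduce the original implementation.
import numpy as np
import torch


def log_normalize(log_w):
    # log-space normalization over the last dimension
    return log_w - torch.logsumexp(log_w, dim=-1, keepdim=True)


def gaussian_sample(mean, std, num_samples):
    # draw num_samples points from a diagonal Gaussian N(mean, std**2), shape (num_samples, D)
    return mean + std * torch.randn(num_samples, mean.shape[-1])


means = [torch.zeros(2), torch.full((2,), 3.0)]
stds = [torch.ones(2), torch.full((2,), 0.5)]
log_pais = torch.log(torch.tensor([0.3, 0.7]))
x = gmm_sample(means, stds, log_pais, num_samples=5)  # -> shape (5, 2)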
def var_inf(self, x):
    begin = time.time()
    for ite in range(self.args.max_iter):
        # compute rho
        E_log_1_pi = np.roll(np.cumsum(digamma(self.h) - digamma(self.g + self.h)), 1)
        E_log_1_pi[0] = 0
        self.rho = self.caclulate_log_lik_x(x) + digamma(self.g) - digamma(self.g + self.h) + E_log_1_pi
        log_rho, log_n = log_normalize(self.rho)
        self.rho = np.exp(log_rho)

        # compute k
        self.k = self.u / self.v
        self.k[self.k > self.max_k] = self.max_k

        self.update_zeta_xi(x, self.rho)
        self.update_u_v(self.rho)
        self.update_g_h(self.rho)
        print(ite)

        if ite == self.args.max_iter - 1:
            times = time.time() - begin
            logger = open(os.path.join(LOG_DIR, "log_times_0.txt"), 'a')
            logger.write('nyu: times: {}\n'.format(times))
            logger.close()

    self.k = self.u / self.v
    self.k[self.k > self.max_k] = self.max_k
    self.pi = calculate_mix(self.g, self.h, self.T)
    self.calculate_new_com()
    if self.args.verbose:
        print('mu: {}'.format(self.xi))
        print('k: {}'.format(self.k))
        print('pi: {}'.format(self.pi))
        print('times: {}'.format(times))
def __init__(self, T, K, D, size_vocab, eta, trsz, hdp_hyperparam):
    '''This follows the convention of the HDP paper.

    gamma: first level concentration
    alpha: second level concentration
    eta: the topic Dirichlet
    T: top level truncation level
    K: second level truncation level
    size_vocab: size of vocab
    hdp_hyperparam: the hyperparameter of hdp
    '''
    self.m_hdp_hyperparam = hdp_hyperparam

    self.m_T = T  # higher level truncation
    self.m_K = K  # for now, we assume all the same for the second level truncation
    self.m_size_vocab = size_vocab

    # print "%d %d %d" %(T, size_vocab, D)
    self.m_beta = np.random.gamma(1.0, 1.0, (T, size_vocab)) * D * 100 / (T * size_vocab)
    (log_m_beta, log_norm) = utils.log_normalize(self.m_beta)
    self.m_beta = np.exp(log_m_beta)
    self.save_topics("lambda.txt")

    self.m_eta = eta
    self.m_alpha = hdp_hyperparam.m_alpha_a / hdp_hyperparam.m_alpha_b
    self.m_gamma = hdp_hyperparam.m_gamma_a / hdp_hyperparam.m_gamma_b
    self.m_var_sticks = np.zeros((2, T - 1))
    self.m_var_sticks[0] = 1.0
    self.m_var_sticks[1] = self.m_gamma

    self.r = np.zeros((6, self.m_K))
    self.dmu = np.zeros((trsz, 6))

    # variational posterior parameters for hdp
    self.m_var_gamma_a = hdp_hyperparam.m_gamma_a
    self.m_var_gamma_b = hdp_hyperparam.m_gamma_b
def count_ps_from_beta_ref(n, beta):
    log_ws = [
        log_counts_to_cols(count) + (-beta * entropy_from_counts(count))
        for count in enumerate_counts_iter(n)
    ]
    return map(exp, log_normalize(log_ws))
def var_inf_2d(self, x, Elogsticks_1nd, ite):
    D = self.D
    Elog_phi = ((x.dot((self.xi * (self.u / self.v)[:, np.newaxis]).T))
                + (D / 2 - 1) * (digamma(self.u) - np.log(self.v))
                - (D / 2 * np.log(2 * np.pi))
                - (d_besseli(D / 2 - 1, self.k)) * (self.u / self.v - self.k)
                - np.log(iv((D / 2 - 1), self.k) + np.exp(-700)))
    second_max_iter = 5000 if self.second_max_iter == -1 else self.second_max_iter
    self.init_second_params(x.shape[0])

    likelihood = 0.0
    old_likelihood = 1
    converge = 1

    Elogsticks_2nd = self.expect_log_sticks(self.g, self.h, self.T)
    for i in range(second_max_iter):
        # compute var_theta
        self.var_theta = self.rho.T.dot(Elog_phi) + Elogsticks_1nd
        log_var_theta, log_n = log_normalize(self.var_theta)
        self.var_theta = np.exp(log_var_theta)

        self.rho = self.var_theta.dot(Elog_phi.T).T + Elogsticks_2nd
        log_rho, log_n = log_normalize(self.rho)
        self.rho = np.exp(log_rho)

        self.update_g_h(self.rho)
        Elogsticks_2nd = self.expect_log_sticks(self.g, self.h, self.T)

        likelihood = 0.0
        # compute likelihood
        likelihood += np.sum((Elogsticks_1nd - log_var_theta) * self.var_theta)

        v = np.vstack((self.g, self.h))
        log_alpha = np.log(self.prior['gamma'])
        likelihood += (self.T - 1) * log_alpha
        dig_sum = digamma(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.prior['gamma']])[:, np.newaxis] - v) * (digamma(v) - dig_sum))
        likelihood -= np.sum(gammaln(np.sum(v, 0))) - np.sum(gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_rho) * self.rho)

        # X part, the data part
        likelihood += np.sum(self.rho.T * np.dot(self.var_theta, Elog_phi.T))

        if i > 0:
            converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood
        if converge < self.args.threshold:
            break

    self.temp_top_stick += np.sum(self.var_theta, 0)
    self.temp_k_ss += np.sum(self.rho.dot(self.var_theta), 0)
    self.temp_xi_ss += self.var_theta.T.dot(self.rho.T.dot(x))

    if ite == self.args.max_iter - 1:
        self.container['rho'].append(self.rho)
        self.container['var_theta'].append(self.var_theta)
    return likelihood
def doc_e_step(self, doc, ss, Elogbeta, Elogsticks_1st, var_converge, fresh=False):
    Elogbeta_doc = Elogbeta[:, doc.words]
    v = np.zeros((2, self.m_K - 1))

    phi = np.ones((doc.length, self.m_K)) * 1.0 / self.m_K

    # the following line is of no use
    Elogsticks_2nd = expect_log_sticks(v)

    likelihood = 0.0
    old_likelihood = -1e1000
    converge = 1.0
    eps = 1e-100

    iter = 0
    max_iter = 100
    # (TODO): support second level optimization in the future
    while iter < max_iter and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # var_phi
        if iter < 3 and fresh:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T)
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)
        else:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)

        # phi
        if iter < 3:
            phi = np.dot(var_phi, Elogbeta_doc).T
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)
        else:
            phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)

        # v
        phi_all = phi * np.array(doc.counts)[:, np.newaxis]
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))
        Elogsticks_2nd = expect_log_sticks(v)

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        if converge < 0:
            print("warning, likelihood is decreasing!")

        iter += 1

    # update the suff_stat ss
    ss.m_var_sticks_ss += np.sum(var_phi, 0)
    ss.m_var_beta_ss[:, doc.words] += np.dot(var_phi.T, phi.T * doc.counts)

    return likelihood
def doc_e_step(self, doc, ss, Elogsticks_1st, word_list, unique_words, var_converge, max_iter=100):
    """e step for a single doc, update local hidden variables"""

    batchids = [unique_words[id] for id in doc.words]
    Elogbeta_doc = self.m_Elogbeta[:, doc.words]

    ## very similar to the HDP equations
    v = np.zeros((2, self.m_K - 1))  # pi[i] ~ Beta(1, alpha), i = 1, ..., T-1
    v[0] = 1.0
    v[1] = self.m_alpha

    # The following line is of no use.
    Elogsticks_2nd = expect_log_sticks(v)  # Elogsticks_2nd represents document level

    # back to the uniform
    phi = np.ones((len(doc.words), self.m_K)) * 1.0 / self.m_K  # this is phi[d, n, i]

    likelihood = 0.0
    old_likelihood = -1e100
    converge = 1.0
    eps = 1e-100

    iter = 0
    # not yet support second level optimization yet, to be done in the future
    while iter < max_iter and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # var_phi
        # var_phi seems to be zeta
        if iter < 3:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T)
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)
        else:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)

        # phi
        if iter < 3:
            phi = np.dot(var_phi, Elogbeta_doc).T
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)
        else:
            phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)

        # v
        # v seems to be gamma
        phi_all = phi * np.array(doc.counts)[:, np.newaxis]
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))
        Elogsticks_2nd = expect_log_sticks(v)

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        # if converge < -0.000001:
        #     print("warning, likelihood is decreasing!")

        iter += 1

    # update the suff_stat ss
    # this time it only contains information from one doc
    ss.m_var_sticks_ss += np.sum(var_phi, 0)
    ss.m_var_beta_ss[:, batchids] += np.dot(var_phi.T, phi.T * doc.counts)

    return likelihood
def doc_e_step(self, batch_count, doc, ss, Elogsticks_1st, word_list, unique_words, var_converge, max_iter=500):
    """e step for a single doc"""
    # map doc.words (global ids) to this batch's local ids (doc.words stores global ids)
    batchids = [unique_words[id] for id in doc.words]

    Elogbeta_doc = self.m_Elogbeta[:, doc.words]
    ## very similar to the hdp equations
    v = np.zeros((2, self.m_K - 1))
    v[0] = 1.0
    v[1] = self.m_alpha

    # The following line is of no use.
    Elogsticks_2nd = expect_log_sticks(v)

    # back to the uniform
    phi = np.ones((len(doc.words), self.m_K)) * 1.0 / self.m_K
    var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T)  # K x T

    likelihood = 0.0
    old_likelihood = -1e100
    converge = 1.0
    eps = 1e-100

    iter = 0
    # not yet support second level optimization yet, to be done in the future
    while iter < max_iter \
            and (converge < 0.0 or converge > var_converge):
        # print("%s [batch_count = %d] converge = %f" % (getTime(), batch_count, converge))
        # last_var_phi = var_phi
        # last_phi = phi
        # last_v = v.copy()

        ### update variational parameters
        # var_phi: phi in the equations; same update as in the batch HDP
        if iter < 3:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T)
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)
        else:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)

        # phi: zeta in the equations; same update as in the batch HDP
        if iter < 3:
            phi = np.dot(var_phi, Elogbeta_doc).T
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)
        else:
            phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)

        # v
        phi_all = phi * np.array(doc.counts)[:, np.newaxis]
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))
        Elogsticks_2nd = expect_log_sticks(v)

        # meanchange_var_phi = np.mean(abs(var_phi - last_var_phi))
        # meanchange_phi = np.mean(abs(phi - last_phi))
        # meanchange_v = np.mean(abs(v - last_v))
        # print("%s [batch_count = %d iter = %d] meanchange_var_phi = %f, meanchange_phi = %f, meanchange_v = %f"
        #       % (getTime(), batch_count, iter, meanchange_var_phi, meanchange_phi, meanchange_v))
        # if (meanchange_v < 0.001 and meanchange_phi < 0.001 and meanchange_var_phi < 0.001):
        #     break

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        if converge < -0.000001:
            print("%s [batch_count = %d] warning, likelihood is decreasing! old_likelihood = %f new_likelihood = %f"
                  % (getTime(), batch_count, old_likelihood, likelihood))
        old_likelihood = likelihood

        iter += 1

    # update the suff_stat ss
    # this time it only contains information from one doc
    ss.m_var_sticks_ss += np.sum(var_phi, 0)
    ss.m_var_beta_ss[:, batchids] += np.dot(var_phi.T, phi.T * doc.counts)  # T x number of unique words in the mini-batch

    return likelihood
def doc_e_step(self, doc, ss, trlabel, docnum, Elogbeta, Elogsticks_1st, Elogsticks_2nd, var_converge, fresh=False):
    Elogbeta_doc = Elogbeta[:, doc.words]
    v = np.zeros((2, self.m_K - 1))

    phi = np.ones((doc.length, self.m_K)) * 1.0 / self.m_K  # should be zeta

    likelihood = 0.0
    old_likelihood = -1e1000
    converge = 1.0
    eps = 1e-100

    iter = 0
    max_iter = 10
    # (TODO): support second level optimization in the future
    while iter < max_iter:  # and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # smallphi
        var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
        (log_var_phi, log_norm) = utils.log_normalize(var_phi)
        var_phi = np.exp(log_var_phi)

        # phi  # zeta
        sval = np.zeros((1, self.m_K))
        nwords = np.sum(doc.counts)
        tmp = (self.r[trlabel, :] - self.r)
        sval = np.dot(self.dmu[docnum, :], tmp)
        sval = sval / nwords
        sval = 0
        phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd + sval
        (log_phi, log_norm) = utils.log_normalize(phi)
        phi = np.exp(log_phi)

        phi_all = phi * np.array(doc.counts)[:, np.newaxis]

        # local sticks
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)  # a_{nt}
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))  # b_{nt}
        Elogsticks_2nd = expect_log_sticks(v)
        if iter == max_iter - 1:
            self.write_local_sticks(v)

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        if converge < 0:
            print("warning, likelihood is decreasing!")

        iter = iter + 1

    # update the suff_stat ss
    ss.m_var_sticks_ss += np.sum(var_phi, 0)
    ss.m_var_beta_ss[:, doc.words] += np.dot(var_phi.T, phi.T * doc.counts)
    ss.m_var_zeta[docnum, :] = np.sum((phi.T * doc.counts).T, 0)

    return likelihood
def doc_inference(self, doc, docnum, Elogbeta, Elogsticks_1st, var_converge, m_var_zeta):
    Elogbeta_doc = Elogbeta[:, doc.words]
    v = np.zeros((2, self.m_K - 1))

    phi = np.ones((doc.length, self.m_K)) * 1.0 / self.m_K  # should be zeta

    # the following line is of no use
    Elogsticks_2nd = expect_log_sticks(v)

    likelihood = 0.0
    old_likelihood = -1e1000
    converge = 1.0
    eps = 1e-100

    iter = 0
    max_iter = 100
    # (TODO): support second level optimization in the future
    while iter < 20:  # and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # var_phi
        var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
        (log_var_phi, log_norm) = utils.log_normalize(var_phi)
        var_phi = np.exp(log_var_phi)

        # phi  # zeta
        phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd
        (log_phi, log_norm) = utils.log_normalize(phi)
        phi = np.exp(log_phi)

        phi_all = phi * np.array(doc.counts)[:, np.newaxis]

        # local sticks
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)  # a_{jt}
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))  # b_{jt}
        Elogsticks_2nd = expect_log_sticks(v)

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        # if converge < 0:
        #     print("warning, likelihood is decreasing!")

        iter = iter + 1

    m_var_zeta[docnum, :] = np.sum((phi.T * doc.counts).T, 0)

    return (likelihood, m_var_zeta)
def doc_e_step(self, batch_count, doc, ss, Elogsticks_1st, Elogsticks_2nd, word_list, unique_words, var_converge, max_iter=500):
    """e step for a single doc"""
    # map doc.words (global ids) to this batch's local ids (doc.words stores global ids)
    batchids = [unique_words[id] for id in doc.words]

    Elogbeta_doc = self.m_Elogbeta[:, :, doc.words]
    Elogbeta_doc_noise = self.m_Elogbeta_noise[doc.words]

    # Elogtime_doc.shape = (T, K)
    Elogtime_doc = np.array([[scipy.stats.norm.logpdf(doc.time, self.m_mu_t[t][k], self.m_sigma_t[t][k])
                              for k in range(self.m_K)] for t in range(self.m_T)])
    # Eloglocation_doc.shape = (T, K)
    Eloglocation_doc = np.array([[scipy.stats.multivariate_normal.logpdf((doc.latitude, doc.longitude),
                                                                         self.m_mu_l[t][k], self.m_sigma_l[t][k])
                                  for k in range(self.m_K)] for t in range(self.m_T)])

    # initialize x_hat
    # np.shape(x_hat) = (doc.length)
    x_hat = np.ones(doc.length) / 2  # every word starts at 0.5
    x_hat_bar = x_hat  # probability of the post belonging to each node, initialized uniformly so all nodes are equally likely
    # np.shape(phi) = (T, K)
    phi = np.ones((self.m_T, self.m_K)) * 1.0 / self.m_K  # probability of the post belonging to each event
    # np.shape(Elogbeta_doc * doc.counts) = (T, K, N),
    # np.shape(x_hat) = (N,) = (N x 1)
    # np.shape(var_phi) = (T,) = (T x 1)
    var_phi = np.sum(np.dot((Elogbeta_doc * doc.counts), x_hat) * phi, 1)

    likelihood = 0.0
    old_likelihood = -1e100
    converge = 1.0
    eps = 1e-100

    iter = 0
    while iter < max_iter and (converge < 0.0 or converge > var_converge):
        if iter < 3:
            var_phi = np.sum(np.dot((Elogbeta_doc * doc.counts), x_hat) * phi, 1)
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)
        else:
            var_phi = np.sum(np.dot((Elogbeta_doc * doc.counts), x_hat) * phi, 1) + Elogsticks_1st \
                + np.sum(phi * Elogtime_doc, 1) + np.sum(phi * Eloglocation_doc, 1)
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)

        if iter < 3:
            phi = np.dot((Elogbeta_doc * doc.counts), x_hat)
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)
        else:
            phi = np.dot((Elogbeta_doc * doc.counts), x_hat) + Elogsticks_2nd + Eloglocation_doc + Eloglocation_doc
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)

        # update x_hat; a two-way multinomial is used as the approximation here, which differs from the notes
        # first transpose the 3-D array from (T, K, N) to (N, T, K), multiply elementwise by phi,
        # sum over the K dimension, then multiply by var_phi
        x_hat = self.m_Elogx[0] + np.dot(
            np.sum(np.transpose(Elogbeta_doc * doc.counts, (2, 0, 1)) * phi, 2), var_phi)
        x_hat_bar = self.m_Elogx[1] + Elogbeta_doc_noise
        # then stack the two columns, normalize with log_normalize, and get an N x 2 matrix
        (log_x_hat, log_norm) = utils.log_normalize(np.column_stack((x_hat, x_hat_bar)))
        x_hat_final = np.exp(log_x_hat)
        # split the two columns again; they are needed later
        log_x_hat_bar = log_x_hat[:, 1]
        log_x_hat = log_x_hat[:, 0]
        x_hat = x_hat_final[:, 0]
        x_hat_bar = x_hat_final[:, 1]

        likelihood = 0.0
        # compute likelihood
        # add up the likelihood terms for the document-level parameters
        # fifth and seventh terms of the expansion combined,
        # np.shape(Elogsticks_1st) = (T,) np.shape(log_var_phi) = (T,) = np.shape(var_phi)
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)
        # sixth and eighth terms of the expansion combined
        # np.shape(Elogsticks_2nd) = (T, K) np.shape(log_phi) = (T, K) = np.shape(phi)
        likelihood += np.sum(np.sum((Elogsticks_2nd - log_phi) * phi, 1) * var_phi)
        # fourth and ninth terms of the expansion combined
        # np.shape(self.m_Elogx) = (2,) np.shape(log_x_hat) = np.shape(x_hat) = (N,)
        likelihood += np.sum(np.sum(self.m_Elogx[0] - log_x_hat) * x_hat) + np.sum(
            np.sum(self.m_Elogx[1] - log_x_hat_bar) * x_hat_bar)
        # first term of the expansion, split into two parts: words generated by the noise topic and the rest
        # np.shape(Elogbeta_doc) = (T, K, N) np.shape(var_phi) = (T,) np.shape(Elogbeta_doc_noise) = (N,)
        likelihood += np.sum(np.sum(np.dot((Elogbeta_doc * doc.counts), x_hat) * phi, 1) * var_phi) + \
            np.dot((Elogbeta_doc_noise * doc.counts), x_hat_bar)
        # second term of the expansion
        likelihood += np.sum(np.sum(Elogtime_doc * phi, 1) * var_phi)
        # third term of the expansion
        likelihood += np.sum(np.sum(Eloglocation_doc * phi, 1) * var_phi)

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        if converge < -0.000001:
            print("%s [batch_count = %d iter = %d] warning, likelihood is decreasing! old_likelihood = %f new_likelihood = %f" % (
                getTime(), batch_count, iter, old_likelihood, likelihood))
        old_likelihood = likelihood

        iter += 1
        # print("%s [batch_count = %d iter = %d] new_likelihood = %f" % (getTime(), batch_count, iter, likelihood))

    # update the suff_stat ss
    # this time it only contains information from one doc
    st = phi * var_phi[:, np.newaxis]
    ss.m_var_sticks_1st_ss += var_phi
    ss.m_var_sticks_2nd_ss += st
    #
    ss.m_var_beta_ss[:, :, batchids] += np.ones([self.m_T, self.m_K, doc.length]) * x_hat * doc.counts * (
        (phi * var_phi[:, np.newaxis])[:, :, np.newaxis])
    ss.m_var_beta_noise_ss[batchids] += x_hat_bar * doc.counts
    #
    ss.m_var_mu_time_ss_numerator += st * doc.time
    ss.m_var_mu_time_ss_denominator += st
    ss.m_var_sigma_time_ss_numerator += st * pow(doc.time - self.m_mu_t, 2)
    ss.m_var_sigma_time_ss_denominator += st
    # elementwise multiplication broadcasts by aligning trailing dimensions
    ss.m_var_mu_location_ss_numerator += np.ones([self.m_T, self.m_K, 2]) * np.array([doc.latitude, doc.longitude]) * (
        (phi * var_phi[:, np.newaxis])[:, :, np.newaxis])
    ss.m_var_mu_location_ss_denominator += st[:, :, np.newaxis]
    ss.m_var_sigma_location_ss_numerator += np.array([[
        np.array([(pow(doc.latitude - self.m_mu_l[t][k][0], 2),
                   (doc.latitude - self.m_mu_l[t][k][0]) * (doc.longitude - self.m_mu_l[t][k][1])),
                  ((doc.latitude - self.m_mu_l[t][k][0]) * (doc.longitude - self.m_mu_l[t][k][1]),
                   pow(doc.longitude - self.m_mu_l[t][k][1], 2))])
        * phi[t][k] * var_phi[t] for k in range(self.m_K)] for t in range(self.m_T)])
    ss.m_var_sigma_location_ss_denominator += np.array([[
        np.ones((2, 2)) * phi[t][k] * var_phi[t]
        for k in range(self.m_K)] for t in range(self.m_T)])

    return likelihood
def doc_e_step(self, doc, ss, Elogsticks_1st,
               word_list, unique_words, var_converge,
               max_iter=100):
    """e step for a single doc"""

    batchids = [unique_words[id] for id in doc.words]
    Elogbeta_doc = self.m_Elogbeta[:, doc.words]

    ## very similar to the hdp equations
    v = np.zeros((2, self.m_K - 1))
    v[0] = 1.0
    v[1] = self.m_alpha

    # The following line is of no use.
    Elogsticks_2nd = expect_log_sticks(v)

    # back to the uniform
    phi = np.ones((len(doc.words), self.m_K)) * 1.0 / self.m_K

    likelihood = 0.0
    old_likelihood = -1e100
    converge = 1.0
    eps = 1e-100

    iter = 0
    # not yet support second level optimization yet, to be done in the future
    while iter < max_iter and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # var_phi
        if iter < 3:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T)
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)
        else:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)

        # phi
        if iter < 3:
            phi = np.dot(var_phi, Elogbeta_doc).T
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)
        else:
            phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)

        # v
        phi_all = phi * np.array(doc.counts)[:, np.newaxis]
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))
        Elogsticks_2nd = expect_log_sticks(v)

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        if converge < -0.000001:
            print("warning, likelihood is decreasing!")

        iter += 1

    # update the suff_stat ss
    # this time it only contains information from one doc
    ss.m_var_sticks_ss += np.sum(var_phi, 0)
    ss.m_var_beta_ss[:, batchids] += np.dot(var_phi.T, phi.T * doc.counts)

    return likelihood
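# A hedged sketch of the expect_log_sticks helper used throughout the doc_e_step
# snippets: it computes E[log pi_k] = E[log v_k] + sum_{l<k} E[log(1 - v_l)] for a
# truncated stick-breaking construction with v_k ~ Beta(sticks[0, k], sticks[1, k]).
# The repos' actual implementations may differ in detail.
import numpy as np
import scipy.special as sp


def expect_log_sticks(sticks):
    """sticks has shape (2, K-1); returns a length-K vector of E[log pi_k]."""
    dig_sum = sp.psi(np.sum(sticks, 0))
    Elog_v = sp.psi(sticks[0]) - dig_sum      # E[log v_k]
    Elog_1mv = sp.psi(sticks[1]) - dig_sum    # E[log (1 - v_k)]

    K = sticks.shape[1] + 1
    Elogsticks = np.zeros(K)
    Elogsticks[:K - 1] = Elog_v               # the last stick takes whatever remains
    Elogsticks[1:] += np.cumsum(Elog_1mv)
    return Elogsticks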
def doc_e_step(self, count, doc, ss, Elogbeta, Elogsticks_1st, var_converge, fresh=False):
    Elogbeta_doc = Elogbeta[:, doc.words]  # T x doc.length fancy indexing: pull out the parameters for the words in this document

    v = np.zeros((2, self.m_K - 1))  # 2 x (K-1), zeros
    # not in the original code; since Elogsticks_2nd is unused at first, the next two lines could be omitted
    v[0] = 1.0
    v[1] = self.m_alpha

    phi = np.ones((doc.length, self.m_K)) * 1.0 / self.m_K  # doc.length x K, normalized

    # the following line is of no use
    Elogsticks_2nd = expect_log_sticks(v)  # compute E_q[log pi_jt], a K-dimensional vector

    likelihood = 0.0
    # this was originally -1e1000
    old_likelihood = -1e100
    converge = 1.0
    eps = 1e-100

    iter = 0
    max_iter = 100
    # (TODO): support second level optimization in the future
    while iter < max_iter and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # var_phi is actually phi in the equations
        if iter < 3 and fresh:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T)  # the product is a K x T matrix
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)
        else:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)

        # phi: zeta in the equations
        if iter < 3:
            phi = np.dot(var_phi, Elogbeta_doc).T  # the product is a doc.length x K matrix; K and T are swapped relative to the paper
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)
        else:
            phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)

        # v
        phi_all = phi * np.array(doc.counts)[:, np.newaxis]
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)  # update variational parameter a_jt
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))  # update variational parameter b_jt
        Elogsticks_2nd = expect_log_sticks(v)  # K-dimensional

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        # second and fifth terms of the likelihood expansion combined
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)  # a 1 x T array minus a K x T array: the former is broadcast to K x T before subtracting

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        # the Beta-function part of the fourth term expands to (self.m_K - 1) * log_alpha
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        # log pi_jt and log(1 - pi_jt) from the fourth and seventh terms combined, v.shape = (K-1,)
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        # expansion of the Beta function in the seventh term
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        # third and sixth terms of the likelihood expansion combined
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        # if converge < 0:
        #     print("warning, likelihood is decreasing!")
        if converge < -0.000001:
            print("%s [batch_count = %d] warning, likelihood is decreasing! old_likelihood = %f new_likelihood = %f" % (
                getTime(), count, old_likelihood, likelihood))

        iter += 1

    # update the suff_stat ss
    # this prepares the M-step updates of u_k, v_k and lambda_kw
    ss.m_var_sticks_ss += np.sum(var_phi, 0)  # sum column-wise; the result is the sum over j and t of var_phi
    ss.m_var_beta_ss[:, doc.words] += np.dot(var_phi.T, phi.T * doc.counts)  # the term after eta in the lambda update formula

    return likelihood
def doc_e_step(self, doc, ss, Elogbeta, Elogsticks_1st, var_converge, fresh=False):
    Elogbeta_doc = Elogbeta[:, doc.words]
    v = np.zeros((2, self.m_K - 1))

    phi = np.ones((doc.length, self.m_K)) * 1.0 / self.m_K

    # the following line is of no use
    Elogsticks_2nd = expect_log_sticks(v)

    likelihood = 0.0
    old_likelihood = -1e1000
    converge = 1.0
    eps = 1e-100

    iter = 0
    max_iter = 100
    # (TODO): support second level optimization in the future
    while iter < max_iter and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # var_phi
        if iter < 3 and fresh:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T)
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)
        else:
            var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
            (log_var_phi, log_norm) = utils.log_normalize(var_phi)
            var_phi = np.exp(log_var_phi)

        # phi
        if iter < 3:
            phi = np.dot(var_phi, Elogbeta_doc).T
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)
        else:
            phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd
            (log_phi, log_norm) = utils.log_normalize(phi)
            phi = np.exp(log_phi)

        # v
        phi_all = phi * np.array(doc.counts)[:, np.newaxis]
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))
        Elogsticks_2nd = expect_log_sticks(v)

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        if converge < 0:
            print("warning, likelihood is decreasing!")

        iter += 1

    # update the suff_stat ss
    ss.m_var_sticks_ss += np.sum(var_phi, 0)
    ss.m_var_beta_ss[:, doc.words] += np.dot(var_phi.T, phi.T * doc.counts)

    return likelihood
def EStep(self, mixModel, mixParam, phi, variance_type):
    """E-step"""
    for g in range(0, mixModel.G):
        alpha_g = mixParam.alpha_g[g]
        beta_g = mixParam.beta_g[g, :, :]
        # Wg = self.param.Wg[g,:,:]
        pi_jgk = mixParam.pi_jgk[g, :, :]

        log_pijgk_fgk_xij = np.zeros((mixModel.n * mixModel.m, mixModel.K))
        for k in range(0, mixModel.K):
            beta_gk = beta_g[:, k]
            if variance_type == enums.variance_types.common:
                sgk = mixParam.sigma_g[g]
            else:  # ?
                sgk = mixParam.sigma_g[g, k]
            temp = phi.XBeta @ beta_gk
            temp = temp.reshape((len(temp), 1))
            z = ((mixModel.XR - temp) ** 2) / sgk
            # print(sgk)
            temp = np.array([
                np.log(pi_jgk[:, k]) - 0.5 * (np.log(2 * np.pi) + np.log(sgk))
            ]).T - 0.5 * z
            log_pijgk_fgk_xij[:, k] = temp.T  # conditional pdf of xij given c_i = g and z_i = k

        log_pijgk_fgk_xij = np.minimum(log_pijgk_fgk_xij, np.log(sys.float_info.max))
        log_pijgk_fgk_xij = np.maximum(log_pijgk_fgk_xij, np.log(sys.float_info.min))

        pijgk_fgk_xij = np.exp(log_pijgk_fgk_xij)
        sumk_pijgk_fgk_xij = np.array([pijgk_fgk_xij.sum(axis=1)]).T  # sum over k
        log_sumk_pijgk_fgk_xij = np.log(sumk_pijgk_fgk_xij)  # [nxm x 1]

        self.log_tau_ijgk[g, :, :] = log_pijgk_fgk_xij - log_sumk_pijgk_fgk_xij @ np.ones((1, mixModel.K))
        self.tau_ijgk[g, :, :] = np.exp(utl.log_normalize(self.log_tau_ijgk[g, :, :]))

        temp = np.reshape(log_sumk_pijgk_fgk_xij.T, (mixModel.n, mixModel.m))
        # [n x 1]: sum over j=1,...,m: fg_xij = prod_j sum_k pi_{jgk} N(x_{ij}, mu_{gk}, s_{gk})
        self.log_fg_xij[:, g] = temp.sum(axis=1)
        self.log_alphag_fg_xij[:, g] = np.log(alpha_g) + self.log_fg_xij[:, g]  # [nxg]

    self.log_alphag_fg_xij = np.minimum(self.log_alphag_fg_xij, np.log(sys.float_info.max))
    self.log_alphag_fg_xij = np.maximum(self.log_alphag_fg_xij, np.log(sys.float_info.min))

    # cluster posterior probabilities p(c_i=g|X)
    self.h_ig = np.exp(utl.log_normalize(self.log_alphag_fg_xij))
    # log-likelihood
    temp = np.exp(self.log_alphag_fg_xij)
    self.loglik = sum(np.log(temp.sum(axis=1)))
def log_gmm(x, means, stds, log_pais):
    component_log_densities = torch.stack([log_gaussian(x, mu, std) for (mu, std) in zip(means, stds)]).T
    # log_weights = torch.log(pais)
    log_weights = log_normalize(log_pais)
    return torch.logsumexp(component_log_densities + log_weights, axis=-1, keepdims=False)
def doc_e_step(self, doc, ss, trlabel, docnum, Elogbeta, Elogsticks_1st, Elogsticks_2nd, var_converge, fresh=False):
    Elogbeta_doc = Elogbeta[:, doc.words]
    v = np.zeros((2, self.m_K - 1))

    phi = np.ones((doc.length, self.m_K)) * 1.0 / self.m_K  # should be zeta

    likelihood = 0.0
    old_likelihood = -1e1000
    converge = 1.0
    eps = 1e-100

    iter = 0
    max_iter = 10
    # (TODO): support second level optimization in the future
    while iter < max_iter:  # and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # smallphi
        var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
        (log_var_phi, log_norm) = utils.log_normalize(var_phi)
        var_phi = np.exp(log_var_phi)

        # phi  # zeta
        sval = np.zeros((1, self.m_K))
        nwords = np.sum(doc.counts)
        tmp = (self.r[trlabel, :] - self.r)
        sval = np.dot(self.dmu[docnum, :], tmp)
        sval = sval / nwords
        sval = 0
        phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd + sval
        (log_phi, log_norm) = utils.log_normalize(phi)
        phi = np.exp(log_phi)

        phi_all = phi * np.array(doc.counts)[:, np.newaxis]

        # local sticks
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)  # a_{nt}
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))  # b_{nt}
        Elogsticks_2nd = expect_log_sticks(v)
        if iter == max_iter - 1:
            self.write_local_sticks(v)

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        if converge < 0:
            print("warning, likelihood is decreasing!")

        iter = iter + 1

    # update the suff_stat ss
    ss.m_var_sticks_ss += np.sum(var_phi, 0)
    ss.m_var_beta_ss[:, doc.words] += np.dot(var_phi.T, phi.T * doc.counts)
    ss.m_var_zeta[docnum, :] = np.sum((phi.T * doc.counts).T, 0)

    return likelihood
def doc_inference(self, doc, docnum, Elogbeta, Elogsticks_1st, var_converge, m_var_zeta):
    Elogbeta_doc = Elogbeta[:, doc.words]
    v = np.zeros((2, self.m_K - 1))

    phi = np.ones((doc.length, self.m_K)) * 1.0 / self.m_K  # should be zeta

    # the following line is of no use
    Elogsticks_2nd = expect_log_sticks(v)

    likelihood = 0.0
    old_likelihood = -1e1000
    converge = 1.0
    eps = 1e-100

    iter = 0
    max_iter = 100
    # (TODO): support second level optimization in the future
    while iter < 20:  # and (converge < 0.0 or converge > var_converge):
        ### update variational parameters
        # var_phi
        var_phi = np.dot(phi.T, (Elogbeta_doc * doc.counts).T) + Elogsticks_1st
        (log_var_phi, log_norm) = utils.log_normalize(var_phi)
        var_phi = np.exp(log_var_phi)

        # phi  # zeta
        phi = np.dot(var_phi, Elogbeta_doc).T + Elogsticks_2nd
        (log_phi, log_norm) = utils.log_normalize(phi)
        phi = np.exp(log_phi)

        phi_all = phi * np.array(doc.counts)[:, np.newaxis]

        # local sticks
        v[0] = 1.0 + np.sum(phi_all[:, :self.m_K - 1], 0)  # a_{jt}
        phi_cum = np.flipud(np.sum(phi_all[:, 1:], 0))
        v[1] = self.m_alpha + np.flipud(np.cumsum(phi_cum))  # b_{jt}
        Elogsticks_2nd = expect_log_sticks(v)

        likelihood = 0.0
        # compute likelihood
        # var_phi part/ C in john's notation
        likelihood += np.sum((Elogsticks_1st - log_var_phi) * var_phi)

        # v part/ v in john's notation, john's beta is alpha here
        log_alpha = np.log(self.m_alpha)
        likelihood += (self.m_K - 1) * log_alpha
        dig_sum = sp.psi(np.sum(v, 0))
        likelihood += np.sum((np.array([1.0, self.m_alpha])[:, np.newaxis] - v) * (sp.psi(v) - dig_sum))
        likelihood -= np.sum(sp.gammaln(np.sum(v, 0))) - np.sum(sp.gammaln(v))

        # Z part
        likelihood += np.sum((Elogsticks_2nd - log_phi) * phi)

        # X part, the data part
        likelihood += np.sum(phi.T * np.dot(var_phi, Elogbeta_doc * doc.counts))

        converge = (likelihood - old_likelihood) / abs(old_likelihood)
        old_likelihood = likelihood

        # if converge < 0:
        #     print("warning, likelihood is decreasing!")

        iter = iter + 1

    m_var_zeta[docnum, :] = np.sum((phi.T * doc.counts).T, 0)

    return (likelihood, m_var_zeta)