import os
import math
import random
import logging

import numpy as np
import cPickle
from scipy import stats
from sklearn.covariance import LedoitWolf

# project-local modules (assumed to ship with this repo)
import math_utli
import preprocess

logger = logging.getLogger(__name__)


def maximization(self):
    '''
    M-step of the EM algorithm; uses scikit-learn's LedoitWolf estimator
    to perform covariance matrix shrinkage.

    Arguments:
        the sufficient statistics accumulated on self during the E-step

    Returns:
        nothing; the updated model parameters are stored on self
    '''
    logger.info("running maximization function")

    logger.info("mean maximization")
    mu = np.divide(self.mu, self.ndata)

    logger.info("covariance maximization")
    for i in range(self._K):
        for j in range(self._K):
            # the whole expression is scaled by 1/ndata, not just cov[i, j]
            self.cov[i, j] = (1.0 / self.ndata) * \
                (self.cov[i, j] + self.ndata * mu[i] * mu[j]
                 - self.mu[i] * mu[j] - self.mu[j] * mu[i])
    self.mu = mu  # store the updated model mean once the covariance update is done

    logger.info("performing covariance shrinkage using sklearn module")
    # assume_centered is a constructor argument of LedoitWolf, not a fit() argument
    lw = LedoitWolf(assume_centered=True)
    cov_result = lw.fit(self.cov).covariance_
    self.inv_cov = np.linalg.inv(cov_result)
    self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

    logger.info("topic maximization")
    for i in range(self._K):
        # normalize each topic over the vocabulary, in log space
        sum_m = np.sum(self.beta[i, :])
        if sum_m == 0:
            sum_m = -1000 * self._W
        else:
            sum_m = np.log(sum_m)
        for j in range(self._W):
            # the log normalizer is subtracted outside safe_log
            self.log_beta[i, j] = math_utli.safe_log(self.beta[i, j]) - sum_m

    logger.info("write model parameters to file")
    logger.info("write gaussian")
    with open('ctm_mu', 'wb') as ctm_mu_dump:
        cPickle.dump(self.mu, ctm_mu_dump)
    with open('ctm_cov', 'wb') as ctm_cov_dump:
        cPickle.dump(self.cov, ctm_cov_dump)
    with open('ctm_inv_cov', 'wb') as ctm_inv_cov_dump:
        cPickle.dump(self.inv_cov, ctm_inv_cov_dump)
    with open('ctm_log_det_inv_cov', 'wb') as ctm_log_det_inv_cov_dump:
        cPickle.dump(self.log_det_inv_cov, ctm_log_det_inv_cov_dump)

    logger.info("write topic matrix")
    with open('ctm_log_beta', 'wb') as ctm_log_beta_dump:
        cPickle.dump(self.log_beta, ctm_log_beta_dump)
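
# A minimal, self-contained sketch of the LedoitWolf shrinkage step used above
# (the random data is illustrative only; the estimator expects samples in
# (n_samples, n_features) layout):
def _demo_ledoit_wolf_shrinkage():
    import numpy as np
    from sklearn.covariance import LedoitWolf

    rng = np.random.RandomState(0)
    samples = rng.randn(50, 5)  # 50 illustrative samples of a 5-d variable

    lw = LedoitWolf(assume_centered=True)
    shrunk_cov = lw.fit(samples).covariance_  # (5, 5) shrunk covariance estimate

    # the shrunk estimate is well-conditioned, so inversion is safe
    return np.linalg.inv(shrunk_cov)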
def sample_term(self, eta, lambda_v, nu_v, obs_wordidsd, obs_wordctsd):
    '''
    Importance sampling of the likelihood based on the variational posterior.

    Arguments:
        eta : natural parameter of the logistic normal distribution
        theta : mean parameter of the logistic normal distribution
        The mapping between them is equation 3 in the paper:
            eta[i] = log(theta[i] / theta[K])

    Returns:
        the value of log p(w | eta) - log q(eta)
    '''
    t1 = 0.5 * self.log_det_inv_cov
    t1 += -0.5 * self._K * 1.837877  # 1.837877 is the natural logarithm of 2*pi
    for i in range(self._K):
        for j in range(self._K):
            t1 -= 0.5 * (eta[i] - self.mu[i]) * self.inv_cov[i, j] * (eta[j] - self.mu[j])

    # compute theta: the softmax of eta
    theta = np.exp(eta)
    theta = np.divide(theta, np.sum(theta))

    # compute word probabilities; log_beta columns are indexed by word id
    nterms = len(obs_wordidsd)
    for n in range(nterms):
        word_term = 0
        for i in range(self._K):
            word_term += theta[i] * np.exp(self.log_beta[i, obs_wordidsd[n]])
        t1 += obs_wordctsd[n] * math_utli.safe_log(word_term)

    # log q(eta | lambda, nu): a product of independent Gaussians
    t2 = 0
    for i in range(self._K):
        t2 += stats.norm.logpdf(eta[i] - lambda_v[i], loc=0, scale=np.sqrt(nu_v[i]))
    return t1 - t2
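
# The eta -> theta mapping used in sample_term is a softmax; a quick,
# self-contained sanity check with illustrative values:
def _demo_eta_theta_mapping():
    import numpy as np

    eta = np.array([0.2, -1.0, 0.5])           # natural parameters (illustrative)
    theta = np.exp(eta) / np.sum(np.exp(eta))  # mean parameters, sums to 1

    # inverse mapping from the paper: eta[i] = log(theta[i] / theta[K-1]);
    # recovers eta up to an additive constant (here shifted so eta[K-1] = 0)
    eta_back = np.log(theta / theta[-1])
    return theta, eta_back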
def lhood_bnd(self, d, phi_v, log_phi_v, lambda_v, nu_v, zeta_v):
    '''
    Compute the likelihood bound given the variational parameters.

    Arguments:
        d : index of the current working document
        phi_v, log_phi_v, lambda_v, nu_v, zeta_v : variational parameters

    Returns:
        the likelihood bound
    '''
    logger.info("calculating likelihood bound")
    # E[log p(\eta | \mu, \Sigma)] + H(q(\eta | \lambda, \nu))
    lhood = 0.5 * self.log_det_inv_cov + 0.5 * self._K
    for i in range(self._K):
        v = -0.5 * nu_v[i] * self.inv_cov[i, i]
        for j in range(self._K):
            v -= 0.5 * (lambda_v[i] - self.mu[i]) * self.inv_cov[i, j] * (lambda_v[j] - self.mu[j])
        v += 0.5 * math_utli.safe_log(nu_v[i])
        lhood += v

    # E[log p(z_n | \eta)] + E[log p(w_n | \beta)] + H(q(z_n | \phi_n))
    # Equation 7 in the paper: the first-order upper bound, scaled by the
    # total word count of document d
    sum_exp = np.sum(np.exp(lambda_v + 0.5 * nu_v))
    bound = (1.0 / zeta_v) * sum_exp - 1.0 + math_utli.safe_log(zeta_v)
    lhood -= bound * np.sum(self.wordcts[d])

    ntermd = len(self.wordcts[d])
    for i in range(self._K):
        for j in range(ntermd):
            if phi_v[i, j] > 0:
                # log_beta columns are indexed by word id, not by term position
                lhood += self.wordcts[d][j] * phi_v[i, j] * \
                    (lambda_v[i] + self.log_beta[i][self.wordids[d][j]] - log_phi_v[i, j])
    return lhood
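
# The `bound` term in lhood_bnd is Equation 7's first-order upper bound:
# E[log sum_i exp(eta_i)] <= (1/zeta) * sum_i exp(lambda_i + nu_i/2) - 1 + log(zeta).
# A self-contained numeric check (illustrative values); at the optimal
# zeta = sum_i exp(lambda_i + nu_i/2) the bound collapses to the log of that sum:
def _demo_zeta_bound():
    import numpy as np

    lambda_v = np.array([0.1, -0.3, 0.7])  # illustrative variational means
    nu_v = np.array([0.5, 0.2, 0.4])       # illustrative variational variances

    # E[exp(eta_i)] under q is exp(lambda_i + nu_i / 2) (log-normal mean)
    sum_exp = np.sum(np.exp(lambda_v + 0.5 * nu_v))

    zeta = sum_exp  # the tightest choice of zeta
    bound = sum_exp / zeta - 1.0 + np.log(zeta)
    return bound, np.log(sum_exp)  # identical at the optimal zeta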
def log_mult_prob(self, held_wordctsd, e_theta):
    '''
    Log probability of a document under proportions theta and topics beta;
    used to calculate the probability of held-out data.
    '''
    val = 0
    nterms = len(held_wordctsd)
    for i in range(nterms):
        # caveat from the original author: the vocabulary size here should be
        # that of the held-out data, and log_beta should be re-initialized
        # accordingly rather than reused from training as self.log_beta
        term_prob = 0
        for k in range(self._K):
            term_prob += e_theta[k] * np.exp(self.log_beta[k, i])
        val += math_utli.safe_log(term_prob) * held_wordctsd[i]
    return val
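
# Hedged usage sketch for held-out evaluation (this driver and the
# per-word normalization are assumptions, not part of this module):
def _demo_heldout_per_word_ll(model, held_wordidsd, held_wordctsd, lambda_v, nu_v):
    e_theta = model.expected_theta(held_wordidsd, held_wordctsd, lambda_v, nu_v)
    ll = model.log_mult_prob(held_wordctsd, e_theta)
    return ll / float(sum(held_wordctsd))  # per-word log likelihood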
def f_lambda(self, sum_phi, phi_v, lambda_v, nu_v, zeta_v):
    '''
    Objective for the variational parameter lambda, negated for a minimizer.
    '''
    # compute lambda^T * \sum_n phi_n
    term1 = np.dot(lambda_v, sum_phi)

    # compute -(1/2) (lambda - mu)^T Sigma^-1 (lambda - mu)
    temp1 = np.subtract(lambda_v, self.mu)
    term2 = -0.5 * np.dot(temp1, np.dot(self.inv_cov, temp1))

    # zeta bound term; its multiplier is the document's total word count,
    # which equals the sum of the phi sufficient statistics
    term3 = 0
    for i in range(self._K):
        term3 += np.exp(lambda_v[i] + 0.5 * nu_v[i])
    term3 = -((1.0 / zeta_v) * term3 - 1.0 + math_utli.safe_log(zeta_v)) * np.sum(sum_phi)

    return -(term1 + term2 + term3)
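
# f_lambda returns the negated objective, which suggests it is meant for a
# generic minimizer. A hedged sketch of wiring it into scipy.optimize (this
# driver is an assumption, not part of this module):
def _demo_optimize_lambda(model, sum_phi, phi_v, nu_v, zeta_v):
    import numpy as np
    from scipy.optimize import minimize

    def objective(lam):
        return model.f_lambda(sum_phi, phi_v, lam, nu_v, zeta_v)

    # gradient-free BFGS via numerical differentiation; fine for a sketch
    res = minimize(objective, x0=np.zeros(model._K), method='BFGS')
    return res.x  # updated variational mean for eta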
def opt_nu(self, lambda_v, zeta_v):
    logger.info("calculating variational parameter NU")
    # optimize each nu[i] by Newton's method on log(nu[i]), which keeps nu positive
    nu_v = 10 * np.ones(self._K)
    log_nu_v = np.log(nu_v)
    for i in range(self._K):
        df = 1.0  # enter the loop at least once (the original C code is a do-while)
        while np.fabs(df) > 1e-10:
            nu_v[i] = np.exp(log_nu_v[i])
            if math.isnan(nu_v[i]):
                nu_v[i] = 20
                log_nu_v[i] = math_utli.safe_log(nu_v[i])
            df = -0.5 * self.inv_cov[i, i] \
                - (0.5 * self._W / zeta_v) * np.exp(lambda_v[i] + nu_v[i] / 2) \
                + 0.5 * (1.0 / nu_v[i])
            d2f = -0.25 * (self._W / zeta_v) * np.exp(lambda_v[i] + nu_v[i] / 2) \
                - 0.5 * (1.0 / (nu_v[i] * nu_v[i]))
            log_nu_v[i] -= (df * nu_v[i]) / (d2f * nu_v[i] * nu_v[i] + df * nu_v[i])
    nu_v = np.exp(log_nu_v)
    return nu_v
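
# The update in opt_nu is Newton's method on log(nu). The same pattern on a
# toy scalar objective, for reference (illustrative objective, not the CTM
# one): minimize f(nu) = nu - log(nu), whose optimum is nu = 1.
def _demo_newton_in_log_space():
    import numpy as np

    log_nu = np.log(10.0)
    df = 1.0
    while abs(df) > 1e-10:
        nu = np.exp(log_nu)
        df = 1.0 - 1.0 / nu    # f'(nu)
        d2f = 1.0 / (nu * nu)  # f''(nu)
        # the chain rule gives the same update form as in opt_nu
        log_nu -= (df * nu) / (d2f * nu * nu + df * nu)
    return np.exp(log_nu)  # ~1.0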
def expected_theta(self, obs_wordidsd, obs_wordctsd, lambda_v, nu_v):
    '''
    Return the expected theta under the variational distribution.

    Arguments:
        self : uses the model parameters initialized before
        lambda_v : variational parameter lambda
        nu_v : variational parameter nu

    Returns:
        e_theta : the expected theta
    '''
    nsamples = 100
    eta = np.zeros(self._K)

    # initialize e_theta (a log-space accumulator)
    e_theta = -1.0 * np.ones(self._K)

    # for each sample
    for n in range(nsamples):
        # sample eta from q(\eta)
        for i in range(self._K):
            eta[i] = random.gauss(0, np.sqrt(nu_v[i])) + lambda_v[i]

        # compute log p(w | \eta) - log q(\eta)
        log_prob = self.sample_term(eta, lambda_v, nu_v, obs_wordidsd, obs_wordctsd)

        # compute theta: the softmax of eta
        theta = np.exp(eta)
        theta = np.divide(theta, np.sum(theta))

        # update e_theta in log space
        for i in range(self._K):
            e_theta[i] = math_utli.log_sum(e_theta[i], log_prob + math_utli.safe_log(theta[i]))

    # normalize e_theta and set the return vector
    sum_et = -1.0
    for i in range(self._K):
        e_theta[i] -= np.log(nsamples)
        sum_et = math_utli.log_sum(sum_et, e_theta[i])
    e_theta = np.exp(np.subtract(e_theta, sum_et))
    return e_theta
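
# expected_theta accumulates importance weights with log_sum(log_a, log_b)
# = log(a + b). A stand-in with the standard stable form, assuming
# math_utli.log_sum behaves the same way:
def _demo_log_sum():
    import numpy as np

    def log_sum(log_a, log_b):
        # log(exp(log_a) + exp(log_b)) without underflow
        if log_a < log_b:
            log_a, log_b = log_b, log_a
        return log_a + np.log1p(np.exp(log_b - log_a))

    acc = -1000.0  # log of a tiny probability
    for lp in (-1001.0, -999.5):
        acc = log_sum(acc, lp)
    return acc  # log(e^-1000 + e^-1001 + e^-999.5), representable in log space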
def __init__(self, K, mu=None, cov=None):
    '''
    Arguments:
        K : number of topics
        D : total number of documents in the population; for a fixed
            corpus, this is the size of the corpus
        mu, cov : hyperparameters of the logistic normal prior on the
            topic-proportion vectors theta
    '''
    logger.info("CTM commence.")
    logger.info("Initializing...")
    if K is None:
        raise ValueError('number of topics has to be specified.')

    # get the folder containing all the training files; the observed and
    # heldout folders have to be specified manually
    path = 'd:\\mycode\\ctm_python\\state_union\\observed\\'
    obs_filenames = os.listdir(path)

    logger.info("initializing id mapping from corpus, assuming identity")
    # read all the file contents into a list of strings
    txt_corpus = []
    for thefile in obs_filenames:
        with open(path + thefile, "rb") as f:
            txt_corpus.append(f.read())

    (dictionary, corpus) = preprocess.get_dict_and_corp(txt_corpus)
    logger.info("dictionary and corpus are generated")

    self.dictionary = dictionary
    self.corpus = corpus
    self._K = K  # number of topics
    logger.info("There are %i topics.", self._K)
    self._W = len(dictionary)  # number of terms in the vocabulary
    self._D = len(corpus)  # number of documents

    # initialize word id and word count lists for the whole corpus
    self.wordids = list()
    self.wordcts = list()
    for d, doc in enumerate(self.corpus):
        wordidsd = [id for id, _ in doc]
        wordctsd = np.array([cnt for _, cnt in doc])
        self.wordids.append(wordidsd)
        self.wordcts.append(wordctsd)

    # mu : K-vector initialized to 0; cov : K*K matrix initialized to 1;
    # together they define the Gaussian prior
    if mu is None:
        self.mu = np.zeros(self._K)
    else:
        self.mu = mu
    if cov is None:
        self.cov = np.ones((self._K, self._K))
    else:
        self.cov = cov

    # if cov keeps its default initialization it is singular and cannot be
    # inverted; fall back to cov itself until the first M-step replaces it
    try:
        self.inv_cov = np.linalg.inv(self.cov)
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in err.message:
            self.inv_cov = self.cov
        else:
            raise
    self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

    self.ndata = 0  # cumulative count of documents processed

    # initialize the topic distributions, i.e. self.log_beta
    self.beta = np.zeros([self._K, self._W])
    self.log_beta = np.zeros([self._K, self._W])
    for i in range(self._K):
        # seed each topic with the counts of a randomly chosen document
        doc_no = np.random.randint(self._D)
        nterms = len(self.wordcts[doc_no])
        for j in range(nterms):
            word_index = self.wordids[doc_no][j]
            self.log_beta[i][word_index] = self.wordcts[doc_no][j]

    # smooth every entry with 1.0 plus a little uniform noise
    for m in range(self._K):
        for n in range(self._W):
            self.log_beta[m][n] = self.log_beta[m][n] + 1.0 + np.random.ranf()

    # normalize each topic over the vocabulary and move to log space
    for m in range(self._K):
        row_sum = math_utli.safe_log(np.sum(self.log_beta[m, :]))
        for n in range(self._W):
            self.log_beta[m][n] = math_utli.safe_log(self.log_beta[m][n]) - row_sum
    logger.info("log_beta initialized and normalized")

    logger.info("initialization finished.")
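
# Hypothetical driver, assuming the enclosing class is named CTM and the
# hardcoded corpus path in __init__ exists on this machine:
def _demo_init(ctm_class):
    model = ctm_class(K=20)     # reads and preprocesses the observed corpus
    return model._W, model._D   # vocabulary size and number of documents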