文件: ctm.py 项目: zhibo-work/code
	def maximization(self):
		'''
		M-step of the EM algorithm.

		Re-estimates the covariance, its inverse and the log topic matrix from
		the statistics held on the instance, uses scikit-learn's LedoitWolf
		estimator to perform covariance matrix shrinkage, and then pickles the
		resulting parameters to files in the current working directory.

		Arguments:
			none; reads self.mu, self.cov, self.beta, self.nu and self.ndata,
			which serve as the sufficient statistics, i.e. model parameters.

		Returns:
			None. The updated statistics all live on self (self.cov,
			self.inv_cov, self.log_det_inv_cov and self.log_beta).

		Raises:
			ZeroDivisionError: if self.ndata is a plain Python 0.
		'''
		logger.info("running maximization function")
		logger.info("mean maximization")
		inv_ndata = 1.0 / self.ndata
		# NOTE(review): self.mu is read here as the accumulated sum and is NOT
		# overwritten with the mean below — confirm which of the two callers expect.
		mu = np.divide(self.mu, self.ndata)

		logger.info("covariance maximization")
		for i in range(self._K):
			for j in range(self._K):
				# 1/ndata scales the whole bracket, not only the first term
				self.cov[i, j] = inv_ndata * (
					self.cov[i, j]
					+ self.ndata * mu[i] * mu[j]
					- self.mu[i] * mu[j]
					- self.mu[j] * mu[i])

		logger.info(" performing covariance shrinkage using sklearn module")
		# assume_centered is an estimator (constructor) option, not a fit() argument
		# NOTE(review): fit() treats its argument as a samples-by-features data
		# matrix; it is still handed the covariance estimate itself, as before —
		# confirm this is intended.
		lw = LedoitWolf(assume_centered=True)
		cov_result = lw.fit(self.cov).covariance_
		self.inv_cov = np.linalg.inv(cov_result)
		self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

		logger.info("topic maximization")
		for i in range(self._K):
			# total mass of topic i over the vocabulary (row sum, not column sum)
			row_sum = np.sum(self.beta[i, :self._W])

			if row_sum == 0:
				# fallback normaliser for an all-zero row
				log_row_sum = -1000 * self._W
			else:
				log_row_sum = np.log(row_sum)

			for j in range(self._W):
				# log(beta / row_sum) = log(beta) - log(row_sum)
				self.log_beta[i, j] = math_utli.safe_log(self.beta[i, j]) - log_row_sum

		logger.info("write model parameters to file")
		logger.info("write gaussian")
		# NOTE(review): the files are opened in text mode, which only suits the
		# ASCII pickle protocol — confirm against the code that loads them.
		with open('ctm_nu', 'w') as ctm_nu_dump:
			cPickle.dump(self.nu, ctm_nu_dump)
		with open('ctm_cov', 'w') as ctm_cov_dump:
			cPickle.dump(self.cov, ctm_cov_dump)
		with open('ctm_inv_cov', 'w') as ctm_inv_cov_dump:
			cPickle.dump(self.inv_cov, ctm_inv_cov_dump)
		with open('ctm_log_det_inv_cov', 'w') as ctm_log_det_inv_cov_dump:
			cPickle.dump(self.log_det_inv_cov, ctm_log_det_inv_cov_dump)
		logger.info("write topic matrix")
		with open('ctm_log_beta', 'w') as ctm_log_beta_dump:
			cPickle.dump(self.log_beta, ctm_log_beta_dump)
文件: ctm.py 项目: zhibo-work/code
	def sample_term(self, eta, lambda_v, nu_v, obs_wordidsd, obs_wordctsd):
		'''
		Importance sampling the likelihood based on the variational posterior

		Arguments:
			eta : natural parameter of logistic normal distribution
			lambda_v : variational mean of q(eta), one entry per topic
			nu_v : variational variance of q(eta), one entry per topic
			obs_wordidsd : word ids of the document's terms (column indices
				into self.log_beta)
			obs_wordctsd : counts of those terms, aligned with obs_wordidsd

			theta : mean parameter of logistic normal distribution, computed
				here as exp(eta) / sum(exp(eta))
			The mapping between them is equation 3 in the paper:
					eta[i] = log theta[i] / theta[K]

		Returns:
			log p(eta | mu, cov) + log p(w | eta) - log q(eta | lambda, nu)
		'''
		# assumes eta, lambda_v, nu_v and self.mu all hold self._K entries
		eta = np.asarray(eta, dtype=float)
		log_two_pi = 1.837877  # the natural logarithm of 2*pi

		# Gaussian log-density of eta under the model prior
		diff = eta - self.mu
		t1 = 0.5 * self.log_det_inv_cov
		t1 += -(0.5) * self._K * log_two_pi
		t1 -= (0.5) * np.dot(diff, np.dot(self.inv_cov, diff))

		# compute theta
		exp_eta = np.exp(eta)
		theta = exp_eta / np.sum(exp_eta)

		# compute word probabilities: each term is looked up by its word id and
		# weighted by its own count
		for word_id, count in zip(obs_wordidsd, obs_wordctsd):
			word_term = np.dot(theta, np.exp(self.log_beta[:, word_id]))
			t1 += count * math_utli.safe_log(word_term)

		# log(q(\eta | lambda, nu)); sqrt(nu) is the standard deviation, so it
		# must be passed as scale (the second positional argument is loc)
		t2 = np.sum(stats.norm.logpdf(eta - np.asarray(lambda_v), scale=np.sqrt(nu_v)))
		return(t1 - t2)
文件: ctm.py 项目: zhibo-work/code
	def lhood_bnd(self, d, phi_v, log_phi_v, lambda_v, nu_v, zeta_v):
		'''
		compute the likelihood bound given the variational parameters

		Arguments:
			d : current working document index (into self.wordids / self.wordcts)
			phi_v, log_phi_v : variational parameter phi and its log, indexed
				[topic, term position in document d]
			lambda_v, nu_v, zeta_v : remaining variational parameters

		Returns:
			likelihood bound
		'''
		logger.info("calculating likelihood bound")
		# E[log p(\eta | \mu, \Sigma)] + H(q(\eta | \lambda, \nu)
		lhood = (0.5) * self.log_det_inv_cov + 0.5 * self._K
		for i in range(self._K):
			v = - (0.5) * nu_v[i] * self.inv_cov[i, i]
			for j in range(self._K):
				v -= (0.5) * (lambda_v[i] - self.mu[i]) * self.inv_cov[i, j] * (lambda_v[j] - self.mu[j])
			v += (0.5) * math_utli.safe_log(nu_v[i])
			lhood += v

		# E[log p(z_n | \eta)] + E[log p(w_n | \beta)] + H(q(z_n | \phi_n))
		# Equation 7 in paper, calculate the upper bound
		# the variance term belongs inside the exponent: sum_i exp(lambda_i + nu_i / 2)
		sum_exp = np.sum(np.exp(np.add(lambda_v, np.multiply(0.5, nu_v))))
		bound = (1.0 / zeta_v) * sum_exp - 1.0 + math_utli.safe_log(zeta_v)

		doc_wordids = self.wordids[d]
		doc_wordcts = self.wordcts[d]
		# the bound applies once per word occurrence in this document, so it is
		# weighted by the document's total count rather than the corpus size
		lhood -= bound * np.sum(doc_wordcts)

		ntermd = len(doc_wordcts)
		for i in range(self._K):
			for j in range(ntermd):
				if phi_v[i, j] > 0:
					# j is the term's position in the document; the topic
					# matrix is indexed by the term's word id
					word_id = doc_wordids[j]
					# traces are emitted at DEBUG level: this runs K * nterms times
					logger.debug("iteration %i, %i", i, j)
					logger.debug('lhood-track-1 : %f', lhood)
					logger.debug('self.log_beta - %f', self.log_beta[i][word_id])
					lhood = lhood + doc_wordcts[j] * phi_v[i][j] * (lambda_v[i] + self.log_beta[i][word_id] - log_phi_v[i][j])
					logger.debug('lhood-track-2 : %f', lhood)
		return lhood
文件: ctm.py 项目: zhibo-work/code
	def log_mult_prob(self, held_wordctsd, e_theta, held_wordidsd=None):
		'''
		log probability of the document under proportions theta and topics beta
		used to calculate the held-out data's probability

		Arguments:
			held_wordctsd : counts of the held-out document's terms
			e_theta : topic proportions, one weight per topic
			held_wordidsd : optional word ids aligned with held_wordctsd. When
				given, term i is looked up in column held_wordidsd[i] of
				self.log_beta; when omitted, column i is used, which is the
				previous behaviour.

		Returns:
			sum over terms of count * log(sum_k e_theta[k] * exp(log_beta[k, term]))
		'''
		# NOTE(review): original author's remarks — the number W here should be
		# the number of held-out data, and log_beta should be initialized, not
		# the old self.log_beta.
		if held_wordidsd is None:
			held_wordidsd = range(len(held_wordctsd))

		theta = np.asarray(e_theta)[:self._K]
		val = 0
		for word_id, count in zip(held_wordidsd, held_wordctsd):
			term_prob = np.dot(theta, np.exp(self.log_beta[:self._K, word_id]))
			val += math_utli.safe_log(term_prob) * count
		return val
文件: ctm.py 项目: zhibo-work/code
		def f_lambda(self, sum_phi, phi_v, lambda_v, nu_v, zeta_v):
			'''
			Negated objective in lambda, as a scalar.

			Arguments:
				sum_phi : vector dotted with lambda_v in the first term
				phi_v : unused here
				lambda_v : point at which the objective is evaluated
				nu_v, zeta_v : remaining variational parameters

			Returns:
				-(term1 + term2 + term3), where term1 = lambda . sum_phi,
				term2 = -0.5 (lambda - mu)^T inv_cov (lambda - mu) and term3 is
				the negated zeta bound scaled by self._K.
			'''
			# compute lambda^T * \sum phi
			term1 = np.dot(lambda_v, sum_phi)
			# compute lambda - mu
			diff = np.subtract(lambda_v, self.mu)
			# compute (lambda - mu)^T Sigma^-1 (lambda - mu) as a scalar quadratic form
			term2 = (-0.5) * np.dot(diff, np.dot(self.inv_cov, diff))
			# last term
			term3 = np.sum(np.exp(np.add(lambda_v, np.multiply(0.5, nu_v))))
			# need to figure out how term3 is calculated
			# NOTE(review): the scale factor self._K is kept from the original;
			# lhood_bnd scales the same bound differently — confirm the intended weight.
			term3 = - ((1.0 / zeta_v) * term3 - 1.0 + math_utli.safe_log(zeta_v)) * self._K
			return (-(term1 + term2 + term3))
文件: ctm.py 项目: zhibo-work/code
	def opt_nu(self, lambda_v, zeta_v, doc_total=None):
		'''
		Optimize the variational parameter nu, one coordinate at a time, with
		Newton's method on log(nu).

		Arguments:
			lambda_v : variational parameter lambda, one entry per topic
			zeta_v : variational parameter zeta
			doc_total : optional weight of the exponential term; defaults to
				self._W, the value used previously.
				NOTE(review): presumably this should be the document's word
				total — confirm with the caller.

		Returns:
			array of self._K optimized nu values
		'''
		logger.info("calculating variational parameter NU")
		newton_thresh = 1e-10
		max_iter = 1000  # safety cap so a non-converging coordinate cannot spin forever
		weight = float(self._W if doc_total is None else doc_total) / zeta_v

		log_nu_v = np.zeros(self._K)
		for i in range(self._K):
			init_nu = 10.0
			log_nu_i = np.log(init_nu)
			# the update must run at least once before |df| can be tested
			for _ in range(max_iter):
				nu_i = np.exp(log_nu_i)
				if np.isnan(nu_i):
					# restart this coordinate from a larger starting value
					init_nu = init_nu * 2
					nu_i = init_nu
					log_nu_i = math_utli.safe_log(nu_i)
				exp_term = weight * np.exp(lambda_v[i] + nu_i / 2)
				df = - 0.5 * self.inv_cov[i, i] - 0.5 * exp_term + 0.5 * (1.0 / nu_i)
				d2f = - 0.25 * exp_term - 0.5 * (1.0 / (nu_i * nu_i))
				log_nu_i = log_nu_i - (df * nu_i) / (d2f * nu_i * nu_i + df * nu_i)
				# written as "not >" so a NaN gradient also ends the loop
				if not np.fabs(df) > newton_thresh:
					break
			log_nu_v[i] = log_nu_i

		return np.exp(log_nu_v)
文件: ctm.py 项目: zhibo-work/code
	def expected_theta(self, obs_wordidsd, obs_wordctsd, lambda_v, nu_v):
		''' Return expected theta under a variational distribution

		Arguments:
			self : use all the parameters initialized before
			obs_wordidsd : word ids of the document, forwarded to sample_term
			obs_wordctsd : word counts of the document, forwarded to sample_term
			lambda_v : variational parameter lambda
			nu_v : variational parameter nu

		Returns:
			val : the expected theta
		'''
		nsamples = 100
		eta = np.zeros(self._K)
		# initialize e_theta
		# NOTE(review): -1.0 is presumably the "empty accumulator" value
		# understood by math_utli.log_sum — confirm.
		e_theta = -1.0 * np.ones(self._K)
		# for each sample
		for _ in range(nsamples):
			# sample eta from q(\eta)
			for i in range(self._K):
				eta[i] = random.gauss(0, np.sqrt(nu_v[i])) + lambda_v[i]
			# compute p(w | \eta) - q(\eta)
			log_prob = self.sample_term(eta, lambda_v, nu_v, obs_wordidsd, obs_wordctsd)
			# compute theta = exp(eta) / sum(exp(eta))
			exp_eta = np.exp(eta)
			theta = exp_eta / np.sum(exp_eta)

			# update e_theta
			for i in range(self._K):
				e_theta[i] = math_utli.log_sum(e_theta[i], log_prob + math_utli.safe_log(theta[i]))
		# normalize e_theta and set return vector
		sum_et = -1.0
		for i in range(self._K):
			e_theta[i] -= np.log(nsamples)
			sum_et = math_utli.log_sum(sum_et, e_theta[i])
		e_theta = np.exp(np.subtract(e_theta, sum_et))
		return e_theta
文件: ctm.py 项目: zhibo-work/code
	def __init__(self, K, mu=None, cov=None, obs_path='d:\\mycode\\ctm_python\\state_union\\observed\\'):
		'''
		Load the observed corpus and initialize the model parameters.

		Arguments:
			K: Number of topics
			mu and cov: the hyperparameters logistic normal distribution for
				prior on weight vectors theta; default to a zero vector and an
				all-ones K*K matrix
			obs_path: folder containing the observed training files; defaults
				to the location that used to be hard-coded

		Attributes set:
			_K, _W, _D: number of topics, of dictionary terms and of documents.
				For a fixed corpus, _D is the size of the corpus.

		Raises:
			ValueError: if K is None or the corpus holds no documents.
		'''
		logger.info("CTM commence.")
		if K is None:
			raise ValueError('number of topics have to be specified.')
		# get the folder name which containing all the training files
		obs_filenames = os.listdir(obs_path)

		logger.info("initializing id mapping from corpus, assuming identity")
		# collect the contents of every file
		# NOTE(review): assumes preprocess.get_dict_and_corp takes one raw
		# string per file — confirm.
		txt_corpus = []
		for thefile in obs_filenames:
			with open(os.path.join(obs_path, thefile), "rb") as f:
				txt_corpus.append(f.read())
		(dictionary, corpus) = preprocess.get_dict_and_corp(txt_corpus)
		logger.info("dictionary and corpus are generated")

		self.dictionary = dictionary
		self.corpus = corpus

		self._K = K                     # number of topics
		logger.info("There are %i topics.", self._K)
		self._W = len(dictionary)   # number of all the terms
		self._D = len(corpus)       # number of documents
		if self._D == 0:
			raise ValueError('corpus is empty: no documents found in %s' % obs_path)

		# initialize wordid and wordcount list for the whole corpus
		self.wordids = list()
		self.wordcts = list()
		for doc in self.corpus:
			self.wordids.append([word_id for word_id, _ in doc])
			self.wordcts.append(np.array([cnt for _, cnt in doc]))

		# mu   : K-size vector with 0 as initial value
		# cov  : K*K matrix with 1 as initial value , together they make a Gaussian
		if mu is None:
			self.mu = np.zeros(self._K)
		else:
			self.mu = mu
		if cov is None:
			self.cov = np.ones((self._K, self._K))
		else:
			self.cov = cov

		# if cov is initialized by the process, then there is no need to calculate
		# inverse of cov, since it is singular matrix
		try:
			self.inv_cov = np.linalg.inv(self.cov)
		except np.linalg.LinAlgError as err:
			if 'Singular matrix' not in str(err):
				# any other failure would leave inv_cov undefined; surface it
				raise
			self.inv_cov = self.cov

		self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

		self.ndata = 0  # cumulate count of number of docs processed

		# initialize topic distribution, i.e. self.log_beta
		self.beta = np.zeros([self._K, self._W])
		self.log_beta = np.zeros([self._K, self._W])

		for i in range(self._K):
			# initialize beta with a randomly chosen doc
			# stuff K topics randomly
			doc_no = np.random.randint(self._D)
			for word_index, count in zip(self.wordids[doc_no], self.wordcts[doc_no]):
				self.log_beta[i][word_index] = count

		# to initialize and smooth: every entry gets 1.0 plus uniform noise in [0, 1)
		self.log_beta += 1.0 + np.random.random_sample((self._K, self._W))

		log_total = math_utli.safe_log(np.sum(self.log_beta))
		logger.info("log_beta_sum : %f", log_total)

		# normalize each topic row in log space: log(x) - log(row total).
		# Every entry is >= 1.0 at this point, so a plain log is safe.
		row_totals = np.sum(self.log_beta, axis=1, keepdims=True)
		self.log_beta = np.log(self.log_beta) - np.log(row_totals)

		# per-entry dump is emitted at DEBUG level: it produces K * W lines
		for m in range(self._K):
			for n in range(self._W):
				logger.debug("log_beta %i %i - %f", m, n, self.log_beta[m][n])

		logger.info("initialization finished.")
文件: ctm.py 项目: zhibo-work/code
		def element_add(x):
			# Log-normalise x: log(x / total) = log(x) - log(total).
			# `sum` comes from the enclosing scope, where it shadows the builtin.
			# NOTE(review): presumably it already holds the log of the total — confirm.
			return math_utli.safe_log(x) - sum