Example #1
	def get_perplexity(self):
		'''
		Calculate the perplexity value, reading in the model parameters produced by the procedures above.

		Right now, held-out documents have to be manually placed in a separate folder to be read in.

		Returns:
			perplexity : currently the only evaluation value; others may be added later

		'''
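		# The quantity computed below is
		#   perplexity = exp( - sum_d log p(w_d) / total_words )
		# where log p(w_d) is the multinomial log likelihood of held-out document d
		# under an expected theta taken from the observed documents, and
		# total_words is taken from the size of the held-out dictionary.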

		log_lhood = np.zeros(len(self.corpus))
		e_theta = np.zeros((len(self.corpus),self._K))
		total_words = 0
		total_lhood = 0

		# create held_out corpus
		path = 'd:\\mycode\\ctm_python\\state_union\\heldout\\'
		heldout_filenames = os.listdir(path)
		heldout_corpus = []
		for thefile in heldout_filenames:
			with open(path + thefile, "rb") as f:
				strings = f.read()
				heldout_corpus.append(strings)
		(held_dictionary, held_corpus) = preprocess.get_dict_and_corp(heldout_corpus)
		total_words = len(held_dictionary)

		# load model parameters for calculating e_theta
		with open('corpus_lambda_dump', 'rb') as ctm_lambda_dump:
			lambda_v_c = cPickle.load(ctm_lambda_dump)
		with open('corpus_nu_dump', 'rb') as ctm_nu_dump:
			nu_v_c = cPickle.load(ctm_nu_dump)

		# calculate e_theta using observed data
		for d, doc in enumerate(self.corpus):
			obs_wordidsd = [id for id, _ in doc]
			obs_wordctsd = np.array([cnt for _, cnt in doc])
			# obs_nterms = len(obs_wordidsd)

			lambda_v = lambda_v_c[d]
			nu_v = nu_v_c[d]

			# e_theta is calculated for each observed document;
			# since the number of observed docs is never less than the number of held-out docs,
			# indexing e_theta during the held-out inference stage will not go out of bounds
			e_theta[d] = self.expected_theta(obs_wordidsd, obs_wordctsd, lambda_v, nu_v)

		for d, doc in enumerate(held_corpus):
			# held_wordidsd = [id for id, _ in doc]
			held_wordctsd = np.array([cnt for _, cnt in doc])
			# held_nterms = len(held_wordctsd)

			# approximate inference for held-out data:
			# randomly choose an index into e_theta
			# MEMO: this is a dirty workaround; I can't think of a better way right now
			rand_etheta_index = np.random.randint(len(held_corpus))
			etheta = e_theta[rand_etheta_index]
			log_lhood[d] = self.log_mult_prob(held_wordctsd, etheta)

		total_lhood = np.sum(log_lhood)
		perplexity = np.exp(- total_lhood / total_words)
		print 'the perplexity is:', perplexity
		return perplexity
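A minimal usage sketch for this method, assuming the surrounding class is named CTM, that training has already been run on the observed folder, and that the 'corpus_lambda_dump' and 'corpus_nu_dump' pickle files were written by that run (the class name and the training step are assumptions, not shown above):

	# hypothetical driver; the class name CTM is assumed
	model = CTM(K=20)        # builds dictionary and corpus from the observed folder
	# ... run variational inference here so lambda/nu are dumped to disk ...
	perplexity = model.get_perplexity()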
Example #2
	def __init__(self, K, mu=None, cov=None):
		'''
		Arguments:
			K: number of topics
			mu and cov: hyperparameters of the logistic normal prior on the
			            topic proportion vectors theta
			(D, the total number of documents, is not an argument; it is
			derived from the observed corpus)
		'''
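		# For reference, the correlated topic model assumes, for each document:
		#   eta   ~ N(mu, cov)                      (logistic normal prior)
		#   theta = exp(eta) / sum(exp(eta))        (topic proportions)
		#   z_n   ~ Multinomial(theta),  w_n ~ Multinomial(beta[z_n])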

		logger.info("CTM commence.")
		logger.info("Initializing...")
		if K is None:
			raise ValueError('the number of topics has to be specified.')
		# get the folder containing all the training files;
		# the observed and held-out folders have to be specified manually
		path = 'd:\\mycode\\ctm_python\\state_union\\observed\\'
		obs_filenames = os.listdir(path)

		logger.info("initializing id mapping from corpus, assuming identity")
		# initialize a list to hold all the file contents
		txt_corpus = []
		for thefile in obs_filenames:
			with open(path + thefile, "rb") as f:
				strings = f.read()
				txt_corpus.append(strings)
		(dictionary, corpus) = preprocess.get_dict_and_corp(txt_corpus)
		logger.info("dictionary and corpus are generated")

		self.dictionary = dictionary
		self.corpus = corpus

		self._K = K                     # number of topics
         	logger.info("There are %i topics.", self._K)
		self._W = len(dictionary)   # number of all the terms
		self._D = len(corpus)       # number of documents

		# initialize wordid and wordcount list for the whole corpus
		self.wordids = list()
		self.wordcts = list()

		for d, doc in enumerate(self.corpus):
			wordidsd = [id for id, _ in doc]
			wordctsd = np.array([cnt for _, cnt in doc])
			self.wordids.append(wordidsd)
			self.wordcts.append(wordctsd)

		# mu  : K-dimensional vector initialized to 0
		# cov : K*K matrix initialized to 1; together they parameterize a Gaussian
		if mu is None:
			self.mu = np.zeros(self._K)
		else:
			self.mu = mu
		if cov is None:
			self.cov = np.ones((self._K, self._K))
		else:
			self.cov = cov
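		# A commented-out sketch (illustration only, not executed here) of how a
		# topic-proportion vector could be drawn from this prior with numpy:
		#   eta = np.random.multivariate_normal(self.mu, self.cov)
		#   theta = np.exp(eta) / np.sum(np.exp(eta))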

		# if cov was initialized by this process (all ones), it is singular and
		# cannot be inverted, so fall back to using cov itself
		try:
			self.inv_cov = np.linalg.inv(self.cov)
		except np.linalg.LinAlgError as err:
			if 'Singular matrix' in err.message:
				self.inv_cov = self.cov
			else:
				raise

		# self.inv_cov = np.linalg.inv(self.cov)
		self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

		self.ndata = 0  # cumulate count of number of docs processed

		# initialize the topic distributions, i.e. self.log_beta
		self.beta = np.zeros([self._K, self._W])
		self.log_beta = np.zeros([self._K, self._W])

		for i in range(self._K):
			# initialize beta by seeding each of the K topics
			# with the word counts of a randomly chosen document
			doc_no = np.random.randint(self._D)
			nterms = len(self.wordcts[doc_no])
			for j in range(nterms):
				word_index = self.wordids[doc_no][j]
				self.log_beta[i][word_index] = self.wordcts[doc_no][j] 
				# logging.info('self.log_beta[i,j]-track %f', self.log_beta[i,j]) 

		for m in range(self._K):
			for n in range(self._W):
				self.log_beta[m][n] = self.log_beta[m][n] + 1.0 + np.random.ranf()
				# logger.info("log_beta %i %i - %f", m,n, self.log_beta[m][n] )

		# smooth the counts and take logs
		log_beta_sum = math_utli.safe_log(np.sum(self.log_beta))
		logger.info("log_beta_sum : %f", log_beta_sum)

		# little helper to normalize self.log_beta
		def element_add(x):
			return x + math_utli.safe_log(x - log_beta_sum)
		self.log_beta = np.array([element_add(row) for row in self.log_beta])
		for m in range(self._K):
			for n in range(self._W):
				logger.info("log_beta %i %i - %f", m,n, self.log_beta[m][n] )

		logger.info("initialization finished.")