def predict_log_probs(self, test_data):
    """
    Compute the total log likelihood of held-out data under the trained model.

    :param test_data: dictionary with "coordinates" (N x 2) and sparse N x V_F feature matrices
    :return: sum over data points of the per-point log likelihood
    """
    num_points = test_data["coordinates"].shape[0]
    topic_log_prob_vector = np.zeros((self.num_topics, num_points))
    for z in range(self.num_topics):
        try:
            rv = stats.multivariate_normal(mean=self.topic_centers[z, :],
                                           cov=self.topic_covar[z, :, :],
                                           allow_singular=True)
            loc_log_prob = rv.logpdf(test_data["coordinates"]).reshape((num_points, 1))  # N x 1
        except Exception:
            print("Error while computing geo log probabilities for test, dumping data.", "\n",
                  self.topic_centers[z, :], "\n",
                  self.topic_covar[z, :, :], file=sys.stderr)
            traceback.print_stack(file=sys.stderr)
            sys.exit(1)
        feature_log_prob = np.zeros((num_points, 1))
        for feature in self.beta_arrays.keys():
            beta = self.beta_arrays[feature][z]
            num_words = beta.shape[0]
            feature_log_prob += test_data[feature] * np.log(beta.reshape((num_words, 1)))  # (N x V) * (V x 1) = N x 1
        # Add the log topic prior and collect the per-point log joint for topic z.
        topic_log_prob_vector[z] = (loc_log_prob + feature_log_prob
                                    + np.log(self.theta[0, z])).flatten()
    return np.sum(utils.log_sum(topic_log_prob_vector, axis=0))
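# utils.log_sum is used throughout as a log-sum-exp over an axis, but its
# implementation is not shown in this section. The sketch below is an assumption
# about its semantics (equivalent to scipy.special.logsumexp), included only to
# document what the callers above rely on; log_sum_sketch is a hypothetical name.
import numpy as np


def log_sum_sketch(log_values, axis=0):
    """Hypothetical stand-in for utils.log_sum: log(sum(exp(log_values))) along axis."""
    max_val = np.max(log_values, axis=axis, keepdims=True)
    # Subtract the per-slice maximum before exponentiating to avoid overflow,
    # then add it back after taking the log of the sum.
    summed = np.sum(np.exp(log_values - max_val), axis=axis)
    return np.log(summed) + np.squeeze(max_val, axis=axis)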
def compute_probabilities_from_mixture(model, data):
    """
    Compute data log probabilities, but substitute the Gaussian mixture over all topic
    distributions for the geographical distribution, so all points are drawn from the
    same distribution.

    :param model: trained model exposing theta, beta_arrays, num_topics, topic_centers and topic_covar
    :param data: dictionary with "coordinates" (N x 2) and sparse N x V_F feature matrices
    :return: sum of the geographical and feature mixture log likelihoods over all data points
    """
    theta = model.theta
    beta_arrays = model.beta_arrays
    num_points = data["coordinates"].shape[0]
    geo_log_prob = np.zeros((model.num_topics, num_points))      # k x N
    feature_log_prob = np.zeros((model.num_topics, num_points))  # k x N
    for z in range(model.num_topics):
        # Compute feature log probabilities
        for feature in beta_arrays.keys():
            beta = beta_arrays[feature]  # k x V
            feature_log_prob[z] += np.log(beta[z]) * data[feature].T  # (1 x V) * (V x N) = 1 x N
        # Compute geographical log probabilities
        rv = multivariate_normal(mean=model.topic_centers[z, :],
                                 cov=model.topic_covar[z, :, :],
                                 allow_singular=True)
        geo_log_prob[z] += rv.logpdf(data["coordinates"])
        log_theta_for_z = np.log(theta[0, z])
        geo_log_prob[z] += log_theta_for_z
        feature_log_prob[z] += log_theta_for_z
    # Log-sum-exp over topics, then sum over data points
    final_geo_log_prob = log_sum(geo_log_prob, axis=0)
    final_feature_log_prob = log_sum(feature_log_prob, axis=0)
    return np.sum(final_geo_log_prob + final_feature_log_prob)
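# A minimal, hypothetical usage sketch for compute_probabilities_from_mixture.
# The model object and data layout below are assumptions based only on the
# attribute names used above (theta, beta_arrays, num_topics, topic_centers,
# topic_covar); the real model class is not shown in this section.
import numpy as np
import scipy.sparse as sp
from types import SimpleNamespace


def _toy_mixture_example():
    rng = np.random.default_rng(0)
    k, N, V = 2, 5, 4
    beta = rng.dirichlet(np.ones(V), size=k)           # k x V topic-word distributions
    toy_model = SimpleNamespace(
        num_topics=k,
        theta=np.array([[0.6, 0.4]]),                  # 1 x k mixture weights
        beta_arrays={"words": beta},
        topic_centers=rng.normal(size=(k, 2)),         # k x 2 Gaussian means
        topic_covar=np.stack([np.eye(2)] * k),         # k x 2 x 2 covariances
    )
    toy_data = {
        "coordinates": rng.normal(size=(N, 2)),
        "words": sp.csr_matrix(rng.integers(0, 3, size=(N, V))),
    }
    return compute_probabilities_from_mixture(toy_model, toy_data)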
def get_topic_unigram(m_array, h_array):
    """
    Turn additive log weights into per-topic unigram distributions.

    m_array: 1 x V
    h_array: k x V
    returns beta_array: k x V, each row a probability distribution over the vocabulary
    """
    beta_array = m_array + h_array
    norm_sum = utils.log_sum(beta_array, axis=1)
    try:
        # Normalize in log space, then exponentiate to obtain probabilities.
        beta_array = np.exp(beta_array.T - norm_sum).T
    except Exception:
        sys.stderr.write(str(beta_array))
        sys.stderr.write("\n")
        sys.stderr.write(str(norm_sum))
        sys.stderr.write("\n")
        raise  # do not silently return unnormalized values
    return beta_array
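# A small, hypothetical sanity check for get_topic_unigram (not part of the
# original code): each row of the returned beta_array should be a proper
# distribution over the vocabulary, i.e. sum to 1 within floating point error.
import numpy as np


def _check_topic_unigram_rows_sum_to_one():
    rng = np.random.default_rng(1)
    k, V = 3, 6
    m_array = rng.normal(size=(1, V))   # 1 x V
    h_array = rng.normal(size=(k, V))   # k x V
    beta_array = get_topic_unigram(m_array, h_array)
    assert np.allclose(beta_array.sum(axis=1), 1.0)
    return beta_array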
def __update_phi(data, beta_arrays, theta, topic_centers, topic_covar):
    """
    E-step: compute the topic responsibilities phi for every data point.

    :param data: a dictionary containing coordinates and sparse N x V_F matrices for features
    :param beta_arrays: F x k x V
    :param theta: 1 x k
    :param topic_centers: k x 2
    :param topic_covar: k x 2 x 2
    :return: phi, a k x N matrix of posterior topic probabilities per data point
    """
    k, _ = topic_centers.shape
    N, _ = data["coordinates"].shape
    G = np.zeros((k, N))  # geographical log likelihoods
    Z = np.zeros((k, N))  # log topic priors
    # Compute geographical log probabilities
    for z in range(k):
        # Set up the per-topic Gaussian over coordinates
        rv = stats.multivariate_normal(mean=topic_centers[z, :],
                                       cov=topic_covar[z, :, :],
                                       allow_singular=True)
        G[z, :] = rv.logpdf(data["coordinates"])
        # TODO: @MM, I think we can move this out of the loop, no? There can be a minor performance increase. - Emre
        Z[z, :] = np.log(theta[0, z])
    # Compute new phi
    # TODO: @MM, please check this. - Emre
    F = np.zeros((k, N))
    for feature in beta_arrays.keys():
        # (k x V) x (N x V)' = k x N
        F += np.log(beta_arrays[feature]) * data[feature].transpose()
    F += G + Z
    S = utils.log_sum(F, axis=0)  # 1 x N
    F = F - S
    phi = np.exp(F)  # k x N
    return phi
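# A dense, hypothetical reference for what __update_phi computes (the E-step
# responsibilities): phi[z, n] is proportional to
#   theta_z * Normal(coords_n | mu_z, Sigma_z) * prod_f prod_v beta_f[z, v] ** counts_f[n, v],
# normalized over topics z. This sketch uses dense count matrices and
# scipy.special.logsumexp in place of utils.log_sum; it documents the math and
# is not a replacement for the sparse implementation above.
import numpy as np
from scipy import stats
from scipy.special import logsumexp


def _update_phi_dense_reference(coords, dense_counts, beta_arrays, theta,
                                topic_centers, topic_covar):
    k = topic_centers.shape[0]
    N = coords.shape[0]
    F = np.zeros((k, N))
    for z in range(k):
        rv = stats.multivariate_normal(mean=topic_centers[z, :],
                                       cov=topic_covar[z, :, :],
                                       allow_singular=True)
        F[z, :] = np.log(theta[0, z]) + rv.logpdf(coords)
        for feature, counts in dense_counts.items():
            # counts: N x V dense array of word counts for this feature
            F[z, :] += counts @ np.log(beta_arrays[feature][z])
    # Normalize over topics in log space, then exponentiate.
    return np.exp(F - logsumexp(F, axis=0, keepdims=True))  # k x N responsibilities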
def compute_log_beta(h_matrix):
    # Log-space counterpart of get_topic_unigram: normalize m_array + h_matrix per
    # topic without leaving log space. Note that m_array is not a parameter here;
    # it is expected to be available from the enclosing scope.
    beta_array = m_array + h_matrix
    norm_sum = utils.log_sum(beta_array, axis=1)
    return beta_array - norm_sum[:, np.newaxis]
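# Minimal, hypothetical demonstration of the log-space normalization performed by
# compute_log_beta, written self-contained with scipy.special.logsumexp standing in
# for utils.log_sum (an assumption about its semantics): each row of the result,
# once exponentiated, sums to 1 over the vocabulary.
import numpy as np
from scipy.special import logsumexp


def _demo_log_beta_normalization(k=3, V=5, seed=2):
    rng = np.random.default_rng(seed)
    m = rng.normal(size=(1, V))          # 1 x V
    h = rng.normal(size=(k, V))          # k x V
    log_beta = (m + h) - logsumexp(m + h, axis=1)[:, np.newaxis]
    assert np.allclose(np.exp(log_beta).sum(axis=1), 1.0)
    return log_beta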