def log_prior_pdf(self):
    # Log marginal density of z under the Normal-Gamma prior, with the mean and precision
    # integrated out; the additive constant -K/2 * log(2 * pi) is omitted.
    K = self.topic_dim
    new_s1 = np.floatX(self.prior_s1 + K / 2)
    z_mean = tf.reduce_mean(self.z_3d, axis=-1, keep_dims=True)
    # Squeeze the kept dim so `new_s2` has the same shape as the first summand.
    new_s2 = self.prior_s2 + tf.reduce_sum(tf.square(self.z_3d - z_mean), axis=-1) / 2 \
        + self.prior_lambda * K / (self.prior_lambda + K) / 2 \
        * tf.square(tf.squeeze(z_mean, axis=-1) - self.prior_mu)
    return tf.lgamma(new_s1) - tf.lgamma(self.prior_s1) \
        + np.floatX(self.prior_s1 * np.log(self.prior_s2)) - new_s1 * tf.log(new_s2) \
        - tf.cast(0.5 * tf.log(1 + K / self.prior_lambda), tf.floatX)
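# A minimal NumPy sketch (illustrative, not used by the model) of the same Normal-Gamma
# marginal for a 1-D sample `z` with scalar hyper-parameters `mu0, lam0, s1, s2`; unlike
# the TF version above it keeps the -K/2 * log(2*pi) constant. The helper name and
# signature are assumptions for this sketch only.
def _normal_gamma_log_marginal_np(z, mu0, lam0, s1, s2):
    from scipy.special import gammaln
    K = len(z)
    z_bar = np.mean(z)
    s1_new = s1 + K / 2.
    s2_new = s2 + np.sum((z - z_bar) ** 2) / 2. \
        + lam0 * K / (lam0 + K) / 2. * (z_bar - mu0) ** 2
    return (gammaln(s1_new) - gammaln(s1) + s1 * np.log(s2) - s1_new * np.log(s2_new)
            - 0.5 * np.log(1 + K / lam0) - K / 2. * np.log(2 * np.pi))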
def build_kl_loss(self):
    # The prior.
    self.prior_a = np.floatX(self.cfg["prior_alpha"])
    self.prior_b = np.floatX(self.cfg["prior_beta"])
    if self.cfg["pitman_yor"]:
        # Pitman-Yor process for a power-law cluster-size distribution.
        # `self.prior_a` is `1 - a` in the standard parametrization (`a` is the discount
        # parameter); 0 < prior_a <= 1.
        # `self.prior_b` is `a + b` in the standard parametrization; 0 < prior_b.
        # Since the cluster-size distribution asymptotically follows k^{-1/a}, a larger
        # prior_a means a smaller a, and the prior encourages fewer activated topics.
        self.prior_b = np.floatX(
            np.arange(self.topic_dim - 1) * (1 - self.prior_a) + self.prior_b)
    self.KL_loss_3d = self.calc_kl_loss()
    self.KL_loss = tf.reduce_sum(self.KL_loss_3d, axis=-1)

    # Diversity loss.
    dl_type = self.cfg.get("diversity_loss_type", None)
    print("Use diversity regularization: ", dl_type)
    if dl_type == "xie_2015":
        K = []
        beta_norm = tf.sqrt(tf.reduce_sum(tf.square(self.beta), axis=-1))
        for i in range(self.topic_dim):
            Ki = []
            for j in range(i):
                Ki.append(K[j][i])
            Ki.append(tf.constant(0, dtype=tf.floatX))
            for j in range(i + 1, self.topic_dim):
                Ki.append(
                    tf.acos(
                        tf.reduce_sum(self.beta[i, :] * self.beta[j, :]) /
                        (beta_norm[i] * beta_norm[j])))
            K.append(tf.stack(Ki))
        K_mat = tf.stack(K)
        self.angle_mean = tf.reduce_mean(K_mat)
        self.angle_v = tf.reduce_mean(tf.square(K_mat - self.angle_mean))
        self.diversity_loss = -self.diversity_weight_placeholder * (
            self.angle_mean - self.angle_v)
    elif dl_type == "dpp":
        K = []
        for i in range(self.topic_dim):
            Ki = []
            for j in range(i):
                Ki.append(K[j][i])
            for j in range(i, self.topic_dim):
                # TODO: other kernels?
                Ki.append(tf.reduce_sum(self.beta[i, :] * self.beta[j, :]))
            K.append(tf.stack(Ki))
        K_mat = tf.stack(K)
        self.diversity_loss = -2 * logdet(K_mat)
    else:
        self.diversity_loss = tf.constant(0., dtype=tf.floatX)
    self.batch_kl_loss = tf.reduce_mean(self.KL_loss)
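# A small NumPy sketch (illustrative only) of the "xie_2015" diversity term built above:
# the mean pairwise angle between topic vectors minus the variance of those angles, i.e.
# the quantity multiplied by -diversity_weight_placeholder. The helper name is hypothetical.
def _xie_2015_diversity_np(beta):
    norm = np.linalg.norm(beta, axis=-1)
    cos = np.clip(beta @ beta.T / np.outer(norm, norm), -1., 1.)
    angles = np.arccos(cos)
    np.fill_diagonal(angles, 0.)  # diagonal is zero, as in the TF construction
    angle_mean = angles.mean()
    angle_var = np.mean((angles - angle_mean) ** 2)
    return angle_mean - angle_var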
def sample_from_prior(self, num):
    # Ancestral sampling from the Normal-Gamma prior.
    # lambda ~ Gamma(s_1, s_2) (shape s_1, rate s_2)
    lambdas = np.random.gamma(shape=self.prior_s1, scale=1. / self.prior_s2, size=(num,))
    # mu ~ Normal(mu_p, 1 / (lambda_p * lambda))
    mus = np.random.normal(loc=self.prior_mu, scale=np.sqrt(1. / (self.prior_lambda * lambdas)))
    # x ~ Normal(mu, 1 / lambda)
    return np.floatX(
        np.random.normal(mus, np.sqrt(1. / lambdas), size=(self.topic_dim, num)).T)
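# Illustrative check (a usage assumption, not part of the class): each coordinate sampled
# this way is marginally Student-t with 2*s1 degrees of freedom, location mu_p and scale
# sqrt(s2 * (lambda_p + 1) / (s1 * lambda_p)); e.g. with scipy.stats one can compare
#   stats.kstest(model.sample_from_prior(100000)[:, 0],
#                stats.t(df=2 * s1, loc=mu_p,
#                        scale=np.sqrt(s2 * (lam_p + 1) / (s1 * lam_p))).cdf)
# where `model`, `s1`, `s2`, `mu_p`, `lam_p` are placeholders for a constructed instance
# and its prior hyper-parameters.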
def build_kl_loss(self):
    # The Normal-Gamma prior hyper-parameters.
    self.prior_mu = np.floatX(self.cfg["prior_mu"])
    self.prior_lambda = np.floatX(self.cfg["prior_lambda"])
    self.prior_s1 = np.floatX(self.cfg["prior_s1"])
    self.prior_s2 = np.floatX(self.cfg["prior_s2"])
    # Variational Normal-Gamma posterior parameters, optimized by SGD; positivity is
    # enforced through exp / softplus reparametrizations.
    self.post_mu = tf.get_variable(
        "sgd_post_mu", shape=[], dtype=tf.floatX,
        initializer=tf.constant_initializer(self.prior_mu),
        trainable=self.cfg["trainable_post_mu"])
    self.log_post_lambda = tf.get_variable(
        "sgd_log_post_lambda", shape=[], dtype=tf.floatX,
        initializer=tf.constant_initializer(np.log(self.prior_lambda)),
        trainable=self.cfg["trainable_post_lambda"])
    self.invsp_post_gamma1 = tf.get_variable(
        "sgd_invsp_post_gamma1", shape=[], dtype=tf.floatX,
        initializer=tf.constant_initializer(np.log(np.exp(self.prior_s1) - 1)),
        trainable=self.cfg["trainable_post_gamma1"])
    self.invsp_post_gamma2 = tf.get_variable(
        "sgd_invsp_post_gamma2", shape=[], dtype=tf.floatX,
        initializer=tf.constant_initializer(np.log(np.exp(self.prior_s2) - 1)),
        trainable=self.cfg["trainable_post_gamma2"])
    self.post_lambda = tf.exp(self.log_post_lambda)
    self.post_gamma1 = tf.nn.softplus(self.invsp_post_gamma1)
    self.post_gamma2 = tf.nn.softplus(self.invsp_post_gamma2)
    gamma_ratio = self.post_gamma1 / self.post_gamma2

    # Negative entropy of the diagonal-Gaussian q(z).
    neg_log_entro = - self.topic_dim / 2 * (np.log(2 * np.pi) + 1) \
        - tf.reduce_sum(self.z_logvar, axis=-1) / 2
    neg_log_entro_3d = - 0.5 * (np.log(2 * np.pi) + 1) - self.z_logvar / 2
    # KL from the variational Normal-Gamma q(mu, lambda) to the Normal-Gamma prior.
    kl_normal_gamma = tf.lgamma(self.prior_s1) - tf.lgamma(self.post_gamma1) \
        - self.prior_s1 * tf.log(self.prior_s2 / self.post_gamma2) \
        - (np.log(self.prior_lambda) - tf.log(self.post_lambda)) / 2 \
        - tf.digamma(self.post_gamma1) * (self.prior_s1 - self.post_gamma1) \
        + gamma_ratio * self.prior_s2 - self.post_gamma1 - 0.5 \
        + self.prior_lambda / self.post_lambda / 2 \
        + self.prior_lambda * gamma_ratio / 2 * (self.prior_mu - self.post_mu) ** 2
    # Negative expected log-likelihood -E_q[log p(z | mu, lambda)].
    neg_Epz_q_z_u_lambda = self.topic_dim / 2 * (tf.log(2 * np.pi * self.post_gamma2) \
        + 1 / self.post_lambda + self.post_mu ** 2 * gamma_ratio - tf.digamma(self.post_gamma1)) \
        + gamma_ratio / 2 * tf.reduce_sum(
            tf.square(self.z_mean) + self.z_var - 2 * self.z_mean * self.post_mu, axis=-1)
    neg_Epz_q_z_u_lambda_3d = 0.5 * (tf.log(2 * np.pi * self.post_gamma2) \
        + 1 / self.post_lambda + self.post_mu ** 2 * gamma_ratio - tf.digamma(self.post_gamma1)) \
        + gamma_ratio / 2 * (tf.square(self.z_mean) + self.z_var - 2 * self.z_mean * self.post_mu)

    self.KL_loss_3d = neg_log_entro_3d + kl_normal_gamma + neg_Epz_q_z_u_lambda_3d
    self.KL_loss = neg_log_entro + kl_normal_gamma + neg_Epz_q_z_u_lambda
    self.batch_kl_loss = tf.reduce_mean(self.KL_loss)
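# A NumPy mirror (illustrative only) of the `kl_normal_gamma` term above, i.e. the KL from
# the variational Normal-Gamma q(mu, lambda) with parameters (post_mu, post_lambda, gamma1,
# gamma2) to the Normal-Gamma prior (prior_mu, prior_lambda, s1, s2), written out flat so it
# can be evaluated outside the graph; the helper name is hypothetical.
def _kl_normal_gamma_np(post_mu, post_lambda, gamma1, gamma2,
                        prior_mu, prior_lambda, s1, s2):
    from scipy.special import gammaln, digamma
    gamma_ratio = gamma1 / gamma2
    return (gammaln(s1) - gammaln(gamma1) - s1 * np.log(s2 / gamma2)
            - (np.log(prior_lambda) - np.log(post_lambda)) / 2.
            - digamma(gamma1) * (s1 - gamma1)
            + gamma_ratio * s2 - gamma1 - 0.5
            + prior_lambda / post_lambda / 2.
            + prior_lambda * gamma_ratio / 2. * (prior_mu - post_mu) ** 2)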
def build_kl_loss(self):
    # The prior.
    self.prior_a = np.floatX(self.cfg["prior_alpha"])
    self.prior_b = np.floatX(self.cfg["prior_beta"])
    self.prior_gamma_a = np.floatX(self.cfg["prior_gamma_a"])
    self.prior_gamma = np.floatX(self.cfg["prior_gamma"])
    if self.cfg.get("pitman_yor", False):
        self.prior_b = np.floatX(
            np.arange(self.cfg["L2_truncation_level"] - 1) * (1 - self.prior_a) + self.prior_b)
        self.prior_gamma = np.floatX(
            np.arange(self.topic_dim - 1) * (1 - self.prior_gamma_a) + self.prior_gamma)
    if self.cfg["closed_form_update_beta"]:
        self.post_u = tf.get_variable(
            "post_u", shape=[self.topic_dim - 1], dtype=tf.floatX,
            initializer=tf.constant_initializer(1.0), trainable=False)
        self.post_v = tf.get_variable(
            "post_v", shape=[self.topic_dim - 1], dtype=tf.floatX,
            initializer=tf.constant_initializer(self.prior_gamma), trainable=False)
    else:
        self.inv_post_u = tf.get_variable(
            "sgd_log_post_u", shape=[self.topic_dim - 1], dtype=tf.floatX,
            initializer=tf.constant_initializer(np.log(np.exp(1.0) - 1)))
        self.inv_post_v = tf.get_variable(
            "sgd_log_post_v", shape=[self.topic_dim - 1], dtype=tf.floatX,
            initializer=tf.constant_initializer(np.log(np.exp(self.prior_gamma) - 1)))
        self.post_u = tf.nn.softplus(self.inv_post_u)
        self.post_v = tf.nn.softplus(self.inv_post_v)
    self.post_e_beta = self.post_u / (self.post_u + self.post_v)
    self.KL_pi = self.calc_kl_loss()
    self.KL_beta = self.calc_kl_loss_beta()
    self.KL_c = self.calc_kl_loss_c()
    self.batch_kl_c = tf.reduce_mean(self.KL_c)
    self.batch_kl_pi = tf.reduce_mean(tf.reduce_sum(self.KL_pi, axis=-1))
    # KL_beta belongs to the corpus-level sticks, so it is divided by the dataset size to
    # keep the per-document objective correctly scaled.
    self.KL_loss = tf.reduce_sum(self.KL_pi, axis=-1) + self.KL_c \
        + self.KL_beta / self.dataset_size_placeholder
    if self.cfg["KL_beta_ratio"] > 0:
        tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                             self.cfg["KL_beta_ratio"] * self.KL_beta)
    self.diversity_loss = tf.constant(0., dtype=tf.floatX)
    self.batch_kl_loss = tf.reduce_mean(self.KL_loss)
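# Hedged sketch (an assumption, not taken from the class): if `calc_kl_loss_beta` computes
# the KL between the variational corpus-level sticks Beta(post_u, post_v) and the prior
# sticks (e.g. Beta(1, prior_gamma) for a DP/GEM prior), the per-stick closed form is, in
# NumPy (helper name hypothetical):
def _kl_beta_np(u, v, a, b):
    # KL( Beta(u, v) || Beta(a, b) )
    from scipy.special import betaln, digamma
    return (betaln(a, b) - betaln(u, v)
            + (u - a) * digamma(u) + (v - b) * digamma(v)
            + (a - u + b - v) * digamma(u + v))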
def build_stochastic_layer(self, layer):
    # Posterior Kumaraswamy parameters a, b for the stick-breaking fractions.
    self.a = tf.layers.dense(
        layer,
        self.topic_dim - 1,
        activation=self.cfg["dirichlet_ab_fct"],
        use_bias=self.cfg["dirichlet_ab_use_bias"],
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        bias_initializer=tf.zeros_initializer(),
        name="posterior_a_output")
    self.b = tf.layers.dense(
        layer,
        self.topic_dim - 1,
        activation=self.cfg["dirichlet_ab_fct"],
        use_bias=self.cfg["dirichlet_ab_use_bias"],
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        bias_initializer=tf.constant_initializer(self.cfg["b_init"]),
        name="posterior_b_output")
    uniform_samples = tf.random_uniform(
        (self.cfg["MC_samples"], tf.shape(self.x)[0], self.topic_dim - 1),
        minval=0.01,
        maxval=0.99,
        dtype=tf.floatX)
    if self.cfg.get("bias_on_prior", False):
        self.prior_a = np.floatX(self.cfg["prior_alpha"])
        self.prior_b = np.floatX(self.cfg["prior_beta"])
        if self.cfg["pitman_yor"]:
            self.prior_b = np.floatX(
                np.arange(self.topic_dim - 1) * (1 - self.prior_a) + self.prior_b)
        self.b = self.b + self.prior_b
        self.a = self.a + self.prior_a
    else:
        self.a = self.a + 1e-5
        self.b = self.b + 1e-5
    # Sample the stick fractions v ~ Kumaraswamy(a, b) via the inverse CDF.
    self.vs = (1 - uniform_samples**(1 / self.b))**(1 / self.a)

    # Construct the topic vector by the stick-breaking process.
    stick_segments_lst = []
    remaining_sticks = tf.ones(
        (self.cfg["MC_samples"], tf.shape(self.x)[0]), dtype=tf.floatX)
    for i in range(self.topic_dim - 1):
        stick_segments_lst.append(remaining_sticks * self.vs[:, :, i])
        remaining_sticks = remaining_sticks * (1 - self.vs[:, :, i])
    stick_segments = tf.stack(
        stick_segments_lst)  # (topic_dim - 1) x (MC samples) x (batch size)
    self.z_3d = tf.transpose(
        tf.concat(
            (stick_segments, tf.expand_dims(remaining_sticks, axis=0)),
            axis=0), (1, 2, 0))

    if self.cfg["effective_indicator"] == "average":
        self.average_of_every_topic = tf.reduce_mean(
            self.z_3d, axis=(0, 1)) * tf.cast(tf.shape(self.x)[0], tf.floatX)
        effective_dims = self.average_of_every_topic > self.cfg["effective_threshold"]
        self.average_used_dims = tf.reduce_sum(tf.cast(effective_dims, tf.floatX))
        self.effective_dims = tf.squeeze(tf.where(effective_dims))
    elif self.cfg["effective_indicator"] in ("assignment", "ratio"):
        self.assignment_of_every_topic = tf.bincount(
            tf.cast(tf.argmax(self.z_3d, axis=-1), tf.int32),
            minlength=self.topic_dim)
        effective_dims_bool = tf.cast(
            self.assignment_of_every_topic,
            tf.floatX) > self.cfg["assignment_threshold"] * tf.cast(
                tf.shape(self.x)[0], tf.floatX) * self.cfg["MC_samples"]
        # FIXME: this is not correct when MC_samples != 1.
        self.average_used_dims = tf.reduce_sum(
            tf.cast(effective_dims_bool, tf.floatX))
        self.effective_dims = tf.squeeze(tf.where(effective_dims_bool))

    z = tf.reshape(self.z_3d, [-1, self.topic_dim])
    return z
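# A small NumPy sketch (illustrative, not used by the class) of the same stick-breaking
# construction: given fractions `vs` of shape (..., K-1) it returns proportions of
# shape (..., K) that sum to one. The helper name is hypothetical.
def _stick_breaking_np(vs):
    remaining = np.ones(vs.shape[:-1])
    segments = []
    for i in range(vs.shape[-1]):
        segments.append(remaining * vs[..., i])
        remaining = remaining * (1 - vs[..., i])
    segments.append(remaining)
    return np.stack(segments, axis=-1)

# e.g. np.allclose(_stick_breaking_np(np.random.rand(5, 9)).sum(-1), 1.0) holds.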
def sample_from_prior(self, num):
    eps = np.floatX(
        np.random.beta(self.prior_a, self.prior_b, size=(num, self.topic_dim - 1)))
    return self.stick_breaking(eps)
def sample_from_prior(self, num):
    eps = np.floatX(np.random.normal(size=(num, self.topic_dim)))
    return eps * np.sqrt(self.prior_var) + self.prior_mu
def _gaussian_log_pdf(self, x, mu, sigma):
    # Diagonal Gaussian log-density; `sigma` holds per-dimension variances.
    return -0.5 * (self.topic_dim * tf.cast(tf.log(2 * np.floatX(np.pi)), tf.floatX)
                   + tf.reduce_sum(tf.log(sigma), axis=-1)
                   + tf.reduce_sum(tf.square(x - mu) / sigma, axis=-1))
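# Sanity-check sketch (illustrative): with `sigma` holding variances, the expression above
# matches a diagonal-covariance Gaussian log-density, e.g. in SciPy:
#   from scipy.stats import multivariate_normal
#   multivariate_normal(mean=mu, cov=np.diag(sigma)).logpdf(x)
# for a single `x`, `mu`, `sigma` of length `topic_dim`.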