def gauss_kl_diag(q_mu, q_sqrt, K, num_latent):
    """
    KL divergence KL[q || p] with q(x) = N(q_mu, q_sqrt^2) and p(x) = N(0, K).

    There are num_latent independent distributions, stored column-wise in
    q_mu and q_sqrt.

    q_mu: matrix whose columns are the means of q.
    q_sqrt: matrix whose columns are the diagonals of the square-root of the
        (diagonal) covariance of q.
    K: positive definite matrix, the covariance of p.
    num_latent: integer number of independent distributions (the number of
        columns of q_mu and q_sqrt).

    Returns the summed KL over all columns. The constant term is cast to
    tf.float64.
    """
    chol_prior = tf.cholesky(K)
    whitened_mean = tf.matrix_triangular_solve(chol_prior, q_mu, lower=True)

    # Mahalanobis term.
    mahalanobis = 0.5 * tf.reduce_sum(tf.square(whitened_mean))
    # Prior log-det term (log|K| from the squared Cholesky diagonal).
    prior_logdet = num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(chol_prior))))
    # Constant term: minus half the total number of latent values.
    constant = -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    # Log-det of the covariance of q.
    q_logdet = -0.5 * tf.reduce_sum(tf.log(tf.square(q_sqrt)))

    # Trace term: only the diagonal of K^{-1} is needed because q is diagonal.
    identity = eye(tf.shape(chol_prior)[0])
    chol_prior_inv = tf.matrix_triangular_solve(chol_prior, identity, lower=True)
    K_inv = tf.matrix_triangular_solve(
        tf.transpose(chol_prior), chol_prior_inv, lower=False)
    K_inv_diag = tf.expand_dims(tf.diag_part(K_inv), 1)
    trace = 0.5 * tf.reduce_sum(K_inv_diag * tf.square(q_sqrt))

    return mahalanobis + prior_logdet + constant + q_logdet + trace
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.

    For a derivation of the terms in here, see the associated SGPR notebook.

    Reads self.X, self.Y, self.Z, self.kern, self.mean_function and
    self.likelihood.variance. Returns a scalar tensor (float64 constants are
    used for the data-size factors).

    The columns of Y are treated as independent outputs sharing one kernel,
    so every term that is computed once from the kernel (log-determinant and
    trace correction) is scaled by the number of output columns.
    """
    num_inducing = tf.shape(self.Z)[0]
    num_data = tf.shape(self.Y)[0]
    output_dim = tf.shape(self.Y)[1]
    num_values = tf.cast(num_data * output_dim, tf.float64)
    output_dim_f = tf.cast(output_dim, tf.float64)

    err = self.Y - self.mean_function(self.X)
    Kdiag = self.kern.Kdiag(self.X)
    Kuf = self.kern.K(self.Z, self.X)
    # Small jitter on the diagonal keeps the Cholesky factorisation stable.
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    L = tf.cholesky(Kuu)
    sigma = tf.sqrt(self.likelihood.variance)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    AAT = tf.matmul(A, tf.transpose(A))
    B = AAT + eye(num_inducing)
    LB = tf.cholesky(B)
    c = tf.matrix_triangular_solve(LB, tf.matmul(A, err), lower=True) / sigma

    # compute log marginal bound
    bound = -0.5 * num_values * np.log(2 * np.pi)
    bound += -output_dim_f * tf.reduce_sum(tf.log(tf.diag_part(LB)))
    bound += -0.5 * num_values * tf.log(self.likelihood.variance)
    bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    # Trace correction tr(Kff - Qff) / sigma^2 applies once per output column,
    # exactly like the log-determinant term above.
    bound += -0.5 * output_dim_f * (
        tf.reduce_sum(Kdiag) / self.likelihood.variance
        - tf.reduce_sum(tf.diag_part(AAT)))
    return bound
def gauss_kl(q_mu, q_sqrt, K, num_latent):
    """
    KL divergence KL[q || p] with q(x) = N(q_mu, q_sqrt q_sqrt^T) and
    p(x) = N(0, K).

    There are num_latent independent distributions: the columns of q_mu and
    the slices q_sqrt[:, :, d] along the last dimension of q_sqrt.

    q_mu: matrix whose columns are the means.
    q_sqrt: 3D tensor; each slice along the last axis is a lower-triangular
        square-root of a covariance of q (the upper triangle is masked out).
    K: positive definite matrix, the covariance of p.
    num_latent: Python integer, the number of independent distributions
        (it drives a Python-level loop).
    """
    chol_prior = tf.cholesky(K)
    whitened_mean = tf.matrix_triangular_solve(chol_prior, q_mu, lower=True)

    # Mahalanobis term.
    mahalanobis = 0.5 * tf.reduce_sum(tf.square(whitened_mean))
    # Prior log-det term.
    prior_logdet = num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(chol_prior))))
    # Constant term.
    constant = -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)

    total = mahalanobis + prior_logdet + constant
    for latent in range(num_latent):
        # Keep only the lower triangle of this latent's square-root factor.
        chol_q = tf.batch_matrix_band_part(q_sqrt[:, :, latent], -1, 0)
        # Log determinant of q covariance:
        q_logdet = -0.5 * tf.reduce_sum(
            tf.log(tf.square(tf.diag_part(chol_q))))
        # Trace term: squared Frobenius norm of L^{-1} Lq.
        whitened_chol_q = tf.matrix_triangular_solve(
            chol_prior, chol_q, lower=True)
        trace = 0.5 * tf.reduce_sum(tf.square(whitened_chol_q))
        total = total + q_logdet + trace
    return total
def compute_upper_bound(self):
    """
    Build the tensor for the upper bound, returned as const + logdet + quad.

    Reads self.X, self.Y, self.kern, self.feature and
    self.likelihood.variance. The noise variance is inflated by the trace
    bound c = tr(Kff) - tr(Qff) ("Trace bound" from Titsias' presentation)
    before the quadratic term is evaluated.
    """
    n_points = tf.cast(tf.shape(self.Y)[0], settings.float_type)
    noise = self.likelihood.variance

    Kdiag = self.kern.Kdiag(self.X)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
    Kuf = self.feature.Kuf(self.kern, self.X)
    KufKfu = tf.matmul(Kuf, Kuf, transpose_b=True)

    chol_Kuu = tf.cholesky(Kuu)
    chol_B = tf.cholesky(Kuu + noise ** -1.0 * KufKfu)
    whitened_Kuf = tf.matrix_triangular_solve(chol_Kuu, Kuf, lower=True)

    # Trace bound: sum(Kdiag) is tr(Kff), sum(whitened_Kuf^2) is tr(Qff).
    # (An alternative, bounding the max eigenvalue by
    #  reduce_max(reduce_sum(|Kff - Qff|, 0)), was left disabled upstream.)
    trace_gap = tf.reduce_sum(Kdiag) - tf.reduce_sum(whitened_Kuf ** 2.0)
    inflated_noise = noise + trace_gap

    const = -0.5 * n_points * tf.log(2 * np.pi * noise)
    logdet = (tf.reduce_sum(tf.log(tf.diag_part(chol_Kuu)))
              - tf.reduce_sum(tf.log(tf.diag_part(chol_B))))

    chol_C = tf.cholesky(Kuu + inflated_noise ** -1.0 * KufKfu)
    rhs = inflated_noise ** -1.0 * tf.matmul(Kuf, self.Y)
    v = tf.matrix_triangular_solve(chol_C, rhs, lower=True)
    quad = (-0.5 * inflated_noise ** -1.0 * tf.reduce_sum(self.Y ** 2.0)
            + 0.5 * tf.reduce_sum(v ** 2.0))

    return const + logdet + quad
def gauss_kl(min_q_mu, q_sq, K):
    """
    KL divergence KL[q || p] with q(x) = N(-min_q_mu, q_sq) and p(x) = N(0, K).

    min_q_mu: the negated mean of q (it is multiplied by -1 before use).
    q_sq: covariance matrix of q; it is Cholesky-factorised here, so it must
        be positive definite.
    K: positive definite covariance matrix of p.

    Returns half the sum of: the quadratic form q_mu^T K^{-1} q_mu, the trace
    tr(K^{-1} q_sq), minus the dimension, plus log|K| - log|q_sq|.

    NOTE(review): `h.Mul` is presumably a matrix product and `h.get_dim(K, 0)`
    the size of K's first axis -- confirm against the helper module.
    """
    q_mu = -1 * min_q_mu

    chol_prior = tf.cholesky(K)
    chol_q = tf.cholesky(q_sq)

    # Mahalanobis term.
    quad_form = h.Mul(tf.transpose(q_mu), tf.cholesky_solve(chol_prior, q_mu))
    # Trace term.
    trace = tf.trace(tf.cholesky_solve(chol_prior, q_sq))
    # Constant (dimension) term.
    dim = h.get_dim(K, 0)
    # Log-determinant difference, read off the Cholesky diagonals.
    logdet_gap = tf.reduce_sum(
        2*tf.log(tf.diag_part(chol_prior)) - 2*tf.log(tf.diag_part(chol_q)))

    return (quad_form + trace - dim + logdet_gap) / 2
def log_det(Z):
    """
    Log-determinant of Z computed through a Cholesky factorisation.

    Z is symmetrised as (Z + Z^T) / 2 first, so small asymmetries from
    round-off do not reach the factorisation. The result is
    2 * sum(log(diag(chol(Z)))), which is only valid when the symmetrised
    matrix is positive definite.
    """
    Z = (Z + tf.transpose(Z)) / 2
    return 2 * tf.reduce_sum(tf.log(tf.diag_part(tf.cholesky(Z))))
def multivariate_gaussian_log_density(x, mu, Sigma=None, L=None, prec=None, L_prec=None):
    """
    Log density of a single vector x under N(mu, Sigma).

    The distribution may be parameterised by the covariance matrix Sigma or
    its Cholesky factor L (more efficient if available), or by the precision
    matrix prec or its Cholesky factor L_prec. The latter is useful when the
    Gaussian is kept in its natural parameterisation. The explicit mean mu is
    always required (not the natural parameter prec*mu).

    x: vector of shape (n,) or column of shape (n, 1), as reported by
        extract_shape.
    mu: mean, such that x - mu can be reshaped to (n, 1).
    Sigma, L, prec, L_prec: at least one must be given. L (or Sigma) takes
        priority over the precision forms.

    Raises AssertionError if x is two-dimensional with a second dimension
    other than 1, or if no covariance/precision argument was supplied.
    """
    s = extract_shape(x)
    try:
        n, = s
    except ValueError:
        # Shape is not a 1-tuple: accept an explicit (n, 1) column only.
        n, m = s
        assert(m == 1)

    if L is None and Sigma is not None:
        L = tf.cholesky(Sigma)
    if L_prec is None and prec is not None:
        L_prec = tf.cholesky(prec)

    # -0.5 * log|Sigma| is -sum(log diag L), or +sum(log diag L_prec).
    if L is not None:
        neg_half_logdet = -tf.reduce_sum(tf.log(tf.diag_part(L)))
    else:
        assert(L_prec is not None)
        neg_half_logdet = tf.reduce_sum(tf.log(tf.diag_part(L_prec)))

    d = tf.reshape(x - mu, (n, 1))
    if L is not None:
        # ||L^{-1} d||^2 = d^T Sigma^{-1} d
        alpha = tf.matrix_triangular_solve(L, d, lower=True)
        exponential_part = tf.reduce_sum(tf.square(alpha))
    elif prec is not None:
        exponential_part = tf.reduce_sum(d * tf.matmul(prec, d))
    else:
        assert(L_prec is not None)
        # ||d^T L_prec||^2 = d^T prec d
        d_row = tf.reshape(d, (1, n))
        alpha = tf.matmul(d_row, L_prec)
        exponential_part = tf.reduce_sum(tf.square(alpha))

    n_log2pi = n * 1.83787706641  # n * log(2*pi)
    logp = -0.5 * n_log2pi
    logp += neg_half_logdet
    logp += -0.5 * exponential_part
    return logp
def multivariate_gaussian_entropy(Sigma=None, L=None, L_prec=None):
    """
    Entropy of a multivariate Gaussian, 0.5 * n * (1 + log(2*pi)) + 0.5 * log|Sigma|.

    The covariance may be given directly (Sigma), as its Cholesky factor (L),
    or through the Cholesky factor of the precision (L_prec). L is used when
    supplied, otherwise it is computed from Sigma; L_prec is the fallback
    when neither is given. The dimension n is taken from the first entry of
    the factor's shape (via extract_shape).
    """
    cov_factor = L
    if cov_factor is None and Sigma is not None:
        cov_factor = tf.cholesky(Sigma)

    if cov_factor is None:
        # log|Sigma| = -log|prec|, hence the sign flip.
        half_logdet = -tf.reduce_sum(tf.log(tf.diag_part(L_prec)))
        n, _ = extract_shape(L_prec)
    else:
        half_logdet = tf.reduce_sum(tf.log(tf.diag_part(cov_factor)))
        n, _ = extract_shape(cov_factor)

    log_2pi = 1.83787706641
    return .5*n*(1 + log_2pi) + half_logdet
def diagPartOp(self, tensor, dtype, expected_ans, use_gpu=False):
    """Check that tf.diag_part(tensor) matches expected_ans in value and shape.

    tensor is a numpy array; it is converted with astype(dtype) before being
    turned into a TensorFlow tensor.
    """
    with self.test_session(use_gpu=use_gpu):
        typed_input = tensor.astype(dtype)
        diag_op = tf.diag_part(tf.convert_to_tensor(typed_input))
        diag_value = diag_op.eval()
    self.assertAllClose(diag_value, expected_ans)
    self.assertShapeEqual(expected_ans, diag_op)
def gauss_kl_white(q_mu, q_sqrt, num_latent):
    """
    KL divergence KL[q || p] with q(x) = N(q_mu, q_sqrt q_sqrt^T) and
    p(x) = N(0, I).

    There are num_latent independent distributions: the columns of q_mu and
    the slices q_sqrt[:, :, d] along the last dimension of q_sqrt.

    q_mu: matrix whose columns are the means.
    q_sqrt: 3D tensor; each slice along the last axis is a lower-triangular
        square-root of a covariance (the upper triangle is masked out).
    num_latent: Python integer, the number of independent distributions.
    """
    # Mahalanobis term.
    mahalanobis = 0.5 * tf.reduce_sum(tf.square(q_mu))
    # Constant term.
    constant = -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)

    total = mahalanobis + constant
    for latent in range(num_latent):
        chol_q = tf.batch_matrix_band_part(q_sqrt[:, :, latent], -1, 0)
        # Log determinant of q covariance:
        q_logdet = 0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(chol_q))))
        # Trace term.
        trace = 0.5 * tf.reduce_sum(tf.square(chol_q))
        total = total - q_logdet + trace
    return total
def build_likelihood(self):
    r"""
    q_alpha, q_lambda are variational parameters, size N x R

    This method computes the variational lower bound on the likelihood,
    which is:

        E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)]

    with

        q(f) = N(f | K alpha + mean, [K^-1 + diag(square(lambda))]^-1) .

    The prior p(f) has the same mean function, so the KL divergence only
    depends on the mean *difference* K alpha; its quadratic term is
    alpha^T K alpha and must not include the mean function.
    """
    K = self.kern.K(self.X)
    K_alpha = tf.matmul(K, self.q_alpha)
    f_mean = K_alpha + self.mean_function(self.X)

    # For each of the data-dimensions (columns of Y), find the diagonal of
    # the variance, and also the relevant parts of the KL.
    f_var = []
    A_logdet = tf.zeros((1,), tf.float64)
    trAi = tf.zeros((1,), tf.float64)
    for d in range(self.num_latent):
        b = self.q_lambda[:, d]
        B = tf.expand_dims(b, 1)
        # A = I + diag(b) K diag(b)
        A = eye(self.num_data) + K * B * tf.transpose(B)
        L = tf.cholesky(A)
        Li = tf.matrix_triangular_solve(L, eye(self.num_data), lower=True)
        LiBi = Li / b
        # Diagonal of the full covariance diag(b**-2) - LiBi^T LiBi.
        f_var.append(1. / tf.square(b) - tf.reduce_sum(tf.square(LiBi), 0))
        A_logdet += 2 * tf.reduce_sum(tf.log(tf.diag_part(L)))
        # tr(A^-1) = ||L^-1||_F^2
        trAi += tf.reduce_sum(tf.square(Li))
    f_var = tf.transpose(tf.pack(f_var))

    KL = 0.5 * (A_logdet + trAi - self.num_data * self.num_latent
                + tf.reduce_sum(K_alpha * self.q_alpha))
    return tf.reduce_sum(
        self.likelihood.variational_expectations(f_mean, f_var, self.Y)) - KL
def initialize(self, *args, **kwargs):
    """Initialize MAP on point-mass stand-ins and build the finalize ops.

    The user-supplied normal approximations in ``self.latent_vars`` are
    temporarily replaced by ``PointMass`` variables built from each
    approximation's ``loc`` before the parent ``initialize`` runs; they are
    restored at the end. ``self.finalize_ops`` is populated with one
    ``assign`` op per latent variable, each writing a quantity derived from
    the Hessian of ``self.loss`` into the variable found underneath the
    approximation's variance/covariance.

    All positional and keyword arguments are forwarded unchanged to the
    parent class.
    """
    # Store latent variables in a temporary attribute; MAP will
    # optimize `PointMass` random variables, which subsequently
    # optimizes mean parameters of the normal approximations.
    latent_vars_normal = self.latent_vars.copy()
    self.latent_vars = {z: PointMass(params=qz.loc)
                        for z, qz in six.iteritems(latent_vars_normal)}

    super(Laplace, self).initialize(*args, **kwargs)

    # One Hessian per point-mass variable; the value order here and the key
    # order in the loop below both come from the same dict.
    hessians = tf.hessians(self.loss, list(six.itervalues(self.latent_vars)))
    self.finalize_ops = []
    for z, hessian in zip(six.iterkeys(self.latent_vars), hessians):
        qz = latent_vars_normal[z]
        if isinstance(qz, (MultivariateNormalDiag, Normal)):
            # Diagonal case: reciprocal of the Hessian diagonal.
            # NOTE(review): this is assigned to the first variable found
            # under qz.variance(); if that variable is a scale (std-dev)
            # rather than a variance, a square root may be missing -- confirm.
            scale_var = get_variables(qz.variance())[0]
            scale = 1.0 / tf.diag_part(hessian)
        else:  # qz is MultivariateNormalTriL
            # NOTE(review): inverse of the Hessian's Cholesky factor, assigned
            # to the first variable under qz.covariance(); confirm this is the
            # intended factor of the inverse Hessian.
            scale_var = get_variables(qz.covariance())[0]
            scale = tf.matrix_inverse(tf.cholesky(hessian))

        self.finalize_ops.append(scale_var.assign(scale))

    # Restore the original normal approximations.
    self.latent_vars = latent_vars_normal.copy()
    del latent_vars_normal
def gauss_kl(q_mu, q_sqrt, K):
    """
    KL divergence KL[q || p] with q(x) = N(q_mu, q_sqrt q_sqrt^T) and
    p(x) = N(0, K).

    Several independent distributions are handled at once: the columns of
    q_mu and the slices along the last dimension of q_sqrt.

    q_mu: matrix whose columns are the means.
    q_sqrt: 3D tensor; each slice along the last axis is a lower-triangular
        square-root of a covariance of q. It is transposed to put that axis
        first and the upper triangles are masked out.
    K: positive definite matrix, the covariance of p.
    """
    chol_prior = tf.cholesky(K)
    whitened_mean = tf.matrix_triangular_solve(chol_prior, q_mu, lower=True)
    # Mahalanobis term.
    mahalanobis = 0.5 * tf.reduce_sum(tf.square(whitened_mean))

    # Prior log-det term, counted once per latent distribution.
    num_latent = tf.cast(tf.shape(q_sqrt)[2], float_type)
    prior_logdet = num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(chol_prior))))

    # Constant term: product of the last two dimensions of q_sqrt.
    constant = -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type)

    # Batch of square-root factors with the lower triangle forced.
    batched_sqrt = tf.transpose(q_sqrt, (2, 0, 1))
    chol_q = tf.matrix_band_part(batched_sqrt, -1, 0)
    # Log-det of the covariances of q.
    q_logdet = -0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.matrix_diag_part(chol_q))))

    # Trace term: tile the prior factor so the solve is batched.
    tile_multiples = tf.pack([tf.shape(chol_q)[0], 1, 1])
    chol_prior_tiled = tf.tile(tf.expand_dims(chol_prior, 0), tile_multiples)
    whitened_chol_q = tf.matrix_triangular_solve(
        chol_prior_tiled, chol_q, lower=True)
    trace = 0.5 * tf.reduce_sum(tf.square(whitened_chol_q))

    return mahalanobis + prior_logdet + constant + q_logdet + trace
def logpdf(self, x, mean=None, cov=1):
    """Log of the probability density function.

    Parameters
    ----------
    x : tf.Tensor
        A 1-D or 2-D tensor.
    mean : tf.Tensor, optional
        A 1-D tensor. Defaults to zero mean.
    cov : tf.Tensor, optional
        A 1-D or 2-D tensor. Defaults to identity matrix. A 1-D tensor is
        read as the diagonal of the covariance. The Python scalar ``1``
        (or ``1.0``) selects the identity.

    Returns
    -------
    tf.Tensor
        A tensor of one dimension less than the input.
    """
    x = tf.cast(x, dtype=tf.float32)
    x_shape = get_dims(x)
    if len(x_shape) == 1:
        d = x_shape[0]
    else:
        d = x_shape[1]

    if mean is None:
        r = x
    else:
        mean = tf.cast(mean, dtype=tf.float32)
        r = x - mean

    # Compare by value, and only for plain Python numbers: `cov is 1` relied
    # on CPython's small-int caching, and `==` on a tensor/array is not a
    # scalar truth value.
    if isinstance(cov, (int, float)) and cov == 1:
        L_inv = tf.diag(tf.ones([d]))
        det_cov = tf.constant(1.0)
    else:
        cov = tf.cast(cov, dtype=tf.float32)
        if len(cov.get_shape()) == 1:  # vector
            L_inv = tf.diag(1.0 / tf.sqrt(cov))
            det_cov = tf.reduce_prod(cov)
        else:  # matrix
            L = tf.cholesky(cov)
            L_inv = tf.matrix_inverse(L)
            det_cov = tf.pow(tf.reduce_prod(tf.diag_part(L)), 2)

    # Normalising constant shared by every row of x.
    lps = -0.5*d*tf.log(2*np.pi) - 0.5*tf.log(det_cov)
    if len(x_shape) == 1:  # vector
        r = tf.reshape(r, shape=(d, 1))
        inner = tf.matmul(L_inv, r)
        lps -= 0.5 * tf.matmul(inner, inner, transpose_a=True)
        return tf.squeeze(lps)
    else:  # matrix
        # TODO vectorize further
        out = []
        for r_vec in tf.unpack(r):
            r_vec = tf.reshape(r_vec, shape=(d, 1))
            inner = tf.matmul(L_inv, r_vec)
            out += [tf.squeeze(lps - 0.5 * tf.matmul(inner, inner, transpose_a=True))]

        return tf.pack(out)
def test(self):
    """Kdiag(X) must equal the diagonal of the full kernel matrix K(X).

    Runs the check for every kernel in self.kernels, feeding the kernel's
    free state into self.x_free and self.X_data into self.X.
    """
    for k in self.kernels:
        with k.tf_mode():
            k1 = k.Kdiag(self.X)
            k2 = tf.diag_part(k.K(self.X))
        feed = {self.x_free: k.get_free_state(), self.X: self.X_data}
        # Close the session deterministically instead of leaking one per kernel.
        with tf.Session() as sess:
            k1, k2 = sess.run([k1, k2], feed_dict=feed)
        self.assertTrue(np.allclose(k1, k2))
def pred(X, X_m_1, mu, len_sc_1, noise_1):
    """
    Predictive mean and covariance at X from values mu at the points X_m_1.

    All three kernel matrices come from h.tf_SE_K with the same len_sc_1 and
    noise_1 arguments.

    Returns (posterior_mean, diagonal of the covariance reshaped to [N, 1],
    full covariance). N is a module-level name.
    """
    K_mm = h.tf_SE_K(X_m_1, X_m_1, len_sc_1, noise_1)
    K_nm = h.tf_SE_K(X, X_m_1, len_sc_1, noise_1)

    # K_nm K_mm^{-1} mu
    weights = tf.matrix_solve(K_mm, mu)
    posterior_mean = h.Mul(K_nm, weights)

    # K_nn - K_nm K_mm^{-1} K_mn
    K_nn = h.tf_SE_K(X, X, len_sc_1, noise_1)
    projected = tf.matrix_solve(K_mm, tf.transpose(K_nm))
    full_cov = K_nn - h.Mul(K_nm, projected)

    marginal_var = tf.reshape(tf.diag_part(full_cov), [N, 1])
    return posterior_mean, marginal_var, full_cov
def predict2():
    """
    Predictions built entirely from module-level tensors.

    Returns (mean, var_terms), where var_terms is twice the square root of
    the predictive variance diagonal plus sigma_2 squared, reshaped to [N, 1].

    NOTE(review): the mean is solved against K_mm_1 while every other
    quantity here uses the *_2 tensors -- confirm this is intentional.
    """
    noise_var = tf.square(sigma_2)

    # predictions
    inner_inverse = tf.matrix_inverse(K_mm_2 + K_mnnm_2/noise_var)
    cov = h.Mul(K_mm_2, inner_inverse, K_mm_2)
    cov_chol = tf.cholesky(cov)
    mu = h.Mul(K_mm_2, tf.cholesky_solve(cov_chol, K_mn_2), Ytr)/tf.square(sigma_2)
    mean = h.Mul(K_nm_2, tf.matrix_solve(K_mm_1, mu))

    reduction = h.Mul(K_nm_2, h.safe_chol(K_mm_2, tf.transpose(K_nm_2)))
    variance = K_nn_2 - reduction
    noisy_diag = tf.diag_part(variance) + tf.square(sigma_2)
    var_terms = 2*tf.sqrt(tf.reshape(noisy_diag, [N, 1]))
    return mean, var_terms
def _compute_predictions(self, init = None):
    """ Build the state sequence and the predictions derived from it.

    Creates the transition matrix ``self.W_trans``, the emission matrix
    ``self.W_emit`` (a trainable variable, or a fixed copy of
    ``self.emission_init`` when that is given), the scalar
    ``self.emission_similarity``, and then scans ``self._rnn_step_fw`` over
    ``self.inputs`` starting from ``init`` (or a uniform vector summing to
    one when ``init`` is None).

    Returns ``(states, predictions)`` where ``predictions`` is
    ``states`` multiplied by a fixed identity matrix of size
    ``self.target_size``.

    Raises ValueError if ``self.emission_init`` does not have shape
    ``(self.hidden_layer_size, self.input_size)``.

    NOTE(review): the nesting of the variable scopes below was reconstructed
    from a whitespace-collapsed source -- confirm against the original file.
    """
    with tf.variable_scope('states'):
        with tf.variable_scope("HMM"):
            with tf.variable_scope("transition"):
                skip_prob = tf.get_variable("skip", shape=[1],
                                            initializer=tf.constant_initializer(1e-1))
                # Earlier alternative, kept for reference:
                #skip_prob = tf.Variable( np.array(1e-1, dtype=np.float32), name="skip")
                # .astype(np.float32)
                # Convex mix of the fixed transition matrix and the identity,
                # weighted by the learned skip probability.
                self.W_trans = (1-skip_prob) * get_transition_matrix().astype(np.float32) + skip_prob* np.eye(self.hidden_layer_size).astype(np.float32)
                #self.W_trans = tf.Variable( transition_with_skips,
                # name='W_trans', trainable=True)
                print("W_trans", self.W_trans.get_shape())
            with tf.variable_scope("emission"):
                # NOTE(review): the string below says [input_size, hidden_layer_size]
                # but the variable is created as [hidden_layer_size, input_size].
                "W_emit: [self.input_size, self.hidden_layer_size]"
                if self.emission_init is None:
                    self.W_emit = tf.get_variable("W_emit",
                                                  shape = [self.hidden_layer_size, self.input_size],
                                                  initializer = tf.random_normal_initializer(0.0, 1e-6))
                else:
                    # A supplied emission matrix must match the layer sizes;
                    # it is stored as a non-trainable variable.
                    if not (self.emission_init.shape == (self.hidden_layer_size, self.input_size)):
                        print("self.emission_init.shape", self.emission_init.shape)
                        print("(self.hidden_layer_size, self.input_size)", (self.hidden_layer_size, self.input_size))
                        raise ValueError("wrong dimensions of `self.emission_init`")
                    self.W_emit = tf.Variable(self.emission_init.astype(np.float32),
                                              name = "W_emit", trainable = False)
                # Image summary of the emission matrix (one single-channel image).
                self.W_emit_summary = tf.image_summary("W_emit",
                                                       tf.reshape(self.W_emit,
                                                                  [1,self.hidden_layer_size, self.input_size,1]))
                "idea: impose kernel similarity: maximize(W K W)"
                "[ self.hidden_layer_size, self.nt_in_pore ]"
                emission_in_pore_space = tf.matmul( self.map_hex_to_pore, self.W_emit)
                # Trace of E^T E, i.e. the sum of squared entries of E.
                self.emission_similarity = tf.reduce_sum(
                    tf.diag_part( tf.matmul( tf.transpose(emission_in_pore_space),(emission_in_pore_space)) ),
                    name="emission_w_similarity")
        if init is None:
            # Uniform initial state: ones divided by the number of hidden units.
            initial_state = tf.ones([self.hidden_layer_size],
                                    name='initial_state')
            initial_state = initial_state/ self.hidden_layer_size
        else:
            initial_state = init
        #states = self._rnn_step_fw(initial_state[:,0], self.inputs[0,:])
        states = functional_ops.scan(self._rnn_step_fw,
                                     tf.identity(self.inputs),
                                     initializer=initial_state, name='states')
        # Created for its graph side effect (registers a histogram summary).
        states_fw_summary = tf.histogram_summary("states_fw", states)
        #states = states_fw
        #print("states:", states.get_shape())

    with tf.variable_scope('predictions'):
        # set some explicit initializer, orthogonal initialization
        "for now, keep identity mapping from hidden states to labels"
        "assume probability interpretation of values: should sum to one"
        W_pred = tf.Variable(np.eye(self.target_size, dtype = np.float32), name="W_pred", trainable=False)
        predictions = tf.matmul(states, W_pred, name='predictions')
        #predictions = states
        # Created for its graph side effect (registers a histogram summary).
        predictions_summary = tf.histogram_summary("predictions", predictions)
        #predictions = tf.nn.softmax(tf.matmul(states, W_pred), name='predictions'))
        # do predictions sum to one?

    return states, predictions
def diagOp(self, diag, dtype, expected_ans, use_gpu=False):
    """Check tf.diag and tf.diag_part against each other.

    tf.diag(diag.astype(dtype)) must equal expected_ans, and
    tf.diag_part(expected_ans) must recover diag; values and shapes are
    both asserted.
    """
    with self.test_session(use_gpu=use_gpu):
        forward_op = tf.diag(tf.convert_to_tensor(diag.astype(dtype)))
        forward_value = forward_op.eval()
        inverse_op = tf.diag_part(expected_ans)
        inverse_value = inverse_op.eval()
    self.assertAllClose(forward_value, expected_ans)
    self.assertAllClose(inverse_value, diag)
    self.assertShapeEqual(expected_ans, forward_op)
    self.assertShapeEqual(diag, inverse_op)
def predict(K_mn, sigma, K_mm, K_nn):
    """
    Predictive mean and a spread term from the given kernel blocks.

    K_mn, K_mm, K_nn: kernel matrices; K_mn is transposed to obtain K_nm.
    sigma: noise standard deviation (it is squared wherever it is used).

    Also reads the module-level names M, N, Ytr and K_mnnm_2.

    Returns (mean, var_terms), where var_terms is twice the square root of
    the predictive variance diagonal plus sigma squared, reshaped to [N, 1].
    """
    # predictions
    K_nm = tf.transpose(K_mn)
    noise_var = tf.square(sigma)

    # Regularised system matrix (0.1 added on the diagonal).
    Sig_Inv = 1e-1*np.eye(M) + K_mm + K_mnnm_2/noise_var
    mu_post = h.Mul(tf.matrix_solve(Sig_Inv, K_mn), Ytr)/tf.square(sigma)
    mean = h.Mul(K_nm, mu_post)

    prior_reduction = h.Mul(K_nm, h.safe_chol(K_mm, K_mn))
    posterior_gain = h.Mul(K_nm, tf.matrix_solve(Sig_Inv, K_mn))
    variance = K_nn - prior_reduction + posterior_gain

    noisy_diag = tf.diag_part(variance) + tf.square(sigma)
    var_terms = 2*tf.sqrt(tf.reshape(noisy_diag, [N, 1]))
    return mean, var_terms
def test(self):
    """Kdiag(X) must equal the diagonal of K(X) for every kernel.

    Each kernel is initialised in the test session and evaluated on 30
    random points of dimension self.dim drawn from RandomState(1).
    """
    with self.test_context() as session:
        for kern in self.kernels:
            kern.initialize(session=session, force=True)
            X = tf.placeholder(tf.float64, [30, self.dim])
            X_data = np.random.RandomState(1).randn(30, self.dim)
            diag_direct = kern.Kdiag(X)
            diag_from_full = tf.diag_part(kern.K(X))
            diag_direct, diag_from_full = session.run(
                [diag_direct, diag_from_full], feed_dict={X: X_data})
            self.assertTrue(np.allclose(diag_direct, diag_from_full))
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.

    The bound is assembled from the psi statistics returned by
    ke.build_psi_stats for the variational distribution (self.X_mean,
    self.X_var), minus the KL divergence between that distribution and the
    Gaussian prior (self.X_prior_mean, self.X_prior_var).
    """
    num_inducing = tf.shape(self.Z)[0]
    psi0, psi1, psi2 = ke.build_psi_stats(self.Z, self.kern, self.X_mean, self.X_var)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    L = tf.cholesky(Kuu)
    noise_var = self.likelihood.variance
    noise_std = tf.sqrt(noise_var)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / noise_std
    half_whitened_psi2 = tf.matrix_triangular_solve(L, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(
        L, tf.transpose(half_whitened_psi2), lower=True) / noise_var
    B = AAT + eye(num_inducing)
    LB = tf.cholesky(B)
    log_det_B = 2. * tf.reduce_sum(tf.log(tf.diag_part(LB)))
    c = tf.matrix_triangular_solve(LB, tf.matmul(A, self.Y), lower=True) / noise_std

    # KL[q(x) || p(x)]
    NQ = tf.cast(tf.size(self.X_mean), tf.float64)
    D = tf.cast(tf.shape(self.Y)[1], tf.float64)
    scaled_second_moment = (
        tf.square(self.X_mean - self.X_prior_mean) + self.X_var
    ) / self.X_prior_var
    KL = (-0.5*tf.reduce_sum(tf.log(self.X_var))
          + 0.5*tf.reduce_sum(tf.log(self.X_prior_var))
          - 0.5 * NQ
          + 0.5 * tf.reduce_sum(scaled_second_moment))

    # compute log marginal bound
    ND = tf.cast(tf.size(self.Y), tf.float64)
    gaussian_const = -0.5 * ND * tf.log(2 * np.pi * noise_var)
    logdet_term = -0.5 * D * log_det_B
    data_fit = -0.5 * tf.reduce_sum(tf.square(self.Y)) / noise_var
    projected_fit = 0.5 * tf.reduce_sum(tf.square(c))
    trace_term = -0.5 * D * (tf.reduce_sum(psi0) / noise_var -
                             tf.reduce_sum(tf.diag_part(AAT)))
    return (gaussian_const + logdet_term + data_fit + projected_fit
            + trace_term - KL)
def test_multivariate_normal_diag(self):
    """Laplace inference with MultivariateNormalDiag approximations.

    After 100 iterations the shared checks in self._test are run, and the
    covariance of each approximation is asserted to equal its own diagonal
    embedded in a matrix, i.e. to have no off-diagonal mass.
    """
    with self.test_session() as sess:
        N, D, w_true, X_train, y_train, X, w, b, y = self._setup()

        # INFERENCE. Initialize scales at identity to verify if we
        # learned an approximately zero determinant.
        qw_loc = tf.Variable(tf.random_normal([D]))
        qw_scale = tf.Variable(tf.ones(D))
        qw = MultivariateNormalDiag(loc=qw_loc, scale_diag=qw_scale)
        qb_loc = tf.Variable(tf.random_normal([1]))
        qb_scale = tf.Variable(tf.ones(1))
        qb = MultivariateNormalDiag(loc=qb_loc, scale_diag=qb_scale)

        inference = ed.Laplace({w: qw, b: qb}, data={X: X_train, y: y_train})
        inference.run(n_iter=100)

        self._test(sess, qw, qb, w_true)
        for approx in (qw, qb):
            diagonal_only = tf.diag(tf.diag_part(approx.covariance()))
            self.assertAllClose(approx.covariance().eval(),
                                diagonal_only.eval())
def decov_loss(xs):
    """Decov loss as described in https://arxiv.org/pdf/1511.06068.pdf
    'Reducing Overfitting In Deep Networks by Decorrelating Representation'

    xs is flattened to (batch, features) using its static first dimension.
    The loss is half the squared Frobenius norm of the batch covariance with
    the squared diagonal removed, i.e. it penalises off-diagonal entries only.
    """
    batch_size = int(xs.get_shape()[0])
    flat = tf.reshape(xs, [batch_size, -1])
    centered = flat - tf.reduce_mean(flat, 0, True)

    # Per-sample outer products, averaged over the batch.
    as_columns = tf.expand_dims(centered, 2)
    as_rows = tf.transpose(as_columns, perm=[0,2,1])
    covariance = tf.reduce_mean(tf.matmul(as_columns, as_rows), 0)

    frobenius_sq = tf.reduce_sum(tf.square(covariance))
    diagonal_sq = tf.reduce_sum(tf.square(tf.diag_part(covariance)))
    return 0.5*(frobenius_sq - diagonal_sq)
def testRankFourFloatTensorUnknownShape(self):
    """tf.diag_part must work when the static shape is partly or fully unknown.

    A random 3x3 float32 matrix is used with its static shape overridden to
    None, (None, 3) and (3, None) in turn.
    """
    x = np.random.rand(3, 3)
    idx = np.arange(3)
    expected_diag = x[idx, idx]
    partial_shapes = (None, (None, 3), (3, None))
    for static_shape in partial_shapes:
        with self.test_session(use_gpu=False):
            tensor = tf.convert_to_tensor(x.astype(np.float32))
            tensor.set_shape(static_shape)
            diag_op = tf.diag_part(tensor)
            diag_value = diag_op.eval()
        self.assertAllClose(diag_value, expected_diag)
        self.assertShapeEqual(expected_diag, diag_op)
def testDiagPartGrad(self):
    """Numerically check the gradient of tf.diag_part.

    Covers rank-2 and rank-4 inputs in float32 and float64; the gradient
    error reported by tf.test.compute_gradient_error must stay below 1e-4.
    """
    np.random.seed(0)
    shapes = ((3, 3), (3, 3, 3, 3))
    dtypes = (tf.float32, tf.float64)
    with self.test_session(use_gpu=False):
        for shape in shapes:
            for dtype in dtypes:
                x1 = tf.constant(np.random.rand(*shape), dtype=dtype)
                y = tf.diag_part(x1)
                error = tf.test.compute_gradient_error(
                    x1, x1.get_shape().as_list(),
                    y, y.get_shape().as_list())
                tf.logging.info("error = %f", error)
                self.assertLess(error, 1e-4)
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.

    For a derivation of the terms in here, see the associated SGPR notebook.

    Reads self.X, self.Y, self.Z, self.kern, self.mean_function and
    self.likelihood.variance; returns a scalar tensor.

    The columns of Y are treated as independent outputs sharing one kernel,
    so every term that is computed once from the kernel (log-determinant and
    trace correction) is scaled by the number of output columns.
    """
    num_inducing = tf.shape(self.Z)[0]
    num_data = tf.cast(tf.shape(self.Y)[0], settings.dtypes.float_type)
    output_dim = tf.cast(tf.shape(self.Y)[1], settings.dtypes.float_type)

    err = self.Y - self.mean_function(self.X)
    Kdiag = self.kern.Kdiag(self.X)
    Kuf = self.kern.K(self.Z, self.X)
    # Jitter on the diagonal keeps the Cholesky factorisation stable.
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * settings.numerics.jitter_level
    L = tf.cholesky(Kuu)
    sigma = tf.sqrt(self.likelihood.variance)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    AAT = tf.matmul(A, tf.transpose(A))
    B = AAT + eye(num_inducing)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma

    # compute log marginal bound
    bound = -0.5 * num_data * output_dim * np.log(2 * np.pi)
    bound += -output_dim * tf.reduce_sum(tf.log(tf.diag_part(LB)))
    bound -= 0.5 * num_data * output_dim * tf.log(self.likelihood.variance)
    bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    # Trace correction tr(Kff - Qff) / sigma^2 applies once per output column,
    # exactly like the log-determinant term above.
    bound += -0.5 * output_dim * tf.reduce_sum(Kdiag) / self.likelihood.variance
    bound += 0.5 * output_dim * tf.reduce_sum(tf.diag_part(AAT))

    return bound
def multivariate_normal(x, mu, L):
    """
    Log density of x under N(mu, L L^T), summed over columns.

    L is the Cholesky decomposition of the covariance.

    x and mu are either vectors (ndim=1) or matrices. In the matrix case, we
    assume independence over the *columns*: the number of rows must match the
    size of L.

    The vector/matrix decision uses the *static* rank of x - mu, because a
    Python conditional cannot branch on the tensor returned by tf.rank. When
    the static rank is unknown the input is treated as a matrix.

    Constants are cast to tf.float32.
    """
    d = x - mu
    # Static rank: TensorFlow tensors expose get_shape(), numpy inputs do not.
    static_rank = d.get_shape().ndims if hasattr(d, "get_shape") else np.ndim(d)
    is_vector = static_rank == 1
    if is_vector:
        # The triangular solve needs a matrix right-hand side.
        d = tf.expand_dims(d, 1)
    alpha = tf.matrix_triangular_solve(L, d, lower=True)

    num_col = 1 if is_vector else tf.shape(x)[1]
    num_col = tf.cast(num_col, tf.float32)
    num_dims = tf.cast(tf.shape(x)[0], tf.float32)

    ret = - 0.5 * num_dims * num_col * np.log(2 * np.pi)
    ret += - num_col * tf.reduce_sum(tf.log(tf.diag_part(L)))
    ret += - 0.5 * tf.reduce_sum(tf.square(alpha))
    return tf.reduce_sum(ret)
def multivariate_normal(x, mu, L):
    """
    Log density of x under N(mu, L L^T), summed over columns.

    L is the Cholesky decomposition of the covariance.

    x and mu are either vectors (ndim=1) or matrices. In the matrix case, we
    assume independence over the *columns*: the number of rows must match the
    size of L.

    The vector/matrix decision uses the *static* rank of x - mu, because a
    Python conditional cannot branch on the tensor returned by tf.rank. When
    the static rank is unknown the input is treated as a matrix.

    Constants are cast to tf.float64.
    """
    d = x - mu
    # Static rank: TensorFlow tensors expose get_shape(), numpy inputs do not.
    static_rank = d.get_shape().ndims if hasattr(d, "get_shape") else np.ndim(d)
    is_vector = static_rank == 1
    if is_vector:
        # The triangular solve needs a matrix right-hand side.
        d = tf.expand_dims(d, 1)
    alpha = tf.matrix_triangular_solve(L, d, lower=True)

    num_col = 1 if is_vector else tf.shape(x)[1]

    ret = - 0.5 * tf.cast(tf.size(x), tf.float64) * np.log(2 * np.pi)
    ret += - tf.cast(num_col, tf.float64) * tf.reduce_sum(tf.log(tf.diag_part(L)))
    ret += - 0.5 * tf.reduce_sum(tf.square(alpha))
    return ret
def call(self, inputs):
    """Return the predictive Normal over the mean response at `inputs`.

    With no fitted coefficients (both self.coeffs_mean and
    self.coeffs_precision_tril_op are None) the prior predictive is used;
    otherwise the posterior predictive is built from the stored coefficient
    mean and the triangular factor of the coefficient precision.
    """
    has_no_posterior = (self.coeffs_mean is None and
                        self.coeffs_precision_tril_op is None)
    if has_no_posterior:
        # p(mean(ynew) | xnew) = Normal(ynew | mean = 0, variance = xnew xnew^T)
        prior_variance = tf.reduce_sum(tf.square(inputs), -1)
        return ed.Normal(loc=0., scale=tf.sqrt(prior_variance))

    # p(mean(ynew) | xnew, x, y) = Normal(ynew |
    #   mean = xnew (1/noise_variance) (1/noise_variance x^T x + I)^{-1}x^T y,
    #   variance = xnew (1/noise_variance x^T x + I)^{-1} xnew^T)
    tril_op = self.coeffs_precision_tril_op
    posterior_mean = tf.einsum('nm,m->n', inputs, self.coeffs_mean)
    half_solved = tril_op.solve(inputs, adjoint_arg=True)
    fully_solved = tril_op.solve(half_solved, adjoint=True)
    posterior_covariance = tf.matmul(inputs, fully_solved)
    posterior_variance = tf.diag_part(posterior_covariance)
    return ed.Normal(loc=posterior_mean, scale=tf.sqrt(posterior_variance))
def kl_term(m, S, K_zz, K_zz_inv, u_ovln, L, stabilizer_value):
    """
    KL divergence between N(m, S) and N(u_ovln * 1, K_zz).

    m: mean vector of the first Gaussian; its length sets the dimension.
    S: covariance matrix of the first Gaussian.
    K_zz, K_zz_inv: covariance of the second Gaussian and its inverse.
    u_ovln: scalar mean of the second Gaussian (broadcast with a ones vector).
    L: a matrix whose diagonal is returned for diagnostics only; it does not
        enter the KL value.
    stabilizer_value: multiple of the identity added to K_zz and S before
        their log-determinants are taken.

    Returns a pair: the KL tensor 0.5 * (first + second - third + fourth),
    and the list [S_logdet, diag(L)] for inspection. All constants use the
    module-level DTYPE.

    Log-determinant history: taking log(det(K_zz) / det(S)) directly gave
    NaN for large matrices, and tf.slogdet had no gradient defined, so the
    stabilised tf.linalg.logdet is used instead.
    """
    mean_diff = tf.expand_dims(
        u_ovln * tf.ones([tf.shape(m)[0]], dtype=DTYPE) - m, 1)

    # Trace term tr(K_zz^{-1} S).
    first = tf.trace(tf.matmul(K_zz_inv, S), name='kl_first')

    # Log-determinant ratio log|K_zz| - log|S|.
    with tf.name_scope('log_of_determinant_ratio'):
        posdef_stabilizer = tf.eye(tf.shape(K_zz)[0], dtype=DTYPE) * stabilizer_value
        with tf.name_scope('K_zz_logdet'):
            K_zz_logdet = tf.linalg.logdet(K_zz + posdef_stabilizer)
        with tf.name_scope('S_logdet'):
            S_logdet = tf.linalg.logdet(S + posdef_stabilizer)
        # Diagnostic only: the raw diagonal of L (not a log-determinant).
        alt_logdet_via_L = tf.diag_part(L)
        second = tf.subtract(K_zz_logdet, S_logdet, name='kl_second')

    # Dimension term. A single cast covers every floating DTYPE; the previous
    # float32/float64 branches printed an error for anything else and then
    # failed later on an undefined name.
    third = tf.cast(tf.shape(m)[0], DTYPE, name='kl_third')

    # Mahalanobis term (mean_diff)^T K_zz^{-1} (mean_diff).
    fourth = tf.squeeze(
        tf.matmul(tf.matmul(tf.transpose(mean_diff), K_zz_inv), mean_diff),
        name='kl_fourth')

    return 0.5 * (first + second - third + fourth), [S_logdet, alt_logdet_via_L]
def _compute_prediction_and_loss(self, l, label_inputs, unit_idx):
    """Attach a linear classifier to feature map `l` and build its losses.

    Parameters
    ----------
    l : feature tensor fed to a 1x1 convolution producing
        ``self.num_classes`` channels.
    label_inputs : triple ``(l_label, l_eval_mask, l_dyn_hw)`` of sequences
        indexed by label-image scale.
    unit_idx : index of the current unit; used to pick the label scale.

    Returns
    -------
    (logits, cross_entropy) : the flattened ``[-1, num_classes]`` logits
        (with a ``variables`` attribute copied from the convolution) and the
        class-weighted, mask-normalised cross-entropy.

    Besides the return values, moving summaries are registered for the
    squared error between distributions, the cross-entropy, the summed
    absolute difference and the pixel accuracy.

    NOTE(review): `label` is assumed to be a per-pixel probability
    distribution flattened like `logits` -- confirm against the caller.
    """
    l_label, l_eval_mask, l_dyn_hw = label_inputs

    ## Ground truth
    # compute block idx
    layer_idx = unit_idx
    # first idx that is > layer_idx
    bi = bisect.bisect_right(self.cumsum_blocks, layer_idx)
    # When features are resized to the label, the first label image is used.
    label_img_idx = self.bi_to_scale_idx(
        bi) if not self.do_scale_feat_to_label else 0
    label = l_label[
        label_img_idx]  # note this is a probability of label distri
    eval_mask = l_eval_mask[label_img_idx]
    dyn_hw = l_dyn_hw[label_img_idx]
    n_non_void_samples = tf.reduce_sum(eval_mask)
    # Adds 1 when the mask sum is (numerically) zero, so the divisions
    # below never divide by zero.
    n_non_void_samples += tf.cast(tf.less_equal(n_non_void_samples, 1e-12),
                                  tf.float32)

    ## Compute flattened logits
    # Assume all previous layers have gone through BNReLU, so conv directly
    ch_in = l.get_shape().as_list()[self.ch_dim]  # not used below
    l = Conv2D('linear', l, self.num_classes, 1, use_bias=True)
    logit_vars = l.variables
    if self.data_format == 'channels_first':
        l = tf.transpose(l, [0, 2, 3, 1])
    if self.do_scale_feat_to_label:
        # at this stage, the logits are already channels_last
        l = ResizeImages('resize_logits', l, dyn_hw,
                         data_format='channels_last')
    logits = tf.reshape(l, [-1, self.num_classes], name='logits')
    # Carry the convolution's variables along on the reshaped tensor.
    logits.variables = logit_vars

    ## Square error between distributions.
    # Implement our own here b/c class weighting.
    prob = tf.nn.softmax(logits, name='pred_prob')
    prob_img_shape = tf.stack([-1, dyn_hw[0], dyn_hw[1], self.num_classes])
    prob_img = tf.reshape(prob, prob_img_shape, name='pred_prob_img')
    sqr_err = tf.reduce_sum(\
        tf.multiply(tf.square(label - prob), self.class_weight), \
        axis=1, name='pixel_prob_square_err')
    sqr_err = tf.divide(tf.reduce_sum(sqr_err * eval_mask),
                        n_non_void_samples,
                        name='prob_sqr_err')
    add_moving_summary(sqr_err)

    ## Weighted cross entropy
    # Have to implement our own weighted softmax cross entropy
    # because TF doesn't provide one
    # Because logits and cost are returned in the end of this func,
    # we use _logit to represent the shifted logits.
    # Subtracting the row maximum before exponentiating is the usual
    # log-sum-exp shift; _logits ends up holding log-softmax values.
    max_logits = tf.reduce_max(logits, axis=1, keep_dims=True)
    _logits = logits - max_logits
    normalizers = tf.reduce_sum(tf.exp(_logits), axis=1, keep_dims=True)
    _logits = _logits - tf.log(normalizers)
    cross_entropy = -tf.reduce_sum(\
        tf.multiply(label * _logits, self.class_weight), axis=1)
    cross_entropy = cross_entropy * eval_mask
    cross_entropy = tf.divide(tf.reduce_sum(cross_entropy),
                              n_non_void_samples,
                              name='cross_entropy_loss')
    add_moving_summary(cross_entropy)

    ## Unweighted total abs diff
    sum_abs_diff = sum_absolute_difference(prob, label)
    sum_abs_diff *= eval_mask
    sum_abs_diff = tf.divide(tf.reduce_sum(sum_abs_diff),
                             n_non_void_samples,
                             name='sum_abs_diff')
    add_moving_summary(sum_abs_diff)

    ## confusion matrix for iou and pixel level accuracy
    int_pred = tf.argmax(logits, 1, name='int_pred')
    int_label = tf.argmax(label, 1, name='int_label')
    cm = tf.confusion_matrix(labels=int_label, predictions=int_pred,\
        num_classes=self.num_classes, name='confusion_matrix', weights=eval_mask)

    ## pixel level accuracy
    # Diagonal of the (mask-weighted) confusion matrix = correct pixels.
    accu = tf.divide(tf.cast(tf.reduce_sum(tf.diag_part(cm)), dtype=tf.float32), \
                     n_non_void_samples, name='accuracy')
    add_moving_summary(accu)

    return logits, cross_entropy
def train(self):
    """
    Build the current model and train it.

    The graph consists of an encoder producing ``z_mu``, additive unit
    Gaussian noise, and a decoder producing ``y_hat`` and ``y_ls2``. The
    loss is ``mean(lambda * reconstruction) + latent``. Training runs for a
    fixed 70000 iterations; every 500th iteration (except the first) either
    records statistics (when the latent loss exceeds 0.1) or collects a
    sample used for the empirical entropy estimate, then multiplies lambda
    by 1.06. Collected results are written with ``IOTools.save_to_file`` to
    ``self.dump_path``.

    NOTE(review): the nesting of the logging branch was reconstructed from
    a whitespace-collapsed source. The ``else`` is attached to the
    latent-loss test and the lambda update to the every-500 branch, because
    any other placement would grow lambda on every iteration. Please
    confirm against the original file.
    """
    self.logger.info("train model")
    tf.reset_default_graph()

    # define placeholder
    x = tf.placeholder('float32', [None, 10])
    y = tf.placeholder('float32', [None, 10])
    lambda_val = tf.placeholder('float32', [1, 1])

    # build encoder
    z_mu = self.__build_encoder(x, self.hidden_dim)

    # parametrize sparsity layer
    # a_dp_a is the per-dimension mean of z_mu**2 over the batch.
    ada = tf.matmul(tf.transpose(z_mu), z_mu) * (1.0 / self.batch_size)
    a_dp_a = tf.diag_part(ada)
    z_ls2 = tf.log(a_dp_a + 1)

    # calc z
    eps = tf.random_normal((self.batch_size, self.hidden_dim), 0, 1,
                           dtype=tf.float32)  # Adding a random number
    z = tf.add(z_mu, eps)

    # build decoder
    y_hat, y_ls2 = self.__build_decoder(z, 10)

    # define loss
    reconstr_loss = lambda_val * tf.reduce_sum(
        0.5 * y_ls2 + (tf.square(y - y_hat) / (2.0 * tf.exp(y_ls2))), 1)
    latent_loss = 0.5 * tf.reduce_sum(z_ls2)
    total_loss = tf.reduce_mean(reconstr_loss) + latent_loss

    # define optimizer
    optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                       epsilon=1e-8).minimize(total_loss)

    # run training
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        # init data iterator
        number_of_iterations = 70000
        itx = list()          # recorded latent losses, used as I(x,t)
        ity = list()          # recorded I(t,y) estimates
        h_y = list()          # samples for the empirical entropy of Y
        nzn = list()          # number of latent dimensions above threshold
        lambda_list = list()
        latent_list = list()
        lambda_value = 0.4

        # run training procedure
        for epoch in tqdm(range(number_of_iterations)):
            # sample new batch
            x_batch, y_batch, y_orig_batch = ArtificialDataIterator.next_batch(
                self.batch_size, self.doTransform)

            # run training
            _, loss, ll, rl, sparse_matrix, y_mu, zmu = session.run(
                (optimizer, total_loss, latent_loss, reconstr_loss, a_dp_a,
                 y_hat, z_mu),
                feed_dict={
                    x: x_batch,
                    y: y_batch,
                    lambda_val: np.asarray([[lambda_value]])
                })

            if (epoch % 500 == 0 and epoch > 0):
                # if latent loss higher 0.1
                if (np.mean(ll) > 1e-1):
                    # save MI(x,z)
                    itx.append(np.mean(ll))
                    lambda_list.append(lambda_value)
                    latent_list.append(zmu)

                    # calc empirical Y entropy
                    entropy = np.mean(np.absolute(np.asarray(h_y)))
                    print("Cost: %.2f, I(x,t): %.4f, I(t,y): %4f" %
                          (loss, np.mean(ll),
                           -(np.mean(rl) / lambda_value) - entropy))

                    # save MI (z,y)
                    ity.append(-(np.mean(rl) / lambda_value) - entropy)

                    # save size of used latent dimensions
                    num_latent_dim = len([
                        i for i, v in enumerate(sparse_matrix) if v > 0.25
                    ])
                    nzn.append(num_latent_dim)

                    # Re-bin everything recorded so far by I(x,t).
                    mi_x_t = np.asarray(itx)
                    mi_t_y = np.asarray(ity)
                    nzn_array = np.asarray(nzn)
                    nbins = int(min(12, max(1, np.floor(len(mi_x_t) / 3))))
                    breaks = np.linspace(0.99 * min(mi_x_t), max(mi_x_t),
                                         nbins + 1)
                    xl = list()
                    yl = list()
                    yl_means = list()
                    nzn_list = list()
                    kc = 0
                    for k in range(nbins):
                        matchings_indices = [
                            i for i, item in enumerate(mi_x_t)
                            if item > breaks[k] and item < breaks[k + 1]
                        ]
                        # if more than 3 MI -> create new bin
                        if len(matchings_indices) > 3:
                            xl.append(np.mean(mi_x_t[matchings_indices]))
                            yl.append(mi_t_y[matchings_indices])
                            yl_means.append(
                                np.median(mi_t_y[matchings_indices]))
                            nzn_list.append(
                                np.min(nzn_array[matchings_indices]))
                            kc += 1
                else:
                    # collect mutual information in order to calculate the
                    # empirical entropy of Y
                    h_y.append(-(np.mean(rl) / lambda_value))

                # increase compression parameter lambda
                lambda_value = lambda_value * 1.06

        # NOTE(review): yl_means, yl, xl and nzn_list only exist once the
        # latent-loss branch above has run at least once.
        IOTools.save_to_file(
            (yl_means, yl, xl, sparse_matrix, nzn_list, nzn, ity, itx,
             y_orig_batch, zmu, Transformation.UniformToOrig(
                 y_orig_batch, y_mu), lambda_list, latent_list),
            self.dump_path)
def __init__(self, n_input, kernel_size, n_hidden, reg_constant1 = 1.0, re_constant2 = 1.0, batch_size = 100, reg = None, \
             denoise = False, model_path = None, restore_path = None, \
             logs_path = './logs', num_modalities=2):
    """Build the multi-modality self-expressive autoencoder graph and session.

    One image placeholder is created per modality (keyed ``'0'``, ``'1'``,
    ...). The encoder output of each modality is flattened to
    ``[batch_size, -1]`` and left-multiplied by a shared coefficient matrix
    whose diagonal has been zeroed. The product is decoded and three loss
    terms are summed into ``self.loss``: reconstruction, coefficient
    regulariser and self-expression. The constructor also creates the Adam
    train op, an ``InteractiveSession``, a ``Saver`` and a summary
    ``FileWriter``, and runs variable initialisation.

    Args:
        n_input: indexable; ``n_input[0]`` and ``n_input[1]`` give the two
            spatial sizes of each input placeholder.
        kernel_size: stored on ``self``; not read elsewhere in this method.
        n_hidden: stored on ``self``; not read elsewhere in this method.
        reg_constant1: weight of the coefficient regulariser in the loss.
        re_constant2: weight of the self-expression term in the loss.
        batch_size: number of rows each latent code is reshaped to.
        reg: stored on ``self``; not read elsewhere in this method.
        denoise: only ``False`` is handled here (see note below).
        model_path: stored on ``self``.
        restore_path: stored on ``self``.
        logs_path: directory passed to ``tf.summary.FileWriter``.
        num_modalities: number of input modalities.
    """
    self.n_input = n_input
    self.kernel_size = kernel_size
    self.n_hidden = n_hidden
    self.batch_size = batch_size
    self.reg = reg
    self.model_path = model_path
    self.restore_path = restore_path
    self.iter = 0
    self.num_modalities =num_modalities
    weights = self._initialize_weights()
    self.x={}

    #input required to be fed
    for i in range(0, self.num_modalities):
        modality = str(i)
        self.x[modality] = tf.placeholder(tf.float32, [None, self.n_input[0], self.n_input[1], 1])
    self.learning_rate = tf.placeholder(tf.float32, [], name='learningRate')

    # NOTE(review): there is no branch for denoise=True, so `latents` and
    # `shape` are unbound in that case and the code below raises NameError.
    # Nesting reconstructed from a whitespace-collapsed source; confirm.
    if denoise == False:
        x_input = self.x
        latents, shape = self.encoder(x_input,weights,self.num_modalities)

    # Zero the diagonal of the coefficient matrix.
    Coef = weights['Coef']
    Coef = Coef - tf.diag(tf.diag_part(Coef))
    self.Coef = Coef

    z={}          # flattened latent code per modality
    z_c={}        # Coef @ z per modality
    latent_c={}   # z_c reshaped back to the encoder output shape
    for i in range(0, self.num_modalities):
        modality = str(i)
        z[modality] = tf.reshape(latents[modality], [batch_size, -1])
        z_c[modality] = tf.matmul(Coef,z[modality])
        latent_c[modality] = tf.reshape(z_c[modality], tf.shape(latents[modality]))
    self.z = z
    self.z_c =z_c
    self.x_r = self.decoder(latent_c, weights, self.num_modalities, shape)

    # l_2 reconstruction loss
    # Modality '0' is weighted 0.6, every other modality 0.1.
    self.reconst_cost_x = 0.6*tf.reduce_sum(tf.pow(tf.subtract(self.x['0'], self.x_r['0']), 2.0))
    for i in range(1, self.num_modalities):
        modality = str(i)
        self.reconst_cost_x = self.reconst_cost_x + 0.1*tf.reduce_sum(tf.pow(tf.subtract(self.x[modality], self.x_r[modality]), 2.0))
    tf.summary.scalar("recons_loss", self.reconst_cost_x)

    # Squared Frobenius norm of the zero-diagonal coefficient matrix.
    self.reg_losses = tf.reduce_sum(tf.pow(self.Coef,2.0))
    tf.summary.scalar("reg_loss", reg_constant1 * self.reg_losses )

    # Self-expression loss: modality '0' weighted 0.3, the others 0.05.
    self.selfexpress_losses = 0.3*tf.reduce_sum(tf.pow(tf.subtract(self.z['0'], self.z_c['0']), 2.0))
    for i in range(1, self.num_modalities):
        modality = str(i)
        self.selfexpress_losses = self.selfexpress_losses + 0.05*tf.reduce_sum(tf.pow(tf.subtract(self.z[modality], self.z_c[modality]), 2.0))
    tf.summary.scalar("selfexpress_loss", re_constant2 * self.selfexpress_losses )

    self.loss = self.reconst_cost_x + reg_constant1 * self.reg_losses + re_constant2 * self.selfexpress_losses
    self.merged_summary_op = tf.summary.merge_all()
    self.optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate).minimize(self.loss) #GradientDescentOptimizer #AdamOptimizer
    self.init = tf.global_variables_initializer()

    tfconfig = tf.ConfigProto(allow_soft_placement=True)
    tfconfig.gpu_options.allow_growth = True
    self.sess = tf.InteractiveSession(config=tfconfig)
    self.sess.run(self.init)
    # Variables whose name starts with "Coef" are excluded from the saver.
    self.saver = tf.train.Saver([v for v in tf.trainable_variables() if not (v.name.startswith("Coef"))])
    self.summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
def __init__(self, is_training, word_embeddings, simple_position=False):
    """Build the piecewise-CNN encoder, the training loss and test outputs.

    The sentence embedding is convolved with a width-3 kernel, max-pooled
    separately over the three sentence pieces selected by ``self.mask``,
    passed through ReLU and fed to the attention layer (``self.katt`` or
    ``self.att`` depending on ``FLAGS.katt_flag``). When ``is_training`` is
    false, per-bag test scores are also built and stored in
    ``self.test_output``.

    Args:
        is_training: whether the graph is built for training; the test
            sub-graph is only created when this is false.
        word_embeddings: forwarded to ``NN.__init__``.
        simple_position: forwarded to ``NN.__init__``.
    """
    NN.__init__(self, is_training, word_embeddings, simple_position)

    with tf.name_scope("conv-maxpool"):
        # Row k is the piece indicator for mask id k; id 0 maps to all zeros.
        mask_embedding = tf.constant(
            [[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
        pcnn_mask = tf.nn.embedding_lookup(mask_embedding, self.mask)
        input_sentence = tf.expand_dims(self.input_embedding, axis=1)
        x = tf.layers.conv2d(
            inputs=input_sentence,
            filters=FLAGS.hidden_size,
            kernel_size=[1, 3],
            strides=[1, 1],
            padding='same',
            kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d())
        x = tf.reshape(x, [-1, self.max_length, FLAGS.hidden_size, 1])
        # Broadcast the piece mask against the features and take the max over
        # the sequence axis: one max per (filter, piece).
        x = tf.reduce_max(
            tf.reshape(pcnn_mask, [-1, 1, self.max_length, 3]) *
            tf.transpose(x, [0, 2, 1, 3]),
            axis=2)
        x = tf.nn.relu(tf.reshape(x, [-1, self.output_size]))

    if FLAGS.katt_flag != 0:
        stack_repre = self.katt(x, is_training)
    else:
        stack_repre = self.att(x, is_training)

    with tf.name_scope("loss"):
        logits = tf.matmul(stack_repre, tf.transpose(
            self.relation_matrix)) + self.bias
        # The unweighted cross-entropy that used to be assigned to self.loss
        # here was overwritten on the next statement; only the weighted loss
        # is built now.
        self.loss = tf.losses.softmax_cross_entropy(
            onehot_labels=self.label, logits=logits, weights=self.weights)
        self.output = tf.nn.softmax(logits)
        tf.summary.scalar('loss', self.loss)
        self.predictions = tf.argmax(logits, 1, name="predictions")
        self.correct_predictions = tf.equal(self.predictions,
                                            tf.argmax(self.label, 1))
        self.accuracy = tf.reduce_mean(
            tf.cast(self.correct_predictions, "float"), name="accuracy")

    if not is_training:
        with tf.name_scope("test"):
            if FLAGS.katt_flag != 0:
                test_attention_logit = self.katt_test(x)
            else:
                test_attention_logit = self.att_test(x)

            test_tower_output = []
            for i in range(FLAGS.test_batch_size):
                # Rows scope[i]:scope[i + 1] belong to bag i.
                bag = slice(self.scope[i], self.scope[i + 1])
                test_attention_score = tf.nn.softmax(
                    tf.transpose(test_attention_logit[bag, :]))
                final_repre = tf.matmul(test_attention_score, x[bag])
                # Use the instance attributes, as the training logits above
                # do; the bare names `relation_matrix` / `bias` are not
                # defined in this scope.
                test_logits = tf.matmul(
                    final_repre,
                    tf.transpose(self.relation_matrix)) + self.bias
                # Entry (r, r): probability of relation r under the
                # representation attended for relation r.
                output = tf.diag_part(tf.nn.softmax(test_logits))
                test_tower_output.append(output)

            test_stack_output = tf.reshape(
                tf.stack(test_tower_output),
                [FLAGS.test_batch_size, self.num_classes])
            self.test_output = test_stack_output
def train_main(hparams):
    """
    Main training routine for the dot semantic network bot.

    Creates the experiment log, loads the dataset, builds a dot-product
    scoring graph over summed word embeddings, and trains it for
    ``hparams.nb_epochs`` epochs. Validation statistics are computed and a
    checkpoint is saved after every epoch.

    NOTE(review): loop nesting was reconstructed from a whitespace-collapsed
    source; confirm that ``exp.add_metric_row`` belongs inside the periodic
    evaluation branch.

    :param hparams: hyper-parameter namespace; the attributes read here are
        exp_name, debug, exp_desc, test_tube_dir, vocab_path,
        dataset_train_path, dataset_test_path, dataset_val_path,
        max_seq_len, batch_size, embedding_dim, eval_every_n_batches and
        nb_epochs.
    :return: None
    """
    # -----------------------
    # INIT EXPERIMENT
    # ----------------------
    exp = Experiment(name=hparams.exp_name,
                     debug=hparams.debug,
                     description=hparams.exp_desc,
                     autosave=False,
                     save_dir=hparams.test_tube_dir)
    exp.add_argparse_meta(hparams)
    exp.save()

    # -----------------------
    # LOAD DATASET
    # ----------------------
    udc_dataset = UDCDataset(vocab_path=hparams.vocab_path,
                             train_path=hparams.dataset_train_path,
                             test_path=hparams.dataset_test_path,
                             val_path=hparams.dataset_val_path,
                             max_seq_len=hparams.max_seq_len)

    # -----------------------
    # INIT TF VARS
    # ----------------------
    # input_x holds chat history
    # input_y holds our responses
    # (there is no separate labels placeholder: the loss below treats the
    # diagonal of the score matrix as the positive pairs)
    input_x = tf.placeholder(
        dtype=tf.int32, shape=[hparams.batch_size, None], name='input_x')
    input_y = tf.placeholder(
        dtype=tf.int32, shape=[hparams.batch_size, None], name='input_y')

    # ----------------------
    # EMBEDDING LAYER
    # ----------------------
    # you can preload your own or learn in the network
    # in this case we'll just learn it in the network
    embedding = tf.get_variable('embedding',
                                [udc_dataset.vocab_size, hparams.embedding_dim])

    # ----------------------
    # RESOLVE EMBEDDINGS
    # ----------------------
    # Lookup the embeddings.
    embedding_x = tf.nn.embedding_lookup(embedding, input_x)
    embedding_y = tf.nn.embedding_lookup(embedding, input_y)

    # Generates 1 vector per training example.
    x = tf.reduce_sum(embedding_x, axis=1)
    y = tf.reduce_sum(embedding_y, axis=1)

    # ----------------------
    # OPTIMIZATION PROBLEM
    # ----------------------
    S = dot_product_scoring(x, y, is_training=True)
    # Row-wise log-sum-exp of the scores; diag(S) - K is the log-softmax of
    # the matching pair within its row.
    K = tf.reduce_logsumexp(S, axis=1)
    loss = -tf.reduce_mean(tf.diag_part(S) - K)

    # allow optimizer to be changed through hyper params
    optimizer = get_optimizer(hparams=hparams, minimize=loss)

    # ----------------------
    # TF ADMIN (VAR INIT, SESS)
    # ----------------------
    sess = tf.Session()
    init_vars = tf.global_variables_initializer()
    sess.run(init_vars)

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # ----------------------
    # TRAINING ROUTINE
    # ----------------------
    # admin vars
    nb_batches_served = 0
    eval_every_n_batches = hparams.eval_every_n_batches
    # Placeholder metric values shown until the first evaluation runs.
    train_err = 1000
    prec_at_1 = 0
    prec_at_2 = 0

    # iter for the needed epochs
    print('\n\n', '-'*100,'\n  {} TRAINING\n'.format(hparams.exp_name.upper()), '-'*100, '\n\n')
    for epoch in range(hparams.nb_epochs):
        print('training epoch:', epoch + 1)
        progbar = Progbar(target=udc_dataset.nb_tng, width=50)
        train_gen = udc_dataset.train_generator(batch_size=hparams.batch_size,
                                                max_epochs=1)

        # mini batches
        for batch_context, batch_utterance in train_gen:
            feed_dict = {
                input_x: batch_context,
                input_y: batch_utterance
            }

            # OPT: run one step of optimization
            optimizer.run(session=sess, feed_dict=feed_dict)

            # update loss metrics
            if nb_batches_served % eval_every_n_batches == 0:
                # calculate the error on the current training batch
                train_err = loss.eval(session=sess, feed_dict=feed_dict)
                prec_at_1 = test_precision_at_k(S, feed_dict, k=1, sess=sess)
                prec_at_2 = test_precision_at_k(S, feed_dict, k=2, sess=sess)

                # log the metrics to the experiment
                exp.add_metric_row({'tng loss': train_err,
                                    'P@1': prec_at_1,
                                    'P@2': prec_at_2})

            nb_batches_served += 1
            progbar.add(n=len(batch_context),
                        values=[('train_err', train_err),
                                ('P@1', prec_at_1),
                                ('P@2', prec_at_2)])

        # ----------------------
        # END OF EPOCH PROCESSING
        # ----------------------
        # calculate the val loss
        print('\nepoch complete...\n')
        check_val_stats(loss, S, udc_dataset, hparams, input_x, input_y, exp,
                        sess, epoch)

        # save model
        save_model(saver=saver, hparams=hparams, sess=sess, epoch=epoch)

        # save exp data
        exp.save()

    tf.reset_default_graph()
def build_model(self):
    """Defines the GP model.

    The loss is computed for partial feedback settings (bandits), so only
    the observed outcome is backpropagated (see weighted loss).
    Selects the optimizer and, finally, it also initializes the graph.

    State created here: non-trainable variables caching the training
    inputs, labels and weights, output normalisation statistics, the GP
    hyper-parameters, and latent task vectors. The training loss is the
    negative log marginal likelihood minus the log-priors on the
    hyper-parameters.

    NOTE(review): the extent of each ``tf.control_dependencies`` block was
    reconstructed from a whitespace-collapsed source; confirm against the
    original file.
    """
    logging.info("Initializing model %s.", self.name)
    self.global_step = tf.train.get_or_create_global_step()

    # Define state for the model (inputs, etc.)
    # validate_shape=False lets the assign ops below store tensors whose
    # shape differs from the initializer's.
    self.x_train = tf.get_variable(
        "training_data",
        initializer=tf.ones([self.hparams.batch_size, self.n_in],
                            dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.y_train = tf.get_variable("training_labels",
                                   initializer=tf.zeros(
                                       [self.hparams.batch_size, 1],
                                       dtype=tf.float64),
                                   validate_shape=False,
                                   trainable=False)
    self.weights_train = tf.get_variable(
        "weights_train",
        initializer=tf.ones([self.hparams.batch_size, self.n_out],
                            dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.input_op = tf.assign(self.x_train, self.x_in, validate_shape=False)
    self.input_w_op = tf.assign(self.weights_train,
                                self.weights,
                                validate_shape=False)
    self.input_std = tf.get_variable("data_standard_deviation",
                                     initializer=tf.ones([1, self.n_out],
                                                         dtype=tf.float64),
                                     dtype=tf.float64,
                                     trainable=False)
    # Unlike input_std, the mean is trainable and is optimised below.
    self.input_mean = tf.get_variable("data_mean",
                                      initializer=tf.zeros(
                                          [1, self.n_out], dtype=tf.float64),
                                      dtype=tf.float64,
                                      trainable=True)

    # GP Hyperparameters
    self.noise = tf.get_variable("noise",
                                 initializer=tf.cast(0.0, dtype=tf.float64))
    self.amplitude = tf.get_variable("amplitude",
                                     initializer=tf.cast(1.0,
                                                         dtype=tf.float64))
    self.amplitude_linear = tf.get_variable("linear_amplitude",
                                            initializer=tf.cast(
                                                1.0, dtype=tf.float64))
    self.length_scales = tf.get_variable("length_scales",
                                         initializer=tf.zeros(
                                             [1, self.n_in],
                                             dtype=tf.float64))
    self.length_scales_lin = tf.get_variable("length_scales_linear",
                                             initializer=tf.zeros(
                                                 [1, self.n_in],
                                                 dtype=tf.float64))

    # Latent embeddings of the different outputs for task covariance
    self.task_vectors = tf.get_variable(
        "latent_task_vectors",
        initializer=tf.random_normal([self.n_out, self.task_latent_dim],
                                     dtype=tf.float64))

    # Normalize outputs across each dimension
    # Since we have different numbers of observations across each task, we
    # normalize by their respective counts.
    index_counts = self.atleast_2d(tf.reduce_sum(self.weights, axis=0),
                                   self.n_out)
    # Replace zero counts by one so the divisions below are defined.
    index_counts = tf.where(
        index_counts > 0, index_counts,
        tf.ones(tf.shape(index_counts), dtype=tf.float64))
    self.var_op = tf.assign(
        self.input_std,
        tf.sqrt(1e-4 + tf.reduce_sum(tf.square(
            self.y - tf.reduce_sum(self.y, axis=0) / index_counts),
                                     axis=0) / index_counts))

    with tf.control_dependencies([self.var_op]):
        y_normed = self.atleast_2d(
            (self.y - self.input_mean) / self.input_std, self.n_out)
        # Keep only the entries with a positive weight, as a single column.
        y_normed = self.atleast_2d(
            tf.boolean_mask(y_normed, self.weights > 0), 1)
    self.out_op = tf.assign(self.y_train, y_normed, validate_shape=False)

    # Observation noise
    # softplus keeps it positive; 1e-6 bounds it away from zero.
    self.alpha = tf.nn.softplus(self.noise) + 1e-6

    # Covariance
    with tf.control_dependencies(
        [self.input_op, self.input_w_op, self.out_op]):
        self.self_cov = (
            self.cov(self.x_in, self.x_in) *
            self.task_cov(self.weights, self.weights) +
            tf.eye(tf.shape(self.x_in)[0], dtype=tf.float64) * self.alpha)

    self.chol = tf.cholesky(self.self_cov)
    # Inverse covariance, obtained by solving against the identity.
    self.kinv = tf.cholesky_solve(
        self.chol, tf.eye(tf.shape(self.x_in)[0], dtype=tf.float64))

    self.input_inv = tf.Variable(tf.eye(self.hparams.batch_size,
                                        dtype=tf.float64),
                                 validate_shape=False,
                                 trainable=False)
    self.input_cov_op = tf.assign(self.input_inv,
                                  self.kinv,
                                  validate_shape=False)

    # Log determinant from the diagonal of the Cholesky factor self.chol
    # (1e-16 guards the log against a zero entry).
    with tf.control_dependencies([self.input_cov_op]):
        logdet = 2.0 * tf.reduce_sum(
            tf.log(tf.diag_part(self.chol) + 1e-16))

    # Log Marginal likelihood
    # The leading minus makes self.marginal_ll the *negative* log marginal
    # likelihood, i.e. a quantity to minimise.
    self.marginal_ll = -tf.reduce_sum(
        -0.5 * tf.matmul(tf.transpose(y_normed),
                         tf.matmul(self.kinv, y_normed)) - 0.5 * logdet -
        0.5 * self.n * np.log(2 * np.pi))

    zero = tf.cast(0., dtype=tf.float64)
    one = tf.cast(1., dtype=tf.float64)
    standard_normal = tfd.Normal(loc=zero, scale=one)

    # Loss is marginal likelihood and priors
    self.loss = tf.reduce_sum(self.marginal_ll - (
        standard_normal.log_prob(self.amplitude) +
        standard_normal.log_prob(tf.exp(self.noise)) +
        standard_normal.log_prob(self.amplitude_linear) +
        tfd.Normal(loc=zero, scale=one * 10.).log_prob(self.task_vectors)))

    # Optimizer for hyperparameters
    optimizer = tf.train.AdamOptimizer(learning_rate=self.hparams.lr)
    vars_to_optimize = [
        self.amplitude, self.length_scales, self.length_scales_lin,
        self.amplitude_linear, self.noise, self.input_mean
    ]
    if self.learn_embeddings:
        vars_to_optimize.append(self.task_vectors)
    grads = optimizer.compute_gradients(self.loss, vars_to_optimize)
    self.train_op = optimizer.apply_gradients(grads,
                                              global_step=self.global_step)

    # Predictions for test data
    self.y_mean, self.y_pred = self.posterior_mean_and_sample(self.x)

    # create tensorboard metrics
    self.create_summaries()
    self.summary_writer = tf.summary.FileWriter(
        "{}/graph_{}".format(FLAGS.logdir, self.name), self.sess.graph)
    self.check = tf.add_check_numerics_ops()
def DoOneRun(self,
             run_id,
             rf_number,
             nn_replication,
             prefix='',
             seed=0,
             batch_count=1):
    """Compare NN and random-feature approximations of the exact kernel.

    For ``batch_count`` batches, the exact Gram matrix of the batch is
    evaluated together with two approximations: the normalised Gram matrix
    of the neural-net representation and the Gram matrix of the random
    feature vectors. Each approximation is accumulated against the exact
    matrix in its own ``Stat``.

    Args:
        run_id: run index; part of the feature file name and of the seed.
        rf_number: number of random features; written to ``self.config``.
        nn_replication: either an explicit list/tuple of replications, or a
            scalar multiplier applied to ``self.original_replication``.
        prefix: string inserted into the random-feature file name.
        seed: extra offset added to the random-feature seed.
        batch_count: number of batches to evaluate.

    Returns:
        ``(NN_K_stat, RF_K_stat)``.
    """
    batch_size = self.config.batch_size
    self.config.rf_number = rf_number
    self.config.rf_file_name = ('features_' + prefix + '_' + str(rf_number) +
                                '_' + str(run_id) + '.pkl')
    srf = rf.GenerateOrLoadRF(self.config, seed=run_id + 2718281828 + seed)

    if isinstance(nn_replication, (list, tuple)):
        self.skeleton.SetReplication(nn_replication)
    else:
        self.skeleton.SetReplication(
            [int(x * nn_replication) for x in self.original_replication])

    with tf.Graph().as_default(), tf.Session('') as sess:
        examples = self.get_inputs(batch_size)

        # Calculate the exact gram matrix for the batch
        gram = tf.reshape(kf.Kernel(self.skeleton, examples, examples),
                          [batch_size, batch_size])

        # Calculate the approximate gram matrix using a neural net
        rep, _ = NN.NeuralNet(self.skeleton, self.config, examples)
        srep = tf.squeeze(rep)
        approx_gram = tf.matmul(srep, tf.transpose(srep))

        # Normalize the approximate gram matrix so that the norm of
        # each element is 1.
        norms = tf.reshape(tf.sqrt(tf.diag_part(approx_gram)), [-1, 1])
        nn_gram = tf.div(approx_gram, tf.matmul(norms, tf.transpose(norms)))

        # Compute the approximate gram matrix using random features
        parameters = tf.constant(
            np.zeros((rf_number,
                      self.config.number_of_classes)).astype(np.float32))
        rand_features = tf.SparseTensor(srf.features[0], srf.features[1],
                                        srf.features[2])
        _, rf_vectors = rf.RandomFeaturesGraph(
            self.skeleton, self.config.number_of_classes, examples,
            rf_number, rand_features, parameters, srf.weights)
        rf_gram = tf.matmul(rf_vectors, rf_vectors, transpose_b=True)

        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess, coord)

        RF_K_stat = Stat()
        NN_K_stat = Stat()
        try:
            # `range` instead of the Python-2-only `xrange`; the index is
            # not used.
            for _ in range(batch_count):
                # All three are fetched in one run so they are evaluated on
                # the same batch.
                gram_np, nn_gram_np, rf_gram_np = sess.run(
                    [gram, nn_gram, rf_gram])
                RF_K_stat.AddToStat(gram_np, rf_gram_np)
                NN_K_stat.AddToStat(gram_np, nn_gram_np)
        finally:
            # Always stop the queue-runner threads, even if a batch fails.
            coord.request_stop()
            coord.join(threads)

    return NN_K_stat, RF_K_stat
def init_issue(self, Xtrain, Ytrain, Xtest=None, Ytest=None):
    """Build the graph: metric, pairwise distances, soft matching and loss.

    A symmetric matrix ``m_`` is built from two variables (a softmax
    feature weighting and a bounded symmetric mixing matrix with unit
    diagonal). It is used as a quadratic form to compute distances between
    training rows and either the test rows (when both ``Xtest`` and
    ``Ytest`` are given) or the training rows themselves. The distances are
    turned into column-wise softmax weights over the training rows, and the
    prediction is the weighted sum of training targets. All tensors are
    stored on ``self.ph_tf``; bookkeeping flags are stored on ``self.ph``.

    Args:
        Xtrain: 2-D array; ``Xtrain.shape`` is unpacked into (rows, dim).
        Ytrain: training targets, used as a constant in full-batch mode.
        Xtest: optional test inputs.
        Ytest: optional test targets.
    """
    size_temp, dim = Xtrain.shape
    flag_test_exists = False
    size_test = None
    self.ph.real_size = size_temp
    if (Xtest is not None) and (Ytest is not None):
        flag_test_exists = True
        size_test = Xtest.shape[0]
        self.ph.use_test = True
        self.ph.X_test_comp = Xtest
        self.ph.Y_test_comp = Ytest
        self.ph.real_size = size_test
        self.ph_tf.SubXTest = tf.constant(Xtest, dtype=tf.float64)
        self.ph_tf.SubYTest = tf.constant(Ytest, dtype=tf.float64)

    # Mini-batch mode feeds training rows through placeholders; otherwise
    # the whole training set is baked into the graph as constants.
    if self.batch_size < size_temp:
        self.ph_tf.SubXTrain = tf.placeholder(dtype=tf.float64,
                                              shape=(self.batch_size, dim))
        self.ph_tf.SubYTrain = tf.placeholder(dtype=tf.float64,
                                              shape=(self.batch_size, ))
    else:
        self.ph.full_batch = True
        self.batch_size = size_temp
        self.ph_tf.SubXTrain = tf.constant(Xtrain, dtype=tf.float64)
        self.ph_tf.SubYTrain = tf.constant(Ytrain, dtype=tf.float64)

    # Feature weights: softmax of a free vector (restored from r_ww if set).
    if self.ph.r_ww is None:
        ww_init = np.zeros(dim)
    else:
        ww_init = self.ph.r_ww
    self.ph_tf.ww_ = tf.Variable(ww_init, dtype=tf.float64)
    self.ph_tf.w_ = tf.nn.softmax(self.ph_tf.ww_)
    self.ph_tf.ws_ = tf.reshape(tf.sqrt(self.ph_tf.w_), (-1, 1))
    # Outer product sqrt(w) sqrt(w)^T.
    self.ph_tf.WS_ = tf.matmul(self.ph_tf.ws_,
                               self.ph_tf.ws_,
                               transpose_b=True)

    # Mixing matrix: entries squashed into (-1, 1), symmetrised, and the
    # diagonal replaced by ones.
    if self.ph.r_WMMMM is None:
        WMMMM_init = np.zeros([dim, dim])
    else:
        WMMMM_init = self.ph.r_WMMMM
    self.ph_tf.WMMMM_ = tf.Variable(WMMMM_init, dtype=tf.float64)
    self.ph_tf.WMMM_ = (tf.sigmoid(self.ph_tf.WMMMM_) - 0.5) * 2
    self.ph_tf.WMM_ = (self.ph_tf.WMMM_ +
                       tf.transpose(self.ph_tf.WMMM_)) / 2
    self.ph_tf.WM_ = self.ph_tf.WMM_ - tf.diag(
        tf.diag_part(self.ph_tf.WMM_)) + tf.diag(np.ones(dim))
    # self.pp_ = tf.Variable(1, dtype=tf.float64)
    # self.p_ = tf.sigmoid(self.pp_) + 1
    # self.p_ = tf.pow(self.pp_, 2) + 0.01
    self.ph_tf.m_ = self.ph_tf.WS_ * self.ph_tf.WM_
    # self.m_ = tf.diag(tf.nn.softmax(tf.diag_part((self.mm_ + tf.transpose(self.mm_)) / 2)))

    # Pairwise quadratic-form distance: a^T m a + b^T m b - 2 a^T m b.
    if flag_test_exists:
        self.ph_tf.Ad = tf.reduce_sum(
            tf.matmul(self.ph_tf.SubXTrain, self.ph_tf.m_) *
            self.ph_tf.SubXTrain,
            axis=1)
        self.ph_tf.Bd = tf.reduce_sum(
            tf.matmul(self.ph_tf.SubXTest, self.ph_tf.m_) *
            self.ph_tf.SubXTest,
            axis=1)
        self.ph_tf.AD = tf.tile(tf.reshape(self.ph_tf.Ad, (-1, 1)),
                                [1, size_test])
        self.ph_tf.BD = tf.tile(tf.reshape(self.ph_tf.Bd, (1, -1)),
                                [self.batch_size, 1])
        self.ph_tf.AM = tf.matmul(tf.matmul(self.ph_tf.SubXTrain,
                                            self.ph_tf.m_),
                                  self.ph_tf.SubXTest,
                                  transpose_b=True)
        self.ph_tf.DistP = self.ph_tf.AD + self.ph_tf.BD - 2 * self.ph_tf.AM
    else:
        self.ph_tf.AM = tf.matmul(tf.matmul(self.ph_tf.SubXTrain,
                                            self.ph_tf.m_),
                                  self.ph_tf.SubXTrain,
                                  transpose_b=True)
        self.ph_tf.Ad = tf.diag_part(self.ph_tf.AM)
        self.ph_tf.AD = tf.tile(tf.reshape(self.ph_tf.Ad, (1, -1)),
                                [self.batch_size, 1])
        self.ph_tf.DistP = self.ph_tf.AD + tf.transpose(
            self.ph_tf.AD) - 2 * self.ph_tf.AM
    self.ph_tf.Dist = tf.cast(self.ph_tf.DistP, tf.float64)
    # self.Dist = tf.pow(self.DistP, self.p_)
    # self.Dist = self.AD + tf.transpose(self.AD) - 2 * self.AM

    # Learnable distance scale (restored from r_KN if set).
    if self.ph.r_KN is None:
        init_kn = 1
    else:
        init_kn = self.ph.r_KN
    # self.KN_base = tf.Variable(initial_value=np.log(init_kn), dtype=tf.float64)
    # self.KN = tf.exp(self.KN_base)
    self.ph_tf.KN = tf.Variable(initial_value=init_kn, dtype=tf.float64)
    self.ph_tf.DistR = self.ph_tf.Dist * self.ph_tf.KN * self.ph.KN0

    # Train-vs-train: put +inf on the diagonal so that, once turned into a
    # score below, a row cannot match itself.
    if not flag_test_exists:
        self.ph_tf.DistR += tf.cast(tf.diag([np.inf] * self.batch_size),
                                    dtype=tf.float64)

    # Turn distances into matching scores (larger means closer).
    if self.cal_dist_mode == 0:
        self.ph_tf.DistRR = -self.ph_tf.DistR
    elif self.cal_dist_mode == 1:
        self.ph_tf.DistRR = tf.reciprocal(self.ph_tf.DistR)
        if not flag_test_exists:
            # NOTE(review): this subtracts inf from the inf diagonal, which
            # is NaN in IEEE arithmetic. DistRR is already computed, so
            # only DistR itself is affected - confirm this is intended.
            self.ph_tf.DistR -= tf.cast(tf.diag([np.inf] *
                                                self.batch_size),
                                        dtype=tf.float64)
    elif self.cal_dist_mode == 2:
        self.ph_tf.DistRR = tf.sigmoid(-self.ph_tf.DistR)
    else:
        self.ph_tf.DistRR = tf.reciprocal(
            self.ph_tf.DistR) - self.ph_tf.DistR

    # Softmax over axis 0 (the training rows): each column sums to one.
    self.ph_tf.IMatch = tf.nn.softmax(self.ph_tf.DistRR, axis=0)
    # self.ph_tf.IMatch = tf.nn.softmax(self.ph_tf.DistRR)
    SubYTrain_vec = tf.reshape(self.ph_tf.SubYTrain, (1, -1))
    self.ph_tf.Y_predict = tf.matmul(SubYTrain_vec, self.ph_tf.IMatch)
    if flag_test_exists:
        self.ph_tf.Y_compare = tf.reshape(self.ph_tf.SubYTest, (1, -1))
    else:
        self.ph_tf.Y_compare = SubYTrain_vec

    # `loss` is the training objective; `my_loss` is the RMSE of the same
    # residual and is not part of the optimised target.
    self.ph_tf.loss = tf.nn.l2_loss(
        tf.subtract(self.ph_tf.Y_compare, self.ph_tf.Y_predict))
    self.ph_tf.my_loss = tf.sqrt(
        tf.reduce_mean(
            tf.square(self.ph_tf.Y_compare - self.ph_tf.Y_predict)))
    self.ph_tf.reg_term = tf.reduce_sum(self.ph_tf.m_) * self.ph.reg_alpha
    self.ph_tf.op_tar = self.ph_tf.loss + self.ph_tf.reg_term
    self.ph_tf.optimizer = tf.train.AdamOptimizer(
        **self.ph.dict_para_optimizer)
    # self.optimizer = tf.train.AdamOptimizer(learning_rate=10, beta1=0.5, beta2=0.8, epsilon=1e-8)
    # self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=1,)
    self.ph_tf.train = self.ph_tf.optimizer.minimize(self.ph_tf.op_tar)
    self.ph_tf.init_op = tf.global_variables_initializer()
def test_trace_KiX_against_solve(self):
    """trace_KiX(B) must equal the trace of solve(B) for a random square B."""
    rhs = np.random.randn(self.N, self.N)

    # Value from the dedicated trace routine.
    trace_direct_op = self.mat.trace_KiX(rhs)
    trace_direct = self.session.run(trace_direct_op, self.feed)

    # Reference value: solve explicitly, then sum the diagonal.
    trace_reference_op = tf.reduce_sum(tf.diag_part(self.mat.solve(rhs)))
    trace_reference = self.session.run(trace_reference_op, self.feed)

    values_agree = np.allclose(trace_direct, trace_reference)
    self.assertTrue(values_agree)
def logpdf(self, x, mean=None, cov=1):
    """Log of the probability density function.

    Parameters
    ----------
    x : tf.Tensor
        A 1-D or 2-D tensor.
    mean : tf.Tensor, optional
        A 1-D tensor. Defaults to zero mean.
    cov : tf.Tensor, optional
        A 1-D or 2-D tensor. Defaults to identity matrix. The Python
        scalar ``1`` (or ``1.0``) selects the identity covariance.

    Returns
    -------
    tf.Tensor
        A tensor of one dimension less than the input.
    """
    x = tf.cast(x, dtype=tf.float32)
    x_shape = get_dims(x)
    if len(x_shape) == 1:
        d = x_shape[0]
    else:
        d = x_shape[1]

    if mean is None:
        r = x
    else:
        mean = tf.cast(mean, dtype=tf.float32)
        r = x - mean

    # Compare by value, not identity: `cov is 1` depended on interpreter
    # int caching and did not recognise 1.0. The isinstance guard keeps
    # tensors and arrays out of the `==` comparison.
    if isinstance(cov, (int, float)) and cov == 1:
        L_inv = tf.diag(tf.ones([d]))
        det_cov = tf.constant(1.0)
    else:
        cov = tf.cast(cov, dtype=tf.float32)
        if len(cov.get_shape()) == 1:  # vector
            # Diagonal covariance: inverse square root of each variance.
            L_inv = tf.diag(1.0 / tf.sqrt(cov))
            det_cov = tf.reduce_prod(cov)
        else:  # matrix
            L = tf.cholesky(cov)
            L_inv = tf.matrix_inverse(L)
            # det(cov) = prod(diag(L))**2 for the Cholesky factor L.
            det_cov = tf.pow(tf.reduce_prod(tf.diag_part(L)), 2)

    # Normalisation constant shared by every row of x.
    lps = -0.5 * d * tf.log(2 * np.pi) - 0.5 * tf.log(det_cov)
    if len(x_shape) == 1:  # vector
        r = tf.reshape(r, shape=(d, 1))
        inner = tf.matmul(L_inv, r)
        lps -= 0.5 * tf.matmul(inner, inner, transpose_a=True)
        return tf.squeeze(lps)
    else:  # matrix
        # TODO vectorize further
        # NOTE(review): tf.unpack / tf.pack are the pre-1.0 names of
        # tf.unstack / tf.stack; kept because the targeted TensorFlow
        # version cannot be determined from this block.
        out = []
        for r_vec in tf.unpack(r):
            r_vec = tf.reshape(r_vec, shape=(d, 1))
            inner = tf.matmul(L_inv, r_vec)
            out += [
                tf.squeeze(lps -
                           0.5 * tf.matmul(inner, inner, transpose_a=True))
            ]
        return tf.pack(out)
def bpr(self, yhat):
    """Bayesian Personalised Ranking loss on a square score matrix.

    The diagonal of ``yhat`` holds the scores of the target items; every
    other entry in a row is treated as a negative sample for that row.

    Args:
        yhat: square 2-D score tensor.

    Returns:
        Scalar tensor: the mean of ``-log(sigmoid(target - other))``.
    """
    yhatT = tf.transpose(yhat)
    score_gap = tf.diag_part(yhat) - yhatT
    # softplus(-z) is identical to -log(sigmoid(z)) but stays finite when z
    # is very negative, where sigmoid(z) underflows to 0 and the log
    # becomes -inf.
    return tf.reduce_mean(tf.nn.softplus(-score_gap))
def cross_entropy(self, yhat):
    """Mean negative log of the diagonal entries of ``yhat``.

    Args:
        yhat: square 2-D tensor; only its diagonal is used.

    Returns:
        Scalar tensor.
    """
    # tf.diag_part extracts the values on the diagonal.
    target_scores = tf.diag_part(yhat)
    # The tiny constant keeps the log finite if a score is exactly zero.
    neg_log_scores = -tf.log(target_scores + 1e-24)
    return tf.reduce_mean(neg_log_scores)
def __init__(self,
             is_training,
             word_embeddings,
             cell_name,
             simple_position=False):
    """Build the bidirectional-RNN encoder, the training loss and test outputs.

    The (dropout-regularised) sentence embedding is run through a
    bidirectional dynamic RNN; the final forward and backward states are
    concatenated and fed to the attention layer (``self.katt`` or
    ``self.att`` depending on ``FLAGS.katt_flag``). When ``is_training`` is
    false, per-bag test scores are also built and stored in
    ``self.test_output``.

    Args:
        is_training: whether the graph is built for training; also enables
            dropout. The test sub-graph is only created when this is false.
        word_embeddings: forwarded to ``NN.__init__``.
        cell_name: forwarded to ``self.get_rnn_cell`` to select the cell.
        simple_position: forwarded to ``NN.__init__``.
    """
    NN.__init__(self, is_training, word_embeddings, simple_position)

    # NOTE(review): tf.layers.dropout's `rate` is the fraction of units to
    # DROP, yet it receives self.keep_prob - confirm which is intended.
    input_sentence = tf.layers.dropout(self.input_embedding,
                                       rate=self.keep_prob,
                                       training=is_training)

    with tf.name_scope('bi-rnn'):
        fw_cell = self.get_rnn_cell(FLAGS.hidden_size, cell_name)
        bw_cell = self.get_rnn_cell(FLAGS.hidden_size, cell_name)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            fw_cell,
            bw_cell,
            input_sentence,
            sequence_length=self.len,
            dtype=tf.float32,
            scope='bi-dynamic-rnn')
        fw_states, bw_states = states
        # Cells with tuple states: keep only the first component.
        if isinstance(fw_states, tuple):
            fw_states = fw_states[0]
            bw_states = bw_states[0]
        # Concatenate the unwrapped states. Concatenating `states` directly
        # ignored the unwrapping above and, for tuple states, stacked the
        # state components instead of joining forward and backward.
        x = tf.concat([fw_states, bw_states], axis=1)

    if FLAGS.katt_flag != 0:
        stack_repre = self.katt(x, is_training, False)
    else:
        stack_repre = self.att(x, is_training, False)

    with tf.name_scope("loss"):
        logits = tf.matmul(stack_repre, tf.transpose(
            self.relation_matrix)) + self.bias
        # The unweighted cross-entropy that used to be assigned to self.loss
        # here was overwritten on the next statement; only the weighted loss
        # is built now.
        self.loss = tf.losses.softmax_cross_entropy(
            onehot_labels=self.label, logits=logits, weights=self.weights)
        self.output = tf.nn.softmax(logits)
        tf.summary.scalar('loss', self.loss)
        self.predictions = tf.argmax(logits, 1, name="predictions")
        self.correct_predictions = tf.equal(self.predictions,
                                            tf.argmax(self.label, 1))
        self.accuracy = tf.reduce_mean(
            tf.cast(self.correct_predictions, "float"), name="accuracy")

    if not is_training:
        with tf.name_scope("test"):
            if FLAGS.katt_flag != 0:
                test_attention_logit = self.katt_test(x)
            else:
                test_attention_logit = self.att_test(x)

            test_tower_output = []
            for i in range(FLAGS.test_batch_size):
                # Rows scope[i]:scope[i + 1] belong to bag i.
                bag = slice(self.scope[i], self.scope[i + 1])
                test_attention_score = tf.nn.softmax(
                    tf.transpose(test_attention_logit[bag, :]))
                final_repre = tf.matmul(test_attention_score, x[bag])
                # Use the instance attributes, as the training logits above
                # do; the bare names `relation_matrix` / `bias` are not
                # defined in this scope.
                test_logits = tf.matmul(
                    final_repre,
                    tf.transpose(self.relation_matrix)) + self.bias
                # Entry (r, r): probability of relation r under the
                # representation attended for relation r.
                output = tf.diag_part(tf.nn.softmax(test_logits))
                test_tower_output.append(output)

            test_stack_output = tf.reshape(
                tf.stack(test_tower_output),
                [FLAGS.test_batch_size, self.num_classes])
            self.test_output = test_stack_output
def mu_tilde_square(X_data, Z, S, m, Kzz_inv, a, g):
    '''
    Predictive mean and variance at the data points of a sparse GP.

    N : num datapoints
    D : datapoint dimensionality
    M : number inducing points

    IN:
    ---
    X_data   : (N, D)
    Z        : (M, D)
    S        : (M, M)
    m        : (M)
    Kzz_inv  : (M, M)
    a        : (D)
    g        : ()

    OUT:
    ----
    mu       : (N)
    sig_sqr  : (N)
    parts    : list [XX_cov, middle, right, k_zx] of the intermediate terms,
               with sig_sqr = XX_cov - middle + right
    '''
    # DEBUG:
    # Kzz_inv = tf.eye(tf.shape(Z)[0])

    with tf.name_scope('K_ZX'):
        # k_zx : (M, N)
        k_zx = ard_kernel(Z, X_data, gamma=g, alphas=a)
    with tf.name_scope('K_XZ'):
        # k_xz : (N, M)
        k_xz = tf.transpose(k_zx, name='K_XZ')
    with tf.name_scope('K_XX'):
        # K_xx : (N, N)
        K_xx = ard_kernel(X_data, X_data, gamma=g, alphas=a)

    with tf.name_scope('kernel_matrices_summaries'):
        tf.summary.histogram('KZZ_inv', Kzz_inv)
        tf.summary.histogram('KZX', k_zx)
        tf.summary.histogram('KXX', K_xx)

    # Shared left factor K_xz Kzz^-1, (N, M); computed once and reused for
    # the mean and both variance terms.
    k_xz_kzz_inv = tf.matmul(k_xz, Kzz_inv)

    # mu : (N, M)dot(M, M)dot(M) = (N)
    mu = tf.squeeze(
        tf.matmul(k_xz_kzz_inv, tf.expand_dims(m, 1), name='mu'))

    with tf.name_scope('XX_variance'):
        # Only the diagonals of the (N, N) products are needed, and
        # diag(A B^T)_i = sum_j A_ij * B_ij, so a row-wise sum of
        # element-wise products avoids building the N x N matrices.
        middle = tf.reduce_sum(k_xz_kzz_inv * k_xz, axis=1)
        right = tf.reduce_sum(
            tf.matmul(tf.matmul(k_xz_kzz_inv, S), Kzz_inv) * k_xz, axis=1)
        XX_cov = tf.diag_part(K_xx)
        sig_sqr = XX_cov - middle + right

    tf.summary.histogram('mean_at_datapoints', mu)
    tf.summary.histogram('variance_at_datapoints', sig_sqr)
    return mu, sig_sqr, [XX_cov, middle, right, k_zx]
def trace_KiX(self, X):
    """
    Compute tr(K^{-1} X), where K is this matrix.

    X is a square matrix of the same size as this one. The result is the
    sum of X's diagonal entries divided element-wise by ``self.d``.
    """
    scaled_diagonal = tf.diag_part(X) / self.d
    return tf.reduce_sum(scaled_diagonal)
tf.reset_default_graph()

# Inputs: one placeholder per view, with the feature width taken from the
# corresponding data array.
x_place = tf.placeholder(tf.float32,
                         shape=(None, x_array.shape[-1]),
                         name='x_place')
y_place = tf.placeholder(tf.float32,
                         shape=(None, y_array.shape[-1]),
                         name='y_place')

# Project each view onto NVECS components.
x_proj = dense_layer(x_place, units=NVECS, smoothness=X_SMOOTH, name='x_proj')
y_proj = dense_layer(y_place, units=NVECS, smoothness=Y_SMOOTH, name='y_proj')

# Cross-product matrix between the two projections, (NVECS, NVECS).
covar_mat = tf.matmul(x_proj, y_proj, transpose_a=True)
abs_covar = tf.abs(covar_mat)

# The diagonal is rewarded; the upper and lower triangular bands (each of
# which includes the diagonal) are penalised.
cca_loss = tf.reduce_sum(tf.diag_part(abs_covar))
upper_loss = tf.reduce_sum(tf.matrix_band_part(abs_covar, 0, -1))
lower_loss = tf.reduce_sum(tf.matrix_band_part(abs_covar, -1, 0))
total_loss = -3. * cca_loss + upper_loss + lower_loss

## create optimizer and train op
optimizer = tf.train.AdamOptimizer(learning_rate=LEARN_RATE)
train_op = optimizer.minimize(total_loss)

## create eval op
eval_op = tf_pearson_correlation(x_proj, y_proj)

## get weight clipping ops
ortho_ops = tf.get_collection('ortho')
def logdet(self):
    """Log-determinant of the matrix, from ``self.d`` and ``self.W``.

    Computed as ``sum(log d) + log det(I + W^T diag(1/d) W)``, with the
    second term taken from a Cholesky factor. This is the determinant
    lemma for a matrix of the form ``diag(d) + W W^T`` (presumably the
    matrix this class represents; confirm against the class).
    """
    part1 = tf.reduce_sum(tf.log(self.d))
    # The dtype must be passed by keyword: the second positional parameter
    # of tf.eye is num_columns, not dtype.
    I = tf.eye(tf.shape(self.W)[1], dtype=float_type)
    M = I + tf.matmul(tf.transpose(self.W) / self.d, self.W)
    # log det(M) = 2 * sum(log(diag(chol(M)))).
    part2 = 2 * tf.reduce_sum(tf.log(tf.diag_part(tf.cholesky(M))))
    return part1 + part2
def log_cholesky_det(chol):
    """Return twice the sum of the logs of ``chol``'s diagonal.

    When ``chol`` is the Cholesky factor L of A = L L^T, this equals
    log det(A).
    """
    diagonal = tf.diag_part(chol)
    log_diagonal_sum = tf.reduce_sum(tf.log(diagonal))
    return 2 * log_diagonal_sum
def energy(x):
    """Unnormalized minus log density of 2d strongly correlated Gaussian.

    Args:
        x: 2-D tensor with one point per row.

    Returns:
        1-D tensor with one energy per row:
        ``0.5 * (x - mu) sigma_inv (x - mu)^T``.
    """
    xmmu = x - mu
    # Row-wise quadratic form. This equals the diagonal of
    # xmmu @ sigma_inv @ xmmu^T without building that N x N matrix.
    return .5 * tf.reduce_sum(tf.matmul(xmmu, sigma_inv) * xmmu, axis=1)
def triangular_inv(L):
    """Invert a triangular matrix by solving ``L X = I``.

    The identity is built from ``L``'s own diagonal so that it has the same
    size and dtype as ``L``.
    """
    unit_diagonal = tf.ones_like(tf.diag_part(L))
    identity = tf.diag(unit_diagonal)
    return tf.matrix_triangular_solve(L, identity)
def build_variance_standard(self):
    """Build the predictive-variance tensor at ``self.x_test``.

    For every mixture component the latent variance at the test inputs is
    obtained from ``self.sparsity._build_intermediate_conditionals``,
    increased by the observation noise, weighted by the component weight
    and accumulated into output slot 0. ``full_var_flag`` is hard-coded to
    ``False``, so the diagonal-only path is always taken and the stacked
    per-output sums are returned.

    NOTE(review): several ``tf.Print`` debugging ops are wired into the
    returned tensor. Block nesting was reconstructed from a
    whitespace-collapsed source; confirm against the original file.
    """
    print('build variance')
    num_test = self.x_test.get_shape().as_list()[0]  # not used below
    total_sum = [0.0 for y in range(self.num_outputs)]
    r = self.r  # not used below
    full_var_flag = False

    # NOTE(review): precomp_intermediate is filled here but never read; the
    # second loop recomputes the same quantities.
    precomp_intermediate = [[] for x in range(self.num_components)]
    for l in range(self.num_components):
        x_test = tf.expand_dims(self.x_test, 1)  #N* x 1 x D
        mu_f, sigma_f, _, _ = self.sparsity._build_intermediate_conditionals(
            l, self.a, x_test, predict=not full_var_flag)
        mu_f, sigma_f = self.get_expected_values(mu_f, sigma_f)
        pi_l = self.q_weights[l]
        precomp_intermediate[l].append(pi_l)
        precomp_intermediate[l].append(mu_f)
        precomp_intermediate[l].append(sigma_f)

    # No observation noise is added when plotting the posterior.
    if self.context.plot_posterior:
        noise_sigma = 0.0
    else:
        noise_sigma = tf.square(util.var_postive(self.sigma_y[0]))
        noise_sigma = tf.Print(
            noise_sigma,
            [noise_sigma,
             tf.square(util.var_postive(self.sigma_y[0]))], 'noise_sigma: ')
        noise_sigma = tf.Print(noise_sigma, [noise_sigma], 'noise_sigma: ')

    for k in range(self.num_components):
        #mu_f = [Q, N, 1]
        #mu_w = [Q, P, N, 1]
        x_test = tf.expand_dims(self.x_test, 1)  #N* x 1 x D
        #mu_f = #Q * N* x 1
        #sigma_f = #Q * N* x 1 x 1
        mu_f, sigma_f, _, _ = self.sparsity._build_intermediate_conditionals(
            k, self.a, x_test, predict=not full_var_flag)
        mu_f, sigma_f = self.get_expected_values(mu_f, sigma_f)
        pi_k = self.q_weights[k]

        # Only output 0 (i) and latent function 0 (j) are used.
        i = 0
        j = 0
        mu_f = mu_f[j, :, 0]  # N x 1
        #sigma_f = tf.matrix_diag_part(sigma_f[j, :])
        sigma_f = sigma_f[j, :, :, 0]  # N x 1
        s = sigma_f[:, 0]
        s = tf.Print(s, [noise_sigma], 'noise_sigma: ')
        s = tf.Print(s, [tf.shape(sigma_f)], 'tf.shape(sigma_f): ')
        s += noise_sigma
        total_sum[i] += pi_k * s

    if full_var_flag:
        total_sum = total_sum[0]
    else:
        total_sum = tf.stack(total_sum, axis=1)
    total_sum = tf.Print(total_sum, [self.likelihood_weights[self.r]],
                         'self.likelihood_weights[self.r]: ')

    # Unreachable while full_var_flag is hard-coded to False.
    if full_var_flag:
        total_sum = tf.Print(total_sum, [tf.shape(total_sum)], 'total_sum: ')
        return tf.expand_dims(tf.diag_part(total_sum), -1)
    return total_sum
def invert(settings, epoch, samples, g_tolerance=None, e_tolerance=0.1,
           n_iter=None, max_iter=10000, heuristic_sigma=None, C_samples=None):
    """
    Return the latent space points corresponding to a set of samples
    (found by gradient descent on the latent variable Z).

    Args:
        settings: Either a settings dict, or the name of a settings file that
            is read from ``./experiments/settings/<name>.txt`` as JSON.
        epoch: Epoch whose saved generator parameters are loaded
            (``<identifier>_<epoch>``).
        samples: Array indexed as [sample, time, feature]; cast to float32.
        g_tolerance: If given (and ``n_iter`` is None), iterate until the mean
            per-sample gradient norm drops to this value or below.
        e_tolerance: Used when neither ``n_iter`` nor ``g_tolerance`` is given;
            iterate until ``abs(error)`` drops to this value or below.
        n_iter: If given, run exactly this many solver steps.
        max_iter: The tolerance-driven loops stop once the step counter
            exceeds this value.
        heuristic_sigma: Kernel bandwidth; when None it is estimated from
            ``samples`` with ``mmd.median_pairwise_distance``.
        C_samples: Optional conditioning array (one row per sample) for the
            conditional generator.

    Returns:
        Tuple ``(Zs, error_per_sample, heuristic_sigma)``: the optimised latent
        points, the per-sample reconstruction error, and the bandwidth used.
    """
    # cast samples to float32
    samples = np.float32(samples[:, :, :])
    # get the model
    if isinstance(settings, str):
        settings_path = './experiments/settings/' + settings + '.txt'
        # Context manager so the file handle is closed even if parsing fails.
        with open(settings_path, 'r') as settings_file:
            settings = json.load(settings_file)
    num_samples = samples.shape[0]
    print(
        'Inverting',
        num_samples,
        'samples using model',
        settings['identifier'],
        'at epoch',
        epoch,
    )
    if g_tolerance is not None:
        print('until gradient norm is below', g_tolerance)
    else:
        print('until error is below', e_tolerance)
    # get parameters
    parameters = load_parameters(settings['identifier'] + '_' + str(epoch))
    # assertions
    assert samples.shape[2] == settings['num_generated_features']
    # create VARIABLE Z
    Z = tf.get_variable(
        name='Z',
        shape=[num_samples, settings['seq_length'], settings['latent_dim']],
        initializer=tf.random_normal_initializer())
    if C_samples is None:
        # create outputs
        G_samples = generator(Z,
                              settings['hidden_units_g'],
                              settings['seq_length'],
                              num_samples,
                              settings['num_generated_features'],
                              reuse=False,
                              parameters=parameters)
        fd = None
    else:
        CG = tf.placeholder(tf.float32, [num_samples, settings['cond_dim']])
        assert C_samples.shape[0] == samples.shape[0]
        # CGAN
        G_samples = generator(Z,
                              settings['hidden_units_g'],
                              settings['seq_length'],
                              num_samples,
                              settings['num_generated_features'],
                              reuse=False,
                              parameters=parameters,
                              cond_dim=settings['cond_dim'],
                              c=CG)
        fd = {CG: C_samples}

    # define loss
    if heuristic_sigma is None:
        heuristic_sigma = mmd.median_pairwise_distance(
            samples)  # this is noisy
        print('heuristic_sigma:', heuristic_sigma)
    Kxx, Kxy, Kyy, wts = mmd._mix_rbf_kernel(
        G_samples, samples,
        sigmas=tf.constant(value=heuristic_sigma, shape=(1, 1)))
    # The diagonal of the cross-kernel pairs generated sample i with target i.
    similarity_per_sample = tf.diag_part(Kxy)
    reconstruction_error_per_sample = 1 - similarity_per_sample
    similarity = tf.reduce_mean(similarity_per_sample)
    reconstruction_error = 1 - similarity
    # updater: only Z is optimised
    solver = tf.train.RMSPropOptimizer(learning_rate=0.1).minimize(
        reconstruction_error_per_sample, var_list=[Z])

    grad_Z = tf.gradients(reconstruction_error_per_sample, Z)[0]
    grad_per_Z = tf.norm(grad_Z, axis=(1, 2))
    grad_norm = tf.reduce_mean(grad_per_Z)

    print('Finding latent state corresponding to samples...')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        error = sess.run(reconstruction_error, feed_dict=fd)
        g_n = sess.run(grad_norm, feed_dict=fd)
        print(g_n)
        i = 0
        if n_iter is not None:
            # Fixed number of steps.
            while i < n_iter:
                _ = sess.run(solver, feed_dict=fd)
                error = sess.run(reconstruction_error, feed_dict=fd)
                i += 1
        elif g_tolerance is not None:
            # Stop on gradient norm.
            while g_n > g_tolerance:
                _ = sess.run(solver, feed_dict=fd)
                error, g_n = sess.run([reconstruction_error, grad_norm],
                                      feed_dict=fd)
                i += 1
                print(error, g_n)
                if i > max_iter:
                    break
        else:
            # Stop on reconstruction error.
            while np.abs(error) > e_tolerance:
                _ = sess.run(solver, feed_dict=fd)
                error = sess.run(reconstruction_error, feed_dict=fd)
                i += 1
                print(error)
                if i > max_iter:
                    break
        Zs = sess.run(Z, feed_dict=fd)
        error_per_sample = sess.run(reconstruction_error_per_sample,
                                    feed_dict=fd)
        print('Z found in', i,
              'iterations with final reconstruction error of', error)
    # Clear the graph so a later call can create the variable 'Z' again.
    tf.reset_default_graph()
    return Zs, error_per_sample, heuristic_sigma
def define(self):
    """Assemble the full graph: inputs, encoders, heads, loss and optimizer.

    Seven sequence placeholders (one abstract plus label/definition/unit for
    each of two items) are encoded by ``self.bidirectional_rnn``; the two items
    share encoder variables per field. A bilinear score with the shared matrix
    ``w_sim`` is computed for each field, everything is concatenated, and three
    linear heads (mt / ent / char) are trained jointly with softmax
    cross-entropy plus an L2 penalty on the head parameters.
    """

    def sequence_input():
        # One padded sequence batch.
        return tf.placeholder("float", [None, self.n_steps, self.n_input])

    def target_input():
        # One batch of class targets.
        return tf.placeholder("float", [None, self.n_class])

    def encode_pair(scope_name, first, second):
        # Encode both items of a field with the same BiLSTM variables.
        with tf.variable_scope(scope_name) as scope:
            first_encoded = self.bidirectional_rnn(first)
            scope.reuse_variables()
            second_encoded = self.bidirectional_rnn(second)
        return first_encoded, second_encoded

    def bilinear_score(left, right):
        # Diagonal of left . w_sim . right^T, returned as a column.
        pairwise = tf.matmul(tf.matmul(left, self.w_sim), tf.transpose(right))
        return tf.expand_dims(tf.diag_part(pairwise), 1)

    def head_params(weight_name, bias_name):
        # Weight and bias of one linear output head.
        weight = tf.get_variable(
            weight_name,
            shape=[7 * self.n_hidden + 3, self.n_class],
            initializer=tf.contrib.layers.xavier_initializer())
        bias = tf.get_variable(
            bias_name, [self.n_class],
            initializer=tf.constant_initializer(0.0))
        return weight, bias

    def head_logits(weight, bias):
        # Each head gets its own dropout op on the joined vector.
        dropped = tf.nn.dropout(self.joined_vec, self.keep_prob)
        return tf.matmul(dropped, weight) + bias

    def mean_cross_entropy(logits, targets):
        return tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=targets))

    # ---- inputs -----------------------------------------------------------
    self.abstract = sequence_input()
    self.x1_label = sequence_input()
    self.x2_label = sequence_input()
    self.x1_defn = sequence_input()
    self.x2_defn = sequence_input()
    self.x1_unit = sequence_input()
    self.x2_unit = sequence_input()
    self.y_mt = target_input()
    self.y_ent = target_input()
    self.y_char = target_input()
    self.keep_prob = tf.placeholder(tf.float32)

    # ---- encoders ---------------------------------------------------------
    with tf.variable_scope("BiLSTM_Abstract"):
        abstract_encoded = self.bidirectional_rnn(self.abstract)
        self.rep_abstract, self.seq_len_abstract = abstract_encoded

    label_1, label_2 = encode_pair("BiLSTM_Label", self.x1_label,
                                   self.x2_label)
    self.rep_x1_label, self.seq_len_x1_label = label_1
    self.rep_x2_label, self.seq_len_x2_label = label_2

    defn_1, defn_2 = encode_pair("BiLSTM_Defn", self.x1_defn, self.x2_defn)
    self.rep_x1_defn, self.seq_len_x1_defn = defn_1
    self.rep_x2_defn, self.seq_len_x2_defn = defn_2

    unit_1, unit_2 = encode_pair("BiLSTM_Unit", self.x1_unit, self.x2_unit)
    self.rep_x1_unit, self.seq_len_x1_unit = unit_1
    self.rep_x2_unit, self.seq_len_x2_unit = unit_2

    # ---- similarity features ----------------------------------------------
    self.w_sim = tf.get_variable(
        "w_sim",
        shape=[self.n_hidden, self.n_hidden],
        initializer=tf.contrib.layers.xavier_initializer())
    self.sim_score_label = bilinear_score(self.rep_x1_label,
                                          self.rep_x2_label)
    self.sim_score_defn = bilinear_score(self.rep_x1_defn, self.rep_x2_defn)
    self.sim_score_unit = bilinear_score(self.rep_x1_unit, self.rep_x2_unit)

    # Seven representations plus three scalar scores, matching the
    # 7 * n_hidden + 3 input width of the heads.
    joined_parts = [
        self.rep_abstract,
        self.rep_x1_label, self.rep_x1_defn, self.rep_x1_unit,
        self.sim_score_label, self.sim_score_defn, self.sim_score_unit,
        self.rep_x2_label, self.rep_x2_defn, self.rep_x2_unit,
    ]
    self.joined_vec = tf.concat(joined_parts, 1)

    # ---- output heads -----------------------------------------------------
    self.w_out_mt, self.b_out_mt = head_params("w_out_mt", "b_out_mt")
    self.w_out_ent, self.b_out_ent = head_params("w_out_ent", "b_out_ent")
    self.w_out_char, self.b_out_char = head_params("w_out_char",
                                                   "b_out_char")

    self.pred_mt = head_logits(self.w_out_mt, self.b_out_mt)
    self.pred_softmax_mt = tf.nn.softmax(self.pred_mt)
    self.pred_ent = head_logits(self.w_out_ent, self.b_out_ent)
    self.pred_softmax_ent = tf.nn.softmax(self.pred_ent)
    self.pred_char = head_logits(self.w_out_char, self.b_out_char)
    self.pred_softmax_char = tf.nn.softmax(self.pred_char)

    # ---- loss and training ops --------------------------------------------
    self.loss_orig = mean_cross_entropy(self.pred_mt, self.y_mt) \
        + mean_cross_entropy(self.pred_ent, self.y_ent) \
        + mean_cross_entropy(self.pred_char, self.y_char)

    l2_reg_lambda = 0.5
    l2_penalty = tf.nn.l2_loss(self.w_out_mt) + tf.nn.l2_loss(self.b_out_mt) \
        + tf.nn.l2_loss(self.w_out_ent) + tf.nn.l2_loss(self.b_out_ent) \
        + tf.nn.l2_loss(self.w_out_char) + tf.nn.l2_loss(self.b_out_char)
    self.loss = self.loss_orig + l2_reg_lambda * l2_penalty

    adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    self.optimizer = adam.minimize(self.loss)
    self.init = tf.global_variables_initializer()
    self.saver = tf.train.Saver()
def getLogDet(M):
    """Return the log-determinant of ``M`` computed from its Cholesky factor.

    Evaluates ``2 * sum(log(diag(cholesky(M))))``; the sum is taken over
    axis 0 of the factor's diagonal.
    """
    cholesky_diagonal = tf.diag_part(tf.cholesky(M))
    log_diagonal = tf.log(cholesky_diagonal)
    return 2.0 * tf.reduce_sum(log_diagonal, 0)
def mmd(data, gen, sigma=1., is_tf=False, weights=None):
    """Computes MMD between two sample sets with a Gaussian kernel.

    The smaller the value, the closer the sets.

    Args:
      data: Samples of the first set, one row per point, e.g. (1000, 2).
        With ``is_tf=False`` any array-like is accepted and a 1-D input is
        treated as a column of scalar points.
      gen: Samples of the second set, same layout as ``data``, e.g. (10, 2).
      sigma: Float, kernel bandwidth.
      is_tf: Boolean. Selects the TensorFlow implementation (inputs are then
        used as given, without conversion).
      weights: Optional (M, 1) array with a weight for each data point; it
        scales the rows of the data-vs-gen kernel block only.

    Returns:
      mmd: Scalar, the MMD between the sets.
      gradients_mmd: Always None (analytical gradients are not implemented).
    """
    gamma = 1.0 / (2.0 * sigma**2)

    # ------------- TensorFlow VERSION -------------
    if is_tf:
        x = data
        y = gen
        data_num = tf.shape(x)[0]
        gen_num = tf.shape(y)[0]
        v = tf.concat([x, y], 0)
        VVT = tf.matmul(v, tf.transpose(v))
        v_sq = tf.reshape(tf.diag_part(VVT), [-1, 1])
        v_sq_tiled = tf.tile(v_sq, [1, data_num + gen_num])
        v_sq_tiled_T = tf.transpose(v_sq_tiled)
        # Squared pairwise distances: |a|^2 - 2 a.b + |b|^2.
        exp_object = v_sq_tiled - 2 * VVT + v_sq_tiled_T
        K = tf.exp(-gamma * exp_object)
        K_xx = K[:data_num, :data_num]
        K_yy = K[data_num:, data_num:]
        K_xy = K[:data_num, data_num:]
        if weights is not None:
            # Optionally multiply the cross block by the data weights.
            K_xy = K_xy * tf.tile(tf.constant(weights), (1, gen_num))
        m = tf.cast(data_num, tf.float32)
        n = tf.cast(gen_num, tf.float32)
        mmd_value = (1. / m / m * tf.reduce_sum(K_xx) +
                     1. / n / n * tf.reduce_sum(K_yy) -
                     2. / m / n * tf.reduce_sum(K_xy))
        # TODO: MMD gradients.
        return mmd_value, None

    # ------------- NumPy VERSION -------------
    # Accept lists/tuples as well as arrays, and promote each 1-D input to a
    # column on its own so that a 1-D `gen` works with a column-shaped `data`.
    x = np.asarray(data)
    y = np.asarray(gen)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    data_num = len(x)
    gen_num = len(y)

    v = np.concatenate((x, y), 0)
    VVT = np.matmul(v, np.transpose(v))
    sqs = np.reshape(np.diag(VVT), [-1, 1])
    # Broadcasting yields the same matrix as tiling `sqs` horizontally and
    # adding its transpose.
    exp_object = sqs - 2 * VVT + np.transpose(sqs)
    K = np.exp(-gamma * exp_object)
    K_xx = K[:data_num, :data_num]
    K_yy = K[data_num:, data_num:]
    K_xy = K[:data_num, data_num:]
    if weights is not None:
        # Optionally multiply the cross block by the data weights.
        K_xy = K_xy * np.tile(weights, (1, gen_num))
    mmd_value = (1. / data_num / data_num * np.sum(K_xx) +
                 1. / gen_num / gen_num * np.sum(K_yy) -
                 2. / data_num / gen_num * np.sum(K_xy))
    # TODO: MMD gradients.
    return mmd_value, None
def nce_loss(inputs, weights, biases, labels, sample, unigram_prob):
    """
    ==========================================================================

    Noise Contrastive Estimation loss.

    inputs: Embeddings for context words. Dimension is [batch_size, embedding_size].
    weights: Weights for nce loss. Dimension is [Vocabulary, embedding_size].
    biases: Biases for nce loss. Dimension is [Vocabulary, 1].
    labels: Word_ids for predicting words. Dimension is [batch_size, 1].
    sample: Word_ids for negative samples. Dimension is [num_sampled];
            must support len().
    unigram_prob: Unigram probability. Dimension is [Vocabulary].

    Returns the negated sum of the noise term (log(1 - sigmoid) summed over the
    negative samples) and the target term (log(sigmoid) for the true label).

    NOTE(review): the noise term has shape [batch_size] and the target term
    [batch_size, 1], so their sum broadcasts to [batch_size, batch_size] -
    confirm that callers expect this.

    ==========================================================================
    """
    num_sampled = len(sample)
    input_dims = inputs.get_shape().as_list()
    batch_size = input_dims[0]
    embedding_size = input_dims[1]
    vocab_size = weights.get_shape().as_list()[0]
    # Small constant added inside some logs to keep them finite.
    eps = tf.exp(-10.0)

    def fetch(table, ids, rows, cols, op_name):
        # Look up `ids` in `table` and force the result to [rows, cols].
        looked_up = tf.nn.embedding_lookup(table, ids, name=op_name)
        return tf.reshape(looked_up, [rows, cols])

    # ---- negative samples -------------------------------------------------
    target_vecs = fetch(weights, labels, batch_size, embedding_size,
                        "labels_embedding")
    noise_vecs = fetch(weights, sample, num_sampled, embedding_size,
                       "sample_embedding")
    noise_bias = fetch(biases, sample, num_sampled, 1, "sample_bias")
    unigram_column = tf.reshape(unigram_prob, [vocab_size, 1])
    noise_prob = fetch(unigram_column, sample, num_sampled, 1,
                       "unigram_sample")

    # Scores of every negative sample against every context:
    # [num_sampled, batch_size].
    noise_scores = tf.add(
        tf.matmul(noise_vecs, inputs, transpose_b=True),
        tf.tile(noise_bias, [1, batch_size]))
    # log(k * P(sample)), replicated across the batch.
    noise_log_kp = tf.log(
        tf.scalar_mul(num_sampled, tf.tile(noise_prob, [1, batch_size]))
        + eps)
    noise_sigmoid = tf.sigmoid(
        tf.subtract(noise_scores, noise_log_kp, name="Inner-sigmoid-B"),
        name="sigmoid-B")
    noise_term = tf.reduce_sum(tf.log(1 - noise_sigmoid + eps), [0])

    # ---- true labels ------------------------------------------------------
    target_bias = fetch(biases, labels, batch_size, 1, "label_bias")
    target_prob = fetch(unigram_column, labels, batch_size, 1,
                        "unigram_sample")
    # The diagonal of the [batch, batch] product pairs each label with its
    # own context.
    target_dot = tf.reshape(
        tf.diag_part(tf.matmul(target_vecs, inputs, transpose_b=True)),
        [batch_size, 1])
    target_scores = tf.add(target_dot, target_bias)
    target_log_kp = tf.log(tf.scalar_mul(num_sampled, target_prob))
    target_sigmoid = tf.sigmoid(
        tf.subtract(target_scores, target_log_kp, name="Inner-sigmoid-B"),
        name="sigmoid-B")
    target_term = tf.log(target_sigmoid + eps)

    return tf.negative(tf.add(noise_term, target_term))
# Discriminator loss. The label matrix `y` is extended with one all-zero
# column before being compared against D_logits.
# NOTE(review): presumably the extra column is the "generated/fake" class -
# confirm against the discriminator definition.
with tf.variable_scope('D_loss'):
    label = tf.concat([y,tf.zeros([batch_size,1])],axis=1)
    d_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=D_logits,labels=label))
# Accuracy: the last column of D is ignored when picking the predicted class.
with tf.variable_scope('accuracy'):
    correct_prediction = tf.equal(tf.argmax(D[:,:-1],1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
# Input-space gradients of the loss and of two diagonal summaries: the mean of
# entries [i, i] of the leading 64x64 block of flat_features and of the leading
# 10x10 block of D_logits.
with tf.name_scope('gradients'):
    grad_loss_over_X = tf.gradients(d_loss, X)[0]
    grad_features_over_X = tf.gradients(
        tf.reduce_mean(tf.diag_part(flat_features[0:64,0:64])),X)[0]
    grad_logit_over_X = tf.gradients(
        tf.reduce_mean(tf.diag_part(D_logits[0:10,0:10])),X)[0]
# Create the session, initialise every global variable, then overwrite them
# with the most recent checkpoint found in 'discriminator_no_GAN/'.
dvar = tf.global_variables()
saver = tf.train.Saver(dvar)
sess = tf.InteractiveSession()
init = tf.global_variables_initializer()
sess.run(init)
# Alternative checkpoint directory (disabled):
#saver.restore(sess,tf.train.latest_checkpoint('GAN/discriminator/'))
saver.restore(sess,tf.train.latest_checkpoint('discriminator_no_GAN/'))
coord = tf.train.Coordinator()
def build_model(self, video, video_mask, caption, caption_mask, train_flag, reuse_variable=False):
    """Build the video/caption scoring graph and its margin loss.

    Steps performed here:
      1. Derive sequence lengths from the masks and build a list of
         progressively shorter masks (video masks are reversed along the
         last axis).
      2. Wrap the video and caption RNN cells in dropout and stack them.
      3. Encode video and caption, then, for each video index ``i``, tile
         that video's embedding across the batch, fuse it with all caption
         embeddings and score the result with four fully connected layers.
      4. Stack the per-video scores into a matrix and compute a hinge loss
         with margin 10 against the matrix diagonal.

    Sets ``self.logit``, ``self.scores``, ``self.mean_loss`` and
    ``self.concept_loss`` (a constant 0), among other attributes.

    NOTE(review): the block structure was reconstructed from a flattened
    source; in particular confirm that the two mask appends belong inside the
    ``for mi`` loop.
    """
    self.video = video  # [batch_size, length, kernel, kernel, channel]
    self.video_mask = video_mask  # [batch_size, length]
    # Number of valid steps per example = sum of the mask over axis 1.
    video_mask_leng = tf.cast(tf.reduce_sum(self.video_mask,1),tf.int32)
    self.caption = caption  # [batch_size, length]
    self.caption_mask = caption_mask  # [batch_size, length]
    caption_mask_leng = tf.cast(tf.reduce_sum(self.caption_mask,1),tf.int32)

    #Make Mask list
    self.video_mask_list = []
    self.caption_mask_list = []
    max_len = self.config.caption_length
    # Two stages that each shorten lengths by 2 (never below 1).
    # NOTE(review): presumably mirrors two width-3 unpadded convolutions in
    # self.fusion - confirm.
    for mi in range(2):
        video_mask_leng = tf.maximum(1, video_mask_leng-2)
        caption_mask_leng = tf.maximum(1, caption_mask_leng-2)
        max_len -= 2
        self.video_mask_list.append(tf.reverse(tf.sequence_mask(video_mask_leng,max_len,tf.float32),[-1]))
        self.caption_mask_list.append(tf.sequence_mask(caption_mask_leng,max_len,tf.float32))
    # One further stage mapping a length n to (n - 1) / 2 (never below 1).
    max_len = int((max_len-1)/2)
    video_mask_leng = tf.cast((video_mask_leng-1)/2,tf.int32)
    video_mask_leng = tf.maximum(1, video_mask_leng)
    caption_mask_leng = tf.cast((caption_mask_leng-1)/2,tf.int32)
    caption_mask_leng = tf.maximum(1, caption_mask_leng)
    self.video_mask_list.append(tf.reverse(tf.sequence_mask(video_mask_leng,max_len,tf.float32),[-1]))
    self.caption_mask_list.append(tf.sequence_mask(caption_mask_leng,max_len,tf.float32))
    self.train_flag = train_flag

    #Batch normalization
    self.bn_fn = slim.batch_norm
    self.bn_params = {'is_training':self.train_flag}
    self.word_embed_t = tf.Variable(self.word_embed, dtype=tf.float32, name="word_embed", trainable=True)

    #video drop
    self.squeezed_feat = tf.squeeze(self.video)
    self.embedded_feat = tf.reshape(self.squeezed_feat, [self.batch_size, self.video_steps, self.channel_size])  # [batch_size, length, channel_size]
    # Multiply features by the video mask (broadcast over the channel axis).
    self.embedded_feat = self.embedded_feat * tf.expand_dims(video_mask, 2)

    # Factories so that every layer of every stack gets its own wrapped cell.
    self.video_cell_d = lambda: rnn_cell.DropoutWrapper(
        self.video_cell(),
        input_keep_prob = self.dropout_keep_prob,
        output_keep_prob = self.dropout_keep_prob)
    self.caption_cell_d = lambda: rnn_cell.DropoutWrapper(
        self.caption_cell(),
        input_keep_prob = self.dropout_keep_prob,
        output_keep_prob = self.dropout_keep_prob)
    video_cell1 = rnn_cell.MultiRNNCell([self.video_cell_d() for _ in range(self.config.num_layers)], state_is_tuple=True)
    video_cell2 = rnn_cell.MultiRNNCell([self.video_cell_d() for _ in range(self.config.num_layers)], state_is_tuple=True)
    video_cell = [video_cell1, video_cell2]
    caption_cell1 = rnn_cell.MultiRNNCell([self.caption_cell_d() for _ in range(self.config.num_layers)], state_is_tuple=True)
    caption_cell2 = rnn_cell.MultiRNNCell([self.caption_cell_d() for _ in range(self.config.num_layers)], state_is_tuple=True)
    caption_cell = [caption_cell1, caption_cell2]

    video_emb_state = self.build_video_embedding(video_cell, self.embedded_feat, self.video_mask, reuse_variable)
    rnn_emb_state = self.build_caption_encoder(caption_cell, reuse_variable)

    with tf.variable_scope("multimodal", initializer=self.initializer) as scope:
        margin_list = []
        logit_list = []
        # One pass per video: score video i against every caption in the batch.
        for i in range(self.batch_size):
            if i > 0:
                # Every pass after the first reuses the scoring variables.
                scope.reuse_variables()
            fuse = self.fusion(tf.tile(tf.expand_dims(video_emb_state[i,:,:],0),[self.batch_size,1,1]) , rnn_emb_state, i, reuse=(i>0))
            with slim.arg_scope([slim.fully_connected],
                                weights_regularizer=slim.l2_regularizer(0.0005),
                                normalizer_fn=self.bn_fn,
                                normalizer_params=self.bn_params):
                logit = slim.fully_connected(fuse, 256, activation_fn=tf.nn.leaky_relu, scope='fc1',reuse=(i>0))
                logit = slim.fully_connected(logit, 256, activation_fn=tf.nn.leaky_relu, scope='fc2',reuse=(i>0))
                logit = slim.fully_connected(logit, 128, activation_fn=tf.nn.leaky_relu, scope='fc3',reuse=(i>0))
                logit = slim.fully_connected(logit, 1, activation_fn=None, scope='scorefn', reuse=(i>0))
            score = logit
            # Both lists receive the same tensor.
            logit_list.append(score)
            margin_list.append(score)
        # NOTE(review): assumed to squeeze to [batch_size, batch_size] with
        # row i = video i against all captions - confirm.
        margin_mat = tf.squeeze(tf.stack(margin_list))
        logit_mat = tf.squeeze(tf.stack(logit_list))
        self.logit = logit_mat
        # Diagonal entries are the scores of the matching video/caption pairs.
        diag_elem = tf.diag_part(margin_mat)
        # Hinge with margin 10 of each row's scores against that row's
        # diagonal score. The diagonal entry itself is not masked out, so it
        # contributes max(0, 10 + 0) = 10 per row.
        loss_mat = tf.maximum(0.0, 10. + margin_mat - tf.reshape(diag_elem, [-1,1]))
        margin_loss = tf.reduce_sum(loss_mat) / (self.batch_size*self.batch_size)
        self.scores = margin_mat
        self.mean_loss = margin_loss
        self.concept_loss = tf.constant(0)
def __init__(self, input_means, input_vars, n_points, n_inducing_points, set_for_training, initial = None):
    """Create the node's variables and build its output moments and energy.

    Args:
        input_means: Means of the node inputs; the second static dimension
            is taken as the input dimensionality and the first (dynamic)
            dimension as the batch size.
        input_vars: Variances of the node inputs.
        n_points: Total number of data points; used in the cavity scaling
            and in the energy.
        n_inducing_points: Number of inducing points.
        set_for_training: Subtracted from ``n_points`` in the cavity scaling
            ``(n_points - set_for_training) / n_points``.
        initial: Optional initial inducing-point locations; when None they
            are drawn uniformly from [-1, 1].

    Sets ``self.output_means``, ``self.output_vars``, ``self.energy`` and the
    diagnostic attributes ``self.v1``, ``self.v2``, ``self.vout``.
    """
    BaseNode.__init__(self, input_means, input_vars)
    self.input_means = input_means
    self.input_vars = input_vars
    self.n_inducing_points = n_inducing_points
    self.input_d = input_means.get_shape().as_list()[1]
    self.batch_size = tf.shape(input_means)[0]
    self.n_points = n_points
    self.set_for_training = set_for_training

    # Covariance parameters of the cavities
    self.LParamPost = tf.Variable(
        tf.random_normal(((self.n_inducing_points, self.n_inducing_points))))
    # Mean parameters of the cavities
    self.mParamPost = tf.Variable(
        tf.random_normal((self.n_inducing_points, 1)))
    # Kernel parameters, passed to the SE helpers below.
    # NOTE(review): names suggest log lengthscales / log signal variance;
    # exp(self.lsf) is used directly as a variance-like term further down.
    self.lls = tf.Variable(tf.zeros([1, self.input_d], dtype=tf.float32))
    self.lsf = tf.Variable(0.0, dtype=tf.float32)
    # Inducing-point locations.
    if (initial is None):
        self.z = tf.Variable(
            tf.random_uniform([self.n_inducing_points, self.input_d], -1, 1))
    else:
        self.z = tf.Variable(initial, dtype=tf.float32)
    jitter = tf.cast(1e-3, tf.float32)

    # Below is based on the equations from page 8
    # Expectation of Kxz w.r.t the input
    EKxz = SE.get_psi1(self.lls, self.lsf, self.input_means,
                       self.input_vars, self.z)
    Kzz = SE.get_kernel(self.lls, self.lsf, self.z, self.z)
    # Diagonal jitter, scaled by exp(lsf), added before inversion.
    Kzz += tf.eye(self.n_inducing_points) * jitter * tf.exp(self.lsf)
    KzzInv = getInversePSD(Kzz)

    # Keep the upper triangle of LParamPost and replace its diagonal d by
    # exp(d) (adding exp(d) - d), which makes the diagonal positive.
    Lu = tf.matrix_band_part(self.LParamPost, 0, -1)
    LParamPost_tri = Lu + tf.diag(tf.exp(tf.diag_part(self.LParamPost)) \
        - tf.diag_part(self.LParamPost))
    LtL = tf.matmul(tf.transpose(LParamPost_tri), LParamPost_tri)

    # Cavity distribution: natural parameters scaled by the fraction below.
    # NOTE(review): if both operands are Python ints under Python 2 this
    # division truncates - confirm the argument types.
    scalar = (self.n_points - self.set_for_training) / self.n_points
    covCavityInv = KzzInv + LtL * scalar
    covCavity = getInversePSD(covCavityInv)
    meanCavity = tf.matmul(covCavity, scalar * self.mParamPost)
    KzzInvcovCavity = tf.matmul(KzzInv, covCavity)
    KzzInvmeanCavity = tf.matmul(KzzInv, meanCavity)

    self.output_means = tf.matmul(EKxz, KzzInvmeanCavity)

    # Variance at the input means: exp(lsf) + rowwise k^T B1 k.
    Kxz = SE.get_kernel(self.lls, self.lsf, self.input_means, self.z)
    B1 = tf.matmul(KzzInvcovCavity, KzzInv) - KzzInv
    v_out = tf.exp(self.lsf) + tf.reduce_sum(Kxz * tf.matmul(Kxz, B1), 1, keep_dims = True)
    B2 = tf.matmul(KzzInvmeanCavity, tf.transpose(KzzInvmeanCavity))

    # Below is based on the equation (35)
    # L is the expectation of Kzz
    # B1 is Kinv
    # B2 is betabetaT
    L = SE.get_L(self.lls, self.lsf, self.z, self.input_means,
                 self.input_vars)
    # Batched outer products k k^T and l l^T.
    k = tf.expand_dims(Kxz, 2)
    kT = tf.expand_dims(Kxz, 1)
    kkT = tf.matmul(k, kT)
    l = tf.expand_dims(EKxz, 2)
    lT = tf.expand_dims(EKxz, 1)
    llT = tf.matmul(l, lT)
    L_kk = L - kkT
    L_ll = L - llT

    # Calculating the traces for the two terms
    # (sum of an elementwise product with the transposed matrix).
    v1 = tf.reduce_sum(tf.expand_dims(B2, 0) \
        * tf.transpose(L_ll, [0, 2, 1]), [1, 2])
    v2 = tf.reduce_sum(tf.expand_dims(B1, 0) \
        * tf.transpose(L_kk, [0, 2, 1]), [1, 2])
    # Absolute values are taken before the terms are added to the variance.
    v1 = tf.abs(tf.expand_dims(v1, 1))
    v2 = tf.abs(tf.expand_dims(v2, 1))
    self.output_vars = v_out + v2 + v1

    # Finally calculate the energy (page 9)
    logZpost = self.getLogNormalizerPosterior(KzzInv, LtL)
    logZprior = self.getLogNormalizerPrior(KzzInv)
    logZcav = self.getLogNormalizerCavity(meanCavity, covCavity,
                                          covCavityInv)
    # We multiply by the minibatch size and normalize terms
    # according to the total number of points (n_points)
    # NOTE(review): the expression below scales by n_points only; no
    # minibatch-size factor is visible here.
    self.v1 = v1
    self.v2 = v2
    self.vout = v_out
    self.energy = (logZcav - logZpost) * self.n_points + logZpost \
        - logZprior