def random_tril_matrix(shape, dtype, force_well_conditioned=False, remove_upper=True): """[batch] lower triangular matrix. Args: shape: `TensorShape` or Python `list`. Shape of the returned matrix. dtype: `TensorFlow` `dtype` or Python dtype. force_well_conditioned: Python `bool`. If `True`, returned matrix will have eigenvalues with modulus in `(1, 2)`. Otherwise, eigenvalues are unit normal random variables. remove_upper: Python `bool`. If `True`, zero out the strictly upper triangle. If `False`, the lower triangle of returned matrix will have desired properties, but will not have the strictly upper triangle zero'd out. Returns: `Tensor` with desired shape and dtype. """ with tf.name_scope("random_tril_matrix"): # Totally random matrix. Has no nice properties. tril = random_normal(shape, dtype=dtype) if remove_upper: tril = tf.matrix_band_part(tril, -1, 0) # Create a diagonal with entries having modulus in [1, 2]. if force_well_conditioned: maxval = tf.convert_to_tensor(np.sqrt(2.), dtype=dtype.real_dtype) diag = random_sign_uniform( shape[:-1], dtype=dtype, minval=1., maxval=maxval) tril = tf.matrix_set_diag(tril, diag) return tril
def gauss_kl(q_mu, q_sqrt, K): """ Compute the KL divergence from q(x) = N(q_mu, q_sqrt^2) to p(x) = N(0, K). We assume multiple independent distributions, given by the columns of q_mu and the last dimension of q_sqrt. q_mu is a matrix; each column contains a mean. q_sqrt is a 3D tensor; each matrix within is a lower triangular square-root matrix of the covariance of q. K is a positive definite matrix: the covariance of p. """ L = tf.cholesky(K) alpha = tf.matrix_triangular_solve(L, q_mu, lower=True) KL = 0.5 * tf.reduce_sum(tf.square(alpha)) # Mahalanobis term. num_latent = tf.cast(tf.shape(q_sqrt)[2], float_type) KL += num_latent * 0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(L)))) # Prior log-det term. KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type) # Constant term. Lq = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0) # Force lower triangle. KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(Lq)))) # Log-det term for q. L_tiled = tf.tile(tf.expand_dims(L, 0), tf.stack([tf.shape(Lq)[0], 1, 1])) LiLq = tf.matrix_triangular_solve(L_tiled, Lq, lower=True) KL += 0.5 * tf.reduce_sum(tf.square(LiLq)) # Trace term. return KL
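The terms accumulated in gauss_kl above are the standard closed-form KL divergence between multivariate Gaussians, summed over the independent columns of q_mu / q_sqrt. For reference (this formula is not part of the original source), with prior covariance K = L L^T and a single column with mean \mu and covariance square-root L_q of dimension d:

\mathrm{KL}\left[\,\mathcal{N}(\mu,\,L_q L_q^\top)\,\big\|\,\mathcal{N}(0,\,K)\,\right] = \tfrac{1}{2}\left( \|L^{-1}\mu\|^2 + \|L^{-1}L_q\|_F^2 - d + \log\det K - \log\det\!\left(L_q L_q^\top\right) \right)

The code evaluates the first two terms with triangular solves against L (alpha and LiLq), and the two log-determinant terms from the diagonals of L and Lq.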
def _random_cholesky_array(self, shape): mat = self._rng.rand(*shape) chol = distribution_util.matrix_diag_transform( mat, transform=tf.nn.softplus) # Zero the upper triangle because we're using this as a true Cholesky factor # in our tests. return tf.matrix_band_part(chol, -1, 0).eval()
def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder): shape = list(shape) diag_shape = shape[:-1] # Upper triangle will be ignored. # Use a diagonal that ensures this matrix is well conditioned. tril = tf.random_normal(shape=shape, dtype=dtype.real_dtype) diag = tf.random_uniform( shape=diag_shape, dtype=dtype.real_dtype, minval=2., maxval=3.) if dtype.is_complex: tril = tf.complex( tril, tf.random_normal(shape, dtype=dtype.real_dtype)) diag = tf.complex( diag, tf.random_uniform( shape=diag_shape, dtype=dtype.real_dtype, minval=2., maxval=3.)) tril = tf.matrix_set_diag(tril, diag) tril_ph = tf.placeholder(dtype=dtype) if use_placeholder: # Evaluate the tril here because (i) you cannot feed a tensor, and (ii) # tril is random and we want the same value used for both mat and # feed_dict. tril = tril.eval() operator = linalg.LinearOperatorTriL(tril_ph) feed_dict = {tril_ph: tril} else: operator = linalg.LinearOperatorTriL(tril) feed_dict = None mat = tf.matrix_band_part(tril, -1, 0) return operator, mat, feed_dict
def call(self, x, mask=None): x1 ,x2 = x outer = tf.matmul(tf.expand_dims(x1, axis=2), tf.expand_dims(x2, axis=1)) outer = tf.matrix_band_part(outer, 0, self.ans_limit) output1 = tf.reshape(tf.cast(tf.argmax(tf.reduce_max(outer, axis=2), axis=1), tf.float32),(-1,1)) output2 = tf.reshape(tf.cast(tf.argmax(tf.reduce_max(outer, axis=1), axis=1), tf.float32),(-1,1)) return [output1, output2]
def _sample_n(self, n, seed): batch_shape = self.batch_shape_tensor() event_shape = self.event_shape_tensor() batch_ndims = tf.shape(batch_shape)[0] ndims = batch_ndims + 3 # sample_ndims=1, event_ndims=2 shape = tf.concat([[n], batch_shape, event_shape], 0) stream = seed_stream.SeedStream(seed, salt="Wishart") # Complexity: O(nbk**2) x = tf.random_normal( shape=shape, mean=0., stddev=1., dtype=self.dtype, seed=stream()) # Complexity: O(nbk) # This parametrization is equivalent to Chi2, i.e., # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2) expanded_df = self.df * tf.ones( self.scale_operator.batch_shape_tensor(), dtype=self.df.dtype.base_dtype) g = tf.random_gamma( shape=[n], alpha=self._multi_gamma_sequence(0.5 * expanded_df, self.dimension), beta=0.5, dtype=self.dtype, seed=stream()) # Complexity: O(nbk**2) x = tf.matrix_band_part(x, -1, 0) # Tri-lower. # Complexity: O(nbk) x = tf.matrix_set_diag(x, tf.sqrt(g)) # Make batch-op ready. # Complexity: O(nbk**2) perm = tf.concat([tf.range(1, ndims), [0]], 0) x = tf.transpose(x, perm) shape = tf.concat([batch_shape, [event_shape[0]], [-1]], 0) x = tf.reshape(x, shape) # Complexity: O(nbM) where M is the complexity of the operator solving a # vector system. For LinearOperatorLowerTriangular, each matmul is O(k^3) so # this step has complexity O(nbk^3). x = self.scale_operator.matmul(x) # Undo make batch-op ready. # Complexity: O(nbk**2) shape = tf.concat([batch_shape, event_shape, [n]], 0) x = tf.reshape(x, shape) perm = tf.concat([[ndims - 1], tf.range(0, ndims - 1)], 0) x = tf.transpose(x, perm) if not self.input_output_cholesky: # Complexity: O(nbk**3) x = tf.matmul(x, x, adjoint_b=True) return x
def _forward(self, x): if self.validate_args: is_matrix = tf.assert_rank_at_least(x, 2) shape = tf.shape(x) is_square = tf.assert_equal(shape[-2], shape[-1]) x = control_flow_ops.with_dependencies([is_matrix, is_square], x) # For safety, explicitly zero-out the upper triangular part. x = tf.matrix_band_part(x, -1, 0) return tf.matmul(x, x, adjoint_b=True)
def CheckUnitary(self, x): # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity. xx = tf.matmul(x, x, adjoint_a=True) identity = tf.matrix_band_part(tf.ones_like(xx), 0, 0) if is_single: tol = 1e-5 else: tol = 1e-14 self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
def Test(self): shape = batch_shape_ + shape_ x = tf.constant(np.random.rand(*shape), dtype=dtype_) with self.test_session(use_gpu=True): for lower in -1, 0, 1, shape_[-2] - 1: for upper in -1, 0, 1, shape_[-1] - 1: y = tf.matrix_band_part(x, lower, upper) error = tf.test.compute_gradient_error(x, x.get_shape().as_list(), y, y.get_shape().as_list()) self.assertLess(error, 1e-4)
def get_right_context_mask(time_steps): """ Generates the mask preventing the decoder from attending to unseen positions. """ # Generate mask that limits decoder self-attention up to and including the current position attn_mask = tf.matrix_band_part(tf.ones([time_steps, time_steps]), -1, 0) # Expand mask to 4D so as to be compatible with attention weights attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 0), 0) # Illegal connections will be set to -inf when fed into the softmax function # Padding for non-masked positions is applied to prevent NaNs attn_mask = -1e9 * (1.0 - attn_mask) return attn_mask
def mask_leq(target_length, source_length): """A mask with 1.0 wherever source_pos <= target_pos and 0.0 elsewhere. Args: target_length: an integer source_length: an integer Returns: a Tensor with shape [1, target_length, source_length] """ return tf.expand_dims( tf.matrix_band_part(tf.ones([target_length, source_length]), -1, 0), 0)
def attention_bias_lower_triangle(length): """ Create a bias tensor to be added to attention logits. Allows a query to attend to all positions up to and including its own. Args: length: A scalar. Returns: A float Tensor of shape [1, 1, length, length], with -1e9 in padding positions and 0 in non-padding positions. """ lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0) ret = FLOAT_MIN * (1. - lower_triangle) return tf.reshape(ret, [1, 1, length, length])
def Test(self): mat = np.ones(shape_).astype(dtype_) batch_mat = np.tile(mat, batch_shape + (1, 1)) with self.test_session(use_gpu=True): for lower in -1, 0, 1, shape_[-2] - 1: for upper in -1, 0, 1, shape_[-1] - 1: band_np = mat if lower >= 0: band_np = np.triu(band_np, -lower) if upper >= 0: band_np = np.tril(band_np, upper) if batch_shape != (): band_np = np.tile(band_np, batch_shape + (1, 1)) band = tf.matrix_band_part(batch_mat, lower, upper) self.assertAllEqual(band_np, band.eval())
def get_decoder_self_attention_bias(length): """Calculate bias for decoder that maintains model's autoregressive property. Creates a tensor that masks out locations that correspond to illegal connections, so prediction at position i cannot draw information from future positions. Args: length: int length of sequences in batch. Returns: float tensor of shape [1, 1, length, length] """ with tf.name_scope("decoder_self_attention_bias"): valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0) valid_locs = tf.reshape(valid_locs, [1, 1, length, length]) decoder_bias = _NEG_INF * (1.0 - valid_locs) return decoder_bias
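As a quick illustration of the banding call shared by the masking helpers above, here is a minimal TF 1.x sketch (not taken from any of the snippets; the length is chosen arbitrarily) showing the lower-triangular mask and the resulting additive bias:

import tensorflow as tf  # TF 1.x graph mode, matching the snippets above

length = 4
valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)  # keep the lower triangle
bias = -1e9 * (1.0 - valid_locs)  # large negative bias on future positions

with tf.Session() as sess:
    print(sess.run(valid_locs))
    # [[1. 0. 0. 0.]
    #  [1. 1. 0. 0.]
    #  [1. 1. 1. 0.]
    #  [1. 1. 1. 1.]]
    # bias is 0.0 where attention is allowed and -1e9 above the diagonal.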
def _assertions(self, x): if not self.validate_args: return [] x_shape = tf.shape(x) is_matrix = tf.assert_rank_at_least( x, 2, message="Input must have rank at least 2.") is_square = tf.assert_equal( x_shape[-2], x_shape[-1], message="Input must be a square matrix.") diag_part_x = tf.matrix_diag_part(x) is_lower_triangular = tf.assert_equal( tf.matrix_band_part(x, 0, -1), # Preserves triu, zeros rest. tf.matrix_diag(diag_part_x), message="Input must be lower triangular.") is_positive_diag = tf.assert_positive( diag_part_x, message="Input must have all positive diagonal entries.") return [is_matrix, is_square, is_lower_triangular, is_positive_diag]
def testNonDefaultsYieldCorrectShapesAndValues(self): batch_shape = [4, 3] x_size = 3 mvn_size = 5 x_ = np.random.randn(*np.concatenate([batch_shape, [x_size]])) x = tf.constant(x_) mvn = tfp.trainable_distributions.multivariate_normal_tril( x, dims=mvn_size, loc_fn=tf.zeros_like, scale_fn=lambda x: tfd.fill_triangular(tf.ones_like(x))) scale = mvn.scale.to_dense() expected_scale = tf.matrix_band_part( tf.ones(np.concatenate([batch_shape, [mvn_size, mvn_size]]), scale.dtype), num_lower=-1, num_upper=0) self.evaluate(tf.global_variables_initializer()) [ batch_shape_, event_shape_, loc_, scale_, expected_scale_, ] = self.evaluate([ mvn.batch_shape_tensor(), mvn.event_shape_tensor(), mvn.loc, scale, expected_scale, ]) self.assertAllEqual(batch_shape, mvn.batch_shape) self.assertAllEqual(batch_shape, batch_shape_) self.assertAllEqual([mvn_size], mvn.event_shape) self.assertAllEqual([mvn_size], event_shape_) self.assertAllEqual(np.zeros_like(loc_), loc_) self.assertAllEqual(expected_scale_, scale_)
def get_multivariate_gaussian_energy_fn(x_dim=2): """Get energy function for 2d strongly correlated Gaussian.""" mu = tf.random_normal(shape=[x_dim]) # Lower triangularize and positive diagonal l = tf.sigmoid( tf.matrix_band_part(tf.random_normal(shape=[x_dim, x_dim]), -1, 0)) # Exploit Cholesky decomposition sigma = tf.matmul(l, tf.transpose(l)) sigma *= 100. # Small covariance causes extreme numerical instability sigma_inv = tf.matrix_inverse(sigma) def energy(x): """Unnormalized log density/energy of 2d strongly correlated Gaussian.""" xmmu = x - mu return .5 * tf.diag_part( tf.matmul(tf.matmul(xmmu, sigma_inv), tf.transpose(xmmu))) return energy
def base_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, q_sqrt=None, white=False): # compute kernel stuff num_func = tf.shape(f)[1] # K Lm = tf.cholesky(Kmm) # Compute the projection matrix A A = tf.matrix_triangular_solve(Lm, Kmn, lower=True) # compute the covariance due to the conditioning if full_cov: fvar = Knn - tf.matmul(A, A, transpose_a=True) shape = tf.stack([num_func, 1, 1]) else: fvar = Knn - tf.reduce_sum(tf.square(A), 0) shape = tf.stack([num_func, 1]) fvar = tf.tile(tf.expand_dims(fvar, 0), shape) # K x N x N or K x N # another backsubstitution in the unwhitened case if not white: A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False) # construct the conditional mean fmean = tf.matmul(A, f, transpose_a=True) if q_sqrt is not None: if q_sqrt.get_shape().ndims == 2: LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2) # K x M x N elif q_sqrt.get_shape().ndims == 3: L = tf.matrix_band_part(q_sqrt, -1, 0) # K x M x M A_tiled = tf.tile(tf.expand_dims(A, 0), tf.stack([num_func, 1, 1])) LTA = tf.matmul(L, A_tiled, transpose_a=True) # K x M x N else: # pragma: no cover raise ValueError("Bad dimension for q_sqrt: %s" % str(q_sqrt.get_shape().ndims)) if full_cov: fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True) # K x N x N else: fvar = fvar + tf.reduce_sum(tf.square(LTA), 1) # K x N fvar = tf.transpose(fvar) # N x K or N x N x K return fmean, fvar
def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder): # Upper triangle will be nonzero, but ignored. # Use a diagonal that ensures this matrix is well conditioned. tril = linear_operator_test_util.random_tril_matrix( shape, dtype=dtype, force_well_conditioned=True, remove_upper=False) if use_placeholder: tril_ph = tf.placeholder(dtype=dtype) # Evaluate the tril here because (i) you cannot feed a tensor, and (ii) # tril is random and we want the same value used for both mat and # feed_dict. tril = tril.eval() operator = linalg.LinearOperatorTriL(tril_ph) feed_dict = {tril_ph: tril} else: operator = linalg.LinearOperatorTriL(tril) feed_dict = None mat = tf.matrix_band_part(tril, -1, 0) return operator, mat, feed_dict
def gauss_kl_white(q_mu, q_sqrt): """ Compute the KL divergence from q(x) = N(q_mu, q_sqrt^2) to p(x) = N(0, I) We assume multiple independent distributions, given by the columns of q_mu and the last dimension of q_sqrt. q_mu is a matrix, each column contains a mean q_sqrt is a 3D tensor, each matrix within is a lower triangular square-root matrix of the covariance. """ KL = 0.5 * tf.reduce_sum(tf.square(q_mu)) # Mahalanobis term KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type) # constant term L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0) # force lower triangle KL -= 0.5 * tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(L)))) # logdet KL += 0.5 * tf.reduce_sum(tf.square(L)) # Trace term. return KL
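For the whitened case handled by gauss_kl_white, the prior is N(0, I), so the reference formula given after gauss_kl specializes to (again not part of the original source):

\mathrm{KL}\left[\,\mathcal{N}(\mu,\,L L^\top)\,\big\|\,\mathcal{N}(0,\,I)\,\right] = \tfrac{1}{2}\left( \|\mu\|^2 + \|L\|_F^2 - d - \log\det\!\left(L L^\top\right) \right)

which matches the Mahalanobis, trace, constant, and log-determinant terms in the code.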
def mask_future(energies: tf.Tensor, mask_value=-1e9) -> tf.Tensor: """Mask energies of keys using a lower triangular matrix. The mask simulates autoregressive decoding, preventing the attention from looking at what has not yet been decoded. The mask is not necessary during training when true output values are used instead of the decoded ones. Arguments: energies: A tensor to mask. mask_value: Value used to mask energies. Returns: Masked energies tensor. """ triangular_mask = tf.matrix_band_part(tf.ones_like(energies), -1, 0) mask_area = tf.equal(triangular_mask, 1) # Note that for compatibility with tensor2tensor, we use -1e9 for negative # infinity. masked_value = tf.fill(tf.shape(energies), mask_value) return tf.where(mask_area, energies, masked_value)
def _assertions(self, x): if not self.validate_args: return [] shape = tf.shape(x) is_matrix = tf.assert_rank_at_least( x, 2, message="Input must have rank at least 2.") is_square = tf.assert_equal( shape[-2], shape[-1], message="Input must be a square matrix.") above_diagonal = tf.matrix_band_part( tf.matrix_set_diag(x, tf.zeros(shape[:-1], dtype=tf.float32)), 0, -1) is_lower_triangular = tf.assert_equal( above_diagonal, tf.zeros_like(above_diagonal), message="Input must be lower triangular.") # A lower triangular matrix is nonsingular iff all its diagonal entries are # nonzero. diag_part = tf.matrix_diag_part(x) is_nonsingular = tf.assert_none_equal( diag_part, tf.zeros_like(diag_part), message="Input must have all diagonal entries nonzero.") return [is_matrix, is_square, is_lower_triangular, is_nonsingular]
def _build_likelihood(self): """ This method computes the variational lower bound on the likelihood, which is: E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)] with q(\\mathbf f) = N(\\mathbf f \\,|\\, \\boldsymbol \\mu, \\boldsymbol \\Sigma) """ # Get prior KL. KL = gauss_kl(self.q_mu, self.q_sqrt) # Get conditionals K = self.kern.K(self.X) + tf.eye(self.num_data, dtype=settings.float_type) * \ settings.numerics.jitter_level L = tf.cholesky(K) fmean = tf.matmul(L, self.q_mu) + self.mean_function(self.X) # NN,ND->ND q_sqrt_dnn = tf.matrix_band_part(self.q_sqrt, -1, 0) # D x N x N L_tiled = tf.tile(tf.expand_dims(L, 0), tf.stack([self.num_latent, 1, 1])) LTA = tf.matmul(L_tiled, q_sqrt_dnn) # D x N x N fvar = tf.reduce_sum(tf.square(LTA), 2) fvar = tf.transpose(fvar) # Get variational expectations. var_exp = self.likelihood.variational_expectations(fmean, fvar, self.Y) return tf.reduce_sum(var_exp) - KL
def testDefaultsYieldCorrectShapesAndValues(self): batch_shape = [4, 3] x_size = 3 mvn_size = 5 x_ = np.random.randn(*np.concatenate([batch_shape, [x_size]])) x = tf.constant(x_) mvn = tfp.trainable_distributions.multivariate_normal_tril(x, dims=mvn_size) scale = mvn.scale.to_dense() scale_upper = tf.matrix_set_diag( tf.matrix_band_part(scale, num_lower=0, num_upper=-1), tf.zeros(np.concatenate([batch_shape, [mvn_size]]), scale.dtype)) scale_diag = tf.matrix_diag_part(scale) self.evaluate(tf.global_variables_initializer()) [ batch_shape_, event_shape_, scale_diag_, scale_upper_, ] = self.evaluate([ mvn.batch_shape_tensor(), mvn.event_shape_tensor(), scale_diag, scale_upper, ]) self.assertAllEqual(batch_shape, mvn.batch_shape) self.assertAllEqual(batch_shape, batch_shape_) self.assertAllEqual([mvn_size], mvn.event_shape) self.assertAllEqual([mvn_size], event_shape_) self.assertAllEqual(np.ones_like(scale_diag_, dtype=np.bool), scale_diag_ > 0.) self.assertAllEqual(np.zeros_like(scale_upper_), scale_upper_)
def __init__(self, placeholders, input_dim, attack=None, **kwargs): super(GCN, self).__init__(**kwargs) print('attack method:',attack) # if attack is False, placeholders['support'] feeds in normalized pre-processed adjacent matrix, # if attack is True, placeholders['adj'] feeds in raw adjacent matrix and placeholdder['s'] feeds in attack placeholders self.inputs = placeholders['features'] self.input_dim = input_dim # self.input_dim = self.inputs.get_shape().as_list()[1] # To be supported in future Tensorflow versions self.output_dim = placeholders['labels'].get_shape().as_list()[1] self.placeholders = placeholders lmd = placeholders['lmd'] self.attack = attack if self.attack: mu = placeholders['mu'] # the length of A list, in fact, self.num_support is always 1 self.num_supports = len(placeholders['adj']) # original adjacent matrix A self.A = placeholders['adj'] self.mask = [tf.constant(np.triu(np.ones([self.A[0].get_shape()[0].value]*2, dtype = np.float32),1))] self.C = [1 - 2 * self.A[i] - tf.eye(self.A[i].get_shape().as_list()[0], self.A[i].get_shape().as_list()[1]) for i in range(self.num_supports)] # placeholder for adding edges self.upper_S_0 = placeholders['s'] # a strict upper triangular matrix to ensure only N(N-1)/2 trainable variables # here use matrix_band_part to ensure a stricly upper triangular matrix self.upper_S_real = [tf.matrix_band_part(self.upper_S_0[i],0,-1)-tf.matrix_band_part(self.upper_S_0[i],0,0) for i in range(self.num_supports)] # modified_A is the new adjacent matrix self.upper_S_real2 = [self.upper_S_real[i] + tf.transpose(self.upper_S_real[i]) for i in range(self.num_supports)] self.modified_A = [self.A[i] + tf.multiply(self.upper_S_real2[i], self.C[i]) for i in range(self.num_supports)] """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation.""" self.hat_A = [tf.cast(self.modified_A[i] + tf.eye(self.modified_A[i].get_shape().as_list()[0], self.modified_A[i].get_shape().as_list()[1]),dtype='float32') for i in range(self.num_supports)] # get degree by row sum self.rowsum = tf.reduce_sum(self.hat_A[0],axis=1) self.d_sqrt = tf.sqrt(self.rowsum) # square root self.d_sqrt_inv = tf.math.reciprocal(self.d_sqrt) # reciprocal self.support_real = tf.multiply(tf.transpose(tf.multiply(self.hat_A[0],self.d_sqrt_inv)),self.d_sqrt_inv) # this self.support is a list of \tilde{A} in the paper # replace the 'support' in the placeholders dictionary self.placeholders['support'] = [self.support_real] self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) self.build() # proximal gradient algorithm if self.attack == 'PGD': self.Sgrad = tf.gradients(self.attack_loss, self.upper_S_real[0]) self.a = self.upper_S_real[0] + mu * self.Sgrad * lmd * self.mask elif self.attack == 'CW': label = placeholders['labels'] real = tf.reduce_sum(label * self.outputs,1) label_mask_expand = placeholders['label_mask_expand'] other = tf.reduce_max((1 - label) * label_mask_expand * self.outputs - label * 10000,1) self.loss1 = tf.maximum(0.0, (real-other+50)*label_mask_expand[:,0]) self.loss2 = tf.reduce_sum(self.loss1) self.Sgrad = tf.gradients(self.loss2, self.upper_S_real[0]) self.a = self.upper_S_real[0] - mu * self.Sgrad * lmd * self.mask elif self.attack == 'minmax': self.w = placeholders['w'] label = placeholders['labels'] self.real = tf.reduce_sum(label * self.outputs,1) label_mask_expand = placeholders['label_mask_expand'] self.other = tf.reduce_max((1 - label) * label_mask_expand * self.outputs - label * 10000,1) self.loss1 = self.w * 
tf.maximum(0.0, self.real-self.other+0.) self.loss2 = tf.reduce_sum(self.loss1) self.Sgrad = tf.gradients(self.loss2, self.upper_S_real[0]) self.a = self.upper_S_real[0] - mu * self.Sgrad * self.mask else: raise NotImplementedError else: self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) self.build()
def build_model(self): # add place holder self.contexts = tf.placeholder(shape=[None, None], dtype=tf.int32, name="context") self.context_legnths = tf.placeholder(shape=[None], dtype=tf.int32, name="c_length") self.questions = tf.placeholder(shape=[None, None], dtype=tf.int32, name="q") self.question_legnths = tf.placeholder(shape=[None], dtype=tf.int32, name="q_len") # [batch, num_sentences, num_words] self.sentences = tf.placeholder(shape=[None, None, None], dtype=tf.int32, name="sentences") # [num_sentences, num_words] self.sequence_lengths = tf.placeholder(shape=[None, None], dtype=tf.int32, name="seq_len") # [num_sentences] self.sentence_lengths = tf.placeholder(shape=[None], dtype=tf.int32, name="sent_len") self.sentence_idx = tf.placeholder(shape=[None], dtype=tf.int32, name="sent_idx") self.answerable = tf.placeholder(shape=[None], dtype=tf.int32, name="answ") self.answer_span = tf.placeholder(shape=[None, 2], dtype=tf.int32, name="answer_span") self.dropout = tf.placeholder(dtype=tf.float32, name="dropout") self.avg_loss = tf.placeholder(dtype=tf.float32, name="avg_loss") self.avg_em = tf.placeholder(dtype=tf.float32, name="avg_em") self.avg_acc = tf.placeholder(dtype=tf.float32, name="avg_acc") loss_summary = tf.summary.scalar("loss", self.avg_em) acc_summary = tf.summary.scalar("accuracy", self.avg_acc) em_summary = tf.summary.scalar("em", self.avg_em) self.merged = tf.summary.merge([loss_summary, acc_summary, em_summary]) self.document_size, self.sentence_size, self.word_size = tf.unstack( tf.shape(self.sentences)) # add embeddings zeros = tf.constant([[0.0] * self.config.embedding_size]) unk_dummy = tf.get_variable(shape=[2, self.config.embedding_size], initializer=layers.xavier_initializer(), name="special_token") # load pre-trained GloVe embedding_matrix = tf.Variable(initial_value=self.config.embeddings, trainable=False, dtype=tf.float32, name="embedding") self.embedding_matrix = tf.concat([zeros, unk_dummy, embedding_matrix], axis=0) self.embedded_sentences = tf.nn.embedding_lookup( self.embedding_matrix, self.sentences) self.embedded_sentences = tf.layers.dropout(self.embedded_sentences, self.dropout) self.embedded_context = tf.nn.embedding_lookup(self.embedding_matrix, self.contexts) self.embedded_context = tf.layers.dropout(self.embedded_context, self.dropout) self.embedded_questions = tf.nn.embedding_lookup( self.embedding_matrix, self.questions) self.embedded_questions = tf.layers.dropout(self.embedded_questions, self.dropout) # conv block and self attention block with tf.variable_scope("Embedding_Encoder_Layer"): contexts = self.residual_block(self.embedded_context, self.context_legnths, num_blocks=1, num_conv_blocks=4, kernel_size=7, num_filters=128, scope="Embedding_Encoder", reuse=False) questions = self.residual_block(self.embedded_questions, self.question_legnths, num_blocks=1, num_conv_blocks=4, kernel_size=7, num_filters=128, scope="Embedding_Encoder", reuse=True) reshaped_sentences = tf.reshape( self.embedded_sentences, [-1, self.word_size, self.config.embedding_size]) sentence_len = tf.reshape(self.sequence_lengths, [-1]) encoded_sentence = self.residual_block(reshaped_sentences, sentence_len, num_blocks=1, num_conv_blocks=1, kernel_size=7, num_filters=128, scope="Embedding_Encoder", reuse=True) with tf.variable_scope("hierarchical_attention") and tf.device( "/device:GPU:0"): # [b * s, w, d] cnn_inputs = tf.layers.dense( encoded_sentence, self.config.filter_size, kernel_regularizer=self.regularizer, kernel_initializer=layers.xavier_initializer(), 
activation=tf.nn.relu) sentence_cnn = self.conv_encoder(cnn_inputs, self.config.filter_size, scope="word_encoder", reuse=False) encoded_question = self.question_encoding(questions, self.question_legnths) # [b, s, d] sentence_vectors = self.word_level_attention( encoded_question, sentence_cnn, self.document_size, self.sentence_size, self.word_size, self.sequence_lengths) sentence_cnn = self.conv_encoder(sentence_vectors, self.config.filter_size, scope="sentence_encoder", reuse=False) document_vector, sentence_score = self.sentence_level_attention( encoded_question, sentence_cnn, self.sentence_size, self.sentence_lengths) self.attention_loss, self.binary_loss = self.auxiliary_loss( sentence_score, document_vector, encoded_question) with tf.variable_scope("Context_Query_Attention_Layer") and tf.device( "/device:GPU:0"): A, B = self.co_attention(questions, contexts, self.question_legnths, self.context_legnths) attention_outputs = [contexts, A, contexts * A, contexts * B] with tf.variable_scope("Model_Encoder_Layer"): inputs = tf.concat(attention_outputs, axis=2) inputs = tf.layers.dense( inputs, self.config.attention_size, kernel_regularizer=self.regularizer, kernel_initializer=layers.variance_scaling_initializer(), activation=tf.nn.relu) memories = [] for i in range(3): outputs = self.residual_block(inputs, self.context_legnths, num_blocks=7, num_conv_blocks=2, num_filters=128, kernel_size=5, scope="Model_Encoder", reuse=True if i > 0 else False) if i == 2: outputs = tf.layers.dropout(outputs, self.dropout) memories.append(outputs) inputs = outputs with tf.variable_scope("Output_Layer") and tf.device("/device:GPU:0"): logits_inputs = tf.concat([memories[0], memories[1]], axis=2) start_logits = self.pointer_network(document_vector, logits_inputs, self.context_legnths, scope="start_logits") logits_inputs = tf.concat([memories[0], memories[2]], axis=2) end_logits = self.pointer_network(document_vector, logits_inputs, self.context_legnths, scope="end_logits") start_label, end_label = tf.split(self.answer_span, 2, axis=1) start_label = tf.squeeze(start_label, axis=-1) end_label = tf.squeeze(end_label, axis=-1) losses1 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=start_logits, labels=start_label) losses2 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=end_logits, labels=end_label) cross_entropy_loss = tf.reduce_mean(losses1 + losses2) self.loss = cross_entropy_loss \ + self.config.alpha * self.attention_loss \ + self.config.beta * self.binary_loss # for inference logits1 = tf.nn.softmax(start_logits) logits2 = tf.nn.softmax(end_logits) outer_product = tf.matmul(tf.expand_dims(logits1, axis=2), tf.expand_dims(logits2, axis=1)) outer = tf.matrix_band_part(outer_product, 0, self.config.ans_limit) self.start = tf.argmax(tf.reduce_max(outer, axis=2), axis=1, output_type=tf.int32) self.end = tf.argmax(tf.reduce_max(outer, axis=1), axis=1, output_type=tf.int32) self.em = self.evaluate_em(self.start, self.end, self.answer_span, self.unans_prob) if self.config.l2_lambda > 0: vars = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) l2_loss = layers.apply_regularization(self.regularizer, vars) self.loss += l2_loss # Exponential moving average self.var_ema = tf.train.ExponentialMovingAverage(0.9999) ema_op = self.var_ema.apply(tf.trainable_variables()) with tf.control_dependencies([ema_op]): self.loss = tf.identity(self.loss) self.assign_vars = [] for var in tf.global_variables(): v = self.var_ema.average(var) if v: self.assign_vars.append(tf.assign(var, v)) self.add_train_op() 
self.init_session()
def call(self, inputs, mask=None, **kwargs): if isinstance(inputs, list): inputs, positions = inputs positions = K.cast(positions, 'int32') mask = mask[1] else: positions = None input_len = K.shape(inputs)[1] if self.attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD: e = self._call_additive_emission(inputs) elif self.attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL: e = self._call_multiplicative_emission(inputs) if self.attention_activation is not None: e = self.attention_activation(e) e = K.exp(e - K.max(e, axis=-1, keepdims=True)) if self.attention_width is not None: ones = tf.ones((input_len, input_len)) if self.history_only: local = tf.matrix_band_part( ones, K.minimum(input_len, self.attention_width - 1), 0, ) else: local = tf.matrix_band_part( ones, K.minimum(input_len, self.attention_width // 2), K.minimum(input_len, (self.attention_width - 1) // 2), ) e = e * K.expand_dims(local, 0) if mask is not None: mask = K.cast(mask, K.floatx()) mask = K.expand_dims(mask) e = K.permute_dimensions( K.permute_dimensions(e * mask, (0, 2, 1)) * mask, (0, 2, 1)) # a_{t} = \text{softmax}(e_t) s = K.sum(e, axis=-1) s = K.tile(K.expand_dims(s, axis=-1), K.stack([1, 1, input_len])) a = e / (s + K.epsilon()) # l_t = \sum_{t'} a_{t, t'} x_{t'} v = K.batch_dot(a, inputs) if self.attention_regularizer_weight > 0.0: self.add_loss(self._attention_regularizer(a)) if positions is not None: pos_num = K.shape(positions)[1] batch_indices = K.tile( K.expand_dims(K.arange(K.shape(inputs)[0]), axis=-1), K.stack([1, pos_num])) pos_indices = K.stack([batch_indices, positions], axis=-1) v = tf.gather_nd(v, pos_indices) a = tf.gather_nd(a, pos_indices) if self.return_attention: return [v, a] return v
def concordance_index4(y_true, y_pred): y_true_1 = tf.expand_dims(y_true, 0) y_true_2 = tf.expand_dims(y_true, 1) y_pred_1 = tf.expand_dims(y_pred, 0) y_pred_2 = tf.expand_dims(y_pred, 1) y_true_diff = tf.sign(tf.subtract(y_true_1, y_true_2)) y_true_diff = tf.matrix_band_part(y_true_diff, 0, -1) y_pred_diff = tf.sign(tf.subtract(y_pred_1, y_pred_2)) y_pred_diff = tf.matrix_band_part(y_pred_diff, 0, -1) ones = tf.ones_like(y_pred_diff) mask_a = tf.matrix_band_part(ones, 0, -1) mask_b = tf.matrix_band_part(ones, 0, 0) mask = tf.cast(mask_a - mask_b, dtype=tf.bool) sess = tf.Session() #sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) mask = sess.run(mask) y_pred_diff = sess.run(y_pred_diff) y_true_diff = sess.run(y_true_diff) CPU_COUNT = int(0.5*os.cpu_count()) with Pool(processes=CPU_COUNT) as pool: #pdb.set_trace() #procs_pred = [] #for i in len(mask): time_start = time.time() procs_pred = [pool.apply_async(taking, [maski, y_pred_diff_i]) for (maski, y_pred_diff_i) in zip(mask, y_pred_diff)] results_pred = [proc.get() for proc in procs_pred] time_start1 = time.time() y_pred_diff_flat = np.array(list(itertools.chain.from_iterable(results_pred))) time_end1 = time.time() #y_pred_diff_flat = np.array(results_pred) # for res in results_pred: # y_pred_diff_flat = np.append(y_pred_diff_flat, res) procs_true = [pool.apply_async(taking, [maski, y_true_diff_i]) for (maski, y_true_diff_i) in zip(mask, y_true_diff)] results_true = [proc.get() for proc in procs_true] time_start2 = time.time() y_true_diff_flat = np.array(list(itertools.chain.from_iterable(results_true))) time_end2 = time.time() print("time used in flatting arrays: ", time_end2+time_end1-time_start2-time_start1) print("time used in CPU: ", time_end2-time_start) #y_true_diff_flat = np.array(results_true) # y_pred_diff_flat = np.array([]) # for res in results_pred: # y_pred_diff_flat = np.append(y_pred_diff_flat, res) #pdb.set_trace() # y_pred_diff_flat = y_pred_diff[mask] # pdb.set_trace() # y_true_diff_flat = tf.boolean_mask(y_true_diff, mask) # y_pred_diff_flat = tf.concat(results_pred, 0) # result = sess.run(y_pred_diff_flat) valid_pairs = tf.not_equal(y_true_diff_flat, 0.0) valid_pairs = tf.cast(valid_pairs, dtype=tf.float64) raw_comparison = tf.divide(tf.add(tf.multiply(y_true_diff_flat, y_pred_diff_flat), 1), 2) scores = tf.multiply(raw_comparison, valid_pairs) quotient = tf.reduce_sum(scores)/tf.reduce_sum(valid_pairs) quotient = sess.run(quotient) return quotient
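The boolean mask in concordance_index4 uses the fact that subtracting the diagonal band from the upper band leaves the strictly upper triangle, so each unordered pair is counted once. A minimal TF 1.x sketch of that pattern (an arbitrary 3 x 3 size, not from the original source):

import tensorflow as tf

ones = tf.ones([3, 3])
mask_a = tf.matrix_band_part(ones, 0, -1)  # upper triangle, including the diagonal
mask_b = tf.matrix_band_part(ones, 0, 0)   # diagonal only
strict_upper = mask_a - mask_b             # strictly upper triangle

with tf.Session() as sess:
    print(sess.run(strict_upper))
    # [[0. 1. 1.]
    #  [0. 0. 1.]
    #  [0. 0. 0.]]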
def model_fn(features, labels, mode, params, word_embeddings_np=None, char_embeddings_np=None): attention_fun = partial(BahdanauAttention, num_units=params.units) if params.attention == 'bahdanau' \ else partial(LuongAttention, num_units=2 * params.units) dropout = params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0 passage_count = params.passage_count if mode != tf.estimator.ModeKeys.TRAIN \ else params.train_passage_count question_words_length = features['question_length'] passage_words_length = features['passage_length'] devices = get_devices() with tf.device('/cpu:0'): word_embeddings_placeholder = tf.placeholder( shape=[params.vocab_size, params.emb_size], dtype=tf.float32) char_embeddings_placeholder = tf.placeholder( shape=[params.char_vocab_size, params.char_emb_size], dtype=tf.float32) # word_embeddings = tf.create_partitioned_variables(shape=[params.vocab_size, params.emb_size], # slicing=[10, 1], # initializer=word_embeddings_placeholder, # trainable=False, name="word_embeddings") word_embeddings = tf.Variable(word_embeddings_placeholder, trainable=False, name="word_embeddings") char_embeddings = tf.Variable(char_embeddings_placeholder, trainable=False, name="char_embeddings") word_embeddings = tf.nn.dropout(word_embeddings, 1.0 - dropout, noise_shape=[params.vocab_size, 1]) char_embeddings = tf.nn.dropout( char_embeddings, 1.0 - dropout, noise_shape=[params.char_vocab_size, 1]) question_words_emb = tf.nn.embedding_lookup(word_embeddings, features['question_words']) question_chars_emb = tf.nn.embedding_lookup(char_embeddings, features['question_chars']) passage_words_emb = tf.nn.embedding_lookup(word_embeddings, features['passage_words']) passage_chars_emb = tf.nn.embedding_lookup(char_embeddings, features['passage_chars']) with tf.device(next(devices)): with tf.variable_scope('question_encoding'): question_enc = encoder(question_words_emb, question_words_length, question_chars_emb, features['question_char_length'], params, dropout=dropout) with tf.device(next(devices)): with tf.variable_scope('passage_encoding'): passage_enc = encoder(passage_words_emb, passage_words_length, passage_chars_emb, features['passage_char_length'], params, dropout=dropout) # question_enc = tf.Print(question_enc, [question_enc], summarize=1000) with tf.variable_scope('attention'): attention = attention_fun( memory=question_enc, memory_sequence_length=question_words_length) cell_fw = GatedAttentionWrapper( attention, DropoutWrapper( GRUCell(params.units, name="attention_gru"), # output_keep_prob=1.0 - dropout, input_keep_prob=1.0 - dropout, # state_keep_prob=1.0 - dropout, variational_recurrent=True, input_size=4 * params.units, dtype=tf.float32), dropout=0) cell_bw = GatedAttentionWrapper( attention, DropoutWrapper( GRUCell(params.units, name="attention_gru"), # output_keep_prob=1.0 - dropout, input_keep_prob=1.0 - dropout, # state_keep_prob=1.0 - dropout variational_recurrent=True, input_size=4 * params.units, dtype=tf.float32), dropout=0) passage_repr, _ = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, passage_enc, passage_words_length, dtype=tf.float32) passage_repr = tf.concat(passage_repr, -1) with tf.variable_scope('pointer'): question_att = attention_fun( memory=question_enc, memory_sequence_length=question_words_length, name="question_align") pool_param = tf.get_variable('pool_param', shape=(question_att._num_units, ), initializer=tf.initializers.ones) pool_param = tf.reshape( tf.tile(pool_param, [tf.shape(question_enc)[0]]), (-1, question_att._num_units)) question_alignments, _ 
= question_att(pool_param, None) question_pool = tf.reduce_sum( tf.expand_dims(question_alignments, -1) * question_enc, 1) logits1, logits2 = pointer_net(passage_repr, passage_words_length, question_pool, params, attention_fun=attention_fun, dropout=dropout) outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) p1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) p2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) if mode == tf.estimator.ModeKeys.PREDICT: predictions = {'start': p1, 'end': p2} export_outputs = { 'prediction': tf.estimator.export.PredictOutput(predictions) } return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs) with tf.variable_scope('passage_ranking'): W_g = Dense(params.units, activation=tf.tanh, use_bias=False) v_g = Dense(1, use_bias=False) memory_layer = Dense(params.units, name="memory_layer", use_bias=False, dtype=tf.float32) query_layer = Dense(params.units, name="query_layer", use_bias=False, dtype=tf.float32) g = [] for i in range(passage_count): passage_mask = tf.boolean_mask( passage_repr, tf.equal(features['partitions'], i)) passage_i = tf.split(passage_mask, features['partitions_len'][:, i]) passage_i = [ pad_to_shape_2d( p, (tf.Dimension(params.passage_max_len), p.shape[1])) for p in passage_i ] passage_i = tf.stack(passage_i) passage_alignment, _ = ReusableBahdanauAttention( params.units, passage_i, features['partitions_len'][:, i], memory_layer=memory_layer, query_layer=query_layer, name="passage_align")(question_pool, None) passage_pool = tf.reduce_sum( tf.expand_dims(passage_alignment, -1) * passage_i, 1) g_i = v_g(W_g(tf.concat([question_pool, passage_pool], -1))) # g_i = tf.Print(g_i, [passage_mask, passage_i], message='is_nan_{}'.format(i), summarize=1000) g.append(g_i) g = tf.concat(g, -1) answer_start, answer_end, passage_rank = labels loss1 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits1, labels=tf.stop_gradient(answer_start)) loss2 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=logits2, labels=tf.stop_gradient(answer_end)) loss3 = tf.nn.softmax_cross_entropy_with_logits_v2( logits=g, labels=tf.stop_gradient(passage_rank)) # loss1 = tf.Print(loss1, [tf.argmax(answer_start, -1), tf.argmax(answer_end, -1), # tf.reduce_mean(loss1), tf.reduce_mean(loss2), tf.reduce_mean(loss3)], message="loss") loss = (params.r * tf.reduce_mean(loss1 + loss2) + (1 - params.r) * tf.reduce_mean(loss3)) \ if params.r < 1 else tf.reduce_mean(loss1 + loss2) if mode == tf.estimator.ModeKeys.TRAIN: optimizer = tf.train.AdadeltaOptimizer( learning_rate=params.learning_rate, epsilon=1e-6) global_step = tf.train.get_or_create_global_step() grads = optimizer.compute_gradients(loss) gradients, variables = zip(*grads) capped_grads, _ = tf.clip_by_global_norm(gradients, params.grad_clip) train_op = optimizer.apply_gradients(zip(capped_grads, variables), global_step=global_step) return EstimatorSpec( mode, loss=loss, train_op=train_op, scaffold=tf.train.Scaffold( init_feed_dict={ word_embeddings_placeholder: word_embeddings_np, char_embeddings_placeholder: char_embeddings_np }), ) if mode == tf.estimator.ModeKeys.EVAL: table = lookup_ops.index_to_string_table_from_file( params.word_vocab_file, value_column_index=0, delimiter=" ") return EstimatorSpec(mode, loss=loss, eval_metric_ops={ 'rouge-l': extraction_metric(p1, p2, tf.argmax(answer_start, -1), tf.argmax(answer_end, -1), features['passage_words'], params, table), 'f1': 
extraction_metric(p1, p2, tf.argmax(answer_start, -1), tf.argmax(answer_end, -1), features['passage_words'], params, table, metric='f1') })
def getLowerDiag(inputs): inputs_matrix = tf.reshape(tf.tile(inputs, [tf.shape(inputs)[0]]), [-1, tf.shape(inputs)[0]]) result = tf.matrix_band_part(inputs_matrix, -1, 0) return result
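A small worked example of getLowerDiag (the input values are hypothetical; a TF 1.x session is assumed): tiling [1, 2, 3] into three identical rows and then keeping the lower band gives a row-wise prefix pattern.

import tensorflow as tf

inputs = tf.constant([1., 2., 3.])
with tf.Session() as sess:
    print(sess.run(getLowerDiag(inputs)))
    # [[1. 0. 0.]
    #  [1. 2. 0.]
    #  [1. 2. 3.]]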
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) with tf.variable_scope("match"): self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2)
def _random_tril_matrix(self, shape): mat = self.rng.rand(*shape) chol = tfd.matrix_diag_transform(mat, transform=tf.nn.softplus) return tf.matrix_band_part(chol, -1, 0)
def uncertain_conditional(Xnew_mu, Xnew_var, feat, kern, q_mu, q_sqrt, *, mean_function=None, full_output_cov=False, full_cov=False, white=False): """ Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var). See ``conditional`` documentation for further reference. :param Xnew_mu: mean of the inputs, size N x Din :param Xnew_var: covariance matrix of the inputs, size N x Din x Din :param feat: gpflow.InducingFeature object, only InducingPoints is supported :param kern: gpflow kernel or ekernel object. :param q_mu: mean inducing points, size M x Dout :param q_sqrt: cholesky of the covariance matrix of the inducing points, size Dout x M x M :param full_output_cov: boolean wheter to compute covariance between output dimension. Influences the shape of return value ``fvar``. Default is False :param white: boolean whether to use whitened representation. Default is False. :return fmean, fvar: mean and covariance of the conditional, size ``fmean`` is N x Dout, size ``fvar`` depends on ``full_output_cov``: if True ``f_var`` is N x Dout x Dout, if False then ``f_var`` is N x Dout """ # TODO(VD): Tensorflow 1.7 doesn't support broadcasting in``tf.matmul`` and # ``tf.matrix_triangular_solve``. This is reported in issue 216. # As a temporary workaround, we are using ``tf.einsum`` for the matrix # multiplications and tiling in the triangular solves. # The code that should be used once the bug is resolved is added in comments. if not isinstance(feat, InducingPoints): raise NotImplementedError if full_cov: # TODO(VD): ``full_cov`` True would return a ``fvar`` of shape N x N x D x D, # encoding the covariance between input datapoints as well. # This is not implemented as this feature is only used for plotting purposes. raise NotImplementedError pXnew = Gaussian(Xnew_mu, Xnew_var) num_data = tf.shape(Xnew_mu)[0] # number of new inputs (N) num_ind = tf.shape(q_mu)[0] # number of inducing points (M) num_func = tf.shape(q_mu)[1] # output dimension (D) q_sqrt_r = tf.matrix_band_part(q_sqrt, -1, 0) # D x M x M eKuf = tf.transpose(expectation(pXnew, (kern, feat))) # M x N (psi1) Kuu = feat.Kuu(kern, jitter=settings.numerics.jitter_level) # M x M Luu = tf.cholesky(Kuu) # M x M if not white: q_mu = tf.matrix_triangular_solve(Luu, q_mu, lower=True) Luu_tiled = tf.tile(Luu[None, :, :], [num_func, 1, 1]) # remove line once issue 216 is fixed q_sqrt_r = tf.matrix_triangular_solve(Luu_tiled, q_sqrt_r, lower=True) Li_eKuf = tf.matrix_triangular_solve(Luu, eKuf, lower=True) # M x N fmean = tf.matmul(Li_eKuf, q_mu, transpose_a=True) eKff = expectation(pXnew, kern) # N (psi0) eKuffu = expectation(pXnew, (kern, feat), (kern, feat)) # N x M x M (psi2) Luu_tiled = tf.tile(Luu[None, :, :], [num_data, 1, 1]) # remove this line, once issue 216 is fixed Li_eKuffu = tf.matrix_triangular_solve(Luu_tiled, eKuffu, lower=True) Li_eKuffu_Lit = tf.matrix_triangular_solve(Luu_tiled, tf.matrix_transpose(Li_eKuffu), lower=True) # N x M x M cov = tf.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True) # D x M x M if mean_function is None or isinstance(mean_function, mean_functions.Zero): e_related_to_mean = tf.zeros((num_data, num_func, num_func), dtype=settings.float_type) else: # Update mean: \mu(x) + m(x) fmean = fmean + expectation(pXnew, mean_function) # Calculate: m(x) m(x)^T + m(x) \mu(x)^T + \mu(x) m(x)^T, # where m(x) is the mean_function and \mu(x) is fmean e_mean_mean = expectation(pXnew, mean_function, mean_function) # N x D x D Lit_q_mu = tf.matrix_triangular_solve(Luu, q_mu, adjoint=True) e_mean_Kuf = 
expectation(pXnew, mean_function, (kern, feat)) # N x D x M # einsum isn't able to infer the rank of e_mean_Kuf, hence we explicitly set the rank of the tensor: e_mean_Kuf = tf.reshape(e_mean_Kuf, [num_data, num_func, num_ind]) e_fmean_mean = tf.einsum("nqm,mz->nqz", e_mean_Kuf, Lit_q_mu) # N x D x D e_related_to_mean = e_fmean_mean + tf.matrix_transpose(e_fmean_mean) + e_mean_mean if full_output_cov: fvar = ( tf.matrix_diag(tf.tile((eKff - tf.trace(Li_eKuffu_Lit))[:, None], [1, num_func])) + tf.matrix_diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) + # tf.matrix_diag(tf.trace(tf.matmul(Li_eKuffu_Lit, cov))) + tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) - # tf.matmul(q_mu, tf.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) - fmean[:, :, None] * fmean[:, None, :] + e_related_to_mean ) else: fvar = ( (eKff - tf.trace(Li_eKuffu_Lit))[:, None] + tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov) + tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu) - fmean ** 2 + tf.matrix_diag_part(e_related_to_mean) ) return fmean, fvar
def call(self, inputs, q_mask=False, v_mask=False, a_mask=False): """Multi-head attention. q_mask: mask for the input query sequence; mainly used to zero out the padding part of the output. v_mask: mask for the input value sequence; mainly used to keep attention from reading padding information. a_mask: mask for the attention matrix; different attention masks correspond to different applications. """ q, k, v = inputs[:3] # Handle masks idx = 3 if q_mask: q_mask = inputs[idx] idx += 1 else: q_mask = None if v_mask: v_mask = inputs[idx] idx += 1 else: v_mask = None if a_mask: if len(inputs) > idx: a_mask = inputs[idx] else: a_mask = 'history_only' else: a_mask = None # Linear projections qw = self.q_dense(q) kw = self.k_dense(k) vw = self.v_dense(v) # Reshape to separate heads qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size)) kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size)) vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size)) # Transpose dimensions qw = K.permute_dimensions(qw, (0, 2, 1, 3)) kw = K.permute_dimensions(kw, (0, 2, 1, 3)) vw = K.permute_dimensions(vw, (0, 2, 1, 3)) # Collapse to rank-3 tensors qw = K.reshape(qw, (-1, K.shape(q)[1], self.key_size)) kw = K.reshape(kw, (-1, K.shape(k)[1], self.key_size)) vw = K.reshape(vw, (-1, K.shape(v)[1], self.head_size)) # Attention a = K.batch_dot(qw, kw, [2, 2]) / np.sqrt(self.key_size) a = add_seq_mask(a, v_mask, 1, -1, self.heads) if a_mask is not None: if a_mask == 'history_only': ones = K.ones_like(a[:1]) a_mask = (ones - tf.matrix_band_part(ones, -1, 0)) * 1e12 a = a - a_mask else: a = a - (1 - a_mask) * 1e12 a = K.softmax(a) # Assemble the output o = K.batch_dot(a, vw, [2, 1]) o = K.reshape(o, (-1, self.heads, K.shape(q)[1], self.head_size)) o = K.permute_dimensions(o, (0, 2, 1, 3)) o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim)) o = self.o_dense(o) o = add_seq_mask(o, q_mask, 0) return o
def __init__(self, lr, batch_size, dimension, util_train, util_test, campaign, reg_lambda, sigma): # hyperparameters self.lr = lr self.batch_size = batch_size self.util_train = util_train self.util_test = util_test self.reg_lambda = reg_lambda self.sigma = sigma self.emb_size = 20 self.train_data_amt = util_train.get_data_amt() self.test_data_amt = util_test.get_data_amt() # output dir model_name = "{}_{}_{}_{}".format(self.lr, self.reg_lambda, self.batch_size, self.sigma) self.output_dir = "output/deephit/{}/{}/".format(campaign, model_name) if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # reset graph tf.reset_default_graph() # field params self.field_sizes = self.util_train.feat_sizes self.field_num = len(self.field_sizes) # placeholders self.X = [ tf.sparse_placeholder(tf.float64) for i in range(0, self.field_num) ] self.z = tf.placeholder(tf.float64) self.b = tf.placeholder(tf.float64) self.y = tf.placeholder(tf.float64) # embedding layer self.var_map = {} # for truncated self.var_map['embed_0'] = tf.Variable( tf.truncated_normal([self.field_sizes[0], 1], dtype=tf.float64)) for i in range(1, self.field_num): self.var_map['embed_%d' % i] = tf.Variable( tf.truncated_normal([self.field_sizes[i], self.emb_size], dtype=tf.float64)) # after embedding w0 = [self.var_map['embed_%d' % i] for i in range(self.field_num)] self.dense_input = tf.concat([ tf.sparse_tensor_dense_matmul(self.X[i], w0[i]) for i in range(self.field_num) ], 1) # shared network self.hidden1 = tf.Variable(initial_value=tf.truncated_normal( shape=[(self.field_num - 1) * self.emb_size + 1, HIDDEN_SIZE1], dtype=tf.float64), name='h1') self.out1 = tf.Variable(initial_value=tf.truncated_normal( shape=[HIDDEN_SIZE1, OUT_SIZE1], dtype=tf.float64), name='o1') self.hidden2 = tf.Variable(initial_value=tf.truncated_normal( shape=[OUT_SIZE1, HIDDEN_SIZE2], dtype=tf.float64), name='h2') self.out2 = tf.Variable(initial_value=tf.truncated_normal( shape=[HIDDEN_SIZE2, OUT_SIZE2], dtype=tf.float64), name='o2') # cause-specific network self.hidden1_val = tf.nn.relu(tf.matmul(self.dense_input, self.hidden1)) self.out1_val = tf.sigmoid(tf.matmul(self.hidden1_val, self.out1)) self.hidden2_val = tf.nn.relu(tf.matmul(self.out1_val, self.hidden2)) self.out2_val = tf.sigmoid(tf.matmul(self.hidden2_val, self.out2)) # p_z and w_b self.p = tf.nn.softmax(self.out2_val) self.w = tf.cumsum(self.p, exclusive=True, axis=1) idx_z = tf.stack([ tf.reshape(tf.range(tf.shape(self.z)[0]), (-1, 1)), tf.cast(self.z - 1, tf.int32) ], axis=-1) idx_b = tf.stack([ tf.reshape(tf.range(tf.shape(self.b)[0]), (-1, 1)), tf.cast(self.b - 1, tf.int32) ], axis=-1) self.pz = tf.gather_nd(self.p, idx_z) self.wb = tf.gather_nd(self.w, idx_b) self.wz = tf.gather_nd(self.w, idx_z) # loss and train step self.loss1 = -tf.reduce_sum( tf.log(tf.clip_by_value(self.pz, 1e-8, 1.0)) * self.y) self.loss2 = -tf.reduce_sum( tf.log(tf.clip_by_value(1 - self.wb, 1e-8, 1.0)) * (1 - self.y)) self.reg_loss = tf.nn.l2_loss(self.hidden1[1:,]) + tf.nn.l2_loss(self.hidden2[1:,]) + \ tf.nn.l2_loss(self.out1[1:,]) + tf.nn.l2_loss(self.out2[1:,]) # get ranking loss self.w_of_pair = tf.transpose( tf.nn.embedding_lookup(tf.transpose(self.w), tf.cast(self.z[:, 0] - 1, tf.int32))) self.w_of_self = tf.reshape( tf.tile(tf.reshape(self.wz, (self.batch_size, )), [self.batch_size]), (self.batch_size, self.batch_size)) self.win_label = tf.reshape( tf.tile(tf.reshape(self.y, (self.batch_size, )), [self.batch_size]), (self.batch_size, self.batch_size)) self.delta = self.w_of_self - 
self.w_of_pair self.candidate = tf.exp(-self.delta / self.sigma) self.rank_loss = tf.reduce_sum( tf.matrix_band_part(self.candidate, -1, 0) * self.win_label) self.loss = self.loss1 + self.loss2 + self.reg_lambda * self.reg_loss + self.rank_loss self.optimizer = tf.train.GradientDescentOptimizer(self.lr) self.train_step = self.optimizer.minimize(self.loss) # session initialization config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) tf.global_variables_initializer().run(session=self.sess)
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru gi = [] att_vP = [] for i in range(config.max_para): print(i) with tf.variable_scope("emb" + str(i)): with tf.variable_scope("char" + str(i)): #CL = tf.Print(CL,[CL],message="CL:") #PL = tf.Print(PL,[PL],message="PL:") #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr.get_shape()],message="ch_pr:") self.ch_pr_ = self.ch_pr[:, i * config.para_limit:(i + 1) * config.para_limit, :] print(self.ch_pr_.get_shape()) #self.c_pr = tf.reshape(self.c_pr, [N, 12, PL]) #print(self.ch.get_shape()) #print(self.ch_pr.get_shape()) #print(self.c.get_shape()) #print(self.c_pr.get_shape()) #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr[:,2:,:]],message="ch_pr") ch_emb = tf.reshape(tf.nn.embedding_lookup(\ self.char_mat, self.ch_pr_), [N * PL, CL, dc]) # self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape( tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb") #qh_emb = tf.Print(qh_emb,[qh_emb],message="qh_emb") qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) #state_fw = tf.Print(state_fw,[state_fw],message="state_fw") #state_bw = tf.Print(state_bw,[state_bw],message="state_bw") qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb") with tf.name_scope("word" + str(i)): c_emb = tf.nn.embedding_lookup( self.word_mat, self.c_pr[:, i * config.para_limit:(i + 1) * config.para_limit]) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding" + str(i)): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention" + str(i)): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) # att is the v_P if i == 0: att_vP = tf.identity(att) else: att_vP = tf.concat([att_vP, att], axis=1) #att = tf.Print(att,[att],message="att:") print("att:", att.get_shape().as_list()) print("att_vP:", att_vP.get_shape().as_list()) #att_vP = tf.Print(att_vP,[tf.shape(att_vP)],message="att_vP:") """ with tf.variable_scope("match"): self_att = dot_attention( att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, 
seq_len=self.c_len) """ with tf.variable_scope("pointer"): # r_Q: init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) print("rQ:", init.get_shape().as_list()) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, att, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) #losses1_2 = tf.reduce_mean(losses1_2, axis=0) self.loss = tf.reduce_mean(losses + losses2) # print losses #condition = tf.greater(self.loss, 11) #self.yp1 = tf.where(condition, tf.Print(self.yp1,[self.yp1],message="Yp1:"), self.yp1) #self.yp2 = tf.where(condition, tf.Print(self.yp2,[self.yp2],message="Yp2:"), self.yp1) if config.with_passage_ranking: gi = None for i in range(config.max_para): # Passage ranking with tf.variable_scope("passage-ranking-attention" + str(i)): #att_vP = tf.Print(att_vP,[att_vP.get_shape()],message="att_vP:") vj_P = att_vP[:, i * config.para_limit:(i + 1) * config.para_limit, :] pr_att = pr_attention( batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) r_P = pr_att(init, vj_P, d, self.c_mask) #r_P = tf.Print(r_P,[r_P],message="r_p") # Wg concatenate = tf.concat([init, r_P], axis=1) g = tf.nn.tanh( dense(concatenate, hidden=d, use_bias=False, scope="g" + str(i))) g_ = dense(g, 1, use_bias=False, scope="g_" + str(i)) #g = tf.Print(g,[g],message="g") if i == 0: gi = tf.reshape(g_, [N, 1]) else: gi = tf.concat([gi, tf.reshape(g_, [N, 1])], axis=1) #gi_ = tf.convert_to_tensor(gi,dtype=tf.float32) #self.gi = tf.nn.softmax(gi_) #self.losses3 = tf.nn.softmax_cross_entropy_with_logits( # logits=gi_, labels=tf.reshape(self.pr,[-1,1])) self.losses3 = tf.nn.softmax_cross_entropy_with_logits( logits=gi, labels=self.pr) #self.losses3 = tf.Print(self.losses3,[self.losses3,tf.reduce_max(self.losses3), # tf.reduce_max(self.pr),tf.reduce_max(gi)],message="losses3:") self.pr_loss = tf.reduce_mean(self.losses3) #self.pr_loss = tf.Print(self.pr_loss,[self.pr_loss]) self.r = tf.constant(0.8) self.e_loss1 = tf.multiply(self.r, self.loss) self.e_loss2 = tf.multiply(tf.subtract(tf.constant(1.0), self.r), self.pr_loss) self.e_loss = tf.add(self.e_loss1, self.e_loss2)
def attention_bias_center(attn_bias, w_size, value=10.):
    bias_mask = tf.cast(tf.equal(attn_bias[0, 0], 0), attn_bias.dtype)
    centered_bias = tf.matrix_band_part(bias_mask, w_size - 1, w_size - 1) * value
    centered_bias = tf.expand_dims(tf.expand_dims(centered_bias, 0), 0)
    return centered_bias
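A small usage sketch (not from the original source, names local to this example): with an all-zero attention bias over five positions and w_size=2, attention_bias_center adds `value` inside a width-2 band around the diagonal.

import tensorflow as tf

attn_bias = tf.zeros([1, 1, 5, 5])
centered = attention_bias_center(attn_bias, w_size=2, value=10.)
with tf.Session() as sess:
    out = sess.run(centered)[0, 0]
# Each row of `out` is 10. on the diagonal and its immediate neighbours, 0 elsewhere.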
def __init__(self, ind, y, U, m, B, lr, num_c): self.U = U self.m = m self.y = y.reshape([y.size, 1]) self.ind = ind self.B = B self.learning_rate = lr self.nmod = len(self.U) self.r = self.U[0].shape[1] self.tf_U = [ tf.Variable(self.U[k], dtype=tf.float32) for k in range(self.nmod) ] self.d = 0 self.num_channels = num_c for k in range(self.nmod): self.d = self.d + self.U[k].shape[1] #init mu, L, Z Zinit = self.init_pseudo_inputs() self.tf_W = tf.reshape(tf.Variable(Zinit, dtype=tf.float32), [self.m, self.r, self.nmod, 1]) self.N = y.size #covariance of the noise self.U_covar_diag_tf = [ tf.Variable(np.ones(self.U[k].shape[0] * self.U[k].shape[1]), dtype=tf.float32) for k in range(self.nmod) ] U_covar_sqrt_mat_tf = [ tf.linalg.diag(self.U_covar_diag_tf[k]) for k in range(self.nmod) ] #variational posterior self.tf_mu = tf.Variable(np.zeros([m, 1]), dtype=tf.float32) self.tf_L = tf.Variable(np.eye(m), dtype=tf.float32) #shallow kernel parameters self.tf_log_lengthscale = tf.Variable(0.0, dtype=tf.float32) self.tf_log_tau = tf.Variable(0.0, dtype=tf.float32) #Stochastic Variational ELBO #A mini-batch of observed entry indices self.tf_sub = tf.placeholder(tf.int32, shape=[None, self.nmod]) self.tf_y = tf.placeholder(tf.float32, shape=[None, 1]) #convolution variables and parameters self.conv0_f_shape = [2, 2, 1, self.num_channels] self.tf_conv0_w = tf.Variable( tf.truncated_normal(self.conv0_f_shape, stddev=0.03)) self.tf_bias0 = tf.Variable(tf.truncated_normal([self.num_channels])) self.conv1_f_shape = [self.r, 1, self.num_channels, self.num_channels] self.tf_conv1_w = tf.Variable( tf.truncated_normal(self.conv1_f_shape, stddev=0.03)) self.tf_bias1 = tf.Variable(tf.truncated_normal([self.num_channels])) self.conv2_f_shape = [ 1, self.nmod, self.num_channels, self.num_channels ] self.tf_conv2_w = tf.Variable( tf.truncated_normal(self.conv2_f_shape, stddev=0.03)) self.tf_bias2 = tf.Variable(tf.truncated_normal([self.num_channels])) #compute convolutions for pseudo inputs self.tf_Z = self.compute_convs(self.tf_W) #compute convolutions and generate noise for batch tf_noise = [ tf.matmul( 0.1 * U_covar_sqrt_mat_tf[k], tf.random_normal( shape=[self.U[k].shape[0] * self.U[k].shape[1], 1])) for k in range(self.nmod) ] tf_noise = [ tf.reshape(tf_noise[k], self.U[k].shape) for k in range(self.nmod) ] tf_inputs = tf.concat([ tf.gather((self.tf_U[k] + tf_noise[k]), self.tf_sub[:, k]) for k in range(self.nmod) ], 1) tf_inputs = tf.reshape(tf_inputs, [-1, self.r, self.nmod, 1]) tf_inputs = self.compute_convs(tf_inputs) Ltril = tf.matrix_band_part(self.tf_L, -1, 0) Kmm = self.kernel_matrix(self.tf_Z) Kmn = self.kernel_cross(self.tf_Z, tf_inputs) Knm = tf.transpose(Kmn) KnmKmmInv = tf.transpose(tf.matrix_solve(Kmm, Kmn)) KnmKmmInvL = tf.matmul(KnmKmmInv, Ltril) tau = tf.exp(self.tf_log_tau) lengthscale = tf.exp(self.tf_log_lengthscale) hh_expt = tf.matmul(Ltril, tf.transpose(Ltril)) + tf.matmul( self.tf_mu, tf.transpose(self.tf_mu)) ELBO = -0.5*tf.linalg.logdet(Kmm) - 0.5*tf.trace(tf.matrix_solve(Kmm, hh_expt)) + 0.5*tf.reduce_sum(tf.log(tf.pow(tf.diag_part(Ltril), 2))) \ + 0.5*self.N*self.tf_log_tau - 0.5*tau*self.N/self.B*tf.reduce_sum(tf.pow(self.tf_y - tf.matmul(KnmKmmInv,self.tf_mu), 2)) \ - 0.5*tau*( self.N*(1+jitter) - self.N/self.B*tf.reduce_sum(KnmKmmInv*Knm) + self.N/self.B*tf.reduce_sum(tf.pow(KnmKmmInvL,2)) ) \ + 0.5*self.m - 0.5*self.N*tf.log(2.0*tf.constant(np.pi, dtype=tf.float32)) #\ #- 0.5*tf.reduce_sum(tf.pow(self.tf_U[0],2)) - 0.5*tf.reduce_sum(tf.pow(self.tf_U[1],2)) - 
0.5*tf.reduce_sum(tf.pow(self.tf_U[2],2)) #- 0.5*tf.pow(tau,2) - 0.5*tf.pow(lengthscale, 2) #Add entropy of variational posterior to ELBO # This uses the property that the log det(A) = 2*sum(log(real(diag(C)))) # where C is the cholesky decomposition of A. This allows us to avoid computing the cholesky decomposition for k in range(self.nmod): ELBO += 0.5*2.0 * math_ops.reduce_sum( math_ops.log(math_ops.real(array_ops.matrix_diag_part(0.1*U_covar_sqrt_mat_tf[k]))), axis=[-1])\ + (self.U[k].shape[0]*self.U[k].shape[1])/2*(1+tf.log(2.0*tf.constant(np.pi, dtype=tf.float32))) self.loss = -ELBO self.optimizer = tf.train.AdamOptimizer(self.learning_rate) self.minimizer = self.optimizer.minimize(self.loss) config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) self.sess.run(tf.global_variables_initializer())
def causal_mask(length, neg_inf=-1e9, name=None):
    with tf.name_scope(name, default_name="causal_mask"):
        lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
        ret = neg_inf * (1.0 - lower_triangle)
        return tf.reshape(ret, [1, 1, length, length])
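A quick, hedged sanity check of the mask pattern produced by causal_mask, assuming a TF1-style session:

import tensorflow as tf

with tf.Session() as sess:
    mask = sess.run(causal_mask(3))[0, 0]
print(mask)
# Row i is 0 for columns <= i and -1e9 for columns > i, so position i can only
# attend to itself and to earlier positions.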
def uncertain_conditional(Xnew_mu, Xnew_var, feat, kern, q_mu, q_sqrt,
                          full_cov_output=False, full_cov=False, whiten=False):
    """
    Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var).
    See ``conditional`` documentation for further reference.

    :param Xnew_mu: mean of the inputs, size N x Din
    :param Xnew_var: covariance matrix of the inputs, size N x Din x Din
    :param feat: gpflow.InducingFeature object, only InducingPoints is supported
    :param kern: gpflow kernel or ekernel object.
    :param q_mu: mean inducing points, size M x Dout
    :param q_sqrt: cholesky of the covariance matrix of the inducing points, size M x M x Dout
    :param full_cov_output: boolean whether to compute covariance between output dimensions.
        Influences the shape of return value ``fvar``. Default is False
    :param whiten: boolean whether to whiten the representation. Default is False.
    :return fmean, fvar: mean and covariance of the conditional, size ``fmean`` is N x Dout,
        size ``fvar`` depends on ``full_cov_output``: if True ``f_var`` is N x Dout x Dout,
        if False then ``f_var`` is N x Dout
    """
    # TODO: Tensorflow 1.3 doesn't support broadcasting in ``tf.matmul`` and
    # ``tf.matrix_triangular_solve``. This is reported in issue 216.
    # As a temporary workaround, we are using ``tf.einsum`` for the matrix
    # multiplications and tiling in the triangular solves.
    # The code that should be used once the bug is resolved is added in comments.

    if not isinstance(feat, InducingPoints):
        raise NotImplementedError

    if full_cov:
        # TODO: ``full_cov`` True would return a ``fvar`` of shape N x N x D x D,
        # encoding the covariance between input datapoints as well.
        # This is not implemented as this feature is only used for plotting purposes.
        raise NotImplementedError

    num_data = tf.shape(Xnew_mu)[0]  # number of new inputs (N)
    num_func = tf.shape(q_mu)[1]  # output dimension (D)

    q_sqrt_r = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # D x M x M

    eKuf = tf.transpose(feat.eKfu(kern, Xnew_mu, Xnew_var))  # M x N
    Kuu = feat.Kuu(kern, jitter=settings.numerics.jitter_level)  # M x M
    Luu = tf.cholesky(Kuu)  # M x M

    if not whiten:
        q_mu = tf.matrix_triangular_solve(Luu, q_mu, lower=True)
        Luu_tiled = tf.tile(Luu[None, :, :], [num_func, 1, 1])  # remove line once issue 216 is fixed
        q_sqrt_r = tf.matrix_triangular_solve(Luu_tiled, q_sqrt_r, lower=True)

    Li_eKuf = tf.matrix_triangular_solve(Luu, eKuf, lower=True)  # M x N
    fmean = tf.matmul(Li_eKuf, q_mu, transpose_a=True)

    eKff = kern.eKdiag(Xnew_mu, Xnew_var)  # N
    eKuffu = feat.eKufKfu(kern, Xnew_mu, Xnew_var)  # N x M x M
    Luu_tiled = tf.tile(Luu[None, :, :], [num_data, 1, 1])  # remove this line, once issue 216 is fixed
    Li_eKuffu_Lit = tf.matrix_triangular_solve(Luu_tiled, tf.matrix_transpose(eKuffu), lower=True)
    Li_eKuffu_Lit = tf.matrix_triangular_solve(
        Luu_tiled, tf.matrix_transpose(Li_eKuffu_Lit), lower=True)  # N x M x M
    cov = tf.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True)  # D x M x M

    if full_cov_output:
        fvar = (
            tf.matrix_diag(tf.tile((eKff - tf.trace(Li_eKuffu_Lit))[:, None], [1, num_func])) +
            tf.matrix_diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) +
            # tf.matrix_diag(tf.trace(tf.matmul(Li_eKuffu_Lit, cov))) +
            tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) -
            # tf.matmul(q_mu, tf.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            tf.matmul(fmean[:, :, None], fmean[:, :, None], transpose_b=True))
    else:
        fvar = (
            (eKff - tf.trace(Li_eKuffu_Lit))[:, None] +
            tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov) +
            tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu) -
            fmean ** 2)

    return fmean, fvar
def get_mask(batch_size, sequence_length):
    lower_triangle = tf.matrix_band_part(tf.ones([sequence_length, sequence_length]), -1, 0)
    result = -1e9 * (1.0 - lower_triangle)
    print("get_mask==>result:", result)
    return result
def multihead_attention(self, query, key, value, h=4, mask=False): W_query = tf.Variable( initial_value=tf.random_normal((self.hidden, self.hidden), stddev=1e-2), trainable=True, dtype=tf.float32, ) W_key = tf.Variable( initial_value=tf.random_normal((self.hidden, self.hidden), stddev=1e-2), trainable=True, dtype=tf.float32, ) W_value = tf.Variable( initial_value=tf.random_normal((self.hidden, self.hidden), stddev=1e-2), trainable=True, dtype=tf.float32, ) W_output = tf.Variable( initial_value=tf.random_normal((self.hidden, self.hidden), stddev=1e-2), trainable=True, dtype=tf.float32, ) multi_query = tf.concat(tf.unstack(tf.reshape( tf.matmul(tf.reshape(query, [-1, self.hidden]), W_query), [-1, 1, tf.shape(query)[1], h, int(self.hidden / h)]), axis=3), axis=1) multi_key = tf.concat(tf.unstack(tf.reshape( tf.matmul(tf.reshape(key, [-1, self.hidden]), W_key), [-1, 1, tf.shape(key)[1], h, int(self.hidden / h)]), axis=3), axis=1) multi_value = tf.concat(tf.unstack(tf.reshape( tf.matmul(tf.reshape(value, [-1, self.hidden]), W_value), [-1, 1, tf.shape(value)[1], h, int(self.hidden / h)]), axis=3), axis=1) dotp = tf.matmul(multi_query, multi_key, transpose_b=True) / (tf.cast( tf.shape(multi_query)[-1], tf.float32)**0.5) attention_weights = tf.nn.softmax(dotp) if mask: attention_weights = tf.matrix_band_part(attention_weights, -1, 0) attention_weights /= tf.reduce_sum(attention_weights, axis=3, keep_dims=True) weighted_sum = tf.matmul(attention_weights, multi_value) weighted_sum = tf.concat(tf.unstack(weighted_sum, axis=1), axis=-1) multihead = tf.reshape( tf.matmul(tf.reshape(weighted_sum, [-1, self.hidden]), W_output), [-1, tf.shape(query)[1], self.hidden]) output = multihead + query output = tf.contrib.layers.layer_norm(output, begin_norm_axis=2) return output, attention_weights
lambda_phi = tf.Variable(lambda_phi_var, trainable=False, dtype=tf.float64)
lambda_pi_var = tf.Variable(lambda_pi_var, dtype=tf.float64)
lambda_beta_var = tf.Variable(lambda_beta_var, dtype=tf.float64)
lambda_nu_var = tf.Variable(lambda_nu_var, dtype=tf.float64)
lambda_m = tf.Variable(lambda_m_var, dtype=tf.float64)
lambda_w_var = tf.Variable(lambda_w_var, dtype=tf.float64)

# Maintain numerical stability
lambda_pi = tf.nn.softplus(lambda_pi_var)
lambda_beta = tf.nn.softplus(lambda_beta_var)
lambda_nu = tf.add(tf.nn.softplus(lambda_nu_var), tf.cast(D, dtype=tf.float64))

# Positive semidefinite matrices defined via a Cholesky decomposition
mats = []
for k in range(K):
    aux1 = tf.matrix_set_diag(tf.matrix_band_part(lambda_w_var[k], -1, 0),
                              tf.nn.softplus(tf.diag_part(lambda_w_var[k])))
    mats.append(tf.matmul(aux1, aux1, transpose_b=True))
lambda_w = tf.convert_to_tensor(mats)

idx_tensor = tf.placeholder(tf.int32, shape=(BATCH_SIZE))
alpha_o = tf.convert_to_tensor(alpha_o, dtype=tf.float64)
nu_o = tf.convert_to_tensor(nu_o, dtype=tf.float64)
w_o = tf.convert_to_tensor(w_o, dtype=tf.float64)
m_o = tf.convert_to_tensor(m_o, dtype=tf.float64)
beta_o = tf.convert_to_tensor(beta_o, dtype=tf.float64)

# Evidence Lower Bound definition
e3 = tf.convert_to_tensor(0., dtype=tf.float64)
e2 = tf.convert_to_tensor(0., dtype=tf.float64)
def make_tril_scale( loc=None, scale_tril=None, scale_diag=None, scale_identity_multiplier=None, shape_hint=None, validate_args=False, assert_positive=False, name=None): """Creates a LinearOperator representing a lower triangular matrix. Args: loc: Floating-point `Tensor`. This is used for inferring shape in the case where only `scale_identity_multiplier` is set. scale_tril: Floating-point `Tensor` representing the diagonal matrix. `scale_diag` has shape [N1, N2, ... k, k], which represents a k x k lower triangular matrix. When `None` no `scale_tril` term is added to the LinearOperator. The upper triangular elements above the diagonal are ignored. scale_diag: Floating-point `Tensor` representing the diagonal matrix. `scale_diag` has shape [N1, N2, ... k], which represents a k x k diagonal matrix. When `None` no diagonal term is added to the LinearOperator. scale_identity_multiplier: floating point rank 0 `Tensor` representing a scaling done to the identity matrix. When `scale_identity_multiplier = scale_diag = scale_tril = None` then `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added to `scale`. shape_hint: scalar integer `Tensor` representing a hint at the dimension of the identity matrix when only `scale_identity_multiplier` is set. validate_args: Python `bool` indicating whether arguments should be checked for correctness. assert_positive: Python `bool` indicating whether LinearOperator should be checked for being positive definite. name: Python `str` name given to ops managed by this object. Returns: `LinearOperator` representing a lower triangular matrix. Raises: ValueError: If only `scale_identity_multiplier` is set and `loc` and `shape_hint` are both None. """ def _maybe_attach_assertion(x): if not validate_args: return x if assert_positive: return control_flow_ops.with_dependencies([ tf.assert_positive( tf.matrix_diag_part(x), message="diagonal part must be positive"), ], x) return control_flow_ops.with_dependencies([ tf.assert_none_equal( tf.matrix_diag_part(x), tf.zeros([], x.dtype), message="diagonal part must be non-zero"), ], x) with tf.name_scope( name, "make_tril_scale", values=[loc, scale_diag, scale_identity_multiplier]): loc = _convert_to_tensor(loc, name="loc") scale_tril = _convert_to_tensor(scale_tril, name="scale_tril") scale_diag = _convert_to_tensor(scale_diag, name="scale_diag") scale_identity_multiplier = _convert_to_tensor( scale_identity_multiplier, name="scale_identity_multiplier") if scale_tril is not None: scale_tril = tf.matrix_band_part(scale_tril, -1, 0) # Zero out TriU. tril_diag = tf.matrix_diag_part(scale_tril) if scale_diag is not None: tril_diag += scale_diag if scale_identity_multiplier is not None: tril_diag += scale_identity_multiplier[..., tf.newaxis] scale_tril = tf.matrix_set_diag(scale_tril, tril_diag) return tf.linalg.LinearOperatorLowerTriangular( tril=_maybe_attach_assertion(scale_tril), is_non_singular=True, is_self_adjoint=False, is_positive_definite=assert_positive) return make_diag_scale( loc=loc, scale_diag=scale_diag, scale_identity_multiplier=scale_identity_multiplier, shape_hint=shape_hint, validate_args=validate_args, assert_positive=assert_positive, name=name)
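A hedged illustration of the scale_tril + scale_diag path of make_tril_scale, built by hand rather than through the function (which depends on module-level helpers); the example values and variable names are made up for this sketch.

import tensorflow as tf

scale_tril = tf.constant([[2., 7.], [1., 3.]])   # the upper-triangle 7. will be ignored
scale_diag = tf.constant([0.5, 0.5])

tril = tf.matrix_band_part(scale_tril, -1, 0)    # zero out TriU
tril = tf.matrix_set_diag(tril, tf.matrix_diag_part(tril) + scale_diag)
op = tf.linalg.LinearOperatorLowerTriangular(tril, is_non_singular=True)

with tf.Session() as sess:
    print(sess.run(op.to_dense()))   # [[2.5 0. ], [1.  3.5]]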
import tensorflow as tf

"""tf.matrix_band_part(input, num_lower, num_upper, name=None)

Purpose: copies a matrix and sets every element outside the specified band to 0.
For an element at coordinates (m, n):
    in_band(m, n) = (num_lower < 0 || (m - n) <= num_lower)
                    && (num_upper < 0 || (n - m) <= num_upper)
    band(m, n) = in_band(m, n) * input(m, n)

Special cases:
    tf.matrix_band_part(input, 0, -1) ==> upper triangular matrix.
    tf.matrix_band_part(input, -1, 0) ==> lower triangular matrix.
    tf.matrix_band_part(input, 0, 0)  ==> diagonal matrix.

Arguments: num_lower: if negative, the whole lower triangle is kept;
           num_upper: if negative, the whole upper triangle is kept."""

a = tf.constant([[0, 1, 2, 3],
                 [-1, 0, 1, 2],
                 [-2, -1, 0, 1],
                 [-3, -2, -1, 0]])

z = tf.matrix_band_part(a, 1, -1)  # zeros everything below the first subdiagonal
# z ==> [[ 0  1  2  3]
#        [-1  0  1  2]
#        [ 0 -1  0  1]
#        [ 0  0 -1  0]]

z1 = tf.matrix_band_part(a, 1, -2)  # any negative value behaves the same; its magnitude is irrelevant
# [[ 0  1  2  3]
#  [-1  0  1  2]
#  [ 0 -1  0  1]
#  [ 0  0 -1  0]]

z2 = tf.matrix_band_part(a, -1, 1)  # zeros everything above the first superdiagonal
# [[ 0  1  0  0]
#  [-1  0  1  0]
#  [-2 -1  0  1]
#  [-3 -2 -1  0]]

z3 = tf.matrix_band_part(a, 0, -1)  # upper triangular part
# [[0 1 2 3]
#  [0 0 1 2]
#  [0 0 0 1]
#  [0 0 0 0]]
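The band rule quoted in the docstring can be replayed in NumPy; this hedged helper (band_part_np and a_np are names local to this sketch) is convenient for checking the examples above by hand.

import numpy as np

def band_part_np(x, num_lower, num_upper):
    # Keep element (m, n) iff it lies inside the requested band.
    m, n = np.indices(x.shape)
    in_band = ((num_lower < 0) | ((m - n) <= num_lower)) & \
              ((num_upper < 0) | ((n - m) <= num_upper))
    return np.where(in_band, x, 0)

a_np = np.array([[0, 1, 2, 3],
                 [-1, 0, 1, 2],
                 [-2, -1, 0, 1],
                 [-3, -2, -1, 0]])
print(band_part_np(a_np, 1, -1))   # keeps one subdiagonal and the full upper triangle
print(band_part_np(a_np, -1, 0))   # lower-triangular part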
def fully_correlated_conditional_repeat(Kmn, Kmm, Knn, f, *, full_cov=False, full_output_cov=False, q_sqrt=None, white=False): """ This function handles conditioning of multi-output GPs in the case where the conditioning points are all fully correlated, in both the prior and posterior. Note: This conditional can handle 'repetitions' R, given in `f` and `q_sqrt`. :param Kmn: LM x N x P :param Kmm: LM x LM :param Knn: N x P or N x P x N x P :param f: data matrix, LM x R :param q_sqrt: R x LM x LM or R x ML :param full_cov: calculate covariance between inputs :param full_output_cov: calculate covariance between outputs :param white: use whitened representation :return: - mean: R x N x P - variance: R x N x P, R x N x P x P, R x P x N x N, R x N x P x N x P """ logger.debug("fully correlated conditional") R = tf.shape(f)[1] M, N, K = [tf.shape(Kmn)[i] for i in range(Kmn.shape.ndims)] Lm = tf.cholesky(Kmm) # Compute the projection matrix A # Lm: M x M Kmn: M x NK Kmn = tf.reshape(Kmn, (M, N * K)) # M x NK A = tf.matrix_triangular_solve(Lm, Kmn, lower=True) # M x NK Ar = tf.reshape(A, (M, N, K)) # compute the covariance due to the conditioning if full_cov and full_output_cov: # fvar = Knn - tf.matmul(Ar, Ar, transpose_a=True) # NK x NK, then reshape? fvar = Knn - tf.tensordot(Ar, Ar, [[0], [0]]) # N x K x N x K elif full_cov and not full_output_cov: At = tf.transpose(Ar) # K x N x M fvar = Knn - tf.matmul(At, At, transpose_b=True) # K x N x N elif not full_cov and full_output_cov: # This transpose is annoying At = tf.transpose(Ar, [1, 0, 2]) # N x M x K # fvar = Knn - tf.einsum('mnk,mnl->nkl', Ar, Ar) fvar = Knn - tf.matmul(At, At, transpose_a=True) # N x K x K elif not full_cov and not full_output_cov: # Knn: N x K fvar = Knn - tf.reshape(tf.reduce_sum(tf.square(A), [0, 1]), (N, K)) # Can also do this with a matmul # another backsubstitution in the unwhitened case if not white: # A = tf.matrix_triangular_solve(tf.matrix_transpose(Lm), A, lower=False) # M x NK raise NotImplementedError("Need to verify this.") # pragma: no cover # f: M x R fmean = tf.matmul(f, A, transpose_a=True) # R x M * M x NK -> R x NK fmean = tf.reshape(fmean, (R, N, K)) # R x N x K if q_sqrt is not None: Lf = tf.matrix_band_part(q_sqrt, -1, 0) # R x M x M if q_sqrt.get_shape().ndims == 3: A_tiled = tf.tile(A[None, :, :], tf.stack([R, 1, 1])) # R x M x NK LTA = tf.matmul(Lf, A_tiled, transpose_a=True) # R x M x NK elif q_sqrt.get_shape().ndims == 2: # pragma: no cover raise NotImplementedError("Does not support diagonal q_sqrt yet...") else: # pragma: no cover raise ValueError("Bad dimension for q_sqrt: %s" % str(q_sqrt.get_shape().ndims)) if full_cov and full_output_cov: addvar = tf.matmul(LTA, LTA, transpose_a=True) # R x NK x NK fvar = fvar[None, :, :, :, :] + tf.reshape(addvar, (R, N, K, N, K)) elif full_cov and not full_output_cov: LTAr = tf.transpose(tf.reshape(LTA, [R, M, N, K]), [0, 3, 1, 2]) # R x K x M x N addvar = tf.matmul(LTAr, LTAr, transpose_a=True) # R x K x N x N fvar = fvar[None, ...] + addvar # R x K x N x N elif not full_cov and full_output_cov: LTAr = tf.transpose(tf.reshape(LTA, (R, M, N, K)), [0, 2, 3, 1]) # R x N x K x M fvar = fvar[None, ...] + tf.matmul(LTAr, LTAr, transpose_b=True) # R x N x K x K elif not full_cov and not full_output_cov: addvar = tf.reshape(tf.reduce_sum(tf.square(LTA), axis=1), (R, N, K)) # R x N x K fvar = fvar[None, ...] + addvar # R x N x K return fmean, fvar
def forward(self): config = self.config N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads with tf.variable_scope("Input_Embedding_Layer"): ch_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout) qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout) # Bidaf style conv-highway encoder ch_emb = conv(ch_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = None) qh_emb = conv(qh_emb, d, bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = True) ch_emb = tf.reduce_max(ch_emb, axis = 1) qh_emb = tf.reduce_max(qh_emb, axis = 1) ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]]) qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]]) c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout) q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None) q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True) with tf.variable_scope("Embedding_Encoder_Layer"): c = residual_block(c_emb, num_blocks = 1, num_conv_layers = 4, kernel_size = 7, mask = self.c_mask, num_filters = d, num_heads = nh, seq_len = self.c_len, scope = "Encoder_Residual_Block", bias = False, dropout = self.dropout) q = residual_block(q_emb, num_blocks = 1, num_conv_layers = 4, kernel_size = 7, mask = self.q_mask, num_filters = d, num_heads = nh, seq_len = self.q_len, scope = "Encoder_Residual_Block", reuse = True, # Share the weights between passage and question bias = False, dropout = self.dropout) with tf.variable_scope("Context_to_Query_Attention_Layer"): # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1]) # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1]) # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout) S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout) mask_q = tf.expand_dims(self.q_mask, 1) S_ = tf.nn.softmax(mask_logits(S, mask = mask_q)) mask_c = tf.expand_dims(self.c_mask, 2) S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1)) self.c2q = tf.matmul(S_, q) self.q2c = tf.matmul(tf.matmul(S_, S_T), c) attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c] with tf.variable_scope("Model_Encoder_Layer"): inputs = tf.concat(attention_outputs, axis = -1) self.enc = [conv(inputs, d, name = "input_projection")] for i in range(3): if i % 2 == 0: # dropout every 2 blocks self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout) self.enc.append( residual_block(self.enc[i], num_blocks = 7, num_conv_layers = 2, kernel_size = 5, mask = self.c_mask, num_filters = d, num_heads = nh, seq_len = self.c_len, scope = "Model_Encoder", bias = False, reuse = True if i > 0 else None, dropout = self.dropout) ) with tf.variable_scope("Output_Layer"): start_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1, bias = False, name = "start_pointer"),-1) end_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[3]],axis = -1),1, bias = False, name = "end_pointer"), -1) self.logits 
= [mask_logits(start_logits, mask = self.c_mask), mask_logits(end_logits, mask = self.c_mask)] logits1, logits2 = [l for l in self.logits] outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, config.ans_limit) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits( logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits( logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2) if config.l2_norm is not None: variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables) self.loss += l2_loss if config.decay is not None: self.var_ema = tf.train.ExponentialMovingAverage(config.decay) ema_op = self.var_ema.apply(tf.trainable_variables()) with tf.control_dependencies([ema_op]): self.loss = tf.identity(self.loss) self.assign_vars = [] for var in tf.global_variables(): v = self.var_ema.average(var) if v: self.assign_vars.append(tf.assign(var,v))
def base_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, q_sqrt=None, white=False):
    """
    Given a g1 and g2, and distribution p and q such that
      p(g2) = N(g2;0,Kmm)
      p(g1) = N(g1;0,Knn)
      p(g1|g2) = N(g1;0,Knm)
    And
      q(g2) = N(g2;f,q_sqrt*q_sqrt^T)
    This method computes the mean and (co)variance of
      q(g1) = \int q(g2) p(g1|g2)
    :param Kmn: M x N
    :param Kmm: M x M
    :param Knn: N x N or N
    :param f: M x R
    :param full_cov: bool
    :param q_sqrt: None or R x M x M (lower triangular)
    :param white: bool
    :return: N x R or R x N x N
    """
    logger.debug("base conditional")
    # compute kernel stuff
    num_func = tf.shape(f)[1]  # R
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = Knn - tf.matmul(A, A, transpose_a=True)
        fvar = tf.tile(fvar[None, :, :], [num_func, 1, 1])  # R x N x N
    else:
        fvar = Knn - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(fvar[None, :], [num_func, 1])  # R x N

    # another backsubstitution in the unwhitened case
    if not white:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # R x M x N
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.matrix_band_part(q_sqrt, -1, 0)  # R x M x M
            A_tiled = tf.tile(tf.expand_dims(A, 0), tf.stack([num_func, 1, 1]))
            LTA = tf.matmul(L, A_tiled, transpose_a=True)  # R x M x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # R x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # R x N

    if not full_cov:
        fvar = tf.transpose(fvar)  # N x R

    return fmean, fvar  # N x R, R x N x N or N x R
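A minimal NumPy sketch, under toy assumptions (1-D inputs, a made-up squared-exponential kernel, white=False, q_sqrt=None), of the quantities base_conditional returns: mean = Kmn^T Kmm^{-1} f and, with full_cov=True, var = Knn - Kmn^T Kmm^{-1} Kmn.

import numpy as np

rng = np.random.RandomState(0)
M, N, R = 4, 3, 2
Z = rng.randn(M, 1)                               # "inducing" inputs
X = rng.randn(N, 1)                               # test inputs
k = lambda a, b: np.exp(-0.5 * (a - b.T) ** 2)    # toy squared-exponential kernel
Kmm = k(Z, Z) + 1e-6 * np.eye(M)
Kmn = k(Z, X)
Knn = k(X, X)
f = rng.randn(M, R)

Kmm_inv = np.linalg.inv(Kmm)
fmean = Kmn.T @ Kmm_inv @ f               # N x R, matches the conditional mean
fvar = Knn - Kmn.T @ Kmm_inv @ Kmn        # N x N, tiled R times when full_cov=True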
Kfu = kernelfx(x, xu)
Kff = kernelfx(x, x)
Kuuinv = tf.matrix_inverse(Kuu + offset * tf.eye(m, dtype=tf.float64))
KfuKuuinv = tf.matmul(Kfu, Kuuinv)
KffKuuinvU = [
    tf.reshape(
        tf.matmul(KfuKuuinv,
                  tf.expand_dims(tf.cast(u[i], dtype=tf.float64), axis=1)),
        [-1]) for i in range(0, p)
]
KffKuuKuf = tf.matmul(KfuKuuinv, Kfu, transpose_b=True)
sigmaf_temp = Kff - KffKuuKuf

# Symmetrize the predictive covariance: upper triangle + its transpose - diagonal
sigmaf_diag = tf.matrix_band_part(sigmaf_temp, 0, 0)
sigmaf_upperT = tf.matrix_band_part(sigmaf_temp, 0, -1)
sigmaf = sigmaf_upperT + tf.transpose(sigmaf_upperT) - sigmaf_diag
f_scale = tf.cholesky(sigmaf + offset * tf.eye(M, dtype=tf.float64),
                      name='f_scale')

# p(F|U,X,Xu)
f = MultivariateNormalTriL(loc=tf.cast(KffKuuinvU, dtype=tf.float32),
                           scale_tril=tf.cast(f_scale, dtype=tf.float32),
                           name='pf')

# p(Y|F)
t_var_pre = tf.Variable(0.5 * np.ones((G, 1)), dtype=tf.float32)
t_var_full = tf.nn.softplus(t_var_pre)
idx_g = tf.placeholder(tf.int32, p)
t_var = tf.gather(t_var_full, idx_g)
# Build model.
x = tf.placeholder(tf.float64, [data_num, 1], name='x')
z = tf.placeholder(tf.float64, [gen_num, z_dim], name='z')
g = generator(z, width=width, depth=depth, activation=activation,
              out_dim=out_dim)
v = tf.concat([x, g], 0)
VVT = tf.matmul(v, tf.transpose(v))
sqs = tf.reshape(tf.diag_part(VVT), [-1, 1])
sqs_tiled_horiz = tf.tile(sqs, tf.transpose(sqs).get_shape())
exp_object = sqs_tiled_horiz - 2 * VVT + tf.transpose(sqs_tiled_horiz)
K = tf.exp(-0.5 * (1 / sigma) * exp_object)
K_xx = K[:data_num, :data_num]
K_yy = K[data_num:, data_num:]
K_xy = K[:data_num, data_num:]
K_xx_upper = (tf.matrix_band_part(K_xx, 0, -1) -
              tf.matrix_band_part(K_xx, 0, 0))
K_yy_upper = (tf.matrix_band_part(K_yy, 0, -1) -
              tf.matrix_band_part(K_yy, 0, 0))
num_combos_xx = data_num * (data_num - 1) / 2
num_combos_yy = gen_num * (gen_num - 1) / 2
mmd = (tf.reduce_sum(K_xx_upper) / num_combos_xx +
       tf.reduce_sum(K_yy_upper) / num_combos_yy -
       2 * tf.reduce_sum(K_xy) / (data_num * gen_num))
g_vars = [var for var in tf.global_variables() if 'generator' in var.name]
if optimizer == 'adagrad':
    opt = tf.train.AdagradOptimizer(learning_rate)
elif optimizer == 'adam':
    opt = tf.train.AdamOptimizer(learning_rate)
elif optimizer == 'rmsprop':
    opt = tf.train.RMSPropOptimizer(learning_rate)
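A brief aside on the band_part arithmetic above: subtracting the diagonal band from the full upper band leaves the strictly upper triangle, so each unordered pair of samples is counted exactly once, which matches the n(n-1)/2 normalizers in the MMD estimator. A NumPy equivalent for a small matrix (names local to this sketch):

import numpy as np

K_demo = np.arange(16.).reshape(4, 4)
strict_upper = np.triu(K_demo, k=1)   # same entries that band_part(K, 0, -1) - band_part(K, 0, 0) keeps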
def _random_pd_matrix(self, *shape):
    mat = rng.rand(*shape)
    chol = tfd.matrix_diag_transform(mat, transform=tf.nn.softplus)
    chol = tf.matrix_band_part(chol, -1, 0)
    return self.evaluate(tf.matmul(chol, chol, adjoint_b=True))
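A short, hedged justification of why this yields a positive definite matrix: softplus makes the diagonal of the factor strictly positive and band_part discards the upper triangle, so `chol` is a valid Cholesky factor and chol @ chol^T is symmetric positive definite. A NumPy check under those assumptions (all names local to this sketch):

import numpy as np

rng_demo = np.random.RandomState(0)
mat = rng_demo.rand(3, 3)
softplus = lambda x: np.log1p(np.exp(x))
chol = np.tril(mat)                               # keep lower triangle only
np.fill_diagonal(chol, softplus(np.diag(mat)))    # strictly positive diagonal
pd = chol @ chol.T
print(np.linalg.eigvalsh(pd) > 0)                 # all True: positive definite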
def conditional(Kmn, Kmm, Knn, f, *, full_cov=False, q_sqrt=None, white=False): """ Given a g1 and g2, and distribution p and q such that p(g2) = N(g2;0,Kmm) p(g1) = N(g1;0,Knn) p(g1|g2) = N(g1;0,Knm) And q(g2) = N(g2;f,q_sqrt*q_sqrt^T) This method computes the mean and (co)variance of q(g1) = \int q(g2) p(g1|g2) :param Kmn: P x M x N :param Kmm: M x M :param Knn: P x N x N or P x N :param f: M x R :param full_cov: bool :param q_sqrt: R x M x M (lower triangular) :param white: bool :return: N x R or R x N x N """ logger.debug("base conditional") # compute kernel stuff num_func = tf.shape(f)[1] # R Lm = tf.cholesky(Kmm) def solve_A(MN_Kmn): return tf.matrix_triangular_solve(Lm, MN_Kmn, lower=True) # M x M @ M x N -> M x N A = tf.map_fn(solve_A, Kmn) # P x M x N # compute the covariance due to the conditioning if full_cov: fvar = Knn - tf.tensordot(A, A, [[1], [1]]) # P x N x N fvar = tf.tile(fvar[None, :, :, :], [num_func, 1, 1, 1]) # R x N x N else: fvar = Knn - tf.reduce_sum(tf.square(A), 1) # P x N fvar = tf.tile(fvar[None, :, :], [num_func, 1, 1]) # R x P x N # another backsubstitution in the unwhitened case if not white: def backsub(MN_A): return tf.matrix_triangular_solve(tf.transpose(Lm), MN_A, lower=False) A = tf.map_fn(backsub, A) # P x M x N # construct the conditional mean fmean = tf.tensordot(A, f, [[1], [0]]) # P x N x R fmean = tf.transpose(fmean, [1, 0, 2]) # N x P x R if q_sqrt is not None: if q_sqrt.get_shape().ndims == 3: L = tf.matrix_band_part(q_sqrt, -1, 0) # R x M x M # A: P x M x N LTA = tf.tensordot(L, A, [[1], [1]]) # R x M x P x N else: # pragma: no cover raise ValueError("Bad dimension for q_sqrt: %s" % str(q_sqrt.get_shape().ndims)) if full_cov: fvar = fvar + tf.tensordot(LTA, LTA, [[1], [1]]) # R x P x N x N else: fvar = fvar + tf.reduce_sum(tf.square(LTA), 1) # R x P x N return fmean, fvar # N x P x R, R x P x N or R x P x N x N
def ready(self): config = self.config N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden gru = cudnn_gru if config.use_cudnn else native_gru with tf.variable_scope("emb"): with tf.variable_scope("char"): ch_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup( self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = dropout( ch_emb, keep_prob=config.keep_prob, is_train=self.is_train) qh_emb = dropout( qh_emb, keep_prob=config.keep_prob, is_train=self.is_train) cell_fw = tf.contrib.rnn.GRUCell(dg) cell_bw = tf.contrib.rnn.GRUCell(dg) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32) ch_emb = tf.concat([state_fw, state_bw], axis=1) _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn( cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32) qh_emb = tf.concat([state_fw, state_bw], axis=1) qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg]) ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg]) with tf.name_scope("word"): c_emb = tf.nn.embedding_lookup(self.word_mat, self.c) q_emb = tf.nn.embedding_lookup(self.word_mat, self.q) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) with tf.variable_scope("encoding"): rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) c = rnn(c_emb, seq_len=self.c_len) q = rnn(q_emb, seq_len=self.q_len) with tf.variable_scope("attention"): qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) att = rnn(qc_att, seq_len=self.c_len) with tf.variable_scope("match"): self_att = dot_attention( att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train) rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape( ).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train) match = rnn(self_att, seq_len=self.c_len) with tf.variable_scope("pointer"): init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train) pointer = ptr_net(batch=N, hidden=init.get_shape().as_list( )[-1], keep_prob=config.ptr_keep_prob, is_train=self.is_train) logits1, logits2 = pointer(init, match, d, self.c_mask) with tf.variable_scope("predict"): outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, 15) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits( logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits( logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2)
def mask_attn_weights(w):
    n = shape_list(w)[-1]
    b = tf.matrix_band_part(tf.ones([n, n]), -1, 0)
    b = tf.reshape(b, [1, 1, n, n])
    w = w * b + -1e9 * (1 - b)
    return w
def independent_interdomain_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, full_output_cov=False, q_sqrt=None, white=False): """ The inducing outputs live in the g-space (R^L). Interdomain conditional calculation. :param Kmn: M x L x N x P :param Kmm: L x M x M :param Knn: N x P or N x N or P x N x N or N x P x N x P :param f: data matrix, M x L :param q_sqrt: L x M x M or M x L :param full_cov: calculate covariance between inputs :param full_output_cov: calculate covariance between outputs :param white: use whitened representation :return: - mean: N x P - variance: N x P, N x P x P, P x N x N, N x P x N x P """ logger.debug("independent_interdomain_conditional") M, L, N, P = [tf.shape(Kmn)[i] for i in range(Kmn.shape.ndims)] Lm = tf.cholesky(Kmm) # L x M x M # Compute the projection matrix A Kmn = tf.reshape(tf.transpose(Kmn, (1, 0, 2, 3)), (L, M, N * P)) A = tf.matrix_triangular_solve(Lm, Kmn, lower=True) # L x M x M * L x M x NP -> L x M x NP Ar = tf.reshape(A, (L, M, N, P)) # compute the covariance due to the conditioning if full_cov and full_output_cov: fvar = Knn - tf.tensordot(Ar, Ar, [[0, 1], [0, 1]]) # N x P x N x P elif full_cov and not full_output_cov: At = tf.reshape(tf.transpose(Ar), (P, N, M * L)) # P x N x ML fvar = Knn - tf.matmul(At, At, transpose_b=True) # P x N x N elif not full_cov and full_output_cov: At = tf.reshape(tf.transpose(Ar, [2, 3, 1, 0]), (N, P, M * L)) # N x P x ML fvar = Knn - tf.matmul(At, At, transpose_b=True) # N x P x P elif not full_cov and not full_output_cov: fvar = Knn - tf.reshape(tf.reduce_sum(tf.square(A), [0, 1]), (N, P)) # Knn: N x P # another backsubstitution in the unwhitened case if not white: A = tf.matrix_triangular_solve(Lm, Ar) # L x M x M * L x M x NP -> L x M x NP Ar = tf.reshape(A, (L, M, N, P)) fmean = tf.tensordot(Ar, f, [[1, 0], [0, 1]]) # N x P if q_sqrt is not None: if q_sqrt.shape.ndims == 3: Lf = tf.matrix_band_part(q_sqrt, -1, 0) # L x M x M LTA = tf.matmul(Lf, A, transpose_a=True) # L x M x M * L x M x NP -> L x M x NP else: # q_sqrt M x L LTA = (A * tf.transpose(q_sqrt)[..., None]) # L x M x NP if full_cov and full_output_cov: LTAr = tf.reshape(LTA, (L * M, N * P)) fvar = fvar + tf.reshape(tf.matmul(LTAr, LTAr, transpose_a=True), (N, P, N, P)) elif full_cov and not full_output_cov: LTAr = tf.transpose(tf.reshape(LTA, (L * M, N, P)), [2, 0, 1]) # P x LM x N fvar = fvar + tf.matmul(LTAr, LTAr, transpose_a=True) # P x N x N elif not full_cov and full_output_cov: LTAr = tf.transpose(tf.reshape(LTA, (L * M, N, P)), [1, 0, 2]) # N x LM x P fvar = fvar + tf.matmul(LTAr, LTAr, transpose_a=True) # N x P x P elif not full_cov and not full_output_cov: fvar = fvar + tf.reshape(tf.reduce_sum(tf.square(LTA), (0, 1)), (N, P)) return fmean, fvar
def forward(self): PRTIN_ATT = 8 config = self.config N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads with tf.variable_scope("Input_Embedding_Layer"): ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc]) qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc]) ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout) qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout) # Bidaf style conv-highway encoder ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=None) qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu, kernel_size=5, name="char_conv", reuse=True) ch_emb = tf.reduce_max(ch_emb, axis=1) qh_emb = tf.reduce_max(qh_emb, axis=1) ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]]) qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]]) c_emb = tf.nn.dropout( tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout) q_emb = tf.nn.dropout( tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout) c_emb = tf.concat([c_emb, ch_emb], axis=2) q_emb = tf.concat([q_emb, qh_emb], axis=2) c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout, reuse=None) q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout, reuse=True) #we utilize the maximum length to control the upper bound of dataset... #we assume all the head using the same bucketing methods, but just learn to update the attention parts... self.soft_t5_alpha, self.soft_t5_beta = get_clip(nh, A=1 / config.fixed_c_maxlen, config=config, name='layer_c') with tf.variable_scope("Embedding_Encoder_Layer"): self.c_t5_bias = compute_bias(nh, config.para_limit, config.para_limit, self.soft_t5_alpha, self.soft_t5_beta, l1_width=config.l1_width, l2_width=config.l2_width, stddev=config.stddev, dropout_prob=self.dropout, activation=config.soft_t5_activation, bidirectional=True, name='layer_c') print('[!!!-c_t5_bias:]', self.c_t5_bias) ''' @we add head mask for c_t5_bias ''' head_mask = np.zeros((nh, config.para_limit, config.para_limit)) #hidx=[7,2,3,0,6,1,4,5] low2high = [5, 4, 1, 6, 0, 3, 2, 7] for tt in range(PRTIN_ATT): head_mask[low2high[tt], :, :] = np.ones( (config.para_limit, config.para_limit)) self.c_t5_bias = self.c_t5_bias * head_mask self.c_layer_weights, c = residual_block( c_emb, num_blocks=1, num_conv_layers=4, kernel_size=7, mask=self.c_mask, num_filters=d, num_heads=nh, seq_len=self.c_len, scope="Encoder_Residual_Block", bias=False, dropout=self.dropout, t5_bias=self.c_t5_bias[:, :, :self.c_maxlen, :self.c_maxlen]) self.q_t5_bias = compute_bias(nh, config.ques_limit, config.ques_limit, self.soft_t5_alpha, self.soft_t5_beta, l1_width=config.l1_width, l2_width=config.l2_width, stddev=config.stddev, dropout_prob=self.dropout, activation=config.soft_t5_activation, bidirectional=True, name='layer_q') print('[!!!-q_t5_bias:]', self.q_t5_bias) head_mask = np.zeros((nh, config.ques_limit, config.ques_limit)) #hidx=[7,0,6,2,4,1,3,5] low2high = [5, 3, 1, 4, 2, 6, 0, 7] for tt in range(PRTIN_ATT): head_mask[low2high[tt], :, :] = np.ones( (config.ques_limit, config.ques_limit)) self.q_t5_bias = self.q_t5_bias * head_mask #num_blocks = 1, self.q_layer_weights, q = residual_block( q_emb, num_blocks=1, num_conv_layers=4, kernel_size=7, mask=self.q_mask, num_filters=d, num_heads=nh, seq_len=self.q_len, scope="Encoder_Residual_Block", reuse=True, # Share the weights 
between passage and question bias=False, dropout=self.dropout, t5_bias=self.q_t5_bias[:, :, :self.q_maxlen, :self.q_maxlen]) #we need to revise this into multiple head attention~~ with tf.variable_scope("Context_to_Query_Attention_Layer"): # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1]) # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1]) # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout) S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob=1.0 - self.dropout) mask_q = tf.expand_dims(self.q_mask, 1) S_ = tf.nn.softmax(mask_logits(S, mask=mask_q)) mask_c = tf.expand_dims(self.c_mask, 2) S_T = tf.transpose( tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1)) self.c2q = tf.matmul(S_, q) self.q2c = tf.matmul(tf.matmul(S_, S_T), c) attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c] self.model_c_t5_bias_list = [] self.model_c_layer_weights = [] ''' hidx_list = [[2,7,4,3,1,0,6,5], [5,3,0,2,1,6,4,7], [5,0,1,6,3,2,7,4]]''' hidx_list = [[5, 6, 0, 1, 3, 4, 7, 2], [7, 4, 6, 1, 2, 0, 3, 5], [4, 7, 2, 3, 6, 1, 0, 5]] with tf.variable_scope("Model_Encoder_Layer"): inputs = tf.concat(attention_outputs, axis=-1) self.enc = [conv(inputs, d, name="input_projection")] for i in range(3): if i % 2 == 0: # dropout every 2 blocks self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout) c_t5_bias = compute_bias(nh, config.para_limit, config.para_limit, self.soft_t5_alpha, self.soft_t5_beta, l1_width=config.l1_width, l2_width=config.l2_width, stddev=config.stddev, dropout_prob=self.dropout, activation=config.soft_t5_activation, bidirectional=True, name='model_layer_' + str(i)) head_mask = np.zeros( (nh, config.para_limit, config.para_limit)) for tt in range(PRTIN_ATT): head_mask[hidx_list[i][tt], :, :] = np.ones( (config.para_limit, config.para_limit)) c_t5_bias = c_t5_bias * head_mask self.model_c_t5_bias_list.append(c_t5_bias) print('[!!!-c_t5_bias:]', c_t5_bias) layer_weights, model_c = residual_block( self.enc[i], num_blocks=7, num_conv_layers=2, kernel_size=5, mask=self.c_mask, num_filters=d, num_heads=nh, seq_len=self.c_len, scope="Model_Encoder", bias=False, reuse=True if i > 0 else None, dropout=self.dropout, t5_bias=self.model_c_t5_bias_list[i] [:, :, :self.c_maxlen, :self.c_maxlen]) self.model_c_layer_weights.append(layer_weights) self.enc.append(model_c) with tf.variable_scope("Output_Layer"): start_logits = tf.squeeze( conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1, bias=False, name="start_pointer"), -1) end_logits = tf.squeeze( conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1, bias=False, name="end_pointer"), -1) self.logits = [ mask_logits(start_logits, mask=self.c_mask), mask_logits(end_logits, mask=self.c_mask) ] logits1, logits2 = [l for l in self.logits] outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2), tf.expand_dims(tf.nn.softmax(logits2), axis=1)) outer = tf.matrix_band_part(outer, 0, config.ans_limit) self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1) losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2) self.loss = tf.reduce_mean(losses + losses2) print('self.loss:', self.loss) print('self.yp1:', self.yp1) if config.l2_norm is not None: variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) l2_loss = tf.contrib.layers.apply_regularization( regularizer, variables) self.loss += l2_loss if 
config.decay is not None: self.var_ema = tf.train.ExponentialMovingAverage(config.decay) ema_op = self.var_ema.apply(tf.trainable_variables()) with tf.control_dependencies([ema_op]): self.loss = tf.identity(self.loss) self.assign_vars = [] for var in tf.global_variables(): v = self.var_ema.average(var) if v: self.assign_vars.append(tf.assign(var, v))
codes.append(list(pair[1]))
print(codes)
codes = tf.constant(codes)
labels = np.array([[0., 8., 8., 5., 8.],
                   [8., 0., 8., 8., 5.],
                   [8., 8., 0., 8., 8.],
                   [5., 8., 8., 0., 8.],
                   [8., 5., 8., 8., 0.]])
#loss = MSE_Loss(codes, labels, None)
sess = tf.Session()
#a = sess.run(loss)
bs = 5
k = 0
A1, A2 = tf.split(codes, [bs, bs * k])
# Handle first part of loss
M1 = tf.matmul(A1, tf.transpose(A1))
diag = tf.squeeze(tf.matrix_diag_part(M1))
M2 = tf.stack([diag for i in range(bs)])
# l2_mat_{i,j} = ||d_i - d_j||^2
l2_mat_1 = (M2 + tf.transpose(M2) - 2 * M1)
loss_mat_1 = tf.matrix_band_part((l2_mat_1 - labels) ** 2, 0, -1)
loss_1 = tf.reduce_sum(loss_mat_1)
lis = sess.run([A1, A2, M1, diag, M2, l2_mat_1, loss_mat_1, loss_1])
print(lis)
grad = tf.gradients(loss_1, codes)
g = sess.run(grad)
print(g)
def gauss_kl(q_mu, q_sqrt, K=None): """ Compute the KL divergence KL[q || p] between q(x) = N(q_mu, q_sqrt^2) and p(x) = N(0, K) We assume N multiple independent distributions, given by the columns of q_mu and the last dimension of q_sqrt. Returns the sum of the divergences. q_mu is a matrix (M x N), each column contains a mean. q_sqrt can be a 3D tensor (M x M x N), each matrix within is a lower triangular square-root matrix of the covariance of q. q_sqrt can be a matrix (M x N), each column represents the diagonal of a square-root matrix of the covariance of q. K is a positive definite matrix (M x M): the covariance of p. If K is None, compute the KL divergence to p(x) = N(0, I) instead. """ if K is None: white = True alpha = q_mu else: white = False Lp = tf.cholesky(K) alpha = tf.matrix_triangular_solve(Lp, q_mu, lower=True) if q_sqrt.get_shape().ndims == 2: diag = True num_latent = tf.shape(q_sqrt)[1] NM = tf.size(q_sqrt) Lq = Lq_diag = q_sqrt elif q_sqrt.get_shape().ndims == 3: diag = False num_latent = tf.shape(q_sqrt)[2] NM = tf.reduce_prod(tf.shape(q_sqrt)[1:]) Lq = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0) # force lower triangle Lq_diag = tf.matrix_diag_part(Lq) else: # pragma: no cover raise ValueError("Bad dimension for q_sqrt: {}".format( q_sqrt.get_shape().ndims)) # Mahalanobis term: μqᵀ Σp⁻¹ μq mahalanobis = tf.reduce_sum(tf.square(alpha)) # Constant term: - N x M constant = -tf.cast(NM, settings.tf_float) # Log-determinant of the covariance of q(x): logdet_qcov = tf.reduce_sum(tf.log(tf.square(Lq_diag))) # Trace term: tr(Σp⁻¹ Σq) if white: trace = tf.reduce_sum(tf.square(Lq)) else: if diag: M = tf.shape(Lp)[0] Lp_inv = tf.matrix_triangular_solve( Lp, tf.eye(M, dtype=settings.tf_float), lower=True) K_inv = tf.matrix_triangular_solve(tf.transpose(Lp), Lp_inv, lower=False) trace = tf.reduce_sum( tf.expand_dims(tf.matrix_diag_part(K_inv), 1) * tf.square(q_sqrt)) else: Lp_tiled = tf.tile(tf.expand_dims(Lp, 0), [num_latent, 1, 1]) LpiLq = tf.matrix_triangular_solve(Lp_tiled, Lq, lower=True) trace = tf.reduce_sum(tf.square(LpiLq)) twoKL = mahalanobis + constant - logdet_qcov + trace # Log-determinant of the covariance of p(x): if not white: log_sqdiag_Lp = tf.log(tf.square(tf.matrix_diag_part(Lp))) sum_log_sqdiag_Lp = tf.reduce_sum(log_sqdiag_Lp) prior_logdet = tf.cast(num_latent, settings.tf_float) * sum_log_sqdiag_Lp twoKL += prior_logdet return 0.5 * twoKL
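For reference, a NumPy version of the closed form that gauss_kl evaluates for a single latent column, assuming a full lower-triangular q_sqrt with positive diagonal; gauss_kl_np is a name local to this sketch and is only meant as an independent check of the TensorFlow graph above.

import numpy as np

def gauss_kl_np(q_mu, L_q, K):
    """KL[N(q_mu, L_q L_q^T) || N(0, K)] for one latent column."""
    q_mu = np.asarray(q_mu).reshape(-1)           # (M,)
    M = q_mu.shape[0]
    K_inv = np.linalg.inv(K)
    S_q = L_q @ L_q.T                             # covariance of q
    mahalanobis = q_mu @ K_inv @ q_mu
    trace = np.trace(K_inv @ S_q)
    logdet_p = np.linalg.slogdet(K)[1]
    logdet_q = np.linalg.slogdet(S_q)[1]
    return 0.5 * (mahalanobis + trace - M + logdet_p - logdet_q)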
def conditional(Xnew, X, kern, f, full_cov=False, q_sqrt=None, whiten=False): """ Given F, representing the GP at the points X, produce the mean and (co-)variance of the GP at the points Xnew. Additionally, there may be Gaussian uncertainty about F as represented by q_sqrt. In this case `f` represents the mean of the distribution and q_sqrt the square-root of the covariance. Additionally, the GP may have been centered (whitened) so that p(v) = N( 0, I) f = L v thus p(f) = N(0, LL^T) = N(0, K). In this case 'f' represents the values taken by v. The method can either return the diagonals of the covariance matrix for each output of the full covariance matrix (full_cov). We assume K independent GPs, represented by the columns of f (and the last dimension of q_sqrt). - Xnew is a data matrix, size N x D - X are data points, size M x D - kern is a GPflow kernel - f is a data matrix, M x K, representing the function values at X, for K functions. - q_sqrt (optional) is a matrix of standard-deviations or Cholesky matrices, size M x K or M x M x K - whiten (optional) is a boolean: whether to whiten the representation as described above. These functions are now considered deprecated, subsumed into this one: gp_predict gaussian_gp_predict gp_predict_whitened gaussian_gp_predict_whitened """ # compute kernel stuff num_data = tf.shape(X)[0] # M num_func = tf.shape(f)[1] # K Kmn = kern.K(X, Xnew) Kmm = kern.K(X) + tf.eye(num_data, dtype=_settings.tf_float) * _settings.jitter_level Lm = tf.cholesky(Kmm) # Compute the projection matrix A A = tf.matrix_triangular_solve(Lm, Kmn, lower=True) # compute the covariance due to the conditioning if full_cov: fvar = kern.K(Xnew) - tf.matmul(A, A, transpose_a=True) shape = tf.stack([num_func, 1, 1]) else: fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0) shape = tf.stack([num_func, 1]) fvar = tf.tile(tf.expand_dims(fvar, 0), shape) # K x N x N or K x N # another backsubstitution in the unwhitened case if not whiten: A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False) # construct the conditional mean fmean = tf.matmul(A, f, transpose_a=True) if q_sqrt is not None: if q_sqrt.get_shape().ndims == 2: LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2) # K x M x N elif q_sqrt.get_shape().ndims == 3: L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0) # K x M x M A_tiled = tf.tile(tf.expand_dims(A, 0), tf.stack([num_func, 1, 1])) LTA = tf.matmul(L, A_tiled, transpose_a=True) # K x M x N else: # pragma: no cover raise ValueError("Bad dimension for q_sqrt: %s" % str(q_sqrt.get_shape().ndims)) if full_cov: fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True) # K x N x N else: fvar = fvar + tf.reduce_sum(tf.square(LTA), 1) # K x N fvar = tf.transpose(fvar) # N x K or N x N x K return fmean, fvar