def check(x, dtype=None):
    z = tf.convert_to_tensor(x)
    y = convert_to_tensor_and_cast(x, dtype)
    self.assertIsInstance(y, tf.Tensor)
    if dtype is not None:
        self.assertEqual(y.dtype, dtype)
    else:
        self.assertEqual(y.dtype, z.dtype)
def sample(self, n_samples=None, group_ndims=0, is_reparameterized=None,
           compute_density=None, name=None):
    self._validate_sample_is_reparameterized_arg(is_reparameterized)
    if is_reparameterized is None:
        is_reparameterized = self.is_reparameterized

    with tf.name_scope(name, default_name='DiscretizedLogistic.sample'):
        # sample from uniform distribution
        sample_shape = self.batch_shape
        static_sample_shape = self.get_batch_shape()
        if n_samples is not None:
            sample_shape = tf.concat([[n_samples], sample_shape], 0)
            static_sample_shape = tf.TensorShape(
                [None if is_tensor_object(n_samples) else n_samples]). \
                concatenate(static_sample_shape)

        u = tf.random_uniform(shape=sample_shape,
                              minval=self._epsilon,
                              maxval=1. - self._epsilon,
                              dtype=self._param_dtype)
        u.set_shape(static_sample_shape)

        # inverse CDF of the logistic
        inverse_logistic_cdf = maybe_check_numerics(
            tf.log(u) - tf.log(1. - u), 'inverse_logistic_cdf')

        # obtain the actual sample
        scale = maybe_check_numerics(
            tf.exp(self.log_scale, name='scale'), 'scale')
        sample = self.mean + scale * inverse_logistic_cdf
        if self.discretize_sample:
            sample = self._discretize(sample)
        sample = maybe_check_numerics(sample, 'sample')
        sample = convert_to_tensor_and_cast(sample, self.dtype)

        if not is_reparameterized:
            sample = tf.stop_gradient(sample)

        t = StochasticTensor(
            distribution=self,
            tensor=sample,
            n_samples=n_samples,
            group_ndims=group_ndims,
            is_reparameterized=is_reparameterized
        )

        # compute the density
        if compute_density:
            compute_density_immediately(t)

        return t
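# A minimal NumPy sketch (not part of the library) of the inverse-CDF trick used
# in `sample` above: if u ~ Uniform(0, 1), then mean + scale * (log(u) - log(1 - u))
# follows a Logistic(mean, scale) distribution, since the logistic CDF is
# F(x) = 1 / (1 + exp(-(x - mean) / scale)) and the expression is exactly F^{-1}(u).
import numpy as np

rng = np.random.RandomState(0)
mean, scale = 1.0, 2.0
u = rng.uniform(1e-7, 1. - 1e-7, size=100000)
samples = mean + scale * (np.log(u) - np.log(1. - u))
# The sample mean should be close to `mean`, and the sample variance close to
# (pi * scale) ** 2 / 3, the variance of a logistic distribution.
print(samples.mean(), samples.var())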
def dropout(input, rate=.5, noise_shape=None, training=False, name=None):
    """
    Apply dropout on `input`.

    Args:
        input (Tensor): The input tensor.
        rate (float or tf.Tensor): The probability of dropping out each unit.
            (default 0.5)
        noise_shape (tuple[int] or tf.Tensor): Shape of the dropout noise.
            If not specified, use the shape of `input`.
        training (bool or tf.Tensor): Whether or not the model is in the
            training stage?  (default :obj:`False`)
        name (str): Default name of the name scope.

    Returns:
        tf.Tensor: The dropout transformed tensor.
    """
    input = tf.convert_to_tensor(input)

    with tf.name_scope(name, default_name='dropout', values=[input]):
        dtype = input.dtype.base_dtype
        retain_prob = convert_to_tensor_and_cast(1. - rate, dtype=dtype)
        inv_retain_prob = 1. / retain_prob
        if noise_shape is None:
            noise_shape = get_shape(input)

        def training_branch():
            # keep each unit with probability `retain_prob`, and rescale the
            # kept units by `1 / retain_prob`
            noise = tf.random_uniform(
                shape=noise_shape, minval=0., maxval=1., dtype=dtype)
            mask = tf.cast(noise < retain_prob, dtype=dtype)
            return input * mask * inv_retain_prob

        def testing_branch():
            return input

        return smart_cond(
            training,
            training_branch,
            testing_branch,
        )
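# A small usage sketch (assuming the `dropout` function above and its helpers are
# in scope, and a TensorFlow 1.x session is available); the `is_training`
# placeholder lets the same graph be used for both training and evaluation.
import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 4])
is_training = tf.placeholder(tf.bool, shape=())
y = dropout(x, rate=0.5, training=is_training)

with tf.Session() as sess:
    data = np.ones([2, 4], dtype=np.float32)
    # under training, roughly half of the units are zeroed and the rest scaled by 2
    print(sess.run(y, feed_dict={x: data, is_training: True}))
    # under testing, the input is returned unchanged
    print(sess.run(y, feed_dict={x: data, is_training: False}))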
def __init__(self, mean, log_scale, bin_size, min_val=None, max_val=None,
             dtype=tf.float32, biased_edges=True, discretize_given=True,
             discretize_sample=True, epsilon=1e-7):
    """
    Construct a new :class:`DiscretizedLogistic`.

    Args:
        mean: A Tensor, the `mean`.
        log_scale: A Tensor, the `log(scale)`.
        bin_size: A scalar, the `bin_size`.
        min_val: A scalar, the minimum possible value of `x`.
        max_val: A scalar, the maximum possible value of `x`.
        dtype: The data type of `x`.
        biased_edges: Whether or not to use biased density for edge values?
            See above.
        discretize_given (bool): Whether or not to discretize `given`
            in :meth:`log_prob` and :meth:`prob`?
        discretize_sample (bool): Whether or not to discretize the
            generated samples in :meth:`sample`?
        epsilon: Small float to avoid dividing by zero or taking
            logarithm of zero.
    """
    # check the arguments
    mean = tf.convert_to_tensor(mean)
    param_dtype = mean.dtype
    log_scale = tf.convert_to_tensor(log_scale)
    dtype = tf.as_dtype(dtype)

    if not is_integer_number(bin_size) and not dtype.is_floating:
        raise ValueError(
            '`bin_size` is a float number, but `dtype` is not a float '
            'number type: {}'.format(dtype))

    if (min_val is None) != (max_val is None):
        raise ValueError('`min_val` and `max_val` must both be None, or '
                         'neither be None.')

    if max_val is not None and min_val is not None and \
            not is_integer_number((max_val - min_val) / bin_size):
        raise ValueError(
            '`max_val - min_val` must be a multiple of `bin_size`: '
            'max_val - min_val = {} vs bin_size = {}'.format(
                max_val - min_val, bin_size))

    # infer the batch shape
    try:
        batch_static_shape = tf.broadcast_static_shape(
            mean.get_shape(), log_scale.get_shape())
    except ValueError:
        raise ValueError('The shape of `mean` and `log_scale` cannot '
                         'be broadcasted: mean {} vs log_scale {}'.format(
                             mean, log_scale))

    with tf.name_scope('DiscretizedLogistic.init'):
        batch_shape = tf.broadcast_dynamic_shape(tf.shape(mean),
                                                 tf.shape(log_scale))

    # memorize the arguments and call parent constructor
    bin_size = convert_to_tensor_and_cast(bin_size, param_dtype)
    if min_val is not None:
        min_val = convert_to_tensor_and_cast(min_val, param_dtype)
    if max_val is not None:
        max_val = convert_to_tensor_and_cast(max_val, param_dtype)

    self._mean = mean
    self._log_scale = log_scale
    self._param_dtype = param_dtype
    self._bin_size = bin_size
    self._min_val = min_val
    self._max_val = max_val
    self._biased_edges = bool(biased_edges)
    self._discretize_given = bool(discretize_given)
    self._discretize_sample = bool(discretize_sample)
    self._epsilon = epsilon

    super(DiscretizedLogistic, self).__init__(
        dtype=dtype,
        is_continuous=not self._discretize_sample,
        is_reparameterized=not self._discretize_sample,
        batch_shape=batch_shape,
        batch_static_shape=batch_static_shape,
        value_ndims=0
    )
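# A construction sketch (assuming the full `DiscretizedLogistic` class above is
# in scope): modelling 8-bit pixel values in [0, 255] with bin size 1, a common
# setup for discretized logistic likelihoods on images.  The shapes below are
# illustrative assumptions only.
import tensorflow as tf

mean = tf.zeros([32, 3], dtype=tf.float32)       # per-dimension means
log_scale = tf.zeros([32, 3], dtype=tf.float32)  # per-dimension log(scale)
dist = DiscretizedLogistic(
    mean=mean,
    log_scale=log_scale,
    bin_size=1.,
    min_val=0.,
    max_val=255.,
    dtype=tf.float32,
)
samples = dist.sample(n_samples=4)  # shape (4, 32, 3), discretized into bins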
def nvil_estimator(values, latent_log_joint, baseline=None,
                   center_by_moving_average=True, decay=0.8,
                   axis=None, keepdims=False, batch_axis=None, name=None):
    """
    Derive the gradient estimator for
    :math:`\\mathbb{E}_{q(\\mathbf{z}|\\mathbf{x})}\\big[f(\\mathbf{x},\\mathbf{z})\\big]`,
    via the NVIL (Mnih and Gregor, 2014) algorithm.

    .. math::

        \\begin{aligned}
            \\nabla \\, \\mathbb{E}_{q(\\mathbf{z}|\\mathbf{x})}
                \\big[f(\\mathbf{x},\\mathbf{z})\\big]
            &= \\mathbb{E}_{q(\\mathbf{z}|\\mathbf{x})}\\Big[
                \\nabla f(\\mathbf{x},\\mathbf{z}) +
                f(\\mathbf{x},\\mathbf{z})\\,
                    \\nabla\\log q(\\mathbf{z}|\\mathbf{x})\\Big] \\\\
            &= \\mathbb{E}_{q(\\mathbf{z}|\\mathbf{x})}\\Big[
                \\nabla f(\\mathbf{x},\\mathbf{z}) +
                \\big(f(\\mathbf{x},\\mathbf{z}) -
                    C_{\\psi}(\\mathbf{x}) - c\\big)\\,
                    \\nabla\\log q(\\mathbf{z}|\\mathbf{x})\\Big]
        \\end{aligned}

    where :math:`C_{\\psi}(\\mathbf{x})` is a learnable network with parameter
    :math:`\\psi`, and `c` is a learnable constant.  They are learnt by
    minimizing :math:`\\mathbb{E}_{ q(\\mathbf{z}|\\mathbf{x}) }\\Big[
    \\big(f(\\mathbf{x},\\mathbf{z}) - C_{\\psi}(\\mathbf{x}) - c\\big)^2\\Big]`.

    Args:
        values: Values of the target function given `z` and `x`, i.e.,
            :math:`f(\\mathbf{z},\\mathbf{x})`.
        latent_log_joint: Values of :math:`\\log q(\\mathbf{z}|\\mathbf{x})`.
        baseline: Values of the baseline function :math:`C_{\\psi}(\\mathbf{x})`
            given input `x`.  If this is not specified, this method degenerates
            to the REINFORCE algorithm, with only a moving-average estimated
            constant baseline `c`.
        center_by_moving_average (bool): Whether or not to use a moving
            average to maintain an estimation of `c` in the above equations?
        decay: The decay factor for the moving average.
        axis: The sampling axes to be reduced in outputs.
            If not specified, no axis will be reduced.
        keepdims (bool): When `axis` is specified, whether or not to keep
            the reduced axes?  (default :obj:`False`)
        batch_axis: The batch axes to be reduced when computing the
            expectation over `x`.  If not specified, all axes will be
            treated as batch axes, except the sampling axes.

    Returns:
        (tf.Tensor, tf.Tensor): The `(surrogate, baseline cost)`.

            `surrogate` is the surrogate for optimizing the original target.
            Maximizing/minimizing this surrogate via gradient descent will
            effectively maximize/minimize the original target.

            `baseline cost` is the cost to be minimized for training the
            baseline.  It will be :obj:`None` if `baseline` is :obj:`None`.
    """
    if baseline is None and not center_by_moving_average:
        raise ValueError('`baseline` is not specified, thus '
                         '`center_by_moving_average` must be True.')

    values = tf.convert_to_tensor(values)  # f(x,z)
    latent_log_joint = tf.convert_to_tensor(latent_log_joint)  # log q(z|x)
    if baseline is not None:
        baseline = tf.convert_to_tensor(baseline)
    dtype = values.dtype

    @contextmanager
    def mk_scope():
        if center_by_moving_average:
            with tf.variable_scope(None, default_name=name or 'nvil_estimator'):
                yield
        else:
            ns_values = [values, latent_log_joint]
            if baseline is not None:
                ns_values += [baseline]
            with tf.name_scope(name or 'nvil_estimator', values=ns_values):
                yield

    with mk_scope():
        l_signal = values
        baseline_cost = None

        # compute the baseline cost
        if baseline is not None:
            # baseline_cost = E[(f(x,z)-C(x)-c)^2]
            with tf.name_scope('baseline_cost'):
                baseline_cost = tf.square(
                    tf.stop_gradient(l_signal) - baseline)
                if axis is not None:
                    baseline_cost = tf.reduce_mean(
                        baseline_cost, axis, keepdims=keepdims)

            l_signal = l_signal - baseline

        # estimate `c` by moving average
        if center_by_moving_average:
            with tf.name_scope('center_by_moving_average'):
                batch_center = tf.reduce_mean(
                    l_signal, axis=batch_axis, keepdims=True)
                moving_mean_shape = get_static_shape(batch_center)
                if None in moving_mean_shape:
                    raise ValueError(
                        'The shape of `values` after reduction along '
                        '`batch_axis` must be static: values {}, '
                        'batch_axis {}'.format(values, batch_axis))
                moving_mean = tf.get_variable(
                    'moving_mean', shape=moving_mean_shape,
                    initializer=tf.constant_initializer(0.),
                    trainable=False, dtype=dtype)

                # moving average update: m <- m - (m - batch_center) * (1 - decay)
                decay = convert_to_tensor_and_cast(1. - decay, dtype)
                moving_mean = moving_mean.assign(
                    moving_mean - (moving_mean - batch_center) * decay)
                l_signal = l_signal - moving_mean

        # compute the nvil cost
        with tf.name_scope('cost'):
            cost = tf.stop_gradient(l_signal) * latent_log_joint + values
            if axis is not None:
                cost = tf.reduce_mean(cost, axis, keepdims=keepdims)

    return cost, baseline_cost
def __init__(self, size, strict=False, dtype=tf.float32, epsilon=1e-6,
             trainable=True, random_state=None, name=None, scope=None):
    """
    Construct a new :class:`InvertibleMatrix`.

    Args:
        size (int or (int, int)): Size of the matrix.
        strict (bool): If :obj:`True`, will derive the matrix using a
            variant of PLU decomposition, to enforce invertibility
            (see above).  If :obj:`False`, the matrix will only be
            initialized to be an orthogonal invertible matrix, without
            further constraint.  (default :obj:`False`)
        dtype (tf.DType): The data type of the variables.
        epsilon: Small float to avoid dividing by zero or taking
            logarithm of zero.
        trainable (bool): Whether or not the parameters are trainable?
        random_state (np.random.RandomState): Use this random state,
            instead of constructing a :class:`VarScopeRandomState`.
    """
    from tfsnippet.ops import convert_to_tensor_and_cast

    # validate the arguments
    def validate_shape():
        if is_integer(size):
            shape = (int(size),) * 2
        else:
            h, w = size
            shape = (int(h), int(w))
        if shape[0] != shape[1] or shape[0] < 1:
            raise ValueError()
        return shape

    try:
        shape = validate_shape()
    except Exception:
        raise ValueError('`size` is not valid for a square matrix: {!r}.'.
                         format(size))

    strict = bool(strict)
    dtype = tf.as_dtype(dtype)

    self._shape = shape
    self._strict = strict
    self._dtype = dtype
    self._epsilon = epsilon

    # initialize the variable scope and the random state
    super(InvertibleMatrix, self).__init__(name=name, scope=scope)
    if random_state is None:
        random_state = VarScopeRandomState(self.variable_scope)
    self._random_state = random_state

    # generate the initial orthogonal matrix
    initial_matrix = la.qr(random_state.normal(size=shape))[0]

    # helper for creating the variable and adding it to the histogram
    def check_tensor(tensor, name=None):
        if name is None:
            name = tensor.name.rsplit('/')[-1]
            if name.endswith(':0'):
                name = name[:-2]
        maybe_add_histogram(tensor, name, strip_scope=True)
        return maybe_check_numerics(tensor, name)

    # create the variables
    with reopen_variable_scope(self.variable_scope):
        if not strict:
            # the matrix
            self._matrix = check_tensor(
                model_variable(
                    'matrix',
                    initializer=tf.constant(initial_matrix, dtype=dtype),
                    dtype=dtype,
                    trainable=trainable
                )
            )
            self._inv_matrix = check_tensor(
                tf.matrix_inverse(self._matrix, name='inv_matrix'))

            # log_det
            if is_tensorflow_version_higher_or_equal('1.10.0'):
                self._log_det = tf.linalg.slogdet(
                    self._matrix, name='log_det')[1]
            else:
                # lower versions of TensorFlow do not have a gradient op
                # for `slogdet`, thus we have to derive it as follows:
                with tf.name_scope('log_det', values=[self._matrix]):
                    m = convert_to_tensor_and_cast(self._matrix, tf.float64)
                    self._log_det = tf.log(
                        tf.maximum(tf.abs(tf.matrix_determinant(m)), epsilon)
                    )
                    self._log_det = \
                        convert_to_tensor_and_cast(self._log_det, dtype)
            self._log_det = check_tensor(self._log_det, 'log_det')

        else:
            initial_P, initial_L, initial_U = la.lu(initial_matrix)
            initial_s = np.diag(initial_U)
            initial_sign = np.sign(initial_s)
            initial_log_s = np.log(
                np.maximum(np.abs(initial_s), self._epsilon))
            initial_U = np.triu(initial_U, k=1)

            # TODO: use PermutationMatrix to derive P once we can export it
            #
            # PermutationMatrix is faster, however, it cannot be exported
            # by just saving the TensorFlow variables.  Thus for the time
            # being, we have to use a true TensorFlow variable to derive P.
            #
            # P = self._P = PermutationMatrix(initial_P)
            P = self._P = model_variable(
                'P',
                initializer=tf.constant(initial_P, dtype=dtype),
                dtype=dtype,
                trainable=False
            )
            pre_L = self._pre_L = check_tensor(
                model_variable(
                    'pre_L',
                    initializer=tf.constant(initial_L, dtype=dtype),
                    dtype=dtype,
                    trainable=trainable
                )
            )
            pre_U = self._pre_U = check_tensor(
                model_variable(
                    'pre_U',
                    initializer=tf.constant(initial_U, dtype=dtype),
                    dtype=dtype,
                    trainable=trainable
                )
            )
            sign = self._sign = model_variable(
                'sign',
                initializer=tf.constant(initial_sign, dtype=dtype),
                dtype=dtype,
                trainable=False
            )
            log_s = self._log_s = check_tensor(
                model_variable(
                    'log_s',
                    initializer=tf.constant(initial_log_s, dtype=dtype),
                    dtype=dtype,
                    trainable=trainable
                )
            )

            with tf.name_scope('L', values=[pre_L]):
                L_mask = tf.constant(np.tril(np.ones(shape), k=-1),
                                     dtype=dtype)
                L = self._L = check_tensor(
                    L_mask * pre_L + tf.eye(*shape, dtype=dtype), 'L')

            with tf.name_scope('U', values=[pre_U, sign, log_s]):
                U_mask = tf.constant(np.triu(np.ones(shape), k=1),
                                     dtype=dtype)
                U = self._U = check_tensor(
                    U_mask * pre_U + tf.diag(sign * tf.exp(log_s)), 'U')

            with tf.name_scope('matrix', values=[P, L, U]):
                self._matrix = check_tensor(
                    tf.matmul(P, tf.matmul(L, U), name='matrix'))

            with tf.name_scope('inv_matrix', values=[P, L, U]):
                self._inv_matrix = check_tensor(
                    tf.matmul(
                        check_tensor(
                            tf.matrix_inverse(U, name='inv_U')),
                        tf.matmul(
                            check_tensor(
                                tf.matrix_inverse(L, name='inv_L')),
                            check_tensor(
                                tf.matrix_inverse(P, name='inv_P')),
                        ),
                        name='inv_matrix'
                    )
                )

            with tf.name_scope('log_det', values=[log_s]):
                self._log_det = check_tensor(
                    tf.reduce_sum(log_s, name='log_det'))
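# A small usage sketch (assuming the full `InvertibleMatrix` class above is in
# scope, and that the internal tensors built in `__init__` are exposed as
# `matrix`, `inv_matrix` and `log_det` properties; those property names are an
# assumption, not confirmed here).  Such a matrix is typically used as the
# kernel of an invertible 1x1 convolution or linear flow, where `log_det`
# contributes to the log-determinant of the Jacobian.
import tensorflow as tf

m = InvertibleMatrix(size=4, strict=True, dtype=tf.float32)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    mat, inv_mat, log_det = sess.run([m.matrix, m.inv_matrix, m.log_det])
    # `mat @ inv_mat` should be close to the 4x4 identity matrix, and
    # `log_det` close to log|det(mat)|.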