Example #1
 def check(x, dtype=None):
     z = tf.convert_to_tensor(x)
     y = convert_to_tensor_and_cast(x, dtype)
     self.assertIsInstance(y, tf.Tensor)
     if dtype is not None:
         self.assertEqual(y.dtype, dtype)
     else:
         self.assertEqual(y.dtype, z.dtype)
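
A minimal usage sketch (not part of the original snippet): how the `check` helper above might be invoked inside its enclosing test case, assuming `numpy` is imported as `np`.

# hypothetical invocations inside the enclosing test method
check(np.arange(5))                    # dtype inferred via tf.convert_to_tensor
check(np.arange(5), dtype=tf.float64)  # the explicit cast is asserted
check([1., 2., 3.], dtype=tf.int32)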
Example #2
    def sample(self,
               n_samples=None,
               group_ndims=0,
               is_reparameterized=None,
               compute_density=None,
               name=None):
        self._validate_sample_is_reparameterized_arg(is_reparameterized)
        if is_reparameterized is None:
            is_reparameterized = self.is_reparameterized

        with tf.name_scope(name, default_name='DiscretizedLogistic.sample'):
            # sample from uniform distribution
            sample_shape = self.batch_shape
            static_sample_shape = self.get_batch_shape()
            if n_samples is not None:
                sample_shape = tf.concat([[n_samples], sample_shape], 0)
                static_sample_shape = tf.TensorShape(
                    [None if is_tensor_object(n_samples) else n_samples]). \
                    concatenate(static_sample_shape)

            u = tf.random_uniform(shape=sample_shape,
                                  minval=self._epsilon,
                                  maxval=1. - self._epsilon,
                                  dtype=self._param_dtype)
            u.set_shape(static_sample_shape)

            # inverse CDF of the logistic
            inverse_logistic_cdf = maybe_check_numerics(
                tf.log(u) - tf.log(1. - u), 'inverse_logistic_cdf')

            # obtain the actual sample
            scale = maybe_check_numerics(tf.exp(self.log_scale, name='scale'),
                                         'scale')
            sample = self.mean + scale * inverse_logistic_cdf
            if self.discretize_sample:
                sample = self._discretize(sample)
            sample = maybe_check_numerics(sample, 'sample')
            sample = convert_to_tensor_and_cast(sample, self.dtype)

            if not is_reparameterized:
                sample = tf.stop_gradient(sample)

            t = StochasticTensor(distribution=self,
                                 tensor=sample,
                                 n_samples=n_samples,
                                 group_ndims=group_ndims,
                                 is_reparameterized=is_reparameterized)

            # compute the density
            if compute_density:
                compute_density_immediately(t)

            return t
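
A hedged usage sketch (not from the original source) of drawing samples from the distribution whose sample() method is shown above; the constructor arguments follow the __init__ in Example #4 below, while the import path and the tensors `mean_t` / `log_scale_t` are assumptions.

# assumed import path; adjust to wherever DiscretizedLogistic is defined
# from tfsnippet.distributions import DiscretizedLogistic

d = DiscretizedLogistic(mean=mean_t, log_scale=log_scale_t, bin_size=1.)
t = d.sample(n_samples=16, group_ndims=1, compute_density=True)
# `t` is a StochasticTensor whose underlying tensor has shape [16] + batch_shape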
Example #3
def dropout(input, rate=.5, noise_shape=None, training=False, name=None):
    """
    Apply dropout on `input`.

    Args:
        input (Tensor): The input tensor.
        rate (float or tf.Tensor): The rate of dropout.
        noise_shape (tuple[int] or tf.Tensor): Shape of the noise.
            If not specified, use the shape of `input`.
        training (bool or tf.Tensor): Whether or not the model is in
            training stage?  (default :obj:`False`)
        name (str): TensorFlow name scope of the graph nodes.
            (default "dropout")

    Returns:
        tf.Tensor: The dropout transformed tensor.
    """
    input = tf.convert_to_tensor(input)

    with tf.name_scope(name, default_name='dropout', values=[input]):
        dtype = input.dtype.base_dtype
        retain_prob = convert_to_tensor_and_cast(1. - rate, dtype=dtype)
        inv_retain_prob = 1. / retain_prob
        if noise_shape is None:
            noise_shape = get_shape(input)

        def training_branch():
            noise = tf.random_uniform(shape=noise_shape,
                                      minval=0.,
                                      maxval=1.,
                                      dtype=dtype)
            mask = tf.cast(noise < retain_prob, dtype=dtype)
            return input * mask * inv_retain_prob

        def testing_branch():
            return input

        return smart_cond(
            training,
            training_branch,
            testing_branch,
        )
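
A minimal usage sketch (assuming TensorFlow 1.x graph mode as in the snippet, and that `x` is an existing float tensor):

is_training = tf.placeholder(tf.bool, shape=(), name='is_training')
h = tf.layers.dense(x, 256, activation=tf.nn.relu)
h = dropout(h, rate=0.3, training=is_training)
# feed {is_training: True} during training and {is_training: False} at test time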
Example #4
    def __init__(self,
                 mean,
                 log_scale,
                 bin_size,
                 min_val=None,
                 max_val=None,
                 dtype=tf.float32,
                 biased_edges=True,
                 discretize_given=True,
                 discretize_sample=True,
                 epsilon=1e-7):
        """
        Construct a new :class:`DiscretizedLogistic`.

        Args:
            mean: A Tensor, the `mean`.
            log_scale: A Tensor, the `log(scale)`.
            bin_size: A scalar, the `bin_size`.
            min_val: A scalar, the minimum possible value of `x`.
            max_val: A scalar, the maximum possible value of `x`.
            dtype: The data type of `x`.
            biased_edges: Whether or not to use biased density for edge values?
                See above.
            discretize_given (bool): Whether or not to discretize `given`
                in :meth:`log_prob` and :meth:`prob`?
            discretize_sample (bool): Whether or not to discretize the
                generated samples in :meth:`sample`?
            epsilon: Small float to avoid dividing by zero or taking
                logarithm of zero.
        """
        # check the arguments
        mean = tf.convert_to_tensor(mean)
        param_dtype = mean.dtype
        log_scale = tf.convert_to_tensor(log_scale)
        dtype = tf.as_dtype(dtype)

        if not is_integer_number(bin_size) and not dtype.is_floating:
            raise ValueError(
                '`bin_size` is a float number, but `dtype` is not a float '
                'number type: {}'.format(dtype))

        if (min_val is None and max_val is not None) or \
                (min_val is not None and max_val is None):
            raise ValueError('`min_val` and `max_val` must both be None, '
                             'or both be specified.')

        if max_val is not None and min_val is not None and \
                not is_integer_number((max_val - min_val) / bin_size):
            raise ValueError(
                '`max_val - min_val` must be a multiple of `bin_size`: '
                'max_val - min_val = {} vs bin_size = {}'.format(
                    max_val - min_val, bin_size))

        # infer the batch shape
        try:
            batch_static_shape = tf.broadcast_static_shape(
                mean.get_shape(), log_scale.get_shape())
        except ValueError:
            raise ValueError('The shape of `mean` and `log_scale` cannot '
                             'be broadcasted: mean {} vs log_scale {}'.format(
                                 mean, log_scale))

        with tf.name_scope('DiscretizedLogistic.init'):
            batch_shape = tf.broadcast_dynamic_shape(tf.shape(mean),
                                                     tf.shape(log_scale))

        # memorize the arguments and call parent constructor
        bin_size = convert_to_tensor_and_cast(bin_size, param_dtype)
        if min_val is not None:
            min_val = convert_to_tensor_and_cast(min_val, param_dtype)
        if max_val is not None:
            max_val = convert_to_tensor_and_cast(max_val, param_dtype)

        self._mean = mean
        self._log_scale = log_scale
        self._param_dtype = param_dtype
        self._bin_size = bin_size
        self._min_val = min_val
        self._max_val = max_val
        self._biased_edges = bool(biased_edges)
        self._discretize_given = bool(discretize_given)
        self._discretize_sample = bool(discretize_sample)
        self._epsilon = epsilon

        super(DiscretizedLogistic,
              self).__init__(dtype=dtype,
                             is_continuous=not self._discretize_sample,
                             is_reparameterized=not self._discretize_sample,
                             batch_shape=batch_shape,
                             batch_static_shape=batch_static_shape,
                             value_ndims=0)
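
A hedged construction sketch (not from the original source) for an 8-bit pixel likelihood, a typical use of a discretized logistic; `pixel_mean`, `pixel_log_scale` and `x` are assumed tensors, and the log_prob signature is an assumption about the base Distribution class rather than something shown in this snippet.

d = DiscretizedLogistic(mean=pixel_mean,
                        log_scale=pixel_log_scale,
                        bin_size=1.,     # one bin per integer pixel value
                        min_val=0.,
                        max_val=255.)
log_px = d.log_prob(x, group_ndims=3)  # assumed signature; sums over (H, W, C)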
Example #5
def nvil_estimator(values,
                   latent_log_joint,
                   baseline=None,
                   center_by_moving_average=True,
                   decay=0.8,
                   axis=None,
                   keepdims=False,
                   batch_axis=None,
                   name=None):
    """
    Derive the gradient estimator for
    :math:`\\mathbb{E}_{q(\\mathbf{z}|\\mathbf{x})}\\big[f(\\mathbf{x},\\mathbf{z})\\big]`,
    by NVIL (Mnih and Gregor, 2014) algorithm.

    .. math::

        \\begin{aligned}
        \\nabla \\, \\mathbb{E}_{q(\\mathbf{z}|\\mathbf{x})} \\big[f(\\mathbf{x},\\mathbf{z})\\big]
            &= \\mathbb{E}_{q(\\mathbf{z}|\\mathbf{x})}\\Big[
                \\nabla f(\\mathbf{x},\\mathbf{z}) + f(\\mathbf{x},\\mathbf{z})\\,\\nabla\\log q(\\mathbf{z}|\\mathbf{x})\\Big] \\\\
            &= \\mathbb{E}_{q(\\mathbf{z}|\\mathbf{x})}\\Big[
                \\nabla f(\\mathbf{x},\\mathbf{z}) + \\big(f(\\mathbf{x},\\mathbf{z}) - C_{\\psi}(\\mathbf{x})-c\\big)\\,\\nabla\\log q(\\mathbf{z}|\\mathbf{x})\\Big]
        \\end{aligned}

    where :math:`C_{\\psi}(\\mathbf{x})` is a learnable network with parameter
    :math:`\\psi`, and `c` is a learnable constant.  They would be learnt by
    minimizing :math:`\\mathbb{E}_{ q(\\mathbf{z}|\\mathbf{x}) }\\Big[\\big(f(\\mathbf{x},\\mathbf{z}) - C_{\\psi}(\\mathbf{x})-c\\big)^2 \\Big]`.

    Args:
        values: Values of the target function given `z` and `x`, i.e.,
            :math:`f(\\mathbf{x},\\mathbf{z})`.
        latent_log_joint: Values of :math:`\\log q(\\mathbf{z}|\\mathbf{x})`.
        baseline: Values of the baseline function :math:`C_{\\psi}(\\mathbf{x})`
            given input `x`.  If this is not specified, then this method will
            degenerate to the REINFORCE algorithm, with only a moving
            average estimated constant baseline `c`.
        center_by_moving_average (bool): Whether or not to use the moving
            average to maintain an estimation of `c` in above equations?
        decay: The decaying factor for moving average.
        axis: The sampling axes to be reduced in outputs.
            If not specified, no axis will be reduced.
        keepdims (bool): When `axis` is specified, whether or not to keep
            the reduced axes?  (default :obj:`False`)
        batch_axis: The batch axes to be reduced when computing
            expectation over `x`.  If not specified, all axes will be
            treated as batch axes, except the sampling axes.
        name (str): TensorFlow name scope of the graph nodes.
            (default "nvil_estimator")

    Returns:
        (tf.Tensor, tf.Tensor): The `(surrogate, baseline cost)`.

            `surrogate` is the surrogate for optimizing the original target.
            Maximizing/minimizing this surrogate via gradient descent will
            effectively maximize/minimize the original target.

            `baseline cost` is the cost to be minimized for training baseline.
            It will be :obj:`None` if `baseline` is :obj:`None`.
    """
    if baseline is None and not center_by_moving_average:
        raise ValueError('`baseline` is not specified, thus '
                         '`center_by_moving_average` must be True.')

    values = tf.convert_to_tensor(values)  # f(x,z)
    latent_log_joint = tf.convert_to_tensor(latent_log_joint)  # log q(z|x)
    if baseline is not None:
        baseline = tf.convert_to_tensor(baseline)
    dtype = values.dtype

    @contextmanager
    def mk_scope():
        if center_by_moving_average:
            with tf.variable_scope(None, default_name=name
                                   or 'nvil_estimator'):
                yield
        else:
            ns_values = [values, latent_log_joint]
            if baseline is not None:
                ns_values += [baseline]
            with tf.name_scope(name or 'nvil_estimator', values=ns_values):
                yield

    with mk_scope():
        l_signal = values
        baseline_cost = None

        # compute the baseline cost
        if baseline is not None:
            # baseline_cost = E[(f(x,z)-C(x)-c)^2]
            with tf.name_scope('baseline_cost'):
                baseline_cost = tf.square(
                    tf.stop_gradient(l_signal) - baseline)
                if axis is not None:
                    baseline_cost = tf.reduce_mean(baseline_cost,
                                                   axis,
                                                   keepdims=keepdims)

            l_signal = l_signal - baseline

        # estimate `c` by moving average
        if center_by_moving_average:
            with tf.name_scope('center_by_moving_average'):
                batch_center = tf.reduce_mean(l_signal,
                                              axis=batch_axis,
                                              keepdims=True)
                moving_mean_shape = get_static_shape(batch_center)
                if None in moving_mean_shape:
                    raise ValueError(
                        'The shape of `values` after `batch_axis` having been '
                        'reduced must be static: values {}, batch_axis {}'.
                        format(values, batch_axis))
                moving_mean = tf.get_variable(
                    'moving_mean',
                    shape=moving_mean_shape,
                    initializer=tf.constant_initializer(0.),
                    trainable=False,
                    dtype=dtype)

                # moving_mean := decay * moving_mean + (1 - decay) * batch_center
                one_minus_decay = convert_to_tensor_and_cast(1. - decay, dtype)
                moving_mean = moving_mean.assign(
                    moving_mean -
                    (moving_mean - batch_center) * one_minus_decay)
                l_signal = l_signal - moving_mean

        # compute the nvil cost
        with tf.name_scope('cost'):
            cost = tf.stop_gradient(l_signal) * latent_log_joint + values
            if axis is not None:
                cost = tf.reduce_mean(cost, axis, keepdims=keepdims)

        return cost, baseline_cost
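
A hedged wiring sketch (not from the original source) showing how the estimator might be plugged into a training step; `log_p_joint`, `log_q_z`, `baseline_net` and `x` are assumed names, with the sampling axis at dimension 0.

f = log_p_joint - log_q_z                 # target f(x, z), here an ELBO-style term
surrogate, baseline_cost = nvil_estimator(values=f,
                                          latent_log_joint=log_q_z,
                                          baseline=baseline_net(x),
                                          axis=0)
loss = -tf.reduce_mean(surrogate) + tf.reduce_mean(baseline_cost)
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)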
Example #6
    def __init__(self, size, strict=False, dtype=tf.float32, epsilon=1e-6,
                 trainable=True, random_state=None, name=None, scope=None):
        """
        Construct a new :class:`InvertibleMatrix`.

        Args:
            size (int or (int, int)): Size of the matrix.
            strict (bool): If :obj:`True`, will derive the matrix using a
                variant of PLU decomposition, to enforce invertibility
                (see above).  If :obj:`False`, the matrix will only be
                initialized to be an orthogonal invertible matrix, without
                further constraint.  (default :obj:`False`)
            dtype (tf.DType): The data type of the variables.
            epsilon: Small float to avoid dividing by zero or taking
                logarithm of zero.
            trainable (bool): Whether or not the parameters are trainable?
            random_state (np.random.RandomState): Use this random state,
                instead of constructing a :class:`VarScopeRandomState`.
        """
        from tfsnippet.ops import convert_to_tensor_and_cast

        # validate the arguments
        def validate_shape():
            if is_integer(size):
                shape = (int(size),) * 2
            else:
                h, w = size
                shape = (int(h), int(w))
            if shape[0] != shape[1] or shape[0] < 1:
                raise ValueError()
            return shape

        try:
            shape = validate_shape()
        except Exception:
            raise ValueError('`size` is not valid for a square matrix: {!r}.'.
                             format(size))

        strict = bool(strict)
        dtype = tf.as_dtype(dtype)

        self._shape = shape
        self._strict = strict
        self._dtype = dtype
        self._epsilon = epsilon

        # initialize the variable scope and the random state
        super(InvertibleMatrix, self).__init__(name=name, scope=scope)
        if random_state is None:
            random_state = VarScopeRandomState(self.variable_scope)
        self._random_state = random_state

        # generate the initial orthogonal matrix
        initial_matrix = la.qr(random_state.normal(size=shape))[0]

        # helper: add a histogram summary and a numerics check to a tensor
        def check_tensor(tensor, name=None):
            if name is None:
                name = tensor.name.rsplit('/')[-1]
                if name.endswith(':0'):
                    name = name[:-2]
            maybe_add_histogram(tensor, name, strip_scope=True)
            return maybe_check_numerics(tensor, name)

        # create the variables
        with reopen_variable_scope(self.variable_scope):
            if not strict:
                # the matrix
                self._matrix = check_tensor(
                    model_variable(
                        'matrix',
                        initializer=tf.constant(initial_matrix, dtype=dtype),
                        dtype=dtype,
                        trainable=trainable
                    )
                )
                self._inv_matrix = check_tensor(
                    tf.matrix_inverse(self._matrix, name='inv_matrix'))

                # log_det
                if is_tensorflow_version_higher_or_equal('1.10.0'):
                    self._log_det = tf.linalg.slogdet(
                        self._matrix, name='log_det')[1]
                else:
                    # older versions of TensorFlow do not have a gradient op
                    # for `slogdet`, thus we have to derive it as follows:
                    with tf.name_scope('log_det', values=[self._matrix]):
                        m = convert_to_tensor_and_cast(self._matrix, tf.float64)
                        self._log_det = tf.log(
                            tf.maximum(tf.abs(tf.matrix_determinant(m)),
                                       epsilon)
                        )
                        self._log_det = \
                            convert_to_tensor_and_cast(self._log_det, dtype)

                self._log_det = check_tensor(self._log_det, 'log_det')

            else:
                initial_P, initial_L, initial_U = la.lu(initial_matrix)
                initial_s = np.diag(initial_U)
                initial_sign = np.sign(initial_s)
                initial_log_s = np.log(
                    np.maximum(np.abs(initial_s), self._epsilon))
                initial_U = np.triu(initial_U, k=1)

                # TODO: use PermutationMatrix to derive P once we can export it
                #
                # PermutationMatrix is faster, however, it cannot be exported
                # by just saving the TensorFlow variables.  Thus for the time
                # being, we have to use a true TensorFlow variable to derive P.
                #
                # P = self._P = PermutationMatrix(initial_P)

                P = self._P = model_variable(
                    'P',
                    initializer=tf.constant(initial_P, dtype=dtype),
                    dtype=dtype,
                    trainable=False
                )
                pre_L = self._pre_L = check_tensor(
                    model_variable(
                        'pre_L',
                        initializer=tf.constant(initial_L, dtype=dtype),
                        dtype=dtype,
                        trainable=trainable
                    )
                )
                pre_U = self._pre_U = check_tensor(
                    model_variable(
                        'pre_U',
                        initializer=tf.constant(initial_U, dtype=dtype),
                        dtype=dtype,
                        trainable=trainable
                    )
                )
                sign = self._sign = model_variable(
                    'sign',
                    initializer=tf.constant(initial_sign, dtype=dtype),
                    dtype=dtype,
                    trainable=False
                )
                log_s = self._log_s = check_tensor(
                    model_variable(
                        'log_s',
                        initializer=tf.constant(initial_log_s, dtype=dtype),
                        dtype=dtype,
                        trainable=trainable
                    )
                )

                with tf.name_scope('L', values=[pre_L]):
                    L_mask = tf.constant(np.tril(np.ones(shape), k=-1),
                                         dtype=dtype)
                    L = self._L = check_tensor(
                        L_mask * pre_L + tf.eye(*shape, dtype=dtype), 'L')

                with tf.name_scope('U', values=[pre_U, sign, log_s]):
                    U_mask = tf.constant(np.triu(np.ones(shape), k=1),
                                         dtype=dtype)
                    U = self._U = check_tensor(
                        U_mask * pre_U + tf.diag(sign * tf.exp(log_s)), 'U')

                with tf.name_scope('matrix', values=[P, L, U]):
                    self._matrix = check_tensor(
                        tf.matmul(P, tf.matmul(L, U), name='matrix'))

                with tf.name_scope('inv_matrix', values=[P, L, U]):
                    self._inv_matrix = check_tensor(
                        tf.matmul(
                            check_tensor(
                                tf.matrix_inverse(U, name='inv_U')),
                            tf.matmul(
                                check_tensor(
                                    tf.matrix_inverse(L, name='inv_L')),
                                check_tensor(
                                    tf.matrix_inverse(P, name='inv_P')),
                            ),
                            name='inv_matrix'
                        )
                    )

                with tf.name_scope('log_det', values=[log_s]):
                    self._log_det = check_tensor(
                        tf.reduce_sum(log_s, name='log_det'))
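
A hedged usage sketch (not from the original source): applying the learned matrix as an invertible linear transform, e.g. in a normalizing-flow layer. Public accessors are not shown in this snippet, so the private attributes are read directly; `h` is an assumed [batch, 64] float tensor.

m = InvertibleMatrix(size=64, strict=True)
y = tf.matmul(h, m._matrix)            # forward transform
h_rec = tf.matmul(y, m._inv_matrix)    # inverse transform recovers h
log_det_jacobian = m._log_det          # scalar log|det(matrix)|, used by the flow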