Example #1
  def _prepare_local(self, var_device, var_dtype, apply_state):
    lr_t = tf.identity(self._get_hyper('learning_rate', var_dtype))
    beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype))
    beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype))
    local_step = tf.cast(self.iterations + 1, var_dtype)
    next_step = tf.cast(self.iterations + 2, var_dtype)

    decay_base = tf.cast(0.96, var_dtype)

    m_t = beta_1_t * (1. - 0.5 * (
        tf.pow(decay_base, self._initial_decay * local_step)))
    m_t_1 = beta_1_t * (1. - 0.5 * (
        tf.pow(decay_base, self._initial_decay * next_step)))

    m_schedule_new = tf.cast(self._m_cache_read, var_dtype) * m_t
    if var_dtype is self._m_cache.dtype:
      m_schedule_new = tf.identity(tf.compat.v1.assign(
          self._m_cache, m_schedule_new, use_locking=self._use_locking))
    m_schedule_next = m_schedule_new * m_t_1

    apply_state[(var_device, var_dtype)] = dict(
        lr_t=lr_t,
        neg_lr_t=-lr_t,
        epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
        beta_1_t=beta_1_t,
        beta_2_t=beta_2_t,
        m_t=m_t,
        m_t_1=m_t_1,
        one_minus_beta_1_t=1 - beta_1_t,
        one_minus_beta_2_t=1 - beta_2_t,
        one_minus_m_t=1. - m_t,
        one_minus_m_schedule_new=1. - m_schedule_new,
        one_minus_m_schedule_next=1. - m_schedule_next,
        v_t_prime_denominator=1. - tf.pow(beta_2_t, local_step),
    )
Example #2
def get_beta_accumulators(opt, dtype):
  """Returns beta_1 and beta_2 raised to the optimizer's local step."""
  local_step = tf.cast(opt.iterations + 1, dtype)
  beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype)
  beta_1_power = tf.pow(beta_1_t, local_step)
  beta_2_t = tf.cast(opt._get_hyper("beta_2"), dtype)
  beta_2_power = tf.pow(beta_2_t, local_step)
  return (beta_1_power, beta_2_power)
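
A minimal usage sketch for the helper above, assuming `opt` is a TF2 legacy-style optimizer (e.g. `tf.keras.optimizers.legacy.Adam`) that still exposes `_get_hyper` and an `iterations` counter; the variable and gradient values are illustrative:

import tensorflow as tf

opt = tf.keras.optimizers.legacy.Adam(beta_1=0.9, beta_2=0.999)
var = tf.Variable([1.0, 2.0])
opt.apply_gradients([(tf.constant([0.1, 0.1]), var)])  # one step, illustrative gradient

# After one step iterations == 1, so local_step == 2 and the helper
# returns beta_1 ** 2 and beta_2 ** 2.
beta_1_power, beta_2_power = get_beta_accumulators(opt, tf.float32)
print(float(beta_1_power), float(beta_2_power))  # ~0.81, ~0.998
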
Example #3
    def _cdf(self, x):
        loc = tf.convert_to_tensor(self.loc)
        scale = tf.convert_to_tensor(self.scale)
        power = tf.convert_to_tensor(self.power)
        ipower = tf.math.reciprocal(power)
        half = tf.constant(0.5, dtype=self.dtype)  # 0.5 is fp64 in numpy

        # For the CDF computation, we need to use a double-where a la:
        # https://github.com/tensorflow/probability/blob/master/discussion/where-nan.pdf
        # to avoid NaN gradients. This comes from computing (loc - x) ** power when
        # x > loc. If power is not an even integer, then this value is not defined
        # or is negative, both of which are not valid values for `igamma`.

        loc_stop_grad = tf.stop_gradient(loc)
        # Use values that are right below loc and above loc. At loc, this will
        # result in `gamma|igamma(c)(1. / power, 0.)`. This has an undefined
        # gradient at 0.
        safe_x_lt_loc = tf.where(x > loc_stop_grad, loc_stop_grad - half, x)
        safe_x_gt_loc = tf.where(x < loc_stop_grad, loc_stop_grad + half, x)
        cdf = tf.where(
            x < loc,
            half *
            tf.math.igammac(ipower, tf.pow(
                (loc - safe_x_lt_loc) / scale, power)), half + half *
            tf.math.igamma(ipower, tf.pow(
                (safe_x_gt_loc - loc) / scale, power)))
        return cdf
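
The double-where trick referenced above can be shown in isolation. This toy sketch (not from the source) demonstrates why the unused branch must be fed safe values: the gradient of `tf.where` multiplies the untaken branch's gradient by zero, and 0 * NaN is still NaN.

import tensorflow as tf

x = tf.constant([-1.0, 4.0])
with tf.GradientTape() as tape:
    tape.watch(x)
    naive = tf.where(x > 0, tf.sqrt(x), tf.zeros_like(x))
print(tape.gradient(naive, x))  # [nan, 0.25]: sqrt(-1) poisons the gradient

with tf.GradientTape() as tape:
    tape.watch(x)
    safe_x = tf.where(x > 0, x, tf.ones_like(x))  # clamp the unused branch
    safe = tf.where(x > 0, tf.sqrt(safe_x), tf.zeros_like(x))
print(tape.gradient(safe, x))  # [0.0, 0.25]
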
Example #4
  def update_step(self, gradient, variable):
    """Update step given gradient and the associated model variable."""
    if self._var_key(variable) not in self._index_dict:
      raise KeyError(f'Optimizer cannot recognize variable {variable.name}, '
                     f'this usually means you are calling an optimizer '
                     f'previously used on a different model. Please try '
                     f'creating a new optimizer instance.')

    lr = tf.cast(self.learning_rate, variable.dtype)
    var_key = self._var_key(variable)
    accum = self._accumulators[self._index_dict[var_key]]
    linear = self._linears[self._index_dict[var_key]]

    lr_power = self.learning_rate_power
    l2_reg = self.l2_regularization_strength
    l2_reg = (l2_reg + self.beta / (2. * lr))

    # The FTRL optimizer uses the same implementation for sparse and dense
    # gradient updates.
    grad_to_use = (
        gradient + 2 * self.l2_shrinkage_regularization_strength * variable)
    new_accum = accum + tf.pow(gradient, 2)
    linear.assign_add(grad_to_use -
                      (tf.pow(new_accum, -lr_power) -
                       tf.pow(accum, -lr_power)) / lr * variable)
    quadratic = tf.pow(new_accum,
                       (-lr_power)) / lr + 2 * l2_reg
    linear_clipped = tf.clip_by_value(linear,
                                      -self.l1_regularization_strength,
                                      self.l1_regularization_strength)
    variable.assign((linear_clipped - linear) / quadratic)
    accum.assign(new_accum)
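
A short usage sketch, assuming the `update_step` above belongs to Keras' built-in FTRL optimizer (`tf.keras.optimizers.Ftrl`); the values are illustrative:

import tensorflow as tf

opt = tf.keras.optimizers.Ftrl(learning_rate=0.1,
                               l1_regularization_strength=0.01)
var = tf.Variable([1.0, -2.0])
opt.apply_gradients([(tf.constant([0.5, 0.5]), var)])  # illustrative gradient
print(var.numpy())  # moved against the gradient and shrunk by the L1 term
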
Example #5
File: ftrl.py Project: paolodedios/keras
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""

        lr = tf.cast(self.learning_rate, variable.dtype)
        var_key = self._var_key(variable)
        accum = self._accumulators[self._index_dict[var_key]]
        linear = self._linears[self._index_dict[var_key]]

        lr_power = self.learning_rate_power
        l2_reg = self.l2_regularization_strength
        l2_reg = l2_reg + self.beta / (2.0 * lr)

        # The FTRL optimizer uses the same implementation for sparse and
        # dense gradient updates.
        grad_to_use = (
            gradient +
            2 * self.l2_shrinkage_regularization_strength * variable)
        new_accum = accum + tf.pow(gradient, 2)
        linear.assign_add(grad_to_use -
                          (tf.pow(new_accum, -lr_power) -
                           tf.pow(accum, -lr_power)) / lr * variable)
        quadratic = tf.pow(new_accum, (-lr_power)) / lr + 2 * l2_reg
        linear_clipped = tf.clip_by_value(
            linear,
            -self.l1_regularization_strength,
            self.l1_regularization_strength,
        )
        variable.assign((linear_clipped - linear) / quadratic)
        accum.assign(new_accum)
Example #6
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        var_dtype = variable.dtype
        lr = tf.cast(self.learning_rate, var_dtype)
        local_step = tf.cast(self.iterations + 1, var_dtype)
        next_step = tf.cast(self.iterations + 2, var_dtype)
        decay = tf.cast(0.96, var_dtype)
        beta_1 = tf.cast(self.beta_1, var_dtype)
        beta_2 = tf.cast(self.beta_2, var_dtype)
        u_t = beta_1 * (1.0 - 0.5 * (tf.pow(decay, local_step)))
        u_t_1 = beta_1 * (1.0 - 0.5 * (tf.pow(decay, next_step)))

        def get_cached_u_product():
            return self._u_product

        def compute_new_u_product():
            u_product_t = self._u_product * u_t
            self._u_product.assign(u_product_t)
            self._u_product_counter += 1
            return u_product_t

        u_product_t = tf.cond(
            self._u_product_counter == (self.iterations + 2),
            true_fn=get_cached_u_product,
            false_fn=compute_new_u_product,
        )
        u_product_t_1 = u_product_t * u_t_1
        beta_2_power = tf.pow(beta_2, local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            m.assign_add(-m * (1 - beta_1))
            m.scatter_add(
                tf.IndexedSlices(gradient.values * (1 - beta_1),
                                 gradient.indices))
            v.assign_add(-v * (1 - beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - beta_2),
                    gradient.indices))
            m_hat = u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / (
                1 - u_product_t)
            v_hat = v / (1 - beta_2_power)

            variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - beta_2))
            m_hat = u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / (
                1 - u_product_t)
            v_hat = v / (1 - beta_2_power)

            variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon))
Example #7
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = []

        with tf.control_dependencies(
            [tf.compat.v1.assign_add(self.iterations, 1)]
        ):
            t = tf.cast(self.iterations, backend.floatx())

        # Per the recommendation in [2], use a warming momentum schedule.
        momentum_cache_t = self.beta_1 * (
            1.0
            - 0.5
            * (tf.pow(backend.cast_to_floatx(0.96), t * self.schedule_decay))
        )
        momentum_cache_t_1 = self.beta_1 * (
            1.0
            - 0.5
            * (
                tf.pow(
                    backend.cast_to_floatx(0.96), (t + 1) * self.schedule_decay
                )
            )
        )
        m_schedule_new = self.m_schedule * momentum_cache_t
        m_schedule_next = (
            self.m_schedule * momentum_cache_t * momentum_cache_t_1
        )
        self.updates.append((self.m_schedule, m_schedule_new))

        ms, vs = self._create_all_weights(params)

        for p, g, m, v in zip(params, grads, ms, vs):
            # the following equations given in [1]
            g_prime = g / (1.0 - m_schedule_new)
            m_t = self.beta_1 * m + (1.0 - self.beta_1) * g
            m_t_prime = m_t / (1.0 - m_schedule_next)
            v_t = self.beta_2 * v + (1.0 - self.beta_2) * tf.square(g)
            v_t_prime = v_t / (1.0 - tf.pow(self.beta_2, t))
            m_t_bar = (
                1.0 - momentum_cache_t
            ) * g_prime + momentum_cache_t_1 * m_t_prime

            self.updates.append(tf.compat.v1.assign(m, m_t))
            self.updates.append(tf.compat.v1.assign(v, v_t))

            p_t = p - self.lr * m_t_bar / (
                backend.sqrt(v_t_prime) + self.epsilon
            )
            new_p = p_t

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates
Example #8
File: utils.py Project: kthakore/edward2
def cosine_distance(x, y):
    """Cosine distance between vectors x and y."""
    x_norm = tf.math.sqrt(tf.reduce_sum(tf.pow(x, 2), axis=-1))
    x_norm = tf.reshape(x_norm, (-1, 1))
    y_norm = tf.math.sqrt(tf.reduce_sum(tf.pow(y, 2), axis=-1))
    y_norm = tf.reshape(y_norm, (-1, 1))
    normalized_x = x / x_norm
    normalized_y = y / y_norm
    return tf.reduce_mean(tf.reduce_sum(normalized_x * normalized_y, axis=-1))
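
A quick sanity check for the helper above (illustrative inputs): identical row vectors give 1.0 and orthogonal ones give 0.0. Note that, despite its name, the function returns the mean cosine similarity rather than a distance.

import tensorflow as tf

x = tf.constant([[1.0, 0.0], [0.0, 2.0]])
y = tf.constant([[0.0, 1.0], [2.0, 0.0]])
print(float(cosine_distance(x, x)))  # 1.0
print(float(cosine_distance(x, y)))  # 0.0
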
Example #9
    def _cdf(self, x):
        loc = tf.convert_to_tensor(self.loc)
        scale = tf.convert_to_tensor(self.scale)
        power = tf.convert_to_tensor(self.power)
        ipower = tf.math.reciprocal(power)
        half = tf.constant(0.5, dtype=self.dtype)  # 0.5 is fp64 in numpy
        cdf = tf.where(
            x < loc,
            half * tf.math.igammac(ipower, tf.pow((loc - x) / scale, power)),
            half + half * tf.math.igamma(
                ipower, tf.pow((x - loc) / scale, power)))
        return cdf
Example #10
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
        beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)

        # Apply step weight decay
        if (
            self.weight_decay != 0
            and variable not in self._exclude_from_weight_decay
        ):
            wd = tf.cast(self.weight_decay, variable.dtype)
            variable.assign_sub(variable * wd)

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(
                    gradient.values * (1 - self.beta_1), gradient.indices
                )
            )
            v.assign_add(-v * (1 - self.beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - self.beta_2),
                    gradient.indices,
                )
            )
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
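
The dense branch above can be checked by hand for a single scalar step; this sketch reproduces the same arithmetic with illustrative values. At local_step == 1 the bias-corrected first Adam update works out to roughly lr * sign(gradient).

g, lr, b1, b2, eps = 0.5, 0.001, 0.9, 0.999, 1e-7  # illustrative values
m = (g - 0.0) * (1 - b1)                 # m starts at zero
v = (g * g - 0.0) * (1 - b2)             # v starts at zero
alpha = lr * (1 - b2) ** 0.5 / (1 - b1)  # bias correction at step 1
print((m * alpha) / (v ** 0.5 + eps))    # ~0.001, i.e. ~lr
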
Example #11
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        if self._var_key(variable) not in self._index_dict:
            raise KeyError(
                f'Optimizer cannot recognize variable {variable.name}, '
                f'this usually means you are calling an optimizer '
                f'previously used on a different model. Please try '
                f'creating a new optimizer instance.')
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
        beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        alpha = (lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))

        # Apply step weight decay
        if self.weight_decay != 0:
            wd = tf.cast(self.weight_decay, variable.dtype)
            variable.assign_sub(variable * wd * lr)

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(gradient.values * (1 - self.beta_1),
                                 gradient.indices))
            v.assign_add(-v * (1 - self.beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - self.beta_2),
                    gradient.indices))
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
Example #12
def geomspace(start, stop, num=50, endpoint=True, dtype=float):
    """Returns `num` values from a geometric progression.

  The ratio of any two consecutive values in the output sequence is constant.
  This is similar to `logspace`, except the endpoints are specified directly
  instead of as powers of a base.

  Args:
    start: start of the geometric progression.
    stop: end of the geometric progression. This is included in the output
      if endpoint is true.
    num: Number of values to sample. Defaults to 50.
    endpoint: Whether to include `stop` in the output. Defaults to true.
    dtype: Optional. Type of the resulting ndarray. Could be a python type, a
      NumPy type or a TensorFlow `DType`. If not provided, it is figured from
      input args.

  Returns:
    An ndarray.

  Raises:
    ValueError: If there is an error in the arguments.
  """
    # TODO(srbs): Check whether dtype is handled properly.
    if dtype:
        dtype = utils.to_tf_type(dtype)
    if num < 0:
        raise ValueError(
            'Number of samples {} must be non-negative.'.format(num))
    if not num:
        return empty([0])
    if start == 0:
        raise ValueError('start: {} must be non-zero.'.format(start))
    if stop == 0:
        raise ValueError('stop: {} must be non-zero.'.format(stop))
    if np_sign(start) != np_sign(stop):
        raise ValueError('start: {} and stop: {} must have same sign.'.format(
            start, stop))
    step = 1.
    if endpoint:
        if num > 1:
            step = tf.pow((stop / start), 1 / (num - 1))
    else:
        step = tf.pow((stop / start), 1 / num)
    # Guard the num == 1 case, where `step` is still a Python float.
    step = tf.convert_to_tensor(step)
    result = tf.cast(tf.range(num), step.dtype)
    result = tf.pow(step, result)
    result = tf.multiply(result, start)
    if dtype:
        result = tf.cast(result, dtype=dtype)
    return utils.tensor_to_ndarray(result)
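
A usage sketch via the TF NumPy surface, assuming this function is the implementation behind `tf.experimental.numpy.geomspace`:

import tensorflow.experimental.numpy as tnp

# step = (1000 / 1) ** (1 / 3) == 10, so the sequence is 1, 10, 100, 1000.
print(tnp.geomspace(1.0, 1000.0, num=4))
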
Example #13
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        if self._var_key(variable) not in self._index_dict:
            raise KeyError(
                f'Optimizer cannot recognize variable {variable.name}, '
                f'this usually means you are calling an optimizer '
                f'previously used on a different model. Please try '
                f'creating a new optimizer instance.')
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._m[self._index_dict[var_key]]
        u = self._u[self._index_dict[var_key]]

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            indices = gradient.indices
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices))
            u.assign(u * self.beta_2)
            u_slice = tf.gather(u, indices)
            u_slice_incremental = tf.maximum(u_slice, tf.abs(
                gradient.values)) - u_slice
            u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices))
            variable.assign_sub(
                (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient)))
            variable.assign_sub(
                (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
Example #14
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._m[self._index_dict[var_key]]
        u = self._u[self._index_dict[var_key]]

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            indices = gradient.indices
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices))
            u.assign(u * self.beta_2)
            u_slice = tf.gather(u, indices)
            u_slice_incremental = (
                tf.maximum(u_slice, tf.abs(gradient.values)) - u_slice)
            u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices))
            variable.assign_sub(
                (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient)))
            variable.assign_sub(
                (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
Example #15
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (  # pylint: disable=g-no-augmented-assignment
                1. / (1. + self.decay *
                      tf.cast(self.iterations, backend.dtype(self.decay))))

        with tf.control_dependencies(
            [tf.compat.v1.assign_add(self.iterations, 1)]):
            t = tf.cast(self.iterations, backend.floatx())
        lr_t = lr / (1. - tf.pow(self.beta_1, t))

        ms, us = self._create_all_weights(params)

        for p, g, m, u in zip(params, grads, ms, us):

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            u_t = tf.maximum(self.beta_2 * u, tf.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append(tf.compat.v1.assign(m, m_t))
            self.updates.append(tf.compat.v1.assign(u, u_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates
Example #16
    def __call__(self, step):
        with tf.name_scope(self.name or "PolynomialDecay") as name:
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate")
            dtype = initial_learning_rate.dtype
            end_learning_rate = tf.cast(self.end_learning_rate, dtype)
            power = tf.cast(self.power, dtype)

            global_step_recomp = tf.cast(step, dtype)
            decay_steps_recomp = tf.cast(self.decay_steps, dtype)
            if self.cycle:
                # Find the first multiple of decay_steps that is bigger than
                # global_step. If global_step is zero, set the multiplier to 1.
                multiplier = tf.where(
                    tf.equal(global_step_recomp, 0), 1.0,
                    tf.math.ceil(global_step_recomp / self.decay_steps))
                decay_steps_recomp = tf.multiply(decay_steps_recomp,
                                                 multiplier)
            else:
                # Make sure that the global_step used is not bigger than decay_steps.
                global_step_recomp = tf.minimum(global_step_recomp,
                                                decay_steps_recomp)

            p = tf.divide(global_step_recomp, decay_steps_recomp)
            return tf.add(tf.multiply(
                initial_learning_rate - end_learning_rate,
                tf.pow(1 - p, power)),
                          end_learning_rate,
                          name=name)
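
A short usage sketch, assuming this `__call__` belongs to `tf.keras.optimizers.schedules.PolynomialDecay`:

import tensorflow as tf

schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=0.1, decay_steps=100,
    end_learning_rate=0.01, power=2.0)
# At step 50: p == 0.5, so lr == 0.01 + (0.1 - 0.01) * 0.5 ** 2 == 0.0325.
print(float(schedule(0)), float(schedule(50)), float(schedule(100)))
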
Example #17
    def __call__(self, step):
        with tf.name_scope(self.name or "NoisyLinearCosineDecay") as name:
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate")
            dtype = initial_learning_rate.dtype
            decay_steps = tf.cast(self.decay_steps, dtype)
            initial_variance = tf.cast(self.initial_variance, dtype)
            variance_decay = tf.cast(self.variance_decay, dtype)
            num_periods = tf.cast(self.num_periods, dtype)
            alpha = tf.cast(self.alpha, dtype)
            beta = tf.cast(self.beta, dtype)

            global_step_recomp = tf.cast(step, dtype)
            global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
            linear_decayed = (decay_steps - global_step_recomp) / decay_steps
            variance = initial_variance / (tf.pow(1.0 + global_step_recomp,
                                                  variance_decay))
            std = tf.sqrt(variance)
            noisy_linear_decayed = (
                linear_decayed +
                tf.random.normal(linear_decayed.shape, stddev=std))

            completed_fraction = global_step_recomp / decay_steps
            fraction = 2.0 * num_periods * completed_fraction
            cosine_decayed = 0.5 * (1.0 +
                                    tf.cos(tf.constant(math.pi) * fraction))
            noisy_linear_cosine_decayed = (
                (alpha + noisy_linear_decayed) * cosine_decayed + beta)

            return tf.multiply(initial_learning_rate,
                               noisy_linear_cosine_decayed,
                               name=name)
Example #18
def mix_white_noise(audio, noise_level_db):
  """Adds white noise with stddev 10 ** (noise_level_db / 10) times the audio RMS."""
  _, variance = tf.nn.moments(audio, axes=[0])
  audio_rms = tf.math.sqrt(variance)
  noise_rms = tf.pow(10.0, noise_level_db / 10.0) * audio_rms
  noise = tf.random.normal(
      tf.shape(audio), mean=0.0, stddev=noise_rms, dtype=tf.float32)
  return audio + noise
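
A minimal usage sketch with synthetic audio (sample rate and levels are illustrative). Note the helper scales the noise stddev by 10 ** (noise_level_db / 10) relative to the signal RMS, i.e. a power-style dB convention:

import math
import tensorflow as tf

t = tf.linspace(0.0, 1.0, 16000)
audio = tf.sin(2.0 * math.pi * 440.0 * t)             # a 440 Hz tone
noisy = mix_white_noise(audio, noise_level_db=-20.0)  # noise stddev = 0.01 * RMS
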
Example #19
    def _variance(self):
        tailweight = tf.convert_to_tensor(self.tailweight)
        scale = tf.convert_to_tensor(self.scale)
        # For tail < 0.5, the variance is finite. See Eq (18) in
        # https://www.hindawi.com/journals/tswj/2015/909231/
        var = (
            tf.cast(tf.pow(1. - 2. * tailweight, -3. / 2.), dtype=self.dtype) *
            tf.math.square(scale))
        # We need to put the tf.where inside the outer tf.where to ensure we never
        # hit a NaN in the gradient.
        result_where_defined = tf.where(
            tailweight < 0.5, var,
            tf.convert_to_tensor(np.inf, dtype=self.dtype))

        if self.allow_nan_stats:
            return tf.where(tailweight < 1.0, result_where_defined,
                            tf.convert_to_tensor(np.nan, self.dtype))
        else:
            return distribution_util.with_dependencies([
                assert_util.assert_greater_equal(
                    tf.ones([], dtype=self.dtype),
                    tailweight,
                    message=
                    "variance not defined for components of tailweight >= 1"),
            ], result_where_defined)
Example #20
def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None):
  """Returns `num` values sampled evenly on a log scale.

  Equivalent to `base ** linspace(start, stop, num, endpoint)`.

  Args:
    start: base**start is the start of the output sequence.
    stop: If `endpoint` is true and num > 1, base ** stop is included in the
      output. If `endpoint` is false, `num` + 1 values are sampled linearly in
      [start, stop] (both inclusive) and the last value is dropped before
      raising to the power of `base`.
    num: Number of values to sample. Defaults to 50.
    endpoint: Whether to include `base ** stop` in the output. Defaults to true.
    base: Base of the log space.
    dtype: Optional. Type of the resulting ndarray. Could be a python type, a
      NumPy type or a TensorFlow `DType`. If not provided, it is figured from
      input args.
  """
  # TODO(srbs): Check whether dtype is handled properly.
  if dtype:
    dtype = utils.to_tf_type(dtype)
  result = linspace(start, stop, num=num, endpoint=endpoint)
  result = tf.pow(base, result.data)
  if dtype:
    result = utils.maybe_cast(result, dtype)
  return utils.tensor_to_ndarray(result)
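
Usage sketch, assuming this is the implementation behind `tf.experimental.numpy.logspace`:

import tensorflow.experimental.numpy as tnp

print(tnp.logspace(0, 3, num=4))  # ~[1., 10., 100., 1000.]
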
Example #21
  def update(self, expert_dataset_iter, replay_buffer_iter):
    """Performs a single training step for critic and actor.

    Args:
      expert_dataset_iter: A TensorFlow graph iterable over expert data.
      replay_buffer_iter: A TensorFlow graph iterable over the replay buffer.
    """
    expert_states, expert_actions, _ = next(expert_dataset_iter)
    policy_states, policy_actions, _, _, _ = next(replay_buffer_iter)[0]

    policy_inputs = tf.concat([policy_states, policy_actions], -1)
    expert_inputs = tf.concat([expert_states, expert_actions], -1)

    alpha = tf.random.uniform(shape=(policy_inputs.get_shape()[0], 1))
    inter = alpha * policy_inputs + (1 - alpha) * expert_inputs

    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(self.discriminator.variables)
      policy_output = self.discriminator(policy_inputs)
      expert_output = self.discriminator(expert_inputs)

      # Using the standard value for label smoothing instead of 0.25.
      classification_loss = tfgan_losses.modified_discriminator_loss(
          expert_output, policy_output, label_smoothing=0.0)

      with tf.GradientTape(watch_accessed_variables=False) as tape2:
        tape2.watch(inter)
        output = self.discriminator(inter)

      grad = tape2.gradient(output, [inter])[0]
      grad_penalty = tf.reduce_mean(tf.pow(tf.norm(grad, axis=-1) - 1, 2))
      total_loss = classification_loss + self.grad_penalty_coeff * grad_penalty

    grads = tape.gradient(total_loss, self.discriminator.variables)

    self.optimizer.apply_gradients(zip(grads, self.discriminator.variables))

    self.avg_classification_loss(classification_loss)
    self.avg_gp_loss(grad_penalty)
    self.avg_total_loss(total_loss)

    if tf.equal(self.optimizer.iterations % self.log_interval, 0):
      tf.summary.scalar(
          'train gail/classification loss',
          self.avg_classification_loss.result(),
          step=self.optimizer.iterations)
      self.avg_classification_loss.reset_states()

      tf.summary.scalar(
          'train gail/gradient penalty',
          self.avg_gp_loss.result(),
          step=self.optimizer.iterations)
      self.avg_gp_loss.reset_states()

      tf.summary.scalar(
          'train gail/loss',
          self.avg_total_loss.result(),
          step=self.optimizer.iterations)
      self.avg_total_loss.reset_states()
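
The gradient-penalty term used above can be reproduced in isolation. This self-contained sketch (toy discriminator and shapes, all illustrative) computes the WGAN-GP-style penalty on interpolated inputs:

import tensorflow as tf

disc = tf.keras.Sequential([tf.keras.layers.Dense(1)])
policy_inputs = tf.random.normal([8, 6])
expert_inputs = tf.random.normal([8, 6])
alpha = tf.random.uniform([8, 1])
inter = alpha * policy_inputs + (1 - alpha) * expert_inputs

with tf.GradientTape() as tape:
    tape.watch(inter)
    output = disc(inter)
grad = tape.gradient(output, inter)
# Penalize deviation of each sample's gradient norm from 1.
grad_penalty = tf.reduce_mean(tf.pow(tf.norm(grad, axis=-1) - 1.0, 2))
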
Example #22
def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None):
    if dtype:
        dtype = utils.result_type(dtype)
    result = linspace(start, stop, num=num, endpoint=endpoint)
    result = tf.pow(base, result.data)
    if dtype:
        result = tf.cast(result, dtype)
    return arrays.tensor_to_ndarray(result)
Example #23
    def call(self, y_true, y_pred):
        error = tf.pow(tf.abs(tf.squeeze(y_pred) - y_true), self._power)
        target_weights, target_index = self._get_target_weights_and_indices()
        quantiles = ops.softsort(
            error, axis=0, target_weights=target_weights, **self._kwargs)
        return tf.gather(quantiles, target_index, axis=0)
Example #24
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0
                / (
                    1.0
                    + self.decay
                    * tf.cast(self.iterations, backend.dtype(self.decay))
                )
            )

        with tf.control_dependencies(
            [tf.compat.v1.assign_add(self.iterations, 1)]
        ):
            t = tf.cast(self.iterations, backend.floatx())
        lr_t = lr * (
            backend.sqrt(1.0 - tf.pow(self.beta_2, t))
            / (1.0 - tf.pow(self.beta_1, t))
        )

        ms, vs, vhats = self._create_all_weights(params)
        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * tf.square(g)
            if self.amsgrad:
                vhat_t = tf.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (backend.sqrt(vhat_t) + self.epsilon)
                self.updates.append(tf.compat.v1.assign(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (backend.sqrt(v_t) + self.epsilon)

            self.updates.append(tf.compat.v1.assign(m, m_t))
            self.updates.append(tf.compat.v1.assign(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates
Example #25
    def call(self, y_true, y_pred):
        error = tf.pow(tf.abs(tf.squeeze(y_pred) - y_true), self._power)
        width = self._end_quantile - self._start_quantile
        quantile = 0.5 * (self._end_quantile + self._start_quantile)
        return ops.softquantiles(
            error, quantile, quantile_width=width, axis=0, **self._kwargs)
Example #26
def oadam_update(g, alpha, beta_1, beta_2, epsilon, t, m, v):
    """Implements 'Algorithm 1' from [1]."""
    old_m = m
    old_v = v
    m = beta_1 * m + (1. - beta_1) * g      # Biased first moment estimate.
    v = beta_2 * v + (1. - beta_2) * g * g  # Biased second raw moment estimate.
    m_hat = m / (1. - tf.pow(beta_1, t))  # Bias corrected 1st moment estimate.
    v_hat = v / (1. - tf.pow(beta_2, t))  # Bias corrected 2nd moment estimate.
    if t == 1:
        update = alpha * m_hat / (tf.sqrt(v_hat) + epsilon)
    else:
        # Old bias corrected moment estimates.
        old_m_hat = old_m / (1. - tf.pow(beta_1, (t - 1)))
        old_v_hat = old_v / (1. - tf.pow(beta_2, (t - 1)))
        update = alpha * (2 * m_hat / (tf.sqrt(v_hat) + epsilon) - old_m_hat /
                          (tf.sqrt(old_v_hat) + epsilon))
    return update, m, v
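
A minimal driver loop for the function above (toy objective f(x) = x ** 2; `t` stays a Python number so the `t == 1` branch behaves as written):

import tensorflow as tf

x = tf.Variable(3.0)
m = v = tf.zeros_like(x)
for t in range(1, 101):
    with tf.GradientTape() as tape:
        loss = tf.square(x)
    g = tape.gradient(loss, x)
    update, m, v = oadam_update(g, alpha=0.1, beta_1=0.5, beta_2=0.9,
                                epsilon=1e-8, t=float(t), m=m, v=v)
    x.assign_sub(update)
print(x.numpy())  # close to 0 after 100 steps
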
Example #27
def geomspace(start, stop, num=50, endpoint=True, dtype=float):  # pylint: disable=missing-docstring
  if dtype:
    dtype = utils.result_type(dtype)
  if num < 0:
    raise ValueError('Number of samples {} must be non-negative.'.format(num))
  if not num:
    return empty([0])
  step = 1.
  if endpoint:
    if num > 1:
      step = tf.pow((stop / start), 1 / (num - 1))
  else:
    step = tf.pow((stop / start), 1 / num)
  # Guard the num == 1 case, where `step` is still a Python float.
  step = tf.convert_to_tensor(step)
  result = tf.cast(tf.range(num), step.dtype)
  result = tf.pow(step, result)
  result = tf.multiply(result, start)
  if dtype:
    result = tf.cast(result, dtype=dtype)
  return arrays_lib.tensor_to_ndarray(result)
Example #28
    def test_trimmed(self, start, end, power):
        loss_fn = losses.TrimmedRegressionLoss(
            start_quantile=start, end_quantile=end, power=power)
        loss = loss_fn(self._y_true, self._y_pred)
        start_index = int(start * self._num_points)
        end_index = int(end * self._num_points)
        selected = tf.pow(self._values[start_index:end_index], power)
        expected_loss = tf.math.reduce_mean(selected)
        self.assertAllClose(loss, expected_loss, 0.2, 0.2)
Example #29
    def _log_prob(self, x):
        loc = tf.convert_to_tensor(self.loc)
        scale = tf.convert_to_tensor(self.scale)
        power = tf.convert_to_tensor(self.power)
        one = tf.constant(1., dtype=self.dtype)
        two = tf.constant(2., dtype=self.dtype)
        log_normalization = (tf.math.log(two) + tf.math.log(scale) +
                             tf.math.lgamma(one + tf.math.reciprocal(power)))
        log_unnormalized = -tf.pow(tf.abs(x - loc) / scale, power)
        return log_unnormalized - log_normalization
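
This is the log-density of the generalized normal (exponential power) distribution. A standalone numeric check against SciPy, assuming its shape parameter `beta` corresponds to `power` here:

import numpy as np
from scipy.special import gammaln
from scipy.stats import gennorm

x, loc, scale, power = 0.7, 0.0, 1.5, 4.0  # illustrative values
log_normalization = np.log(2.0) + np.log(scale) + gammaln(1.0 + 1.0 / power)
log_unnormalized = -np.abs((x - loc) / scale) ** power
print(log_unnormalized - log_normalization)            # same value as below
print(gennorm.logpdf(x, power, loc=loc, scale=scale))  # SciPy agrees
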
Example #30
    def _prepare_local(self, var_device, var_dtype, apply_state):
        super(Adam, self)._prepare_local(var_device, var_dtype, apply_state)

        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype))
        beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        lr = (apply_state[(var_device, var_dtype)]['lr_t'] *
              (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
        apply_state[(var_device, var_dtype)].update(
            dict(lr=lr,
                 epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                 beta_1_t=beta_1_t,
                 beta_1_power=beta_1_power,
                 one_minus_beta_1_t=1 - beta_1_t,
                 beta_2_t=beta_2_t,
                 beta_2_power=beta_2_power,
                 one_minus_beta_2_t=1 - beta_2_t))