Example #1
def batch_norm(input_,
               dim,
               name,
               scale=True,
               train=True,
               epsilon=1e-8,
               decay=.1,
               axes=[0],
               bn_lag=DEFAULT_BN_LAG):
    """Batch normalization."""
    # create variables
    with tf.variable_scope(name):
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False)
        if scale:
            gamma = variable_on_cpu("gamma", [dim], tf.constant_initializer(1.))
        beta = variable_on_cpu("beta", [dim], tf.constant_initializer(0.))
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_mean -= (1. - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # normalize
    res = (input_ - used_mean) / tf.sqrt(used_var + epsilon)
    # de-normalize
    if scale:
        res *= gamma
    res += beta

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(decay * (mean - cur_mean), "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
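        # multiply by zero so the value is unchanged while still forcing the
        # moving-average update ops above to run with the forward pass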
        res += 0. * new_mean * new_var * new_step

    return res
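The `bn_lag` branch above blends each batch statistic toward the stored moving average and then divides by `1 - bn_lag**(step + 1)`, the same zero-debiasing correction Adam applies to its moments. A plain-Python sketch of that arithmetic with hypothetical values (`bn_lag` and `decay` here are stand-ins, not `DEFAULT_BN_LAG`):

bn_lag, decay = 0.5, 0.1  # hypothetical lag/decay values
moving_mean = 0.0         # zero-initialized, like the "mean" variable
for step, batch_mean in enumerate([1.0, 1.2, 0.8, 1.1]):
    # blend the batch statistic toward the stored moving mean
    used_mean = batch_mean - (1. - bn_lag) * (batch_mean - moving_mean)
    # divide out the bias introduced by the zero initialization
    used_mean /= 1. - bn_lag ** (step + 1)
    # the moving average itself is updated with assign_sub semantics
    moving_mean -= decay * (moving_mean - batch_mean)
    print(step, round(used_mean, 4), round(moving_mean, 4))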
Example #2
def batch_norm_log_diff(input_,
                        dim,
                        name,
                        train=True,
                        epsilon=1e-8,
                        decay=.1,
                        axes=[0],
                        reuse=None,
                        bn_lag=DEFAULT_BN_LAG):
    """Batch normalization with corresponding log determinant Jacobian."""
    if reuse is None:
        reuse = not train
    # create variables
    with tf.variable_scope(name) as scope:
        if reuse:
            scope.reuse_variables()
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False)
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_var = stable_var(input_=input_, mean=used_mean, axes=axes)
            cur_var = used_var
            used_mean -= (1 - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(
                        decay * (mean - cur_mean), "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
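        # multiply by zero to create a data dependency that forces the
        # moving-average update ops above to run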
        used_var += 0. * new_mean * new_var * new_step
    used_var += epsilon

    return used_mean, used_var
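The function returns the moments rather than the normalized tensor so the caller can also form the log-determinant term. For the map x -> (x - mean) / sqrt(var), each dimension contributes -0.5 * log(var); a small NumPy sketch under that assumption (values hypothetical, not from the original code):

import numpy as np

used_var = np.array([0.5, 2.0, 1.0]) + 1e-8  # as returned, epsilon included
log_det_jacobian = -0.5 * np.sum(np.log(used_var))
print(log_det_jacobian)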
Example #3
 def central_step():
     # restore v1, slots
     op5 = tf.group(*[tf.assign(w, v) for w, v in zip(restored_vars, tmp_vars)])
     with tf.get_default_graph().control_dependencies([op5]):
         back = tf.group(*[tf.assign_sub(v, -self._lr_t * grad) for grad, v in grads_and_vars])
         with tf.get_default_graph().control_dependencies([back]):
             return tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
Example #4
 def testInitRequiredAssignSub(self):
   with self.test_session():
     p = tf.Variable(tf.fill([1024, 1024], 1), dtype=tf.int32)
     a = tf.assign_sub(p, tf.fill([1024, 1024], 0))
     with self.assertRaisesOpError("use uninitialized"):
       a.op.run()
Example #5
 def _initAssignSubFetch(self, x, y, use_gpu=False):
   """Initialize a param to init, and compute param -= y."""
   with self.test_session(use_gpu=use_gpu):
     p = tf.Variable(x)
     sub = tf.assign_sub(p, y)
     p.initializer.run()
     new_value = sub.eval()
     return p.eval(), new_value
Example #6
  def exponential_moving_average(self,
                                 var,
                                 avg_var=None,
                                 decay=0.999,
                                 ignore_nan=False):
    """Calculates the exponential moving average.

    TODO(): check if this implementation of moving average can now
    be replaced by TensorFlow's implementation.

    Adds a variable to keep track of the exponential moving average and adds an
    update operation to the bookkeeper. The name of the variable is
    '%s_average' % name prefixed with the current variable scope.

    Args:
       var: The variable for which a moving average should be computed.
       avg_var: The variable to set the average into, if None create a zero
         initialized one.
       decay: How much history to use in the moving average. Must be in
         [0, 1); higher values retain more history.
       ignore_nan: If the value is NaN or Inf, skip it.
    Returns:
       The averaged variable.
    Raises:
      ValueError: if decay is not in [0, 1).
    """
    with self._g.as_default():
      if decay < 0 or decay >= 1.0:
        raise ValueError('Decay is %5.2f, but has to be in [0, 1).' % decay)
      if avg_var is None:
        avg_name = '%s_average' % _bare_var_name(var)
        with tf.control_dependencies(None):
          with tf.name_scope(avg_name + '/Initializer/'):
            if isinstance(var, tf.Variable):
              init_val = var.initialized_value()
            elif var.get_shape().is_fully_defined():
              init_val = tf.constant(0,
                                     shape=var.get_shape(),
                                     dtype=var.dtype.base_dtype)
            else:
              init_val = tf.constant(0, dtype=var.dtype.base_dtype)
          avg_var = tf.Variable(init_val, name=avg_name, trainable=False)

      num_updates = tf.cast(self.global_step, tf.float32)
      decay = tf.minimum(decay, tf.maximum(0.9, (1.0 + num_updates) /
                                           (10.0 + num_updates)))
      with tf.device(avg_var.device):
        if ignore_nan:
          var = tf.where(tf.is_finite(var), var, avg_var)
        if var.get_shape().is_fully_defined():
          avg_update = tf.assign_sub(avg_var, (1 - decay) * (avg_var - var))
        else:
          avg_update = tf.assign(avg_var,
                                 avg_var - (1 - decay) * (avg_var - var),
                                 validate_shape=False)
      self._g.add_to_collection(GraphKeys.UPDATE_OPS, avg_update)
      return avg_update
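The assign_sub form used above is algebraically the standard EMA blend; a one-line check in plain Python (hypothetical values):

decay, avg, value = 0.999, 2.0, 5.0
sub_form = avg - (1 - decay) * (avg - value)    # what tf.assign_sub computes
blend_form = decay * avg + (1 - decay) * value  # textbook EMA blend
assert abs(sub_form - blend_form) < 1e-12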
Example #7
 def curl():
     grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
     op3 = tf.group(*[tf.assign_sub(v, self._lr_t*grad) for grad,v in zip(grads, all_vars)])
     with tf.get_default_graph().control_dependencies([op3]):
         def curlcombine(g1,g2):
             stepsize = self._lr_t
             return g1-(g2-g1)/stepsize
         new_grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
         g3s = [curlcombine(g1,g2) for g1,g2 in zip(grads,new_grads)]
         return g3s
Example #8
 def _assign_sub(self, ref, updates, indices=None):
   if indices is not None:
     if isinstance(ref, tf.Variable):
       return tf.scatter_sub(ref, indices, updates, use_locking=self._use_locking)
     elif isinstance(ref, resource_variable_ops.ResourceVariable):
       with tf.control_dependencies([resource_variable_ops.resource_scatter_add(ref.handle, indices, -updates)]):
         return ref.value()
     else:
       raise TypeError("did not expect type %r" % type(ref))
   else:
     return tf.assign_sub(ref, updates, use_locking=self._use_locking)
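A minimal TF 1.x sketch of the two dispatch paths above, outside the optimizer (variable and deltas are hypothetical):

import tensorflow as tf

v = tf.Variable([1.0, 2.0, 3.0])
dense = tf.assign_sub(v, [0.1, 0.1, 0.1])       # the indices-is-None path
sparse = tf.scatter_sub(v, [0, 2], [0.5, 0.5])  # the tf.Variable + indices path
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(dense))   # [0.9 1.9 2.9]
    print(sess.run(sparse))  # subtracts 0.5 at indices 0 and 2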
Example #9
 def _apply_dense(self, grad, var):
     lr_t = tf.cast(self._lr_t, var.dtype.base_dtype)
     beta2_t = tf.cast(self._beta2_t, var.dtype.base_dtype)
     if var.dtype.base_dtype == tf.float16:
         eps = 1e-7
     else:
         eps = 1e-8
     m = self.get_slot(var, "m")
     m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
     g_t = grad / m_t
     var_update = tf.assign_sub(var, lr_t * g_t)
     return tf.group(*[var_update, m_t])
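The slot update above keeps an exponentially decayed maximum of |grad|, resembling the infinity-norm second moment of AdaMax; one step in plain Python (hypothetical numbers):

lr, beta2, eps = 0.001, 0.999, 1e-8
m, grad, var = 0.05, 0.02, 1.0
m = max(beta2 * m + eps, abs(grad))  # decayed running max of |grad|
var -= lr * (grad / m)               # the step tf.assign_sub applies
print(m, var)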
Example #10
def sgd(cost, parameters=None, learning_rate=0.01):
    if parameters is None:
        parameters = tf.trainable_variables()

    grads = tf.gradients(cost, parameters)

    all_updates = []
    for grad, param in zip(grads, parameters):
        assigned = tf.assign_sub(param, learning_rate * grad)
        all_updates.append(assigned)

    update_op = tf.group(*all_updates)
    return update_op
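A minimal TF 1.x usage sketch for sgd() above, with a hypothetical one-parameter model:

import tensorflow as tf

x = tf.Variable(3.0)
cost = tf.square(x)  # minimized at x == 0
update_op = sgd(cost, learning_rate=0.1)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
        sess.run(update_op)
    print(sess.run(x))  # 3.0 * (1 - 2 * 0.1) ** 5, moving toward 0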
Example #11
 def _resource_apply_dense(self, grad, var):
   grad_squared = tf.square(grad) + 1e-30
   grad_squared_mean = tf.reduce_mean(grad_squared)
   decay_rate = self._decay_rate
   update_scale = self._learning_rate
   if self._multiply_by_parameter_scale:
     update_scale *= self._parameter_scale(var)
   # HACK: Make things dependent on grad.
   # This confounds the XLA rewriter and keeps it from fusing computations
    # across different variables.  This fusion is bad for HBM usage, since
   # it causes the gradients to persist in memory.
   decay_rate += grad_squared_mean * 1e-30
   update_scale += grad_squared_mean * 1e-30
   # END HACK
   mixing_rate = 1.0 - decay_rate
   shape = var.get_shape().as_list()
   updates = []
   if self._should_use_factored_second_moment_estimate(shape):
     grad_squared_row_mean = tf.reduce_mean(grad_squared, 1)
     grad_squared_col_mean = tf.reduce_mean(grad_squared, 0)
     vr = self.get_slot(var, "vr")
     new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
     vc = self.get_slot(var, "vc")
     new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
     vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking)
     vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking)
     updates = [vr_update, vc_update]
     long_term_mean = tf.reduce_mean(new_vr)
     r_factor = tf.rsqrt(new_vr / long_term_mean)
     c_factor = tf.rsqrt(new_vc)
     x = grad * tf.expand_dims(r_factor, 1) * tf.expand_dims(c_factor, 0)
   else:
     v = self.get_slot(var, "v")
     new_v = decay_rate * v + mixing_rate * grad_squared
     v_update = tf.assign(v, new_v, use_locking=self._use_locking)
     updates = [v_update]
     x = grad * tf.rsqrt(new_v)
   if self._clipping_threshold is not None:
     clipping_denom = tf.maximum(1.0, reduce_rms(x) / self._clipping_threshold)
     x /= clipping_denom
   subtrahend = update_scale * x
   if self._beta1:
     m = self.get_slot(var, "m")
     new_m = self._beta1 * m + (1.0 - self._beta1) * subtrahend
     updates.append(tf.assign(m, new_m, use_locking=self._use_locking))
     subtrahend = new_m
   var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking)
   updates = [var_update] + updates
   return tf.group(*updates)
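The factored branch above stores only row and column statistics and rebuilds the full second-moment preconditioner on the fly; a NumPy sketch of that reconstruction with a hypothetical gradient (decay mixing omitted for brevity):

import numpy as np

grad = np.random.randn(4, 3).astype(np.float32)
g2 = np.square(grad) + 1e-30
vr = g2.mean(axis=1)                      # row statistics, like new_vr
vc = g2.mean(axis=0)                      # column statistics, like new_vc
r_factor = 1.0 / np.sqrt(vr / vr.mean())  # rsqrt(new_vr / long_term_mean)
c_factor = 1.0 / np.sqrt(vc)              # rsqrt(new_vc)
x = grad * r_factor[:, None] * c_factor[None, :]
print(x.shape)  # (4, 3), rebuilt from O(M + N) statistics instead of O(M * N)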
Example #12
def adam(cost,
         parameters=None,
         learning_rate=1e-3,
         beta1=0.9,
         beta2=0.999,
         epsilon=1e-8):
    if parameters is None:
        parameters = tf.trainable_variables()

    grads = tf.gradients(cost, parameters)
    all_updates = []
    zero_init = tf.constant_initializer(0.)
    with tf.variable_scope("adam"):
        t_prev = tf.get_variable("t",
                                 shape=(),
                                 initializer=zero_init)
        t = tf.assign_add(t_prev, 1)
        all_updates.append(t)

        for grad, param in zip(grads, parameters):
            with tf.variable_scope(param.name.replace(":", "_")):
                param_shape = tfu.get_shape_values(param)
                m_prev = tf.get_variable("m",
                                         shape=param_shape,
                                         initializer=zero_init)
                v_prev = tf.get_variable("v",
                                         shape=param_shape,
                                         initializer=zero_init)
                m = tf.assign(m_prev,
                              m_prev * beta1 + grad * (1 - beta1))
                v = tf.assign(v_prev,
                              v_prev * beta2 + tf.square(grad) * (1 - beta2))

                numerator = learning_rate * m / (1 - tf.pow(beta1, t))
                denominator = tf.sqrt(v / (1 - tf.pow(beta2, t))) + epsilon
                assigned = tf.assign_sub(param, numerator / denominator)
                all_updates += [m, v, assigned]

    update_op = tf.group(*all_updates)
    return update_op
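A TF 1.x usage sketch for adam() above, assuming the tfu helper it calls is importable (hypothetical one-parameter model):

import tensorflow as tf

x = tf.Variable(3.0)
cost = tf.square(x)
update_op = adam(cost, learning_rate=0.1)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        sess.run(update_op)
    print(sess.run(x))  # approaches the minimum at 0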
Example #13
  def exponential_moving_average(
      self, var, avg_var=None, decay=0.999, ignore_nan=False):
    """Calculates the exponential moving average.

    Adds a variable to keep track of the exponential moving average and adds an
    update operation to the bookkeeper. The name of the variable is
    '%s_average' % name prefixed with the current variable scope.

    Args:
       var: The variable for which a moving average should be computed.
       avg_var: The variable to set the average into, if None create a zero
         initialized one.
       decay: How much history to use in the moving average. Must be in
         [0, 1); higher values retain more history.
       ignore_nan: If the value is NaN or Inf, skip it.
    Returns:
       The averaged variable.
    Raises:
      ValueError: if decay is not in [0, 1).
    """
    with self.g.as_default():
      if decay < 0 or decay >= 1.0:
        raise ValueError('Decay is %5.2f, but has to be in [0, 1).' % decay)
      if not avg_var:
        shape = var.get_shape()
        avg_name = '%s_average' % _bare_var_name(var)
        avg_var = tf.Variable(
            tf.zeros_initializer(shape=shape, dtype=var.dtype),
            name=avg_name,
            trainable=False)
      num_updates = tf.cast(self.global_step, tf.float32)
      decay = tf.maximum(
          0.9, tf.minimum(decay, (1.0 + num_updates) / (10.0 + num_updates)))
      with tf.device(avg_var.device):
        if ignore_nan:
          var = tf.select(tf.is_finite(var), var, avg_var)
        avg_update = tf.assign_sub(avg_var, (1 - decay) * (avg_var - var))
      self._g.add_to_collection(GraphKeys.UPDATE_OPS, avg_update)
      return avg_var
Example #14
 def _update_params(self, ema, g, v):
     """Create ops to update trainable parameters"""
     return tf.assign_sub(v, self._eta_max * ema['u'] * g)
Example #15
 def testInitRequiredAssignSub(self):
     with self.test_session():
          p = tf.Variable(tf.fill([1024, 1024], 1), dtype=tf.int32)
         a = tf.assign_sub(p, tf.fill([1024, 1024], 0))
         with self.assertRaisesOpError("use uninitialized"):
             a.op.run()
Example #16
    def _apply_dense(self, grad: tf.Tensor, var: tf.Variable) -> tf.Operation:
        """Add ops to apply dense gradients to `var`.

        Args:
            grad: A gradient `Tensor`.
            var: A `Variable` object.

        Returns:
            An `Operation`.
        """
        alpha_t = tf.cast(self._alpha_t, var.dtype.base_dtype)
        lr_update_t = tf.cast(self._lr_update_t, var.dtype.base_dtype)
        lr_max_t = tf.cast(self._lr_max_t, var.dtype.base_dtype)
        lr_min_t = tf.cast(self._lr_min_t, var.dtype.base_dtype)
        m_coef_update_t = tf.cast(self._momentum_coef_update_t,
                                  var.dtype.base_dtype)

        # get cached tensors
        old_grad = self.get_slot(var, "old_grad")
        momentum = self.get_slot(var, "momentum")
        # learnable stuff
        lr = self.get_slot(var, "lr")
        m_coef = self.get_slot(var, "m_coef")

        # generate random noise
        noise = alpha_t * tf.random_uniform(
            shape=tf.shape(var), minval=-1.0, maxval=+1.0)

        # compute aggregated gradient
        momentum_grad = momentum * m_coef + grad
        with tf.control_dependencies([momentum_grad]):
            if self._norm_type == 'max':
                # compute normalization constant
                g_max = tf.reduce_max(tf.abs(momentum_grad))
                denominator = _EPSILON + g_max
                g_update_normed = momentum_grad / denominator
            elif self._norm_type == 'std':
                std = tf.keras.backend.std(momentum_grad) + _EPSILON
                g_update_normed = momentum_grad / std
            else:
                g_update_normed = tf.nn.l2_normalize(momentum_grad)

        # compute update grad
        update_grad = lr * (g_update_normed + noise)
        var_update = tf.assign_sub(var, update_grad)
        update_m = tf.assign(momentum, momentum_grad)

        # compute gradient correlation
        g_normed = tf.nn.l2_normalize(grad)
        old_g_normed = tf.nn.l2_normalize(old_grad)
        lr_change = -tf.reduce_sum(g_normed * old_g_normed)

        # update learning rate
        new_lr = lr * (1 - lr_update_t * lr_change)
        new_lr = tf.clip_by_value(new_lr, lr_min_t, lr_max_t)

        # update momentum
        beta = 1 - m_coef_update_t
        new_m_coef = m_coef * beta + (1 - beta) * lr_change
        new_m_coef = tf.clip_by_value(new_m_coef, 0.0, 1.0)

        self._grad_correlation_t = lr_change

        with tf.control_dependencies([new_lr, new_m_coef]):
            lr_update = tf.assign(lr, new_lr)
            m_update = tf.assign(m_coef, new_m_coef)
            old_g_update = tf.assign(old_grad, grad)

        return tf.group(
            [update_m, var_update, lr_update, old_g_update, m_update])
Example #17
def update_sub(x, decrement):
    return tf.assign_sub(x, decrement)
Example #18
            y: x_values_sm_b,
            modulation: np.zeros((batch_size, num_steps, 1)),
            state: get_zero_state() 
        })

        duration = time.time() - start_time

        error = np.sum(np.square(out_v_test[-1][0]/c.lambda_max - x_values_sm_b))

        dw_grads.append(state_v[0][5])
        db_grads.append(state_v[0][6])
        r.append(rhythm)

    print "Epoch {} ({:.2f}s), train error {:.3f}".format(
        i, 
        duration, 
        error
    )

    r = np.asarray(r)
    dw_grads = np.asarray(dw_grads)
    db_grads = np.asarray(db_grads)
    dw_grads_m = np.mean(dw_grads, 0)
    
    # dw_grads_m = 2.0*dw_bl
    # db_grads = 2.0*dbias_bl
    
    sess.run(tf.assign_sub(net.cells[-1].params[0], 10.0*dw_grads_m.reshape(input_size, output_size)))
    sess.run(tf.assign_sub(net.cells[-1].params[1], 10.0*np.mean(db_grads).reshape(1)))

Example #19
    def apply_updates(self) -> tf.Operation:
        """Construct training op to update the registered variables based on their gradients."""
        tfutil.assert_tf_initialized()
        assert not self._updates_applied
        self._updates_applied = True
        devices = list(self._dev_grads.keys())
        total_grads = sum(len(grads) for grads in self._dev_grads.values())
        assert len(devices) >= 1 and total_grads >= 1
        ops = []

        with tfutil.absolute_name_scope(self.scope):
            # Cast gradients to FP32 and calculate partial sum within each device.
            dev_grads = OrderedDict()  # device => [(grad, var), ...]

            for dev_idx, dev in enumerate(devices):
                with tf.name_scope("ProcessGrads%d" % dev_idx), tf.device(dev):
                    sums = []

                    for gv in zip(*self._dev_grads[dev]):
                        assert all(v is gv[0][1] for g, v in gv)
                        g = [tf.cast(g, tf.float32) for g, v in gv]
                        g = g[0] if len(g) == 1 else tf.add_n(g)
                        sums.append((g, gv[0][1]))

                    dev_grads[dev] = sums

            # Sum gradients across devices.
            if len(devices) > 1:
                with tf.name_scope("SumAcrossGPUs"), tf.device(None):
                    for var_idx, grad_shape in enumerate(self._grad_shapes):
                        g = [dev_grads[dev][var_idx][0] for dev in devices]

                        if np.prod(grad_shape):  # nccl does not support zero-sized tensors
                            g = tf.contrib.nccl.all_sum(g)

                        for dev, gg in zip(devices, g):
                            dev_grads[dev][var_idx] = (
                                gg, dev_grads[dev][var_idx][1])

            # Apply updates separately on each device.
            for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
                with tf.name_scope("ApplyGrads%d" % dev_idx), tf.device(dev):
                    # Scale gradients as needed.
                    if self.use_loss_scaling or total_grads > 1:
                        with tf.name_scope("Scale"):
                            coef = tf.constant(np.float32(1.0 / total_grads),
                                               name="coef")
                            coef = self.undo_loss_scaling(coef)
                            grads = [(g * coef, v) for g, v in grads]

                    # Check for overflows.
                    with tf.name_scope("CheckOverflow"):
                        grad_ok = tf.reduce_all(
                            tf.stack([
                                tf.reduce_all(tf.is_finite(g))
                                for g, v in grads
                            ]))

                    # Update weights and adjust loss scaling.
                    with tf.name_scope("UpdateWeights"):
                        # pylint: disable=cell-var-from-loop
                        opt = self._dev_opt[dev]
                        ls_var = self.get_loss_scaling_var(dev)

                        if not self.use_loss_scaling:
                            ops.append(
                                tf.cond(grad_ok,
                                        lambda: opt.apply_gradients(grads),
                                        tf.no_op))
                        else:
                            ops.append(
                                tf.cond(
                                    grad_ok,
                                    lambda: tf.group(
                                        tf.assign_add(ls_var, self.loss_scaling_inc),
                                        opt.apply_gradients(grads)),
                                    lambda: tf.group(
                                        tf.assign_sub(ls_var, self.loss_scaling_dec))))

                    # Report statistics on the last device.
                    if dev == devices[-1]:
                        with tf.name_scope("Statistics"):
                            ops.append(
                                autosummary.autosummary(
                                    self.id + "/learning_rate",
                                    self.learning_rate))
                            ops.append(
                                autosummary.autosummary(
                                    self.id + "/overflow_frequency",
                                    tf.where(grad_ok, 0, 1)))

                            if self.use_loss_scaling:
                                ops.append(
                                    autosummary.autosummary(
                                        self.id + "/loss_scaling_log2",
                                        ls_var))

            # Initialize variables and group everything into a single op.
            self.reset_optimizer_state()
            tfutil.init_uninitialized_vars(list(self._dev_ls_var.values()))

            return tf.group(*ops, name="TrainingOp")
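The grad_ok cond above implements dynamic loss scaling: grow the (log2) scale slowly after clean steps, cut it sharply on overflow and skip the update. The policy in plain Python (increment/decrement constants hypothetical):

loss_scaling_log2, inc, dec = 16.0, 0.0005, 1.0
for grad_ok in [True, True, False, True]:
    if grad_ok:
        loss_scaling_log2 += inc  # assign_add(ls_var, loss_scaling_inc), grads applied
    else:
        loss_scaling_log2 -= dec  # assign_sub(ls_var, loss_scaling_dec), update skipped
print(loss_scaling_log2)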
Example #20
File: network.py Project: ioanachelu/turi
    def __init__(self, scope, trainer, global_step=None):
        with tf.variable_scope(scope):
            self.prob_of_random_goal = tf.Variable(FLAGS.initial_random_goal_prob, trainable=False,
                                                   name="prob_of_random_goal", dtype=tf.float32)
            self.inputs = tf.placeholder(shape=[None, FLAGS.resized_height, FLAGS.resized_width, FLAGS.agent_history_length],
                                         dtype=tf.float32, name="Inputs")

            self.prev_rewards = tf.placeholder(shape=[None], dtype=tf.float32, name="Prev_Rewards")

            self.prev_rewards_onehot = tf.one_hot(tf.cast(self.prev_rewards, dtype=tf.int32), 2, dtype=tf.float32,
                                                  name="Prev_Rewards_OneHot")

            self.prev_rewards = tf.expand_dims(self.prev_rewards, 1, name="rewards")

            # self.prev_rewards_onehot = tf.expand_dims(self.prev_rewards, 0)

            self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Prev_Actions")
            self.prev_actions_onehot = tf.one_hot(self.prev_actions, FLAGS.nb_actions, dtype=tf.float32,
                                                  name="Prev_Actions_OneHot")

            self.prev_goal = tf.placeholder(shape=[None, FLAGS.hidden_dim], dtype=tf.float32, name="Prev_Goals")

            self.image_summaries = []

            if FLAGS.game not in flags.SUPPORTED_ENVS:
                self.conv0 = tf.contrib.layers.conv2d(
                    self.inputs, 16, 8, 4, activation_fn=tf.nn.elu, scope="conv0")
                with tf.variable_scope('conv0'):
                    tf.get_variable_scope().reuse_variables()
                    weights = tf.get_variable('weights')
                    grid = self.put_kernels_on_grid(weights)
                    self.image_summaries.append(
                        tf.summary.image('kernels', grid, max_outputs=1))
                self.conv = tf.contrib.layers.conv2d(
                    self.conv0, 32, 4, 2, activation_fn=tf.nn.elu, scope="conv1")
            else:
                self.conv = tf.contrib.layers.conv2d(
                    self.inputs, 32, 5, 2, activation_fn=tf.nn.elu, scope="conv1")
                with tf.variable_scope('conv1'):
                    tf.get_variable_scope().reuse_variables()
                    weights = tf.get_variable('weights')
                    grid = self.put_kernels_on_grid(weights)
                    self.image_summaries.append(
                        tf.summary.image('kernels', grid, max_outputs=1))

            with tf.variable_scope('inputs'):
                tf.get_variable_scope().reuse_variables()
                self.image_summaries.append(
                    tf.summary.image('input', self.inputs, max_outputs=100))

            self.conv_flat = tf.contrib.layers.flatten(self.conv)
            self.fc = tf.contrib.layers.fully_connected(self.conv_flat, FLAGS.hidden_dim)
            self.fc = tf.contrib.layers.layer_norm(self.fc)
            self.f_percept = tf.nn.elu(self.fc, name="Zt")

            if FLAGS.game not in flags.SUPPORTED_ENVS:
                self.f_percept = tf.concat(
                    [self.f_percept, self.prev_rewards], 1,
                    name="Zt_r")
            else:
                self.f_percept = tf.concat(
                    [self.f_percept, self.prev_rewards_onehot], 1,
                    name="Zt_r")

            summary_f_percept_act = tf.contrib.layers.summarize_activation(self.f_percept)

            ############################################################################################################
            # Manager network

            if FLAGS.meta:
                self.f_Mspace = tf.concat(
                    [self.f_percept, self.prev_goal], 1,
                    name="Zt_r")
            else:
                self.f_Mspace = tf.identity(self.f_percept, name="Zt_r")

            self.f_Mspace = tf.contrib.layers.fully_connected(self.f_Mspace, FLAGS.hidden_dim)

            self.f_percept = tf.concat(
                [self.f_percept, self.prev_actions_onehot], 1,
                name="Zt_r")

            self.f_Mspace = tf.contrib.layers.layer_norm(self.f_Mspace)
            self.f_Mspace = tf.nn.elu(self.f_Mspace, name="St")
            summary_f_Mspace_act = tf.contrib.layers.summarize_activation(self.f_Mspace)

            m_rnn_in = tf.expand_dims(self.f_Mspace, [0], name="Mrnn_in")
            step_size = tf.shape(self.inputs)[:1]

            m_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(FLAGS.hidden_dim)
            m_c_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon), np.float32)
            m_h_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon), np.float32)
            self.m_state_init = [m_c_init, m_h_init]
            m_c_in = tf.placeholder(tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon], name="Mrnn_c_in")
            m_h_in = tf.placeholder(tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon], name="Mrnn_h_in")
            self.m_state_in = (m_c_in, m_h_in)
            m_state_in = tf.contrib.rnn.LSTMStateTuple(m_c_in, m_h_in)

            m_lstm_outputs, m_lstm_state = self.fast_dlstm(m_rnn_in, m_state_in, m_lstm_cell, FLAGS.manager_horizon,
                                                           FLAGS.hidden_dim * FLAGS.manager_horizon)

            m_lstm_c, m_lstm_h = m_lstm_state
            self.m_state_out = (m_lstm_c[-1, :1, :], m_lstm_h[-1, :1, :])
            self.goals = tf.reshape(m_lstm_outputs, [-1, FLAGS.hidden_dim])
            self.normalized_goals = tf.contrib.layers.fully_connected(self.goals, FLAGS.hidden_dim, activation_fn=tf.tanh, scope="Gt")

            summary_goals = tf.contrib.layers.summarize_activation(self.normalized_goals)

            def randomize_goals(t):
                t = tf.cast(t, tf.int32)
                packed_tensors = tf.stack([tf.random_normal([FLAGS.hidden_dim, ]), self.normalized_goals[t, :]])

                to_update = tf.cond(
                    tf.less(self.prob_of_random_goal, tf.constant(FLAGS.final_random_goal_prob, dtype=tf.float32)),
                    lambda: tf.cast(
                        tf.multinomial(
                            tf.log([[self.prob_of_random_goal,
                                     tf.subtract(tf.constant(1.0),
                                                 self.prob_of_random_goal)]]), 1)[0][0], tf.int32),
                    lambda: tf.constant(1, tf.int32))

                resulted_tensor = tf.gather(packed_tensors, to_update)

                return resulted_tensor

            self.randomized_goals = tf.map_fn(lambda t: randomize_goals(t), tf.to_float(tf.range(0, step_size[0])),
                                              name="random_gt")

            summary_random_goals = tf.contrib.layers.summarize_activation(self.randomized_goals)

            self.decrease_prob_of_random_goal = tf.assign_sub(self.prob_of_random_goal, tf.constant(
                (FLAGS.initial_random_goal_prob - FLAGS.final_random_goal_prob) / FLAGS.explore_steps))

            m_fc_value_w = tf.get_variable("M_Value_W", shape=[FLAGS.hidden_dim, 1],
                                           initializer=normalized_columns_initializer(1.0))
            self.m_value = tf.matmul(m_rnn_out, m_fc_value_w, name="M_Value")

            summary_m_value_act = tf.contrib.layers.summarize_activation(self.m_value)

            ############################################################################################################

            # Worker network

            self.sum_prev_goals = tf.placeholder(shape=[None, FLAGS.hidden_dim], dtype=tf.float32, name="Prev_c_Goals_sum")

            w_rnn_in = tf.expand_dims(self.f_percept, [0], name="Wrnn_in")
            step_size = tf.shape(self.inputs)[:1]
            w_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(FLAGS.goal_embedding_size * FLAGS.nb_actions)
            w_c_init = np.zeros((1, w_lstm_cell.state_size.c), np.float32)
            w_h_init = np.zeros((1, w_lstm_cell.state_size.h), np.float32)
            self.w_state_init = [w_c_init, w_h_init]
            w_c_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.c], name="Wrnn_c_in")
            w_h_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.h], name="Wrnn_h_in")
            self.w_state_in = (w_c_in, w_h_in)
            w_state_in = tf.contrib.rnn.LSTMStateTuple(w_c_in, w_h_in)

            w_lstm_outputs, w_lstm_state = tf.nn.dynamic_rnn(
                w_lstm_cell, w_rnn_in, initial_state=w_state_in, sequence_length=step_size,
                time_major=False)

            w_lstm_c, w_lstm_h = w_lstm_state
            self.w_state_out = (w_lstm_c[:1, :], w_lstm_h[:1, :])
            Ut = tf.reshape(w_lstm_outputs, [step_size[0], FLAGS.nb_actions, FLAGS.goal_embedding_size],
                                   name="Ut")
            Ut_flat = tf.reshape(w_lstm_outputs, [step_size[0], FLAGS.nb_actions * FLAGS.goal_embedding_size],
                                        name="Ut_flat")

            summary_wrnn_act = tf.contrib.layers.summarize_activation(Ut)

            goal_encoding = tf.contrib.layers.fully_connected(self.sum_prev_goals, FLAGS.goal_embedding_size,
                                                              biases_initializer=None, scope="goal_emb")

            interm_rez = tf.squeeze(tf.matmul(Ut, tf.expand_dims(goal_encoding, 2)), 2)
            interm_rez = tf.contrib.layers.flatten(interm_rez)
            self.w_policy = tf.nn.softmax(interm_rez, name="W_Policy")

            summary_w_policy_act = tf.contrib.layers.summarize_activation(self.w_policy)

            w_fc_value_w = tf.get_variable("W_Value_W", shape=[FLAGS.nb_actions * FLAGS.goal_embedding_size + FLAGS.goal_embedding_size, 1],
                                           initializer=normalized_columns_initializer(1.0))
            self.w_value = tf.matmul(tf.concat([Ut_flat, goal_encoding], 1), w_fc_value_w, name="W_Value")

            summary_w_value_act = tf.contrib.layers.summarize_activation(self.w_value)

            if scope != 'global':

                self.w_extrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32)
                self.m_extrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32)
                self.w_intrinsic_return = tf.placeholder(shape=[None], dtype=tf.float32)

                def gather_state_at_horiz(t):
                    t = tf.cast(t, tf.int32)
                    f_Mspace_c = tf.gather(self.f_Mspace,
                                           tf.minimum(t + tf.constant(FLAGS.manager_horizon, dtype=tf.int32),
                                                      step_size[0] - 1))
                    return f_Mspace_c

                self.f_Mspace_c = tf.cast(
                    tf.map_fn(lambda t: gather_state_at_horiz(t), tf.to_float(tf.range(0, step_size[0])),
                              name="state_at_horiz"), dtype=tf.float32)
                self.state_diff = self.f_Mspace_c - self.f_Mspace
                self.cos_sim_state_diff = self.cosine_distance(tf.stop_gradient(self.state_diff), self.normalized_goals,
                                                               dim=1)

                self.m_advantages = self.m_extrinsic_return - tf.stop_gradient(tf.reshape(self.m_value, [-1]))
                self.goals_loss = - tf.reduce_sum(self.m_advantages * self.cos_sim_state_diff)
                self.m_value_loss = FLAGS.m_beta_v * tf.reduce_sum(
                    tf.square(self.m_extrinsic_return - tf.reshape(self.m_value, [-1])))

                self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="Actions")
                self.actions_onehot = tf.one_hot(self.actions, FLAGS.nb_actions, dtype=tf.float32,
                                                 name="Actions_Onehot")

                self.responsible_outputs = tf.reduce_sum(self.w_policy * self.actions_onehot, [1])

                self.intrinsic_return = FLAGS.alpha * self.w_intrinsic_return
                self.total_return = self.w_extrinsic_return + self.intrinsic_return
                self.w_advantages = self.total_return - tf.stop_gradient(tf.reshape(self.w_value, [-1]))

                # Loss functions
                self.w_value_loss = FLAGS.w_beta_v * tf.reduce_sum(
                    tf.square(self.total_return - tf.reshape(self.w_value, [-1])))
                self.entropy = - tf.reduce_sum(self.w_policy * tf.log(self.w_policy + 1e-7))

                self.w_policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + 1e-7) * self.w_advantages) - self.entropy * FLAGS.beta_e

                self.loss = self.w_value_loss + self.w_policy_loss + self.m_value_loss + self.goals_loss

                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, FLAGS.gradient_clip_value)

                self.worker_summaries = [summary_f_percept_act, summary_f_Mspace_act, summary_goals,
                                         summary_random_goals,
                                         summary_m_value_act,
                                         summary_wrnn_act, summary_w_policy_act, summary_w_value_act]
                for grad, weight in zip(grads, local_vars):
                    self.worker_summaries.append(tf.summary.histogram(weight.name + '_grad', grad))
                    self.worker_summaries.append(tf.summary.histogram(weight.name, weight))

                self.merged_summary = tf.summary.merge(self.worker_summaries)

                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
Example #21
    def train_eq_op(self, X, y, z, epochs=32, batch_size=1):
        """ Train a model with (positive class) equality of opportunity debiasing.

        Inputs:

          X: np.ndarray [N, F] -- Instances over F features.

          y: np.ndarray [N, 1 of {0,1}] -- Target class.

          z: np.ndarray [N, 1 of {0,1}] -- Group membership.

        Returns nothing but updates self.classifier
        """

        #raise NotImplementedError('You need to implement this.')

        # SOLUTION
        # END OF SOLUTION

        # Model
        inp = Input(1)  # for giving y as input to the adversary
        next_layer = tf.keras.layers.Concatenate(axis=1)(
            [self.classifier.output, inp])
        out = Dense(1, activation='sigmoid')(next_layer)

        adversary = Model([self.classifier.input, inp], out)

        # The following part is the same as dem_parity (the only difference is that y is now given as input to the adversary)
        # Defining Tensors Operations

        Y = tf.placeholder(tf.float32, shape=[None, 1])
        Z = tf.placeholder(tf.float32, shape=[None, 1])

        class_params = adversary.trainable_weights[:-2]
        adv_params = adversary.trainable_weights[-2:]
        outputs = [layer.output for layer in adversary.layers]

        l_p = K.mean(K.binary_crossentropy(Y, outputs[4], from_logits=False))
        loss_p = K.function([adversary.input, Y], l_p)

        l_a = K.mean(K.binary_crossentropy(Z, outputs[-1], from_logits=False))

        loss_a = K.function([
            adversary.input,
            tf.concat((outputs[4], adversary.layers[5].input), -1), Z
        ], l_a)

        grads_adv = tf.gradients(ys=l_a, xs=adv_params)
        grads_class = tf.gradients(ys=l_p, xs=class_params)
        grads_class_adv = tf.gradients(ys=l_a, xs=class_params)

        gradients_adv = K.function([
            adversary.input,
            tf.concat((outputs[4], adversary.layers[5].input), -1), Z
        ], grads_adv)
        gradients_class = K.function([adversary.input, Y], grads_class)
        gradients_class_adv = K.function([
            adversary.input,
            tf.concat((outputs[4], adversary.layers[5].input), -1), Z
        ], grads_class_adv)

        num = len(X) // batch_size

        #sess.run(tf.global_variables_initializer())

        for epoch in range(epochs):

            learning_rate = 1 / (epoch + 1)
            alpha = np.sqrt(epoch + 1)
            c = 0
            loss_class = 0
            loss_adv = 0

            outer = tqdm(total=num, desc='Train epochs', position=0)
            for b in range(0, len(X), batch_size):
                outer.update(1)
                c = c + 1

                # Notations same as dem parity trainer
                l1 = loss_p([
                    X[b:b + batch_size], y[b:b + batch_size],
                    y[b:b + batch_size]
                ])
                l2 = loss_a([
                    X[b:b + batch_size], z[b:b + batch_size],
                    z[b:b + batch_size], z[b:b + batch_size]
                ])
                clas = gradients_class([
                    X[b:b + batch_size], y[b:b + batch_size],
                    y[b:b + batch_size]
                ])
                adv = gradients_adv([
                    X[b:b + batch_size], y[b:b + batch_size],
                    z[b:b + batch_size], z[b:b + batch_size]
                ])
                clasadv = gradients_class_adv([
                    X[b:b + batch_size], y[b:b + batch_size],
                    z[b:b + batch_size], z[b:b + batch_size]
                ])

                for i in range(len(adversary.trainable_weights)):

                    if i > 7:
                        sess.run(
                            tf.assign_sub(adversary.trainable_weights[i],
                                          learning_rate * adv[i - 8]))

                    else:
                        k = self.projection_weights(clas[i], clasadv[i])
                        grad = clas[i] - k - alpha * clasadv[i]
                        sess.run(
                            tf.assign_sub(adversary.trainable_weights[i],
                                          learning_rate * grad))

                loss_class += l1
                loss_adv += l2

                del l1, l2, clas, adv, clasadv, k, grad
            y_pred = (self.classifier.predict(X) > 0.5) * 1
            acc1 = (y_pred == y).mean()

            y_pred1 = (adversary.predict([X, y]) > 0.5) * 1
            acc2 = (y_pred1 == z).mean()

            print('Epoch: ', epoch + 1)
            print('Demographic Parity: ', evaluate_dem_parity(y_pred, y, z))
            print('Equality of Opportunity: ', evaluate_eq_op(y_pred, y, z))
            print('Classification Loss: ', loss_class / c)
            print('Adversarial Loss: ', loss_adv / c)
            print('Classifier Accuracy: ', acc1)
            print('Adversary Accuracy: ', acc2)
            del y_pred, y_pred1
Example #22
    def train_dem_parity(self, X, y, z, epochs=32, batch_size=1024):
        """ Train a model with (positive class) demographic parity.
        Inputs:

          X: np.ndarray [N, F] -- Instances over F features.

          y: np.ndarray [N, 1] -- Target class.

          z: np.ndarray [N, 1] -- Group membership.

        Returns nothing but updates self.classifier
        """

        #raise NotImplementedError('You need to implement this.')

        # SOLUTION
        # END OF SOLUTION
        #K.clear_session()
        #sess = tf.Session()
        #K.set_session(sess)

        adversary = self._get_adversary_architecture()  # getting the adversary model

        # Defining Tensors Operations
        Y = tf.placeholder(tf.float32, shape=[None, 1])  # placeholder for true labels
        Z = tf.placeholder(tf.float32, shape=[None, 1])  # placeholder for protected attribute

        class_params = adversary.trainable_weights[:-2]  # parameters of the classifier
        adv_params = adversary.trainable_weights[-2:]  # parameters of the adversary
        outputs = [layer.output for layer in adversary.layers]  # symbolic tensors of all layers

        l_p = K.mean(K.binary_crossentropy(Y, outputs[-2], from_logits=False))  # classifier loss
        loss_p = K.function([adversary.input, Y], l_p)

        l_a = K.mean(K.binary_crossentropy(Z, outputs[-1], from_logits=False))  # adversary loss
        loss_a = K.function([adversary.input, Z], l_a)

        grads_adv = tf.gradients(ys=l_a, xs=adv_params)  # adversary gradients
        grads_class = tf.gradients(ys=l_p, xs=class_params)  # classifier gradients
        grads_class_adv = tf.gradients(ys=l_a, xs=class_params)  # classifier gradients wrt adversary loss

        gradients_adv = K.function([adversary.input, Z], grads_adv)
        gradients_class = K.function([adversary.input, Y], grads_class)
        gradients_class_adv = K.function([adversary.input, Z], grads_class_adv)

        num = len(X) // batch_size

        #sess.run(tf.global_variables_initializer())

        for epoch in range(epochs):
            outer = tqdm(total=num, desc='Train epochs', position=0)

            learning_rate = 1 / (epoch + 1)
            alpha = np.sqrt(epoch + 1)

            loss_class = 0
            loss_adv = 0
            c = 0

            for b in range(0, len(X), batch_size):
                outer.update(1)

                c = c + 1
                l1 = loss_p([X[b:b + batch_size], y[b:b + batch_size]])  # classifier loss
                l2 = loss_a([X[b:b + batch_size], z[b:b + batch_size]])  # adversary loss
                clas = gradients_class([X[b:b + batch_size], y[b:b + batch_size]])  # classifier gradients
                adv = gradients_adv([X[b:b + batch_size], z[b:b + batch_size]])  # adversary gradients
                clasadv = gradients_class_adv([X[b:b + batch_size], z[b:b + batch_size]])  # classifier gradient wrt adversary loss

                for i in range(len(adversary.trainable_weights)):

                    if i > 7:
                        # adversary weight update
                        sess.run(tf.assign_sub(adversary.trainable_weights[i],
                                               learning_rate * adv[i - 8]))
                    else:
                        k = self.projection_weights(clas[i], clasadv[i])
                        grad = clas[i] - k - alpha * clasadv[i]
                        # classifier weight update
                        sess.run(tf.assign_sub(adversary.trainable_weights[i],
                                               learning_rate * grad))

                loss_class += l1
                loss_adv += l2
                del l1, l2, clas, adv, clasadv, k, grad

            y_pred = (self.classifier.predict(X) > 0.5) * 1
            acc1 = (y_pred == y).mean()

            y_pred1 = (adversary.predict(X) > 0.5) * 1
            acc2 = (y_pred1 == z).mean()

            print('Epoch: ', epoch + 1)
            print('Demographic Parity: ', evaluate_dem_parity(y_pred, y, z))
            print('Equality of Opportunity: ', evaluate_eq_op(y_pred, y, z))
            print('Classification Loss: ', loss_class / c)
            print('Adversarial Loss: ', loss_adv / c)
            print('Classification Accuracy: ', acc1)
            print('Adversary Accuracy: ', acc2)
            del y_pred, y_pred1
Example #23
 def _anneal_learning_rate(self):
     return tf.cond(
         self.learning_rate > 0.0,
         lambda: tf.assign_sub(self.learning_rate, self.delta_lr),
         lambda: tf.assign(self.learning_rate, 0.0))
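A TF 1.x sketch of the annealing cond above with standalone variables (the original reads them off self; values hypothetical):

import tensorflow as tf

learning_rate = tf.Variable(0.1)
delta_lr = tf.constant(0.03)
anneal = tf.cond(learning_rate > 0.0,
                 lambda: tf.assign_sub(learning_rate, delta_lr),
                 lambda: tf.assign(learning_rate, 0.0))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
        print(sess.run(anneal))  # ~0.07, 0.04, 0.01, -0.02, then 0.0
# note: the rate can overshoot below zero for one step before being reset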
Example #24
    def _apply_dense(self, grad, var):
        # SM3 upper bounds the gradient square sums:
        #
        # To illustrate:
        #
        # For a Tensor `T` of shape [M, N, K].
        #
        # `G` be its gradient of shape [M, N, K]
        #
        # SM3 keeps around three accumulators A1, A2, A3 of size M, N, K
        # respectively.
        #
        # `A` be the accumulator of shape [M, N, K]. `A` is not materialized until
        #   it is needed at every step, and is approximated by A1, A2, A3.
        #
        # At every gradient update step the accumulators satisfy:
        #   A1_t[i] >= Sum_{s <= t} G_s[i, j, k]^2 for all j, k.
        #   A2_t[j] >= Sum_{s <= t} G_s[i, j, k]^2 for all i, k.
        #   A3_t[k] >= Sum_{s <= t} G_s[i, j, k]^2 for all i, j.
        #
        # The RHS is the running sum of gradient squares.
        #
        # For every step we materialize the tensor `A` based on accumulated tensors
        # A1, A2 and A3.
        #
        #  A = min(A1[i], A2[j], A3[k]) + G[i, j, k]^2
        #
        # SM3 preconditioned gradient is
        #
        #  preconditioned G = A^{-0.5} * G
        #
        # We then update the individual accumulator factors as:
        #
        #  A1[i] = max_{j, k} A[i, j, k]
        #  A2[j] = max_{i, k} A[i, j, k]
        #  A3[k] = max_{i, j} A[i, j, k]
        #
        shape = np.array(var.get_shape())
        var_rank = len(shape)
        if var_rank > 1:
            accumulator_list = [
                self.get_slot(var, "accumulator_" + str(i))
                for i in range(var_rank)
            ]
            accumulator = self._compute_past_accumulator(
                accumulator_list, shape)
            accumulator += grad * grad
        else:
            accumulator_var = self.get_slot(var, "accumulator")
            accumulator = tf.assign_add(accumulator_var, grad * grad)

        accumulator_inv_sqrt = tf.where(tf.greater(accumulator, 0),
                                        tf.rsqrt(accumulator),
                                        tf.zeros_like(accumulator))
        scaled_g = (1.0 - self._momentum_tensor) * (grad *
                                                    accumulator_inv_sqrt)
        accumulator_update_ops = []

        with tf.control_dependencies([scaled_g]):
            if var_rank > 1:
                # Updates individual accumulator factors as:
                #  A1[i] = max_{j, k} A[i, j, k]
                #  A2[j] = max_{i, k} A[i, j, k]
                #  A3[k] = max_{i, j} A[i, j, k]
                for i, accumulator_i in enumerate(accumulator_list):
                    axes = list(range(i)) + list(range(i + 1, var_rank))
                    new_accumulator_i = tf.reduce_max(accumulator, axis=axes)
                    accumulator_update_ops.append(
                        tf.assign(accumulator_i, new_accumulator_i))

        with tf.control_dependencies(accumulator_update_ops):
            if self._momentum > 0:
                gbar = self.get_slot(var, "momentum")
                update = tf.assign_add(
                    gbar,
                    gbar * (self._momentum_tensor - 1.0) + scaled_g)
            else:
                update = scaled_g
            return tf.assign_sub(var, self._learning_rate_tensor * update)
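A NumPy sketch of the accumulator cycle described in the comment above, for a hypothetical 2-D variable (so A = min(A1[i], A2[j]) + G^2):

import numpy as np

grad = np.random.randn(3, 4).astype(np.float32)
a1, a2 = np.zeros(3), np.zeros(4)  # per-row and per-column accumulators
for _ in range(5):                 # five toy steps with the same gradient
    a = np.minimum(a1[:, None], a2[None, :]) + grad ** 2  # materialize A
    a1 = a.max(axis=1)             # A1[i] = max_j A[i, j]
    a2 = a.max(axis=0)             # A2[j] = max_i A[i, j]
precond_grad = grad / np.sqrt(a)   # A^{-0.5} * G
print(precond_grad.shape)          # (3, 4)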
Example #25
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 batch_size,
                 mode,
                 input_keep_prob,
                 output_keep_prob,
                 state_keep_prob,
                 beam_search,
                 beam_size,
                 schedule_sampling='linear',
                 sampling_decay_rate=0.99,
                 sampling_global_step=150000,
                 sampling_decay_steps=500,
                 pretrain_vec=None,
                 pretrain_trainable=False,
                 length_penalty=None,
                 length_penalty_factor=0.6):

        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.buckets = buckets
        # units of rnn cell
        self.size = size
        # dimension of words
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(0.5, trainable=False)
        self.mode = mode
        self.dummy_reply = ["what ?", "yeah .", "you are welcome ! ! ! !"]

        # learning rate decay
        self.learning_rate_decay = self.learning_rate.assign(
            self.learning_rate * 0.99)

        # input for Reinforcement part
        self.loop_or_not = tf.placeholder(tf.bool)
        self.reward = tf.placeholder(tf.float32, [None])
        batch_reward = tf.stop_gradient(self.reward)
        self.RL_index = [None for _ in self.buckets]

        # dropout
        self.input_keep_prob = input_keep_prob
        self.output_keep_prob = output_keep_prob
        self.state_keep_prob = state_keep_prob

        # beam search
        self.beam_search = beam_search
        self.beam_size = beam_size
        self.length_penalty = length_penalty
        self.length_penalty_factor = length_penalty_factor

        # if load pretrain word vector
        self.pretrain_vec = pretrain_vec
        self.pretrain_trainable = pretrain_trainable

        # schedule sampling
        self.sampling_probability_clip = None
        self.schedule_sampling = schedule_sampling
        if self.schedule_sampling == 'False': self.schedule_sampling = False
        self.init_sampling_probability = 1.0
        self.sampling_global_step = sampling_global_step
        self.sampling_decay_steps = sampling_decay_steps
        self.sampling_decay_rate = sampling_decay_rate

        if self.schedule_sampling == 'linear':
            self.decay_fixed = self.init_sampling_probability * (
                self.sampling_decay_steps / self.sampling_global_step)
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            self.sampling_probability_decay = tf.assign_sub(
                self.sampling_probability, self.decay_fixed)
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
            #self.sampling_probability = tf.maximum(self.sampling_probability,tf.constant(0.0))
        elif self.schedule_sampling == 'exp':
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            #self.sampling_probability = tf.train.exponential_decay(
            self.sampling_probability_decay = tf.assign(
                self.sampling_probability,
                tf.train.natural_exp_decay(self.sampling_probability,
                                           self.sampling_global_step,
                                           self.sampling_decay_steps,
                                           self.sampling_decay_rate,
                                           staircase=True))
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
        elif self.schedule_sampling == 'inverse_sigmoid':
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            # note: the 'inverse_sigmoid' schedule is approximated here with linear_cosine_decay
            self.sampling_probability_decay = tf.assign(
                self.sampling_probability,
                tf.train.linear_cosine_decay(
                    self.sampling_probability,
                    self.sampling_global_step,
                    self.sampling_decay_steps,
                ))
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
        elif not self.schedule_sampling:
            pass
        else:
            raise ValueError(
                "schedule_sampling must be one of the following: [linear|exp|inverse_sigmoid|False]"
            )

        w_t = tf.get_variable('proj_w', [self.trg_vocab_size, self.size])
        w = tf.transpose(w_t)
        b = tf.get_variable('proj_b', [self.trg_vocab_size])
        output_projection = (w, b)

        def sample_loss(labels, inputs):
            labels = tf.reshape(labels, [-1, 1])
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            # num_classes: the size of the full word-vector vocabulary; num_sampled: the sampled size used to compute the softmax.
            return tf.cast(tf.nn.sampled_softmax_loss(
                weights=local_w_t,
                biases=local_b,
                inputs=local_inputs,
                labels=labels,
                num_sampled=512,
                num_classes=self.trg_vocab_size),
                           dtype=tf.float32)

        softmax_loss_function = sample_loss

        #FIXME add RL function
        def seq2seq_multi(encoder_inputs,
                          decoder_inputs,
                          mode,
                          pretrain_vec=None):
            if pretrain_vec is not None:
                pad_num = self.src_vocab_size - pretrain_vec.shape[0]
                pretrain_vec = np.pad(pretrain_vec, [(0, pad_num), (0, 0)],
                                      mode='constant')
                tag_vec = pretrain_vec[:data_utils.SPECIAL_TAGS_COUNT]
                pretrain_vec = pretrain_vec[data_utils.SPECIAL_TAGS_COUNT:]
                special_tags = tf.get_variable(name="special_tags",
                                               initializer=tag_vec,
                                               trainable=True)
                embedding = tf.get_variable(name="embedding",
                                            initializer=pretrain_vec,
                                            trainable=self.pretrain_trainable)
                embedding = tf.concat([special_tags, embedding], 0)
            else:
                embedding = tf.get_variable("embedding",
                                            [self.src_vocab_size, self.size])
            loop_function_RL = None
            if mode == 'MLE':
                feed_previous = False
            elif mode == 'TEST':
                feed_previous = True

            # need loop_function
            elif mode == 'RL':
                feed_previous = True

                def loop_function_RL(prev, i):
                    prev = tf.matmul(
                        prev, output_projection[0]) + output_projection[1]
                    prev_index = tf.multinomial(tf.log(tf.nn.softmax(prev)), 1)

                    if i == 1:
                        for index, RL in enumerate(self.RL_index):
                            if RL is None:
                                self.RL_index[index] = prev_index
                                self.index = index
                                break
                    else:
                        self.RL_index[self.index] = tf.concat(
                            [self.RL_index[self.index], prev_index], axis=1)
                    #self.RL_index: [(?,9),(?,14),(?,24),(?,49)]
                    #RL_index holds the index of each sampled word
                    prev_index = tf.reshape(prev_index, [-1])
                    #prev_index: (?,)
                    # decide which to be the next time step input
                    sample = tf.nn.embedding_lookup(embedding, prev_index)
                    #sample: (?,256)
                    from_decoder = tf.nn.embedding_lookup(
                        embedding, decoder_inputs[i])
                    #from_decoder: (?,256)
                    return tf.where(self.loop_or_not, sample, from_decoder)
            self.loop_function_RL = loop_function_RL

            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=self.src_vocab_size,
                num_decoder_symbols=self.trg_vocab_size,
                embedding_size=self.size,
                output_projection=output_projection,
                feed_previous=feed_previous,
                dtype=tf.float32,
                embedding=embedding,
                beam_search=self.beam_search,
                beam_size=self.beam_size,
                loop=loop_function_RL,
                schedule_sampling=self.schedule_sampling,
                sampling_probability=self.sampling_probability_clip,
                length_penalty=self.length_penalty,
                length_penalty_factor=self.length_penalty_factor)

        # inputs
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        for i in range(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='encoder{0}'.format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='decoder{0}'.format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name='weight{0}'.format(i)))
        targets = [
            self.decoder_inputs[i + 1]
            for i in range(len(self.decoder_inputs) - 1)
        ]

        def single_cell():
            return tf.contrib.rnn.GRUCell(self.size)
            #return tf.contrib.rnn.BasicLSTMCell(self.size)

        cell = single_cell()
        if self.num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(self.num_layers)])
            cell = rnn.DropoutWrapper(cell,
                                      input_keep_prob=self.input_keep_prob,
                                      output_keep_prob=self.output_keep_prob,
                                      state_keep_prob=self.state_keep_prob)

        if self.mode == 'MLE':
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
                softmax_loss_function=softmax_loss_function)

            for b in range(len(self.buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1] for output in self.outputs[b]
                ]

            self.update = []
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in range(len(self.buckets)):
                gradients = tf.gradients(self.losses[b],
                                         tf.trainable_variables())
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                self.update.append(
                    optimizer.apply_gradients(
                        zip(clipped_gradients, tf.trainable_variables())))

        elif self.mode == 'TEST':
            self.buckets = [(10, 50), (15, 50), (25, 50), (50, 50)]

            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
                softmax_loss_function=softmax_loss_function)

            for b in range(len(self.buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1] for output in self.outputs[b]
                ]

        elif self.mode == 'RL':

            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
                softmax_loss_function=softmax_loss_function,
                per_example_loss=True)

            for b in range(len(self.buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1] for output in self.outputs[b]
                ]

            for i, b in enumerate(self.outputs):
                prev_index = tf.multinomial(
                    tf.log(tf.nn.softmax(b[self.buckets[i][1] - 1])), 1)
                #The line below fills in the last decoder output: RL_index is appended once each time loop_function is called inside the decoder, but the output of the final input is never fed back as prev into another loop_function call, so it has to be completed from the last element of self.outputs.
                self.RL_index[i] = tf.concat([self.RL_index[i], prev_index],
                                             axis=1)
            #self.outputs: list of 4 buckets, each (?,6258)

            self.update = []
            optimizer = tf.train.GradientDescentOptimizer(0.01)
            #optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in range(len(self.buckets)):
                scaled_loss = tf.multiply(self.losses[b], batch_reward)
                self.losses[b] = tf.reduce_mean(scaled_loss)
                gradients = tf.gradients(self.losses[b],
                                         tf.trainable_variables())
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                self.update.append(
                    optimizer.apply_gradients(
                        zip(clipped_gradients, tf.trainable_variables())))

        # specify saver
        self.saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
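
The 'RL' branch above scales the per-example sequence loss by a stopped-gradient reward before averaging, i.e. a REINFORCE-style objective. A minimal standalone sketch of that scaling (the placeholder names here are illustrative, not from the source):

import tensorflow as tf

per_example_loss = tf.placeholder(tf.float32, [None])  # e.g. -log p(sampled reply)
reward = tf.placeholder(tf.float32, [None])            # per-dialogue reward signal
# stop_gradient keeps the reward out of backprop, as with batch_reward above
rl_loss = tf.reduce_mean(per_example_loss * tf.stop_gradient(reward))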
Example #26
0
class KalmanFilter:

    def __init__(self):
        pass

    # Const Params
    with tf.variable_scope("kf_constants"):
        F = tf.constant([
            [1, 0, 0.2, 0],
            [0, 1, 0, 0.2],
            [0, 0, 1, 0],
            [0, 0, 0, 1]], dtype=tf.float32, name="kf_F")
        B = tf.constant([
            [1, 0, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 1]], dtype=tf.float32, name="kf_B")
        H = tf.constant([
            [1, 0, 0, 0],
            [0, 1, 0, 0],
            [0, 0, 1, 0],
            [0, 0, 0, 1]], dtype=tf.float32, name="kf_H")
        Q = tf.constant([
            [0.001, 0, 0, 0],
            [0, 0.001, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0]], dtype=tf.float32, name="kf_Q")
        R = tf.constant([
            [0.1, 0, 0, 0],
            [0, 0.1, 0, 0],
            [0, 0, 0.1, 0],
            [0, 0, 0, 0.1]], dtype=tf.float32, name="kf_R")

    # Inputs and Outputs
    with tf.variable_scope("kf_inputs_outputs"):
        x0 = tf.placeholder(dtype=tf.float32, shape=(4, 1), name="kf_x0") # Last coordinates
        P = tf.Variable([
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0]], dtype=tf.float32, name="kf_P") # 4 dynamic parameter: coordinates and velocity

    # Predict
    with tf.variable_scope("kf_predict"):
        xhat = tf.Variable([
            [0],
            [0],
            [0],
            [0]], dtype=tf.float32, name="kf_xhat")
        predict_xhat = tf.assign(xhat, tf.matmul(F, x0), name="kf_predict_xhat")
        predict_P = tf.assign(P, tf.matmul(F, tf.matmul(P, F, transpose_b=True)) + Q, name="kf_predict_P")

    # Correction
    with tf.variable_scope("kf_correction"):
        S = tf.matmul(H, tf.matmul(P, H, transpose_b=True)) + R
        K = tf.matmul(tf.matmul(P, H, transpose_b=True), tf.matrix_inverse(S))

        z = tf.matmul(H, x0, name="kf_z")
        y1 = z - tf.matmul(H, xhat)
        update_xhat = tf.assign_add(xhat, tf.matmul(K, y1), name="kf_update_xhat")
        delta_P = tf.matmul(K, tf.matmul(H, P))
        update_P = tf.assign_sub(P, delta_P, name="kf_update_P")
        init = tf.global_variables_initializer()
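
A minimal driver for the graph above, assuming it is built exactly as written (the fed state is illustrative). The predict ops compute xhat = F*x0 and P = F*P*F^T + Q; the correction ops form K = P*H^T*S^-1 and fold the innovation z - H*xhat back into xhat and P:

import numpy as np
import tensorflow as tf

kf = KalmanFilter()
with tf.Session() as sess:
    sess.run(kf.init)
    state = np.array([[1.0], [2.0], [0.1], [0.1]], dtype=np.float32)
    sess.run([kf.predict_xhat, kf.predict_P], feed_dict={kf.x0: state})  # predict
    sess.run(kf.update_xhat, feed_dict={kf.x0: state})                   # correct xhat
    x_est, _ = sess.run([kf.xhat, kf.update_P], feed_dict={kf.x0: state})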
Example #27
0
    def _apply_dense(self, grad: tf.Tensor, var: tf.Variable) -> tf.Operation:
        """Add ops to apply dense gradients to `var`.

        Args:
            grad: A gradient `Tensor`.
            var: A `Variable` object.

        Returns:
            An `Operation`.
        """

        alpha_t = tf.cast(self._alpha_t, var.dtype.base_dtype)
        lr_update_t = tf.cast(self._lr_update_t, var.dtype.base_dtype)
        lr_max_t = tf.cast(self._lr_max_t, var.dtype.base_dtype)
        lr_min_t = tf.cast(self._lr_min_t, var.dtype.base_dtype)
        steps = self._steps_t

        current_step = self._current_step
        global_step = self._global_step

        gk_old = self.get_slot(var, "gk_old")
        gk = self.get_slot(var, "gk")
        var_old = self.get_slot(var, "v_old")
        lr = self.get_slot(var, "lr")

        noise = tf.random_uniform(shape=tf.shape(var),
                                  minval=-1.0,
                                  maxval=+1.0)

        if self._norm == 'max':
            # compute normalization constant
            g_max = tf.reduce_max(tf.abs(grad))
            denominator = _EPSILON + g_max
            g_update_normed = grad / denominator
        else:
            g_update_normed = tf.nn.l2_normalize(grad)

        # compute update grad
        update_grad = lr * (g_update_normed + noise * alpha_t)
        var_update = tf.assign_sub(var, update_grad)

        beta = 0.9

        def update_grads():

            agg_grad = gk * beta + (1 - beta) * update_grad
            # agg_grad = gk + update_grad
            update_gk = tf.assign(gk, agg_grad)

            return tf.group([update_gk]), lr

        def reset_steps():

            agg_grad = gk * beta + (1 - beta) * update_grad
            # I did try it however it was not stable :/
            # dx = var - var_old
            # dg = gk - gk_old
            # s1 = tf.reduce_sum(tf.square(dx))
            # s2 = tf.abs(tf.reduce_sum(dx * dg)) + _EPSILON
            # eta = s1 / s2

            # update learning rate
            g_normed = tf.nn.l2_normalize(agg_grad)
            old_g_normed = tf.nn.l2_normalize(gk_old)
            lr_change = -lr_update_t * tf.reduce_sum(g_normed * old_g_normed)
            eta = lr * (1 - lr_change)

            with tf.control_dependencies([eta]):
                update_gk_old = tf.assign(gk_old, agg_grad)
                with tf.control_dependencies([update_gk_old]):
                    update_gk = tf.assign(gk, tf.zeros_like(gk))

                update_var_old = tf.assign(var_old, var)
                step_assign = tf.assign(current_step, 0)
                update_g = tf.group(
                    [update_gk_old, update_var_old, update_gk, step_assign])

            return update_g, eta

        with tf.control_dependencies([var_update]):
            updates, new_lr = tf.cond(tf.greater_equal(current_step, steps),
                                      true_fn=reset_steps,
                                      false_fn=update_grads)

        with tf.control_dependencies([updates]):
            new_lr = tf.cond(tf.greater_equal(
                tf.to_float(global_step) / tf.to_float(steps), 2),
                             true_fn=lambda: new_lr,
                             false_fn=lambda: lr)

        global_step_update = tf.assign_add(global_step, 1)
        step_update = tf.assign_add(current_step, 1)

        new_lr = tf.clip_by_value(new_lr, lr_min_t, lr_max_t)
        lr_update = tf.assign(lr, new_lr)

        update = tf.group([lr_update, step_update, global_step_update])

        return update
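
The rule in reset_steps() rescales the learning rate by the cosine alignment of consecutive aggregated gradients: aligned directions grow the step, opposing ones shrink it. A numeric sketch with illustrative values:

import numpy as np

lr, lr_update_t = 0.1, 0.5
g_normed = np.array([1.0, 0.0])      # current aggregated gradient, l2-normalized
old_g_normed = np.array([1.0, 0.0])  # previous aggregated gradient, l2-normalized
lr_change = -lr_update_t * np.sum(g_normed * old_g_normed)  # -0.5 when aligned
eta = lr * (1 - lr_change)           # 0.15: aligned gradients enlarge the step
# with old_g_normed = [-1.0, 0.0] the sign flips and eta = 0.05: opposing shrinks it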
Example #28
0
 def testAssignUpdateNoShape(self):
     var = state_ops.variable_op([1, 2], tf.float32, set_shape=False)
     added = tf.assign_add(var, self._NewShapelessTensor())
     self.assertEqual(tensor_shape.unknown_shape(), added.get_shape())
     subbed = tf.assign_sub(var, self._NewShapelessTensor())
     self.assertEqual(tensor_shape.unknown_shape(), subbed.get_shape())
Example #29
0
def sgd_update(grad, var, lr):
    delta = lr * grad
    return tf.assign_sub(var, delta)
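
A tiny self-contained check of the step above, w <- w - lr * grad (the values are illustrative):

import tensorflow as tf

w = tf.Variable([1.0, 2.0])
g = tf.constant([0.1, -0.2])     # stand-in for a real gradient
step = sgd_update(g, w, lr=0.5)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(step))        # [0.95, 2.1]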
Example #30
0
 def _eval_mean_update():
     difference = (1 - eval_mean_ema_decay) * (eval_mean -
                                               training_mean)
     return tf.assign_sub(eval_mean, difference)
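
Subtracting (1 - decay) * (eval_mean - training_mean) is algebraically the usual exponential moving average, new = decay * eval_mean + (1 - decay) * training_mean. A standalone sketch with assumed values:

import tensorflow as tf

eval_mean_ema_decay = 0.99
eval_mean = tf.Variable(0.0)
training_mean = tf.constant(1.0)
update = tf.assign_sub(eval_mean,
                       (1 - eval_mean_ema_decay) * (eval_mean - training_mean))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(update))  # 0.01; repeated runs drift toward 1.0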
Example #31
0
    def apply_updates(self):
        assert not self._updates_applied
        self._updates_applied = True
        devices = list(self._dev_grads.keys())
        total_grads = sum(len(grads) for grads in self._dev_grads.values())
        assert len(devices) >= 1 and total_grads >= 1
        ops = []
        with absolute_name_scope(self.scope):

            # Cast gradients to FP32 and calculate partial sum within each device.
            dev_grads = OrderedDict() # device => [(grad, var), ...]
            for dev_idx, dev in enumerate(devices):
                with tf.name_scope('ProcessGrads%d' % dev_idx), tf.device(dev):
                    sums = []
                    for gv in zip(*self._dev_grads[dev]):
                        assert all(v is gv[0][1] for g, v in gv)
                        g = [tf.cast(g, tf.float32) for g, v in gv]
                        g = g[0] if len(g) == 1 else tf.add_n(g)
                        sums.append((g, gv[0][1]))
                    dev_grads[dev] = sums

            # Sum gradients across devices.
            if len(devices) > 1:
                with tf.name_scope('SumAcrossGPUs'), tf.device(None):
                    for var_idx, grad_shape in enumerate(self._grad_shapes):
                        g = [dev_grads[dev][var_idx][0] for dev in devices]
                        if np.prod(grad_shape): # nccl does not support zero-sized tensors
                            g = tf.contrib.nccl.all_sum(g)
                        for dev, gg in zip(devices, g):
                            dev_grads[dev][var_idx] = (gg, dev_grads[dev][var_idx][1])

            # Apply updates separately on each device.
            for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
                with tf.name_scope('ApplyGrads%d' % dev_idx), tf.device(dev):

                    # Scale gradients as needed.
                    if self.use_loss_scaling or total_grads > 1:
                        with tf.name_scope('Scale'):
                            coef = tf.constant(np.float32(1.0 / total_grads), name='coef')
                            coef = self.undo_loss_scaling(coef)
                            grads = [(g * coef, v) for g, v in grads]

                    # Check for overflows.
                    with tf.name_scope('CheckOverflow'):
                        grad_ok = tf.reduce_all(tf.stack([tf.reduce_all(tf.is_finite(g)) for g, v in grads]))

                    # Update weights and adjust loss scaling.
                    with tf.name_scope('UpdateWeights'):
                        opt = self._dev_opt[dev]
                        ls_var = self.get_loss_scaling_var(dev)
                        if not self.use_loss_scaling:
                            ops.append(tf.cond(grad_ok, lambda: opt.apply_gradients(grads), tf.no_op))
                        else:
                            ops.append(tf.cond(grad_ok,
                                lambda: tf.group(tf.assign_add(ls_var, self.loss_scaling_inc), opt.apply_gradients(grads)),
                                lambda: tf.group(tf.assign_sub(ls_var, self.loss_scaling_dec))))

                    # Report statistics on the last device.
                    if dev == devices[-1]:
                        with tf.name_scope('Statistics'):
                            ops.append(autosummary(self.id + '/learning_rate', self.learning_rate))
                            ops.append(autosummary(self.id + '/overflow_frequency', tf.where(grad_ok, 0, 1)))
                            if self.use_loss_scaling:
                                ops.append(autosummary(self.id + '/loss_scaling_log2', ls_var))

            # Initialize variables and group everything into a single op.
            self.reset_optimizer_state()
            init_uninited_vars(list(self._dev_ls_var.values()))
            return tf.group(*ops, name='TrainingOp')
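
The UpdateWeights branch above implements dynamic loss scaling: every overflow-free step nudges the (log2) loss scale up by loss_scaling_inc, and any overflow drops it by loss_scaling_dec. Stripped to its core (the constants here are illustrative):

import tensorflow as tf

ls_var = tf.Variable(16.0)         # log2 of the current loss scale
grad_ok = tf.placeholder(tf.bool)  # were all gradients finite this step?
inc, dec = 0.0005, 1.0             # grow slowly, back off fast
adjust = tf.cond(grad_ok,
                 lambda: tf.assign_add(ls_var, inc),
                 lambda: tf.assign_sub(ls_var, dec))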
Example #32
0
def resnet_model_fn(features, labels, mode, params):
  """Our model_fn for ResNet to be used with our Estimator."""
  tf.summary.image('images', features, max_outputs=6)

  # build model
  net = resnet.ResNet(features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
  logits = net.build_model() 
  predictions = {
      'classes': tf.argmax(logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  # a. get loss coefficient
  pos_mask = tf.reduce_sum(
               tf.cast(
                 tf.greater_equal(
                   labels, tf.fill(tf.shape(labels), FLAGS.mask_thres)), 
                   tf.float32), 
             0)
  pos_curr_count = tf.cast(tf.greater(   pos_mask, 0), tf.float32)
  neg_curr_count = tf.cast(tf.less_equal(pos_mask, 0), tf.float32)
  pos_count = tf.Variable(tf.zeros(shape=[FLAGS.class_num,]),  trainable=False)
  neg_count = tf.Variable(tf.zeros(shape=[FLAGS.class_num,]),  trainable=False)
  neg_select = tf.cast(
                 tf.less_equal(
                    tf.random_uniform(
                      shape=[FLAGS.class_num,], 
                      minval=0, maxval=1,
                      seed = FLAGS.random_seed),
                    FLAGS.neg_select), 
                 tf.float32)
  tf.summary.histogram('pos_curr_count', pos_curr_count)
  tf.summary.histogram('neg_curr_count', neg_curr_count)
  tf.summary.histogram('neg_select', neg_select)
  with tf.control_dependencies([pos_curr_count, neg_curr_count, neg_select]):
    pos_count = tf.assign_sub(
                   tf.assign_add(pos_count, pos_curr_count),
                   tf.multiply(pos_count, neg_curr_count))
    neg_count = tf.assign_sub(
                   tf.assign_add(neg_count, tf.multiply(neg_curr_count, neg_select)),
                   tf.multiply(neg_count, pos_curr_count))
    tf.summary.histogram('pos_count', pos_count)
    tf.summary.histogram('neg_count', neg_count)
  pos_loss_coef = -1 * (tf.log((0.01 + pos_count)/10)/tf.log(10.0))
  pos_loss_coef = tf.where(
                    tf.greater(pos_loss_coef, tf.fill(tf.shape(pos_loss_coef), 0.01)),
                    pos_loss_coef,
                    tf.fill(tf.shape(pos_loss_coef), 0.01))
  pos_loss_coef = tf.multiply(pos_loss_coef, pos_curr_count)
  tf.summary.histogram('pos_loss_coef', pos_loss_coef)
  neg_loss_coef = -1 * (tf.log((8 + neg_count)/10)/tf.log(10.0))
  neg_loss_coef = tf.where(
                   tf.greater(neg_loss_coef, tf.fill(tf.shape(neg_loss_coef), 0.01)),
                   neg_loss_coef,
                   tf.fill(tf.shape(neg_loss_coef), 0.001))
  neg_loss_coef = tf.multiply(neg_loss_coef, tf.multiply(neg_curr_count, neg_select))
  tf.summary.histogram('neg_loss_coef', neg_loss_coef)
  loss_coef = tf.add(pos_loss_coef, neg_loss_coef)
  tf.summary.histogram('loss_coef', loss_coef)

  # b. get non-negative mask
  non_neg_mask = tf.fill(tf.shape(labels), -1.0, name='non_neg')
  non_neg_mask = tf.cast(tf.not_equal(labels, non_neg_mask), tf.float32)
  tf.summary.histogram('non_neg', non_neg_mask)

  # cal loss
  cross_entropy = tf.nn.weighted_cross_entropy_with_logits(
       logits=logits, targets=labels, pos_weight=12, name='sigmoid_cross_entropy')
  tf.summary.histogram('sigmoid_ce', cross_entropy)
  cross_entropy_cost = tf.reduce_sum(tf.reduce_mean(cross_entropy * non_neg_mask, axis=0) * loss_coef)

  # Create a tensor named cross_entropy for logging purposes.
  tf.identity(cross_entropy_cost, name='cross_entropy')
  tf.summary.scalar('cross_entropy', cross_entropy_cost)

  # Add weight decay to the loss. We exclude the batch norm variables because
  # doing so leads to a small improvement in accuracy.
  loss = cross_entropy_cost + FLAGS.weight_decay * tf.add_n(
    [tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name])

  if mode == tf.estimator.ModeKeys.TRAIN:
    # Scale the learning rate linearly with the batch size. When the batch size
    # is 256, the learning rate should be 0.1.
    lr_warmup = FLAGS.lr_warmup
    warmup_step = FLAGS.warmup
    warmup_decay_step = FLAGS.lr_warmup_decay_step
    warmup_decay_factor = FLAGS.lr_warmup_decay_factor
    global_step = tf.train.get_or_create_global_step()
    boundaries = [
        int(FLAGS.lr_decay_step * epoch) for epoch in [1, 2, 3, 4]]
    values = [
        FLAGS.lr * decay for decay in [1, 0.1, 0.01, 1e-3, 1e-4]]
    learning_rate = tf.train.piecewise_constant(
        tf.cast(global_step, tf.int32), boundaries, values)

    # Linear Scaling Rule and Gradual Warmup 
    lr = tf.cond(
                global_step < warmup_step,
                lambda: tf.train.exponential_decay(
                    lr_warmup, 
                    global_step,
                    warmup_decay_step,
                    warmup_decay_factor,
                    staircase=True
                    ),
                lambda: learning_rate
                )

    # Create a tensor named learning_rate for logging purposes.
    tf.identity(lr, name='learning_rate')
    tf.summary.scalar('learning_rate', lr)

    optimizer = tf.train.MomentumOptimizer(
        learning_rate=lr,
        momentum=FLAGS.opt_momentum)

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step)
  else:
    train_op = None

  # Build evaluate metrics
  accuracy = tf.metrics.accuracy(
      tf.argmax(labels, axis=1), predictions['classes'])
  metrics = {'accuracy': accuracy}
  tf.identity(accuracy[1], name='train_accuracy')
  tf.summary.scalar('train_accuracy', accuracy[1])

  return tf.estimator.EstimatorSpec(
      mode=mode,
      predictions=predictions,
      loss=loss,
      train_op=train_op,
      eval_metric_ops=metrics)
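
The pos_count/neg_count bookkeeping above follows the identity new_count = (count + is_now) - count * is_other: it extends a per-class streak by one and resets it to zero when the class flips (the negative path additionally subsamples with neg_select). In numpy, with illustrative values:

import numpy as np

count = np.zeros(3)
is_pos = np.array([1., 0., 1.])
count = (count + is_pos) - count * (1. - is_pos)  # -> [1, 0, 1]
is_pos = np.array([1., 1., 0.])
count = (count + is_pos) - count * (1. - is_pos)  # -> [2, 1, 0]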
Example #33
0
 def _eval_mean_update():
     difference = (1 - eval_mean_ema_decay) * (eval_mean - training_mean)
     return tf.assign_sub(eval_mean, difference)
Example #34
0
    def __init__(self, scope, trainer, global_step=None):
        with tf.variable_scope(scope):
            self.prob_of_random_goal = tf.Variable(
                FLAGS.initial_random_goal_prob,
                trainable=False,
                name="prob_of_random_goal",
                dtype=tf.float32)
            self.inputs = tf.placeholder(shape=[
                None, FLAGS.resized_height, FLAGS.resized_width,
                FLAGS.agent_history_length
            ],
                                         dtype=tf.float32,
                                         name="Inputs")

            self.prev_rewards = tf.placeholder(shape=[None],
                                               dtype=tf.float32,
                                               name="Prev_Rewards")

            self.prev_rewards_onehot = tf.one_hot(tf.cast(self.prev_rewards,
                                                          dtype=tf.int32),
                                                  2,
                                                  dtype=tf.float32,
                                                  name="Prev_Rewards_OneHot")

            self.prev_rewards = tf.expand_dims(self.prev_rewards,
                                               1,
                                               name="rewards")

            # self.prev_rewards_onehot = tf.expand_dims(self.prev_rewards, 0)

            self.prev_actions = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name="Prev_Actions")
            self.prev_actions_onehot = tf.one_hot(self.prev_actions,
                                                  FLAGS.nb_actions,
                                                  dtype=tf.float32,
                                                  name="Prev_Actions_OneHot")

            self.prev_goal = tf.placeholder(shape=[None, FLAGS.hidden_dim],
                                            dtype=tf.float32,
                                            name="Prev_Goals")

            self.image_summaries = []

            if FLAGS.game not in flags.SUPPORTED_ENVS:
                self.conv0 = tf.contrib.layers.conv2d(self.inputs,
                                                      16,
                                                      8,
                                                      4,
                                                      activation_fn=tf.nn.elu,
                                                      scope="conv0")
                with tf.variable_scope('conv0'):
                    tf.get_variable_scope().reuse_variables()
                    weights = tf.get_variable('weights')
                    grid = self.put_kernels_on_grid(weights)
                    self.image_summaries.append(
                        tf.summary.image('kernels', grid, max_outputs=1))
                self.conv = tf.contrib.layers.conv2d(self.conv0,
                                                     32,
                                                     4,
                                                     2,
                                                     activation_fn=tf.nn.elu,
                                                     scope="conv1")
            else:
                self.conv = tf.contrib.layers.conv2d(self.inputs,
                                                     32,
                                                     5,
                                                     2,
                                                     activation_fn=tf.nn.elu,
                                                     scope="conv1")
                with tf.variable_scope('conv1'):
                    tf.get_variable_scope().reuse_variables()
                    weights = tf.get_variable('weights')
                    grid = self.put_kernels_on_grid(weights)
                    self.image_summaries.append(
                        tf.summary.image('kernels', grid, max_outputs=1))

            with tf.variable_scope('inputs'):
                tf.get_variable_scope().reuse_variables()
                self.image_summaries.append(
                    tf.summary.image('input', self.inputs, max_outputs=100))

            self.conv_flat = tf.contrib.layers.flatten(self.conv)
            self.fc = tf.contrib.layers.fully_connected(
                self.conv_flat, FLAGS.hidden_dim)
            self.fc = tf.contrib.layers.layer_norm(self.fc)
            self.f_percept = tf.nn.elu(self.fc, name="Zt")

            if FLAGS.game not in flags.SUPPORTED_ENVS:
                self.f_percept = tf.concat([self.f_percept, self.prev_rewards],
                                           1,
                                           name="Zt_r")
            else:
                self.f_percept = tf.concat(
                    [self.f_percept, self.prev_rewards_onehot], 1, name="Zt_r")

            summary_f_percept_act = tf.contrib.layers.summarize_activation(
                self.f_percept)

            ############################################################################################################
            # Manager network

            if FLAGS.meta:
                self.f_Mspace = tf.concat([self.f_percept, self.prev_goal],
                                          1,
                                          name="Zt_r")
            else:
                self.f_Mspace = tf.identity(self.f_percept, name="Zt_r")

            self.f_Mspace = tf.contrib.layers.fully_connected(
                self.f_Mspace, FLAGS.hidden_dim)

            self.f_percept = tf.concat(
                [self.f_percept, self.prev_actions_onehot], 1, name="Zt_r")

            self.f_Mspace = tf.contrib.layers.layer_norm(self.f_Mspace)
            self.f_Mspace = tf.nn.elu(self.f_Mspace, name="St")
            summary_f_Mspace_act = tf.contrib.layers.summarize_activation(
                self.f_Mspace)

            m_rnn_in = tf.expand_dims(self.f_Mspace, [0], name="Mrnn_in")
            step_size = tf.shape(self.inputs)[:1]

            m_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                FLAGS.hidden_dim)
            m_c_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon),
                                np.float32)
            m_h_init = np.zeros((1, FLAGS.hidden_dim * FLAGS.manager_horizon),
                                np.float32)
            self.m_state_init = [m_c_init, m_h_init]
            m_c_in = tf.placeholder(
                tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon],
                name="Mrnn_c_in")
            m_h_in = tf.placeholder(
                tf.float32, [1, FLAGS.hidden_dim * FLAGS.manager_horizon],
                name="Mrnn_h_in")
            self.m_state_in = (m_c_in, m_h_in)
            m_state_in = tf.contrib.rnn.LSTMStateTuple(m_c_in, m_h_in)

            m_lstm_outputs, m_lstm_state = self.fast_dlstm(
                m_rnn_in, m_state_in, m_lstm_cell, FLAGS.manager_horizon,
                FLAGS.hidden_dim * FLAGS.manager_horizon)

            m_lstm_c, m_lstm_h = m_lstm_state
            self.m_state_out = (m_lstm_c[-1, :1, :], m_lstm_h[-1, :1, :])
            self.goals = tf.reshape(m_lstm_outputs, [-1, FLAGS.hidden_dim])
            self.normalized_goals = tf.contrib.layers.fully_connected(
                self.goals, FLAGS.hidden_dim, activation_fn=tf.tanh, scope="Gt")

            summary_goals = tf.contrib.layers.summarize_activation(
                self.normalized_goals)

            def randomize_goals(t):
                t = tf.cast(t, tf.int32)
                packed_tensors = tf.stack([
                    tf.random_normal([
                        FLAGS.hidden_dim,
                    ]), self.normalized_goals[t, :]
                ])

                to_update = tf.cond(
                    tf.less(
                        self.prob_of_random_goal,
                        tf.constant(FLAGS.final_random_goal_prob,
                                    dtype=tf.float32)),
                    lambda: tf.cast(
                        tf.multinomial(
                            tf.log([[
                                self.prob_of_random_goal,
                                tf.subtract(tf.constant(1.0), self.
                                            prob_of_random_goal)
                            ]]), 1)[0][0], tf.int32),
                    lambda: tf.constant(1, tf.int32))

                resulted_tensor = tf.gather(packed_tensors, to_update)

                return resulted_tensor

            self.randomized_goals = tf.map_fn(lambda t: randomize_goals(t),
                                              tf.to_float(
                                                  tf.range(0, step_size[0])),
                                              name="random_gt")

            summary_random_goals = tf.contrib.layers.summarize_activation(
                self.randomized_goals)

            self.decrease_prob_of_random_goal = tf.assign_sub(
                self.prob_of_random_goal,
                tf.constant(
                    (FLAGS.initial_random_goal_prob -
                     FLAGS.final_random_goal_prob) / FLAGS.explore_steps))

            m_fc_value_w = tf.get_variable(
                "M_Value_W",
                shape=[FLAGS.hidden_dim, 1],
                initializer=normalized_columns_initializer(1.0))
            self.m_value = tf.matmul(self.goals, m_fc_value_w,
                                     name="M_Value")  # self.goals is the reshaped manager rnn output

            summary_m_value_act = tf.contrib.layers.summarize_activation(
                self.m_value)

            ############################################################################################################

            # Worker network

            self.sum_prev_goals = tf.placeholder(
                shape=[None, FLAGS.hidden_dim],
                dtype=tf.float32,
                name="Prev_c_Goals_sum")

            w_rnn_in = tf.expand_dims(self.f_percept, [0], name="Wrnn_in")
            step_size = tf.shape(self.inputs)[:1]
            w_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                FLAGS.goal_embedding_size * FLAGS.nb_actions)
            w_c_init = np.zeros((1, w_lstm_cell.state_size.c), np.float32)
            w_h_init = np.zeros((1, w_lstm_cell.state_size.h), np.float32)
            self.w_state_init = [w_c_init, w_h_init]
            w_c_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.c],
                                    name="Wrnn_c_in")
            w_h_in = tf.placeholder(tf.float32, [1, w_lstm_cell.state_size.h],
                                    name="Wrnn_h_in")
            self.w_state_in = (w_c_in, w_h_in)
            w_state_in = tf.contrib.rnn.LSTMStateTuple(w_c_in, w_h_in)

            w_lstm_outputs, w_lstm_state = tf.nn.dynamic_rnn(
                w_lstm_cell,
                w_rnn_in,
                initial_state=w_state_in,
                sequence_length=step_size,
                time_major=False)

            w_lstm_c, w_lstm_h = w_lstm_state
            self.w_state_out = (w_lstm_c[:1, :], w_lstm_h[:1, :])
            Ut = tf.reshape(
                w_lstm_outputs,
                [step_size[0], FLAGS.nb_actions, FLAGS.goal_embedding_size],
                name="Ut")
            Ut_flat = tf.reshape(
                w_lstm_outputs,
                [step_size[0], FLAGS.nb_actions * FLAGS.goal_embedding_size],
                name="Ut_flat")

            summary_wrnn_act = tf.contrib.layers.summarize_activation(Ut)

            goal_encoding = tf.contrib.layers.fully_connected(
                self.sum_prev_goals,
                FLAGS.goal_embedding_size,
                biases_initializer=None,
                scope="goal_emb")

            interm_rez = tf.squeeze(
                tf.matmul(Ut, tf.expand_dims(goal_encoding, 2)), 2)
            interm_rez = tf.contrib.layers.flatten(interm_rez)
            self.w_policy = tf.nn.softmax(interm_rez, name="W_Policy")

            summary_w_policy_act = tf.contrib.layers.summarize_activation(
                self.w_policy)

            w_fc_value_w = tf.get_variable(
                "W_Value_W",
                shape=[
                    FLAGS.nb_actions * FLAGS.goal_embedding_size +
                    FLAGS.goal_embedding_size, 1
                ],
                initializer=normalized_columns_initializer(1.0))
            self.w_value = tf.matmul(tf.concat([Ut_flat, goal_encoding], 1),
                                     w_fc_value_w,
                                     name="W_Value")

            summary_w_value_act = tf.contrib.layers.summarize_activation(
                self.w_value)

            if scope != 'global':

                self.w_extrinsic_return = tf.placeholder(shape=[None],
                                                         dtype=tf.float32)
                self.m_extrinsic_return = tf.placeholder(shape=[None],
                                                         dtype=tf.float32)
                self.w_intrinsic_return = tf.placeholder(shape=[None],
                                                         dtype=tf.float32)

                def gather_state_at_horiz(t):
                    t = tf.cast(t, tf.int32)
                    f_Mspace_c = tf.gather(
                        self.f_Mspace,
                        tf.minimum(
                            t +
                            tf.constant(FLAGS.manager_horizon, dtype=tf.int32),
                            step_size[0] - 1))
                    return f_Mspace_c

                self.f_Mspace_c = tf.cast(tf.map_fn(
                    lambda t: gather_state_at_horiz(t),
                    tf.to_float(tf.range(0, step_size[0])),
                    name="state_at_horiz"),
                                          dtype=tf.float32)
                self.state_diff = self.f_Mspace_c - self.f_Mspace
                self.cos_sim_state_diff = self.cosine_distance(
                    tf.stop_gradient(self.state_diff),
                    self.normalized_goals,
                    dim=1)

                self.m_advantages = self.m_extrinsic_return - tf.stop_gradient(
                    tf.reshape(self.m_value, [-1]))
                self.goals_loss = -tf.reduce_sum(
                    self.m_advantages * self.cos_sim_state_diff)
                self.m_value_loss = FLAGS.m_beta_v * tf.reduce_sum(
                    tf.square(self.m_extrinsic_return -
                              tf.reshape(self.m_value, [-1])))

                self.actions = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="Actions")
                self.actions_onehot = tf.one_hot(self.actions,
                                                 FLAGS.nb_actions,
                                                 dtype=tf.float32,
                                                 name="Actions_Onehot")

                self.responsible_outputs = tf.reduce_sum(
                    self.w_policy * self.actions_onehot, [1])

                self.intrinsic_return = FLAGS.alpha * self.w_intrinsic_return
                self.total_return = self.w_extrinsic_return + self.intrinsic_return
                self.w_advantages = self.total_return - tf.stop_gradient(
                    tf.reshape(self.w_value, [-1]))

                # Loss functions
                self.w_value_loss = FLAGS.w_beta_v * tf.reduce_sum(
                    tf.square(self.total_return -
                              tf.reshape(self.w_value, [-1])))
                self.entropy = -tf.reduce_sum(
                    self.w_policy * tf.log(self.w_policy + 1e-7))

                self.w_policy_loss = -tf.reduce_sum(
                    tf.log(self.responsible_outputs + 1e-7) *
                    self.w_advantages) - self.entropy * FLAGS.beta_e

                self.loss = self.w_value_loss + self.w_policy_loss + self.m_value_loss + self.goals_loss

                local_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(
                    self.gradients, FLAGS.gradient_clip_value)

                self.worker_summaries = [
                    summary_f_percept_act, summary_f_Mspace_act, summary_goals,
                    summary_random_goals, summary_m_value_act,
                    summary_wrnn_act, summary_w_policy_act, summary_w_value_act
                ]
                for grad, weight in zip(grads, local_vars):
                    self.worker_summaries.append(
                        tf.summary.histogram(weight.name + '_grad', grad))
                    self.worker_summaries.append(
                        tf.summary.histogram(weight.name, weight))

                self.merged_summary = tf.summary.merge(self.worker_summaries)

                global_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(
                    zip(grads, global_vars))
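
decrease_prob_of_random_goal above anneals the exploration probability linearly: each run of the op subtracts a fixed increment, so after explore_steps runs it reaches the final value. A standalone sketch with assumed constants:

import tensorflow as tf

initial, final, explore_steps = 1.0, 0.05, 1000
prob = tf.Variable(initial, trainable=False)
decay = tf.assign_sub(prob, tf.constant((initial - final) / explore_steps))
# running `decay` explore_steps times takes prob from 1.0 down to 0.05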
Example #35
0
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    d_vars = []
    g_vars = []
    d_grads = []
    g_grads = []

    for grad,var in grads_and_vars:
        if var in self.gan.d_vars():
            d_vars += [var]
            d_grads += [grad]
        elif var in self.gan.g_vars():
            g_vars += [var]
            g_grads += [grad]
        else:
            raise("Couldn't find var in g_vars or d_vars")

    var_list = d_vars + g_vars

    with ops.init_scope():
        slots_list = []
        if self.config.include_slots:
            for name in self.optimizer.get_slot_names():
                for var in self.optimizer.variables():
                    slots_list.append(self._zeros_slot(var, "curl", "curl"))
    self._prepare()

    def _name(post, s):
        ss = s.split(":")
        return ss[0] + "_" + post + "_dontsave"


    v1 = [tf.Variable(v, name=_name("curl",v.name)) for v in var_list]
    slots_list = []
    slots_vars = []
    if self.config.include_slots:
        for name in self.optimizer.get_slot_names():
            for var in self.optimizer.variables():
                slots_vars += [var]
                slots_list.append(self._zeros_slot(var, "curl", "curl"))


    restored_vars = var_list + slots_vars
    tmp_vars = v1 + slots_list
    # store variables for resetting

    if self.config.beta_type == 'sga':
        Jgrads = tf.gradients(d_grads, d_vars, grad_ys=d_grads, stop_gradients=d_vars) + [tf.zeros_like(g) for g in g_vars]
    elif self.config.beta_type == 'magnitude':
        consensus_reg = [tf.square(g) for g in d_grads if g is not None]
        Jgrads = tf.gradients(consensus_reg, d_vars) + [tf.zeros_like(g) for g in g_vars]
    else:
        consensus_reg = 0.5 * sum(
                tf.reduce_sum(tf.square(g)) for g in d_grads if g is not None
        )
        Jgrads = tf.gradients(consensus_reg, d_vars, stop_gradients=d_vars) + [tf.zeros_like(g) for g in g_vars]

    g1s = d_grads + g_grads

    op1 = tf.group(*[tf.assign(w, v) for w,v in zip(tmp_vars, restored_vars)]) # store variables

    with tf.get_default_graph().control_dependencies([op1]):
        # store g2
        op3 = tf.group(*[tf.assign_sub(v, self._lr_t*grad) for grad,v in grads_and_vars])
        with tf.get_default_graph().control_dependencies([op3]):

            def curlcombine(g1,g2,_v1,_v2,curl,rho):
                stepsize = self._lr_t
                if curl == "mirror":
                    return self._gamma*(g1 + 2*g2)
                else:
                    return self._gamma*g1-rho*(g2-g1)/stepsize
            g2s = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
            if self.config.form == 'central':
                def central_step():
                    # restore v1, slots
                    op5 = tf.group(*[ tf.assign(w,v) for w,v in zip(restored_vars, tmp_vars)])
                    with tf.get_default_graph().control_dependencies([op5]):
                        back =  tf.group(*[tf.assign_sub(v, -self._lr_t*grad) for grad,v in grads_and_vars])
                        with tf.get_default_graph().control_dependencies([back]):
                            return tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
                def curlcombinecentral(g1,g2,_v1,_v2,curl,rho):
                    #stepsize = (_v2-_v1)/(g1+1e-8)
                    stepsize = self._lr_t
                    if curl == "mirror":
                        return self._gamma*(g1 + 2*g2)
                    else:
                        return self._gamma*g1-rho*(g2-g1)/(2*stepsize)

                g1s  = central_step()
                g3s = [curlcombinecentral(g1,g2,v1,v2,self.config.d_curl,self.d_rho) if v2 in d_vars else curlcombinecentral(g1,g2,v1,v2,self.config.g_curl,self.g_rho) for g1,g2,v1,v2 in zip(g1s,g2s,v1,var_list)]
            else:
                #forward
                g3s = [curlcombine(g1,g2,v1,v2,self.config.d_curl,self.d_rho) if v2 in d_vars else curlcombine(g1,g2,v1,v2,self.config.g_curl,self.g_rho) for g1,g2,v1,v2 in zip(g1s,g2s,v1,var_list)]
            # restore v1, slots
            op5 = tf.group(*[ tf.assign(w,v) for w,v in zip(restored_vars, tmp_vars)])
            with tf.get_default_graph().control_dependencies([op5]):
                flin = []
                for grad, jg in zip(g3s, Jgrads):
                    if jg is None or self._beta <= 0:
                        flin += [grad]
                    else:
                        flin += [grad + jg * self._beta]

                if self.config.orthonormal:
                    shapes = [self.gan.ops.shape(l) for l in flin]
                    u = [tf.reshape(l, [-1]) for l in flin[:len(d_vars)]]
                    v = [tf.reshape(l, [-1]) for l in Jgrads[:len(d_vars)]]
                    
                    def proj(u, v,shape):
                        dot = tf.tensordot(v, u, 1) / (tf.square(u)+1e-8)
                        dot = tf.maximum(-1.0, dot)
                        dot = tf.minimum(1.0, dot)
                        dot = dot * u
                        dot = tf.reshape(dot, shape)
                        return dot
                    proj_u1_v2 = [proj(_u, _v, _s) for _u, _v, _s in zip(u, v, shapes)]
                    flin = [_flin + self.gan.configurable_param(self.config.ortholambda) * proj for _flin, proj in zip(flin, proj_u1_v2)] + flin[len(d_vars):]

                step3 = list(zip(flin, var_list))
                op6 = self.optimizer.apply_gradients(step3.copy(), global_step=global_step, name=name)

                with tf.get_default_graph().control_dependencies([op6]):
                    return tf.no_op()
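
The op1/op3/op5 chain above is a store-probe-restore pattern: snapshot the weights, take a tentative assign_sub step, re-evaluate gradients at the probed point, then roll the weights back before applying the combined update. A minimal sketch (the names are illustrative):

import tensorflow as tf

w = tf.Variable(1.0)
w_backup = tf.Variable(0.0, name="w_backup_dontsave")
store = tf.assign(w_backup, w)                # like op1
with tf.control_dependencies([store]):
    probe = tf.assign_sub(w, 0.1)             # tentative step, like op3
    with tf.control_dependencies([probe]):
        restore = tf.assign(w, w_backup)      # roll back, like op5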
Example #36
0
File: svgd.py  Project: thobotics/RoMBRL
    def __init__(self, particles, cost_fun, tf_scope="default", batch_generator=None,
                 stepsize_schedule=ConstantStepsizeSchedule(0.1),
                 alpha=0.9, fudge_factor=1e-6, session=tf.get_default_session(),
                 dtype=tf.float64, seed=None):
        """ Initialize the sampler parameters and set up a tensorflow.Graph
            for later queries.

        Parameters
        ----------
        particles : List[tensorflow.Variable]
            List of particles each representing a (different) guess of the
            target parameters of this sampler.

        cost_fun : callable
            Function that takes `params` of *one* particle as input and
            returns a 1-d `tensorflow.Tensor` that contains the cost-value.
            Frequently denoted with `U` in literature.

        batch_generator : iterable, optional
            Iterable which returns dictionaries to feed into
            tensorflow.Session.run() calls to evaluate the cost function.
            Defaults to `None` which indicates that no batches shall be fed.

        stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule
            Iterator class that produces a stream of stepsize values that
            we can use in our samplers.
            See also: `pysgmcmc.stepsize_schedules`

        alpha : float, optional
            TODO: documentation
            Defaults to `0.9`.

        fudge_factor : float, optional
            TODO: documentation
            Defaults to `1e-6`.

        session : tensorflow.Session, optional
            Session object which knows about the external part of the graph
            (which defines `Cost`, and possibly batches).
            Used internally to evaluate (burn-in/sample) the sampler.

        dtype : tensorflow.DType, optional
            Type of elements of `tensorflow.Tensor` objects used in this sampler.
            Defaults to `tensorflow.float64`.

        seed : int, optional
            Random seed to use.
            Defaults to `None`.

        See Also
        ----------
        pysgmcmc.sampling.MCMCSampler:
            Base class for `SteinVariationalGradientDescentSampler` that
            specifies how actual sampling is performed (using iterator protocol,
            e.g. `next(sampler)`).

        """

        assert isinstance(alpha, (int, float))
        assert isinstance(fudge_factor, (int, float))
        # assert callable(cost_fun)

        # self.particles = tf.stack(particles)

        self.particles = particles

        # def cost_fun_wrapper(params):
        #     return tf.map_fn(lambda particle: cost_fun(particle), self.particles)
        # cost_fun_wrapper.__name__ = "potential_energy"  # cost_fun.__name__

        # super().__init__(
        self._init_basic(
            params=particles,
            cost_fun=cost_fun,  # cost_fun_wrapper,
            tf_scope=tf_scope,
            batch_generator=batch_generator,
            session=session, seed=seed, dtype=dtype,
            stepsize_schedule=stepsize_schedule
        )

        with tf.variable_scope(tf_scope, reuse=tf.AUTO_REUSE):

            fudge_factor = tf.constant(
                fudge_factor, dtype=self.dtype, name="fudge_factor"
            )

            self.epsilon = tf.Variable(
                stepsize_schedule.initial_value, dtype=self.dtype, name="stepsize"
            )

            stack_vectorized_params = tf.stack(self.vectorized_params)

            self.n_particles = tf.cast(
                # self.particles.shape[0], self.dtype
                stack_vectorized_params.shape[0], self.dtype
            )

            historical_grad = tf.get_variable(
                "historical_grad", stack_vectorized_params.shape, dtype=dtype,
                initializer=tf.zeros_initializer()
            )

        self.session.run(
            tf.variables_initializer([historical_grad, self.epsilon])
        )

        # lnpgrad = tf.squeeze(tf.gradients(self.cost, self.particles))
        grads = []
        for i, cost in enumerate(cost_fun):
            grads.append(tf.concat([vectorize(gradient) for gradient in tf.gradients(cost, self.particles[i])], axis=0))
        lnpgrad = tf.squeeze(grads)

        kernel_matrix, kernel_gradients = self.svgd_kernel(stack_vectorized_params)  # self.svgd_kernel(self.particles)

        grad_theta = tf.divide(
            tf.matmul(kernel_matrix, lnpgrad) + kernel_gradients,
            self.n_particles
        )

        historical_grad_t = tf.assign(
            historical_grad,
            alpha * historical_grad + (1. - alpha) * (grad_theta ** 2)
        )

        adj_grad = tf.divide(
            grad_theta,
            fudge_factor + tf.sqrt(historical_grad_t)
        )

        for i, particle in enumerate(self.particles):

            vectorized_Theta_t = tf.assign_sub(
                self.vectorized_params[i], self.epsilon * adj_grad[i]
            )
            start_idx = 0

            for j, param in enumerate(particle):
                flat_shape = tf.reduce_prod(param.shape)
                vectorized_param = vectorized_Theta_t[start_idx:start_idx+flat_shape]
                self.theta_t[i*len(particle) + j] = tf.assign(
                    param,
                    tf.reshape(vectorized_param, shape=param.shape),
                    name="theta_t_%d_%d" % (i, j)
                )
                start_idx += flat_shape
        return
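    # `self.svgd_kernel` is not shown in this snippet. Below is a minimal
    # sketch of the standard RBF kernel with summed kernel gradients from
    # Liu & Wang (2016); the method name and the fixed bandwidth `h` are
    # assumptions (implementations typically use the median heuristic).
    def svgd_kernel_sketch(self, theta, h=1.0):
        # theta: [n_particles, n_params]; k(x, y) = exp(-||x - y||^2 / h)
        pairwise_dists = tf.reduce_sum(
            tf.square(theta[:, None, :] - theta[None, :, :]), axis=-1
        )
        kernel_matrix = tf.exp(-pairwise_dists / h)
        # sum_j grad_{x_j} k(x_j, x_i) = (2/h) * (x_i * sum_j K_ij - (K theta)_i)
        kernel_gradients = (2. / h) * (
            theta * tf.reduce_sum(kernel_matrix, axis=1, keep_dims=True)
            - tf.matmul(kernel_matrix, theta)
        )
        return kernel_matrix, kernel_gradients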
Example #37
def build_graph(hp):
    # Build the TensorFlow graph of the loaded model together with the
    # parameter-update operations. The trainable variables of this graph are
    # the error values on a test-set batch.
    hp['dnn_hp']['save_dir'] = hp['trained_model_dir']
    dnn.DNN(utils.CIFAR_INPUT_SIZE, N_CLASSES, hp['dnn_hp'])
    dnn_saver = tf.train.Saver(var_list=tf.global_variables())

    with tf.variable_scope('DNN/EGT', reuse=tf.AUTO_REUSE):
        # Compute the loss on the test batch from the 'learnable_error'
        # parameter and the actual prediction. Note that learnable_error is
        # initialized to zero, so the initial update is equivalent to making
        # the network learn its own output.
        g = tf.get_default_graph()
        test_X_ph = g.get_tensor_by_name("DNN/X:0")
        trained_variables = [
            v for v in tf.global_variables() if v.name[:4] == 'DNN/'
        ]
        learnable_error = tf.Variable(tf.zeros([hp['batch_size'], N_CLASSES],
                                               tf.float32),
                                      name='learnable_error')
        tf.summary.histogram('learnable_error', learnable_error)

        logits = g.get_tensor_by_name('DNN/output_layer/logits:0')
        probs = g.get_tensor_by_name("DNN/probs:0")
        log_probs = tf.nn.log_softmax(logits, name='log_probs')
        new_probs = tf.nn.softmax(logits + learnable_error, name='new_loss')
        test_loss = tf.reduce_sum(log_probs * new_probs, name='test_loss')

        # Build updated graph
        train_X_ph = tf.placeholder(tf.float32,
                                    test_X_ph.get_shape(),
                                    name='X')
        opt = SGD(lr=hp['sub_lr'], momentum=hp['sub_momentum'], nesterov=True)
        with tf.variable_scope('adam_updates'):
            sub_updates = opt.get_updates(test_loss, trained_variables)
            updates_ops = tf.group(*sub_updates, name="updates_ops")
        replacements = utils.extract_update_dict(sub_updates)
        replacements[test_X_ph] = train_X_ph

    utils.graph_replace(test_loss,
                        replacements,
                        dst_scope='EGT/UpdatedDNN/',
                        src_scope='DNN/')

    with tf.variable_scope('EGT/'):
        # Compute loss of updated graph on train batch
        train_y_ph = tf.placeholder(tf.int32, [None], name='y')
        updated_logits = g.get_tensor_by_name(
            "EGT/UpdatedDNN/output_layer/logits:0")
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=updated_logits,
            labels=train_y_ph,
            name='generalization_xentropy')
        generalization_loss = tf.reduce_mean(xentropy,
                                             name='generalization_loss')

        # We learn 'learnable_error' by backpropagating from
        # 'generalization_loss' through the sub-SGD update on test_loss.
        # Note that this step is computationally expensive, since the gradient
        # computation needs second-order derivatives w.r.t. the model
        # parameters. Thus, you either need to keep the model capacity low or
        # apply this technique only to the top layers of your model
        # (TODO: implement this with a Hessian approximation?).
        lr = tf.constant(hp['lr'], name='lr')
        meta_gradients = tf.gradients(generalization_loss, learnable_error)
        meta_optimize = tf.assign_sub(learnable_error,
                                      lr * meta_gradients[0],
                                      name='optimize')

    init_ops = tf.variables_initializer(
        [v for v in tf.global_variables() if 'EGT/' in v.name])
    return (
        test_X_ph, train_X_ph, train_y_ph
    ), generalization_loss, test_loss, probs, new_probs, meta_optimize, updates_ops, init_ops, dnn_saver
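# A hedged usage sketch of the graph built above. `hp`, `test_X`, `train_X`
# and `train_y` are hypothetical inputs, not part of the source.
(phs, generalization_loss, test_loss, probs, new_probs,
 meta_optimize, updates_ops, init_ops, dnn_saver) = build_graph(hp)
test_X_ph, train_X_ph, train_y_ph = phs

with tf.Session() as sess:
    dnn_saver.restore(sess, tf.train.latest_checkpoint(hp['trained_model_dir']))
    sess.run(init_ops)
    feed = {test_X_ph: test_X, train_X_ph: train_X, train_y_ph: train_y}
    sess.run(updates_ops, feed)    # inner SGD step on test_loss
    sess.run(meta_optimize, feed)  # meta step on learnable_error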
Example #38
    def __init__(self, shape_list, seed_num=0):
        seed(seed_num)
        set_random_seed(seed_num)
        # Placeholders for input, output and dropout
        sequence_length = 28*28
        num_classes = 10
        self.shape_list = shape_list
        self.input_x = tf.placeholder(tf.float32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.lr_array = tf.placeholder(tf.float32, name="lr_array")
        self.alpha_array = tf.placeholder(tf.float32, name="alpha_array")
        initializer = tf.contrib.layers.xavier_initializer()
        # initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.1)

        with tf.name_scope("input"):
            self.P1 = tf.Variable(tf.eye(int(shape_list[0][0])))

            w1 = tf.get_variable("w1", shape=shape_list[0], initializer=initializer)
            y_1 = tf.concat([self.input_x, tf.tile(tf.ones([1, 1]), [tf.shape(self.input_x)[0], 1])], 1)
            r = tf.reduce_mean(y_1, 0, keep_dims=True)
            k = tf.matmul(self.P1, tf.transpose(r))
            self.delta_P1 = tf.divide(tf.matmul(k, tf.transpose(k)), self.alpha_array[0][0] + tf.matmul(r, k))
            self.P1 = tf.assign_sub(self.P1, self.delta_P1)

            y1 = tf.nn.relu(tf.matmul(y_1, w1, name="y1"))

        with tf.name_scope("output"):
            self.P2 = tf.Variable(tf.eye(int(shape_list[1][0])))
            w2 = tf.get_variable("w2", shape=shape_list[1], initializer=initializer)
            y_2 = tf.concat([y1, tf.tile(tf.ones([1, 1]), [tf.shape(y1)[0], 1])], 1)
            r = tf.reduce_mean(y_2, 0, keep_dims=True)
            k = tf.matmul(self.P2, tf.transpose(r))
            self.delta_P2 = tf.divide(tf.matmul(k, tf.transpose(k)), self.alpha_array[0][1] + tf.matmul(r, k))
            self.P2 = tf.assign_sub(self.P2, self.delta_P2)

            y2 = tf.matmul(y_2, w2, name="y2")

        scores = y2
        # Calculate mean cross-entropy loss
        with tf.name_scope("loss"):
            # losses = tf.square(self.scores - self.input_y)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=self.input_y)
            # self.loss = tf.reduce_mean(losses) + 5e-4 * (tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))
            self.loss = tf.reduce_mean(losses)

        # Accuracy
        with tf.name_scope("accuracy"):
            predictions = tf.argmax(scores, 1, name="predictions")
            correct_predictions = tf.equal(predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))

        self.optimizer = tf.train.MomentumOptimizer(self.lr_array[0][0], momentum=0.9)
        # self.optimizer = tf.train.GradientDescentOptimizer(self.lr_array[0])

        # back_forward
        grads_and_vars = self.optimizer.compute_gradients(self.loss, var_list=[w1, w2])
        for i, (g, v) in enumerate(grads_and_vars):
            if g is not None:
                grads_and_vars[i] = (tf.clip_by_norm(g, 10), v)
        grad_v_input = [self.owm(self.P1, grads_and_vars[0])]
        grad_v_out = [self.owm(self.P2, grads_and_vars[1])]
        self.back_forward = self.optimizer.apply_gradients([grad_v_input[0], grad_v_out[0]])
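    # `self.owm(...)` above is not shown in this snippet. Under the Orthogonal
    # Weights Modification scheme, the projector P left-multiplies the
    # gradient so that updates stay (approximately) orthogonal to previously
    # learned inputs. A minimal sketch of the missing helper (an assumption,
    # not the original source):
    def owm(self, P, grad_and_var):
        g, v = grad_and_var
        return tf.matmul(P, g), v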
Example #39
def cnn(x,
        filter_size,
        strides,
        pool_fn,
        pool_size,
        pool_strides,
        act_fn,
        dtype=tf.float32,
        add_bias=True,
        wd=None,
        init_std=None,
        init_method=None,
        batch_norm=True,
        scope="cnn",
        trainable=True,
        is_training=True,
        keep_ema=False,
        ext_wts=None):
    """Builds a convolutional neural networks.
  Each layer contains the following operations:
    1) Convolution, y = w * x.
    2) Additive bias (optional), y = w * x + b.
    3) Activation function (optional), y = g( w * x + b ).
    4) Pooling (optional).

  Args:
    x: Input variable.
    filter_size: Shape of the convolutional filters, list of 4-d int.
    strides: Convolution strides, list of 4-d int.
    pool_fn: Pooling functions, list of N callable objects.
    pool_size: Pooling field size, list of 4-d int.
    pool_strides: Pooling strides, list of 4-d int.
    act_fn: Activation functions, list of N callable objects.
    add_bias: Whether adding bias or not, bool.
    wd: Weight decay, float.
    scope: Scope of the model, str.
  """
    num_layer = len(filter_size)
    # x = tf.Print(x, ['x', tf.reduce_mean(x), tf.reduce_max(x)])
    h = x
    wt_dict = {}
    with tf.variable_scope(scope):
        for ii in range(num_layer):
            with tf.variable_scope("layer_{}".format(ii)):
                if init_method is not None and init_method[ii]:
                    _init_method = init_method[ii]
                else:
                    _init_method = "truncated_normal"
                if ext_wts is not None:
                    w = ext_wts["w_" + str(ii)]
                    if type(w) == np.ndarray:
                        w = tf.constant(w)
                    log.info("Found all weights from numpy array")
                else:
                    w = weight_variable(filter_size[ii],
                                        dtype=dtype,
                                        init_method=_init_method,
                                        init_param={
                                            "mean": 0.0,
                                            "stddev": init_std[ii]
                                        },
                                        wd=wd,
                                        name="w",
                                        trainable=trainable)
                wt_dict["w_" + str(ii)] = w
                if add_bias:
                    if ext_wts is not None:
                        b = ext_wts["b_" + str(ii)]
                        if type(b) == np.ndarray:
                            b = tf.constant(b)
                        log.info("Found all biases from numpy array")
                    else:
                        b = weight_variable([filter_size[ii][3]],
                                            dtype=dtype,
                                            init_method="constant",
                                            init_param={"val": 0},
                                            name="b",
                                            trainable=trainable)
                    wt_dict["b_" + str(ii)] = b
                h = tf.nn.conv2d(h,
                                 w,
                                 strides=strides[ii],
                                 padding="SAME",
                                 name="conv")
                if add_bias:
                    h = tf.add(h, b, name="conv_bias")

                if batch_norm:
                    # Batch normalization.
                    n_out = int(h.get_shape()[-1])
                    if ext_wts is not None:
                        assign_ema = False
                        beta = ext_wts["beta_" + str(ii)]
                        gamma = ext_wts["gamma_" + str(ii)]
                        emean = ext_wts["emean_" + str(ii)]
                        evar = ext_wts["evar_" + str(ii)]
                        if type(beta) == np.ndarray:
                            beta = tf.constant(ext_wts["beta_" + str(ii)])
                            gamma = tf.constant(ext_wts["gamma_" + str(ii)])
                            emean = tf.constant(ext_wts["emean_" + str(ii)])
                            evar = tf.constant(ext_wts["evar_" + str(ii)])
                        log.info("Found all BN weights from numpy array")
                    else:
                        assign_ema = True
                        beta = weight_variable([n_out],
                                               dtype=dtype,
                                               init_method="constant",
                                               init_param={"val": 0.0},
                                               name="beta")
                        gamma = weight_variable([n_out],
                                                dtype=dtype,
                                                init_method="constant",
                                                init_param={"val": 1.0},
                                                name="gamma")
                        emean = weight_variable([n_out],
                                                dtype=dtype,
                                                init_method="constant",
                                                init_param={"val": 0.0},
                                                name="ema_mean",
                                                trainable=False)
                        evar = weight_variable([n_out],
                                               dtype=dtype,
                                               init_method="constant",
                                               init_param={"val": 1.0},
                                               name="ema_var",
                                               trainable=False)

                    wt_dict["beta_" + str(ii)] = beta
                    wt_dict["gamma_" + str(ii)] = gamma
                    wt_dict["emean_" + str(ii)] = emean
                    wt_dict["evar_" + str(ii)] = evar

                    if is_training:
                        decay = 0.9
                        mean, var = tf.nn.moments(h, [0, 1, 2], name="moments")
                        if assign_ema:
                            # assert False
                            ema_mean_op = tf.assign_sub(
                                emean, (emean - mean) * (1 - decay))
                            ema_var_op = tf.assign_sub(evar, (evar - var) *
                                                       (1 - decay))
                            with tf.control_dependencies(
                                [ema_mean_op, ema_var_op]):
                                h = tf.nn.batch_normalization(
                                    h, mean, var, beta, gamma, 1e-5)
                        else:
                            h = (h - emean) / tf.sqrt(evar +
                                                      1e-5) * gamma + beta
                    else:
                        h = (h - emean) / tf.sqrt(evar + 1e-5) * gamma + beta

                if ii == num_layer - 1:
                    assert act_fn[ii] is None
                if act_fn[ii] is not None:
                    h = act_fn[ii](h, name="act")
                if pool_fn[ii] is not None:
                    _height = int(h.get_shape()[1])
                    _width = int(h.get_shape()[2])
                    h = pool_fn[ii](h,
                                    pool_size[ii],
                                    strides=pool_strides[ii],
                                    padding="VALID",
                                    name="pool")
                    _height = int(h.get_shape()[1])
                    _width = int(h.get_shape()[2])
                    log.info("After pool {} {}".format(_height, _width))
    return h, wt_dict
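# Hedged usage sketch of `cnn` above: a two-layer network on 32x32x3 inputs.
# The concrete sizes are illustrative assumptions; `weight_variable` and `log`
# come from the surrounding module.
x = tf.placeholder(tf.float32, [None, 32, 32, 3])
h, wt_dict = cnn(
    x,
    filter_size=[[3, 3, 3, 16], [3, 3, 16, 32]],
    strides=[[1, 1, 1, 1], [1, 1, 1, 1]],
    pool_fn=[tf.nn.max_pool, None],
    pool_size=[[1, 2, 2, 1], None],
    pool_strides=[[1, 2, 2, 1], None],
    act_fn=[tf.nn.relu, None],
    init_std=[0.1, 0.1])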
Example #40
    def testAssignUpdateNoValueShape(self):
        var = state_ops.variable_op([1, 2], tf.float32)
        added = tf.assign_add(var, self._NewShapelessTensor())
        self.assertEqual([1, 2], added.get_shape())
        subbed = tf.assign_sub(var, self._NewShapelessTensor())
        self.assertEqual([1, 2], subbed.get_shape())
Example #41
def batch_norm(x,
               is_training,
               gamma=None,
               beta=None,
               axes=[0, 1, 2],
               eps=1e-10,
               name="bn_out",
               decay=0.99,
               dtype=tf.float32):
    """Applies batch normalization.
    Collect mean and variances on x except the last dimension. And apply
    normalization as below:
    x_ = gamma * (x - mean) / sqrt(var + eps) + beta
    Args:
      x: Input tensor, [B, ...].
      n_out: Integer, depth of input variable.
      gamma: Scaling parameter.
      beta: Bias parameter.
      axes: Axes to collect statistics.
      eps: Denominator bias.
    Returns:
      normed: Batch-normalized variable.
      mean: Mean used for normalization (optional).
  """
    n_out = x.get_shape()[-1]
    try:
        n_out = int(n_out)
        shape = [n_out]
    except (TypeError, ValueError):
        shape = None
    emean = tf.get_variable("ema_mean",
                            shape=shape,
                            trainable=False,
                            dtype=dtype,
                            initializer=tf.constant_initializer(0.0,
                                                                dtype=dtype))
    evar = tf.get_variable("ema_var",
                           shape=shape,
                           trainable=False,
                           dtype=dtype,
                           initializer=tf.constant_initializer(1.0,
                                                               dtype=dtype))
    if is_training:
        mean, var = tf.nn.moments(x, axes, name="moments")
        ema_mean_op = tf.assign_sub(emean, (emean - mean) * (1 - decay))
        ema_var_op = tf.assign_sub(evar, (evar - var) * (1 - decay))
        normed = tf.nn.batch_normalization(x,
                                           mean,
                                           var,
                                           beta,
                                           gamma,
                                           eps,
                                           name=name)
        return normed, [ema_mean_op, ema_var_op]
    else:
        normed = tf.nn.batch_normalization(x,
                                           emean,
                                           evar,
                                           beta,
                                           gamma,
                                           eps,
                                           name=name)
        return normed, None
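# Hedged usage sketch of `batch_norm` above: the caller must run the returned
# EMA ops, e.g. by making them control dependencies of the train op. The
# variable scope name and shapes are illustrative assumptions.
with tf.variable_scope("bn_example"):
    x = tf.placeholder(tf.float32, [None, 16, 16, 8])
    gamma = tf.get_variable("gamma", [8], initializer=tf.constant_initializer(1.0))
    beta = tf.get_variable("beta", [8], initializer=tf.constant_initializer(0.0))
    normed, ema_ops = batch_norm(x, is_training=True, gamma=gamma, beta=beta)
    loss = tf.reduce_mean(tf.square(normed))
    with tf.control_dependencies(ema_ops):
        train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)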
Example #42
  def finite_differences(self, grads_and_vars, global_step, name, d_vars, g_vars, d_grads, g_grads):
    """Attempt to directly compute the Hessian and apply equation (6)."""
    d_grads = []
    g_grads = []
    d_vars = []
    g_vars = []
    beta = 0.5
    if self.config.beta is not None:
        beta = self.config.beta
    for grad,var in grads_and_vars:
        if var in self.gan.d_vars():
            d_vars += [var]
            d_grads += [grad]
        elif var in self.gan.g_vars():
            g_vars += [var]
            g_grads += [grad]
        else:
            raise ValueError("Couldn't find var in g_vars or d_vars")

    all_vars = d_vars + g_vars
    all_grads = d_grads + g_grads

    with ops.init_scope():
        [self._zeros_slot(v, "orig", self._name) for _,v in grads_and_vars]

    v1 = [self.get_slot(v, "orig") for v in all_vars]

    restored_vars = all_vars
    tmp_vars = v1

    e1 = 0.0001
    e2 = 0.0001

    #gamma12
    save = tf.group(*[tf.assign(w, v) for w,v in zip(tmp_vars.copy(), restored_vars.copy())]) # store variables

    def curl():
        grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
        op3 = tf.group(*[tf.assign_sub(v, self._lr_t*grad) for grad,v in zip(grads, all_vars)])
        with tf.get_default_graph().control_dependencies([op3]):
            def curlcombine(g1,g2):
                stepsize = self._lr_t
                return g1-(g2-g1)/stepsize
            new_grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
            g3s = [curlcombine(g1,g2) for g1,g2 in zip(grads,new_grads)]
            return g3s
 
    #gamma12
    if self.config.method == 'curl':
        all_grads = curl()
        d_grads = all_grads[:len(d_vars)]
        g_grads = all_grads[len(d_vars):]
    with tf.get_default_graph().control_dependencies([save]):
        #opboth = self.optimizer.apply_gradients(grads_and_vars, global_step=global_step, name=name)
        #opdp = self.optimizer.apply_gradients(grads_and_vars[:len(d_vars)], global_step=global_step, name=name)
        #opgp = self.optimizer.apply_gradients(grads_and_vars[len(d_vars):], global_step=global_step, name=name)
        restore = tf.group(*[tf.assign(w, v) for w,v in zip(restored_vars.copy(), tmp_vars.copy())]) # restore variables
        opboth = [tf.assign_sub(w, self._lr_t * v) for w,v in zip(all_vars.copy(), all_grads.copy())] # apply a joint gradient step
        with tf.get_default_graph().control_dependencies([tf.group(*opboth)]):
            if self.config.method == "curl":
                gboth = curl()
            else:
                gboth = tf.gradients(self.loss[0], d_vars) + tf.gradients(self.loss[1], g_vars)
            with tf.get_default_graph().control_dependencies([restore]):
                opd = opboth[:len(d_vars)]
                with tf.get_default_graph().control_dependencies([tf.group(*opd)]):
                    if self.config.method == "curl":
                        new_d_grads = curl()
                    else:
                        new_d_grads = tf.gradients(self.loss[0], d_vars) + tf.gradients(self.loss[1], g_vars)
                    with tf.get_default_graph().control_dependencies([restore]):
                        opg = opboth[len(d_vars):]
                        with tf.get_default_graph().control_dependencies([tf.group(*opg)]):
                            if self.config.method == "curl":
                                new_g_grads = curl()
                            else:
                                new_g_grads = tf.gradients(self.loss[0], d_vars) + tf.gradients(self.loss[1], g_vars)
                            with tf.get_default_graph().control_dependencies([restore]):
                                new_grads = []
                                for _gboth, _gd, _gg, _g in zip(gboth,new_d_grads,new_g_grads,(d_grads+g_grads)):
                                    a = (_gg - _g) / self._lr_t # d2f/dx2i
                                    b = (_gboth - _gg) / (2*self._lr_t)+(_gd-_g)/(2*self._lr_t) # d2f/dx1dx2
                                    c = (_gboth - _gd) / (2*self._lr_t)+(_gg-_g)/(2*self._lr_t) # d2f/dx1dx2
                                    c = -c
                                    d = -(_gd - _g) / self._lr_t # d2f/dx2j
                                    if self.config.form == 5:
                                        a = (_gg - _g) / self._lr_t # d2f/dx2i
                                        b = (_gboth - _gg) / (2*self._lr_t)+(_gd-_g)/(2*self._lr_t) # d2f/dx1dx2
                                        c = (_gboth - _gd) / (2*self._lr_t)+(_gg-_g)/(2*self._lr_t) # d2f/dx1dx2
                                        d = (_gd - _g) / self._lr_t # d2f/dx2j
                                    J = np.array([[a, b], [c,d]])
                                    Jt = np.transpose(J)

                                    det = a*d-b*c+1e-8
                                    #h_1 = 1.0/det * (b+d-a-c)
                                    h_1_a = d/det
                                    h_1_b = -b/det
                                    h_1_c = -c/det
                                    h_1_d = a/det
                                    Jinv = np.array([[h_1_a,h_1_b],[h_1_c,h_1_d]])
                                    _j = Jt[0][0]*Jinv[0][0]*_g+Jt[1][0]*Jinv[1][0]*_g+Jt[0][1]*Jinv[0][1]*_g+Jt[1][1]*Jinv[1][1]*_g

                                    alpha = 0.5
                                    if self.config.alpha is not None:
                                        alpha = self.config.alpha
                                    beta = 0.5
                                    if self.config.beta is not None:
                                        beta = self.config.beta

                                    new_grads.append( alpha*_g + beta*_j )

                                new_grads_and_vars = list(zip(new_grads, all_vars)).copy()
                                return self.optimizer.apply_gradients(new_grads_and_vars, global_step=global_step, name=name)
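# Sanity check (illustrative) of the closed-form 2x2 inverse used above:
# for J = [[a, b], [c, d]], J^{-1} = [[d, -b], [-c, a]] / (ad - bc); the code
# adds 1e-8 to the determinant for numerical stability.
import numpy as np

a, b, c, d = 1.0, 2.0, 3.0, 5.0
det = a * d - b * c
Jinv = np.array([[d, -b], [-c, a]]) / det
assert np.allclose(Jinv, np.linalg.inv(np.array([[a, b], [c, d]])))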
Example #43
def split_softmax(prelogits, label, num_classes, 
                global_step, weight_decay, gamma=16.0, reuse=None):
    nrof_features = prelogits.shape[1].value
    batch_size = tf.shape(prelogits)[0]
    with tf.variable_scope('SplitSoftmax', reuse=reuse):
        weights = tf.get_variable('weights', shape=(num_classes, nrof_features),
                regularizer=slim.l2_regularizer(weight_decay),
                initializer=slim.xavier_initializer(),
                # initializer=tf.truncated_normal_initializer(stddev=0.1),
                # initializer=tf.constant_initializer(0),
                trainable=True,
                dtype=tf.float32)
        alpha = tf.get_variable('alpha', shape=(),
                regularizer=slim.l2_regularizer(1e-2),
                initializer=tf.constant_initializer(1.00),
                trainable=True,
                dtype=tf.float32)
        beta = tf.get_variable('beta', shape=(),
                # regularizer=slim.l2_regularizer(1e-2),
                initializer=tf.constant_initializer(0.0),
                trainable=True,
                dtype=tf.float32)
        sigma = tf.get_variable('sigma', shape=(),
                regularizer=slim.l2_regularizer(1e-1),
                initializer=tf.constant_initializer(1.0),
                trainable=True,
                dtype=tf.float32)
        threshold_pos = tf.get_variable('threshold_pos', shape=(),
                initializer=tf.constant_initializer(16.0),
                trainable=False, 
                dtype=tf.float32)
        threshold_neg = tf.get_variable('threshold_neg', shape=(),
                initializer=tf.constant_initializer(0.0),
                trainable=False, 
                dtype=tf.float32)

        # Normalize the vectors
        weights_normed = tf.nn.l2_normalize(weights, dim=1)
        prelogits_normed = tf.nn.l2_normalize(prelogits, dim=1)
        # weights_normed = weights
        # prelogits_normed = prelogits

        # Calculate centers
        centers, label_center, center_idx, center_weight = centers_by_label(prelogits_normed, label)
        centers = tf.gather(centers, center_idx)
        centers_normed = tf.nn.l2_normalize(centers, dim=1)

        coef = 1.0
        # Label and logits between batch and examplars
        label_mat_glob = tf.one_hot(label, num_classes, dtype=tf.float32)
        label_mask_pos_glob = tf.cast(label_mat_glob, tf.bool)
        label_mask_neg_glob = tf.logical_not(label_mask_pos_glob)
        # label_exp_batch = tf.expand_dims(label, 1)
        # label_exp_glob = tf.expand_dims(label_history, 1)
        # label_mat_glob = tf.equal(label_exp_batch, tf.transpose(label_exp_glob))
        # label_mask_pos_glob = tf.cast(label_mat_glob, tf.bool)
        # label_mask_neg_glob = tf.logical_not(label_mat_glob)

        # dist_mat_glob = euclidean_distance(prelogits_normed, tf.transpose(weights_normed), False)
        dist_mat_glob = tf.matmul(prelogits_normed, tf.transpose(weights_normed)) # + beta
        dist_pos_glob = tf.boolean_mask(dist_mat_glob, label_mask_pos_glob)
        dist_neg_glob = tf.boolean_mask(dist_mat_glob, label_mask_neg_glob)

        logits_glob = coef * dist_mat_glob
        logits_pos_glob = tf.boolean_mask(logits_glob, label_mask_pos_glob)
        logits_neg_glob = tf.boolean_mask(logits_glob, label_mask_neg_glob)


        # Label and logits within batch
        label_exp_batch = tf.expand_dims(label, 1)
        label_mat_batch = tf.equal(label_exp_batch, tf.transpose(label_exp_batch))
        label_mask_pos_batch = tf.cast(label_mat_batch, tf.bool)
        label_mask_neg_batch = tf.logical_not(label_mask_pos_batch)
        mask_non_diag = tf.logical_not(tf.cast(tf.eye(batch_size), tf.bool))
        label_mask_pos_batch = tf.logical_and(label_mask_pos_batch, mask_non_diag)

        # dist_mat_batch = euclidean_distance(prelogits_normed, tf.transpose(prelogits_normed), False)
        dist_mat_batch = tf.matmul(prelogits_normed, tf.transpose(prelogits_normed))
        dist_pos_batch = tf.boolean_mask(dist_mat_batch, label_mask_pos_batch)
        dist_neg_batch = tf.boolean_mask(dist_mat_batch, label_mask_neg_batch)

        logits_batch =  coef * dist_mat_batch
        logits_pos_batch = tf.boolean_mask(logits_batch, label_mask_pos_batch)
        logits_neg_batch = tf.boolean_mask(logits_batch, label_mask_neg_batch)


        # num_anchor = 32
        # prelogits_anchor = tf.reshape(prelogits_normed[:num_anchor], [num_anchor, 1, nrof_features])
        # prelogits_refer = tf.reshape(prelogits_normed[num_anchor:], [num_anchor, -1, nrof_features])
        # dist_anchor = tf.reduce_sum(tf.square(prelogits_anchor-prelogits_refer), axis=2)
        # dist_anchor = tf.reshape(dist_anchor, [-1])
        # logits_anchor = -0.5 * gamma * dist_anchor
        

        logits_pos = logits_pos_glob
        logits_neg = logits_neg_glob
    
        dist_pos = dist_pos_glob
        dist_neg = dist_neg_glob

        # epsilon_trsd = 0.3
        t_pos = coef * (threshold_pos)
        t_neg = coef * (threshold_neg)


        if gamma == 'auto':
            # gamma = tf.nn.softplus(alpha)
            gamma = tf.log(tf.exp(1.0) + tf.exp(alpha))
        elif type(gamma) == tuple:
            t_min, decay = gamma
            epsilon = 1e-5
            t = t_min + 1.0/(epsilon + decay*tf.cast(global_step, tf.float32))
            gamma = 1.0 / t
        else:
            assert type(gamma) == float
            gamma = tf.constant(gamma)

        hinge_loss = lambda x: tf.nn.relu(1.0 + x)
        margin_func = hinge_loss

        # Losses
        losses = []
        # num_pos = tf.cast(0.95 * tf.cast(tf.size(logits_pos), tf.float32), tf.int32)
        # # num_neg = tf.cast(0.75 * tf.cast(tf.size(logits_neg), tf.float32), tf.int32)
        # q_d = tf.pow(tf.sqrt(dist_neg), 2-nrof_features)*tf.pow(1-0.25*dist_neg, (3-nrof_features)/2)
        # tf.add_to_collection('watch_list', ('q_d', tf.reduce_sum(q_d)))
        # q_d = tf.minimum(1.0, 1 * q_d / tf.reduce_sum(q_d))
        # tf.add_to_collection('watch_list', ('q_d', tf.reduce_mean(q_d)))
        # sample_mask = tf.random_uniform(shape=tf.shape(logits_neg)) <= q_d
        # sample_mask = logits_neg >= tf.reduce_min(logits_pos)
        # _logits_neg = tf.boolean_mask(logits_neg, sample_mask)
        # tf.add_to_collection('watch_list', ('sample_ratio', 
        #    tf.cast(tf.size(_logits_neg),tf.float32) / tf.cast(tf.size(logits_neg),tf.float32)))
               

        # gamma2 = 1 / 0.01
        _logits_pos = tf.reshape(logits_pos, [batch_size, -1])
        _logits_neg = tf.reshape(logits_neg, [batch_size, -1])

        norm = tf.square(tf.reduce_sum(tf.square(prelogits), axis=1, keep_dims=True))
        norm_weights = tf.norm(tf.gather(weights, label), axis=1, keep_dims=True)
        t_pos = (beta)
        t_neg = (beta)


        _logits_pos =  _logits_pos * gamma 
        _logits_neg =  _logits_neg * gamma
        # _logits_neg, _ = tf.nn.top_k(_logits_neg, num_neg)
        # _logits_pos, _ = tf.nn.top_k(_logits_pos, num_pos)
        # _logits_neg = tf.boolean_mask(_logits_neg, sample_mask)
        # _logits_pos = -tf.reduce_logsumexp(-_logits_pos)# , axis=1)[:,None]
        _logits_neg = tf.reduce_logsumexp(_logits_neg, axis=1)[:,None]
        # _logits_pos = tf.reduce_mean(_logits_pos)
        #-- Simulate Ranking
        # se_neg = tf.reduce_sum(tf.exp(_logits_neg))
        # min_pos = tf.reduce_min(_logits_pos)
        # t_pos = tf.stop_gradient(tf.log(se_neg))
        # t_neg = tf.stop_gradient(tf.log(se_neg - tf.exp(_logits_neg)))
        

        # norm = tf.reshape(prelogits[:,-1], [batch_size, -1])
        # norm_weighted = tf.exp(-norm)
        # norm_weighted = norm / tf.reduce_sum(norm) * tf.cast(tf.size(norm), tf.float32)

        # sigma_batch = tf.reshape(tf.gather(sigma, label), [batch_size, -1])

        m = 5.0
        # tf.add_to_collection('watch_list', ('m',m))

        factor = 1 / tf.cast(batch_size, tf.float32)
        bias = tf.log(tf.cast(num_classes, tf.float32))
        loss_pos = tf.nn.relu(m + _logits_neg - _logits_pos) * 0.5
        loss_neg = tf.nn.relu(m + _logits_neg - _logits_pos) * 0.5
        loss = tf.reduce_mean((loss_pos + loss_neg), name='split_loss')
        losses.extend([loss])
        tf.add_to_collection('watch_list', ('split_loss', loss))

        # Global loss
        # weights_batch = tf.gather(weights_normed, label)
        # _logits_pos_glob = tf.reduce_sum(tf.square(prelogits_normed - weights_batch), axis=1)  * coef * gamma
        _logits_pos_glob = tf.reshape(logits_pos_glob, [batch_size, -1]) * gamma
        _logits_neg_glob = tf.reshape(logits_neg_glob, [batch_size, -1]) * gamma
        _logits_neg_glob = tf.reduce_logsumexp(_logits_neg_glob) # , axis=1)[:,None]
        loss_glob = tf.reduce_mean(tf.nn.relu(1 + _logits_neg_glob - _logits_pos_glob), name='loss_glob')
        # losses.append(loss_glob)
        # tf.add_to_collection('watch_list', ('loss_glob', loss_glob))

        # Weight decay
        loss_weight = tf.reduce_sum( 1e-7 * tf.square(weights_normed), name='loss_weight')
        # losses.append(loss_weight)
        # tf.add_to_collection('watch_list', ('loss_weight', loss_weight))

        # Split Softmax
        # _logits_pos_glob = tf.reshape(logits_pos_glob, [batch_size, -1]) * gamma
        # _logits_neg_glob = tf.reshape(logits_neg_glob, [batch_size, -1]) * gamma
        # _logits_pos_glob = tf.log(tf.reduce_sum(tf.exp(_logits_pos_glob) + num_classes-1, axis=1)[:,None])
        # _logits_neg_glob = tf.reduce_logsumexp(_logits_neg_glob, axis=1)[:,None]
        # _t_pos = t_pos * gamma
        # _t_neg = t_neg * gamma
        # loss_pos = tf.reduce_mean(tf.nn.softplus(_t_pos - _logits_pos_glob), name='loss_pos')
        # loss_neg = tf.reduce_mean(tf.nn.softplus(_logits_neg_glob - _t_neg), name='loss_neg')
        # losses.extend([loss_pos, loss_neg])



        # Batch Center loss
        # centers_batch = tf.gather(centers, center_idx)
        centers_batch = tf.gather(weights_normed, label)
        dist_center = tf.reduce_sum(tf.square(prelogits_normed - centers_batch), axis=1)
        loss_center = tf.reduce_mean(1.0*dist_center, name='loss_center')
        # losses.append(loss_center)
        # tf.add_to_collection('watch_list', ('loss_center', loss_center))


        # Update threshold
        if threshold_pos not in tf.trainable_variables():
            # -- Mean threshold        
            mean_pos, var_pos = tf.nn.moments(dist_pos, axes=[0])
            mean_neg, var_neg = tf.nn.moments(dist_neg, axes=[0])
            std_pos = tf.sqrt(var_pos)
            std_neg = tf.sqrt(var_neg)
            threshold_batch = std_neg*mean_pos / (std_pos+std_neg) + std_pos*mean_neg / (std_pos+std_neg)
            threshold_pos_batch = threshold_neg_batch = threshold_batch
            # -- Logits
            # threshold_pos_batch = tf.reduce_logsumexp(_logits_neg)
            # threshold_neg_batch = -tf.reduce_logsumexp(-_logits_pos)
            # -- Quantile
            # diff_pos_sorted, _ = tf.nn.top_k(logits_pos, 2)
            # diff_neg_sorted, _ = tf.nn.top_k(logits_neg, 2704237)
            # threshold_pos_batch = diff_neg_sorted[-1]
            # threshold_neg_batch = diff_pos_sorted[-1]
            threshold_neg_batch = tf.reduce_min(_logits_pos)
            threshold_pos_batch = tf.reduce_max(_logits_neg)
            # -- Update
            diff_threshold_pos = threshold_pos - threshold_pos_batch
            diff_threshold_neg = threshold_neg - threshold_neg_batch
            diff_threshold_pos = 0.1 * diff_threshold_pos
            diff_threshold_neg = 0.1 * diff_threshold_neg
            threshold_pos_update_op = tf.assign_sub(threshold_pos, diff_threshold_pos)
            threshold_neg_update_op = tf.assign_sub(threshold_neg, diff_threshold_neg)
            threshold_update_op = tf.group(threshold_pos_update_op, threshold_neg_update_op)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, threshold_update_op)


        # Update centers
        if weights not in tf.trainable_variables():
            weights_batch = tf.gather(weights, label)
            diff_centers = weights_batch - prelogits
            unique_label, unique_idx, unique_count = tf.unique_with_counts(label)
            appear_times = tf.gather(unique_count, unique_idx)
            appear_times = tf.reshape(appear_times, [-1, 1])
            diff_centers = diff_centers / tf.cast((1 + appear_times), tf.float32)
            diff_centers = 0.5 * diff_centers
            centers_update_op = tf.scatter_sub(weights, label, diff_centers)
            # centers_decay_op = tf.assign_sub(weights, 2*weight_decay*weights)# weight decay
            centers_update_op = tf.group(centers_update_op)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, centers_update_op)

        # if not sigma in tf.trainable_variables(): 
        #     weights_batch = tf.gather(weights, label)
        #     diff_centers = weights_batch - prelogits
        #     _, var_pos = tf.nn.moments(diff_centers, axes=[0])
        #     sigma_batch = tf.reduce_mean(tf.sqrt(var_pos))
        #     diff_sigma = sigma - sigma_batch
        #     diff_sigma = 0.01 * diff_sigma
        #     sigma_update_op = tf.assign_sub(sigma, diff_sigma)
        #     tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, sigma_update_op)



        # Analysis
        mean_dist_pos = tf.reduce_mean(dist_pos, name='mean_dist_pos')
        mean_dist_neg = tf.reduce_mean(dist_neg, name='mean_dist_neg')
        acc_pos = tf.reduce_mean(tf.cast(tf.greater_equal(logits_pos, t_pos), tf.float32), name='acc_pos')
        acc_neg = tf.reduce_mean(tf.cast(tf.less(logits_neg, t_neg), tf.float32), name='acc_neg')
        tf.summary.scalar('threshold_pos', threshold_pos)
        tf.summary.scalar('mean_dist_pos', mean_dist_pos)
        tf.summary.scalar('mean_dist_neg', mean_dist_neg)
        tf.summary.scalar('acc_pos', acc_pos)
        tf.summary.scalar('acc_neg', acc_neg)
        tf.summary.scalar('gamma', gamma)
        tf.summary.scalar('alpha', alpha)
        tf.summary.scalar('beta', beta)
        tf.summary.histogram('dist_pos', dist_pos)
        tf.summary.histogram('dist_neg', dist_neg)
        # tf.summary.histogram('dist_neg_min', _logits_neg / coef)
        # tf.summary.histogram('sigma', sigma)

        # tf.add_to_collection('watch_list', ('alpha', alpha))
        tf.add_to_collection('watch_list', ('gamma', gamma))
        tf.add_to_collection('watch_list', ('alpha', alpha))
        tf.add_to_collection('watch_list', ('beta', beta))
        # tf.add_to_collection('watch_list', ('t_pos', t_pos))
        # tf.add_to_collection('watch_list', ('t_neg', tf.reduce_mean(t_neg)))
        # tf.add_to_collection('watch_list', ('dpos', mean_dist_pos))
        # tf.add_to_collection('watch_list', ('dneg', mean_dist_neg))
        # tf.add_to_collection('watch_list', ('loss_pos', loss_pos))
        # tf.add_to_collection('watch_list', ('loss_neg', loss_neg))
        # tf.add_to_collection('watch_list', ('sigma', sigma))
        # tf.add_to_collection('watch_list', ('logits_pos', tf.reduce_mean(_logits_pos)))
        # tf.add_to_collection('watch_list', ('logits_neg', tf.reduce_mean(_logits_neg)))
        # tf.add_to_collection('watch_list', ('acc_pos', acc_pos))
        # tf.add_to_collection('watch_list', ('acc_neg', acc_neg))

    return losses
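# Illustrative check of the batch threshold used in the update block above:
#   t = (std_neg * mean_pos + std_pos * mean_neg) / (std_pos + std_neg),
# i.e. the standard-deviation-weighted midpoint between the positive and
# negative distance means; with equal stds it reduces to the plain midpoint.
mean_pos, std_pos = 0.8, 0.1
mean_neg, std_neg = 0.2, 0.1
t = (std_neg * mean_pos + std_pos * mean_neg) / (std_pos + std_neg)
assert abs(t - 0.5) < 1e-12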
Example #44
    def _anneal_learning_rate(self):
        return tf.cond(
            self.learning_rate > 0.0,
            lambda: tf.assign_sub(self.learning_rate, self.delta_lr),
            lambda: tf.assign(self.learning_rate, 0.0))
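# Hedged usage sketch: build the op once and run it repeatedly; the learning
# rate decreases by `delta_lr` while it is positive and is pinned at 0.0
# afterwards. `sampler`, `sess` and `num_steps` are hypothetical names.
anneal_op = sampler._anneal_learning_rate()
for _ in range(num_steps):
    sess.run(anneal_op)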
Example #45
    def __init__(self, data, placeholder, FLAGS):
        self.optimizer = FLAGS.optimizer
        self.opti_epsilon = FLAGS.epsilon
        self.lr = FLAGS.learning_rate
        self.vocab_size = data.vocab_size
        self.measure = FLAGS.measure
        self.embed_dim = FLAGS.embed_dim
        self.batch_size = FLAGS.batch_size
        self.rel_size = FLAGS.rel_size
        self.tuple_model = FLAGS.tuple_model
        self.init_embedding = FLAGS.init_embedding
        self.rang = tf.range(0, FLAGS.batch_size, 1)
        self.temperature = tf.Variable(FLAGS.temperature, trainable=False)
        self.decay_rate = FLAGS.decay_rate
        self.log_space = FLAGS.log_space
        # LSTM Params
        self.term = FLAGS.term
        self.hidden_dim = FLAGS.hidden_dim
        self.peephole = FLAGS.peephole
        self.freeze_grad = FLAGS.freeze_grad
        self.regularization_method = FLAGS.regularization_method
        self.marginal_method = FLAGS.marginal_method

        self.t1x = placeholder['t1_idx_placeholder']
        self.t1mask = placeholder['t1_msk_placeholder']
        self.t1length = placeholder['t1_length_placeholder']
        self.t2x = placeholder['t2_idx_placeholder']
        self.t2mask = placeholder['t2_msk_placeholder']
        self.t2length = placeholder['t2_length_placeholder']
        self.rel = placeholder['rel_placeholder']
        self.relmsk = placeholder['rel_msk_placeholder']
        self.label = placeholder['label_placeholder']
        """Initiate box embeddings"""
        self.min_embed, self.delta_embed = self.init_word_embedding(data)
        self.projector = unit_cube.MinMaxHyperCubeProjectorDeltaParam(
            self.min_embed, self.delta_embed, 0.0, 1e-10)
        self.project_op = self.projector.project_op
        """get unit box representation for both term, no matter they are phrases or words"""

        self.t1_min_embed, self.t1_max_embed, self.t2_min_embed, self.t2_max_embed = self.get_word_embedding(
            self.t1x, self.t2x)
        """get negative example unit box representation, if it's randomly generated during training."""
        if FLAGS.neg == 'uniform':
            neg_num = 1
            self.nt1x = tf.random_uniform([self.batch_size * neg_num, 1],
                                          0,
                                          self.vocab_size,
                                          dtype=tf.int32)
            self.nt2x = tf.random_uniform([self.batch_size * neg_num, 1],
                                          0,
                                          self.vocab_size,
                                          dtype=tf.int32)
            self.nt1_min_embed, self.nt1_max_embed, self.nt2_min_embed, self.nt2_max_embed = self.get_word_embedding(
                self.nt1x, self.nt2x)
            # combine the original word embedding with the new embeddings.
            self.nt1_min_embed = tf.concat(
                [tf.tile(self.t1_min_embed, [neg_num, 1]), self.nt1_min_embed],
                axis=0)
            self.nt1_max_embed = tf.concat(
                [tf.tile(self.t1_max_embed, [neg_num, 1]), self.nt1_max_embed],
                axis=0)
            self.nt2_min_embed = tf.concat(
                [self.nt2_min_embed,
                 tf.tile(self.t2_min_embed, [neg_num, 1])],
                axis=0)
            self.nt2_max_embed = tf.concat(
                [self.nt2_max_embed,
                 tf.tile(self.t2_max_embed, [neg_num, 1])],
                axis=0)
            self.label = tf.concat(
                [self.label,
                 tf.zeros([self.batch_size * neg_num * 2])], 0)
            self.t1_uniform_min_embed = tf.concat(
                [self.t1_min_embed, self.nt1_min_embed], axis=0)
            self.t1_uniform_max_embed = tf.concat(
                [self.t1_max_embed, self.nt1_max_embed], axis=0)
            self.t2_uniform_min_embed = tf.concat(
                [self.t2_min_embed, self.nt2_min_embed], axis=0)
            self.t2_uniform_max_embed = tf.concat(
                [self.t2_max_embed, self.nt2_max_embed], axis=0)
            conditional_logits, self.meet_min, self.meet_max, self.disjoint, self.nested, self.overlap_volume, self.rhs_volume = self.get_conditional_probability(
                self.t1_uniform_min_embed, self.t1_uniform_max_embed,
                self.t2_uniform_min_embed, self.t2_uniform_max_embed)
        else:
            conditional_logits, self.meet_min, self.meet_max, self.disjoint, self.nested, self.overlap_volume, self.rhs_volume = self.get_conditional_probability(
                self.t1_min_embed, self.t1_max_embed, self.t2_min_embed,
                self.t2_max_embed)

        evaluation_logits, _, _, _, _, _, _ = self.get_conditional_probability(
            self.t1_min_embed, self.t1_max_embed, self.t2_min_embed,
            self.t2_max_embed)
        self.eval_prob = -evaluation_logits
        """get conditional probability loss"""
        # self.cond_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = self.label, logits=conditional_logits))
        self.cond_loss = -tf.reduce_mean(
            tf.multiply(conditional_logits, self.label) +
            tf.multiply(tf.log(1 - tf.exp(conditional_logits) + 1e-10), 1 -
                        self.label))
        self.cond_loss = FLAGS.w1 * self.cond_loss
        """model marg prob loss"""
        if FLAGS.w2 > 0.0:
            if self.log_space:
                self.max_embed = self.min_embed + tf.exp(self.delta_embed)
            else:
                self.max_embed = self.min_embed + self.delta_embed
            if self.marginal_method == 'universe':
                self.universe_min = tf.reduce_min(self.min_embed,
                                                  axis=0,
                                                  keep_dims=True)
                self.universe_max = tf.reduce_max(self.max_embed,
                                                  axis=0,
                                                  keep_dims=True)
                self.universe_volume = tf.reduce_prod(tf.nn.softplus(
                    (self.universe_max - self.universe_min) / self.temperature)
                                                      * self.temperature,
                                                      axis=-1)
                self.box_volume = tf.reduce_prod(tf.nn.softplus(
                    (self.max_embed - self.min_embed) / self.temperature) *
                                                 self.temperature,
                                                 axis=-1)
                self.predicted_marginal_logits = tf.log(
                    self.box_volume) - tf.log(self.universe_volume)
            elif self.marginal_method == 'softplus':
                self.box_volume = tf.reduce_prod(unit_cube.normalized_softplus(
                    self.delta_embed, self.temperature),
                                                 axis=-1)
                self.predicted_marginal_logits = tf.log(self.box_volume)
            elif self.marginal_method == 'sigmoid':
                self.box_volume = tf.reduce_prod(
                    unit_cube.sigmoid_normalized_softplus(
                        self.delta_embed, self.temperature),
                    axis=-1)
                self.predicted_marginal_logits = tf.log(self.box_volume)
            else:
                raise ValueError(
                    "Expected either softplus or universe but received",
                    self.marginal_method)
            self.marginal_probability = tf.constant(data.margina_prob)
            self.marginal_probability = tf.reshape(self.marginal_probability,
                                                   [self.vocab_size])
            self.marg_loss = FLAGS.w2 * tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=self.marginal_probability,
                    logits=self.predicted_marginal_logits))
        else:
            self.marg_loss = tf.constant(0.0)
        self.debug = tf.constant(0.0)
        self.temperature_update = tf.assign_sub(self.temperature,
                                                FLAGS.decay_rate)

        if FLAGS.debug:
            # """model cond prob loss"""
            self.pos_disjoint = tf.logical_and(tf.cast(self.label, tf.bool),
                                               self.disjoint)
            self.pos_overlap = tf.logical_and(tf.cast(self.label, tf.bool),
                                              tf.logical_not(self.disjoint))
            self.neg_disjoint = tf.logical_and(
                tf.logical_not(tf.cast(self.label, tf.bool)), self.disjoint)
            self.neg_overlap = tf.logical_and(
                tf.logical_not(tf.cast(self.label, tf.bool)),
                tf.logical_not(self.disjoint))
            self.pos_nested = tf.logical_and(tf.cast(self.label, tf.bool),
                                             self.nested)
            self.neg_nested = tf.logical_and(
                tf.logical_not(tf.cast(self.label, tf.bool)), self.nested)
            self.pos_disjoint.set_shape([None])
            self.neg_disjoint.set_shape([None])
            self.pos_overlap.set_shape([None])
            self.neg_overlap.set_shape([None])
            self.pos_nested.set_shape([None])
            self.neg_nested.set_shape([None])
            if self.marginal_method == 'universe':
                lhs_volume = tf.reduce_prod(tf.nn.softplus(
                    (self.t2_max_embed - self.t2_min_embed) / self.temperature)
                                            * self.temperature,
                                            axis=-1)
                logx = tf.log(self.rhs_volume) - tf.log(self.universe_volume)
                logy = tf.log(lhs_volume) - tf.log(self.universe_volume)
                logxy = tf.log(self.overlap_volume) - tf.log(self.universe_volume)
            elif self.marginal_method == 'softplus':
                logx = tf.log(
                    tf.reduce_prod(unit_cube.normalized_softplus(
                        (self.t1_max_embed - self.t1_min_embed),
                        self.temperature),
                                   axis=-1))
                logy = tf.log(
                    tf.reduce_prod(unit_cube.normalized_softplus(
                        (self.t2_max_embed - self.t2_min_embed),
                        self.temperature),
                                   axis=-1))
                logxy = tf.log(
                    tf.reduce_prod(unit_cube.normalized_softplus(
                        (self.meet_max - self.meet_min), self.temperature),
                                   axis=-1))
            elif self.marginal_method == 'sigmoid':
                logx = tf.log(
                    tf.reduce_prod(unit_cube.sigmoid_normalized_softplus(
                        (self.t1_max_embed - self.t1_min_embed),
                        self.temperature),
                                   axis=-1))
                logy = tf.log(
                    tf.reduce_prod(unit_cube.sigmoid_normalized_softplus(
                        (self.t2_max_embed - self.t2_min_embed),
                        self.temperature),
                                   axis=-1))
                logxy = tf.log(
                    tf.reduce_prod(unit_cube.sigmoid_normalized_softplus(
                        (self.meet_max - self.meet_min), self.temperature),
                                   axis=-1))
            else:
                raise ValueError(
                    "Expected either softplus or universe but received",
                    self.marginal_method)
            lognume1 = logxy
            lognume2 = logx + logy
            logdomi = 0.5 * (logx + logy + tf_utils.log1mexp(-logx) +
                             tf_utils.log1mexp(-logy))
            correlation = tf.exp(lognume1 - logdomi) - tf.exp(lognume2 -
                                                              logdomi)
            self.marg_loss = tf.Print(self.marg_loss, [
                tf.exp(self.predicted_marginal_logits),
                self.marginal_probability, self.box_volume
            ], 'marginal prediction and label')
            self.cond_loss = tf.Print(self.cond_loss,
                                      [tf.exp(conditional_logits), self.label],
                                      'conditional prediction and label')
            self.cond_loss = tf.Print(self.cond_loss, [
                tf.reduce_sum(tf.cast(self.pos_nested, tf.int32)),
                tf.boolean_mask(tf.exp(conditional_logits), self.pos_nested)
            ], 'pos nested number')
            self.cond_loss = tf.Print(self.cond_loss, [
                tf.reduce_sum(tf.cast(self.neg_nested, tf.int32)),
                tf.boolean_mask(tf.exp(conditional_logits), self.neg_nested)
            ], 'neg nested number')
            self.cond_loss = tf.Print(self.cond_loss, [
                tf.reduce_mean(
                    tf.boolean_mask(tf.exp(conditional_logits),
                                    self.pos_disjoint)),
                tf.reduce_sum(tf.cast(self.pos_disjoint, tf.int32)),
                tf.count_nonzero(
                    tf.less_equal(
                        tf.boolean_mask(correlation, self.pos_disjoint), 0)),
                tf.reduce_mean(
                    tf.boolean_mask(tf.exp(logxy), self.pos_disjoint)),
                tf.reduce_mean(tf.boolean_mask(tf.exp(logx),
                                               self.pos_disjoint)),
                tf.boolean_mask(self.t2_max_embed, self.pos_disjoint),
                tf.boolean_mask(self.t2_min_embed, self.pos_disjoint)
            ], 'pos disjoint loss')

            self.cond_loss = tf.Print(self.cond_loss, [
                tf.reduce_mean(
                    tf.boolean_mask(tf.exp(conditional_logits),
                                    self.pos_overlap)),
                tf.reduce_sum(tf.cast(self.pos_overlap, tf.int32)),
                tf.count_nonzero(
                    tf.less_equal(
                        tf.boolean_mask(correlation, self.pos_overlap), 0)),
                tf.reduce_mean(tf.boolean_mask(tf.exp(logxy),
                                               self.pos_overlap)),
                tf.reduce_mean(tf.boolean_mask(tf.exp(logx), self.pos_overlap))
            ], 'pos overlap loss')

            self.cond_loss = tf.Print(self.cond_loss, [
                tf.reduce_mean(
                    tf.boolean_mask(tf.exp(conditional_logits),
                                    self.neg_disjoint)),
                tf.reduce_sum(tf.cast(self.neg_disjoint, tf.int32)),
                tf.count_nonzero(
                    tf.less_equal(
                        tf.boolean_mask(correlation, self.neg_disjoint), 0)),
                tf.reduce_mean(
                    tf.boolean_mask(tf.exp(logxy), self.neg_disjoint)),
                tf.reduce_mean(tf.boolean_mask(tf.exp(logx),
                                               self.neg_disjoint))
            ], 'neg disjoint loss')

            self.cond_loss = tf.Print(self.cond_loss, [
                tf.reduce_mean(
                    tf.boolean_mask(tf.exp(conditional_logits),
                                    self.neg_overlap)),
                tf.reduce_sum(tf.cast(self.neg_overlap, tf.int32)),
                tf.count_nonzero(
                    tf.less_equal(
                        tf.boolean_mask(correlation, self.neg_overlap), 0)),
                tf.boolean_mask(self.t1x, self.neg_overlap),
                tf.boolean_mask(self.t2x, self.neg_overlap),
                tf.reduce_mean(tf.boolean_mask(tf.exp(logxy),
                                               self.neg_overlap)),
                tf.reduce_mean(tf.boolean_mask(tf.exp(logx), self.neg_overlap))
            ], 'neg overlap loss')
        """model regurlization"""
        if self.regularization_method == 'universe_edge' and FLAGS.r1 > 0.0:
            self.regularization = FLAGS.r1 * tf.reduce_mean(
                tf.nn.softplus(self.universe_max - self.universe_min))
        elif self.regularization_method == 'delta' and FLAGS.r1 > 0.0:
            if self.log_space:
                self.regularization = FLAGS.r1 * tf.reduce_mean(
                    tf.square(tf.exp(self.delta_embed)))
            else:
                self.regularization = FLAGS.r1 * tf.reduce_mean(
                    tf.square(self.delta_embed))
        else:
            self.regularization = tf.constant(0.0)
        """model final loss"""

        self.loss = self.cond_loss + self.marg_loss + self.regularization
        """loss gradient"""
        grads = tf.gradients(self.loss, tf.trainable_variables())
        grad_norm = 0.0
        for g in grads:
            grad_norm += tf.reduce_sum(g.values * g.values)
        grad_norm = tf.sqrt(grad_norm)
        self.grad_norm = grad_norm
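        # Note: `g.values` above only exists for IndexedSlices gradients (e.g.
        # from tf.gather / embedding lookups); dense gradients have no
        # `.values` attribute. A more general alternative (a sketch, not the
        # original code):
        self.grad_norm = tf.global_norm([g for g in grads if g is not None])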
Example #46
def assign_moving_mean_variance(
    mean_var, variance_var, value, decay, name=None):
  """Compute exponentially weighted moving {mean,variance} of a streaming value.

  The `value`-updated exponentially weighted moving `mean_var` and
  `variance_var` are given by the following recurrence relations:

  ```python
  variance_var = decay * (variance_var + (1-decay) * (value - mean_var)**2)
  mean_var     = decay * mean_var + (1 - decay) * value
  ```

  Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
  the lag-1 mean.

  For derivation justification, see [Finch (2009; Eq. 143)][1].

  Args:
    mean_var: `float`-like `Variable` representing the exponentially weighted
      moving mean. Same shape as `variance_var` and `value`.
    variance_var: `float`-like `Variable` representing the
      exponentially weighted moving variance. Same shape as `mean_var` and
      `value`.
    value: `float`-like `Tensor`. Same shape as `mean_var` and `variance_var`.
    decay: A `float`-like `Tensor`. The moving mean decay. Typically close to
      `1.`, e.g., `0.999`.
    name: Optional name of the returned operation.

  Returns:
    mean_var: `Variable` representing the `value`-updated exponentially weighted
      moving mean.
    variance_var: `Variable` representing the `value`-updated
      exponentially weighted moving variance.

  Raises:
    TypeError: if `mean_var` does not have float type `dtype`.
    TypeError: if `mean_var`, `variance_var`, `value`, `decay` have different
      `base_dtype`.

  #### References

  [1]: Tony Finch. Incremental calculation of weighted mean and variance.
       _Technical Report_, 2009.
       http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
  """
  with tf.name_scope(name, "assign_moving_mean_variance",
                     [variance_var, mean_var, value, decay]):
    with tf.colocate_with(variance_var):
      with tf.colocate_with(mean_var):
        base_dtype = mean_var.dtype.base_dtype
        if not base_dtype.is_floating:
          raise TypeError(
              "mean_var.base_dtype({}) does not have float type "
              "`dtype`.".format(base_dtype.name))
        if base_dtype != variance_var.dtype.base_dtype:
          raise TypeError(
              "mean_var.base_dtype({}) != variance_var.base_dtype({})".format(
                  base_dtype.name,
                  variance_var.dtype.base_dtype.name))
        value = tf.convert_to_tensor(value, dtype=base_dtype, name="value")
        decay = tf.convert_to_tensor(decay, dtype=base_dtype, name="decay")
        delta = value - mean_var
        with tf.control_dependencies([delta]):
          mean_var = tf.assign_add(mean_var, (1. - decay) * delta)
          variance_var = tf.assign_sub(
              variance_var,
              (1. - decay) * (variance_var - decay * tf.square(delta)))
        return mean_var, variance_var
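A pure-NumPy sketch of the recurrence from the docstring, useful as a sanity check; the decay value and the random stream are illustrative:

import numpy as np

decay = 0.999
mean, variance = 0.0, 0.0
for value in np.random.randn(10000):
    delta = value - mean   # uses the lag-1 mean, as noted above
    variance = decay * (variance + (1.0 - decay) * delta ** 2)
    mean = decay * mean + (1.0 - decay) * value
# For a standard-normal stream, mean -> ~0 and variance -> ~decay.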
Example #47
    def apply_updates(self, allow_no_op: bool = False) -> tf.Operation:
        """Construct training op to update the registered variables based on their gradients."""
        tfutil.assert_tf_initialized()
        assert not self._updates_applied
        self._updates_applied = True
        all_ops = []

        # Check for no-op.
        if allow_no_op and len(self._devices) == 0:
            with tfutil.absolute_name_scope(self.scope):
                return tf.no_op(name='TrainingOp')

        # Clean up gradients.
        for device_idx, device in enumerate(self._devices.values()):
            with tfutil.absolute_name_scope(self.scope + "/Clean%d" %
                                            device_idx), tf.device(
                                                device.name):
                for var, grad in device.grad_raw.items():

                    # Filter out disconnected gradients and convert to float32.
                    grad = [g for g in grad if g is not None]
                    grad = [tf.cast(g, tf.float32) for g in grad]

                    # Sum within the device.
                    if len(grad) == 0:
                        grad = tf.zeros(var.shape)  # No gradients => zero.
                    elif len(grad) == 1:
                        grad = grad[0]  # Single gradient => use as is.
                    else:
                        grad = tf.add_n(grad)  # Multiple gradients => sum.

                    # Scale as needed.
                    scale = 1.0 / len(device.grad_raw[var]) / len(
                        self._devices)
                    scale = tf.constant(scale, dtype=tf.float32, name="scale")
                    if self.minibatch_multiplier is not None:
                        scale /= tf.cast(self.minibatch_multiplier, tf.float32)
                    scale = self.undo_loss_scaling(scale)
                    device.grad_clean[var] = grad * scale

        # Sum gradients across devices.
        if len(self._devices) > 1:
            with tfutil.absolute_name_scope(self.scope +
                                            "/Broadcast"), tf.device(None):
                if platform.system() == "Windows":
                    # Windows => NCCL ops are not available.
                    self._broadcast_fallback()
                elif tf.VERSION.startswith("1.15."):
                    # TF 1.15 => NCCL ops are broken:
                    # https://github.com/tensorflow/tensorflow/issues/41539
                    self._broadcast_fallback()
                else:
                    # Otherwise => NCCL ops are safe to use.
                    self._broadcast_nccl()

        # Apply updates separately on each device.
        for device_idx, device in enumerate(self._devices.values()):
            with tfutil.absolute_name_scope(self.scope + "/Apply%d" %
                                            device_idx), tf.device(
                                                device.name):
                # pylint: disable=cell-var-from-loop

                # Accumulate gradients over time.
                if self.minibatch_multiplier is None:
                    acc_ok = tf.constant(True, name='acc_ok')
                    device.grad_acc = OrderedDict(device.grad_clean)
                else:
                    # Create variables.
                    with tf.control_dependencies(None):
                        for var in device.grad_clean.keys():
                            device.grad_acc_vars[var] = tf.Variable(
                                tf.zeros(var.shape),
                                trainable=False,
                                name="grad_acc_var")
                        device.grad_acc_count = tf.Variable(
                            tf.zeros([]),
                            trainable=False,
                            name="grad_acc_count")

                    # Track counter.
                    count_cur = device.grad_acc_count + 1.0
                    count_inc_op = lambda: tf.assign(device.grad_acc_count,
                                                     count_cur)
                    count_reset_op = lambda: tf.assign(device.grad_acc_count,
                                                       tf.zeros([]))
                    acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier,
                                                   tf.float32))
                    all_ops.append(
                        tf.cond(acc_ok, count_reset_op, count_inc_op))

                    # Track gradients.
                    for var, grad in device.grad_clean.items():
                        acc_var = device.grad_acc_vars[var]
                        acc_cur = acc_var + grad
                        device.grad_acc[var] = acc_cur
                        with tf.control_dependencies([acc_cur]):
                            acc_inc_op = lambda: tf.assign(acc_var, acc_cur)
                            acc_reset_op = lambda: tf.assign(
                                acc_var, tf.zeros(var.shape))
                            all_ops.append(
                                tf.cond(acc_ok, acc_reset_op, acc_inc_op))

                # No overflow => apply gradients.
                all_ok = tf.reduce_all(
                    tf.stack([acc_ok] + [
                        tf.reduce_all(tf.is_finite(g))
                        for g in device.grad_acc.values()
                    ]))
                apply_op = lambda: device.optimizer.apply_gradients(
                    [(tf.cast(grad, var.dtype), var)
                     for var, grad in device.grad_acc.items()])
                all_ops.append(tf.cond(all_ok, apply_op, tf.no_op))

                # Adjust loss scaling.
                if self.use_loss_scaling:
                    ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var,
                                                      self.loss_scaling_inc)
                    ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var,
                                                      self.loss_scaling_dec)
                    ls_update_op = lambda: tf.group(
                        tf.cond(all_ok, ls_inc_op, ls_dec_op))
                    all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op))

                # Last device => report statistics.
                if device_idx == len(self._devices) - 1:
                    all_ops.append(
                        autosummary.autosummary(
                            self.id + "/learning_rate",
                            tf.convert_to_tensor(self.learning_rate)))
                    all_ops.append(
                        autosummary.autosummary(self.id +
                                                "/overflow_frequency",
                                                tf.where(all_ok, 0, 1),
                                                condition=acc_ok))
                    if self.use_loss_scaling:
                        all_ops.append(
                            autosummary.autosummary(
                                self.id + "/loss_scaling_log2",
                                device.loss_scaling_var))

        # Initialize variables.
        self.reset_optimizer_state()
        if self.use_loss_scaling:
            tfutil.init_uninitialized_vars(
                [device.loss_scaling_var for device in self._devices.values()])
        if self.minibatch_multiplier is not None:
            tfutil.run([
                var.initializer for device in self._devices.values()
                for var in list(device.grad_acc_vars.values()) +
                [device.grad_acc_count]
            ])

        # Group everything into a single op.
        with tfutil.absolute_name_scope(self.scope):
            return tf.group(*all_ops, name="TrainingOp")
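The accumulate-then-apply logic above interleaves a counter, per-variable accumulators, and tf.cond resets. A minimal sketch of the same pattern in isolation; the multiplier, shapes, and names are illustrative, not from the source:

import tensorflow as tf

multiplier = 4.0
grad = tf.placeholder(tf.float32, [2])
acc_var = tf.Variable(tf.zeros([2]), trainable=False)
count = tf.Variable(0.0, trainable=False)

count_cur = count + 1.0
acc_cur = acc_var + grad
acc_ok = count_cur >= multiplier   # True on every multiplier-th step

# On the apply step the counter and accumulator reset; otherwise they grow.
count_op = tf.cond(acc_ok,
                   lambda: tf.assign(count, 0.0),
                   lambda: tf.assign(count, count_cur))
acc_op = tf.cond(acc_ok,
                 lambda: tf.assign(acc_var, tf.zeros([2])),
                 lambda: tf.assign(acc_var, acc_cur))
step = tf.group(count_op, acc_op)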
Example #48
  def testAssignUpdateNoVarShape(self):
    var = state_ops.variable_op([1, 2], tf.float32, set_shape=False)
    added = tf.assign_add(var, [[2.0, 3.0]])
    self.assertEqual([1, 2], added.get_shape())
    subbed = tf.assign_sub(var, [[12.0, 13.0]])
    self.assertEqual([1, 2], subbed.get_shape())
Example #49
    def build_trainer(self, child_model):
        # actor
        child_model.build_valid_rl()
        self.valid_acc = (tf.to_float(child_model.valid_shuffle_acc) /
                          tf.to_float(child_model.batch_size))
        self.reward = self.valid_acc

        if self.use_critic:
            # critic
            all_h = tf.concat(self.all_h, axis=0)
            value_function = tf.matmul(all_h, self.w_critic)
            advantage = value_function - self.reward
            critic_loss = tf.reduce_sum(advantage**2)
            self.baseline = tf.reduce_mean(value_function)
            self.loss = -tf.reduce_mean(self.sample_log_probs * advantage)

            critic_train_step = tf.Variable(0,
                                            dtype=tf.int32,
                                            trainable=False,
                                            name="critic_train_step")
            critic_train_op, _, _, _ = get_train_ops(critic_loss,
                                                     [self.w_critic],
                                                     critic_train_step,
                                                     clip_mode=None,
                                                     lr_init=1e-3,
                                                     lr_dec_start=0,
                                                     lr_dec_every=int(1e9),
                                                     optim_algo="adam",
                                                     sync_replicas=False)
        else:
            # or baseline
            self.sample_log_probs = tf.reduce_sum(self.sample_log_probs)
            self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
            baseline_update = tf.assign_sub(self.baseline, (1 - self.bl_dec) *
                                            (self.baseline - self.reward))
            with tf.control_dependencies([baseline_update]):
                self.reward = tf.identity(self.reward)
            self.loss = self.sample_log_probs * (self.reward - self.baseline)

        self.train_step = tf.Variable(0,
                                      dtype=tf.int32,
                                      trainable=False,
                                      name="train_step")
        tf_variables = [
            var for var in tf.trainable_variables()
            if var.name.startswith(self.name) and "w_critic" not in var.name
        ]
        print "-" * 80
        for var in tf_variables:
            print var
        self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops(
            self.loss,
            tf_variables,
            self.train_step,
            clip_mode=self.clip_mode,
            grad_bound=self.grad_bound,
            l2_reg=self.l2_reg,
            lr_init=self.lr_init,
            lr_dec_start=self.lr_dec_start,
            lr_dec_every=self.lr_dec_every,
            lr_dec_rate=self.lr_dec_rate,
            optim_algo=self.optim_algo,
            sync_replicas=self.sync_replicas,
            num_aggregate=self.num_aggregate,
            num_replicas=self.num_replicas)

        if self.use_critic:
            self.train_op = tf.group(self.train_op, critic_train_op)
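The baseline branch above is an exponential moving average implemented with tf.assign_sub: baseline <- bl_dec * baseline + (1 - bl_dec) * reward. A minimal sketch; the decay value and the reward placeholder are illustrative:

import tensorflow as tf

bl_dec = 0.99
reward = tf.placeholder(tf.float32, [])
baseline = tf.Variable(0.0, trainable=False)
# baseline - (1 - bl_dec) * (baseline - reward)
#   == bl_dec * baseline + (1 - bl_dec) * reward
baseline_update = tf.assign_sub(baseline,
                                (1.0 - bl_dec) * (baseline - reward))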
Example #50
  def testAssignUpdateNoShape(self):
    var = state_ops.variable_op([1, 2], tf.float32, set_shape=False)
    added = tf.assign_add(var, self._NewShapelessTensor())
    self.assertEqual(tensor_shape.unknown_shape(), added.get_shape())
    subbed = tf.assign_sub(var, self._NewShapelessTensor())
    self.assertEqual(tensor_shape.unknown_shape(), subbed.get_shape())
Example #51
def optimize(loss,
             global_step,
             max_grad_norm,
             lr,
             lr_decay,
             sync_replicas=False,
             replicas_to_aggregate=1,
             task_id=0):
    """Builds optimization graph.

    * Creates an optimizer, and optionally wraps with SyncReplicasOptimizer
    * Computes, clips, and applies gradients
    * Maintains moving averages for all trainable variables
    * Summarizes variables and gradients

    Args:
        loss: scalar loss to minimize.
        global_step: integer scalar Variable.
        max_grad_norm: float scalar. Grads will be clipped to this value.
        lr: float scalar, learning rate.
        lr_decay: float scalar, learning rate decay rate.
        sync_replicas: bool, whether to use SyncReplicasOptimizer.
        replicas_to_aggregate: int, number of replicas to aggregate when using
        SyncReplicasOptimizer.
        task_id: int, id of the current task; used to ensure proper initialization
        of SyncReplicasOptimizer.

    Returns:
        train_op
    """
    with tf.name_scope('optimization'):
        # Compute gradients.
        tvars = tf.trainable_variables()
        grads = tf.gradients(
            loss,
            tvars,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)

        # Clip non-embedding grads
        non_embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                        if 'embedding' not in v.op.name]
        embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                    if 'embedding' in v.op.name]

        ne_grads, ne_vars = zip(*non_embedding_grads_and_vars)
        ne_grads, _ = tf.clip_by_global_norm(ne_grads, max_grad_norm)
        non_embedding_grads_and_vars = list(zip(ne_grads, ne_vars))

        grads_and_vars = embedding_grads_and_vars + non_embedding_grads_and_vars
        if not global_step:
            opt = tf.train.AdamOptimizer(lr)
            apply_gradient_op = opt.apply_gradients(grads_and_vars)
            return apply_gradient_op
        # Summarize
        _summarize_vars_and_grads(grads_and_vars)

        # Decaying learning rate
        lr = tf.train.exponential_decay(lr,
                                        global_step,
                                        1,
                                        lr_decay,
                                        staircase=True)
        tf.summary.scalar('learning_rate', lr)
        opt = tf.train.AdamOptimizer(lr)
        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            0.999, global_step)
        global_step = tf.assign_sub(global_step, 1)

        # Apply gradients
        if sync_replicas:
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=tvars,
                total_num_replicas=replicas_to_aggregate)
            apply_gradient_op = opt.apply_gradients(grads_and_vars)
            with tf.control_dependencies([apply_gradient_op]):
                train_op = tf.no_op(name='train_op')

            # Initialization ops
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 opt.get_chief_queue_runner())
            if task_id == 0:
                local_init_op = opt.chief_init_op
                tf.add_to_collection('chief_init_op', opt.get_init_tokens_op())
            else:
                local_init_op = opt.local_step_init_op
            tf.add_to_collection('local_init_op', local_init_op)
            tf.add_to_collection('ready_for_local_init_op',
                                 opt.ready_for_local_init_op)
        else:
            # Non-sync optimizer
            apply_gradient_op = opt.apply_gradients(grads_and_vars)
            with tf.control_dependencies([apply_gradient_op]):
                train_op = variable_averages.apply(tvars)
        return train_op
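The non-embedding gradients above are clipped by their joint norm. A minimal sketch of tf.clip_by_global_norm on toy tensors; the clip value is illustrative:

import tensorflow as tf

g1 = tf.constant([3.0, 4.0])  # norm 5
g2 = tf.constant([12.0])      # global norm = sqrt(5**2 + 12**2) = 13
clipped, global_norm = tf.clip_by_global_norm([g1, g2], clip_norm=1.0)
# When global_norm > clip_norm, every tensor is scaled by
# clip_norm / global_norm, so the clipped list has global norm 1.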
Example #52
  def finite_differences(self, grads_and_vars, global_step, name, d_vars, g_vars, d_grads, g_grads):
    all_vars = [ v for _,v in grads_and_vars]
    all_grads = [ g for g, _ in grads_and_vars ]
    d_grads = all_grads[:len(d_vars)]
    g_grads = all_grads[len(d_vars):]
    d_vars = []
    g_vars = []
    for grad,var in grads_and_vars:
        if var in self.gan.d_vars():
            d_vars += [var]
        elif var in self.gan.g_vars():
            g_vars += [var]
        else:
            raise("Couldn't find var in g_vars or d_vars")

    with ops.init_scope():
        [self._zeros_slot(v, "orig", self._name) for _,v in grads_and_vars]
        slots_list = []
        if self.config.include_slots:
            for name in self.optimizer.get_slot_names():
                for var in self.optimizer.variables():
                    slots_list.append(self.optimizer._zeros_slot(var, "orig", "orig"))

    v1 = [self.get_slot(v, "orig") for _,v in grads_and_vars]
    slots_list = []
    slots_vars = []

    restored_vars = all_vars + slots_vars
    tmp_vars = v1 + slots_list

    e1 = 0.0001
    e2 = 0.0001

    #gamma12
    save = tf.group(*[tf.assign(w, v) for w,v in zip(tmp_vars, restored_vars)]) # save variables
    restore = tf.group(*[tf.assign(w, v) for w,v in zip(restored_vars, tmp_vars)]) # restore variables

    def curl():
        grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
        op3 = tf.group(*[tf.assign_sub(v, self._lr_t*grad) for grad,v in zip(grads, all_vars)])
        with tf.get_default_graph().control_dependencies([op3]):
            def curlcombine(g1,g2):
                stepsize = self._lr_t
                return g1-(g2-g1)/stepsize
            new_grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
            g3s = [curlcombine(g1,g2) for g1,g2 in zip(grads,new_grads)]
            return g3s
 
    #gamma12
    with tf.get_default_graph().control_dependencies([save]):
        #opboth = self.optimizer.apply_gradients(grads_and_vars, global_step=global_step, name=name)
        #opdp = self.optimizer.apply_gradients(grads_and_vars[:len(d_vars)], global_step=global_step, name=name)
        #opgp = self.optimizer.apply_gradients(grads_and_vars[len(d_vars):], global_step=global_step, name=name)
        opboth = tf.group(*[tf.assign_sub(w, self._lr_t * v) for w,v in zip(all_vars, all_grads)]) # step all variables
        opd = tf.group(*[tf.assign_sub(w, self._lr_t * v) for w,v in zip(d_vars, d_grads)]) # step d variables only
        opg = tf.group(*[tf.assign_sub(w, self._lr_t * v) for w,v in zip(g_vars, g_grads)]) # step g variables only
        with tf.get_default_graph().control_dependencies([opboth]):
            gboth = curl()#tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
            with tf.get_default_graph().control_dependencies([restore]):
                with tf.get_default_graph().control_dependencies([opd]):
                    #new_d_grads = [tf.zeros_like(_d) for _d in d_vars]+tf.gradients(self.gan.trainer.g_loss, g_vars)
                    new_d_grads = curl()#tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
                    with tf.get_default_graph().control_dependencies([restore]):
                        with tf.get_default_graph().control_dependencies([opg]):
                            #new_g_grads = tf.gradients(self.gan.trainer.d_loss, d_vars) + [tf.zeros_like(_g) for _g in g_vars]
                            new_g_grads = curl()#tf.gradients(self.gan.trainer.d_loss, d_vars) + tf.gradients(self.gan.trainer.g_loss, g_vars)
                            with tf.get_default_graph().control_dependencies([restore]):
                                new_grads = []
                                for _gboth, _gd, _gg, _g in zip(gboth,new_d_grads,new_g_grads,d_grads):
                                    det = tf.square(_gboth)-(_gg*_gd)+1e-8
                                    h_1 = 1.0/det * (2*_gboth - _gd - _gg)
                                    if self.config.hessian:
                                        #v = (g(x + hjej)-g(x)))/(2hj) + \
                                        #    (g(x + hiei)-g(x))/(2hi)
                                        a = (_gboth - _g) / self._lr_t # d2f/dx2i
                                        c = (_gboth - _g) / self._lr_t # d2f/dx2j
                                        b = (_gg - _g) / (2*self._lr_t)+(_gd-_g)/(2*self._lr_t) # d2f/dx1dx2
                                        d = b # d2f/dx2dx1
                                        det = a*d-b*c+1e-8
                                        #h_1 = 1.0/det * (b+d-a-c)
                                        h_1_a = d/det
                                        h_1_b = -b/det
                                        h_1_c = -c/det
                                        h_1_d = a/det

                                        h_1 = h_1_a*h_1_d-h_1_b*h_1_c
                                    new_grads.append( _g*h_1 )

                                for _gboth, _gd, _gg, _g in zip(gboth[len(d_vars):],new_d_grads[len(d_vars):],new_g_grads[len(d_vars):],g_grads):
                                    det = tf.square(_gboth)-(_gg*_gd)+1e-8
                                    h_1 = 1.0/det * (2*_gboth - _gd - _gg)
                                    if self.config.hessian:
                                        #v = (g(x + hjej)-g(x)))/(2hj) + \
                                        #    (g(x + hiei)-g(x))/(2hi)
                                        a = (_gboth - _g) / self._lr_t # d2f/dx2i
                                        c = (_gboth - _g) / self._lr_t # d2f/dx2j
                                        b = (_gg - _g) / (2*self._lr_t)+(_gd-_g)/(2*self._lr_t) # d2f/dx1dx2
                                        d = b # d2f/dx2dx1
                                        det = a*d-b*c+1e-8
                                        #h_1 = 1.0/det * (b+d-a-c)
                                        h_1_a = d/det
                                        h_1_b = -b/det
                                        h_1_c = -c/det
                                        h_1_d = a/det
                                        h_1 = h_1_a*h_1_d-h_1_b*h_1_c
                                    new_grads.append( _g*h_1 )

                                new_grads_and_vars = list(zip(new_grads, all_vars)).copy()
                                return self.optimizer.apply_gradients(new_grads_and_vars, global_step=global_step, name=name)
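curlcombine above estimates a second-order term from two gradient evaluations one update apart, dividing their difference by the step size. A pure-Python sanity check of that finite-difference idea; the function and values are illustrative:

f_prime = lambda x: 2.0 * x   # gradient of f(x) = x**2
lr = 1e-4                     # plays the role of self._lr_t
x = 3.0
second = (f_prime(x + lr) - f_prime(x)) / lr   # ~= f''(x) = 2.0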
Example #53
# relu (this snippet assumes X, Y, P and the Add/Sub/Mul/Matmul/BiasAdd/AB*
# ops were defined earlier in the example)
Relu1 = tf.nn.relu(X)  # 1,1,1
Relu0 = tf.nn.relu(P)

r_add = rtt.SecureReveal(Add)
r_sub = rtt.SecureReveal(Sub)
r_mul = rtt.SecureReveal(Mul)
r_matmul = rtt.SecureReveal(Matmul)
r_bias_add = rtt.SecureReveal(BiasAdd)
r_relu1 = rtt.SecureReveal(Relu1)
r_relu0 = rtt.SecureReveal(Relu0)
r_AB3 = rtt.SecureReveal(AB3)
r_AB4 = rtt.SecureReveal(AB4)
r_AB5 = rtt.SecureReveal(AB5)

r_assign_sub = rtt.SecureReveal(tf.assign_sub(Y, X))

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    print("add reveal: ", sess.run(r_add))
    print("add_multiple_demension reveal ab3(add): ", sess.run(r_AB3))
    print("add_multiple_demension reveal ab4(add): ", sess.run(r_AB4))
    print("add_multiple_demension reveal ab5(add): ", sess.run(r_AB5))
    print("sub reveal: ", sess.run(r_sub))
    print("mul reveal: ", sess.run(r_mul))
    print("matmul reveal: ", sess.run(r_matmul))
    print("bias_add reveal: ", sess.run(r_bias_add))
    print("relu(expect-0) reveal: ", sess.run(r_relu0))
    print("relu(expect-1) reveal: ", sess.run(r_relu1))
    print("assign_sub(expect-1) reveal: ", sess.run(r_assign_sub))
Example #54
  def testAssignUpdateNoVarShape(self):
    var = state_ops.variable_op([1, 2], tf.float32, set_shape=False)
    added = tf.assign_add(var, [[2.0, 3.0]])
    self.assertEqual([1, 2], added.get_shape())
    subbed = tf.assign_sub(var, [[12.0, 13.0]])
    self.assertEqual([1, 2], subbed.get_shape())
Example #55
  def testAssignUpdateNoValueShape(self):
    var = state_ops.variable_op([1, 2], tf.float32)
    added = tf.assign_add(var, self._NewShapelessTensor())
    self.assertEqual([1, 2], added.get_shape())
    subbed = tf.assign_sub(var, self._NewShapelessTensor())
    self.assertEqual([1, 2], subbed.get_shape())
Example #56
def update_sub(x, decrement):
    return tf.assign_sub(x, decrement)
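A hypothetical usage of update_sub: tf.assign_sub subtracts in place and returns the updated value.

import tensorflow as tf

x = tf.Variable(10.0)
dec = update_sub(x, 3.0)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(dec))  # 7.0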
Example #57
    def tf_store(self, states, internals, actions, terminal, reward):
        # Memory indices to overwrite.
        num_instances = tf.shape(input=terminal)[0]
        with tf.control_dependencies(
            [tf.assert_less_equal(num_instances, self.capacity)]):
            indices = tf.range(self.memory_index, self.memory_index +
                               num_instances) % self.capacity

        # Remove episode indices.
        num_episodes = tf.count_nonzero(input_tensor=tf.gather(
            params=self.terminal_memory, indices=indices),
                                        axis=0,
                                        dtype=util.tf_dtype('int'))
        num_episodes = tf.minimum(x=num_episodes, y=self.episode_count)
        assignment = tf.assign(
            ref=self.episode_indices[:self.episode_count - num_episodes],
            value=self.episode_indices[num_episodes:self.episode_count])

        # Decrement episode count.
        with tf.control_dependencies(control_inputs=(assignment, )):
            assignment = tf.assign_sub(ref=self.episode_count,
                                       value=num_episodes)

        # Assign new observations.
        with tf.control_dependencies(control_inputs=(assignment, )):
            assignments = list()
            for name in sorted(states):
                assignments.append(
                    tf.scatter_update(ref=self.states_memory[name],
                                      indices=indices,
                                      updates=states[name]))
            for name in sorted(internals):
                assignments.append(
                    tf.scatter_update(ref=self.internals_memory[name],
                                      indices=indices,
                                      updates=internals[name]))
            for name in sorted(actions):
                assignments.append(
                    tf.scatter_update(ref=self.actions_memory[name],
                                      indices=indices,
                                      updates=actions[name]))
            assignments.append(
                tf.scatter_update(ref=self.terminal_memory,
                                  indices=indices,
                                  updates=terminal))
            assignments.append(
                tf.scatter_update(ref=self.reward_memory,
                                  indices=indices,
                                  updates=reward))

        # Add episode indices.
        with tf.control_dependencies(control_inputs=assignments):
            num_episodes = tf.count_nonzero(input_tensor=terminal,
                                            axis=0,
                                            dtype=util.tf_dtype('int'))
            assignment = tf.assign(
                ref=self.episode_indices[self.
                                         episode_count:self.episode_count +
                                         num_episodes],
                value=tf.boolean_mask(tensor=indices, mask=terminal))

        # Increment episode count.
        with tf.control_dependencies(control_inputs=(assignment, )):
            assignment = tf.assign_add(ref=self.episode_count,
                                       value=num_episodes)

        # Increment memory index.
        with tf.control_dependencies(control_inputs=(assignment, )):
            assignment = tf.assign(
                ref=self.episode_indices[-1],
                value=tf.where(
                    self.memory_index + num_instances > self.capacity,
                    self.episode_indices[self.episode_count - 1],
                    self.capacity - 1))

        with tf.control_dependencies(control_inputs=(assignment, )):
            assignment = tf.assign(ref=self.memory_index,
                                   value=((self.memory_index + num_instances) %
                                          self.capacity))

        with tf.control_dependencies(control_inputs=(assignment, )):
            return tf.no_op()
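tf_store addresses the memory as a ring buffer: write positions wrap modulo capacity and the write head advances by the batch size. A minimal sketch of that indexing; capacity, start index, and batch size are illustrative:

import tensorflow as tf

capacity = 5
memory_index = tf.Variable(3, dtype=tf.int32, trainable=False)
num_instances = 4
indices = tf.range(memory_index,
                   memory_index + num_instances) % capacity    # [3, 4, 0, 1]
advance = tf.assign(memory_index,
                    (memory_index + num_instances) % capacity)  # -> 2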