Exemplo n.º 1
0
    def rebuild_graph(self,
                      path,
                      model_name,
                      full_assign=False,
                      train_data=None):
        """Rebuild the graph and load saved variable values into it.

        The model and its training ops are rebuilt from `train_data`, then
        the arrays stored in ``{model_name}_variables.npz`` under `path` are
        written back into the freshly created variables.

        Args:
            path: Directory that contains the saved ``.npz`` variable file.
            model_name: Prefix of the variable file name.
            full_assign: If True, every global variable whose name is not in
                the "manual" name set is restored as well (not only the
                trainable user/item variables).
            train_data: Data used to build the sparse implicit interaction
                tensor. Required despite the ``None`` default.

        Raises:
            ValueError: If `train_data` is None.
        """
        if train_data is None:
            raise ValueError("SVDpp model must provide train_data "
                             "when rebuilding graph")
        sparse_implicit_interaction = sparse_tensor_interaction(train_data,
                                                                recent_num=10)
        self._build_model(sparse_implicit_interaction)
        self._build_train_ops()

        variable_path = os.path.join(path, f"{model_name}_variables.npz")
        # np.load on an .npz returns a lazy archive that keeps the file open;
        # read every array eagerly and close the handle right away.
        with np.load(variable_path) as npz_file:
            variables = dict(npz_file.items())

        def _scatter_saved(var):
            """Return an op writing the saved rows into the leading rows of var."""
            saved = variables[var.name]
            # The saved array may have fewer rows than the rebuilt variable,
            # so only rows [0, len(saved)) are overwritten.
            slices = tf.IndexedSlices(saved, tf.range(len(saved)))
            return var.scatter_update(slices)

        (user_variables, item_variables, _, _,
         manual_variables) = modify_variable_names(self, trainable=True)

        update_ops = []
        for v in tf.trainable_variables():
            # no need to remove oov values
            if user_variables is not None and v.name in user_variables:
                update_ops.append(_scatter_saved(v))

            if item_variables is not None and v.name in item_variables:
                update_ops.append(_scatter_saved(v))

        if full_assign:
            (optimizer_user_variables, optimizer_item_variables, _, _,
             _) = modify_variable_names(self, trainable=False)

            other_variables = [
                v for v in tf.global_variables()
                if v.name not in manual_variables
            ]
            for v in other_variables:
                is_user_slot = (optimizer_user_variables is not None
                                and v.name in optimizer_user_variables)
                is_item_slot = (optimizer_item_variables is not None
                                and v.name in optimizer_item_variables)
                if is_user_slot or is_item_slot:
                    update_ops.append(_scatter_saved(v))
                else:
                    # Everything else is restored with a plain assign.
                    update_ops.append(v.assign(variables[v.name]))

        self.sess.run(update_ops)
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Accumulates scaled gradients and applies them every `_grad_steps` steps.

    Every gradient is divided by `self._grad_steps` and added to the
    'grad_accum' slot of its variable. When
    `global_step % _grad_steps == _grad_steps - 1` the accumulated values are
    passed to the wrapped optimizer `self._opt`, the accumulators are reset to
    zero and `self._counter` is incremented; otherwise the step only
    accumulates. `global_step` is incremented after either branch.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs. Pairs whose
        gradient is None are skipped.
      global_step: Step variable used to decide when to apply; required even
        though the signature defaults it to None.
      name: Unused.

    Returns:
      An op that performs the accumulate/apply step and then increments
      `global_step`.

    Raises:
      ValueError: If `global_step` is None.
    """
    if global_step is None:
      raise ValueError('global_step must be provided: it decides when the '
                       'accumulated gradients are applied.')

    # Materialize once. The pairs are walked twice below, and a one-shot
    # iterable such as zip(...) would be exhausted by the slot creation.
    # Variables without a gradient have nothing to accumulate.
    grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]

    with tf.init_scope():
      self._create_slots([v for (_, v) in grads_and_vars])

    accums = []
    variables = []

    for g, v in grads_and_vars:
      accum = self.get_slot(v, 'grad_accum')
      variables.append(v)
      if isinstance(g, tf.IndexedSlices):
        # Scale only the stored rows; indices and dense shape are unchanged.
        scaled_grad = tf.IndexedSlices(
            g.values / self._grad_steps, g.indices, dense_shape=g.dense_shape)
        accums.append(accum.assign_add(scaled_grad))  # pytype: disable=attribute-error
      else:
        accums.append(accum.assign_add(g / self._grad_steps))  # pytype: disable=attribute-error

    def _apply_and_zero():
      # `accums` holds the assign_add results, so applying them implies that
      # this step's gradient has already been accumulated.
      apply_op = self._opt.apply_gradients(list(zip(accums, variables)))
      with tf.control_dependencies([apply_op]):
        zero_op = [tf.assign(accum, tf.zeros_like(accum)) for accum in accums]
      return tf.group(zero_op, tf.assign_add(self._counter, 1))

    def _accum():
      return tf.group(accums)

    accum_step = tf.cond(
        tf.equal(tf.mod(global_step, self._grad_steps), self._grad_steps - 1),
        _apply_and_zero, _accum)

    # Only advance the step once the accumulate/apply branch has run.
    with tf.control_dependencies([accum_step]):
      global_step = tf.assign_add(global_step, 1)
      return tf.group(global_step)
Exemplo n.º 3
0
    def _get_model_spec(self, features, labels, mode):
        """Create the embeddings and model wrapper, then call the model_fn.

        Works on a shallow copy of `features`. In PREDICT mode the three
        'fids_*' entries are removed from the copy, combined into a
        tf.IndexedSlices and replaced by the output of `_preprocess_fids`.

        Args:
            features: Dict of input features.
            labels: Labels forwarded unchanged to the model_fn.
            mode: A tf.estimator.ModeKeys value.

        Returns:
            A `(spec, model)` tuple: the model_fn result and the
            SparseFLModel that was handed to it.
        """
        features = features.copy()
        if mode == tf.estimator.ModeKeys.PREDICT:
            fid_indices = features.pop('fids_indices')
            fid_values = features.pop('fids_values')
            fid_dense_shape = features.pop('fids_dense_shape')
            fids = tf.IndexedSlices(indices=fid_indices,
                                    values=fid_values,
                                    dense_shape=fid_dense_shape)
            processed = self._preprocess_fids(fids, self._slot_configs)
            features.update(processed)

        bias_embedding = embedding.Embedding(self._bias_slot_configs,
                                             devices=self._embedding_devices)
        bias_tensor = bias_embedding.lookup(features)

        # The vector embedding is optional; both handles stay None without it.
        vec_embedding = None
        vec_tensor = None
        if self._vec_slot_configs is not None:
            vec_embedding = embedding.Embedding(self._vec_slot_configs,
                                                devices=self._embedding_devices)
            vec_tensor = vec_embedding.lookup(features)

        model = SparseFLModel(
            self._role,
            self._bridge,
            features.get('example_id'),
            config_run=False,
            bias_tensor=bias_tensor,
            bias_embedding=bias_embedding,
            vec_tensor=vec_tensor,
            vec_embedding=vec_embedding,
            feature_columns=self._feature_columns,
        )

        spec = self._model_fn(model, features, labels, mode)
        assert model._frozen, "Please finalize model in model_fn"
        return spec, model
Exemplo n.º 4
0
def matmul_sparse_dense(A, B, name=None, transpose_a=False, transpose_b=False):  # pylint: disable=invalid-name
  """Multiplies a row-sparse matrix A by a dense matrix B.

  The product is computed on the stored rows of A only (`A.values`), so the
  result keeps A's indices.

  Args:
    A: tf.IndexedSlices with dense shape [m, n].
    B: tf.Tensor with shape [n, k].
    name: str. Name of op.
    transpose_a: Bool. Forwarded to tf.matmul for `A.values`.
      (Default: False)
    transpose_b: Bool. Forwarded to tf.matmul for `B`.
      (Default: False)

  Returns:
    tf.IndexedSlices resulting from matmul(A, B).

  Raises:
    ValueError: If A doesn't represent a matrix.
    ValueError: If B is not rank-2.
  """
  with tf.name_scope(name, "matmul_sparse_dense", [A, B]):
    a_is_matrix = A.indices.shape.ndims == 1 and A.values.shape.ndims == 2
    if not a_is_matrix:
      raise ValueError("A must represent a matrix. Found: %s." % A)
    if B.shape.ndims != 2:
      raise ValueError("B must be a matrix.")

    product_rows = tf.matmul(
        A.values, B, transpose_a=transpose_a, transpose_b=transpose_b)
    # Row count comes from A; column count from the product itself.
    product_shape = tf.stack([A.dense_shape[0], product_rows.shape[1]])
    return tf.IndexedSlices(
        product_rows, A.indices, dense_shape=product_shape)
Exemplo n.º 5
0
def gradients_assign_add(ref, value):
    """Return `ref + value` for dense or row-sparse gradients.

    For a tf.IndexedSlices `ref` the stored values are added element-wise and
    `ref`'s indices and dense shape are reused; anything else is added with
    the plain `+` operator.

    NOTE(review): the sparse branch never looks at `value.indices`, so it
    presumably expects `value` to carry the same indices as `ref` -- confirm
    against callers.
    """
    if not isinstance(ref, tf.IndexedSlices):
        return ref + value

    summed_values = ref.values + value.values
    return tf.IndexedSlices(summed_values, ref.indices, ref.dense_shape)
Exemplo n.º 6
0
    def gradients(self, objective, parameters):
        """Compute noise-perturbed gradients of the objective.

        Gaussian noise scaled by `self.noise_stdev` is added to every
        gradient. Sparse gradients (tf.IndexedSlices) only get noise on
        their stored values and keep their indices and dense shape.

        Args:
            objective: The objective op (e.g. output of self.objective())
            parameters: A list of tensors (the parameters to optimize)

        Returns:
            A list with one entry per parameter, in the same order as the
            given list. An entry is None when tf.gradients reports no
            gradient for that parameter.
        """
        grads = tf.gradients(objective, list(parameters))
        noisy_grads = []

        for grad in grads:
            if grad is None:
                # tf.gradients yields None for parameters the objective does
                # not depend on; pass it through instead of crashing.
                noisy_grads.append(None)
                continue

            if isinstance(grad, tf.IndexedSlices):
                noise = self.noise_stdev * tf.random_normal(
                    tf.shape(grad.values))
                # Keep dense_shape so the result can still be densified.
                new_grad = tf.IndexedSlices(grad.values + noise,
                                            grad.indices,
                                            dense_shape=grad.dense_shape)
            else:
                # The dynamic shape also works when the static shape is only
                # partially known.
                new_grad = grad + self.noise_stdev * tf.random_normal(
                    tf.shape(grad))
            noisy_grads.append(new_grad)

        return noisy_grads
Exemplo n.º 7
0
 def _preprocess_fids(self, fids, configs):
     """Run the multi-device fid preprocessing for every config.

     If `fids.indices` is rank 2, only its first column is kept as the
     indices. The dict returned for each config is merged into one result
     dict; on key collisions later configs overwrite earlier ones.
     """
     has_matrix_indices = fids.indices.shape.rank == 2
     if has_matrix_indices:
         first_column = fids.indices[:, 0]
         fids = tf.IndexedSlices(indices=first_column,
                                 values=fids.values,
                                 dense_shape=fids.dense_shape)

     merged = {}
     for slot_config in configs:
         per_config = operator._multidevice_preprocess_fids(
             fids, slot_config, num_shards=self._num_shards)
         merged.update(per_config)
     return merged
Exemplo n.º 8
0
def average_gradient(tower_grads):
    """Average each variable's gradient over all towers.

    Sparse gradients (tf.IndexedSlices) are averaged by scaling every
    tower's values and concatenating values and indices; repeated indices
    in an IndexedSlices are summed when it is applied or densified. Dense
    gradients are summed and scaled.

    The divisor is the module-level `num_gpus`, which is defined outside
    this function.
    NOTE(review): presumably `num_gpus == len(tower_grads)` -- confirm.

    Args:
        tower_grads: One list of (gradient, variable) pairs per tower, with
            the variables in the same order in every tower.

    Returns:
        A list of (averaged_gradient, variable) pairs; the variable is the
        one from the first tower.
    """
    avg_grads = []
    for grads_vars in zip(*tower_grads):
        tower_gradients = [g for g, _ in grads_vars]
        first_grad, var = grads_vars[0]

        if isinstance(first_grad, tf.IndexedSlices):
            values = tf.concat([g.values / num_gpus for g in tower_gradients],
                               0)
            indices = tf.concat([g.indices for g in tower_gradients], 0)
            # Carry the dense shape along; an IndexedSlices without it
            # cannot be converted back to a dense tensor.
            grad = tf.IndexedSlices(values, indices,
                                    dense_shape=first_grad.dense_shape)
        else:
            grad = tf.add_n(tower_gradients) / num_gpus

        avg_grads.append((grad, var))
    return avg_grads
Exemplo n.º 9
0
 def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
   """Applies a sparse gradient by densifying it first.

   Logs a warning, converts (`grad`, `indices`) into a dense tensor shaped
   like `var`, and routes it through the dense update path.
   """
   tf.logging.warning("MultistepAdamOptimizer does not support sparse updates")
   # Converting to a dense Tensor sums duplicate `indices`, which is the
   # correct handling. A real sparse implementation would rather override
   # _resource_apply_sparse, which receives de-duplicated indices.
   sparse_grad = tf.IndexedSlices(
       values=grad, indices=indices, dense_shape=tf.shape(var))
   dense_grad = tf.convert_to_tensor(sparse_grad)
   return self._apply_cond(
       self._resource_apply_dense_in_action, dense_grad, var)
Exemplo n.º 10
0
    def average_sparse(grad_and_vars):
        """Average sparse gradients of one variable over several towers.

        A single pair is returned as-is (its gradient only). Otherwise the
        indices and values of all gradients are concatenated along axis 0,
        the values are divided by the number of pairs, and the dense shape
        of the first gradient is reused.
        """
        if len(grad_and_vars) == 1:
            return grad_and_vars[0][0]

        sparse_grads = [g for g, _ in grad_and_vars]
        all_indices = tf.concat([g.indices for g in sparse_grads], 0)
        stacked_values = tf.concat([g.values for g in sparse_grads], 0)
        mean_values = stacked_values / len(grad_and_vars)
        return tf.IndexedSlices(mean_values, all_indices,
                                grad_and_vars[0][0].dense_shape)
Exemplo n.º 11
0
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Accumulate gradients and apply them once every `self._steps` calls.

        Each gradient is optionally clipped to
        +/- `self._steps * self._grad_clipping`, divided by `self._steps`
        and added to a per-parameter ``<param>/accum`` variable. A scalar
        counter is incremented on every call; when it reaches a multiple of
        `self._steps` the accumulated gradients are passed to the wrapped
        optimizer `self._opt`, after which the accumulators and the counter
        are reset to zero.

        Args:
            grads_and_vars: Iterable of (gradient, variable) pairs. Pairs
                with a None gradient or variable are ignored.
            global_step: Forwarded to the wrapped optimizer on apply steps.
            name: Forwarded to the wrapped optimizer on apply steps.

        Returns:
            The conditional op that either only accumulates, or accumulates,
            applies and resets.
        """
        # The counter is bookkeeping state rather than a model parameter, so
        # it must stay out of the trainable-variables collection.
        counter = tf.get_variable(shape=[],
                                  initializer=tf.zeros_initializer(),
                                  trainable=False,
                                  name="counter")
        accums = []
        update_op = []
        variables = []
        for (grad, param) in grads_and_vars:
            if grad is None or param is None:
                continue

            if self._grad_clipping is not None:
                # The bound is scaled by the step count because the gradient
                # is divided by the same factor below.
                grad_clipping = self._steps * self._grad_clipping
                grad = tf.clip_by_value(grad, -grad_clipping, grad_clipping)

            variables.append(param)
            param_name = self._get_variable_name(param.name)

            accum = tf.get_variable(name=param_name + "/accum",
                                    shape=param.shape.as_list(),
                                    dtype=tf.float32,
                                    trainable=False,
                                    initializer=tf.zeros_initializer())
            accums.append(accum)

            if isinstance(grad, tf.IndexedSlices):
                scaled_grad = tf.IndexedSlices(grad.values / self._steps,
                                               grad.indices,
                                               dense_shape=grad.dense_shape)
                update_op.append(accum.assign_add(scaled_grad))
            else:
                update_op.append(accum.assign_add(grad / self._steps))

        def _apply_and_zero():
            # Fold in this step's gradient before handing the accumulators
            # to the wrapped optimizer.
            with tf.control_dependencies(update_op):
                apply_op = self._opt.apply_gradients(
                    list(zip(accums, variables)), global_step, name)
            # Reset only after the apply has consumed the accumulators.
            with tf.control_dependencies([apply_op]):
                zero_op = [
                    tf.assign(accum, tf.zeros_like(accum))
                    for accum in accums + [counter]
                ]
            return tf.group(zero_op)

        def _accum():
            return tf.group(update_op)

        # Control that the counter has been incremented already
        with tf.control_dependencies([counter.assign_add(1)]):
            return tf.cond(tf.equal(tf.mod(counter, self._steps), 0),
                           _apply_and_zero, _accum)
Exemplo n.º 12
0
def splice(obj, input_map, control_inputs=None):
    """Dispatch splicing of a graph object on its exact type.

    Operations go to `splice_op`, tensors to `splice_tensor` (with the
    tensor's op looked up in `input_map`), and an IndexedSlices is rebuilt
    with each of its three components looked up in `input_map`, falling back
    to the original component. Exact-type checks are used on purpose, so
    subclasses are rejected. `control_inputs` is only used for operations.

    Raises:
        AssertionError: If `obj` is none of the three supported types.
    """
    kind = type(obj)

    if kind is tf.Operation:
        return splice_op(obj, input_map, control_inputs=control_inputs)

    if kind is tf.Tensor:
        return splice_tensor(obj, input_map.get(obj.op, obj.op))

    if kind is tf.IndexedSlices:

        def _remap(component):
            return input_map.get(component, component)

        return tf.IndexedSlices(values=_remap(obj.values),
                                indices=_remap(obj.indices),
                                dense_shape=_remap(obj.dense_shape))

    raise AssertionError(
        f'Could not get deps from{type(obj)!r} {obj!r}')
Exemplo n.º 13
0
    def testSparseUpdates(self):
        """Runs one SM3 step with a sparse gradient and checks the result."""

        initial_values = [[0.5, 0.05], [0.05, 1.0], [0.15, 3.0], [0.35, 2.0]]

        with self.cached_session() as sess:
            var = tf.Variable(initial_values)
            # Sparse gradient touching rows 1 and 3 only.
            grad_np = [[0.1, 0.05], [0.01, 1.5]]
            indices_np = [1, 3]
            shape = [2, 2]
            grad = tf.IndexedSlices(
                values=tf.constant(grad_np, shape=shape),
                indices=tf.constant(indices_np),
                dense_shape=tf.constant(shape))
            opt = sm3.SM3Optimizer(learning_rate=self._learning_rate,
                                   momentum=self._momentum)
            step = opt.apply_gradients([(grad, var)])
            sess.run(tf.global_variables_initializer())

            # The variable must still hold its initial values before training.
            var_np = sess.run(var)
            self.assertAllClose(initial_values, var_np)

            # One training step.
            sess.run(step)

            # Reference SM3 accumulators computed with numpy.
            accumulator = numpy.zeros_like(var_np)
            accumulator[indices_np, :] += numpy.square(grad_np)
            row_accumulator = numpy.amax(accumulator, axis=1, keepdims=True)

            # Expected preconditioned gradient and updated variable; the
            # expectation is written into the pre-step snapshot in place.
            exp_p_grad = grad_np / numpy.sqrt(accumulator[indices_np, :])
            exp_var_np = var_np
            touched_rows = var_np[indices_np, :]
            exp_var_np[indices_np, :] = (
                touched_rows - self._learning_rate * exp_p_grad)

            var_np = sess.run(var)
            self.assertAllClose(exp_var_np, var_np)

            slot_value = sess.run(opt.get_slot(var, 'accumulator_0'))
            row_accumulator_var = numpy.reshape(slot_value, [4, 1])
            self.assertAllClose(row_accumulator_var, row_accumulator)
Exemplo n.º 14
0
def _embedding_pooling_gradient(op, grad):
    """Gradient function of the embedding pooling op.

    Auxiliary tensors are recovered from the op's control inputs by name and
    passed, together with the incoming gradient, to the unpooling kernel.
    Each resulting value tensor becomes a tf.IndexedSlices gradient for the
    matching weight input.

    Returns:
        A list with None for the first eight op inputs followed by one
        tf.IndexedSlices per weight.
    """
    # The weights start at input 8; the inputs before that get no gradient.
    first_weight_input = 8

    num_weights = op.get_attr("num_weights")
    control_inputs = op.control_inputs

    def _get_control_input_by_name(name):
        # Exactly one control input may contain `name` in its op name.
        matches = [ctrl for ctrl in control_inputs if name in ctrl.name]
        assert len(matches) == 1
        return matches[0].outputs[0]

    num_unique_fids_per_partition = _get_control_input_by_name(
        'num_unique_fids_per_partition')
    fid_to_unique_index = _get_control_input_by_name('fid_to_unique_index')
    unique_fid_hash = []
    for weight_idx in range(num_weights):
        unique_fid_hash.append(
            _get_control_input_by_name('unique_fid_hash_%d' % weight_idx))

    assert len(unique_fid_hash) == num_weights

    values = lagrange_lite_ops.lagrange_embedding_unpooling(
        num_weights=num_weights,
        weight_sizes=op.get_attr('weight_sizes'),
        use_fid_v2=op.get_attr('use_fid_v2'),
        output_grad=grad,
        instance_ids=op.inputs[1],
        fids=op.inputs[2],
        fid_to_unique_index=fid_to_unique_index,
        num_unique_fids_per_partition=num_unique_fids_per_partition,
        slot_size=op.inputs[3],
        slot_weight_index=op.inputs[4],
        slot_output_offset=op.inputs[5])

    weight_grads = []
    for offset, (fid_hash, value) in enumerate(zip(unique_fid_hash, values)):
        weight = op.inputs[first_weight_input + offset]
        weight_shape = tf.shape(weight, out_type=tf.int64)
        sparse_grad = tf.IndexedSlices(
            indices=fid_hash, values=value, dense_shape=weight_shape)
        weight_grads.append(sparse_grad)

    return [None] * first_weight_input + weight_grads
Exemplo n.º 15
0
def matmul_diag_sparse(A_diag, B, name=None):  # pylint: disable=invalid-name
  """Computes matmul(A, B) where A is a diagonal matrix, B is sparse.

  Multiplying by a diagonal matrix on the left scales row i of B by
  A_diag[i], so only the stored rows of B have to be touched.

  Args:
    A_diag: diagonal entries of matrix A of shape [m, m].
    B: tf.IndexedSlices. Represents matrix of shape [m, n].
    name: str. Name of op.

  Returns:
    tf.IndexedSlices resulting from matmul(A, B).

  Raises:
    ValueError: If A_diag is not rank-1.
    ValueError: If B doesn't represent a matrix.
  """
  with tf.name_scope(name, "matmul_diag_sparse", [A_diag, B]):
    A_diag = tf.convert_to_tensor(A_diag)
    if A_diag.shape.ndims != 1:
      raise ValueError("A_diag must be a rank-1 Tensor.")
    if B.indices.shape.ndims != 1 or B.values.shape.ndims != 2:
      raise ValueError("B must represent a matrix. Found: %s." % B)
    # One diagonal entry per stored row of B.
    a = tf.gather(A_diag, B.indices)
    # B.values is rank 2 (checked above), so a single trailing unit axis makes
    # `a` broadcast across the columns. Unlike a reshape built from the static
    # shape, this also works when the number of stored rows is unknown at
    # graph-construction time.
    a = tf.expand_dims(a, axis=-1)
    return tf.IndexedSlices(a * B.values, B.indices, dense_shape=B.dense_shape)
Exemplo n.º 16
0
def tgn_memory(
    n_nodes: int,
    memory_size: int,
    time_embedding_size: int,
    node_ids: tf.Tensor,
    write_idx: tf.Tensor,
    write_mask: tf.Tensor,
    write_features: tf.Tensor,
    write_times: tf.Tensor,
) -> TgnMemory:
    """Create TGN memory read & update operations.

    A trainable memory for nodes in a temporal interaction graph. The memory
    state is computed using the latest interaction event that touched a node.
    The update is a GRU cell, taking as input the previous memory of both source
    and destination nodes for that edge, the edge feature vector and time difference
    from interaction to current time.

    Note that the GRU cell is computed lazily when the memory is read, rather than
    when it is stored, to support a single step of truncated backpropagation through
    time and obtain a gradient for GRU variables.

    Please see "Temporal Graph Network" (https://arxiv.org/abs/2006.10637) for full
    details.

    Arguments:

      n_nodes -- total number of slots in the memory

      memory_size -- size of stored state in the memory / GRU cell output size

      time_embedding_size -- size of the time encoding activation provided to the
                             GRU cell

      node_ids -- shape (n_read), (-1 <= ID < n_nodes), the memory locations to be read

      write_idx -- shape (2, n_write), (0 <= idx < n_read), the (src, dst) indices of
                   edges, selecting nodes that should be written with their updated
                   memory state

      write_mask -- shape (2, n_write), boolean tensor for elements in write_idx that
                    should be written (true) or skipped (false), such that each memory
                    location is written at most once

      write_features -- shape (n_write, feature_size), input features to be stored and
                        used to compute the memory when it is next accessed

      write_times -- shape (n_write), edge event times to be stored and used to compute
                     the memory when it is next accessed

    Returns:

      TgnMemory(
        output      -- tensor of shape (n_read, memory_size), current memory for node_ids
        last_update -- tensor of shape (n_read), last update of output
        updates     -- tuple of operations to run to update the memory
      )
    """
    # Validate the input shapes. The values returned by assert_shape are
    # unpacked to obtain n_write and feature_size for the checks and
    # variable shapes below.
    assert_shape(node_ids, (None, ))
    _, n_write = assert_shape(write_idx, (2, None))
    assert_shape(write_mask, (2, n_write))
    _, feature_size = assert_shape(write_features, (n_write, None))
    assert_shape(write_times, (n_write, ))
    dtype = write_features.dtype

    # Declare memory
    # As an optimisation, we concatenate the 6 fields required by the memory
    # into 2 tensors, one consisting of ints, the other of floats.
    # This requires some extra code to slice and concat, but means we can use
    # 2 (dynamic) gather operations instead of 6.

    # Each row: [last_update, dt, neighbour]
    # (1 + n_nodes rows: row 0 is the padding slot, see safe_node_ids below)
    v_ints = tf.get_variable(
        "ints",
        shape=(1 + n_nodes, 3),
        dtype=tf.int32,
        trainable=False,
        initializer=tf.zeros_initializer(),
        collections=[tf.GraphKeys.GLOBAL_VARIABLES, TGN_MEMORY_VARIABLES_KEY],
    )
    # Each row: [memory, features, direction]
    # (direction occupies the last 2 columns: src=[1, 0], dst=[0, 1])
    v_floats = tf.get_variable(
        "floats",
        shape=(1 + n_nodes, memory_size + feature_size + 2),
        dtype=dtype,
        trainable=False,
        initializer=tf.zeros_initializer(),
        collections=[tf.GraphKeys.GLOBAL_VARIABLES, TGN_MEMORY_VARIABLES_KEY],
    )

    # Memory[0] is used for padding (node_ids == -1)
    safe_node_ids = 1 + node_ids

    # Read memory for node_ids
    node_ints = tf.gather(v_ints, safe_node_ids)
    node_last_update, node_dt, node_neighbour_idx = tf.unstack(node_ints,
                                                               axis=1)
    # The stored neighbour index is already a row of the memory variables
    # (it is written from the offset `indices` below), so it is used as-is.
    node_neighbour = tf.gather(v_floats[:, :memory_size], node_neighbour_idx)
    node_time_encoding = time_encoder(tf.cast(node_dt, tf.float32),
                                      time_embedding_size, dtype)

    # Split the float row back into its three fields.
    node_floats = tf.gather(v_floats, safe_node_ids)
    node_self = node_floats[:, :memory_size]
    node_features = node_floats[:, memory_size:memory_size + feature_size]
    node_direction = node_floats[:, memory_size + feature_size:]

    # Lazily apply the GRU update on read. The two direction columns select
    # which of (self, neighbour) goes into the first and second input slot.
    node_memory = gru_cell(
        node_self,
        tf.concat(
            [
                node_direction[:, 0, tf.newaxis] * node_self +
                node_direction[:, 1, tf.newaxis] * node_neighbour,
                node_direction[:, 1, tf.newaxis] * node_self +
                node_direction[:, 0, tf.newaxis] * node_neighbour,
                node_features,
                node_time_encoding,
            ],
            axis=1,
        ),
    )

    # Write memory according to (write_idx, write_mask)
    # write_idx is flattened, so the p_* tensors below have one row per entry
    # of the flattened (2, n_write) index tensor.
    flat_write_idx = tf.reshape(write_idx, (-1, ))
    indices = tf.gather(safe_node_ids, flat_write_idx)
    # Masked-out writes are redirected to row 0, the padding slot.
    masked_indices = indices * tf.cast(tf.reshape(write_mask,
                                                  (-1, )), indices.dtype)
    p_last_update = tf.reshape(tf.tile(write_times[tf.newaxis], (2, 1)),
                               (-1, ))
    p_dt = p_last_update - tf.gather(node_last_update, flat_write_idx)
    # Swap src and dst indices to get the neighbour index for each node
    p_neighbour = tf.roll(indices, n_write, 0)
    p_memory = tf.gather(node_memory, flat_write_idx)
    p_features = tf.tile(write_features, (2, 1))
    p_direction = tf.repeat(tf.eye(2, dtype=dtype), n_write,
                            0)  # src=[1, 0], dst=[0, 1]

    # There is already a data dependency, but just to be sure...
    # (the reads above must complete before the variables are overwritten)
    with tf.control_dependencies([node_last_update, node_memory]):
        update_ints = v_ints.scatter_update(
            tf.IndexedSlices(
                tf.stack([p_last_update, p_dt, p_neighbour], axis=1),
                masked_indices))
        update_floats = v_floats.scatter_update(
            tf.IndexedSlices(
                tf.concat([p_memory, p_features, p_direction], axis=1),
                masked_indices))

    return TgnMemory(
        output=node_memory,
        last_update=node_last_update,
        updates=(update_ints, update_floats),
    )
Exemplo n.º 17
0
def freelb(model_fn,
           inputs,
           batch_size,
           max_length,
           optimizer=None,
           layer_name='word_embeddings',
           epsilon=0.3,
           n_loop=3):
    """Build multi-step adversarial training ops that perturb an embedding.

    The model is first built on the clean embeddings. A random perturbation
    is then added to the embedding variable, followed by `n_loop` steps that
    each re-build the model, take the gradient w.r.t. the embedding and move
    the accumulated perturbation along it (rescaled when its norm exceeds
    `epsilon`). Finally the accumulated perturbation is subtracted again and
    the gradients collected along the way are averaged.

    Args:
        model_fn: Callable invoked as `model_fn(inputs, True)`; its result is
            indexed with 'loss'.
        inputs: Passed unchanged to `model_fn`.
        batch_size: Together with `max_length` gives the first dimension of
            the perturbation variable (`batch_size * max_length`).
        max_length: See `batch_size`.
        optimizer: Forwarded to `utils.compute_gradients`.
        layer_name: Forwarded to `utils.find_grad_and_var` to select the
            embedding gradient/variable pair.
        epsilon: Range of the uniform initializer of the perturbation and
            norm threshold for the accumulated perturbation.
        n_loop: Number of attack steps.

    Returns:
        AdversarialOutput holding the outputs of the first (clean) model call
        and the averaged gradients and variables.
    """
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        model_outputs = model_fn(inputs, True)
        grads_and_vars = utils.compute_gradients(model_outputs['loss'],
                                                 optimizer)
    # Gradient of the loss w.r.t. the embedding
    embedding_gradients, embeddings = utils.find_grad_and_var(
        grads_and_vars, layer_name)
    init_r = tf.get_variable(
        'init_r',
        shape=[batch_size * max_length,
               embeddings.shape.as_list()[-1]],
        initializer=tf.random_uniform_initializer(minval=-epsilon,
                                                  maxval=epsilon),
        trainable=False)
    # Re-running the initializer draws a fresh random perturbation each time
    # the ops below are executed.
    init_op = tf.variables_initializer([init_r])
    with tf.control_dependencies([init_op]):  # fix perturbation
        # Scale randomly initialized perturbation, to make sure norm
        # of `r` is smaller than epsilon.
        # NOTE(review): dividing by the inf-norm bounds the entries of `r` by
        # 1 rather than by epsilon -- confirm this is intended.
        r = tf.divide(init_r, tf.norm(init_r, np.inf))
        # Reuse the indices of the embedding gradient so that the
        # perturbation targets the same embedding rows.
        r = tf.IndexedSlices(values=r,
                             indices=embedding_gradients.indices,
                             dense_shape=embedding_gradients.dense_shape)
        attack_op = embeddings.assign(embeddings + r)
    # attack
    acc_r = r
    all_grads_and_vars = []
    for k in range(n_loop):
        # Each step is built after the previous perturbation was applied.
        with tf.variable_scope(tf.get_variable_scope(),
                               reuse=tf.AUTO_REUSE), tf.control_dependencies(
                                   [attack_op]):
            adv_outputs = model_fn(inputs, True)
            attack_grad_and_vars = utils.compute_gradients(
                adv_outputs['loss'], optimizer)
            all_grads_and_vars.append(attack_grad_and_vars)
            gradients, _ = utils.find_grad_and_var(attack_grad_and_vars,
                                                   layer_name)
            # Normalized gradient direction, scaled by 1 / n_loop.
            tmp_r = tf.multiply(1 / n_loop,
                                gradients / (tf.norm(gradients) + 1e-9))

            # In order not to shuffle the distribution of gradient-
            # induced perturbation, we use norm to scale instead of
            # simply clip the values.
            norm = tf.norm(acc_r + tmp_r)
            cur_r = tf.cond(norm > epsilon, lambda:
                            (acc_r + tmp_r) * tf.divide(epsilon, norm), lambda:
                            (acc_r + tmp_r))
            r = cur_r - acc_r  # calculate current step
            attack_op = embeddings.assign(embeddings + r)
            acc_r = cur_r
    # restore
    with tf.variable_scope(tf.get_variable_scope(),
                           reuse=tf.AUTO_REUSE), tf.control_dependencies(
                               [attack_op]):
        attack_outputs = model_fn(inputs, True)
        attack_grad_and_vars = utils.compute_gradients(attack_outputs['loss'],
                                                       optimizer)

        all_grads_and_vars.append(attack_grad_and_vars)
        # Remove the whole accumulated perturbation from the embeddings.
        restore_op = embeddings.assign(embeddings - acc_r)

    # sum up
    with tf.control_dependencies([restore_op]):
        grads_and_vars = utils.average_grads_and_vars(all_grads_and_vars)
    return AdversarialOutput(model_outputs, grads_and_vars)
Exemplo n.º 18
0
def _clip_by_global_norm(t_list, clip_norm, use_norm, name=None):
    """Clips values of multiple tensors by the ratio of the sum of their norms.
  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. The global
  norm is expected to be pre-computed and passed as use_norm.
  To perform the clipping, the values `t_list[i]` are set to:
      t_list[i] * clip_norm / max(global_norm, clip_norm)
  where:
      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))
  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.
  Any of the entries of `t_list` that are of type `None` are ignored.
  This is the correct way to perform gradient clipping (for example, see
  [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063)
  ([pdf](http://arxiv.org/pdf/1211.5063.pdf))).
  However, it is slower than `clip_by_norm()` because all the parameters must be
  ready before the clipping operation can be performed.
  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the norm.
    name: A name for the operation (optional).
  Returns:
    list_clipped: A list of `Tensors` of the same type as `list_t`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.
  Raises:
    TypeError: If `t_list` is not a sequence.
  """
    if not isinstance(t_list, collections.Sequence) or isinstance(
            t_list, six.string_types):
        raise TypeError('t_list should be a sequence')
    t_list = list(t_list)

    # Removed as use_norm should always be passed
    # if use_norm is None:
    #   use_norm = global_norm(t_list, name)

    with tf.name_scope(name, 'clip_by_global_norm',
                       t_list + [clip_norm]) as name:
        # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
        scale = clip_norm * tf.minimum(
            1.0 / use_norm,
            tf.ones([1], dtype=use_norm.dtype) / clip_norm)

        values = [
            tf.cast(
                tf.convert_to_tensor(
                    t.values if isinstance(t, tf.IndexedSlices) else t,
                    name='t_%d' % i,
                ),
                dtype=tf.float32,
            ) if t is not None else t for i, t in enumerate(t_list)
        ]

        values_clipped = []
        for i, v in enumerate(values):
            if v is None:
                values_clipped.append(None)
            else:
                with tf.colocate_with(v):
                    values_clipped.append(
                        tf.identity(v * scale, name='%s_%d' % (name, i)))

        list_clipped = [
            tf.IndexedSlices(c_v, t.indices, t.dense_shape) if isinstance(
                t, tf.IndexedSlices) else c_v
            for (c_v, t) in zip(values_clipped, t_list)
        ]

    return list_clipped, use_norm
Exemplo n.º 19
0
def get_train_ops(loss,
                  tf_variables,
                  train_step,
                  clip_mode=None,
                  grad_bound=None,
                  l2_reg=1e-4,
                  lr_warmup_val=None,
                  lr_warmup_steps=100,
                  lr_init=0.1,
                  lr_dec_start=0,
                  lr_dec_every=10000,
                  lr_dec_rate=0.1,
                  lr_dec_min=None,
                  lr_cosine=False,
                  lr_max=None,
                  lr_min=None,
                  lr_T_0=None,
                  lr_T_mul=None,
                  num_train_batches=None,
                  optim_algo=None,
                  sync_replicas=False,
                  num_aggregate=None,
                  num_replicas=None,
                  get_grad_norms=False,
                  moving_average=None):
    """Build the gradient, learning-rate schedule, optimizer and train op.

    Args:
      loss: scalar loss tensor to minimize.
      tf_variables: list of variables to differentiate `loss` against.
      train_step: step counter; used by the learning-rate schedules and
        passed to `apply_gradients` as `global_step`.
      clip_mode: "global" (clip by global norm), "norm" (clip every gradient
        by its own norm), or None (no clipping).
      grad_bound: clipping threshold; required when `clip_mode` is set.
      l2_reg: coefficient of the L2 penalty summed over `tf_variables`;
        the penalty is skipped when it is not positive.
      lr_warmup_val: if not None, learning rate used while
        `train_step < lr_warmup_steps`.
      lr_warmup_steps: number of warm-up steps.
      lr_init, lr_dec_start, lr_dec_every, lr_dec_rate: parameters of the
        staircase exponential decay used when `lr_cosine` is False.
      lr_dec_min: optional lower bound for the exponentially decayed rate.
      lr_cosine: use a cosine schedule with restarts instead of the
        exponential decay.
      lr_max, lr_min: upper and lower learning rate of the cosine schedule.
      lr_T_0: initial cosine period, in epochs.
      lr_T_mul: factor the cosine period is multiplied by at every restart.
      num_train_batches: divisor turning `train_step` into an epoch index
        for the cosine schedule.
      optim_algo: "momentum", "sgd" or "adam".
      sync_replicas: wrap the optimizer in `tf.train.SyncReplicasOptimizer`.
      num_aggregate: replicas to aggregate; required with `sync_replicas`.
      num_replicas: total replicas; required with `sync_replicas`.
      get_grad_norms: also return the per-variable gradient norms.
      moving_average: if not None, decay used to wrap the optimizer in
        `tf.contrib.opt.MovingAverageOptimizer`.

    Returns:
      `(train_op, learning_rate, grad_norm, opt)`, followed by `grad_norms`
      (dict mapping variable name to the norm of its gradient) when
      `get_grad_norms` is True. `grad_norm` and `grad_norms` are computed
      before any clipping is applied.

    Raises:
      AssertionError: if an argument required by the selected clipping,
        cosine schedule or replica synchronisation is missing.
      NotImplementedError: if `clip_mode` is not a known mode.
      ValueError: if `optim_algo` is not a known optimizer.
    """

    if l2_reg > 0:
        l2_loss = tf.add_n([tf.reduce_sum(var**2) for var in tf_variables])
        loss += l2_reg * l2_loss

    grads = tf.gradients(loss, tf_variables)
    grad_norm = tf.global_norm(grads)

    grad_norms = {}
    for v, g in zip(tf_variables, grads):
        if v is None or g is None:
            continue
        g_values = g.values if isinstance(g, tf.IndexedSlices) else g
        grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g_values**2))

    if clip_mode is not None:
        assert grad_bound is not None, "Need grad_bound to clip gradients."
        if clip_mode == "global":
            grads, _ = tf.clip_by_global_norm(grads, grad_bound)
        elif clip_mode == "norm":
            clipped = []
            for g in grads:
                if g is None:
                    # Variables without a gradient path stay None, exactly
                    # as tf.clip_by_global_norm leaves them.
                    c_g = None
                elif isinstance(g, tf.IndexedSlices):
                    # IndexedSlices takes (values, indices, dense_shape).
                    c_g = tf.IndexedSlices(
                        tf.clip_by_norm(g.values, grad_bound), g.indices,
                        g.dense_shape)
                else:
                    c_g = tf.clip_by_norm(g, grad_bound)
                # Keep the clipped gradient, not the raw one.
                clipped.append(c_g)
            grads = clipped
        else:
            raise NotImplementedError("Unknown clip_mode {}".format(clip_mode))

    if lr_cosine:
        assert lr_max is not None, "Need lr_max to use lr_cosine"
        assert lr_min is not None, "Need lr_min to use lr_cosine"
        assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
        assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"
        assert num_train_batches is not None, ("Need num_train_batches to use"
                                               " lr_cosine")

        # train_step advances once per batch, so this is the epoch index.
        curr_epoch = train_step // num_train_batches

        last_reset = tf.Variable(0,
                                 dtype=tf.int32,
                                 trainable=False,
                                 name="last_reset")
        T_i = tf.Variable(lr_T_0, dtype=tf.int32, trainable=False, name="T_i")
        T_curr = curr_epoch - last_reset

        def _cosine_lr():
            # Cosine interpolation between lr_max and lr_min over T_i epochs.
            rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
            return lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))

        def _update():
            # Restart: remember the epoch of the reset and stretch the period
            # before evaluating the schedule.
            update_last_reset = tf.assign(last_reset,
                                          curr_epoch,
                                          use_locking=True)
            update_T_i = tf.assign(T_i, T_i * lr_T_mul, use_locking=True)
            with tf.control_dependencies([update_last_reset, update_T_i]):
                return _cosine_lr()

        learning_rate = tf.cond(tf.greater_equal(T_curr, T_i), _update,
                                _cosine_lr)
    else:
        learning_rate = tf.train.exponential_decay(
            lr_init,
            tf.maximum(train_step - lr_dec_start, 0),
            lr_dec_every,
            lr_dec_rate,
            staircase=True)
        if lr_dec_min is not None:
            learning_rate = tf.maximum(learning_rate, lr_dec_min)

    if lr_warmup_val is not None:
        learning_rate = tf.cond(tf.less(train_step, lr_warmup_steps),
                                lambda: lr_warmup_val, lambda: learning_rate)

    if optim_algo == "momentum":
        opt = tf.train.MomentumOptimizer(learning_rate,
                                         0.9,
                                         use_locking=True,
                                         use_nesterov=True)
    elif optim_algo == "sgd":
        opt = tf.train.GradientDescentOptimizer(learning_rate,
                                                use_locking=True)
    elif optim_algo == "adam":
        opt = tf.train.AdamOptimizer(learning_rate,
                                     beta1=0.0,
                                     epsilon=1e-3,
                                     use_locking=True)
    else:
        raise ValueError("Unknown optim_algo {}".format(optim_algo))

    if sync_replicas:
        assert num_aggregate is not None, "Need num_aggregate to sync."
        assert num_replicas is not None, "Need num_replicas to sync."

        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_aggregate,
            total_num_replicas=num_replicas,
            use_locking=True)

    if moving_average is not None:
        opt = tf.contrib.opt.MovingAverageOptimizer(
            opt, average_decay=moving_average)

    train_op = opt.apply_gradients(zip(grads, tf_variables),
                                   global_step=train_step)

    if get_grad_norms:
        return train_op, learning_rate, grad_norm, opt, grad_norms
    return train_op, learning_rate, grad_norm, opt
Exemplo n.º 20
0
def compute_gradients(total_loss):
    """Build the optimizer and the (optionally clipped and scaled) gradients.

    The learning rate ramps linearly from `learning_rate * min_lr_ratio` to
    `learning_rate` over `FLAGS.warmup_steps`, then follows the polynomial
    or cosine decay selected by `FLAGS.decay_method`.

    Args:
      total_loss: loss tensor differentiated against all trainable variables.

    Returns:
      `(optimizer, grad_and_vars, global_step, monitor_dict)` where
      `grad_and_vars` is a list of `(gradient, variable)` pairs and
      `monitor_dict` holds the "local_gnorm" and "learning_rate" tensors.

    Raises:
      ValueError: if `FLAGS.decay_method` is neither "poly" nor "cos", or if
        weight decay is requested for multi-core non-TPU training.
    """
    monitor_dict = {}
    print(FLAGS.weight_decay, "==weight_decay==")
    print(FLAGS.lr_layer_decay_rate, "==lr_layer_decay_rate==")
    print(FLAGS.use_wd_exclusion, "==use_wd_exclusion==")
    print(FLAGS.adam_correction, "==adam_correction==")

    ##### Configure optimizer
    global_step = tf.train.get_or_create_global_step()

    # Warmup the learning rate linearly
    if FLAGS.warmup_steps > 0:
        progress = (tf.cast(global_step, tf.float32) /
                    tf.cast(FLAGS.warmup_steps, tf.float32))
    else:
        progress = 1.0
    curr_ratio = progress + (1.0 - progress) * FLAGS.min_lr_ratio
    warmup_lr = curr_ratio * FLAGS.learning_rate

    # Decay the learning rate
    if FLAGS.decay_method == "poly":
        decay_lr = tf.train.polynomial_decay(
            FLAGS.learning_rate,
            global_step=global_step - FLAGS.warmup_steps,
            decay_steps=FLAGS.train_steps - FLAGS.warmup_steps,
            end_learning_rate=FLAGS.learning_rate * FLAGS.min_lr_ratio)
    elif FLAGS.decay_method == "cos":
        decay_lr = tf.train.cosine_decay(
            FLAGS.learning_rate,
            global_step=global_step - FLAGS.warmup_steps,
            decay_steps=FLAGS.train_steps - FLAGS.warmup_steps,
            alpha=FLAGS.min_lr_ratio)
    else:
        raise ValueError(FLAGS.decay_method)

    learning_rate = tf.where(global_step < FLAGS.warmup_steps, warmup_lr,
                             decay_lr)

    if (FLAGS.weight_decay > 0 and not FLAGS.use_tpu
            and FLAGS.num_core_per_host > 1):
        raise ValueError("Do not support `weight_decay > 0` with multi-gpu "
                         "training so far.")

    if FLAGS.use_wd_exclusion:
        exclude_from_weight_decay = ["LayerNorm", "layer_norm", "bias"]
    else:
        exclude_from_weight_decay = []

    print(exclude_from_weight_decay, "==exclude_from_weight_decay==")

    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        beta_1=FLAGS.adam_beta1,
        beta_2=FLAGS.adam_beta2,
        epsilon=FLAGS.adam_epsilon,
        bias_correction=FLAGS.adam_correction,
        exclude_from_weight_decay=exclude_from_weight_decay,
        weight_decay_rate=FLAGS.weight_decay)

    if FLAGS.use_tpu:
        if FLAGS.per_core_clip:
            # Clipping happens locally below, so the cross-shard wrapper
            # is not given a clip value.
            optimizer = tpu_optimizer.CrossShardOptimizer(
                optimizer, skip_nan_grad=FLAGS.skip_nan_grad)
        else:
            optimizer = tpu_optimizer.CrossShardOptimizer(
                optimizer, skip_nan_grad=FLAGS.skip_nan_grad, clip=FLAGS.clip)

    ##### Compute gradient
    variables = tf.trainable_variables()
    gradients = tf.gradients(total_loss, variables)

    if FLAGS.clip > 0 and FLAGS.per_core_clip:
        tf.logging.info("Clip local gradient with norm %.3f.", FLAGS.clip)
        clipped, local_gnorm = tf.clip_by_global_norm(gradients, FLAGS.clip)
    else:
        tf.logging.info("Do not clip local gradient.")
        clipped = list(gradients)
        local_gnorm = tf.linalg.global_norm(gradients)

    # layer-wise learning rate decay
    if FLAGS.lr_layer_decay_rate != 1.0:

        def _get_layer_id(name):
            # Input variables are layer 0; encoder/decoder layer k is k + 1;
            # anything else has no layer id and is left undecayed.
            if "model/input" in name:
                return 0
            m = re.search(r"model/(encoder|decoder)/layer_(\d+?)/", name)
            if not m:
                return None
            return int(m.group(2)) + 1

        # Resolve every variable's layer id once and reuse it below.
        layer_ids = [_get_layer_id(var.name) for var in variables]

        n_layer = 0
        for layer_id in layer_ids:
            if layer_id is not None:
                n_layer = max(n_layer, layer_id + 1)

        for i, (var, layer_id) in enumerate(zip(variables, layer_ids)):
            if layer_id is None:
                tf.logging.info("Grad of %s is not decayed.", var.name)
                continue

            grad = clipped[i]
            if grad is None:
                # tf.gradients yields None for variables the loss does not
                # depend on; there is nothing to scale.
                continue

            # The highest layer keeps a multiplier of 1; each layer below
            # is scaled by one more factor of lr_layer_decay_rate.
            abs_rate = FLAGS.lr_layer_decay_rate**(n_layer - 1 - layer_id)
            tf.logging.info("Apply mult %.4f to the grad of %s", abs_rate,
                            var.name)
            if isinstance(grad, tf.IndexedSlices):
                clipped[i] = tf.IndexedSlices(grad.values * abs_rate,
                                              grad.indices,
                                              grad.dense_shape)
            else:
                clipped[i] = grad * abs_rate

    grad_and_vars = list(zip(clipped, variables))

    monitor_dict["local_gnorm"] = local_gnorm
    monitor_dict["learning_rate"] = learning_rate

    return optimizer, grad_and_vars, global_step, monitor_dict
Exemplo n.º 21
0
 def _resource_apply_sparse(self, grad, handle, indices):
     """Apply a sparse gradient by densifying it and using the dense path.

     Args:
       grad: values of the sparse gradient.
       handle: the variable being updated; its shape is used as the dense
         shape of the gradient.
       indices: indices that `grad` refers to.

     Returns:
       Whatever `self._resource_apply_dense` returns for the dense gradient.
     """
     sparse_grad = tf.IndexedSlices(grad, indices, tf.shape(handle))
     dense_grad = tf.convert_to_tensor(sparse_grad)
     return self._resource_apply_dense(dense_grad, handle)
Exemplo n.º 22
0
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Accumulate gradients and apply them through the wrapped optimizer.

        Each gradient is divided by `self._grad_steps` and added to the
        variable's 'grad_accum' slot. On the TPU path the accumulated values
        are handed to `self._opt.apply_gradients` (and the slots zeroed) when
        `self._counter % self._grad_steps == self._grad_steps - 1`; on other
        steps only the counters advance. On the non-TPU path the decision is
        delegated to `self._maybe_apply_grads_and_zero` via `merge_call`.

        Args:
          grads_and_vars: iterable of `(gradient, variable)` pairs.
          global_step: step variable; incremented by one on every call in
            the TPU path and forwarded to `_maybe_apply_grads_and_zero`
            otherwise.
            NOTE(review): defaults to None, but the TPU path passes it to
            `tf.assign_add` unconditionally - confirm callers always
            provide it.
          name: not used by this method.

        Returns:
          The grouped op of the TPU path, or the result of `merge_call` on
          the non-TPU path.
        """
        # Split the pairs into two parallel lists.
        grad_list = []
        var_list = []
        for g, v in grads_and_vars:
            grad_list.append(g)
            var_list.append(v)
        # Slots are created inside tf.init_scope().
        with tf.init_scope():
            self._create_slots(var_list)

        # accumulate gradients
        accums = []
        for g, v in zip(grad_list, var_list):
            accum = self.get_slot(v, 'grad_accum')
            # pytype: disable=attribute-error
            if isinstance(g, tf.IndexedSlices):
                # Scale only the values; indices and dense shape are kept.
                scaled_grad = tf.IndexedSlices(g.values / self._grad_steps,
                                               g.indices,
                                               dense_shape=g.dense_shape)
                # NOTE(review): self._sharding is defined elsewhere;
                # presumably it annotates the tensor's sharding - confirm.
                accums.append(
                    accum.assign(
                        self._sharding(accum.read_value()) + scaled_grad))
            else:
                accums.append(
                    accum.assign(
                        self._sharding(accum.read_value()) +
                        g / self._grad_steps))
            # pytype: enable=attribute-error

        if self._use_tpu:

            def _apply_and_zero_tpu2():
                # Apply the accumulated gradients, then reset the slots and
                # advance global_step.
                normalized_accums = accums
                if self._apply_crs_to_grad:
                    # Sum the accumulated values across replicas first.
                    normalized_accums = [
                        tf.tpu.cross_replica_sum(accum.read_value())
                        for accum in accums
                    ]
                apply_op = self._opt.apply_gradients(
                    list(zip(normalized_accums, var_list)))
                # Zero the accumulators only after the update has run.
                with tf.control_dependencies([apply_op]):
                    zero_op = [
                        tf.assign(accum, tf.zeros_like(accum))
                        for accum in accums
                    ]
                return tf.group(zero_op, tf.assign_add(global_step, 1))

            def _accum_tpu2():
                # Accumulation-only step: just advance global_step.
                return tf.group(tf.no_op(), tf.assign_add(global_step, 1))

            # Apply on the last of every self._grad_steps calls.
            accum_step = tf.cond(
                tf.equal(tf.mod(self._counter, self._grad_steps),
                         self._grad_steps - 1), _apply_and_zero_tpu2,
                _accum_tpu2)

            # The returned op depends on all accumulator updates and also
            # advances the internal call counter.
            with tf.control_dependencies([tf.group(accums)]):
                return tf.group(accum_step, tf.assign_add(self._counter, 1))

        # for GPUs, use merge_call outside tf.cond to avoid issues
        with tf.control_dependencies([tf.group(accums)]):
            merge_return = tf.distribute.get_replica_context().merge_call(
                self._maybe_apply_grads_and_zero,
                args=(global_step, accums, var_list))

        return merge_return