예제 #1
0
 def grad_variance(self):
     """Build the ops that estimate the gradient variance.

     Every gradient in ``self._grads`` is fed to ``self._moving_averager``.
     An ``IndexedSlices`` gradient is first summed per index and reshaped
     to the static shape of its variable. The estimate stored in
     ``self._grad_var`` is ``self._grad_norm_squared_avg`` minus the summed
     squares of the averaged gradients, floored at ``EPS``; when
     ``self._sparsity_debias`` is set it is additionally multiplied by
     ``self._sparsity_avg``.

     Returns:
         A one-element list holding the moving-average update op.
     """
     dense_grads = []
     for var, grad in zip(self._tvars, self._grads):
         if not isinstance(grad, ops.IndexedSlices):
             dense_grads.append(grad)
             continue
         # Sum the slice values by index so the result can take the
         # variable's shape.
         summed = tf.unsorted_segment_sum(grad.values, grad.indices,
                                          grad.dense_shape[0])
         dense_grads.append(tf.reshape(summed, shape=var.get_shape()))

     update_op = self._moving_averager.apply(dense_grads)
     # Read the averages only after the update op has run.
     with tf.control_dependencies([update_op]):
         self._grad_avg = list(
             map(self._moving_averager.average, dense_grads))
         self._grad_avg_squared = list(map(tf.square, self._grad_avg))

     floor = tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype)
     squared_mean_norm = tf.add_n(
         [tf.reduce_sum(sq) for sq in self._grad_avg_squared])
     self._grad_var = tf.maximum(
         floor, self._grad_norm_squared_avg - squared_mean_norm)
     if self._sparsity_debias:
         self._grad_var *= self._sparsity_avg
     return [update_op]
예제 #2
0
 def grad_sparsity(self):
     """Track the fraction of non-zero gradient entries with a moving average.

     A sparse minibatch gradient with 10 percent of its entries non-zero
     has a sparsity of 0.1. The norm of the dense gradient averaged over
     the full dataset is roughly the norm of the sparse minibatch gradient
     times sqrt(sparsity). A possible extension is to correct only the
     sparse blobs.

     Returns:
         The op that updates the moving average of ``self._sparsity``.
     """
     grads = self._grads
     nonzero_total = tf.add_n([tf.count_nonzero(grad) for grad in grads])
     entry_total = tf.add_n([tf.size(grad) for grad in grads])
     # Both counts are cast to the dtype of the first gradient before
     # dividing.
     ratio_dtype = grads[0].dtype
     self._sparsity = (tf.cast(nonzero_total, ratio_dtype) /
                       tf.cast(entry_total, ratio_dtype))

     update_op = self._moving_averager.apply([self._sparsity])
     # Read the average only after the update op has run.
     with tf.control_dependencies([update_op]):
         self._sparsity_avg = self._moving_averager.average(self._sparsity)
     return update_op
예제 #3
0
    def before_apply(self):
        """Build the statistics ops that run before gradients are applied.

        Creates ``self._moving_averager``, records the per-variable squared
        gradients and their squared norms, maintains a moving average of
        those norms, and then builds the curvature-range, gradient-variance
        and distance-to-optimum ops under a control dependency on that
        moving-average update.

        Returns:
            A single grouped op wrapping every update op collected here.
        """
        self._moving_averager = tf.train.ExponentialMovingAverage(
            decay=self._beta, zero_debias=self._zero_debias)
        assert self._grads is not None and len(self._grads) > 0
        update_ops = []

        # Per-variable g**2, each created colocated with its variable;
        # variables without a gradient are skipped.
        self._grad_squared = []
        self._grad_norm_squared = []
        for var, grad in zip(self._tvars, self._grads):
            if grad is not None:
                with ops.colocate_with(var):
                    self._grad_squared.append(tf.square(grad))
        # Per-variable norm**2.
        self._grad_norm_squared = [
            tf.reduce_sum(sq) for sq in self._grad_squared
        ]

        if self._sparsity_debias:
            update_ops.append(self.grad_sparsity())

        # The running average of the squared gradient norm is shared by
        # `grad_variance` and `dist_to_opt`.
        norm_avg_op = self._moving_averager.apply(self._grad_norm_squared)
        with tf.control_dependencies([norm_avg_op]):
            averaged_norms = [
                self._moving_averager.average(norm)
                for norm in self._grad_norm_squared
            ]
            # Collapse the per-variable lists into scalar totals.
            self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
            self._grad_norm_squared_avg = tf.add_n(averaged_norms)
        update_ops.append(norm_avg_op)

        with tf.control_dependencies([norm_avg_op]):
            update_ops.extend(self.curvature_range())
            update_ops.extend(self.grad_variance())
            update_ops.extend(self.dist_to_opt())
        return tf.group(*update_ops)
예제 #4
0
 def l2_loss(self, tvars=None):
     """Return the weight-decay-scaled L2 penalty over non-bias variables.

     The decay factor is read from
     ``self.config['solver']['optimizer']['weight_decay']``. When it is
     missing or falsy, no penalty is built.

     Args:
         tvars: Variables to regularize. A falsy value (``None`` or an
             empty collection) selects ``tf.trainable_variables()``.
             Variables whose name contains ``'bias'`` are always excluded.

     Returns:
         ``0.0`` when weight decay is disabled or when no variable is left
         after filtering; otherwise ``weight_decay`` times the sum of
         ``tf.nn.l2_loss(v)`` over the remaining variables.
     """
     _l2_loss = 0.0
     weight_decay = self.config['solver']['optimizer'].get(
         'weight_decay', None)
     if not weight_decay:
         return _l2_loss

     logging.info(f"add L2 Loss with decay: {weight_decay}")
     tvars = tvars if tvars else tf.trainable_variables()
     tvars = [v for v in tvars if 'bias' not in v.name]
     if not tvars:
         # tf.add_n rejects an empty list; with nothing to regularize the
         # penalty is simply zero.
         return _l2_loss

     with tf.name_scope('l2_loss'):
         _l2_loss = weight_decay * tf.add_n(
             [tf.nn.l2_loss(v) for v in tvars])
         summary_lib.scalar('l2_loss', _l2_loss)
     return _l2_loss
예제 #5
0
  def build(self, mode: str):
    """Build the model for training, eval and infer.

    Args:
      mode: Run mode; compared against ``utils.TRAIN`` and ``utils.INFER``.

    Returns:
      The model created by ``self.model_fn()`` with ``logits``,
      ``iterator``, ``input_x_dict``, ``input_x_len`` and ``loss_fn``
      attached. ``loss``, ``loss_op`` and ``input_y`` are attached as well,
      unless ``mode`` is ``utils.INFER`` and ``self.infer_no_label`` is
      truthy.
    """
    inputs = self.input_fn(mode)
    logging.info("build input data done...")

    model = self.model_fn()
    training = mode == utils.TRAIN
    model.logits = model(inputs["input_x_dict"], training=training)
    model.iterator = inputs["iterator"]
    model.input_x_dict = inputs["input_x_dict"]
    model.input_x_len = inputs["input_x_len"]
    model.loss_fn = self.get_loss_fn()
    # Labels are needed for everything except label-free inference.
    if mode != utils.INFER or not self.infer_no_label:
      input_y = inputs["input_y_dict"]["input_y"]
      if isinstance(model.loss_fn, list):
        # One loss per loss function, paired by position with the matching
        # label and logits entry; the training op is their sum.
        model.loss = []
        for i, one_loss_fn in enumerate(model.loss_fn):
          one_loss = one_loss_fn(
              labels=input_y[i],
              logits=model.logits[i],
              input_length=model.input_x_len,
              model=model,
              name="loss_{}".format(i))
          model.loss.append(one_loss)
        model.loss_op = tf.add_n(model.loss, name="loss_sum")
      else:
        model.loss = model.loss_fn(
            labels=input_y,
            logits=model.logits,
            input_length=model.input_x_len,
            model=model,
            name="loss")
        model.loss_op = model.loss
      logging.info("model.loss done")
      model.input_y = input_y

    # output related
    self.build_output(model)
    return model