Example #1
  def _average_gradients(self, tower_grads):
    """Calculate the average gradient for each shared variable across all towers.
      Note that this function provides a synchronization point across all towers.
      Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
          list ranges over the towers; the inner list ranges over the
          (gradient, variable) pairs computed on each tower.
      Returns:
        List of pairs of (gradient, variable) where the gradient has been
        averaged across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
      # Note that each grad_and_vars looks like the following:
      #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
      grads = []
      for g, v in grad_and_vars:
        # Add 0 dimension to the gradients to represent the tower.
        if g is None:
          log.warning("No gradient for variable \"{}\"".format(v.name))
          grads.append(None)
          break
        else:
          expanded_g = tf.expand_dims(g, 0)
          grads.append(expanded_g)

      # Average over the "tower" dimension.
      if grads[0] is None:
        grad = None
      else:
        grad = concat(grads, axis=0)
        grad = tf.reduce_mean(grad, 0)

      # Keep in mind that the Variables are redundant because they are shared
      # across towers. So we will just return the first tower's pointer to
      # the Variable.
      v = grad_and_vars[0][1]
      grad_and_var = (grad, v)
      average_grads.append(grad_and_var)
    return average_grads
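
A minimal, self-contained sketch (TF 1.x, not from the original repository) of how per-tower gradients would be fed into an averaging step like the one above; it reproduces the expand_dims/concat/reduce_mean logic with the equivalent tf.stack, and the toy losses and variable names are hypothetical:

import tensorflow as tf

# Two towers computing gradients for the same shared variable `w`.
x0 = tf.constant([[1.0, 2.0]])
x1 = tf.constant([[3.0, 4.0]])
w = tf.get_variable("w", initializer=tf.ones([2, 1]))
opt = tf.train.GradientDescentOptimizer(0.1)

# tower_grads is indexed [tower][variable], the layout _average_gradients expects.
tower_grads = [
    opt.compute_gradients(tf.reduce_sum(tf.matmul(x0, w)), var_list=[w]),
    opt.compute_gradients(tf.reduce_sum(tf.matmul(x1, w)), var_list=[w]),
]

# Equivalent of the loop above: stack each variable's per-tower gradients along
# a new axis 0 and take the mean over that axis.
average_grads = []
for grad_and_vars in zip(*tower_grads):
    grads = tf.stack([g for g, _ in grad_and_vars], axis=0)
    average_grads.append((tf.reduce_mean(grads, 0), grad_and_vars[0][1]))

train_op = opt.apply_gradients(average_grads)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)
    print(sess.run(w))  # w updated once with the averaged gradient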
Example #2
    def build_inference_network(self, x):
        config = self.config
        is_training = self.is_training
        num_stages = len(self.config.num_residual_units)
        strides = config.strides
        activate_before_residual = config.activate_before_residual
        filters = [ff for ff in config.filters]  # Copy the filter sizes from the config.
        h = self._init_conv(x, filters[0])  # Initial conv layer; filters[0] is its kernel count.
        if config.use_bottleneck:
            res_func = self._bottleneck_residual
            # For CIFAR-10 it's [16, 16, 32, 64] => [16, 64, 128, 256] -- the channel count grows.
            for ii in range(1, len(filters)):
                filters[ii] *= 4
        else:
            res_func = self._residual

        # New version, single for-loop. Easier for checkpoint.
        # Build the residual blocks in a loop.
        nlayers = sum(config.num_residual_units)
        ss = 0
        ii = 0
        for ll in range(nlayers):
            if ss == 0 and ii == 0:
                no_activation = True
            else:
                no_activation = False
            if ii == 0:
                if ss == 0:
                    no_activation = True
                else:
                    no_activation = False
                in_filter = filters[ss]
                stride = self._stride_arr(strides[ss])
            else:
                in_filter = filters[ss + 1]
                stride = self._stride_arr(1)
            out_filter = filters[ss + 1]

            # Save the hidden state.
            if ii == 0:
                self._saved_hidden.append(h)

            # Build the residual unit.
            with tf.variable_scope("unit_{}_{}".format(ss + 1, ii)):
                h = res_func(h,
                             in_filter,
                             out_filter,
                             stride,
                             no_activation=no_activation,
                             add_bn_ops=True)

            if (ii + 1) % config.num_residual_units[ss] == 0:
                ss += 1
                ii = 0
            else:
                ii += 1

        # Save the hidden state.
        self._saved_hidden.append(h)

        # Make a single tensor.
        if type(h) == tuple:
            h = concat(h, axis=3)

        with tf.variable_scope("unit_last"):
            h = self._batch_norm("final_bn", h)
            h = self._relu("final_relu", h)

        # Global average pooling over the spatial dimensions.
        h = self._global_avg_pool(h)

        # Classification layer.
        with tf.variable_scope("logit"):
            logits = self._fully_connected(h, config.num_classes)
        return logits
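Example #3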
    def build_inference_network(self, x):
        config = self.config
        is_training = self.is_training
        num_stages = len(self.config.num_residual_units)
        strides = config.strides
        activate_before_residual = config.activate_before_residual
        filters = [ff for ff in config.filters]  # Copy filter config.
        init_filter = config.init_filter

        with tf.variable_scope("init"):
            h = self._conv("init_conv", x, init_filter,
                           self.config.num_channel, filters[0],
                           self._stride_arr(config.init_stride))
            h = self._batch_norm("init_bn", h)
            h = self._relu("init_relu", h)

            # Max-pooling is used in ImageNet experiments to further reduce
            # dimensionality.
            if config.init_max_pool:
                h = tf.nn.max_pool(h, [1, 3, 3, 1], [1, 2, 2, 1], "SAME")

        if config.use_bottleneck:
            res_func = self._bottleneck_residual
            # For CIFAR-10 it's [16, 16, 32, 64] => [16, 64, 128, 256]
            for ii in range(1, len(filters)):
                filters[ii] *= 4
        else:
            res_func = self._residual

        # New version, single for-loop. Easier for checkpoint.
        nlayers = sum(config.num_residual_units)
        ss = 0
        ii = 0
        for ll in range(nlayers):
            # Residual unit configuration.
            if ss == 0 and ii == 0:
                no_activation = True
            else:
                no_activation = False
            if ii == 0:
                if ss == 0:
                    no_activation = True
                else:
                    no_activation = False
                in_filter = filters[ss]
                stride = self._stride_arr(strides[ss])
            else:
                in_filter = filters[ss + 1]
                stride = self._stride_arr(1)
            out_filter = filters[ss + 1]

            # Save hidden state.
            if ii == 0:
                self._saved_hidden.append(h)

            # Build residual unit.
            with tf.variable_scope("unit_{}_{}".format(ss + 1, ii)):
                h = res_func(h,
                             in_filter,
                             out_filter,
                             stride,
                             no_activation=no_activation,
                             add_bn_ops=True)

            if (ii + 1) % config.num_residual_units[ss] == 0:
                ss += 1
                ii = 0
            else:
                ii += 1

        # Save hidden state.
        self._saved_hidden.append(h)
        print('length of saved hidden is {}'.format(len(self._saved_hidden)))
        print(self._saved_hidden)

        # Make a single tensor.
        if type(h) == tuple:
            print('Yes. h is a tuple')
            h = concat(h, axis=3)

        with tf.variable_scope("unit_last"):
            h = self._batch_norm("final_bn", h)
            h = self._relu("final_relu", h)

        print('shape of h is {}'.format(h.shape))
        h = self._global_avg_pool(h)
        print('shape of h is {} (after avg_pool)'.format(h.shape))

        # Classification layer.
        with tf.variable_scope("logit"):
            logits = self._fully_connected(h, config.num_classes)
        print('logits shape is {}'.format(logits.shape))
        # exit(0)
        return logits
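
Both versions above walk all residual units with a single flat loop; the stage/unit bookkeeping can be seen in isolation in this plain-Python sketch (the unit counts are hypothetical, e.g. a ResNet-20-style [3, 3, 3]):

num_residual_units = [3, 3, 3]

ss = 0  # stage index
ii = 0  # unit index within the current stage
for ll in range(sum(num_residual_units)):
    print("layer {:2d} -> unit_{}_{}".format(ll, ss + 1, ii))
    if (ii + 1) % num_residual_units[ss] == 0:
        ss += 1  # last unit of the stage: advance to the next stage
        ii = 0
    else:
        ii += 1
# Prints unit_1_0 .. unit_1_2, then unit_2_0 .. unit_2_2, then unit_3_0 .. unit_3_2.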
Example #4
  def _build_towers(self):
    # Calculate the gradients for each model tower.
    config = self.config
    tower_grads = []
    op_list = []

    with tf.device(self._get_device("cpu", 0)):
      inputs = split(self.input, self.num_replica, axis=0)
      labels = split(self.label, self.num_replica, axis=0)
      outputs = []
      costs = []
      cross_ents = []
      tower_grads_and_vars = []

      if self.is_training:
        self._lr = tf.get_variable(
            "learn_rate", [],
            initializer=tf.constant_initializer(0.0),
            dtype=self.dtype,
            trainable=False)

      for ii in range(self.num_replica):
        visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
        if visible_devices is None:
          device = self._get_device("cpu", 0)
        else:
          # CUDA_VISIBLE_DEVICES is a comma-separated string of device ids.
          num_gpu = len(visible_devices.split(","))
          device = self._get_device("gpu", ii % num_gpu)
        with tf.device(device):
          with tf.name_scope("%s_%d" % ("replica", ii)) as scope:
            tower_ = self._tower_cls(
                config,
                is_training=self.is_training,
                inference_only=True,
                inp=inputs[ii],
                label=labels[ii],
                batch_size=self._batch_size,
                idx=ii)
            outputs.append(tower_.output)
            cross_ents.append(tower_.cross_ent)
            costs.append(tower_.cost)
            self._towers.append(tower_)

            if self.is_training:
              # Calculate the gradients for the batch of data on this tower.
              wd_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
              if len(wd_losses) > 0:
                log.info("Replica {}, Weight decay variables: {}".format(
                    ii, wd_losses))
                log.info("Replica {}, Number of weight decay variables: {}".
                         format(ii, len(wd_losses)))
              tower_grads_and_vars.append(
                  tower_._compute_gradients(tower_.cost))

            log.info("Replica {} built".format(ii), verbose=0)

            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()

      self._output = concat(outputs, axis=0)
      self._output_idx = tf.cast(tf.argmax(self._output, axis=1), tf.int32)
      self._correct = tf.to_float(tf.equal(self._output_idx, self.label))
      self._cost = tf.reduce_mean(stack(costs))
      self._cross_ent = tf.reduce_mean(stack(cross_ents))
      if not self.is_training or self.inference_only:
        return

      grads_and_vars = self._average_gradients(tower_grads_and_vars)
      self._tower_grads_and_vars = tower_grads_and_vars
      self._grads_and_vars = grads_and_vars

      if self._apply_grad:
        tf.get_variable_scope()._reuse = None
        global_step = tf.get_variable(
            "global_step", [],
            initializer=tf.constant_initializer(0.0),
            trainable=False,
            dtype=self.dtype)
        self._global_step = global_step
        opt = tf.train.MomentumOptimizer(self.lr, momentum=self.config.momentum)
        train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
        self._train_op = train_op
      self._new_lr = tf.placeholder(
          self.dtype, shape=[], name="new_learning_rate")
      self._lr_update = tf.assign(self._lr, self._new_lr)
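
A minimal sketch (TF 1.x, not from the original repository) of the placeholder-plus-assign pattern that _new_lr and _lr_update implement above: the training loop feeds a new value into the assign op whenever the schedule calls for it (the schedule values here are hypothetical):

import tensorflow as tf

lr = tf.get_variable(
    "learn_rate", [],
    initializer=tf.constant_initializer(0.1),
    trainable=False)
new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
lr_update = tf.assign(lr, new_lr)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step, value in [(0, 0.1), (40000, 0.01), (60000, 0.001)]:
        sess.run(lr_update, feed_dict={new_lr: value})
        print("step {}: lr = {}".format(step, sess.run(lr)))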