Example #1
    def _run():
      """Forward pass through the network."""
      with slim.arg_scope([slim.dropout], is_training=is_training):
        with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
            weights_regularizer=slim.l2_regularizer(self._l2_regularization),
            activation_fn=tf.nn.relu,
            trainable=is_training):
          with slim.arg_scope(
              [slim.conv2d, slim.max_pool2d], stride=1, padding='SAME'):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                normalizer_fn=slim.batch_norm,
                normalizer_params=batch_norm):
              _, grasp_image = images
              net = slim.conv2d(
                  grasp_image,
                  64, [6, 6],
                  stride=2,
                  scope='conv1_1',
                  activation_fn=None,
                  normalizer_fn=None,
                  normalizer_params=None)
              # Old checkpoints (such as those used for tests) did not have
              # scaling on the separate batch norm operations (those not
              # associated with a conv operation), so only setting the scale
              # parameter in arg_scope would break the tests. We set
              # scale=False for these separate batch norm operations
              # temporarily. However, future users are encouraged not to set
              # scale=False, so that batch_norm parameters are consistent
              # through the whole network.
              net = tf.nn.relu(slim.batch_norm(net, scale=False))
              net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool1')
              self.activation_layers.append(net)
              for l in range(2, 2 + self.num_convs[0]):
                net = slim.conv2d(net, 64, [5, 5], scope='conv%d' % l)
                self.activation_layers.append(net)
              net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool2')
              end_points['pool2'] = net
              self.activation_layers.append(net)
              logging.debug('pool2')
              logging.debug(net.get_shape())

              if grasp_param_names is None:
                grasp_param_blocks = [grasp_params]
                grasp_param_block_names = ['fcgrasp']
              else:
                grasp_param_blocks = []
                grasp_param_block_names = []
                # Note: Creating variables must happen in a deterministic
                # order, otherwise some workers will look for variables on the
                # wrong parameter servers, so we sort the grasp_param_names
                # here.
                for block_name in sorted(grasp_param_names):
                  offset, size = grasp_param_names[block_name]
                  grasp_param_blocks += [
                      tf.slice(grasp_params, [0, offset], [-1, size])
                  ]
                  grasp_param_block_names += [block_name]

              grasp_param_tensors = []
              for block, name in zip(grasp_param_blocks,
                                     grasp_param_block_names):
                grasp_param_tensors += [
                    slim.fully_connected(
                        block,
                        256,
                        scope=name,
                        activation_fn=None,
                        normalizer_fn=None,
                        normalizer_params=None)
                ]

              fcgrasp = tf.add_n(grasp_param_tensors)

              # Old checkpoints (such as those used for tests) did not have
              # scaling on the separate batch norm operations (those not
              # associated with a conv operation), so only setting the scale
              # parameter in arg_scope would break the tests. We set
              # scale=False for these separate batch norm operations
              # temporarily. However, future users are encouraged not to set
              # scale=False, so that batch_norm parameters are consistent
              # through the whole network.
              fcgrasp = tf.nn.relu(slim.batch_norm(fcgrasp, scale=False))
              fcgrasp = slim.fully_connected(fcgrasp, 64, scope='fcgrasp2')
              context = tf.reshape(fcgrasp, [-1, 1, 1, 64])
              end_points['fcgrasp'] = fcgrasp
              # Tile the image embedding action_batch_size times to align
              # with the expanded action dimension of action_batch_size.
              # Same image is used with all the actions in an action_batch.
              # net pre expansion should be [batch, *, *, *]
              # net post expansion should be [batch x action_batch, *, *, *]
              if tile_batch:
                net = contrib_seq2seq.tile_batch(net, self._action_batch_size)
              net = tf.add(net, context)
              logging.debug('net post add %s', net)
              end_points['vsum'] = net
              self.activation_layers.append(net)
              logging.debug('vsum')
              logging.debug(net.get_shape())
              for l in range(2 + sum(self.num_convs[:1]),
                             2 + sum(self.num_convs[:2])):
                net = slim.conv2d(net, 64, [3, 3], scope='conv%d' % l)
                logging.debug('conv%d', l)
                self.activation_layers.append(net)
              logging.debug(net.get_shape())
              net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool3')
              logging.debug('pool3')
              logging.debug(net.get_shape())
              self.activation_layers.append(net)
              for l in range(2 + sum(self.num_convs[:2]),
                             2 + sum(self.num_convs[:3])):
                net = slim.conv2d(
                    net, 64, [3, 3], scope='conv%d' % l, padding='VALID')
                self.activation_layers.append(net)
              logging.debug('final conv')
              logging.debug(net.get_shape())
              end_points['final_conv'] = net

              batch_size = tf.shape(net)[0]
              if goal_spatial_fn is not None:
                goal_spatial = goal_spatial_fn()
                # Tile goal to match net batch size (e.g. CEM).
                goal_batch_size = tf.shape(goal_spatial)[0]
                goal_spatial = tf.tile(
                    goal_spatial, [batch_size//goal_batch_size, 1, 1, 1])
                # Merging features in style of Fang 2017.
                net = tf.concat([net, goal_spatial], axis=3)
              net = slim.flatten(net, scope='flatten')

              if goal_vector_fn is not None:
                goal_vector = goal_vector_fn()
                goal_batch_size = tf.shape(goal_vector)[0]
                goal_vector = tf.tile(
                    goal_vector, [batch_size//goal_batch_size, 1])
                net = tf.concat([net, goal_vector], axis=1)

              for l in range(self.hid_layers):
                net = slim.fully_connected(net, 64, scope='fc%d' % l)

              name = 'logit'
              if num_classes > 1:
                name = 'logit_%d' % num_classes
              logits = slim.fully_connected(
                  net,
                  num_classes,
                  activation_fn=None,
                  scope=name,
                  normalizer_fn=None,
                  normalizer_params=None)
              end_points['logits'] = logits
              if softmax:
                predictions = tf.nn.softmax(logits)
              else:
                predictions = tf.nn.sigmoid(logits)
              if tile_batch:
                if num_classes > 1:
                  predictions = tf.reshape(
                      predictions, [-1, self._action_batch_size, num_classes])
                else:
                  predictions = tf.reshape(predictions,
                                           [-1, self._action_batch_size])
              end_points['predictions'] = predictions
              return logits, end_points
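The nested slim.arg_scope calls above stack layer defaults: each inner scope adds or overrides per-argument defaults for the listed ops, and explicit keyword arguments (as in the conv1_1 call) always win. A minimal self-contained sketch of that mechanism, assuming the tf_slim package as the slim import (toy layer sizes, not the network above):

import tensorflow.compat.v1 as tf
import tf_slim as slim

tf.disable_v2_behavior()

images = tf.placeholder(tf.float32, [None, 64, 64, 3])
with slim.arg_scope([slim.conv2d], padding='SAME', activation_fn=tf.nn.relu):
  with slim.arg_scope([slim.conv2d], stride=2):
    net = slim.conv2d(images, 32, [3, 3], scope='a')  # stride=2 from the inner scope
  net = slim.conv2d(net, 64, [3, 3], scope='b')  # stride reverts to the default 1
  net = slim.conv2d(net, 64, [3, 3], activation_fn=None, scope='c')  # explicit kwarg wins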
def yolo_v3(inputs,
            num_classes,
            is_training=False,
            data_format='NCHW',
            reuse=False,
            with_spp=False):
    """
    Creates YOLO v3 model.

    :param inputs: a 4-D tensor of size [batch_size, height, width, channels].
        Dimension batch_size may be undefined. The channel order is RGB.
    :param num_classes: number of predicted classes.
    :param is_training: whether the network is in training mode.
    :param data_format: data format NCHW or NHWC.
    :param reuse: whether or not the network and its variables should be reused.
    :param with_spp: whether or not to use the SPP layer.
    :return: the concatenated detections tensor from the three detection layers.
    """
    # it will be needed later on
    img_size = inputs.get_shape().as_list()[1:3]

    # transpose the inputs to NCHW
    if data_format == 'NCHW':
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

    # normalize values to range [0..1]
    inputs = inputs / 255

    # set batch norm params
    batch_norm_params = {
        'decay': _BATCH_NORM_DECAY,
        'epsilon': _BATCH_NORM_EPSILON,
        'scale': True,
        'is_training': is_training,
        'fused': None,  # Use fused batch norm if possible.
    }

    # Set activation_fn and parameters for conv2d, batch_norm.
    with slim.arg_scope([slim.conv2d, slim.batch_norm, _fixed_padding],
                        data_format=data_format,
                        reuse=reuse):
        with slim.arg_scope([slim.conv2d],
                            normalizer_fn=slim.batch_norm,
                            normalizer_params=batch_norm_params,
                            biases_initializer=None,
                            activation_fn=lambda x: tf.nn.leaky_relu(
                                x, alpha=_LEAKY_RELU)):
            with tf.variable_scope('darknet-53'):
                route_1, route_2, inputs = darknet53(inputs)

            with tf.variable_scope('yolo-v3'):
                route, inputs = _yolo_block(inputs, 512, data_format, with_spp)

                detect_1 = _detection_layer(inputs, num_classes, _ANCHORS[6:9],
                                            img_size, data_format)
                detect_1 = tf.identity(detect_1, name='detect_1')

                inputs = _conv2d_fixed_padding(route, 256, 1)
                upsample_size = route_2.get_shape().as_list()
                inputs = _upsample(inputs, upsample_size, data_format)
                inputs = tf.concat([inputs, route_2],
                                   axis=1 if data_format == 'NCHW' else 3)

                route, inputs = _yolo_block(inputs, 256)

                detect_2 = _detection_layer(inputs, num_classes, _ANCHORS[3:6],
                                            img_size, data_format)
                detect_2 = tf.identity(detect_2, name='detect_2')

                inputs = _conv2d_fixed_padding(route, 128, 1)
                upsample_size = route_1.get_shape().as_list()
                inputs = _upsample(inputs, upsample_size, data_format)
                inputs = tf.concat([inputs, route_1],
                                   axis=1 if data_format == 'NCHW' else 3)

                _, inputs = _yolo_block(inputs, 128)

                detect_3 = _detection_layer(inputs, num_classes, _ANCHORS[0:3],
                                            img_size, data_format)
                detect_3 = tf.identity(detect_3, name='detect_3')

                detections = tf.concat([detect_1, detect_2, detect_3], axis=1)
                detections = tf.identity(detections, name='detections')
                return detections
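A minimal usage sketch for yolo_v3, assuming the helpers it calls (darknet53, _yolo_block, _detection_layer, _upsample, _conv2d_fixed_padding) are defined in the same module; the 416x416 input size and 80 classes are illustrative COCO defaults, not requirements:

inputs = tf.placeholder(tf.float32, [1, 416, 416, 3], name='inputs')
with tf.variable_scope('detector'):
    detections = yolo_v3(inputs, num_classes=80,
                         is_training=False, data_format='NHWC')
# `detections` concatenates the three detection layers along the box axis.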
Example #3
def overfeat(inputs,
             num_classes=1000,
             is_training=True,
             dropout_keep_prob=0.5,
             spatial_squeeze=True,
             scope='overfeat',
             global_pool=False):
    """Contains the model definition for the OverFeat network.

  The definition for the network was obtained from:
    OverFeat: Integrated Recognition, Localization and Detection using
    Convolutional Networks
    Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus and
    Yann LeCun, 2014
    http://arxiv.org/abs/1312.6229

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 231x231. To use in fully
        convolutional mode, set spatial_squeeze to false.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer is
      omitted and the input features to the logits layer are returned instead.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not to squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    global_pool: Optional boolean flag. If True, the input to the classification
      layer is avgpooled to size 1x1, for any input size. (This is not part
      of the original OverFeat.)

  Returns:
    net: the output of the logits layer (if num_classes is a non-zero integer),
      or the non-dropped-out input to the logits layer (if num_classes is 0 or
      None).
    end_points: a dict of tensors with intermediate activations.
  """
    with tf.variable_scope(scope, 'overfeat', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d
        with slim.arg_scope(
            [slim.conv2d, slim.fully_connected, slim.max_pool2d],
                outputs_collections=end_points_collection):
            net = slim.conv2d(inputs,
                              64, [11, 11],
                              4,
                              padding='VALID',
                              scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.conv2d(net, 256, [5, 5], padding='VALID', scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.conv2d(net, 512, [3, 3], scope='conv3')
            net = slim.conv2d(net, 1024, [3, 3], scope='conv4')
            net = slim.conv2d(net, 1024, [3, 3], scope='conv5')
            net = slim.max_pool2d(net, [2, 2], scope='pool5')

            # Use conv2d instead of fully_connected layers.
            with slim.arg_scope(
                [slim.conv2d],
                    weights_initializer=trunc_normal(0.005),
                    biases_initializer=tf.constant_initializer(0.1)):
                net = slim.conv2d(net,
                                  3072, [6, 6],
                                  padding='VALID',
                                  scope='fc6')
                net = slim.dropout(net,
                                   dropout_keep_prob,
                                   is_training=is_training,
                                   scope='dropout6')
                net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
                # Convert end_points_collection into an end_points dict.
                end_points = slim.utils.convert_collection_to_dict(
                    end_points_collection)
                if global_pool:
                    net = tf.reduce_mean(net, [1, 2],
                                         keep_dims=True,
                                         name='global_pool')
                    end_points['global_pool'] = net
                if num_classes:
                    net = slim.dropout(net,
                                       dropout_keep_prob,
                                       is_training=is_training,
                                       scope='dropout7')
                    net = slim.conv2d(
                        net,
                        num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        biases_initializer=tf.zeros_initializer(),
                        scope='fc8')
                    if spatial_squeeze:
                        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
                    end_points[sc.name + '/fc8'] = net
            return net, end_points
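A usage sketch for classification mode (the docstring above asks for 231x231 inputs; trunc_normal is assumed to be the usual slim helper defined elsewhere in the module):

images = tf.placeholder(tf.float32, [None, 231, 231, 3])
logits, end_points = overfeat(images, num_classes=1000, is_training=False)
# logits: [batch_size, 1000] after the spatial squeeze; end_points maps layer
# scopes (e.g. 'overfeat/conv1') to their activations.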
def nopad_inception_v3_base_129(inputs,
                                min_depth=16,
                                depth_multiplier=1.0,
                                num_final_1x1_conv=0,
                                scope=None):
    """Constructs a no padding Inception v3 network from inputs.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels]. Must be
      floating point. If a pretrained checkpoint is used, pixel values should be
      the same as during training.
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels) for
      all convolution ops. The value must be greater than zero. Typical usage
      will be to set this value in (0, 1) to reduce the number of parameters or
      computation cost of the model.
    num_final_1x1_conv: Int, number of final 1x1 conv layers.
    scope: Optional variable_scope.

  Returns:
    tensor_out: output tensor.
    end_points: a set of activations for external use, for example summaries or
                losses.

  Raises:
    ValueError: if depth_multiplier <= 0
  """
    # end_points will collect relevant activations for external use, for example
    # summaries or losses.
    end_points = {}

    if depth_multiplier <= 0:
        raise ValueError('depth_multiplier is not greater than zero.')
    depth = lambda d: max(int(d * depth_multiplier), min_depth)

    with tf.variable_scope(scope, 'NopadInceptionV3', [inputs]):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                            stride=1,
                            padding='VALID'):
            # 129 x 129 x 3
            end_point = 'Conv2d_1a_3x3'
            net = slim.conv2d(inputs, depth(32), [3, 3], scope=end_point)
            end_points[end_point] = net
            # 127 x 127 x 32
            end_point = 'Conv2d_2a_3x3'
            net = slim.conv2d(net, depth(32), [3, 3], scope=end_point)
            end_points[end_point] = net
            # 125 x 125 x 32
            end_point = 'Conv2d_2b_3x3'
            net = slim.conv2d(net, depth(64), [3, 3], scope=end_point)
            end_points[end_point] = net
            # 123 x 123 x 64
            end_point = 'MaxPool_3a_3x3'
            net = slim.max_pool2d(net, [3, 3], scope=end_point)
            end_points[end_point] = net
            # 121 x 121 x 64
            end_point = 'Conv2d_3b_1x1'
            net = slim.conv2d(net, depth(80), [1, 1], scope=end_point)
            end_points[end_point] = net
            # 121 x 121 x 80
            end_point = 'Conv2d_4a_3x3'
            net = slim.conv2d(net, depth(192), [3, 3], scope=end_point)
            end_points[end_point] = net
            # 119 x 119 x 192
            end_point = 'MaxPool_5a_3x3'
            net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
            end_points[end_point] = net
            # 59 x 59 x 192

        # Inception blocks
        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                            stride=1,
                            padding='VALID'):
            # Mixed_5b: 55 x 55 x 256
            end_point = 'Mixed_5b'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(48), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(64), [5, 5],
                                           scope='Conv2d_0b_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(32), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    [
                        _trim_border_px(branch_0, 2),  # branch_0: 59 x 59 x 64
                        branch_1,  # branch_1: 55 x 55 x 64
                        branch_2,  # branch_2: 55 x 55 x 96
                        _trim_border_px(branch_3, 1)  # branch_3: 57 x 57 x 32
                    ],
                    3)
            end_points[end_point] = net

            # Mixed_5c: 51 x 51 x 288
            end_point = 'Mixed_5c'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(48), [1, 1],
                                           scope='Conv2d_0b_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(64), [5, 5],
                                           scope='Conv_1_0c_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    [
                        _trim_border_px(branch_0, 2),  # branch_0: 55 x 55 x 64
                        branch_1,  # branch_1: 51 x 51 x 64
                        branch_2,  # branch_2: 51 x 51 x 96
                        _trim_border_px(branch_3, 1)  # branch_3: 53 x 53 x 64
                    ],
                    3)
            end_points[end_point] = net

            # Mixed_6a: 25 x 25 x 768
            end_point = 'Mixed_6a'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(384), [3, 3],
                                           stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(96), [3, 3],
                                           stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_1x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.max_pool2d(net, [3, 3],
                                               stride=2,
                                               padding='VALID',
                                               scope='MaxPool_1a_3x3')
                net = tf.concat(
                    [
                        branch_0,  # branch_0: 25 x 25 x 384
                        branch_1,  # branch_1: 25 x 25 x 96
                        branch_2,  # branch_2: 25 x 25 x 288
                    ],
                    3)
            end_points[end_point] = net

            # Mixed_6b: 17 x 17 x 768
            end_point = 'Mixed_6b'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(128), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(128), [1, 5],
                                           scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [5, 1],
                                           scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(128), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(128), [5, 1],
                                           scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(128), [1, 5],
                                           scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(128), [5, 1],
                                           scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 5],
                                           scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    [
                        _trim_border_px(branch_0,
                                        4),  # branch_0: 25 x 25 x 192
                        _trim_border_px(branch_1,
                                        2),  # branch_1: 21 x 21 x 192
                        branch_2,  # branch_2: 17 x 17 x 192
                        _trim_border_px(branch_3, 3)  # branch_3: 23 x 23 x 192
                    ],
                    3)
            end_points[end_point] = net

            # Mixed_6c: 9 x 9 x 768
            end_point = 'Mixed_6c'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(160), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(160), [1, 5],
                                           scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [5, 1],
                                           scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(160), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [5, 1],
                                           scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [1, 5],
                                           scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [5, 1],
                                           scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 5],
                                           scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    [
                        _trim_border_px(branch_0,
                                        4),  # branch_0: 17 x 17 x 192
                        _trim_border_px(branch_1,
                                        2),  # branch_1: 13 x 13 x 192
                        branch_2,  # branch_2: 9 x 9 x 192
                        _trim_border_px(branch_3, 3)  # branch_3: 15 x 15 x 192
                    ],
                    3)
            end_points[end_point] = net

            # Mixed_6d: 1 x 1 x 768
            end_point = 'Mixed_6d'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [1, 5],
                                           scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [5, 1],
                                           scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [5, 1],
                                           scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 5],
                                           scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [5, 1],
                                           scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 5],
                                           scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    [
                        _trim_border_px(branch_0, 4),  # branch_0: 9 x 9 x 192
                        _trim_border_px(branch_1, 2),  # branch_1: 5 x 5 x 192
                        branch_2,  # branch_2: 1 x 1 x 192
                        _trim_border_px(branch_3, 3)  # branch_3: 7 x 7 x 192
                    ],
                    3)
            end_points[end_point] = net

            for i in range(num_final_1x1_conv):
                # Assign the conv output back to `net`; without this the extra
                # 1x1 layers would be created but their outputs discarded.
                net = slim.conv2d(net,
                                  depth(256), [1, 1],
                                  scope='Final_Conv2d_{}_1x1'.format(i))
                end_points['Final_Conv2d_{}_1x1'.format(i)] = net
            return net, end_points
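_trim_border_px is not shown in this snippet. From the shape comments (a 59x59 branch trimmed by 2 becomes 55x55) it must crop a fixed pixel border so that VALID-padded branches with different receptive fields line up before concatenation; a plausible sketch, assuming n >= 1:

def _trim_border_px(inputs, n):
    """Crops n pixels from each side of the spatial dims of an NHWC tensor."""
    return inputs[:, n:-n, n:-n, :]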
def decoder(encoded, scales, styles, texture_only=False, style_size=8, image_size=(112,112),
        keep_prob=1.0, phase_train=True, weight_decay=0.0, reuse=None, scope='Decoder'):
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        # weights_initializer=tf.contrib.layers.xavier_initializer(),
                        weights_initializer=tf.compat.v1.keras.initializers.VarianceScaling(scale=2.0),
                        weights_regularizer=tf.keras.regularizers.l2(0.5 * (weight_decay))):
            with slim.arg_scope([slim.dropout, slim.batch_norm], is_training=phase_train):
                with slim.arg_scope([slim.fully_connected],
                    normalizer_fn=layer_norm, normalizer_params=None):
                    print('{} input shape:'.format(scope), [dim.value for dim in encoded.shape])

                    batch_size = tf.shape(input=encoded)[0]
                    h, w = tuple(image_size)
                    k = 64

                    with tf.compat.v1.variable_scope('StyleController'):

                        if styles is None:
                            styles = tf.random.normal((batch_size, style_size))

                        net = tf.identity(styles, name='input_style')

                        net = slim.fully_connected(net, 128, scope='fc2')
                        print('module fc2 shape:', [dim.value for dim in net.shape])

                        net = slim.fully_connected(net, 128, scope='fc3')
                        print('module fc3 shape:', [dim.value for dim in net.shape])

                        gamma = slim.fully_connected(net, 4*k, activation_fn=None, normalizer_fn=None, scope='fc4')
                        gamma = tf.reshape(gamma, [-1, 1, 1, 4*k], name='gamma')
                        print('gamma shape:', [dim.value for dim in gamma.shape])

                        beta = slim.fully_connected(net, 4*k, activation_fn=None, normalizer_fn=None, scope='fc5')
                        beta = tf.reshape(beta, [-1, 1, 1, 4*k], name='beta')
                        print('beta shape:', [dim.value for dim in beta.shape])

                    with tf.compat.v1.variable_scope('Decoder'):
                        print('-- Decoder')
                        net = encoded

                        # Adaptive instance normalization (AdaIN): the style-
                        # predicted gamma/beta act as the affine parameters for
                        # the instance-normalized features.
                        adain = lambda x: gamma * instance_norm(x, center=False, scale=False) + beta

                        with slim.arg_scope([slim.conv2d_transpose, slim.conv2d],
                                    normalizer_fn=adain, normalizer_params=None):
                            for i in range(3):
                                net_ = conv(net, 4*k, 3, scope='res{}_0'.format(i))
                                net += conv(net_, 4*k, 3, activation_fn=None, biases_initializer=None, scope='res{}_1'.format(i))
                                print('module res{} shape:'.format(i), [dim.value for dim in net.shape])

                        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose, slim.fully_connected],
                                normalizer_fn=layer_norm, normalizer_params=None):
                            net = upscale2d(net, 2)
                            net = conv(net, 2*k, 5, pad=2, scope='deconv1_1')
                            print('module deconv1 shape:', [dim.value for dim in net.shape])

                            net = upscale2d(net, 2)
                            net = conv(net, k, 5, pad=2, scope='deconv2_1')

                        net = conv(net, 3, 7, pad=3, activation_fn=None, normalizer_fn=None, 
                                    weights_initializer=tf.compat.v1.constant_initializer(0.0), scope='conv_image')
                        images_rendered = tf.nn.tanh(net, name='images_rendered')
                        print('images_rendered shape:', [dim.value for dim in images_rendered.shape])

                    if texture_only:
                        return images_rendered

                    with tf.compat.v1.variable_scope('WarpController'):

                        print('-- WarpController')

                        net = encoded
                        warp_input = tf.identity(images_rendered, name='warp_input')

                        net = slim.flatten(net)

                        net = slim.fully_connected(net, 128, scope='fc1')
                        print('module fc1 shape:', [dim.value for dim in net.shape])

                        num_ldmark = 16

                        # Predict the control points
                        ldmark_mean = (np.random.normal(0,50, (num_ldmark,2)) + np.array([[0.5*h,0.5*w]])).flatten()
                        ldmark_mean = tf.Variable(ldmark_mean.astype(np.float32), name='ldmark_mean')
                        print('ldmark_mean shape:', [dim.value for dim in ldmark_mean.shape])

                        ldmark_pred = slim.fully_connected(net, num_ldmark*2, 
                            weights_initializer=tf.compat.v1.truncated_normal_initializer(stddev=1.0),
                            normalizer_fn=None, activation_fn=None, biases_initializer=None, scope='fc_ldmark')
                        ldmark_pred = ldmark_pred + ldmark_mean
                        print('ldmark_pred shape:', [dim.value for dim in ldmark_pred.shape])
                        ldmark_pred = tf.identity(ldmark_pred, name='ldmark_pred')

                        # Predict the displacements
                        ldmark_diff = slim.fully_connected(net, num_ldmark*2, 
                            normalizer_fn=None,  activation_fn=None, scope='fc_diff')
                        print('ldmark_diff shape:', [dim.value for dim in ldmark_diff.shape])
                        ldmark_diff = tf.identity(ldmark_diff, name='ldmark_diff')
                        ldmark_diff = tf.identity(tf.reshape(scales,[-1,1]) * ldmark_diff, name='ldmark_diff_scaled')

                        src_pts = tf.reshape(ldmark_pred, [-1, num_ldmark ,2])
                        dst_pts = tf.reshape(ldmark_pred + ldmark_diff, [-1, num_ldmark, 2])

                        diff_norm = tf.reduce_mean(input_tensor=tf.norm(tensor=src_pts-dst_pts, axis=[1,2]))
                        # tf.summary.scalar('diff_norm', diff_norm)
                        # tf.summary.scalar('mark', ldmark_pred[0,0])

                        images_transformed, dense_flow = sparse_image_warp(warp_input, src_pts, dst_pts,
                                regularization_weight = 1e-6, num_boundary_points=0)
                        dense_flow = tf.identity(dense_flow, name='dense_flow')

                return images_transformed, images_rendered, ldmark_pred, ldmark_diff
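conv and upscale2d are module helpers not shown here; conv appears to wrap slim.conv2d with an explicit pad argument, and upscale2d(net, 2) evidently doubles the spatial resolution. A nearest-neighbor sketch consistent with that usage (an assumption, not the module's actual implementation):

def upscale2d(x, factor=2):
    # Nearest-neighbor upsampling of an NHWC tensor by an integer factor.
    shape = tf.shape(input=x)
    new_size = [shape[1] * factor, shape[2] * factor]
    return tf.compat.v1.image.resize_nearest_neighbor(x, new_size)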
def inception_resnet_v1(inputs,
                        is_training=True,
                        dropout_keep_prob=0.8,
                        bottleneck_layer_size=128,
                        reuse=None,
                        scope='InceptionResnetV1'):
    """Creates the Inception Resnet V1 model.
    Args:
      inputs: a 4-D tensor of size [batch_size, height, width, 3].
      is_training: whether the network is in training mode.
      dropout_keep_prob: float, the fraction to keep before final layer.
      bottleneck_layer_size: number of units in the final bottleneck layer.
      reuse: whether or not the network and its variables should be reused. To be
        able to reuse 'scope' must be given.
      scope: Optional variable_scope.
    Returns:
      net: the output of the bottleneck layer.
      end_points: the set of end_points from the inception model.
    """
    end_points = {}

    with tf.variable_scope(scope, 'InceptionResnetV1', [inputs], reuse=reuse):
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            with slim.arg_scope(
                [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                    stride=1,
                    padding='SAME'):

                # 149 x 149 x 32
                net = slim.conv2d(inputs,
                                  32,
                                  3,
                                  stride=2,
                                  padding='VALID',
                                  scope='Conv2d_1a_3x3')
                end_points['Conv2d_1a_3x3'] = net
                # 147 x 147 x 32
                net = slim.conv2d(net,
                                  32,
                                  3,
                                  padding='VALID',
                                  scope='Conv2d_2a_3x3')
                end_points['Conv2d_2a_3x3'] = net
                # 147 x 147 x 64
                net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
                end_points['Conv2d_2b_3x3'] = net
                # 73 x 73 x 64
                net = slim.max_pool2d(net,
                                      3,
                                      stride=2,
                                      padding='VALID',
                                      scope='MaxPool_3a_3x3')
                end_points['MaxPool_3a_3x3'] = net
                # 73 x 73 x 80
                net = slim.conv2d(net,
                                  80,
                                  1,
                                  padding='VALID',
                                  scope='Conv2d_3b_1x1')
                end_points['Conv2d_3b_1x1'] = net
                # 71 x 71 x 192
                net = slim.conv2d(net,
                                  192,
                                  3,
                                  padding='VALID',
                                  scope='Conv2d_4a_3x3')
                end_points['Conv2d_4a_3x3'] = net
                # 35 x 35 x 256
                net = slim.conv2d(net,
                                  256,
                                  3,
                                  stride=2,
                                  padding='VALID',
                                  scope='Conv2d_4b_3x3')
                end_points['Conv2d_4b_3x3'] = net

                # 5 x Inception-resnet-A
                net = slim.repeat(net, 5, block35, scale=0.17)
                end_points['Mixed_5a'] = net

                # Reduction-A
                with tf.variable_scope('Mixed_6a'):
                    net = reduction_a(net, 192, 192, 256, 384)
                end_points['Mixed_6a'] = net

                # 10 x Inception-Resnet-B
                net = slim.repeat(net, 10, block17, scale=0.10)
                end_points['Mixed_6b'] = net

                # Reduction-B
                with tf.variable_scope('Mixed_7a'):
                    net = reduction_b(net)
                end_points['Mixed_7a'] = net

                # 5 x Inception-Resnet-C
                net = slim.repeat(net, 5, block8, scale=0.20)
                end_points['Mixed_8a'] = net

                net = block8(net, activation_fn=None)
                end_points['Mixed_8b'] = net

                with tf.variable_scope('Logits'):
                    end_points['PrePool'] = net
                    #pylint: disable=no-member
                    net = slim.avg_pool2d(net,
                                          net.get_shape()[1:3],
                                          padding='VALID',
                                          scope='AvgPool_1a_8x8')
                    net = slim.flatten(net)

                    net = slim.dropout(net,
                                       dropout_keep_prob,
                                       is_training=is_training,
                                       scope='Dropout')

                    end_points['PreLogitsFlatten'] = net

                net = slim.fully_connected(net,
                                           bottleneck_layer_size,
                                           activation_fn=None,
                                           scope='Bottleneck',
                                           reuse=False)

    return net, end_points
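A usage sketch; the 299x299 input size matches the layer-shape comments above (299 becomes 149 after the stride-2 VALID conv), and block35/block17/block8 and the reduction blocks are assumed to be defined alongside this function:

images = tf.placeholder(tf.float32, [None, 299, 299, 3])
embeddings, end_points = inception_resnet_v1(images,
                                             is_training=False,
                                             bottleneck_layer_size=128)
# embeddings: [batch_size, 128] bottleneck features (no final activation).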
Example #7
def resnet_v1_200(inputs,
                  num_classes=None,
                  is_training=True,
                  global_pool=True,
                  output_stride=None,
                  spatial_squeeze=True,
                  reuse=None,
                  scope='resnet_v1_200'):
    """ResNet-200 model of [2]. See resnet_v1() for arg and return description."""
    blocks = [
        resnet_utils.Block(
            'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]),
        resnet_utils.Block(
            'block2', bottleneck, [(512, 128, 1)] * 23 + [(512, 128, 2)]),
        resnet_utils.Block(
            'block3', bottleneck, [(1024, 256, 1)] * 35 + [(1024, 256, 2)]),
        resnet_utils.Block(
            'block4', bottleneck, [(2048, 512, 1)] * 3)]
    return resnet_v1(inputs, blocks, num_classes, is_training,
                     global_pool=global_pool, output_stride=output_stride,
                     include_root_block=True, spatial_squeeze=spatial_squeeze,
                     reuse=reuse, scope=scope)


resnet_v1_200.default_image_size = resnet_v1.default_image_size


if __name__ == '__main__':
    # Build the 200-layer graph defined above. `inputs` avoids shadowing the
    # `input` builtin, and the (net, end_points) return tuple is unpacked.
    inputs = tf.compat.v1.placeholder(tf.float32, shape=(None, 224, 224, 3), name='input')
    with slim.arg_scope(resnet_arg_scope()):
        net, end_points = resnet_v1_200(inputs)
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              spatial_squeeze=True,
              reuse=None,
              scope=None):
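    """ResNet v2 model.

    Mirrors slim's resnet_v2 but additionally returns the pre-pool feature map
    (output0) and the globally pooled features (output1).
    """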
    with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope(
            [slim.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with slim.arg_scope([slim.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError(
                                'The output_stride needs to be a multiple of 4.'
                            )

                        output_stride /= 4

                    with slim.arg_scope([slim.conv2d],
                                        activation_fn=None,
                                        normalizer_fn=None):
                        net = resnet_utils.conv2d_same(net,
                                                       64,
                                                       6,
                                                       stride=1,
                                                       scope='conv1')
                    net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool1')
                net = resnet_utils.stack_blocks_dense(net, blocks,
                                                      output_stride)
                net = slim.batch_norm(net,
                                      activation_fn=tf.nn.relu,
                                      scope='postnorm')
                output0 = net

                output1 = net
                if global_pool:
                    net = tf.reduce_mean(net, [1, 2],
                                         name='pool5',
                                         keep_dims=True)
                    output1 = net
                if num_classes is not None:
                    net = slim.conv2d(net,
                                      num_classes, [1, 1],
                                      activation_fn=None,
                                      normalizer_fn=None,
                                      scope='logits')

                # Default `logits` to `net` so it is defined even when
                # spatial_squeeze is False or num_classes is None.
                logits = net
                if spatial_squeeze:
                    logits = tf.squeeze(net, [1, 2], name='SpatialSqueeze')

                end_points = slim.utils.convert_collection_to_dict(
                    end_points_collection)

                if num_classes is not None:
                    end_points['predictions'] = slim.softmax(
                        logits, scope='predictions')

                return logits, end_points, output0, output1
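A usage sketch showing how `blocks` is supplied. Each (depth, depth_bottleneck, stride) tuple describes one bottleneck unit, as in the resnet_v1_200 definition above; this two-block stack is a toy example, not a standard depth:

blocks = [
    resnet_utils.Block('block1', bottleneck,
                       [(256, 64, 1)] * 2 + [(256, 64, 2)]),
    resnet_utils.Block('block2', bottleneck, [(512, 128, 1)] * 2),
]
images = tf.placeholder(tf.float32, [None, 224, 224, 3])
logits, end_points, pre_pool, pooled = resnet_v2(images, blocks,
                                                 num_classes=10,
                                                 scope='resnet_v2_toy')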
Example #9
File: model.py  Project: atkochi/magenta
def model_fn(features, labels, mode, params, config):
    """Builds the acoustic model."""
    del config
    hparams = params

    length = features.length
    spec = features.spec

    is_training = mode == tf.estimator.ModeKeys.TRAIN

    if is_training:
        onset_labels = labels.onsets
        offset_labels = labels.offsets
        velocity_labels = labels.velocities
        frame_labels = labels.labels
        frame_label_weights = labels.label_weights

    if hparams.stop_activation_gradient and not hparams.activation_loss:
        raise ValueError(
            'If stop_activation_gradient is true, activation_loss must be true.'
        )

    losses = {}
    with slim.arg_scope([slim.batch_norm, slim.dropout],
                        is_training=is_training):
        with tf.variable_scope('onsets'):
            onset_outputs = acoustic_model(spec,
                                           hparams,
                                           lstm_units=hparams.onset_lstm_units,
                                           lengths=length)
            onset_probs = slim.fully_connected(onset_outputs,
                                               constants.MIDI_PITCHES,
                                               activation_fn=tf.sigmoid,
                                               scope='onset_probs')

            # onset_probs_flat is used during inference.
            onset_probs_flat = flatten_maybe_padded_sequences(
                onset_probs, length)
            if is_training:
                onset_labels_flat = flatten_maybe_padded_sequences(
                    onset_labels, length)
                onset_losses = tf_utils.log_loss(onset_labels_flat,
                                                 onset_probs_flat)
                tf.losses.add_loss(tf.reduce_mean(onset_losses))
                losses['onset'] = onset_losses
        with tf.variable_scope('offsets'):
            offset_outputs = acoustic_model(
                spec,
                hparams,
                lstm_units=hparams.offset_lstm_units,
                lengths=length)
            offset_probs = slim.fully_connected(offset_outputs,
                                                constants.MIDI_PITCHES,
                                                activation_fn=tf.sigmoid,
                                                scope='offset_probs')

            # offset_probs_flat is used during inference.
            offset_probs_flat = flatten_maybe_padded_sequences(
                offset_probs, length)
            if is_training:
                offset_labels_flat = flatten_maybe_padded_sequences(
                    offset_labels, length)
                offset_losses = tf_utils.log_loss(offset_labels_flat,
                                                  offset_probs_flat)
                tf.losses.add_loss(tf.reduce_mean(offset_losses))
                losses['offset'] = offset_losses
        with tf.variable_scope('velocity'):
            velocity_outputs = acoustic_model(
                spec,
                hparams,
                lstm_units=hparams.velocity_lstm_units,
                lengths=length)
            velocity_values = slim.fully_connected(velocity_outputs,
                                                   constants.MIDI_PITCHES,
                                                   activation_fn=None,
                                                   scope='onset_velocities')

            velocity_values_flat = flatten_maybe_padded_sequences(
                velocity_values, length)
            if is_training:
                velocity_labels_flat = flatten_maybe_padded_sequences(
                    velocity_labels, length)
                velocity_loss = tf.reduce_sum(
                    onset_labels_flat *
                    tf.square(velocity_labels_flat - velocity_values_flat),
                    axis=1)
                tf.losses.add_loss(tf.reduce_mean(velocity_loss))
                losses['velocity'] = velocity_loss

        with tf.variable_scope('frame'):
            if not hparams.share_conv_features:
                # TODO(eriche): this is broken when hparams.frame_lstm_units > 0
                activation_outputs = acoustic_model(
                    spec,
                    hparams,
                    lstm_units=hparams.frame_lstm_units,
                    lengths=length)
                activation_probs = slim.fully_connected(
                    activation_outputs,
                    constants.MIDI_PITCHES,
                    activation_fn=tf.sigmoid,
                    scope='activation_probs')
            else:
                activation_probs = slim.fully_connected(
                    onset_outputs,
                    constants.MIDI_PITCHES,
                    activation_fn=tf.sigmoid,
                    scope='activation_probs')

            probs = []
            if hparams.stop_onset_gradient:
                probs.append(tf.stop_gradient(onset_probs))
            else:
                probs.append(onset_probs)

            if hparams.stop_activation_gradient:
                probs.append(tf.stop_gradient(activation_probs))
            else:
                probs.append(activation_probs)

            if hparams.stop_offset_gradient:
                probs.append(tf.stop_gradient(offset_probs))
            else:
                probs.append(offset_probs)

            combined_probs = tf.concat(probs, 2)

            if hparams.combined_lstm_units > 0:
                outputs = lstm_layer(
                    combined_probs,
                    hparams.combined_lstm_units,
                    lengths=length if hparams.use_lengths else None,
                    stack_size=hparams.combined_rnn_stack_size,
                    use_cudnn=hparams.use_cudnn,
                    bidirectional=hparams.bidirectional)
            else:
                outputs = combined_probs

            frame_probs = slim.fully_connected(outputs,
                                               constants.MIDI_PITCHES,
                                               activation_fn=tf.sigmoid,
                                               scope='frame_probs')

        frame_probs_flat = flatten_maybe_padded_sequences(frame_probs, length)

        if is_training:
            frame_labels_flat = flatten_maybe_padded_sequences(
                frame_labels, length)
            frame_label_weights_flat = flatten_maybe_padded_sequences(
                frame_label_weights, length)
            if hparams.weight_frame_and_activation_loss:
                frame_loss_weights = frame_label_weights_flat
            else:
                frame_loss_weights = None
            frame_losses = tf_utils.log_loss(frame_labels_flat,
                                             frame_probs_flat,
                                             weights=frame_loss_weights)
            tf.losses.add_loss(tf.reduce_mean(frame_losses))
            losses['frame'] = frame_losses

            if hparams.activation_loss:
                if hparams.weight_frame_and_activation_loss:
                    activation_loss_weights = frame_label_weights
                else:
                    activation_loss_weights = None
                activation_losses = tf_utils.log_loss(
                    frame_labels_flat,
                    flatten_maybe_padded_sequences(activation_probs, length),
                    weights=activation_loss_weights)
                tf.losses.add_loss(tf.reduce_mean(activation_losses))
                losses['activation'] = activation_losses

    frame_predictions = frame_probs_flat > hparams.predict_frame_threshold
    onset_predictions = onset_probs_flat > hparams.predict_onset_threshold
    offset_predictions = offset_probs_flat > hparams.predict_offset_threshold

    frame_predictions = tf.expand_dims(frame_predictions, axis=0)
    onset_predictions = tf.expand_dims(onset_predictions, axis=0)
    offset_predictions = tf.expand_dims(offset_predictions, axis=0)
    velocity_values = tf.expand_dims(velocity_values_flat, axis=0)

    metrics_values = metrics.define_metrics(
        frame_probs=frame_probs,
        onset_probs=onset_probs,
        frame_predictions=frame_predictions,
        onset_predictions=onset_predictions,
        offset_predictions=offset_predictions,
        velocity_values=velocity_values,
        length=features.length,
        sequence_label=labels.note_sequence,
        frame_labels=labels.labels,
        sequence_id=features.sequence_id,
        hparams=hparams)

    for label, loss_collection in losses.items():
        loss_label = 'losses/' + label
        metrics_values[loss_label] = loss_collection

    def predict_sequence():
        """Convert frame predictions into a sequence (TF)."""
        def _predict(frame_probs, onset_probs, frame_predictions,
                     onset_predictions, offset_predictions, velocity_values):
            """Convert frame predictions into a sequence (Python)."""
            sequence = infer_util.predict_sequence(
                frame_probs=frame_probs,
                onset_probs=onset_probs,
                frame_predictions=frame_predictions,
                onset_predictions=onset_predictions,
                offset_predictions=offset_predictions,
                velocity_values=velocity_values,
                hparams=hparams,
                min_pitch=constants.MIN_MIDI_PITCH)
            return sequence.SerializeToString()

        sequence = tf.py_func(_predict,
                              inp=[
                                  frame_probs[0],
                                  onset_probs[0],
                                  frame_predictions[0],
                                  onset_predictions[0],
                                  offset_predictions[0],
                                  velocity_values[0],
                              ],
                              Tout=tf.string,
                              stateful=False)
        sequence.set_shape([])
        return tf.expand_dims(sequence, axis=0)

    predictions = {
        'frame_probs': frame_probs,
        'onset_probs': onset_probs,
        'frame_predictions': frame_predictions,
        'onset_predictions': onset_predictions,
        'offset_predictions': offset_predictions,
        'velocity_values': velocity_values,
        'sequence_predictions': predict_sequence(),
        # Include some features and labels in output because Estimator 'predict'
        # API does not give access to them.
        'sequence_ids': features.sequence_id,
        'sequence_labels': labels.note_sequence,
        'frame_labels': labels.labels,
        'onset_labels': labels.onsets,
    }
    for k, v in metrics_values.items():
        predictions[k] = tf.stack(v)

    metric_ops = {k: tf.metrics.mean(v) for k, v in metrics_values.items()}

    train_op = None
    loss = None
    if is_training:
        # Create pianoroll images with labels in the red channel and probs in
        # the green channel, shaped [batch, time, MIDI_PITCHES, 3].
        images = {}
        onset_pianorolls = tf.concat(
            [
                onset_labels[:, :, :, tf.newaxis],
                onset_probs[:, :, :, tf.newaxis],
                tf.zeros(tf.shape(onset_labels))[:, :, :, tf.newaxis],
            ],
            axis=3)
        images['OnsetPianorolls'] = onset_pianorolls
        offset_pianorolls = tf.concat(
            [
                offset_labels[:, :, :, tf.newaxis],
                offset_probs[:, :, :, tf.newaxis],
                tf.zeros(tf.shape(offset_labels))[:, :, :, tf.newaxis],
            ],
            axis=3)
        images['OffsetPianorolls'] = offset_pianorolls
        activation_pianorolls = tf.concat(
            [
                frame_labels[:, :, :, tf.newaxis],
                frame_probs[:, :, :, tf.newaxis],
                tf.zeros(tf.shape(frame_labels))[:, :, :, tf.newaxis],
            ],
            axis=3)
        images['ActivationPianorolls'] = activation_pianorolls
        for name, image in images.items():
            tf.summary.image(name, image)

        loss = tf.losses.get_total_loss()
        tf.summary.scalar('loss', loss)
        for label, loss_collection in losses.items():
            loss_label = 'losses/' + label
            tf.summary.scalar(loss_label, tf.reduce_mean(loss_collection))

        train_op = slim.optimize_loss(
            name='training',
            loss=loss,
            global_step=tf.train.get_or_create_global_step(),
            learning_rate=hparams.learning_rate,
            learning_rate_decay_fn=functools.partial(
                tf.train.exponential_decay,
                decay_steps=hparams.decay_steps,
                decay_rate=hparams.decay_rate,
                staircase=True),
            clip_gradients=hparams.clip_norm,
            optimizer='Adam')

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=metric_ops)
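
A hedged sketch of how an EstimatorSpec-producing model_fn like the one above is typically plugged into the TF1 Estimator API; the names model_fn, hparams, and input_fn below are illustrative placeholders, not entry points from this codebase:

# --- Usage sketch (illustrative only) ---
estimator = tf.estimator.Estimator(
    model_fn=model_fn,              # returns tf.estimator.EstimatorSpec
    model_dir='/tmp/onsets_frames',
    params=hparams)                 # forwarded to model_fn as `params`

estimator.train(input_fn=input_fn, max_steps=10000)
for prediction in estimator.predict(input_fn=input_fn):
    # 'sequence_predictions' holds one serialized NoteSequence per batch.
    serialized_sequence = prediction['sequence_predictions'][0]
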
def inception_resnet_v2(inputs,
                        num_classes=1001,
                        is_training=True,
                        dropout_keep_prob=0.8,
                        reuse=None,
                        scope='InceptionResnetV2',
                        create_aux_logits=True,
                        activation_fn=tf.nn.relu):
    """Creates the Inception Resnet V2 model.

  Args:
    inputs: a 4-D tensor of size [batch_size, height, width, 3].
      Dimension batch_size may be undefined. If create_aux_logits is false,
      also height and width may be undefined.
    num_classes: number of predicted classes. If 0 or None, the logits layer
      is omitted and the input features to the logits layer (before dropout)
      are returned instead.
    is_training: whether the network is in training mode.
    dropout_keep_prob: float, the fraction to keep before final layer.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.
    create_aux_logits: Whether to include the auxiliary logits.
    activation_fn: Activation function for conv2d.

  Returns:
    net: the output of the logits layer (if num_classes is a non-zero integer),
      or the non-dropped-out input to the logits layer (if num_classes is 0 or
      None).
    end_points: the set of end_points from the inception model.
  """
    end_points = {}

    with tf.compat.v1.variable_scope(scope,
                                     'InceptionResnetV2', [inputs],
                                     reuse=reuse) as scope:
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):

            net, end_points = inception_resnet_v2_base(
                inputs, scope=scope, activation_fn=activation_fn)

            if create_aux_logits and num_classes:
                with tf.compat.v1.variable_scope('AuxLogits'):
                    aux = end_points['PreAuxLogits']
                    aux = slim.avg_pool2d(aux,
                                          5,
                                          stride=3,
                                          padding='VALID',
                                          scope='Conv2d_1a_3x3')
                    aux = slim.conv2d(aux, 128, 1, scope='Conv2d_1b_1x1')
                    aux = slim.conv2d(aux,
                                      768,
                                      aux.get_shape()[1:3],
                                      padding='VALID',
                                      scope='Conv2d_2a_5x5')
                    aux = slim.flatten(aux)
                    aux = slim.fully_connected(aux,
                                               num_classes,
                                               activation_fn=None,
                                               scope='Logits')
                    end_points['AuxLogits'] = aux

            with tf.compat.v1.variable_scope('Logits'):
                # TODO(sguada,arnoegw): Consider adding a parameter global_pool which
                # can be set to False to disable pooling here (as in resnet_*()).
                kernel_size = net.get_shape()[1:3]
                if kernel_size.is_fully_defined():
                    net = slim.avg_pool2d(net,
                                          kernel_size,
                                          padding='VALID',
                                          scope='AvgPool_1a_8x8')
                else:
                    net = tf.reduce_mean(input_tensor=net,
                                         axis=[1, 2],
                                         keepdims=True,
                                         name='global_pool')
                end_points['global_pool'] = net
                if not num_classes:
                    return net, end_points
                net = slim.flatten(net)
                net = slim.dropout(net,
                                   dropout_keep_prob,
                                   is_training=is_training,
                                   scope='Dropout')
                end_points['PreLogitsFlatten'] = net
                logits = slim.fully_connected(net,
                                              num_classes,
                                              activation_fn=None,
                                              scope='Logits')
                end_points['Logits'] = logits
                end_points['Predictions'] = tf.nn.softmax(logits,
                                                          name='Predictions')

        return logits, end_points
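
A hedged usage sketch for the classifier above; inception_resnet_v2_arg_scope is assumed to be the companion arg-scope helper from the TF-Slim model zoo that supplies the usual batch-norm and weight-decay defaults:

# Inference sketch (assumes the companion inception_resnet_v2_arg_scope
# helper; 299x299 inputs per the layer-size comments in the base network).
images = tf.compat.v1.placeholder(tf.float32, [None, 299, 299, 3])
with slim.arg_scope(inception_resnet_v2_arg_scope()):
    logits, end_points = inception_resnet_v2(
        images, num_classes=1001, is_training=False)
probabilities = end_points['Predictions']  # softmax over the 1001 classes
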
Example #11
def style_prediction_mobilenet(style_input_,
                               activation_names,
                               activation_depths,
                               mobilenet_end_point='layer_19',
                               mobilenet_trainable=True,
                               style_params_trainable=False,
                               style_prediction_bottleneck=100,
                               reuse=None):
    """Maps style images to the style embeddings using MobileNetV2.

  Args:
    style_input_: Tensor. Batch of style input images.
    activation_names: list of strings. Scope names of the activations of the
        transformer network which are used to apply style normalization.
    activation_depths: list of ints. Depths of the activations of the
        transformer network which are used to apply style normalization.
    mobilenet_end_point: string. Specifies the endpoint to construct the
        MobileNetV2 network up to. This network is part of the style prediction
        network.
    mobilenet_trainable: bool. Should the MobileNetV2 parameters be marked
        as trainable?
    style_params_trainable: bool. Should the mapping from bottleneck to
        beta and gamma parameters be marked as trainable?
    style_prediction_bottleneck: int. Specifies the bottleneck size in the
        number of parameters of the style embedding.
    reuse: bool. Whether to reuse model parameters. Defaults to False.

  Returns:
    Tensor for the output of the style prediction network, Tensor for the
        bottleneck of style parameters of the style prediction network.
  """
    # Note: `with A and B:` would enter only B; use a comma so both the
    # name_scope and the variable_scope are actually entered.
    with tf.name_scope('style_prediction_mobilenet'), tf.variable_scope(
            tf.get_variable_scope(), reuse=reuse):
        with slim.arg_scope(
                mobilenet_v2.training_scope(is_training=mobilenet_trainable)):
            _, end_points = mobilenet.mobilenet_base(
                style_input_,
                conv_defs=mobilenet_v2.V2_DEF,
                final_endpoint=mobilenet_end_point,
                scope='MobilenetV2')

        feat_convlayer = end_points[mobilenet_end_point]
        with tf.name_scope('bottleneck'):
            # (batch_size, 1, 1, depth).
            bottleneck_feat = tf.reduce_mean(feat_convlayer,
                                             axis=[1, 2],
                                             keepdims=True)

        if style_prediction_bottleneck > 0:
            with tf.variable_scope('mobilenet_conv'):
                with slim.arg_scope([slim.conv2d],
                                    activation_fn=None,
                                    normalizer_fn=None,
                                    trainable=mobilenet_trainable):
                    # (batch_size, 1, 1, style_prediction_bottleneck).
                    bottleneck_feat = slim.conv2d(bottleneck_feat,
                                                  style_prediction_bottleneck,
                                                  [1, 1])

        style_params = {}
        with tf.variable_scope('style_params'):
            for i in range(len(activation_depths)):
                with tf.variable_scope(activation_names[i], reuse=reuse):
                    with slim.arg_scope([slim.conv2d],
                                        activation_fn=None,
                                        normalizer_fn=None,
                                        trainable=style_params_trainable):
                        # Computing beta parameter of the style normalization for the
                        # activation_names[i] layer of the style transformer network.
                        # (batch_size, 1, 1, activation_depths[i])
                        beta = slim.conv2d(bottleneck_feat,
                                           activation_depths[i], [1, 1])
                        # (batch_size, activation_depths[i])
                        beta = tf.squeeze(beta, [1, 2], name='SpatialSqueeze')
                        style_params['{}/beta'.format(
                            activation_names[i])] = beta

                        # Computing gamma parameter of the style normalization for the
                        # activation_names[i] layer of the style transformer network.
                        # (batch_size, 1, 1, activation_depths[i])
                        gamma = slim.conv2d(bottleneck_feat,
                                            activation_depths[i], [1, 1])
                        # (batch_size, activation_depths[i])
                        gamma = tf.squeeze(gamma, [1, 2],
                                           name='SpatialSqueeze')
                        style_params['{}/gamma'.format(
                            activation_names[i])] = gamma

    return style_params, bottleneck_feat
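
The returned style_params dict maps '<name>/beta' and '<name>/gamma' to per-image shift and scale tensors. A sketch of one way a transformer layer could consume them, modeled on conditional instance normalization (this consumer is an assumption, not code from this example):

# Hypothetical consumer of style_params: normalize an activation of the
# transformer network, then apply the predicted per-style scale and shift.
def apply_style_norm(activation, style_params, name, epsilon=1e-5):
    mean, variance = tf.nn.moments(activation, [1, 2], keepdims=True)
    normalized = (activation - mean) / tf.sqrt(variance + epsilon)
    # style_params values are [batch, depth]; broadcast over height/width.
    gamma = style_params['%s/gamma' % name][:, tf.newaxis, tf.newaxis, :]
    beta = style_params['%s/beta' % name][:, tf.newaxis, tf.newaxis, :]
    return gamma * normalized + beta
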
def inception_resnet_v2_base(inputs,
                             final_endpoint='Conv2d_7b_1x1',
                             output_stride=16,
                             align_feature_maps=False,
                             scope=None,
                             activation_fn=tf.nn.relu):
    """Inception model from  http://arxiv.org/abs/1602.07261.

  Constructs an Inception Resnet v2 network from inputs to the given final
  endpoint. This method can construct the network up to the final inception
  block Conv2d_7b_1x1.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    final_endpoint: specifies the endpoint to construct the network up to. It
      can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
      'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
      'Mixed_5b', 'Mixed_6a', 'PreAuxLogits', 'Mixed_7a', 'Conv2d_7b_1x1']
    output_stride: A scalar that specifies the requested ratio of input to
      output spatial resolution. Only supports 8 and 16.
    align_feature_maps: When true, changes all the VALID paddings in the network
      to SAME padding so that the feature maps are aligned.
    scope: Optional variable_scope.
    activation_fn: Activation function for block scopes.

  Returns:
    tensor_out: output tensor corresponding to the final_endpoint.
    end_points: a set of activations for external use, for example summaries or
                losses.

  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values,
      or if the output_stride is not 8 or 16, or if the output_stride is 8 and
      we request an end point after 'PreAuxLogits'.
  """
    if output_stride != 8 and output_stride != 16:
        raise ValueError('output_stride must be 8 or 16.')

    padding = 'SAME' if align_feature_maps else 'VALID'

    end_points = {}

    def add_and_check_final(name, net):
        end_points[name] = net
        return name == final_endpoint

    with tf.compat.v1.variable_scope(scope, 'InceptionResnetV2', [inputs]):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                            stride=1,
                            padding='SAME'):
            # 149 x 149 x 32
            net = slim.conv2d(inputs,
                              32,
                              3,
                              stride=2,
                              padding=padding,
                              scope='Conv2d_1a_3x3')
            if add_and_check_final('Conv2d_1a_3x3', net):
                return net, end_points

            # 147 x 147 x 32
            net = slim.conv2d(net,
                              32,
                              3,
                              padding=padding,
                              scope='Conv2d_2a_3x3')
            if add_and_check_final('Conv2d_2a_3x3', net):
                return net, end_points
            # 147 x 147 x 64
            net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
            if add_and_check_final('Conv2d_2b_3x3', net):
                return net, end_points
            # 73 x 73 x 64
            net = slim.max_pool2d(net,
                                  3,
                                  stride=2,
                                  padding=padding,
                                  scope='MaxPool_3a_3x3')
            if add_and_check_final('MaxPool_3a_3x3', net):
                return net, end_points
            # 73 x 73 x 80
            net = slim.conv2d(net,
                              80,
                              1,
                              padding=padding,
                              scope='Conv2d_3b_1x1')
            if add_and_check_final('Conv2d_3b_1x1', net):
                return net, end_points
            # 71 x 71 x 192
            net = slim.conv2d(net,
                              192,
                              3,
                              padding=padding,
                              scope='Conv2d_4a_3x3')
            if add_and_check_final('Conv2d_4a_3x3', net):
                return net, end_points
            # 35 x 35 x 192
            net = slim.max_pool2d(net,
                                  3,
                                  stride=2,
                                  padding=padding,
                                  scope='MaxPool_5a_3x3')
            if add_and_check_final('MaxPool_5a_3x3', net):
                return net, end_points

            # 35 x 35 x 320
            with tf.compat.v1.variable_scope('Mixed_5b'):
                with tf.compat.v1.variable_scope('Branch_0'):
                    tower_conv = slim.conv2d(net, 96, 1, scope='Conv2d_1x1')
                with tf.compat.v1.variable_scope('Branch_1'):
                    tower_conv1_0 = slim.conv2d(net,
                                                48,
                                                1,
                                                scope='Conv2d_0a_1x1')
                    tower_conv1_1 = slim.conv2d(tower_conv1_0,
                                                64,
                                                5,
                                                scope='Conv2d_0b_5x5')
                with tf.compat.v1.variable_scope('Branch_2'):
                    tower_conv2_0 = slim.conv2d(net,
                                                64,
                                                1,
                                                scope='Conv2d_0a_1x1')
                    tower_conv2_1 = slim.conv2d(tower_conv2_0,
                                                96,
                                                3,
                                                scope='Conv2d_0b_3x3')
                    tower_conv2_2 = slim.conv2d(tower_conv2_1,
                                                96,
                                                3,
                                                scope='Conv2d_0c_3x3')
                with tf.compat.v1.variable_scope('Branch_3'):
                    tower_pool = slim.avg_pool2d(net,
                                                 3,
                                                 stride=1,
                                                 padding='SAME',
                                                 scope='AvgPool_0a_3x3')
                    tower_pool_1 = slim.conv2d(tower_pool,
                                               64,
                                               1,
                                               scope='Conv2d_0b_1x1')
                net = tf.concat(
                    [tower_conv, tower_conv1_1, tower_conv2_2, tower_pool_1],
                    3)

            if add_and_check_final('Mixed_5b', net): return net, end_points
            # TODO(alemi): Register intermediate endpoints
            net = slim.repeat(net,
                              10,
                              block35,
                              scale=0.17,
                              activation_fn=activation_fn)

            # 33 x 33 x 1088 if output_stride == 8,
            # 17 x 17 x 1088 if output_stride == 16
            use_atrous = output_stride == 8

            with tf.compat.v1.variable_scope('Mixed_6a'):
                with tf.compat.v1.variable_scope('Branch_0'):
                    tower_conv = slim.conv2d(net,
                                             384,
                                             3,
                                             stride=1 if use_atrous else 2,
                                             padding=padding,
                                             scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_1'):
                    tower_conv1_0 = slim.conv2d(net,
                                                256,
                                                1,
                                                scope='Conv2d_0a_1x1')
                    tower_conv1_1 = slim.conv2d(tower_conv1_0,
                                                256,
                                                3,
                                                scope='Conv2d_0b_3x3')
                    tower_conv1_2 = slim.conv2d(tower_conv1_1,
                                                384,
                                                3,
                                                stride=1 if use_atrous else 2,
                                                padding=padding,
                                                scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_2'):
                    tower_pool = slim.max_pool2d(net,
                                                 3,
                                                 stride=1 if use_atrous else 2,
                                                 padding=padding,
                                                 scope='MaxPool_1a_3x3')
                net = tf.concat([tower_conv, tower_conv1_2, tower_pool], 3)

            if add_and_check_final('Mixed_6a', net): return net, end_points

            # TODO(alemi): register intermediate endpoints
            with slim.arg_scope([slim.conv2d], rate=2 if use_atrous else 1):
                net = slim.repeat(net,
                                  20,
                                  block17,
                                  scale=0.10,
                                  activation_fn=activation_fn)
            if add_and_check_final('PreAuxLogits', net): return net, end_points

            if output_stride == 8:
                # TODO(gpapan): Properly support output_stride for the rest of the net.
                raise ValueError(
                    'output_stride==8 is only supported up to the '
                    'PreAuxLogits end_point for now.')

            # 8 x 8 x 2080
            with tf.compat.v1.variable_scope('Mixed_7a'):
                with tf.compat.v1.variable_scope('Branch_0'):
                    tower_conv = slim.conv2d(net,
                                             256,
                                             1,
                                             scope='Conv2d_0a_1x1')
                    tower_conv_1 = slim.conv2d(tower_conv,
                                               384,
                                               3,
                                               stride=2,
                                               padding=padding,
                                               scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_1'):
                    tower_conv1 = slim.conv2d(net,
                                              256,
                                              1,
                                              scope='Conv2d_0a_1x1')
                    tower_conv1_1 = slim.conv2d(tower_conv1,
                                                288,
                                                3,
                                                stride=2,
                                                padding=padding,
                                                scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_2'):
                    tower_conv2 = slim.conv2d(net,
                                              256,
                                              1,
                                              scope='Conv2d_0a_1x1')
                    tower_conv2_1 = slim.conv2d(tower_conv2,
                                                288,
                                                3,
                                                scope='Conv2d_0b_3x3')
                    tower_conv2_2 = slim.conv2d(tower_conv2_1,
                                                320,
                                                3,
                                                stride=2,
                                                padding=padding,
                                                scope='Conv2d_1a_3x3')
                with tf.compat.v1.variable_scope('Branch_3'):
                    tower_pool = slim.max_pool2d(net,
                                                 3,
                                                 stride=2,
                                                 padding=padding,
                                                 scope='MaxPool_1a_3x3')
                net = tf.concat(
                    [tower_conv_1, tower_conv1_1, tower_conv2_2, tower_pool],
                    3)

            if add_and_check_final('Mixed_7a', net): return net, end_points

            # TODO(alemi): register intermediate endpoints
            net = slim.repeat(net,
                              9,
                              block8,
                              scale=0.20,
                              activation_fn=activation_fn)
            net = block8(net, activation_fn=None)

            # 8 x 8 x 1536
            net = slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1')
            if add_and_check_final('Conv2d_7b_1x1', net):
                return net, end_points

        raise ValueError('final_endpoint (%s) not recognized' % final_endpoint)
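
Because add_and_check_final returns as soon as final_endpoint is produced, the base network doubles as a truncated feature extractor; a brief sketch:

# Sketch: build only up to 'PreAuxLogits' and reuse the feature map.
images = tf.compat.v1.placeholder(tf.float32, [None, 299, 299, 3])
net, end_points = inception_resnet_v2_base(
    images, final_endpoint='PreAuxLogits', output_stride=16)
# Earlier activations remain available, e.g. end_points['Mixed_5b'].
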
Example #13
    def predict(self, inputs, **kwargs):
        """Predicts the resulting tensors.

    Args:
      inputs: A dictionary of input tensors keyed by names.

    Returns:
      predictions: A dictionary of prediction tensors keyed by name.
    """
        predictions = {}

        is_training = self._is_training
        options = self._model_proto

        # Decode image fields from `inputs`.
        (image, image_height, image_width, num_detections, detection_boxes,
         detection_classes, detection_scores) = (
             inputs[InputFields.img_data],
             inputs[InputFields.img_height],
             inputs[InputFields.img_width],
             inputs[InputFields.num_detections],
             inputs[InputFields.detection_boxes],
             inputs[InputFields.detection_classes],
             inputs[InputFields.detection_scores],
         )
        batch_size = image.shape[0]
        (max_num_detections, num_detections, detection_boxes,
         detection_classes, detection_scores) = remove_detections(
             num_detections,
             detection_boxes,
             detection_classes,
             detection_scores,
             max_num_detections=options.max_num_detections)

        # Extract Fast-RCNN features.
        image_batch_shape = tf.shape(image)
        detection_boxes = convert_to_batch_coordinates(detection_boxes,
                                                       image_height,
                                                       image_width,
                                                       image_batch_shape[1],
                                                       image_batch_shape[2])
        detection_features, _ = fast_rcnn.FastRCNN(
            image,
            detection_boxes,
            options=options.fast_rcnn_config,
            is_training=is_training)
        predictions.update({'detection_features': detection_features})
        with slim.arg_scope(self._slim_fc_scope):
            detection_features = self.project_detection_features(
                detection_features)

        # Ground objects.
        (choice_ids, choice_tag_ids,
         choice_lengths) = (inputs[self._field_choices],
                            inputs[self._field_choices_tag],
                            inputs[self._field_choices_len])

        choice_tag_ids = preprocess_tags(choice_tag_ids, max_num_detections)
        choice_tag_features = ground_detection_features(
            detection_features, choice_tag_ids)

        # Create BERT prediction.
        choice_ids_list = tf.unstack(choice_ids, axis=1)
        choice_tag_ids_list = tf.unstack(choice_tag_ids, axis=1)
        choice_tag_features_list = tf.unstack(choice_tag_features, axis=1)
        choice_lengths_list = tf.unstack(choice_lengths, axis=1)

        reuse = False
        feature_to_predict_choices = []
        for caption_ids, caption_tag_ids, caption_tag_features, caption_length in zip(
                choice_ids_list, choice_tag_ids_list, choice_tag_features_list,
                choice_lengths_list):
            with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
                bert_output, embedding_table = self.image_text_matching(
                    num_detections, detection_boxes, detection_classes,
                    detection_scores, detection_features, caption_ids,
                    caption_tag_ids, caption_tag_features, caption_length)
                feature_to_predict_choices.append(bert_output)
            reuse = True

        # Predicting the answer.
        with slim.arg_scope(self._slim_fc_scope):
            features = tf.stack(feature_to_predict_choices, 1)
            logits = slim.fully_connected(features,
                                          num_outputs=1,
                                          activation_fn=None,
                                          scope='itm/logits')
        predictions.update({'answer_prediction': tf.squeeze(logits, -1)})

        # Restore from BERT checkpoint.
        assignment_map, _ = checkpoints.get_assignment_map_from_checkpoint(
            [x for x in tf.global_variables()
             if x.op.name.startswith('bert')],  # IMPORTANT to filter using `bert`.
            options.bert_checkpoint_file)
        tf.train.init_from_checkpoint(options.bert_checkpoint_file,
                                      assignment_map)

        return predictions
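
The restore step above builds an assignment map only for variables under the 'bert' scope. A simplified stand-in for that helper in plain TF1 (a real implementation, like the one referenced, would also intersect the map with the variable names actually present in the checkpoint):

# Simplified sketch of the scope-filtered warm start used above; the name
# `bert_checkpoint_file` is a placeholder for options.bert_checkpoint_file.
bert_vars = [v for v in tf.global_variables()
             if v.op.name.startswith('bert')]
assignment_map = {v.op.name: v for v in bert_vars}  # ckpt name -> variable
tf.train.init_from_checkpoint(bert_checkpoint_file, assignment_map)
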
Example #14
def inception_resnet_v2(inputs,
                        is_training=True,
                        dropout_keep_prob=0.8,
                        bottleneck_layer_size=128,
                        reuse=None,
                        scope='InceptionResnetV2'):
    """Creates the Inception Resnet V2 model.
    Args:
      inputs: a 4-D tensor of size [batch_size, height, width, 3].
      is_training: whether the network is in training mode.
      dropout_keep_prob: float, the fraction to keep before final layer.
      bottleneck_layer_size: int, size of the bottleneck (embedding) layer.
      reuse: whether or not the network and its variables should be reused. To be
        able to reuse 'scope' must be given.
      scope: Optional variable_scope.
    Returns:
      net: the output of the bottleneck (embedding) layer.
      end_points: the set of end_points from the inception model.
    """
    end_points = {}

    with tf.variable_scope(scope, 'InceptionResnetV2', [inputs], reuse=reuse):
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            with slim.arg_scope(
                [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                    stride=1,
                    padding='SAME'):

                # 149 x 149 x 32
                net = slim.conv2d(inputs,
                                  32,
                                  3,
                                  stride=2,
                                  padding='VALID',
                                  scope='Conv2d_1a_3x3')
                end_points['Conv2d_1a_3x3'] = net
                # 147 x 147 x 32
                net = slim.conv2d(net,
                                  32,
                                  3,
                                  padding='VALID',
                                  scope='Conv2d_2a_3x3')
                end_points['Conv2d_2a_3x3'] = net
                # 147 x 147 x 64
                net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3')
                end_points['Conv2d_2b_3x3'] = net
                # 73 x 73 x 64
                net = slim.max_pool2d(net,
                                      3,
                                      stride=2,
                                      padding='VALID',
                                      scope='MaxPool_3a_3x3')
                end_points['MaxPool_3a_3x3'] = net
                # 73 x 73 x 80
                net = slim.conv2d(net,
                                  80,
                                  1,
                                  padding='VALID',
                                  scope='Conv2d_3b_1x1')
                end_points['Conv2d_3b_1x1'] = net
                # 71 x 71 x 192
                net = slim.conv2d(net,
                                  192,
                                  3,
                                  padding='VALID',
                                  scope='Conv2d_4a_3x3')
                end_points['Conv2d_4a_3x3'] = net
                # 35 x 35 x 192
                net = slim.max_pool2d(net,
                                      3,
                                      stride=2,
                                      padding='VALID',
                                      scope='MaxPool_5a_3x3')
                end_points['MaxPool_5a_3x3'] = net

                # 35 x 35 x 320
                with tf.variable_scope('Mixed_5b'):
                    with tf.variable_scope('Branch_0'):
                        tower_conv = slim.conv2d(net,
                                                 96,
                                                 1,
                                                 scope='Conv2d_1x1')
                    with tf.variable_scope('Branch_1'):
                        tower_conv1_0 = slim.conv2d(net,
                                                    48,
                                                    1,
                                                    scope='Conv2d_0a_1x1')
                        tower_conv1_1 = slim.conv2d(tower_conv1_0,
                                                    64,
                                                    5,
                                                    scope='Conv2d_0b_5x5')
                    with tf.variable_scope('Branch_2'):
                        tower_conv2_0 = slim.conv2d(net,
                                                    64,
                                                    1,
                                                    scope='Conv2d_0a_1x1')
                        tower_conv2_1 = slim.conv2d(tower_conv2_0,
                                                    96,
                                                    3,
                                                    scope='Conv2d_0b_3x3')
                        tower_conv2_2 = slim.conv2d(tower_conv2_1,
                                                    96,
                                                    3,
                                                    scope='Conv2d_0c_3x3')
                    with tf.variable_scope('Branch_3'):
                        tower_pool = slim.avg_pool2d(net,
                                                     3,
                                                     stride=1,
                                                     padding='SAME',
                                                     scope='AvgPool_0a_3x3')
                        tower_pool_1 = slim.conv2d(tower_pool,
                                                   64,
                                                   1,
                                                   scope='Conv2d_0b_1x1')
                    net = tf.concat([
                        tower_conv, tower_conv1_1, tower_conv2_2, tower_pool_1
                    ], 3)

                end_points['Mixed_5b'] = net
                net = slim.repeat(net, 10, block35, scale=0.17)

                # 17 x 17 x 1088
                with tf.variable_scope('Mixed_6a'):
                    with tf.variable_scope('Branch_0'):
                        tower_conv = slim.conv2d(net,
                                                 384,
                                                 3,
                                                 stride=2,
                                                 padding='VALID',
                                                 scope='Conv2d_1a_3x3')
                    with tf.variable_scope('Branch_1'):
                        tower_conv1_0 = slim.conv2d(net,
                                                    256,
                                                    1,
                                                    scope='Conv2d_0a_1x1')
                        tower_conv1_1 = slim.conv2d(tower_conv1_0,
                                                    256,
                                                    3,
                                                    scope='Conv2d_0b_3x3')
                        tower_conv1_2 = slim.conv2d(tower_conv1_1,
                                                    384,
                                                    3,
                                                    stride=2,
                                                    padding='VALID',
                                                    scope='Conv2d_1a_3x3')
                    with tf.variable_scope('Branch_2'):
                        tower_pool = slim.max_pool2d(net,
                                                     3,
                                                     stride=2,
                                                     padding='VALID',
                                                     scope='MaxPool_1a_3x3')
                    net = tf.concat([tower_conv, tower_conv1_2, tower_pool], 3)

                end_points['Mixed_6a'] = net
                net = slim.repeat(net, 20, block17, scale=0.10)

                with tf.variable_scope('Mixed_7a'):
                    with tf.variable_scope('Branch_0'):
                        tower_conv = slim.conv2d(net,
                                                 256,
                                                 1,
                                                 scope='Conv2d_0a_1x1')
                        tower_conv_1 = slim.conv2d(tower_conv,
                                                   384,
                                                   3,
                                                   stride=2,
                                                   padding='VALID',
                                                   scope='Conv2d_1a_3x3')
                    with tf.variable_scope('Branch_1'):
                        tower_conv1 = slim.conv2d(net,
                                                  256,
                                                  1,
                                                  scope='Conv2d_0a_1x1')
                        tower_conv1_1 = slim.conv2d(tower_conv1,
                                                    288,
                                                    3,
                                                    stride=2,
                                                    padding='VALID',
                                                    scope='Conv2d_1a_3x3')
                    with tf.variable_scope('Branch_2'):
                        tower_conv2 = slim.conv2d(net,
                                                  256,
                                                  1,
                                                  scope='Conv2d_0a_1x1')
                        tower_conv2_1 = slim.conv2d(tower_conv2,
                                                    288,
                                                    3,
                                                    scope='Conv2d_0b_3x3')
                        tower_conv2_2 = slim.conv2d(tower_conv2_1,
                                                    320,
                                                    3,
                                                    stride=2,
                                                    padding='VALID',
                                                    scope='Conv2d_1a_3x3')
                    with tf.variable_scope('Branch_3'):
                        tower_pool = slim.max_pool2d(net,
                                                     3,
                                                     stride=2,
                                                     padding='VALID',
                                                     scope='MaxPool_1a_3x3')
                    net = tf.concat([
                        tower_conv_1, tower_conv1_1, tower_conv2_2, tower_pool
                    ], 3)

                end_points['Mixed_7a'] = net

                net = slim.repeat(net, 9, block8, scale=0.20)
                net = block8(net, activation_fn=None)

                net = slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1')
                end_points['Conv2d_7b_1x1'] = net

                with tf.variable_scope('Logits'):
                    end_points['PrePool'] = net
                    #pylint: disable=no-member
                    net = slim.avg_pool2d(net,
                                          net.get_shape()[1:3],
                                          padding='VALID',
                                          scope='AvgPool_1a_8x8')
                    net = slim.flatten(net)

                    net = slim.dropout(net,
                                       dropout_keep_prob,
                                       is_training=is_training,
                                       scope='Dropout')

                    end_points['PreLogitsFlatten'] = net

                net = slim.fully_connected(net,
                                           bottleneck_layer_size,
                                           activation_fn=None,
                                           scope='Bottleneck',
                                           reuse=False)

    return net, end_points
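
This variant ends in a bottleneck embedding rather than class logits; in a FaceNet-style pipeline the embedding is typically L2-normalized before computing the loss. A hedged usage sketch:

# Sketch: unit-norm face embeddings (299x299 inputs per the shape comments).
images = tf.compat.v1.placeholder(tf.float32, [None, 299, 299, 3])
prelogits, _ = inception_resnet_v2(
    images, is_training=False, bottleneck_layer_size=128)
embeddings = tf.nn.l2_normalize(prelogits, 1, name='embeddings')
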
Example #15
def inception_v4(inputs, num_classes=1001, is_training=True,
                 dropout_keep_prob=0.8,
                 reuse=None,
                 scope='InceptionV4',
                 create_aux_logits=True):
  """Creates the Inception V4 model.

  Args:
    inputs: a 4-D tensor of size [batch_size, height, width, 3].
    num_classes: number of predicted classes. If 0 or None, the logits layer
      is omitted and the input features to the logits layer (before dropout)
      are returned instead.
    is_training: whether is training or not.
    dropout_keep_prob: float, the fraction to keep before final layer.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.
    create_aux_logits: Whether to include the auxiliary logits.

  Returns:
    net: a Tensor with the logits (pre-softmax activations) if num_classes
      is a non-zero integer, or the non-dropped input to the logits layer
      if num_classes is 0 or None.
    end_points: the set of end_points from the inception model.
  """
  end_points = {}
  with tf.variable_scope(scope, 'InceptionV4', [inputs], reuse=reuse) as scope:
    with slim.arg_scope([slim.batch_norm, slim.dropout],
                        is_training=is_training):
      net, end_points = inception_v4_base(inputs, scope=scope)

      # with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
      #                     stride=1, padding='SAME'):
      #   # Auxiliary Head logits
      #   if create_aux_logits and num_classes:
      #     with tf.variable_scope('AuxLogits'):
      #       # 17 x 17 x 1024
      #       aux_logits = end_points['Mixed_6h']
      #       aux_logits = slim.avg_pool2d(aux_logits, [5, 5], stride=3,
      #                                    padding='VALID',
      #                                    scope='AvgPool_1a_5x5')
      #       aux_logits = slim.conv2d(aux_logits, 128, [1, 1],
      #                                scope='Conv2d_1b_1x1')
      #       aux_logits = slim.conv2d(aux_logits, 768,
      #                                aux_logits.get_shape()[1:3],
      #                                padding='VALID', scope='Conv2d_2a')
      #       aux_logits = slim.flatten(aux_logits)
      #       aux_logits = slim.fully_connected(aux_logits, num_classes,
      #                                         activation_fn=None,
      #                                         scope='Aux_logits')
      #       end_points['AuxLogits'] = aux_logits

      #   # Final pooling and prediction
      #   # TODO(sguada,arnoegw): Consider adding a parameter global_pool which
      #   # can be set to False to disable pooling here (as in resnet_*()).
      #   with tf.variable_scope('Logits'):
      #     # 8 x 8 x 1536
      #     kernel_size = net.get_shape()[1:3]
      #     if kernel_size.is_fully_defined():
      #       net = slim.avg_pool2d(net, kernel_size, padding='VALID',
      #                             scope='AvgPool_1a')
      #     else:
      #       net = tf.reduce_mean(net, [1, 2], keep_dims=True,
      #                            name='global_pool')
      #     end_points['global_pool'] = net
      #     if not num_classes:
      #       return net, end_points
      #     # 1 x 1 x 1536
      #     net = slim.dropout(net, dropout_keep_prob, scope='Dropout_1b')
      #     net = slim.flatten(net, scope='PreLogitsFlatten')
      #     end_points['PreLogitsFlatten'] = net
      #     # 1536
      #     logits = slim.fully_connected(net, num_classes, activation_fn=None,
      #                                   scope='Logits')
      #     end_points['Logits'] = logits
      #     end_points['Predictions'] = tf.nn.softmax(logits, name='Predictions')
    return net, end_points
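
As committed, the auxiliary and final logits heads above are commented out, so the function returns the raw inception_v4_base outputs; used that way it acts as a feature extractor, e.g. in this sketch:

# Sketch: with the head disabled, inception_v4 yields base-network features
# (8x8x1536 at the final base endpoint for 299x299 inputs).
images = tf.compat.v1.placeholder(tf.float32, [None, 299, 299, 3])
features, end_points = inception_v4(images, is_training=False)
pooled = tf.reduce_mean(features, axis=[1, 2])  # [batch, 1536] global features
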
Example #16
def encode_effect(states, contexts, use_relation, use_point_cloud,
                  dim_fc_state, dim_fc_context):
    """Encode the effect feature.

    Args:
        states: The state as a dict.
        contexts: The context data. Set to None if no contexts are used.
        use_relation: True if use relation encoding.
        use_point_cloud: True if point cloud data is used.
        dim_fc_state: Dimension of state encoding.
        dim_fc_context: Dimension of context encoding.

    Returns:
        A tensor of shape [batch_size, dim_fc_state].
    """
    positions = states['position']
    body_masks = states['body_mask']
    num_bodies = int(body_masks.shape[-1])

    with slim.arg_scope([slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        normalizer_fn=NORMALIZER_FN,
                        normalizer_params=NORMALIZER_PARAMS):

        features = []

        with tf.compat.v1.variable_scope('encode_position'):
            position_feats = slim.fully_connected(positions,
                                                  dim_fc_state,
                                                  scope='fc')
            features.append(position_feats)

        if use_relation:
            with tf.compat.v1.variable_scope('encode_relation'):
                relation_feats = encode_relation(positions,
                                                 body_masks,
                                                 dim_fc_state=dim_fc_state)
                features.append(relation_feats)

        if use_point_cloud:
            cloud_feats = states['cloud_feat']
            features.append(cloud_feats)

        if contexts is not None:
            with tf.compat.v1.variable_scope('encode_context'):
                context_feats = slim.fully_connected(contexts,
                                                     dim_fc_context,
                                                     scope='fc')
                context_feats = tf.tile(tf.expand_dims(context_feats, 1),
                                        [1, num_bodies, 1])
                features.append(context_feats)

        net = tf.concat(features, axis=-1)
        net = slim.repeat(net,
                          2,
                          slim.fully_connected,
                          dim_fc_state,
                          scope='fc')
        effects = tf.identity(net, 'effects')

    return effects
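
A sketch of invoking the encoder; the NORMALIZER_FN/NORMALIZER_PARAMS globals must already be defined, and the state layout below is an assumption inferred from the code above:

# Hypothetical call: a batch of 4 scenes with 3 bodies each, xyz positions.
states = {
    'position': tf.zeros([4, 3, 3]),   # [batch, num_bodies, 3]
    'body_mask': tf.ones([4, 3]),      # [batch, num_bodies]
}
effects = encode_effect(states, contexts=None,
                        use_relation=False, use_point_cloud=False,
                        dim_fc_state=128, dim_fc_context=32)
# effects: per-body features with dim_fc_state channels.
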
Example #17
    def prediction_layers(
        self,
        features,
        end_points,
        input_shape,
        scope="pose",
        reuse=None,
    ):
        net_type = self.cfg["net_type"]
        if self.cfg["multi_stage"]:  # MuNet! (multi_stage decoder + multi_fusion)
            # Defining multi_fusion backbone
            num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
            layer_name = (
                "resnet_v1_{}".format(num_layers) + "/block{}/unit_{}/bottleneck_v1"
            )
            mid_pt_block1 = layer_name.format(1, 3)
            mid_pt_block2 = layer_name.format(2, 3)

            final_dims = tf.math.ceil(
                tf.divide(input_shape[1:3], tf.convert_to_tensor(16))
            )

            interim_dims_s8 = tf.scalar_mul(2, final_dims)
            interim_dims_s8 = tf.cast(interim_dims_s8, tf.int32)
            interim_dims_s4 = tf.scalar_mul(2, interim_dims_s8)
            interim_dims_s4 = tf.cast(interim_dims_s4, tf.int32)

            bank_1 = end_points[mid_pt_block1]
            bank_2 = end_points[mid_pt_block2]

            bank_2_s8 = tf.compat.v1.image.resize_images(bank_2, interim_dims_s8)
            bank_1_s4 = tf.compat.v1.image.resize_images(bank_1, interim_dims_s4)

            with slim.arg_scope(
                [slim.conv2d],
                padding="SAME",
                normalizer_fn=slim.layers.batch_norm,
                activation_fn=tf.nn.relu,
                weights_regularizer=slim.l2_regularizer(self.cfg["weight_decay"]),
            ):
                with tf.compat.v1.variable_scope("decoder_filters"):
                    bank_2_s16 = slim.conv2d(
                        bank_2_s8,
                        512,
                        kernel_size=[3, 3],
                        stride=2,
                        scope="decoder_parallel_1",
                    )
                    bank_2_s16 = slim.conv2d(
                        bank_2_s16,
                        128,
                        kernel_size=[1, 1],
                        stride=1,
                        scope="decoder_parallel_2",
                    )

                    bank_1_s8 = slim.conv2d(
                        bank_1_s4,
                        256,
                        kernel_size=[3, 3],
                        stride=2,
                        scope="decoder_parallel_3",
                    )
                    bank_1_s16 = slim.conv2d(
                        bank_1_s8,
                        256,
                        kernel_size=[3, 3],
                        stride=2,
                        scope="decoder_parallel_4",
                    )
                    bank_1_s16 = slim.conv2d(
                        bank_1_s16,
                        128,
                        kernel_size=[1, 1],
                        stride=1,
                        scope="decoder_parallel_5",
                    )

            with slim.arg_scope(
                [slim.conv2d_transpose],
                padding="SAME",
                normalizer_fn=None,
                weights_regularizer=slim.l2_regularizer(self.cfg["weight_decay"]),
            ):
                with tf.compat.v1.variable_scope("upsampled_features"):

                    concat_3_s16 = tf.concat([bank_1_s16, bank_2_s16, features], 3)

                    if self.cfg["stride"] == 8:
                        net = concat_3_s16

                    elif self.cfg["stride"] == 4:
                        upsampled_features_2x = slim.conv2d_transpose(
                            concat_3_s16,
                            self.cfg.get("bank3", 128),
                            kernel_size=[3, 3],
                            stride=2,
                            scope="block3",
                        )
                        net = upsampled_features_2x

                    elif self.cfg["stride"] == 2:
                        upsampled_features_2x = slim.conv2d_transpose(
                            concat_3_s16,
                            self.cfg.get("bank3", 128),
                            kernel_size=[3, 3],
                            stride=2,
                            scope="block3",
                        )
                        upsampled_features_4x = slim.conv2d_transpose(
                            upsampled_features_2x,
                            self.cfg.get("bank5", 128),
                            kernel_size=[3, 3],
                            stride=2,
                            scope="block4",
                        )
                        net = upsampled_features_4x

            out = {}
            # Attaching multi-stage decoder
            with tf.compat.v1.variable_scope(scope, reuse=reuse):
                stage1_hm_out = prediction_layer(
                    self.cfg,
                    net,
                    "part_pred_s1",
                    self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                )

                if self.cfg["location_refinement"]:
                    out["locref"] = prediction_layer(
                        self.cfg, net, "locref_pred", self.cfg["num_joints"] * 2
                    )
                if (
                    self.cfg["pairwise_predict"]
                    and "multi-animal" not in self.cfg["dataset_type"]
                ):
                    out["pairwise_pred"] = prediction_layer(
                        self.cfg,
                        net,
                        "pairwise_pred",
                        self.cfg["num_joints"] * (self.cfg["num_joints"] - 1) * 2,
                    )
                if (
                    self.cfg["partaffinityfield_predict"]
                    and "multi-animal" in self.cfg["dataset_type"]
                ):
                    feature = slim.conv2d_transpose(
                        net, self.cfg.get("bank3", 128), kernel_size=[3, 3], stride=2
                    )

                    stage1_paf_out = prediction_layer(
                        self.cfg, net, "pairwise_pred_s1", self.cfg["num_limbs"] * 2
                    )

                    stage2_in = tf.concat([stage1_hm_out, stage1_paf_out, feature], 3)
                    stage_input = stage2_in
                    stage_paf_output = stage1_paf_out
                    stage_hm_output = stage1_hm_out

                    for i in range(2, 5):
                        pre_stage_paf_output = stage_paf_output
                        pre_stage_hm_output = stage_hm_output

                        stage_paf_output = prediction_layer_stage(
                            self.cfg,
                            stage_input,
                            f"pairwise_pred_s{i}",
                            self.cfg["num_limbs"] * 2,
                        )

                        stage_hm_output = prediction_layer_stage(
                            self.cfg,
                            stage_input,
                            f"part_pred_s{i}",
                            self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                        )

                        if i > 2:
                            # stage_paf_output = stage_paf_output + pre_stage_paf_output
                            stage_hm_output = stage_hm_output + pre_stage_hm_output

                        stage_input = tf.concat(
                            [stage_hm_output, stage_paf_output, feature], 3
                        )

                    out["part_pred"] = prediction_layer_stage(
                        self.cfg,
                        stage_input,
                        "part_pred",
                        self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                    )

                    out["pairwise_pred"] = prediction_layer_stage(
                        self.cfg,
                        stage_input,
                        "pairwise_pred",
                        self.cfg["num_limbs"] * 2,
                    )

                if self.cfg["intermediate_supervision"]:
                    interm_name = layer_name.format(
                        3, self.cfg["intermediate_supervision_layer"]
                    )
                    block_interm_out = end_points[interm_name]
                    out["part_pred_interm"] = prediction_layer(
                        self.cfg,
                        block_interm_out,
                        "intermediate_supervision",
                        self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                    )

        else:  # dual fusion net (for stride 4 experiments)
            if "resnet" in net_type:
                num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
                layer_name = "resnet_v1_{}/block{}/unit_{}/bottleneck_v1"
                mid_pt = layer_name.format(num_layers, 2, 3)
            elif "mobilenet" in net_type:
                mid_pt = "layer_7"
            elif "efficientnet" in net_type:
                mid_pt = f"block_{parallel_layers[net_type.split('-')[1]]}"
            else:
                raise ValueError(f"Unknown network of type {net_type}")

            final_dims = tf.math.ceil(
                tf.divide(input_shape[1:3], tf.convert_to_tensor(value=16))
            )
            interim_dims = tf.scalar_mul(2, final_dims)
            interim_dims = tf.cast(interim_dims, tf.int32)
            bank_3 = end_points[mid_pt]
            bank_3 = tf.image.resize(bank_3, interim_dims)

            with slim.arg_scope(
                [slim.conv2d],
                padding="SAME",
                normalizer_fn=None,
                weights_regularizer=tf.keras.regularizers.l2(
                    0.5 * (self.cfg["weight_decay"])
                ),
            ):
                with tf.compat.v1.variable_scope("decoder_filters"):
                    bank_3 = slim.conv2d(
                        bank_3,
                        self.cfg.get("bank3", 128),
                        1,
                        scope="decoder_parallel_1",
                    )

            with slim.arg_scope(
                [slim.conv2d_transpose],
                padding="SAME",
                normalizer_fn=None,
                weights_regularizer=tf.keras.regularizers.l2(
                    0.5 * (self.cfg["weight_decay"])
                ),
            ):
                with tf.compat.v1.variable_scope("upsampled_features"):
                    upsampled_features = slim.conv2d_transpose(
                        features,
                        self.cfg.get("bank5", 128),
                        kernel_size=[3, 3],
                        stride=2,
                        scope="block4",
                    )
            net = tf.concat([bank_3, upsampled_features], 3)
            out = super(PoseMultiNet, self).prediction_layers(
                net,
                scope,
                reuse,
            )
            with tf.compat.v1.variable_scope(scope, reuse=reuse):
                if (
                    self.cfg["intermediate_supervision"]
                    and "efficientnet" not in net_type
                ):
                    if "mobilenet" in net_type:
                        feat = end_points[
                            f"layer_{self.cfg['intermediate_supervision_layer']}"
                        ]
                    elif "resnet" in net_type:
                        layer_name = "resnet_v1_{}/block{}/unit_{}/bottleneck_v1"
                        num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
                        interm_name = layer_name.format(
                            num_layers, 3, self.cfg["intermediate_supervision_layer"]
                        )
                        feat = end_points[interm_name]
                    else:
                        return out
                    out["part_pred_interm"] = prediction_layer(
                        self.cfg,
                        feat,
                        "intermediate_supervision",
                        self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                    )
        return out
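The staged head above follows the OpenPose-style refinement pattern: every stage re-predicts heatmaps and part-affinity fields from the concatenation of the previous stage's outputs with a shared feature bank, and heatmaps are accumulated residually from stage 3 on. A minimal, self-contained sketch of that wiring (tf_slim assumed; the 1x1-conv head and the channel counts are hypothetical stand-ins for prediction_layer_stage and the cfg values):

import tensorflow.compat.v1 as tf
import tf_slim as slim

tf.disable_v2_behavior()

NUM_JOINTS, NUM_LIMBS = 17, 16  # hypothetical sizes


def _stage_head(inputs, channels, scope):
    # Stand-in for prediction_layer_stage: a bare 1x1 conv head.
    return slim.conv2d(inputs, channels, [1, 1], activation_fn=None,
                       normalizer_fn=None, scope=scope)


def cascaded_refinement(feature, num_stages=4):
    hm = _stage_head(feature, NUM_JOINTS, 'part_pred_s1')
    paf = _stage_head(feature, NUM_LIMBS * 2, 'pairwise_pred_s1')
    stage_input = tf.concat([hm, paf, feature], 3)
    for i in range(2, num_stages + 1):
        prev_hm = hm
        paf = _stage_head(stage_input, NUM_LIMBS * 2, 'pairwise_pred_s%d' % i)
        hm = _stage_head(stage_input, NUM_JOINTS, 'part_pred_s%d' % i)
        if i > 2:
            hm += prev_hm  # residual heatmap accumulation, as above
        stage_input = tf.concat([hm, paf, feature], 3)
    return hm, paf


feature = tf.placeholder(tf.float32, [None, 52, 52, 128])
heatmaps, pafs = cascaded_refinement(feature)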
def conv_hyperparams_fn():
    with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm), \
         slim.arg_scope([slim.batch_norm], is_training=False) as sc:
        return sc
  def extract_features(self,
                       preprocessed_inputs,
                       state_saver=None,
                       state_name='lstm_state',
                       unroll_length=5,
                       scope=None):
    """Extracts features from preprocessed inputs.

    The features include the base network features, lstm features and SSD
    features, organized in the following name scope:

    <parent scope>/MobilenetV1/...
    <parent scope>/LSTM/...
    <parent scope>/FeatureMaps/...

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float tensor
        representing a batch of consecutive frames from video clips.
      state_saver: A state saver object with methods `state` and `save_state`.
      state_name: A python string for the name to use with the state_saver.
      unroll_length: The number of steps to unroll the lstm.
      scope: The scope for the base network of the feature extractor.

    Returns:
      A list of tensors where the ith tensor has shape [batch, height_i,
      width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        with slim.arg_scope([slim.batch_norm], fused=False):
          # Base network.
          with tf.variable_scope(
              scope, self._base_network_scope,
              reuse=self._reuse_weights) as scope:
            net, image_features = mobilenet_v1.mobilenet_v1_base(
                ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
                final_endpoint='Conv2d_13_pointwise',
                min_depth=self._min_depth,
                depth_multiplier=self._depth_multiplier,
                scope=scope)

    with slim.arg_scope(self._conv_hyperparams_fn()):
      with slim.arg_scope(
          [slim.batch_norm], fused=False, is_training=self._is_training):
        # ConvLSTM layers.
        batch_size = net.shape[0].value // unroll_length
        with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
          lstm_cell, init_state, _ = self.create_lstm_cell(
              batch_size,
              (net.shape[1].value, net.shape[2].value),
              state_saver,
              state_name,
              dtype=preprocessed_inputs.dtype)
          net_seq = list(tf.split(net, unroll_length))

          # Identities added for inputting state tensors externally.
          c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
          h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
          init_state = (c_ident, h_ident)

          net_seq, states_out = rnn_decoder.rnn_decoder(
              net_seq, init_state, lstm_cell, scope=lstm_scope)
          batcher_ops = None
          self._states_out = states_out
          if state_saver is not None:
            self._step = state_saver.state('%s_step' % state_name)
            batcher_ops = [
                state_saver.save_state('%s_c' % state_name, states_out[-1][0]),
                state_saver.save_state('%s_h' % state_name, states_out[-1][1]),
                state_saver.save_state('%s_step' % state_name, self._step + 1)
            ]
          with tf_ops.control_dependencies(batcher_ops):
            image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)

          # Identities added for reading output states, to be reused externally.
          tf.identity(states_out[-1][0], name='lstm_state_out_c')
          tf.identity(states_out[-1][1], name='lstm_state_out_h')

        # SSD layers.
        with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
          feature_maps = feature_map_generators.multi_resolution_feature_maps(
              feature_map_layout=self._feature_map_layout,
              depth_multiplier=(self._depth_multiplier),
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              image_features=image_features)

    return list(feature_maps.values())
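The lstm_state_in_*/lstm_state_out_* identities above give the recurrent state stable tensor names so it can be fed and fetched externally, e.g. when stepping a frozen graph frame by frame. A toy sketch of that pattern (assuming TF1-style execution via tensorflow.compat.v1):

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# Toy version of the pattern: name the recurrent state tensors so a
# caller can feed the previous output state back in across runs.
state_in = tf.placeholder(tf.float32, [1, 4], name='lstm_state_in_c')
state_out = tf.identity(state_in * 0.9 + 1.0, name='lstm_state_out_c')

with tf.Session() as sess:
    c = np.zeros((1, 4), np.float32)
    for _ in range(3):  # "frames": each step consumes the prior state
        c = sess.run('lstm_state_out_c:0',
                     feed_dict={'lstm_state_in_c:0': c})
print(c)  # converges toward the fixed point 10.0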
def conv_hyperparams_fn():
    # Chaining arg_scopes with `and` would silently keep only the second
    # scope, so enter both in a single with-statement instead.
    with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm), \
         slim.arg_scope([slim.batch_norm], decay=0.97, epsilon=1e-3) as sc:
        return sc
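Both conv_hyperparams_fn helpers return the captured arg_scope dictionary so a caller can re-enter the same hyperparameters later. A minimal usage sketch (assuming tf_slim as the slim module and the fixed helper above):

import tensorflow.compat.v1 as tf
import tf_slim as slim

tf.disable_v2_behavior()

inputs = tf.placeholder(tf.float32, [None, 32, 32, 3])
# Re-entering the returned scope applies normalizer_fn=slim.batch_norm
# (with decay=0.97, epsilon=1e-3) to every slim.conv2d inside.
with slim.arg_scope(conv_hyperparams_fn()):
    net = slim.conv2d(inputs, 16, [3, 3], scope='conv_demo')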
Example #21
def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              spatial_squeeze=True,
              reuse=None,
              scope=None):
    """Generator for v1 ResNet models.

    This function generates a family of ResNet v1 models. See the resnet_v1_*()
    methods for specific model instantiations, obtained by selecting different
    block instantiations that produce ResNets of various depths.

    Training for image classification on Imagenet is usually done with [224, 224]
    inputs, resulting in [7, 7] feature maps at the output of the last ResNet
    block for the ResNets defined in [1] that have nominal stride equal to 32.
    However, for dense prediction tasks we advise that one uses inputs with
    spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
    this case the feature maps at the ResNet output will have spatial shape
    [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
    and corners exactly aligned with the input image corners, which greatly
    facilitates alignment of the features to the image. Using as input [225, 225]
    images results in [8, 8] feature maps at the output of the last ResNet block.

    For dense prediction tasks, the ResNet needs to run in fully-convolutional
    (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
    have nominal stride equal to 32 and a good choice in FCN mode is to use
    output_stride=16 in order to increase the density of the computed features at
    small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

    Args:
      inputs: A tensor of size [batch, height_in, width_in, channels].
      blocks: A list of length equal to the number of ResNet blocks. Each element
        is a resnet_utils.Block object describing the units in the block.
      num_classes: Number of predicted classes for classification tasks. If None
        we return the features before the logit layer.
      is_training: whether is training or not.
      global_pool: If True, we perform global average pooling before computing the
        logits. Set to True for image classification, False for dense prediction.
      output_stride: If None, then the output will be computed at the nominal
        network stride. If output_stride is not None, it specifies the requested
        ratio of input to output spatial resolution.
      include_root_block: If True, include the initial convolution followed by
        max-pooling, if False excludes it.
      spatial_squeeze: if True, logits is of shape [B, C], if false logits is
          of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
      reuse: whether or not the network and its variables should be reused. To be
        able to reuse 'scope' must be given.
      scope: Optional variable_scope.

    Returns:
      net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
        If global_pool is False, then height_out and width_out are reduced by a
        factor of output_stride compared to the respective height_in and width_in,
        else both height_out and width_out equal one. If num_classes is None, then
        net is the output of the last ResNet block, potentially after global
        average pooling. If num_classes is not None, net contains the pre-softmax
        activations.
      end_points: A dictionary from components of the network to the corresponding
        activation.

    Raises:
      ValueError: If the target output_stride is not valid.
    """
    with tf.compat.v1.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope([slim.conv2d, bottleneck,
                             resnet_utils.stack_blocks_dense],
                            outputs_collections=end_points_collection):
            with slim.arg_scope([slim.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError('The output_stride needs to be a multiple of 4.')
                        output_stride /= 4
                    net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
                    net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')

                    net = slim.utils.collect_named_outputs(end_points_collection, 'pool2', net)

                net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)

                end_points = slim.utils.convert_collection_to_dict(end_points_collection)

                # end_points['pool2'] = end_points['resnet_v1_50/pool1/MaxPool:0']
                try:
                    end_points['pool3'] = end_points['resnet_v1_50/block1']
                    end_points['pool4'] = end_points['resnet_v1_50/block2']
                except KeyError:
                    end_points['pool3'] = end_points['Detection/resnet_v1_50/block1']
                    end_points['pool4'] = end_points['Detection/resnet_v1_50/block2']
                end_points['pool5'] = net
                # if global_pool:
                #     # Global average pooling.
                #     net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
                # if num_classes is not None:
                #     net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                #                       normalizer_fn=None, scope='logits')
                # if spatial_squeeze:
                #     logits = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
                # else:
                #     logits = net
                # # Convert end_points_collection into a dictionary of end_points.
                # end_points = slim.utils.convert_collection_to_dict(end_points_collection)
                # if num_classes is not None:
                #     end_points['predictions'] = slim.softmax(logits, scope='predictions')
                return net, end_points
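A quick check of the shape rule in the docstring: with a [321, 321] input and output_stride=16, the fully-convolutional output is [21, 21], with feature corners aligned to the image corners.

height_in, width_in, output_stride = 321, 321, 16
height_out = (height_in - 1) // output_stride + 1  # 21
width_out = (width_in - 1) // output_stride + 1    # 21
print(height_out, width_out)  # -> 21 21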
def extract_features(images,
                     model_options,
                     weight_decay=0.0001,
                     reuse=None,
                     is_training=False,
                     fine_tune_batch_norm=False,
                     nas_training_hyper_parameters=None):
    """Extracts features by the particular model_variant.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    weight_decay: The weight decay for model variables.
    reuse: Reuse the model variables or not.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    nas_training_hyper_parameters: A dictionary storing hyper-parameters for
      training nas models. Its keys are:
      - `drop_path_keep_prob`: Probability to keep each path in the cell when
        training.
      - `total_training_steps`: Total training steps to help drop path
        probability calculation.

  Returns:
    concat_logits: A tensor of size [batch, feature_height, feature_width,
      feature_channels], where feature_height/feature_width are determined by
      the images height/width and output_stride.
    end_points: A dictionary from components of the network to the corresponding
      activation.
  """
    features, end_points = feature_extractor.extract_features(
        images,
        output_stride=model_options.output_stride,
        multi_grid=model_options.multi_grid,
        model_variant=model_options.model_variant,
        depth_multiplier=model_options.depth_multiplier,
        divisible_by=model_options.divisible_by,
        weight_decay=weight_decay,
        reuse=reuse,
        is_training=is_training,
        preprocessed_images_dtype=model_options.preprocessed_images_dtype,
        fine_tune_batch_norm=fine_tune_batch_norm,
        nas_architecture_options=model_options.nas_architecture_options,
        nas_training_hyper_parameters=nas_training_hyper_parameters,
        use_bounded_activation=model_options.use_bounded_activation)

    if not model_options.aspp_with_batch_norm:
        return features, end_points
    else:
        if model_options.dense_prediction_cell_config is not None:
            tf.logging.info('Using dense prediction cell config.')
            dense_prediction_layer = dense_prediction_cell.DensePredictionCell(
                config=model_options.dense_prediction_cell_config,
                hparams={
                    'conv_rate_multiplier': 16 // model_options.output_stride,
                })
            concat_logits = dense_prediction_layer.build_cell(
                features,
                output_stride=model_options.output_stride,
                crop_size=model_options.crop_size,
                image_pooling_crop_size=model_options.image_pooling_crop_size,
                weight_decay=weight_decay,
                reuse=reuse,
                is_training=is_training,
                fine_tune_batch_norm=fine_tune_batch_norm)
            return concat_logits, end_points
        else:
            # The following code employs the DeepLabv3 ASPP module. Note that
            # we could express the ASPP module as one particular dense
            # prediction cell architecture; we do not do so, but keep this
            # code for backward compatibility.
            batch_norm_params = utils.get_batch_norm_params(
                decay=0.9997,
                epsilon=1e-5,
                scale=True,
                is_training=(is_training and fine_tune_batch_norm),
                sync_batch_norm_method=model_options.sync_batch_norm_method)
            batch_norm = utils.get_batch_norm_fn(
                model_options.sync_batch_norm_method)
            activation_fn = (tf.nn.relu6
                             if model_options.use_bounded_activation else
                             tf.nn.relu)
            with slim.arg_scope(
                [slim.conv2d, slim.separable_conv2d],
                    weights_regularizer=slim.l2_regularizer(weight_decay),
                    activation_fn=activation_fn,
                    normalizer_fn=batch_norm,
                    padding='SAME',
                    stride=1,
                    reuse=reuse):
                with slim.arg_scope([batch_norm], **batch_norm_params):
                    depth = model_options.aspp_convs_filters
                    branch_logits = []

                    if model_options.add_image_level_feature:
                        if model_options.crop_size is not None:
                            image_pooling_crop_size = model_options.image_pooling_crop_size
                            # If image_pooling_crop_size is not specified, use crop_size.
                            if image_pooling_crop_size is None:
                                image_pooling_crop_size = model_options.crop_size
                            pool_height = scale_dimension(
                                image_pooling_crop_size[0],
                                1. / model_options.output_stride)
                            pool_width = scale_dimension(
                                image_pooling_crop_size[1],
                                1. / model_options.output_stride)
                            image_feature = slim.avg_pool2d(
                                features, [pool_height, pool_width],
                                model_options.image_pooling_stride,
                                padding='VALID')
                            resize_height = scale_dimension(
                                model_options.crop_size[0],
                                1. / model_options.output_stride)
                            resize_width = scale_dimension(
                                model_options.crop_size[1],
                                1. / model_options.output_stride)
                        else:
                            # If crop_size is None, we simply do global pooling.
                            pool_height = tf.shape(features)[1]
                            pool_width = tf.shape(features)[2]
                            image_feature = tf.reduce_mean(features,
                                                           axis=[1, 2],
                                                           keepdims=True)
                            resize_height = pool_height
                            resize_width = pool_width
                        image_feature_activation_fn = tf.nn.relu
                        image_feature_normalizer_fn = batch_norm
                        if model_options.aspp_with_squeeze_and_excitation:
                            image_feature_activation_fn = tf.nn.sigmoid
                            if model_options.image_se_uses_qsigmoid:
                                image_feature_activation_fn = utils.q_sigmoid
                            image_feature_normalizer_fn = None
                        image_feature = slim.conv2d(
                            image_feature,
                            depth,
                            1,
                            activation_fn=image_feature_activation_fn,
                            normalizer_fn=image_feature_normalizer_fn,
                            scope=IMAGE_POOLING_SCOPE)
                        image_feature = _resize_bilinear(
                            image_feature, [resize_height, resize_width],
                            image_feature.dtype)
                        # Set shape for resize_height/resize_width if they are not Tensor.
                        if isinstance(resize_height, tf.Tensor):
                            resize_height = None
                        if isinstance(resize_width, tf.Tensor):
                            resize_width = None
                        image_feature.set_shape(
                            [None, resize_height, resize_width, depth])
                        if not model_options.aspp_with_squeeze_and_excitation:
                            branch_logits.append(image_feature)

                    # Employ a 1x1 convolution.
                    branch_logits.append(
                        slim.conv2d(features,
                                    depth,
                                    1,
                                    scope=ASPP_SCOPE + str(0)))

                    if model_options.atrous_rates:
                        # Employ 3x3 convolutions with different atrous rates.
                        for i, rate in enumerate(model_options.atrous_rates,
                                                 1):
                            scope = ASPP_SCOPE + str(i)
                            if model_options.aspp_with_separable_conv:
                                aspp_features = split_separable_conv2d(
                                    features,
                                    filters=depth,
                                    rate=rate,
                                    weight_decay=weight_decay,
                                    scope=scope)
                            else:
                                aspp_features = slim.conv2d(features,
                                                            depth,
                                                            3,
                                                            rate=rate,
                                                            scope=scope)
                            branch_logits.append(aspp_features)

                    # Merge branch logits.
                    concat_logits = tf.concat(branch_logits, 3)
                    if model_options.aspp_with_concat_projection:
                        concat_logits = slim.conv2d(
                            concat_logits,
                            depth,
                            1,
                            scope=CONCAT_PROJECTION_SCOPE)
                        concat_logits = slim.dropout(
                            concat_logits,
                            keep_prob=0.9,
                            is_training=is_training,
                            scope=CONCAT_PROJECTION_SCOPE + '_dropout')
                    if (model_options.add_image_level_feature and
                            model_options.aspp_with_squeeze_and_excitation):
                        concat_logits *= image_feature

                    return concat_logits, end_points
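The pool and resize sizes in the image-level branch above all come from scale_dimension. With the common 513x513 crop and output_stride=16, the image-pooling window is 33x33; a restatement of that arithmetic (matching the usual DeepLab definition of scale_dimension, assumed here):

def scale_dimension(dim, scale):
    # Usual DeepLab definition (assumed here): align-corners scaling.
    return int((float(dim) - 1.0) * scale + 1.0)

crop, output_stride = 513, 16
pool = scale_dimension(crop, 1.0 / output_stride)
print(pool)  # 33: the avg_pool2d window and the resize target above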
def encoder(images, style_size=8, keep_prob=1.0, phase_train=True, weight_decay=0.0, reuse=None, scope='Encoders'):
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        # weights_initializer=tf.contrib.layers.xavier_initializer(),
                        weights_initializer=tf.compat.v1.keras.initializers.VarianceScaling(scale=2.0),
                        weights_regularizer=tf.keras.regularizers.l2(0.5 * (weight_decay))):
            with slim.arg_scope([slim.dropout, slim.batch_norm], is_training=phase_train):
                with slim.arg_scope([slim.fully_connected],
                    normalizer_fn=layer_norm, normalizer_params=None):
                    print('{} input shape:'.format(scope), [dim.value for dim in images.shape])

                    batch_size = tf.shape(input=images)[0]
                    k = 64


                    with tf.compat.v1.variable_scope('StyleEncoder'):
                        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose, slim.fully_connected],
                            normalizer_fn=None, normalizer_params=None):
                            
                            print('-- StyleEncoder')

                            net = images

                            net = conv(net, k, 7, stride=1, pad=3, scope='conv0')
                            print('module conv0 shape:', [dim.value for dim in net.shape])

                            net = conv(net, 2*k, 4, stride=2, scope='conv1')
                            print('module conv1 shape:', [dim.value for dim in net.shape])

                            net = conv(net, 4*k, 4, stride=2, scope='conv2')
                            print('module conv2 shape:', [dim.value for dim in net.shape])
     

                            encoded_style = net

                            net = slim.avg_pool2d(net, net.shape[1:3], padding='VALID', scope='global_pool')
                            net = slim.flatten(net)

                            style_vec = slim.fully_connected(net, style_size, activation_fn=None, normalizer_fn=None, scope='fc1')
                            print('module fc1 shape:', [dim.value for dim in style_vec.shape])
                            style_vec = tf.identity(style_vec, name='style_vec')


                    #  Transform textures
                    with tf.compat.v1.variable_scope('ContentEncoder'):
                        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose, slim.fully_connected],
                                normalizer_fn=instance_norm, normalizer_params=None):
                            print('-- ContentEncoder')
                            net = images

                            net = conv(net, k, 7, stride=1, pad=3, scope='conv0')
                            print('module conv0 shape:', [dim.value for dim in net.shape])

                            net = conv(net, 2*k, 4, stride=2, scope='conv1')
                            print('module conv1 shape:', [dim.value for dim in net.shape])

                            net = conv(net, 4*k, 4, stride=2, scope='conv2')
                            print('module conv2 shape:', [dim.value for dim in net.shape])
                            
                            for i in range(3):
                                net_ = conv(net, 4*k, 3, scope='res{}_0'.format(i))
                                net += conv(net_, 4*k, 3, activation_fn=None, biases_initializer=None, scope='res{}_1'.format(i))
                                print('module res{} shape:'.format(i), [dim.value for dim in net.shape])

                            encoded = net
                        
                    return encoded, style_vec
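A hedged usage sketch for the encoder above (it assumes the conv, layer_norm and instance_norm helpers referenced in its body are defined in the same module): with k=64 and two stride-2 convolutions, a 128x128 input yields a [N, 32, 32, 256] content code and an [N, 8] style vector.

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

images = tf.placeholder(tf.float32, [None, 128, 128, 3])
encoded, style_vec = encoder(images, style_size=8, phase_train=True)
# encoded: [None, 32, 32, 256]; style_vec: [None, 8]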
def refine_by_decoder(features,
                      end_points,
                      crop_size=None,
                      decoder_output_stride=None,
                      decoder_use_separable_conv=False,
                      decoder_use_sum_merge=False,
                      decoder_filters=256,
                      decoder_output_is_logits=False,
                      model_variant=None,
                      weight_decay=0.0001,
                      reuse=None,
                      is_training=False,
                      fine_tune_batch_norm=False,
                      use_bounded_activation=False,
                      sync_batch_norm_method='None'):
    """Adds the decoder to obtain sharper segmentation results.

  Args:
    features: A tensor of size [batch, features_height, features_width,
      features_channels].
    end_points: A dictionary from components of the network to the corresponding
      activation.
    crop_size: A tuple [crop_height, crop_width] specifying whole patch crop
      size.
    decoder_output_stride: A list of integers specifying the output stride of
      low-level features used in the decoder module.
    decoder_use_separable_conv: Employ separable convolution for decoder or not.
    decoder_use_sum_merge: Boolean, decoder uses simple sum merge or not.
    decoder_filters: Integer, decoder filter size.
    decoder_output_is_logits: Boolean, using decoder output as logits or not.
    model_variant: Model variant for feature extraction.
    weight_decay: The weight decay for model variables.
    reuse: Reuse the model variables or not.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    use_bounded_activation: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.
    sync_batch_norm_method: String, method used to sync batch norm. Currently
     only support `None` (no sync batch norm) and `tpu` (use tpu code to
     sync batch norm).

  Returns:
    Decoder output with size [batch, decoder_height, decoder_width,
      decoder_channels].

  Raises:
    ValueError: If crop_size is None.
  """
    if crop_size is None:
        raise ValueError('crop_size must be provided when using decoder.')
    batch_norm_params = utils.get_batch_norm_params(
        decay=0.9997,
        epsilon=1e-5,
        scale=True,
        is_training=(is_training and fine_tune_batch_norm),
        sync_batch_norm_method=sync_batch_norm_method)
    batch_norm = utils.get_batch_norm_fn(sync_batch_norm_method)
    decoder_depth = decoder_filters
    projected_filters = 48
    if decoder_use_sum_merge:
        # When using sum merge, the projected filters must be equal to decoder
        # filters.
        projected_filters = decoder_filters
    if decoder_output_is_logits:
        # Overwrite the setting when decoder output is logits.
        activation_fn = None
        normalizer_fn = None
        conv2d_kernel = 1
        # Use original conv instead of separable conv.
        decoder_use_separable_conv = False
    else:
        # Default setting when decoder output is not logits.
        activation_fn = tf.nn.relu6 if use_bounded_activation else tf.nn.relu
        normalizer_fn = batch_norm
        conv2d_kernel = 3
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        activation_fn=activation_fn,
                        normalizer_fn=normalizer_fn,
                        padding='SAME',
                        stride=1,
                        reuse=reuse):
        with slim.arg_scope([batch_norm], **batch_norm_params):
            with tf.variable_scope(DECODER_SCOPE, DECODER_SCOPE, [features]):
                decoder_features = features
                decoder_stage = 0
                scope_suffix = ''
                for output_stride in decoder_output_stride:
                    feature_list = feature_extractor.networks_to_feature_maps[
                        model_variant][feature_extractor.
                                       DECODER_END_POINTS][output_stride]
                    # If only one decoder stage, we do not change the scope
                    # name in order for backward compatibility.
                    if decoder_stage:
                        scope_suffix = '_{}'.format(decoder_stage)
                    for i, name in enumerate(feature_list):
                        decoder_features_list = [decoder_features]
                        # MobileNet and NAS variants use different naming convention.
                        if ('mobilenet' in model_variant
                                or model_variant.startswith('mnas')
                                or model_variant.startswith('nas')):
                            feature_name = name
                        else:
                            feature_name = '{}/{}'.format(
                                feature_extractor.name_scope[model_variant],
                                name)
                        decoder_features_list.append(
                            slim.conv2d(end_points[feature_name],
                                        projected_filters,
                                        1,
                                        scope='feature_projection' + str(i) +
                                        scope_suffix))
                        # Determine the output size.
                        decoder_height = scale_dimension(
                            crop_size[0], 1.0 / output_stride)
                        decoder_width = scale_dimension(
                            crop_size[1], 1.0 / output_stride)
                        # Resize to decoder_height/decoder_width.
                        for j, feature in enumerate(decoder_features_list):
                            decoder_features_list[j] = _resize_bilinear(
                                feature, [decoder_height, decoder_width],
                                feature.dtype)
                            h = (None if isinstance(decoder_height, tf.Tensor)
                                 else decoder_height)
                            w = (None if isinstance(decoder_width, tf.Tensor)
                                 else decoder_width)
                            decoder_features_list[j].set_shape(
                                [None, h, w, None])
                        if decoder_use_sum_merge:
                            decoder_features = _decoder_with_sum_merge(
                                decoder_features_list,
                                decoder_depth,
                                conv2d_kernel=conv2d_kernel,
                                decoder_use_separable_conv=
                                decoder_use_separable_conv,
                                weight_decay=weight_decay,
                                scope_suffix=scope_suffix)
                        else:
                            if not decoder_use_separable_conv:
                                scope_suffix = str(i) + scope_suffix
                            decoder_features = _decoder_with_concat_merge(
                                decoder_features_list,
                                decoder_depth,
                                decoder_use_separable_conv=
                                decoder_use_separable_conv,
                                weight_decay=weight_decay,
                                scope_suffix=scope_suffix)
                    decoder_stage += 1
                return decoder_features
def deconv(x, *args, pad=1, **kwargs):
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], padding='VALID'):
        x = padding(x, pad)
        return slim.conv2d_transpose(x, *args, **kwargs)
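deconv pre-pads spatially and then applies a VALID transposed convolution; padding() is an external helper not shown here. A plausible stand-in (an assumption, not the original):

import tensorflow.compat.v1 as tf

def padding(x, pad, mode='REFLECT'):
    # Hypothetical stand-in for the external helper: pad height and
    # width by `pad` on each side, leaving batch/channels untouched.
    return tf.pad(x, [[0, 0], [pad, pad], [pad, pad], [0, 0]], mode=mode)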
def get_branch_logits(features,
                      num_classes,
                      atrous_rates=None,
                      aspp_with_batch_norm=False,
                      kernel_size=1,
                      weight_decay=0.0001,
                      reuse=None,
                      scope_suffix=''):
    """Gets the logits from each model's branch.

  The underlying model is branched out in the last layer when atrous
  spatial pyramid pooling is employed, and all branches are sum-merged
  to form the final logits.

  Args:
    features: A float tensor of shape [batch, height, width, channels].
    num_classes: Number of classes to predict.
    atrous_rates: A list of atrous convolution rates for last layer.
    aspp_with_batch_norm: Use batch normalization layers for ASPP.
    kernel_size: Kernel size for convolution.
    weight_decay: Weight decay for the model variables.
    reuse: Reuse model variables or not.
    scope_suffix: Scope suffix for the model variables.

  Returns:
    Merged logits with shape [batch, height, width, num_classes].

  Raises:
    ValueError: Upon invalid input kernel_size value.
  """
    # When using batch normalization with ASPP, ASPP has been applied before
    # in extract_features, and thus we simply apply 1x1 convolution here.
    if aspp_with_batch_norm or atrous_rates is None:
        if kernel_size != 1:
            raise ValueError(
                'Kernel size must be 1 when atrous_rates is None or '
                'using aspp_with_batch_norm. Got %d.' % kernel_size)
        atrous_rates = [1]

    with slim.arg_scope(
        [slim.conv2d],
            weights_regularizer=slim.l2_regularizer(weight_decay),
            weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
            reuse=reuse):
        with tf.variable_scope(LOGITS_SCOPE_NAME, LOGITS_SCOPE_NAME,
                               [features]):
            branch_logits = []
            for i, rate in enumerate(atrous_rates):
                scope = scope_suffix
                if i:
                    scope += '_%d' % i

                branch_logits.append(
                    slim.conv2d(features,
                                num_classes,
                                kernel_size=kernel_size,
                                rate=rate,
                                activation_fn=None,
                                normalizer_fn=None,
                                scope=scope))

            return tf.add_n(branch_logits)
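A hedged usage sketch (assuming the module-level LOGITS_SCOPE_NAME constant this function references): with atrous_rates=[6, 12, 18] and batch-norm ASPP disabled, three dilated 3x3 branches are built and sum-merged.

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

features = tf.placeholder(tf.float32, [None, 65, 65, 256])
logits = get_branch_logits(features,
                           num_classes=21,
                           atrous_rates=[6, 12, 18],
                           aspp_with_batch_norm=False,
                           kernel_size=3,
                           scope_suffix='semantic')
# logits: [None, 65, 65, 21], the element-wise sum of three branches.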
Example #27
def conv_hyperparams_fn(self):
    with slim.arg_scope([]) as sc:
        return sc
Example #28
def inception_v4_base(inputs, final_endpoint='Mixed_7d', scope=None):
  """Creates the Inception V4 network up to the given final endpoint.

  Args:
    inputs: a 4-D tensor of size [batch_size, height, width, 3].
    final_endpoint: specifies the endpoint to construct the network up to.
      It can be one of [ 'Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
      'Mixed_3a', 'Mixed_4a', 'Mixed_5a', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d',
      'Mixed_5e', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', 'Mixed_6e',
      'Mixed_6f', 'Mixed_6g', 'Mixed_6h', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c',
      'Mixed_7d']
    scope: Optional variable_scope.

  Returns:
    logits: the logits outputs of the model.
    end_points: the set of end_points from the inception model.

  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values,
  """
  end_points = {}

  def add_and_check_final(name, net):
    end_points[name] = net
    return name == final_endpoint

  with tf.variable_scope(scope, 'InceptionV4', [inputs]):
    with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                        stride=1, padding='SAME'):
      # 299 x 299 x 3
      net = slim.conv2d(inputs, 32, [3, 3], stride=2,
                        padding='VALID', scope='Conv2d_1a_3x3')

      end_points["pool1"] = net

      if add_and_check_final('Conv2d_1a_3x3', net): return net, end_points
      # 149 x 149 x 32
      net = slim.conv2d(net, 32, [3, 3], padding='VALID',
                        scope='Conv2d_2a_3x3')
      if add_and_check_final('Conv2d_2a_3x3', net): return net, end_points
      # 147 x 147 x 32
      net = slim.conv2d(net, 64, [3, 3], scope='Conv2d_2b_3x3')
      if add_and_check_final('Conv2d_2b_3x3', net): return net, end_points
      # 147 x 147 x 64
      with tf.variable_scope('Mixed_3a'):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
                                     scope='MaxPool_0a_3x3')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, 96, [3, 3], stride=2, padding='VALID',
                                 scope='Conv2d_0a_3x3')
        net = tf.concat(axis=3, values=[branch_0, branch_1])

        end_points["pool2"] = net

        if add_and_check_final('Mixed_3a', net): return net, end_points

      # 73 x 73 x 160
      with tf.variable_scope('Mixed_4a'):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1')
          branch_0 = slim.conv2d(branch_0, 96, [3, 3], padding='VALID',
                                 scope='Conv2d_1a_3x3')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1')
          branch_1 = slim.conv2d(branch_1, 64, [1, 7], scope='Conv2d_0b_1x7')
          branch_1 = slim.conv2d(branch_1, 64, [7, 1], scope='Conv2d_0c_7x1')
          branch_1 = slim.conv2d(branch_1, 96, [3, 3], padding='VALID',
                                 scope='Conv2d_1a_3x3')
        net = tf.concat(axis=3, values=[branch_0, branch_1])
        if add_and_check_final('Mixed_4a', net): return net, end_points

      # 71 x 71 x 192
      with tf.variable_scope('Mixed_5a'):
        with tf.variable_scope('Branch_0'):
          branch_0 = slim.conv2d(net, 192, [3, 3], stride=2, padding='VALID',
                                 scope='Conv2d_1a_3x3')
        with tf.variable_scope('Branch_1'):
          branch_1 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID',
                                     scope='MaxPool_1a_3x3')
        net = tf.concat(axis=3, values=[branch_0, branch_1])

        end_points["pool3"] = net

        if add_and_check_final('Mixed_5a', net): return net, end_points

      # 35 x 35 x 384
      # 4 x Inception-A blocks
      for idx in range(4):
        block_scope = 'Mixed_5' + chr(ord('b') + idx)
        net = block_inception_a(net, block_scope)
        if add_and_check_final(block_scope, net): return net, end_points

      # 35 x 35 x 384
      # Reduction-A block
      net = block_reduction_a(net, 'Mixed_6a')

      end_points["pool4"] = net

      if add_and_check_final('Mixed_6a', net): return net, end_points

      # 17 x 17 x 1024
      # 7 x Inception-B blocks
      for idx in range(7):
        block_scope = 'Mixed_6' + chr(ord('b') + idx)
        net = block_inception_b(net, block_scope)
        if add_and_check_final(block_scope, net): return net, end_points

      # 17 x 17 x 1024
      # Reduction-B block
      net = block_reduction_b(net, 'Mixed_7a')

      end_points["pool5"] = net

      if add_and_check_final('Mixed_7a', net): return net, end_points

      # 8 x 8 x 1536
      # 3 x Inception-C blocks
      for idx in range(3):
        block_scope = 'Mixed_7' + chr(ord('b') + idx)
        net = block_inception_c(net, block_scope)
        if add_and_check_final(block_scope, net): return net, end_points
  raise ValueError('Unknown final endpoint %s' % final_endpoint)
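Because add_and_check_final returns early, truncated networks are cheap to build. A hedged usage sketch (assuming the block_inception_* and block_reduction_* helpers from the same module):

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

inputs = tf.placeholder(tf.float32, [None, 299, 299, 3])
net, end_points = inception_v4_base(inputs, final_endpoint='Mixed_5a')
# net: [None, 35, 35, 384]; end_points includes the pool1..pool3
# aliases inserted above, plus every named endpoint built so far.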
Example #29
def define_vggish_slim(training=False):
    """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embedding'.
  """
    # Defaults:
    # - All weights are initialized to N(0, INIT_STDDEV).
    # - All biases are initialized to 0.
    # - All activations are ReLU.
    # - All convolutions are 3x3 with stride 1 and SAME padding.
    # - All max-pools are 2x2 with stride 2 and SAME padding.
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        weights_initializer=tf.truncated_normal_initializer(
                            stddev=params.INIT_STDDEV),
                        biases_initializer=tf.zeros_initializer(),
                        activation_fn=tf.nn.relu,
                        trainable=training), \
         slim.arg_scope([slim.conv2d],
                        kernel_size=[3, 3], stride=1, padding='SAME'), \
         slim.arg_scope([slim.max_pool2d],
                        kernel_size=[2, 2], stride=2, padding='SAME'), \
         tf.variable_scope('vggish'):
        # Input: a batch of 2-D log-mel-spectrogram patches.
        features = tf.placeholder(tf.float32,
                                  shape=(None, params.NUM_FRAMES,
                                         params.NUM_BANDS),
                                  name='input_features')
        # Reshape to 4-D so that we can convolve a batch with conv2d().
        net = tf.reshape(features,
                         [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

        # The VGG stack of alternating convolutions and max-pools.
        net = slim.conv2d(net, 64, scope='conv1')
        net = slim.max_pool2d(net, scope='pool1')
        net = slim.conv2d(net, 128, scope='conv2')
        net = slim.max_pool2d(net, scope='pool2')
        net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
        net = slim.max_pool2d(net, scope='pool3')
        net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
        net = slim.max_pool2d(net, scope='pool4')

        # Flatten before entering fully-connected layers
        net = slim.flatten(net)
        net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
        # The embedding layer.
        net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
        return tf.identity(net, name='embedding')
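A hedged inference sketch for the graph above (assuming the params module it imports; NUM_FRAMES/NUM_BANDS are commonly 96 and 64):

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

embedding = define_vggish_slim(training=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    patch = np.random.rand(1, params.NUM_FRAMES, params.NUM_BANDS)
    emb = sess.run(embedding,
                   feed_dict={'vggish/input_features:0': patch})
print(emb.shape)  # (1, params.EMBEDDING_SIZE), typically (1, 128)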
Example #30
  def model(self,
            images,
            grasp_params,
            num_classes=1,
            is_training=False,
            softmax=False,
            restore=True,
            grasp_param_names=None,
            goal_spatial_fn=None,
            goal_vector_fn=None,
            scope=None,
            reuse=None,
            **kwargs):
    """Creates a tensorflow graph for this model.

    Args:
      images: A list of 4D tensors containing image data
      grasp_params: A 3D tensor of batch_size x action_batch_size x PARAMS_SIZE
        containing grasp params or a 2D tensor of batch_size x PARAMS_SIZE if
        action_batch_size is not None.
      num_classes: Number of classes to predict in the final layer
      is_training: If the model is in training or not
      softmax: If true the final layer is a softmax, logistic otherwise
      restore: To restore logit weights or not when initializing from a
        checkpoint
      grasp_param_names: A dictionary that maps sub-blocks of `grasp_params` to
        names (string). If not None, the naming is used in graph construction.
        A key `block_name` and value (`offset`, `size`,) assign a name to
        a block `grasp_params[:, offset:(offset + size)]`.
      goal_spatial_fn: Optional function, returns a 3-D tensor to merge into the
        features, for instance conditioning the Q function on some goal feature
        map.
      goal_vector_fn: Optional function, returns a 1-D vector to merge into
        features, conditioning Q function on some goal embedding.
      scope: The top-level scope of the tensorflow graph.
      reuse: True, None, or tf.AUTO_REUSE; if True, we go into reuse mode for
        this scope as well as all sub-scopes; if tf.AUTO_REUSE, we create
        variables if they do not exist, and return them otherwise; if None, we
        inherit the parent scope's reuse flag.
      **kwargs: Model-specific arguments.
    Returns:
      graph: A tensorflow graph for the model

    Raises:
      ValueError: if restore=False as it is currently not supported
    """
    del kwargs
    if not restore:
      raise ValueError("This model doesn't yet support restore=False")
    batch_norm_var_collection = 'moving_vars'
    batch_norm = {
        # Decay for the moving averages.
        'decay': self._batch_norm_decay,
        # epsilon to prevent 0s in variance.
        'epsilon': self._batch_norm_epsilon,
        # collection containing the moving mean and moving variance.
        'variables_collections': {
            'beta': None,
            'gamma': None,
            'moving_mean': [batch_norm_var_collection],
            'moving_variance': [batch_norm_var_collection],
        },
        # Whether to scale after normalization.
        'scale': True,
    }
    end_points = {}

    tile_batch = (len(grasp_params.shape) == 3)

    if tile_batch:

      def expand_to_megabatch(feature):
        # Collapse second dimension of megabatch.
        dim = tf.shape(feature)[2]
        return tf.reshape(feature, [-1, dim])

      grasp_params = contrib_framework.nest.map_structure(
          expand_to_megabatch, grasp_params)

    # Note that we need to do this before calling the tf.variable_scope
    # since there seems to be a bug in TF that reuse=True does not work with
    # scope=None even if the default_name is passed.
    # TODO(T2R_CONTRIBUTORS): Remove this None check and pass in the class name as
    # the default_name in the tf.variable_scope initialization.
    def _run():
      """Forward pass through the network."""
      with slim.arg_scope([slim.dropout], is_training=is_training):
        with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
            weights_regularizer=slim.l2_regularizer(self._l2_regularization),
            activation_fn=tf.nn.relu,
            trainable=is_training):
          with slim.arg_scope(
              [slim.conv2d, slim.max_pool2d], stride=1, padding='SAME'):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                normalizer_fn=slim.batch_norm,
                normalizer_params=batch_norm):
              _, grasp_image = images
              net = slim.conv2d(
                  grasp_image,
                  64, [6, 6],
                  stride=2,
                  scope='conv1_1',
                  activation_fn=None,
                  normalizer_fn=None,
                  normalizer_params=None)
              # Old checkpoints (such as those used for tests) did not have
              # scaling on the separate batch norm operations (those not
              # associated with a conv operation), so only setting the scale
              # parameter in arg_scope would break the tests. We set scale=
              # False for these separate batch norm operations temporarily.
              # However, future users are encouraged to not set scale=False so
              # that batch_norm parameters are consistent through the whole
              # network.
              net = tf.nn.relu(slim.batch_norm(net, scale=False))
              net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool1')
              self.activation_layers.append(net)
              for l in range(2, 2 + self.num_convs[0]):
                net = slim.conv2d(net, 64, [5, 5], scope='conv%d' % l)
                self.activation_layers.append(net)
              net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool2')
              end_points['pool2'] = net
              self.activation_layers.append(net)
              logging.debug('pool2')
              logging.debug(net.get_shape())

              if grasp_param_names is None:
                grasp_param_blocks = [grasp_params]
                grasp_param_block_names = ['fcgrasp']
              else:
                grasp_param_blocks = []
                grasp_param_block_names = []
                # Note: Creating variables must happen in a deterministic
                # order, otherwise some workers will look for variables on the
                # wrong parameter servers, so we sort the grasp_param_names
                # here.
                for block_name in sorted(grasp_param_names):
                  offset, size = grasp_param_names[block_name]
                  grasp_param_blocks += [
                      tf.slice(grasp_params, [0, offset], [-1, size])
                  ]
                  grasp_param_block_names += [block_name]
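              # Sketch (hypothetical names and offsets): with
              # grasp_param_names = {'wrist': (0, 4), 'fingers': (4, 2)},
              # sorted iteration yields grasp_params[:, 4:6] for 'fingers'
              # followed by grasp_params[:, 0:4] for 'wrist'.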

              grasp_param_tensors = []
              for block, name in zip(grasp_param_blocks,
                                     grasp_param_block_names):
                grasp_param_tensors += [
                    slim.fully_connected(
                        block,
                        256,
                        scope=name,
                        activation_fn=None,
                        normalizer_fn=None,
                        normalizer_params=None)
                ]

              fcgrasp = tf.add_n(grasp_param_tensors)
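              # Summing the per-block 256-d embeddings is equivalent to a
              # single fully connected layer applied to the concatenated
              # blocks, with per-block weights stacked and biases summed.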

              # As with conv1_1 above, scale=False keeps this separate batch
              # norm operation compatible with old checkpoints; see the
              # comment there for details.
              fcgrasp = tf.nn.relu(slim.batch_norm(fcgrasp, scale=False))
              fcgrasp = slim.fully_connected(fcgrasp, 64, scope='fcgrasp2')
              context = tf.reshape(fcgrasp, [-1, 1, 1, 64])
              end_points['fcgrasp'] = fcgrasp
              # Tile the image embedding action_batch_size times to align
              # with the expanded action dimension of size action_batch_size.
              # The same image is used with all the actions in an action
              # batch. net pre-expansion has shape [batch, *, *, *];
              # net post-expansion has shape [batch x action_batch, *, *, *].
              if tile_batch:
                net = contrib_seq2seq.tile_batch(net, self._action_batch_size)
              net = tf.add(net, context)
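              # context has shape [batch x action_batch, 1, 1, 64] and is
              # broadcast across the spatial dimensions of net, adding the
              # same 64-d grasp embedding at every spatial location.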
              logging.debug('net post add %s', net)
              end_points['vsum'] = net
              self.activation_layers.append(net)
              logging.debug('vsum')
              logging.debug(net.get_shape())
              for l in range(2 + sum(self.num_convs[:1]),
                             2 + sum(self.num_convs[:2])):
                net = slim.conv2d(net, 64, [3, 3], scope='conv%d' % l)
                logging.debug('conv%d', l)
                self.activation_layers.append(net)
              logging.debug(net.get_shape())
              net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool3')
              logging.debug('pool3')
              logging.debug(net.get_shape())
              self.activation_layers.append(net)
              for l in range(2 + sum(self.num_convs[:2]),
                             2 + sum(self.num_convs[:3])):
                net = slim.conv2d(
                    net, 64, [3, 3], scope='conv%d' % l, padding='VALID')
                self.activation_layers.append(net)
              logging.debug('final conv')
              logging.debug(net.get_shape())
              end_points['final_conv'] = net

              batch_size = tf.shape(net)[0]
              if goal_spatial_fn is not None:
                goal_spatial = goal_spatial_fn()
                # Tile goal to match net batch size (e.g. CEM).
                goal_batch_size = tf.shape(goal_spatial)[0]
                goal_spatial = tf.tile(
                    goal_spatial, [batch_size // goal_batch_size, 1, 1, 1])
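                # Shape sketch (hypothetical sizes): goal_spatial of shape
                # [G, h, w, c] with batch_size = G * k is tiled to
                # [G * k, h, w, c] before the channel-wise concat below.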
                # Merge features in the style of Fang 2017.
                net = tf.concat([net, goal_spatial], axis=3)
              net = slim.flatten(net, scope='flatten')

              if goal_vector_fn is not None:
                goal_vector = goal_vector_fn()
                goal_batch_size = tf.shape(goal_vector)[0]
                goal_vector = tf.tile(
                    goal_vector, [batch_size // goal_batch_size, 1])
                net = tf.concat([net, goal_vector], axis=1)

              for l in range(self.hid_layers):
                net = slim.fully_connected(net, 64, scope='fc%d' % l)

              name = 'logit'
              if num_classes > 1:
                name = 'logit_%d' % num_classes
              logits = slim.fully_connected(
                  net,
                  num_classes,
                  activation_fn=None,
                  scope=name,
                  normalizer_fn=None,
                  normalizer_params=None)
              end_points['logits'] = logits
              if softmax:
                predictions = tf.nn.softmax(logits)
              else:
                predictions = tf.nn.sigmoid(logits)
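              # When tiled, fold the megabatch back into
              # [batch, action_batch] or [batch, action_batch, num_classes].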
              if tile_batch:
                if num_classes > 1:
                  predictions = tf.reshape(
                      predictions, [-1, self._action_batch_size, num_classes])
                else:
                  predictions = tf.reshape(predictions,
                                           [-1, self._action_batch_size])
              end_points['predictions'] = predictions
              return logits, end_points

    if self._create_var_scope:
      if scope is None:
        scope = self.__class__.__name__
      with tf.variable_scope(scope,
                             values=[images],
                             reuse=reuse):
        with slim.arg_scope([slim.batch_norm],
                            is_training=is_training,
                            decay=batch_norm['decay'],
                            epsilon=batch_norm['epsilon'],
                            scale=batch_norm['scale']):
          logits, end_points = _run()
    else:
      with slim.arg_scope([slim.batch_norm],
                          is_training=is_training,
                          decay=batch_norm['decay'],
                          epsilon=batch_norm['epsilon'],
                          scale=batch_norm['scale'],
                          updates_collections=None):
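        # updates_collections=None makes slim.batch_norm apply the
        # moving-average updates in place instead of adding them to
        # tf.GraphKeys.UPDATE_OPS.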
        logits, end_points = _run()

    return logits, end_points