Example No. 1
def _deeplab_builder(x,
                     name,
                     cnn_fn,
                     num_classes,
                     is_training,
                     use_global_status,
                     reuse=False):
    """Helper function to build Deeplab v2 model for semantic segmentation.

  The Deeplab v2 model is composed of one base network (ResNet101) and
  one ASPP module (4 atrous convolutional layers with different dilation
  rates). The segmentation prediction is the sum of the 4 ASPP outputs.

  Args:
    x: A tensor of size [batch_size, height_in, width_in, channels].
    name: The prefix of tensorflow variables defined in this network.
    cnn_fn: A function which builds the base network (ResNet101).
    num_classes: Number of predicted classes for classification tasks.
    is_training: Whether the tensorflow variables defined in this network
      are used for training.
    use_global_status: enable/disable use_global_status for batch
      normalization. If True, moving mean and moving variance are updated
      by exponential decay.
    reuse: enable/disable reuse of tensorflow variables. It is useful
      for sharing weight parameters across two identical networks.

  Returns:
    A tensor of size [batch_size, height_in/8, width_in/8, num_classes].
  """
    # Build the base network.
    x = cnn_fn(x, name, is_training, use_global_status, reuse)

    with tf.variable_scope(name, reuse=reuse) as scope:
        # Build the ASPP module.
        aspp = []
        for i, dilation in enumerate([6, 12, 18, 24]):
            score = nn.atrous_conv(x,
                                   name='fc1_c{:d}'.format(i),
                                   filters=num_classes,
                                   kernel_size=3,
                                   dilation=dilation,
                                   padding='SAME',
                                   relu=False,
                                   biased=True,
                                   bn=False,
                                   is_training=is_training)
            aspp.append(score)

        score = tf.add_n(aspp, name='fc1_sum')

    return score
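For context, here is a minimal usage sketch that is not part of the original listing. It assumes a base-network builder with the signature cnn_fn(x, name, is_training, use_global_status, reuse) is available elsewhere in the codebase; resnet_v1_101 below is a hypothetical stand-in for such a builder.

import tensorflow as tf

# `resnet_v1_101` is a placeholder name for the repo's ResNet-101 builder.
images = tf.placeholder(tf.float32, shape=[1, 320, 320, 3], name='images')
logits = _deeplab_builder(images,
                          name='resnet_v1_101',
                          cnn_fn=resnet_v1_101,
                          num_classes=21,
                          is_training=False,
                          use_global_status=True)
# With an output stride of 8, `logits` has shape [1, 40, 40, 21].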
def _unet_builder(x,
                  name,
                  filters=[64,128,256,512,1024],
                  num_blocks=[2,3,3,3,3],
                  strides=[2,2,2,2,2],
                  dilations=[None,None,None,None,None],
                  num_classes=40,
                  is_training=True,
                  use_global_status=False,
                  reuse=False):
  """Helper function to construct UNet.
  """
  if len(filters) != len(num_blocks)\
      or len(filters) != len(strides)\
      or len(filters) != len(dilations):
    raise ValueError('lengths of filters, num_blocks, strides and dilations '
                     'are not consistent')

  with tf.variable_scope(name, reuse=reuse) as scope:
    # Encoder.
    shortcuts = []
    for ib in range(len(filters)):
      for iu in range(num_blocks[ib]):
        name_format = 'layer{:d}/unit_{:d}/encoder'
        block_name = name_format.format(ib+1, iu+1)
        c_o = filters[ib] # output channel

        # Apply the stride only at the first unit of each stage.
        s = strides[ib] if iu == 0 else 1
        d = dilations[ib]
        if d is not None and d > 1 and s == 1:
          x = nn.atrous_conv(x,
                             name=block_name+'/conv',
                             filters=c_o,
                             kernel_size=3,
                             dilation=d,
                             padding='SAME',
                             biased=False,
                             bn=True,
                             relu=True,
                             decay=0.99,
                             is_training=is_training,
                             use_global_status=use_global_status)
        else:
          padding = 'VALID' if s > 1 else 'SAME'
          ksize = s*2 if s > 1 else 3
          x = nn.conv(x,
                      name=block_name+'/conv',
                      filters=c_o,
                      kernel_size=ksize,
                      strides=s,
                      padding=padding,
                      biased=False,
                      bn=True,
                      relu=True,
                      decay=0.99,
                      is_training=is_training,
                      use_global_status=use_global_status)
      print(x)
      shortcuts.append(x)

    # Decoder.
    for ib in range(len(shortcuts)-1, 0, -1):
      #for iu in range(num_blocks[ib-1]):
      for iu in range(3):
        n, h, w, c_o = shortcuts[ib-1].get_shape().as_list()
        name_format = 'layer{:d}/unit_{:d}/decoder/'
        block_name = name_format.format(2*len(filters)-ib, iu+1)
        x = nn.conv(x,
                    name=block_name+'conv',
                    filters=c_o,
                    kernel_size=3,
                    strides=1,
                    padding='SAME',
                    biased=False,
                    bn=True,
                    relu=True,
                    decay=0.99,
                    is_training=is_training,
                    use_global_status=use_global_status)
        if iu == 0:
          x = tf.image.resize_bilinear(x, [h,w])
          x = tf.concat([x, shortcuts[ib-1]], axis=-1)
      print(x)

    # output segmentation, depth and surface normal estimation.
    block_name = 'block5'
    seg = nn.conv(x, block_name+'/fc1_seg', num_classes, 3, 1, padding='SAME',
                  biased=True, bn=False, relu=False, is_training=is_training)

    dph = nn.conv(x, block_name+'/fc1_depth', 1, 3, 1, padding='SAME',
                  biased=True, bn=False, relu=True, is_training=is_training)

    nrm = nn.conv(x, block_name+'/fc1_normal', 3, 3, 1, padding='SAME',
                  biased=True, bn=False, relu=False, is_training=is_training)
    nrm = tf.nn.l2_normalize(nrm, dim=-1)

    return [seg, dph, nrm]
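A minimal usage sketch for the UNet builder (added for illustration, not part of the original listing); the three outputs come back in the order segmentation, depth, surface normals.

import tensorflow as tf

images = tf.placeholder(tf.float32, shape=[4, 256, 256, 3], name='images')
seg, dph, nrm = _unet_builder(images,
                              name='unet',
                              num_classes=40,
                              is_training=True,
                              use_global_status=False)
# seg: per-pixel class logits, dph: non-negative depth (ReLU applied),
# nrm: surface normals, L2-normalized over the channel axis.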
def _pspnet_builder(x,
                    name,
                    cnn_fn,
                    num_classes,
                    is_training,
                    use_global_status,
                    reuse=False):
    """Helper function to build PSPNet model for semantic segmentation.

  The PSPNet model is composed of one base network (ResNet101) and
  one pyramid spatial pooling (PSP) module, followed by concatenation
  and two more convolutional layers for segmentation prediction.

  Args:
    x: A tensor of size [batch_size, height_in, width_in, channels].
    name: The prefix of tensorflow variables defined in this network.
    cnn_fn: A function which builds the base network (ResNet101).
    num_classes: Number of predicted classes for classification tasks.
    is_training: Whether the tensorflow variables defined in this network
      are used for training.
    use_global_status: enable/disable use_global_status for batch
      normalization. If True, moving mean and moving variance are updated
      by exponential decay.
    reuse: enable/disable reuse of tensorflow variables. It is useful
      for sharing weight parameters across two identical networks.

  Returns:
    A tensor of size [batch_size, height_in/8, width_in/8, num_classes].
  """
    # Ensure that the size of input data is valid (should be multiple of 6x8=48).
    h, w = x.get_shape().as_list()[1:3]  # NxHxWxC
    assert (h % 48 == 0 and w % 48 == 0 and h == w)

    # Build the base network.
    x, bn = cnn_fn(x, name, is_training, use_global_status, reuse)

    with tf.variable_scope(name, reuse=reuse) as scope:
        # Build the PSP module
        psp = True
        mdc = True
        if psp:
            with tf.device('/gpu:1'):
                pool_k = int(h / 8)  # the base network is stride 8 by default.

                # Build pooling layer results in 1x1 output.
                pool1 = tf.nn.avg_pool(x,
                                       name='block5/pool1',
                                       ksize=[1, pool_k, pool_k, 1],
                                       strides=[1, pool_k, pool_k, 1],
                                       padding='VALID')
                pool1 = nn.conv(pool1,
                                'block5/pool1/conv1',
                                512,
                                1,
                                1,
                                padding='SAME',
                                biased=False,
                                bn=True,
                                relu=True,
                                is_training=is_training,
                                decay=0.99,
                                use_global_status=use_global_status)
                pool1 = tf.image.resize_bilinear(pool1, [pool_k, pool_k])

                # Build pooling layer results in 2x2 output.
                pool2 = tf.nn.avg_pool(
                    x,
                    name='block5/pool2',
                    ksize=[1, pool_k // 2, pool_k // 2, 1],
                    strides=[1, pool_k // 2, pool_k // 2, 1],
                    padding='VALID')
                pool2 = nn.conv(pool2,
                                'block5/pool2/conv1',
                                512,
                                1,
                                1,
                                padding='SAME',
                                biased=False,
                                bn=True,
                                relu=True,
                                is_training=is_training,
                                decay=0.99,
                                use_global_status=use_global_status)
                pool2 = tf.image.resize_bilinear(pool2, [pool_k, pool_k])

                # Build pooling layer results in 3x3 output.
                pool3 = tf.nn.avg_pool(
                    x,
                    name='block5/pool3',
                    ksize=[1, pool_k // 3, pool_k // 3, 1],
                    strides=[1, pool_k // 3, pool_k // 3, 1],
                    padding='VALID')
                pool3 = nn.conv(pool3,
                                'block5/pool3/conv1',
                                512,
                                1,
                                1,
                                padding='SAME',
                                biased=False,
                                bn=True,
                                relu=True,
                                is_training=is_training,
                                decay=0.99,
                                use_global_status=use_global_status)
                pool3 = tf.image.resize_bilinear(pool3, [pool_k, pool_k])

                # Build pooling layer results in 6x6 output.
                pool6 = tf.nn.avg_pool(
                    x,
                    name='block5/pool6',
                    ksize=[1, pool_k // 6, pool_k // 6, 1],
                    strides=[1, pool_k // 6, pool_k // 6, 1],
                    padding='VALID')
                pool6 = nn.conv(pool6,
                                'block5/pool6/conv1',
                                512,
                                1,
                                1,
                                padding='SAME',
                                biased=False,
                                bn=True,
                                relu=True,
                                is_training=is_training,
                                decay=0.99,
                                use_global_status=use_global_status)
                pool6 = tf.image.resize_bilinear(pool6, [pool_k, pool_k])

                # Fuse the pooled feature maps with its input, and generate
                # segmentation prediction.
                x = tf.concat([pool1, pool2, pool3, pool6, x],
                              name='block5/concat1',
                              axis=3)
                x = nn.conv(x,
                            'block5/conv2',
                            512,
                            3,
                            1,
                            padding='SAME',
                            biased=False,
                            bn=True,
                            relu=True,
                            is_training=is_training,
                            decay=0.99,
                            use_global_status=use_global_status)
        if mdc:
            with tf.device('/gpu:1'):
                d1 = nn.atrous_conv(x,
                                    'mdc',
                                    256,
                                    3,
                                    1,
                                    padding='SAME',
                                    biased=False,
                                    bn=True,
                                    relu=True,
                                    is_training=is_training,
                                    decay=0.99,
                                    use_global_status=use_global_status)
                d3 = nn.atrous_conv(x,
                                    'mdc',
                                    256,
                                    3,
                                    3,
                                    padding='SAME',
                                    biased=False,
                                    bn=True,
                                    relu=True,
                                    is_training=is_training,
                                    decay=0.99,
                                    use_global_status=use_global_status,
                                    reuse=True)
                d6 = nn.atrous_conv(x,
                                    'mdc',
                                    256,
                                    3,
                                    6,
                                    padding='SAME',
                                    biased=False,
                                    bn=True,
                                    relu=True,
                                    is_training=is_training,
                                    decay=0.99,
                                    use_global_status=use_global_status,
                                    reuse=True)
                d9 = nn.atrous_conv(x,
                                    'mdc',
                                    256,
                                    3,
                                    9,
                                    padding='SAME',
                                    biased=False,
                                    bn=True,
                                    relu=True,
                                    is_training=is_training,
                                    decay=0.99,
                                    use_global_status=use_global_status,
                                    reuse=True)
                x = d1 + (d3 + d6 + d9) / 3
        x = tf.image.resize_bilinear(x, [h // 4, w // 4])
        x = tf.concat([x, bn[1]], name='block5/concat1', axis=3)
        with tf.device('/gpu:1'):
            x = nn.conv(x,
                        'block5/conv3',
                        128,
                        3,
                        1,
                        padding='SAME',
                        biased=False,
                        bn=True,
                        relu=True,
                        is_training=is_training,
                        decay=0.99,
                        use_global_status=use_global_status)

            x = tf.image.resize_bilinear(x, [h // 2, w // 2])
            x = tf.concat([x, bn[0]], name='block5/concat2', axis=3)
            x = nn.conv(x,
                        'block5/conv4',
                        96,
                        3,
                        1,
                        padding='SAME',
                        biased=False,
                        bn=True,
                        relu=True,
                        is_training=is_training,
                        decay=0.99,
                        use_global_status=use_global_status)

            x = tf.image.resize_bilinear(x, [h, w])
            x = nn.conv(x,
                        'block5/conv5',
                        96,
                        3,
                        1,
                        padding='SAME',
                        biased=False,
                        bn=True,
                        relu=True,
                        is_training=is_training,
                        decay=0.99,
                        use_global_status=use_global_status)
            # x=tf.nn.dropout(x,0.3,name='block5/drop')

            x = nn.conv(x,
                        'block5/fc1_voc12',
                        num_classes,
                        1,
                        1,
                        padding='SAME',
                        biased=True,
                        bn=False,
                        relu=False,
                        is_training=is_training)
        return x
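Because the builder asserts a square input whose side is a multiple of 48, a usage sketch might look like the following (illustrative only; resnet_v1_101 is again a hypothetical placeholder for a base network that, in this variant, also returns the intermediate feature maps bn used by the skip connections).

import tensorflow as tf

images = tf.placeholder(tf.float32, shape=[1, 480, 480, 3], name='images')
logits = _pspnet_builder(images,
                         name='resnet_v1_101',
                         cnn_fn=resnet_v1_101,  # assumed to return (features, [bn0, bn1])
                         num_classes=19,
                         is_training=False,
                         use_global_status=True)
# The decoder upsamples to 1/4, 1/2 and finally full resolution, so `logits`
# has shape [1, 480, 480, 19].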
def bottleneck(x,
               name,
               filters,
               strides=None,
               dilation=None,
               is_training=True,
               use_global_status=True):
    """Builds the bottleneck module in ResNet.

  This function stacks 3 convolutional layers and fuses the output with
  the residual connection.

  Args:
    x: A tensor of size [batch_size, height_in, width_in, channels].
    name: The prefix of tensorflow variables defined in this layer.
    filters: A number indicating the number of output channels.
    strides: A number indicating the stride of the sliding window for
      height and width.
    dilation: A number indicating the dilation factor for height and width.
    is_training: Whether the tensorflow variables defined in this layer
      are used for training.
    use_global_status: enable/disable use_global_status for batch
      normalization. If True, moving mean and moving variance are updated
      by exponential decay.

  Returns:
    A tensor of size [batch_size, height_out, width_out, channels_out].
  """
    if strides is None and dilation is None:
        raise ValueError('Neither strides nor dilation is specified, ' +
                         'set one of them to 1 or a larger number.')
    elif (strides is not None and strides > 1 and
          dilation is not None and dilation > 1):
        raise ValueError('strides and dilation are both specified, ' +
                         'set one of them to 1 or None.')

    with tf.variable_scope(name) as scope:
        c_i = x.get_shape().as_list()[-1]

        if c_i != filters * 4:
            # Use a convolutional layer as residual connection when the
            # number of input channels is different from output channels.
            shortcut = nn.conv(x,
                               name='shortcut',
                               filters=filters * 4,
                               kernel_size=1,
                               strides=strides,
                               padding='VALID',
                               biased=False,
                               bn=True,
                               relu=False,
                               is_training=is_training,
                               use_global_status=use_global_status)
        elif strides is not None and strides > 1:
            # Use max-pooling as residual connection when the number of
            # input channel is same as output channels, but stride is
            # larger than 1.
            shortcut = nn.max_pool(x,
                                   name='shortcut',
                                   kernel_size=1,
                                   strides=strides,
                                   padding='VALID')
        else:
            # Otherwise, keep the original input as residual connection.
            shortcut = x

        # Build the 1st convolutional layer.
        x = nn.conv(x,
                    name='conv1',
                    filters=filters,
                    kernel_size=1,
                    strides=1,
                    padding='SAME',
                    biased=False,
                    bn=True,
                    relu=True,
                    is_training=is_training,
                    use_global_status=use_global_status)

        if dilation is not None and dilation > 1:
            # If dilation > 1, apply atrous conv to the 2nd convolutional layer.
            x = nn.atrous_conv(x,
                               name='conv2',
                               filters=filters,
                               kernel_size=3,
                               dilation=dilation,
                               padding='SAME',
                               biased=False,
                               bn=True,
                               relu=True,
                               is_training=is_training,
                               use_global_status=use_global_status)
        else:
            padding = 'VALID' if strides > 1 else 'SAME'
            x = nn.conv(x,
                        name='conv2',
                        filters=filters,
                        kernel_size=3,
                        strides=strides,
                        padding=padding,
                        biased=False,
                        bn=True,
                        relu=True,
                        is_training=is_training,
                        use_global_status=use_global_status)

        # Build the 3rd convolutional layer (increase the channels).
        x = nn.conv(x,
                    name='conv3',
                    filters=filters * 4,
                    kernel_size=1,
                    strides=1,
                    padding='SAME',
                    biased=False,
                    bn=True,
                    relu=False,
                    is_training=is_training,
                    use_global_status=use_global_status)

        # Fuse the convolutional outputs with residual connection.
        x = tf.add_n([x, shortcut], name='add')
        x = tf.nn.relu(x, name='relu')

    return x
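To show how such bottleneck units are typically chained into a ResNet stage, here is a sketch added for illustration (not part of the original listing): the first unit applies the stride and widens the output to filters*4 channels, while the remaining units are identity blocks.

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[1, 56, 56, 256], name='features')
# First unit downsamples (stride 2) and widens the output to 128*4 = 512 channels.
x = bottleneck(x, 'block2/unit_1', filters=128, strides=2, dilation=None,
               is_training=True, use_global_status=False)
# Remaining units keep resolution and channel count (identity shortcuts).
for iu in range(2, 5):
    x = bottleneck(x, 'block2/unit_{:d}'.format(iu), filters=128, strides=1,
                   dilation=None, is_training=True, use_global_status=False)
# x now has 512 channels at roughly half the input resolution (the exact
# spatial size depends on how nn.conv pads stride-2 convolutions).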