Example #1
    def get_decision_net_simple(self, net, net_prob_mat):

        # Global pooling collapses the spatial dimensions, so these are
        # rank-2 [B, C] tensors.
        avg_output = keras.layers.GlobalAveragePooling2D()(net_prob_mat)
        max_output = keras.layers.GlobalMaxPooling2D()(net_prob_mat)

        # Restore singleton spatial dimensions ([B, 1, 1, C]) so the two
        # tensors can be concatenated on the channel axis and fed to the
        # 1x1 convolution below (concatenating the rank-2 pooled tensors
        # on axis 3 would fail).
        avg_output = avg_output[:, tf.newaxis, tf.newaxis, :]
        max_output = max_output[:, tf.newaxis, tf.newaxis, :]

        decision_net = tf.concat([avg_output, max_output], 3)

        decision_net = layers.conv2d(
            decision_net,
            1, [1, 1],
            scope='decision6',
            normalizer_fn=None,
            weights_initializer=initializers.xavier_initializer_conv2d(False),
            biases_initializer=tf.constant_initializer(0),
            activation_fn=None)

        return decision_net
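
A minimal driver sketch, assuming the method lives on a model instance (here called `model`, a hypothetical name) and that `net_prob_mat` is a single-channel segmentation probability map; the `net` argument is unused by this method:

import numpy as np
import tensorflow as tf

# `model` is a hypothetical instance of the class defining get_decision_net_simple.
prob_map = tf.placeholder(tf.float32, [None, 64, 64, 1], name='net_prob_mat')
decision = model.get_decision_net_simple(net=None, net_prob_mat=prob_map)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One scalar decision logit per image: shape (2, 1, 1, 1).
    scores = sess.run(decision, {prob_map: np.zeros((2, 64, 64, 1), np.float32)})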
Example #2
File: junk_net.py  Project: KryoEM/tfmodels
def squeezenet_arg_scope(is_training,
                         weight_decay=0.00001,
                         use_batch_norm=False,
                         batch_norm_decay=0.999):

    normalizer_fn = slim.batch_norm if use_batch_norm else None
    with slim.arg_scope([slim.conv2d, slim.fully_connected, batch_activate],
                        activation_fn=tf.nn.relu):
        with slim.arg_scope(
            [slim.fully_connected],
                weights_regularizer=slim.l2_regularizer(weight_decay),
                weights_initializer=initializers.xavier_initializer()):
            with slim.arg_scope(
                [slim.conv2d],
                    weights_regularizer=slim.l2_regularizer(weight_decay),
                    weights_initializer=initializers.xavier_initializer_conv2d(
                    )):
                with slim.arg_scope([slim.batch_norm],
                                    is_training=is_training,
                                    decay=batch_norm_decay):
                    with slim.arg_scope(
                        [slim.conv2d, batch_activate],  # slim.fully_connected
                            normalizer_fn=normalizer_fn) as sc:
                        return sc
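
A usage sketch, under the assumption that `slim` is `tf.contrib.slim` and that `batch_activate` is a layer function defined elsewhere in the project; the returned arg_scope is re-entered when the model graph is built:

import tensorflow as tf

slim = tf.contrib.slim

images = tf.placeholder(tf.float32, [None, 224, 224, 3])
with slim.arg_scope(squeezenet_arg_scope(is_training=True,
                                         weight_decay=1e-4,
                                         use_batch_norm=True)):
    # Layers created inside the scope inherit the ReLU activation,
    # regularizers, initializers, and batch-norm settings configured above.
    net = slim.conv2d(images, 64, [3, 3], scope='conv1')
    logits = slim.fully_connected(slim.flatten(net), 10, scope='logits')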
Example #3
def legacy_convolution2d(x,
                         num_output_channels,
                         kernel_size,
                         activation_fn=None,
                         stride=(1, 1),
                         padding='SAME',
                         weight_init=initializers.xavier_initializer_conv2d(),
                         bias_init=standard_ops.zeros_initializer,
                         name=None,
                         weight_collections=(ops.GraphKeys.WEIGHTS,),
                         bias_collections=(ops.GraphKeys.BIASES,),
                         output_collections=(ops.GraphKeys.ACTIVATIONS,),
                         trainable=True,
                         weight_regularizer=None,
                         bias_regularizer=None):
  # pylint: disable=g-docstring-has-escape
  """Adds the parameters for a conv2d layer and returns the output.

  A neural network convolution layer is generally defined as:
  \\\\(y = f(conv2d(w, x) + b)\\\\) where **f** is given by `activation_fn`,
  **conv2d** is `tf.nn.conv2d` and `x` has shape
  `[batch, height, width, channels]`. The output of this op is of shape
  `[batch, out_height, out_width, num_output_channels]`, where `out_width` and
  `out_height` are determined by the `padding` argument. See `tf.nn.conv2d`
  for details.

  This op creates `w` and optionally `b` and adds various summaries that can be
  useful for visualizing learning or diagnosing training problems. Bias can be
  disabled by setting `bias_init` to `None`.

  The variable creation is compatible with `tf.variable_scope` and so can be
  reused with `tf.variable_scope` or `tf.make_template`.

  Most of the details of variable creation can be controlled by specifying the
  initializers (`weight_init` and `bias_init`) and which collections to place
  the created variables in (`weight_collections` and `bias_collections`).

  A per layer regularization can be specified by setting `weight_regularizer`.
  This is only applied to weights and not the bias.

  Args:
    x: A 4-D input `Tensor`.
    num_output_channels: The number of output channels (i.e. the size of the
      last dimension of the output).
    kernel_size: A length 2 `list` or `tuple` containing the kernel size.
    activation_fn: A function taking a single `Tensor`, applied as a
      non-linearity.
    stride: A length 2 `list` or `tuple` specifying the stride of the sliding
      window across the image.
    padding: A `string` from: "SAME", "VALID". The type of padding algorithm to
      use.
    weight_init: An optional initialization. If not specified, uses Xavier
      initialization (see `tf.learn.xavier_initializer`).
    bias_init: An initializer for the bias, defaults to 0. Set to `None` in
      order to disable bias.
    name: The name for this operation is used to name operations and to find
      variables. If specified it must be unique for this scope, otherwise a
      unique name starting with "convolution2d" will be created.  See
      `tf.variable_op_scope` for details.
    weight_collections: List of graph collections to which weights are added.
    bias_collections: List of graph collections to which biases are added.
    output_collections: List of graph collections to which outputs are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    weight_regularizer: A regularizer like the result of
      `l1_regularizer` or `l2_regularizer`. Used for weights.
    bias_regularizer: A regularizer like the result of
      `l1_regularizer` or `l2_regularizer`. Used for biases.

  Returns:
    The result of applying a 2-D convolutional layer.

  Raises:
    ValueError: If `kernel_size` or `stride` are not length 2.
  """
  with variable_scope.variable_op_scope([x], name, 'convolution2d'):
    num_input_channels = x.get_shape().dims[3].value

    if len(kernel_size) != 2:
      raise ValueError('kernel_size must be length 2: %s' % (kernel_size,))
    if len(stride) != 2:
      raise ValueError('stride must be length 2: %s' % (stride,))

    stride = [1, stride[0], stride[1], 1]
    shape = [kernel_size[0], kernel_size[1], num_input_channels,
             num_output_channels]
    dtype = x.dtype.base_dtype

    weight_collections = set(list(weight_collections or []) +
                             [ops.GraphKeys.VARIABLES])
    w = variable_scope.get_variable('weights',
                                    shape=shape,
                                    dtype=dtype,
                                    initializer=weight_init,
                                    collections=weight_collections,
                                    regularizer=weight_regularizer,
                                    trainable=trainable)

    y = nn.conv2d(x, w, stride, padding)

    if bias_init is not None:
      bias_collections = set(list(bias_collections or []) +
                             [ops.GraphKeys.VARIABLES])
      b = variable_scope.get_variable('bias',
                                      shape=[num_output_channels],
                                      dtype=dtype,
                                      initializer=bias_init,
                                      collections=bias_collections,
                                      regularizer=bias_regularizer,
                                      trainable=trainable)

      y = nn.bias_add(y, b)

    return _apply_activation(y, activation_fn, output_collections)
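
A hedged usage sketch; the surrounding module is assumed to provide the old `tf.contrib`/framework imports used above (`initializers`, `standard_ops`, `ops`, `nn`, `variable_scope`) and the `_apply_activation` helper:

import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 32, 32, 3])
# 3x3 kernel, stride 2, 'SAME' padding: spatial dims halve, 16 output channels.
y = legacy_convolution2d(x, num_output_channels=16, kernel_size=(3, 3),
                         stride=(2, 2), activation_fn=tf.nn.relu)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(y, {x: np.zeros((8, 32, 32, 3), np.float32)})
    print(out.shape)  # (8, 16, 16, 16)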
Example #4
def convolution2d(x,
                  num_output_channels,
                  kernel_size,
                  activation_fn=None,
                  stride=(1, 1),
                  padding='SAME',
                  weight_init=initializers.xavier_initializer_conv2d(),
                  bias_init=standard_ops.constant_initializer(0.),
                  name=None,
                  weight_collections=None,
                  bias_collections=None,
                  output_collections=None,
                  weight_regularizer=None,
                  bias_regularizer=None):
    """Adds the parameters for a conv2d layer and returns the output.

  A neural network convolution layer is generally defined as:
  \\\\(y = f(conv2d(w, x) + b)\\\\) where **f** is given by `activation_fn`,
  **conv2d** is `tf.nn.conv2d` and `x` has shape
  `[batch, height, width, channels]`. The output of this op is of shape
  `[batch, out_height, out_width, num_output_channels]`, where `out_width` and
  `out_height` are determined by the `padding` argument. See `tf.nn.conv2d`
  for details.

  This op creates `w` and optionally `b` and adds various summaries that can be
  useful for visualizing learning or diagnosing training problems. Bias can be
  disabled by setting `bias_init` to `None`.

  The variable creation is compatible with `tf.variable_scope` and so can be
  reused with `tf.variable_scope` or `tf.make_template`.

  Most of the details of variable creation can be controlled by specifying the
  initializers (`weight_init` and `bias_init`) and which collections to place
  the created variables in (`weight_collections` and `bias_collections`).

  A per layer regularization can be specified by setting `weight_regularizer`.
  This is only applied to weights and not the bias.

  Args:
    x: A 4-D input `Tensor`.
    num_output_channels: The number of output channels (i.e. the size of the
      last dimension of the output).
    kernel_size: A length 2 `list` or `tuple` containing the kernel size.
    activation_fn: A function taking a single `Tensor`, applied as a
      non-linearity.
    stride: A length 2 `list` or `tuple` specifying the stride of the sliding
      window across the image.
    padding: A `string` from: "SAME", "VALID". The type of padding algorithm to
      use.
    weight_init: An optional initialization. If not specified, uses Xavier
      initialization (see `tf.learn.xavier_initializer`).
    bias_init: An initializer for the bias, defaults to 0. Set to `None` in
      order to disable bias.
    name: The name for this operation is used to name operations and to find
      variables. If specified it must be unique for this scope, otherwise a
      unique name starting with "convolution2d" will be created.  See
      `tf.variable_op_scope` for details.
    weight_collections: List of graph collections to which weights are added.
    bias_collections: List of graph collections to which biases are added.
    output_collections: List of graph collections to which outputs are added.
    weight_regularizer: A regularizer like the result of
      `l1_regularizer` or `l2_regularizer`. Used for weights.
    bias_regularizer: A regularizer like the result of
      `l1_regularizer` or `l2_regularizer`. Used for biases.

  Returns:
    The result of applying a 2-D convolutional layer.

  Raises:
    ValueError: If `kernel_size` or `stride` are not length 2.
  """
    with variable_scope.variable_op_scope([x], name, 'convolution2d'):
        num_input_channels = x.get_shape().dims[3].value

        if len(kernel_size) != 2:
            raise ValueError('kernel_size must be length 2: %s' % (kernel_size,))
        if len(stride) != 2:
            raise ValueError('stride must be length 2: %s' % (stride,))

        stride = [1, stride[0], stride[1], 1]
        shape = [
            kernel_size[0], kernel_size[1], num_input_channels,
            num_output_channels
        ]
        dtype = x.dtype.base_dtype

        w = _weight_variable(shape=shape,
                             dtype=dtype,
                             initializer=weight_init,
                             collections=weight_collections,
                             regularizer=weight_regularizer)

        y = nn.conv2d(x, w, stride, padding)

        if bias_init is not None:
            b = _bias_variable(shape=[num_output_channels],
                               dtype=dtype,
                               initializer=bias_init,
                               collections=bias_collections,
                               regularizer=bias_regularizer)

            y = nn.bias_add(y, b)

        return _apply_activation(y, activation_fn, output_collections)
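
As the docstring notes, the bias term can be dropped entirely; a minimal sketch under the same import assumptions as the previous example:

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 28, 28, 1])
# bias_init=None skips creating `b`, so the layer computes y = f(conv2d(w, x)).
y = convolution2d(x, num_output_channels=8, kernel_size=(3, 3),
                  bias_init=None, activation_fn=tf.nn.relu)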
Example #5
    def create_shallownet(images, scope=None, net=None, dropout=True):
        """
        Args:
            images: a tensor of shape [B x H x W x C]
            net: An optional dict object
            scope: The variable scope for the subgraph, defaults to ShallowNet

        Returns:
            saliency_output: a tensor of shape [B x 49 x 49]
        """
        assert len(images.get_shape()) == 4  # [B, H, W, C]

        if net is None: net = {}
        else: assert isinstance(net, dict)

        net['dropout_keep_prob'] = tf.placeholder(tf.float32,
                                                  name='dropout_keep_prob')

        with tf.variable_scope(scope or 'ShallowNet'):
            # CONV
            net['conv1'] = convolution2d(
                images,
                32,
                kernel_size=(5, 5),
                stride=(1, 1),
                padding='VALID',
                activation_fn=None,  #tf.nn.relu,
                weights_initializer=initializers.xavier_initializer_conv2d(
                    uniform=True),
                biases_initializer=tf.constant_initializer(0.0),
                variables_collections=['MODEL_VARS'],
                scope='conv1')
            #net['conv1'] = tflearn.layers.batch_normalization(net['conv1'])
            net['conv1'] = tf.nn.relu(net['conv1'])

            net['pool1'] = tf.nn.max_pool(net['conv1'],
                                          ksize=[1, 2, 2, 1],
                                          strides=[1, 2, 2, 1],
                                          padding='SAME',
                                          name='pool1')
            log.info('Conv1 size : %s', net['conv1'].get_shape().as_list())
            log.info('Pool1 size : %s', net['pool1'].get_shape().as_list())

            net['conv2'] = convolution2d(
                net['pool1'],
                64,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding='VALID',
                activation_fn=None,  #tf.nn.relu,
                weights_initializer=initializers.xavier_initializer_conv2d(
                    uniform=True),
                biases_initializer=tf.constant_initializer(0.0),
                variables_collections=['MODEL_VARS'],
                scope='conv2')
            #net['conv2'] = tflearn.layers.batch_normalization(net['conv2'])
            net['conv2'] = tf.nn.relu(net['conv2'])

            net['pool2'] = tf.nn.max_pool(net['conv2'],
                                          ksize=[1, 3, 3, 1],
                                          strides=[1, 2, 2, 1],
                                          padding='SAME',
                                          name='pool2')
            log.info('Conv2 size : %s', net['conv2'].get_shape().as_list())
            log.info('Pool2 size : %s', net['pool2'].get_shape().as_list())

            net['conv3'] = convolution2d(
                net['pool2'],
                32,
                kernel_size=(3, 3),
                stride=(1, 1),
                padding='VALID',
                activation_fn=None,  #tf.nn.relu,
                weights_initializer=initializers.xavier_initializer_conv2d(
                    uniform=True),
                biases_initializer=tf.constant_initializer(0.0),
                variables_collections=['MODEL_VARS'],
                scope='conv3')
            #net['conv3'] = tflearn.layers.batch_normalization(net['conv3'])
            net['conv3'] = tf.nn.relu(net['conv3'])

            net['pool3'] = tf.nn.max_pool(net['conv3'],
                                          ksize=[1, 3, 3, 1],
                                          strides=[1, 2, 2, 1],
                                          padding='SAME',
                                          name='pool3')
            log.info('Conv3 size : %s', net['conv3'].get_shape().as_list())
            log.info('Pool3 size : %s', net['pool3'].get_shape().as_list())

            # FC layer
            n_inputs = int(np.prod(net['pool3'].get_shape().as_list()[1:]))
            pool3_flat = tf.reshape(net['pool3'], [-1, n_inputs])
            net['fc1'] = fully_connected(
                pool3_flat,
                4802,
                activation_fn=None,  #tf.nn.relu,
                weights_initializer=initializers.xavier_initializer(
                    uniform=True),
                biases_initializer=tf.constant_initializer(0.0),
                variables_collections=['MODEL_VARS'],
                scope='fc1')
            log.info('fc1 size : %s', net['fc1'].get_shape().as_list())

            #net['fc1'] = tflearn.layers.batch_normalization(net['fc1'])
            net['fc1'] = tf.nn.relu(net['fc1'])

            if dropout:
                net['fc1'] = tf.nn.dropout(net['fc1'],
                                           net['dropout_keep_prob'])

            fc1_slice1, fc1_slice2 = tf.split(
                net['fc1'], num_or_size_splits=2, axis=1, name='fc1_slice'
            )  # TF >= 1.0 tf.split signature; older releases used tf.split(axis, num, value)
            net['max_out'] = tf.maximum(fc1_slice1,
                                        fc1_slice2,
                                        name='fc1_maxout')

            log.info('maxout size : %s', net['max_out'].get_shape().as_list())

            net['fc2'] = fully_connected(
                net['max_out'],
                4802,
                activation_fn=None,  # no relu here
                weights_initializer=initializers.xavier_initializer(
                    uniform=True),
                biases_initializer=tf.constant_initializer(0.0),
                variables_collections=['MODEL_VARS'],
                scope='fc2')

            #net['fc2'] = tflearn.layers.batch_normalization(net['fc2'])
            net['fc2'] = tf.nn.relu(net['fc2'])

            #if dropout:
            #    net['fc2'] = tf.nn.dropout( net['fc2'], net['dropout_keep_prob'] )

            log.info('fc2 size : %s', net['fc2'].get_shape().as_list())

            fc2_slice1, fc2_slice2 = tf.split(net['fc2'],
                                              num_or_size_splits=2,
                                              axis=1,
                                              name='fc2_slice')
            net['max_out2'] = tf.maximum(fc2_slice1,
                                         fc2_slice2,
                                         name='fc2_maxout')
            '''
            net['fc3'] = fully_connected(net['max_out2'], 4802,
                                        activation_fn=None, # no relu here
                                        weights_initXializer=initializers.xavier_initializer(uniform=True),
                                        biases_initializer=tf.constant_initializer(0.0),
                                        weight_collections=['MODEL_VARS'], bias_collections=['MODEL_VARS'],
                                        name='fc3')
            #net['fc3'] = tflearn.layers.batch_normalization(net['fc3'])
            net['fc3'] = tf.nn.relu(net['fc3'])


            fc3_slice1, fc3_slice2 = tf.split(1, 2, net['fc3'], name='fc3_slice')
            net['max_out3'] = tf.maximum(fc3_slice1, fc3_slice2, name='fc3_maxout')

            net['max_out3'] = tflearn.layers.batch_normalization(net['max_out3'])
            '''

            #net['fc2'] = tf.nn.dropout( net['fc2'], net['dropout_keep_prob'] )

            #log.info('fc3 size : %s', net['fc3'].get_shape().as_list())

            # debug and summary
            #net['fc1'].get_shape().assert_is_compatible_with([None, 4802])
            #net['fc2'].get_shape().assert_is_compatible_with([None, 4802])
            #net['fc3'].get_shape().assert_is_compatible_with([None, 4802])
            #for t in [self.conv1, self.conv2, self.conv3,
            #          self.pool1, self.pool2, self.pool3,
            #          self.fc1, self.max_out, self.fc2]:
            #    _add_activation_histogram_summary(t)

            net['saliency'] = tf.reshape(net['max_out2'], [-1, 49, 49],
                                         name='saliency')

        return net['saliency']
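
A driver sketch, assuming this method is exposed on `SaliencyModel` (as it is called in the next example) and that the input resolution keeps every 'VALID' convolution valid (96x96 works); passing a `net` dict exposes the dropout placeholder:

import numpy as np
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 96, 96, 3])
net = {}
saliency = SaliencyModel.create_shallownet(images, net=net, dropout=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    maps = sess.run(saliency,
                    {images: np.zeros((4, 96, 96, 3), np.float32),
                     net['dropout_keep_prob']: 1.0})
    print(maps.shape)  # (4, 49, 49)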
Example #6
    def create_gazeprediction_network(frame_images,
                                      c3d_input,
                                      gt_gazemap,
                                      dropout_keep_prob,
                                      net=None):
        '''
        Args:
            frame_images: a [B x T x IH x IW x 3] tensor (frame images)
            c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
            gt_gazemap : a [B x T x GH x GW] tensor of ground truth per-frame gaze maps
            dropout_keep_prob : float tensor
            (optional) net : a dictionary to get intra-layer activations or tensors.

        Outputs:
            [predicted_gazemaps, loss, image_summary] where

            predicted_gazemaps : a [B x T x GH x GW] tensor,
                predicted gaze maps per frame
            loss: a scalar (float) tensor of RNN supervision loss.
            image_summary
        '''

        if net is None: net = {}
        else: assert isinstance(net, dict)

        vars = E()

        # (0) input sanity check
        GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
        IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
        B, T = frame_images.get_shape().as_list()[:2]

        assert B > 0 and T > 0
        frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
        c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])
        gt_gazemap.get_shape().assert_is_compatible_with([B, T, GH, GW])

        dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

        # some variables
        # --------------
        # not a proper name, it should be rnn_state_feature_size in # GRCN????????? FIXME
        rnn_state_size = 256  #dim_cnn_proj # filter size is more correct name
        ''' The RGP (Recurrent Gaze Prediction) model. '''

        # (1) Input frame saliency
        # ------------------------

        # Input.
        net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

        net['frm_sal'] = SaliencyModel.create_shallownet(
            tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
            scope='ShallowNet',
            dropout=False)  # [-1, 49, 49]
        net['frm_sal'] = tf.reshape(net['frm_sal'],
                                    [B, T, GH, GW])  # [B x T x 49 x 49]

        # [B x T x 49 x 49] --> [B x T x 49 x 49 x 1]
        net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
                                          name='frame_saliency_cubic')

        # (2) C3D
        # -------
        # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x 32] via FC or CONV
        # b. apply RCN, and get the [7 x 7 x 32] outputs from RNN

        # c3d input.
        net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
        # change axis and reshape to [B x T x 7 x 7 x 1024]
        net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                perm=[0, 1, 3, 4, 2],
                                                name='c3d_input_reshape')
        log.info('c3d_input_reshape shape : %s',
                 net['c3d_input_reshape'].get_shape().as_list())
        net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, 1024])

        # c3d_embedded: project each 1024-d feature (per 7x7 c3d conv-feature map) into dim_cnn_proj
        vars.proj_c3d_W = tf.Variable(tf.random_uniform([1024, dim_cnn_proj],
                                                        -0.1, 0.1),
                                      name="proj_c3d_W")
        vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj], -0.1,
                                                        0.1),
                                      name="proj_c3d_b")

        net['c3d_embedded'] = tf.nn.xw_plus_b(
            tf.reshape(net['c3d_input_reshape'],
                       [-1, 1024]), vars.proj_c3d_W, vars.proj_c3d_b
        )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj] by applying W: 1024 -> dim_cnn_proj

        # --> [B x T x 7 x 7 x dim_cnn_proj]
        net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                         [B, T, 7, 7, dim_cnn_proj])
        log.info('c3d_embedded shape : %s',
                 net['c3d_embedded'].get_shape().as_list())
        net['c3d_embedded'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, dim_cnn_proj])

        # The RNN Part.
        # -------------

        # Batch size x (gaze map size), per frame
        net['gt_gazemap'] = gt_gazemap  # [B x T x GH x GW]
        log.info('gt_gazemap shape : %s',
                 net['gt_gazemap'].get_shape().as_list())

        with tf.variable_scope('RCNBottom') as scope:
            vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

            state_u = vars.lstm_u.zero_state(B, tf.float32)
            log.info('RNN state shape : %s', state_u.get_shape().as_list())

            # n_lstm_step for example, 35.
            net['rcn_outputs'] = rcn_outputs = []
            for i in range(T):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                # We use cnn embedding + ... as RNN input (as a flatted/concatenated vector)
                rnn_input = tf.concat(
                    axis=3,  # [:, i, 7, 7, HERE]
                    values=[  #  0     1  2  3
                        net['c3d_embedded']
                        [:, i, :, :, :],  # (i) C3D map (embedded into 7x7xdim_cnn_proj)
                    ],
                    name='rnn_input' + str(i))

                #with tf.variable_scope("RNN"):
                output_u, state_u = vars.lstm_u(rnn_input, state_u)

                # at time t
                output_u.get_shape().assert_is_compatible_with(
                    [B, 7, 7, rnn_state_size])  # Bx{time}x7x7x32
                rcn_outputs.append(output_u)

        # (3) RCN output unpooling to 49x49 size
        # each of (7x7x32) maps are up-sampled to (49x49x8)
        upsampling_filter_size = 11
        upsampling_output_channel = 64
        vars.upsampling_filter = tf.get_variable(
            'Upsampling/weight',
            [
                upsampling_filter_size, upsampling_filter_size,
                upsampling_output_channel, rnn_state_size
            ],  # rnn_state_size bad name (indeed a channel size)
            initializer=initializers.xavier_initializer_conv2d(uniform=True))

        net['rcn_upsampled_outputs'] = rcn_upsampled_outputs = []
        for i in range(T):
            rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x 128]

            rcn_upsampled_output = tf.nn.conv2d_transpose(
                rcn_output_map,
                vars.upsampling_filter,
                output_shape=[B, GH, GW, upsampling_output_channel],
                strides=[1, 7, 7, 1],
                padding='SAME',
                name='upsampled_rcn_output_' + str(i))
            rcn_upsampled_output.get_shape().assert_is_compatible_with(
                [B, GH, GW, upsampling_output_channel])
            rcn_upsampled_outputs.append(rcn_upsampled_output)

            if i == 0:
                log.info('RCN input map size : %s',
                         rcn_output_map.get_shape().as_list())
                log.info('RCN upsampled size : %s',
                         rcn_upsampled_output.get_shape().as_list())

        # (4) The upper layer of GRCN to emit gaze map
        # --------------------------------------------
        with tf.variable_scope('RCNGaze') as scope:

            vars.lstm_g = GRU_RCN_Cell(
                num_units=3,
                #                                       dim_feature=upsampling_output_channel + 1 + 1, # 10?
                dim_feature=upsampling_output_channel + 1,  # 10?
                spatial_shape=[GH, GW],
                kernel_spatial_shape=[5, 5])

            state_g = vars.lstm_g.zero_state(B, tf.float32)
            #            last_output_gazemap = tf.zeros([B, GH, GW, 1])

            predicted_gazemaps = []
            for i in range(T):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                # try RNN supervision with GT gazemap.
                # FIXME decoder should be spin off here
                #if i > 0:
                #    last_output_gazemap = tf.expand_dims(gt_gazemap[:, i - 1, :, :], 3)

                # now, combine image saliency, rcn map from the bottom layer,
                # and the previous input
                '''
                rcn_input_concat = tf.concat(concat_dim=3, # the last dimension
                                            values=[
                                                rcn_upsampled_outputs[i],             # [B x 49 x 49 x 8]
                                                net['frm_sal_cubic'][:, i, :, :, :],  # [B x 49 x 49 x 1]
#                                                last_output_gazemap                   # [B x 49 x 49 x 1]
                                            ])
                '''
                #with tf.variable_scope("RNN"):
                output_g, state_g = vars.lstm_g(rcn_upsampled_outputs[i],
                                                state_g)

                output_g.get_shape().assert_is_compatible_with([B, GH, GW, 3])
                output_g = tf.reshape(output_g, [B, -1])

                # apply another convolutional layer (== fc in fact) to gaze map
                # [B x 49 x 49 x 3] -> # [B x 49 x 49 x 1]

                with tf.variable_scope('LastProjection') as scope_proj:
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()

                    fc1 = fully_connected(
                        output_g,
                        4802,
                        activation_fn=None,  #tf.nn.relu,
                        weight_init=initializers.xavier_initializer(
                            uniform=True),
                        bias_init=tf.constant_initializer(0.0),
                        weight_collections=['MODEL_VARS'],
                        bias_collections=['MODEL_VARS'],
                        name='fc1')
                    #net['fc1'] = tflearn.layers.batch_normalization(net['fc1'])
                    fc1 = tf.nn.relu(fc1)

                    if dropout_keep_prob is not None:
                        fc1 = tf.nn.dropout(fc1, dropout_keep_prob)

                    fc1_slice1, fc1_slice2 = tf.split(fc1,
                                                      num_or_size_splits=2,
                                                      axis=1,
                                                      name='fc1_slice')
                    max_out = tf.maximum(fc1_slice1,
                                         fc1_slice2,
                                         name='fc1_maxout')

                    fc2 = fully_connected(
                        max_out,
                        4802,
                        activation_fn=None,  # no relu here
                        weight_init=initializers.xavier_initializer(
                            uniform=True),
                        bias_init=tf.constant_initializer(0.0),
                        weight_collections=['MODEL_VARS'],
                        bias_collections=['MODEL_VARS'],
                        name='fc2')
                    #net['fc2'] = tflearn.layers.batch_normalization(net['fc2'])
                    fc2 = tf.nn.relu(fc2)

                    #if dropout:
                    #    net['fc2'] = tf.nn.dropout( net['fc2'], net['dropout_keep_prob'] )

                    fc2_slice1, fc2_slice2 = tf.split(fc2,
                                                      num_or_size_splits=2,
                                                      axis=1,
                                                      name='fc2_slice')
                    max_out2 = tf.maximum(fc2_slice1,
                                          fc2_slice2,
                                          name='fc2_maxout')

                predicted_gazemap = tf.reshape(
                    max_out2,
                    [B, GH, GW])  # [B x 49 x 49 x 1] -> [B x 49 x 49] squeeze
                predicted_gazemaps.append(predicted_gazemap)
                # TODO should we normalize predicted_gazemap ????????????????????????????

        # (4) Finally, calculate the loss
        loss = 0.0

        for i in range(T):
            predicted_gazemap = predicted_gazemaps[i]

            # Cross entropy and softmax??
            l2loss = tf.nn.l2_loss(predicted_gazemap -
                                   gt_gazemap[:, i, :, :])  # on Bx49x49
            current_gaze_loss = tf.reduce_sum(l2loss)

            current_loss = current_gaze_loss
            loss += current_loss

        # loss: take average
        loss = tf.div(loss, float(B * T), name='loss_avg')

        # FIXME may be duplicates?
        tf.summary.scalar('loss/train', loss)
        tf.summary.scalar('loss/val', loss, collections=['TEST_SUMMARIES'])

        # pack as a tensor
        # T-list of [B x 49 x 49] --> [B x T x 49 x 49]
        net['predicted_gazemaps'] = tf.transpose(tf.stack(predicted_gazemaps),
                                                 [1, 0, 2, 3],
                                                 name='predicted_gazemaps')
        net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
            [B, T, GH, GW])

        # Debugging Informations
        # ----------------------

        # OPTIONAL: for debugging and visualization
        # XXX only last predicted_gazemap is shown as of now :( T^T
        # XXX rename saliency -> gaze (to avoid confusion)
        def _add_image_summary(tag, tensor):
            return tf.summary.image(tag,
                                    tensor,
                                    max_outputs=2,
                                    collections=['IMAGE_SUMMARIES'])

        _input_image = frame_images[:, i, :, :, :]  # last rnn step
        _saliency_output = tf.reshape(predicted_gazemap, [-1, GH, GW, 1])
        _saliency_gt = tf.reshape(gt_gazemap[:, i, :, :], [-1, GH, GW, 1])
        _saliency_shallow = tf.reshape(net['frm_sal'][:, i, :, :],
                                       [-1, GH, GW, 1])

        _add_image_summary('inputimage', _input_image)
        _add_image_summary('saliency_maps_gt', _saliency_gt)
        _add_image_summary('saliency_maps_pred_original', _saliency_output)
        _add_image_summary('saliency_maps_pred_norm',
                           tf_normalize_map(_saliency_output))
        #_add_image_summary('saliency_zimgframe_shallow77', _saliency_shallow77)
        _add_image_summary('saliency_zshallownet', _saliency_shallow)

        image_summaries = tf.summary.merge(
            inputs=tf.get_collection('IMAGE_SUMMARIES'),
            collections=[],
            name='merged_image_summary',
        )

        return net['predicted_gazemaps'], loss, image_summaries
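
A wiring sketch for this training graph, assuming the surrounding `GazePredictionGRU` class (referenced in the comments above) exposes the method and that `CONSTANTS` provides the map sizes; the optimizer is an illustrative choice, not taken from the source:

import tensorflow as tf

B, T = 4, 8  # hypothetical batch size and number of timesteps
GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
IH, IW = CONSTANTS.image_height, CONSTANTS.image_width

frame_images = tf.placeholder(tf.float32, [B, T, IH, IW, 3])
c3d_input = tf.placeholder(tf.float32, [B, T, 1024, 7, 7])
gt_gazemap = tf.placeholder(tf.float32, [B, T, GH, GW])
dropout_keep_prob = tf.placeholder(tf.float32, [])

pred, loss, image_summary = GazePredictionGRU.create_gazeprediction_network(
    frame_images, c3d_input, gt_gazemap, dropout_keep_prob)
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)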
Example #7
    def _build_net(self):
        print('Constructing generator with resolution of %dx%d' % (self.nin_sp,self.nin_sp))
        self.layers = []

        with tf.variable_scope('encoder_in'):
            net = slim.conv2d(self.nin, self.first_layer_ch, [1,1], stride=1,
                              padding='SAME',
                              weights_initializer=initializers.xavier_initializer_conv2d(),
                              weights_regularizer=None,
                              rate=1, normalizer_fn=slim.batch_norm, activation_fn=tf.nn.leaky_relu,
                              scope='conv0')
            self.layers.append(net)
            print('-- Layer %d: ' % len(self.layers), 'encoder_in ', self.layers[-1].get_shape().as_list())

        for i in range(1, self.encoder_layer_num, 1):
            sp = self.layers[-1].get_shape().as_list()[-2]
            with tf.variable_scope('encoder_%dx%d' % (sp, sp)):
                net = slim.conv2d(self.layers[-1], min(self.first_layer_ch*(2**i), self.bottleneck_ch), [4,4],
                                  stride=2, padding='SAME',
                                  weights_initializer=initializers.xavier_initializer_conv2d(),
                                  weights_regularizer=None,
                                  rate=1, normalizer_fn=slim.batch_norm, activation_fn=tf.nn.leaky_relu,
                                  scope='conv0')
                self.layers.append(net)
                print('-- Layer %d: ' % len(self.layers), 'encoder_%dx%d ' % (sp, sp), self.layers[-1].get_shape().as_list())

        for i in range(self.res_block_num):
            with tf.variable_scope('residual_block_%d' % i):
                net = slim.conv2d(self.layers[-1], self.bottleneck_ch, [3,3], stride=1, padding='SAME',
                                  weights_initializer=initializers.xavier_initializer_conv2d(),
                                  weights_regularizer=None,
                                  rate=1, normalizer_fn=None, activation_fn=tf.nn.leaky_relu,
                                  scope='conv0')
                net = tf.add(net, self.layers[-1])
                self.layers.append(net)
                print('-- Layer %d: ' % len(self.layers), 'residual_block_%d ' % i, self.layers[-1].get_shape().as_list())

        for i in range(self.decoder_layer_num-1, 0, -1):
            sp = self.layers[-1].get_shape().as_list()[-2]
            with tf.variable_scope('decoder_%dx%d' % (sp*2, sp*2)):
                net = tf.image.resize_bilinear(self.layers[-1], (sp*2, sp*2), align_corners=True)
                net = slim.conv2d(net, min(self.first_layer_ch*(2**i), self.bottleneck_ch), [3,3],
                                  stride=1, padding='SAME',
                                  weights_initializer=initializers.xavier_initializer_conv2d(),
                                  weights_regularizer=None,
                                  rate=1, normalizer_fn=slim.batch_norm, activation_fn=tf.nn.relu,
                                  scope='conv0')
                net = tf.concat([net, self.layers[i-1], tf.image.resize_area(self.nin, (sp*2,sp*2), align_corners=False)], axis=3)
                net = slim.conv2d(net, min(self.first_layer_ch*(2**i), self.bottleneck_ch), [3,3],
                                  stride=1, padding='SAME',
                                  weights_initializer=initializers.xavier_initializer_conv2d(),
                                  weights_regularizer=None,
                                  rate=1, normalizer_fn=slim.batch_norm, activation_fn=tf.nn.relu,
                                  scope='conv1')
                self.layers.append(net)
                print('-- Layer %d: ' % len(self.layers), 'decoder_%dx%d ' % (sp*2, sp*2), self.layers[-1].get_shape().as_list())

        with tf.variable_scope('decoder_out'):
            net = slim.conv2d(self.layers[-1], self.nout_ch, [1,1], stride=1, padding='SAME',
                              weights_initializer=initializers.xavier_initializer_conv2d(),
                              rate=1, activation_fn=tf.nn.sigmoid, scope='conv0')
            self.layers.append(net)
            print('-- Layer %d: ' % len(self.layers), 'decoder_out ', self.layers[-1].get_shape().as_list())
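
The decoder here upsamples with a bilinear resize followed by a stride-1 convolution rather than a transposed convolution, which avoids checkerboard artifacts. The pattern in isolation, as a sketch with hypothetical names:

import tensorflow as tf
from tensorflow.contrib.layers.python.layers import initializers

slim = tf.contrib.slim

def upsample_block(x, out_ch, scope):
    """Resize-then-convolve upsampling, mirroring the decoder loop above."""
    h, w = x.get_shape().as_list()[1:3]
    x = tf.image.resize_bilinear(x, (h * 2, w * 2), align_corners=True)
    return slim.conv2d(x, out_ch, [3, 3], stride=1, padding='SAME',
                       weights_initializer=initializers.xavier_initializer_conv2d(),
                       normalizer_fn=slim.batch_norm, activation_fn=tf.nn.relu,
                       scope=scope)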
Example #8
def conv2d_tiny_complex(
    inputs,
    num_outputs,
    rate=1,
    padding='SAME',
    data_format=None,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer_conv2d(),
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer,
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None,
):
    """Tiny Convolution 2d.
    """
    with variable_scope.variable_scope(scope, 'Conv', [inputs],
                                       reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        dtype = inputs.dtype.base_dtype
        input_rank = inputs.get_shape().ndims
        if input_rank is None:
            raise ValueError('Rank of inputs must be known')
        if input_rank < 3 or input_rank > 5:
            raise ValueError(
                'Rank of inputs is %d, which is not >= 3 and <= 5' %
                input_rank)
        conv_dims = input_rank - 2

        # First 2x2 convolution.
        num_outputs_inter = num_outputs // 4
        out_list = []
        paddings = [[[0, 0], [0, rate], [0, rate], [0, 0]],
                    [[0, 0], [0, rate], [rate, 0], [0, 0]],
                    [[0, 0], [rate, 0], [0, rate], [0, 0]],
                    [[0, 0], [rate, 0], [rate, 0], [0, 0]]]
        for i in range(4):
            output = slim.conv2d(inputs,
                                 num_outputs_inter, [2, 2],
                                 rate=rate,
                                 padding='VALID',
                                 activation_fn=activation_fn,
                                 normalizer_fn=normalizer_fn,
                                 normalizer_params=normalizer_params,
                                 weights_initializer=weights_initializer,
                                 weights_regularizer=weights_regularizer,
                                 biases_initializer=biases_initializer,
                                 biases_regularizer=biases_regularizer,
                                 scope='conv_2x2_%i' % i)
            out_list.append(tf.pad(output, paddings[i], mode='CONSTANT'))
            print(out_list[-1].get_shape())
            # out_list.append(output)

        # Concatenating outputs.
        output = tf.concat(out_list, axis=input_rank - 1)
        return output
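
A shape-level call sketch: with `rate=1` the four shifted pads restore the spatial size lost by the 'VALID' 2x2 convolutions, so the output matches the input resolution.

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 32, 32, 16])
# Four 2x2 branches of num_outputs // 4 channels each, padded back to 32x32
# in complementary directions and concatenated on the channel axis.
y = conv2d_tiny_complex(x, num_outputs=64)  # -> [B, 32, 32, 64]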
Example #9
def conv2d_tiny(
    inputs,
    num_outputs,
    rate=1,
    padding='SAME',
    data_format=None,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer_conv2d(),
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer,
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None,
):
    """Tiny Convolution 2d.
    """
    with variable_scope.variable_scope(scope, 'Conv', [inputs],
                                       reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        dtype = inputs.dtype.base_dtype
        input_rank = inputs.get_shape().ndims
        if input_rank is None:
            raise ValueError('Rank of inputs must be known')
        if input_rank < 3 or input_rank > 5:
            raise ValueError(
                'Rank of inputs is %d, which is not >= 3 and <= 5' %
                input_rank)
        conv_dims = input_rank - 2

        # First 2x2 convolution.
        # num_outputs_inter = num_outputs
        output = slim.conv2d(
            inputs,
            num_outputs,
            [2, 2],
            rate=rate,
            padding='VALID',
            activation_fn=None,
            normalizer_fn=normalizer_fn,
            normalizer_params=normalizer_params,
            # normalizer_fn=None,
            # normalizer_params=None,
            weights_initializer=initializers.xavier_initializer_conv2d(),
            weights_regularizer=weights_regularizer,
            biases_initializer=None,
            # biases_initializer=init_ops.zeros_initializer,
            biases_regularizer=biases_regularizer,
            scope='conv_2x2')

        # Paddings + second convolution.
        paddings = [[0, 0], [rate, rate], [rate, rate], [0, 0]]
        output = tf.pad(output, paddings, mode='CONSTANT')
        output = slim.conv2d(
            output,
            num_outputs,
            [2, 2],
            rate=rate,
            padding='VALID',
            activation_fn=activation_fn,
            # normalizer_fn=normalizer_fn,
            # normalizer_params=normalizer_params,
            normalizer_fn=None,
            normalizer_params=None,
            weights_initializer=initializers.xavier_initializer_conv2d(),
            weights_regularizer=weights_regularizer,
            # biases_initializer=None,
            biases_initializer=init_ops.zeros_initializer,
            biases_regularizer=biases_regularizer,
            scope='conv_concat')
        return output
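
The two chained 2x2 convolutions cover a 3x3 receptive field with fewer weights than a single 3x3 kernel; a call sketch with `rate=1`:

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 32, 32, 16])
# 32 -> 31 ('VALID' 2x2), padded to 33, -> 32 ('VALID' 2x2): size is preserved.
y = conv2d_tiny(x, num_outputs=64)  # -> [B, 32, 32, 64]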
Example #10
    def create_gazeprediction_network(frame_images,
                                      c3d_input,
                                      dropout_keep_prob=1.0,
                                      net=None):
        '''
        Args:
            frame_images: a [B x T x IH x IW x 3] tensor (frame images)
            c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
            dropout_keep_prob : float tensor
            (optional) net : a dictionary to get intra-layer activations or tensors.

        Outputs:
            predicted_gazemaps : a [B x T x GH x GW] tensor,
                predicted gaze maps per frame
        '''

        if net is None: net = {}
        else: assert isinstance(net, dict)

        vars = E()

        # (0) input sanity check
        GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
        IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
        B, T = frame_images.get_shape().as_list()[:2]

        assert B > 0 and T > 0
        frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
        c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

        dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

        # some variables
        # --------------
        # not a proper name, it should be rnn_state_feature_size in # GRCN????????? FIXME
        rnn_state_size = 128  #dim_cnn_proj # filter size is more correct name
        ''' The RGP (Recurrent Gaze Prediction) model. '''

        with tf.variable_scope("RGP"):

            # (2) C3D
            # -------
            # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x 32] via FC or CONV
            # b. apply RCN, and get the [7 x 7 x 32] outputs from RNN

            # c3d input.
            net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
            # change axis and reshape to [B x T x 7 x 7 x 1024]
            net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                    perm=[0, 1, 3, 4, 2],
                                                    name='c3d_input_reshape')
            log.info('c3d_input_reshape shape : %s',
                     net['c3d_input_reshape'].get_shape().as_list())
            net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
                [B, T, 7, 7, 1024])

            # c3d_embedded: project each 1024-d feature (per 7x7 c3d conv-feature map) into dim_cnn_proj
            vars.proj_c3d_W = tf.Variable(tf.random_uniform(
                [1024, dim_cnn_proj], -0.1, 0.1),
                                          name="proj_c3d_W")
            vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj],
                                                            -0.1, 0.1),
                                          name="proj_c3d_b")

            net['c3d_embedded'] = tf.nn.xw_plus_b(
                tf.reshape(net['c3d_input_reshape'],
                           [-1, 1024]), vars.proj_c3d_W, vars.proj_c3d_b
            )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj] by applying W: 1024 -> dim_cnn_proj

            if dropout_keep_prob != 1.0:
                net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'],
                                                    dropout_keep_prob)

            # --> [B x T x 7 x 7 x dim_cnn_proj]
            net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                             [B, T, 7, 7, dim_cnn_proj])
            log.info('c3d_embedded shape : %s',
                     net['c3d_embedded'].get_shape().as_list())
            net['c3d_embedded'].get_shape().assert_is_compatible_with(
                [B, T, 7, 7, dim_cnn_proj])

            # Instead of RNN part, we have deconvolution
            # -------------

            rcn_outputs = [None] * T
            for i in range(T):
                rcn_outputs[i] = net['c3d_embedded'][:, i, :, :, :]
                # B x 7 x 7 x 512(dim_cnn_proj)

            # (3) RCN output unpooling to 49x49 size
            # each of (7x7x32) maps are up-sampled to (49x49x8)
            vars.upsampling_filter1 = tf.get_variable(
                'Upsampling/weight1',
                [
                    5,
                    5,
                    64,
                    dim_cnn_proj,  # directly project 512->64
                    #rnn_state_size
                ],  # rnn_state_size bad name (indeed a channel size)
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))
            vars.upsampling_filter2 = tf.get_variable(
                'Upsampling/weight2',
                [5, 5, 32, 64
                 ],  # rnn_state_size bad name (indeed a channel size)
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))

            vars.upsampling_filter3 = tf.get_variable(
                'Upsampling/weight3',
                [7, 7, 12, 32
                 ],  # rnn_state_size bad name (indeed a channel size)
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))
            vars.out_W = tf.Variable(tf.random_uniform([12, 1], -0.1, 0.1),
                                     name="out_W")
            vars.out_b = tf.Variable(tf.random_uniform([1], -0.1, 0.1),
                                     name="out_b")

            predicted_gazemaps = []
            for i in range(T):
                rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x dim_cnn_proj]

                rcn_upsampled_output = tf.nn.conv2d_transpose(
                    rcn_output_map,
                    vars.upsampling_filter1,
                    output_shape=[B, 23, 23, 64],
                    strides=[1, 3, 3, 1],
                    padding='VALID',
                    name='upsampled_rcn_output_' + str(i))
                #rcn_upsampled_output.get_shape().assert_is_compatible_with([B, GH, GW, upsampling_output_channel])
                rcn_upsampled_output = tf.nn.conv2d_transpose(
                    rcn_upsampled_output,
                    vars.upsampling_filter2,
                    output_shape=[B, 49, 49, 32],
                    strides=[1, 2, 2, 1],
                    padding='VALID',
                    name='upsampled_rcn_output_' + str(i))
                input_concat = tf.concat(
                    axis=3,  # the last dimension
                    values=[
                        rcn_upsampled_output,  # [B x 49 x 49 x 32]
                        #                                            net['frm_sal_cubic'][:, i, :, :, :],  # [B x 49 x 49 x 1]
                        # last_output_gazemap                   # [B x 49 x 49 x 1]
                    ])

                output = tf.nn.conv2d_transpose(input_concat,
                                                vars.upsampling_filter3,
                                                output_shape=[B, 49, 49, 12],
                                                strides=[1, 1, 1, 1],
                                                padding='SAME',
                                                name='upsampled_rcn_output_' +
                                                str(i))

                output = tf.nn.xw_plus_b(tf.reshape(output, [-1, 12]),
                                         vars.out_W, vars.out_b)
                output = tf.nn.dropout(output, dropout_keep_prob)

                predicted_gazemap = tf.reshape(
                    output,
                    [B, GH, GW])  # [B x 49 x 49 x 1] -> [B x 49 x 49] squeeze
                predicted_gazemaps.append(predicted_gazemap)
                # TODO should we normalize predicted_gazemap ????????????????????????????

            # pack as a tensor
            # T-list of [B x 49 x 49] --> [B x T x 49 x 49]
            net['predicted_gazemaps'] = tf.transpose(
                tf.stack(predicted_gazemaps), [1, 0, 2, 3],
                name='predicted_gazemaps')
            net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
                [B, T, GH, GW])

        return net['predicted_gazemaps']
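
The two stacked `conv2d_transpose` calls are sized so the 7x7 C3D grid lands exactly on the 49x49 gaze map: with 'VALID' padding the output side is (in - 1) * stride + kernel, i.e. 6*3 + 5 = 23 and then 22*2 + 5 = 49. A call sketch, with hypothetical sizes:

import tensorflow as tf

B, T = 4, 8  # hypothetical batch size and clip length
frame_images = tf.placeholder(tf.float32,
                              [B, T, CONSTANTS.image_height,
                               CONSTANTS.image_width, 3])
c3d_input = tf.placeholder(tf.float32, [B, T, 1024, 7, 7])

predicted = create_gazeprediction_network(frame_images, c3d_input,
                                          dropout_keep_prob=0.5)
# predicted: [B x T x GH x GW]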
Example #11
    def create_gazeprediction_network(frame_images,
                                      c3d_input,
                                      dropout_keep_prob=1.0,
                                      net=None):
        '''
        Args:
            frame_images: a [B x T x IH x IW x 3] tensor (frame images)
            c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
            dropout_keep_prob : float tensor
            (optional) net : a dictionary to get intra-layer activations or tensors.

        Outputs:
            predicted_gazemaps : a [B x T x GH x GW] tensor,
                predicted gaze maps per frame
        '''

        if net is None:
            net = {}
        else:
            assert isinstance(net, dict)

        vars = E()

        # (0) input sanity check
        GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
        IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
        B, T = frame_images.get_shape().as_list()[:2]

        assert B > 0 and T > 0
        frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
        c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

        dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

        # some variables
        # --------------
        # not a proper name, it should be rnn_state_feature_size in # GRCN????????? FIXME
        rnn_state_size = 128  # dim_cnn_proj # filter size is more correct name
        ''' The RGP (Recurrent Gaze Prediction) model. '''

        with tf.variable_scope("RGP"):

            # (2) C3D
            # -------
            # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x 32] via FC or CONV
            # b. apply RCN, and get the [7 x 7 x 32] outputs from RNN

            # c3d input.
            net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
            # change axis and reshape to [B x T x 7 x 7 x 1024]
            net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                    perm=[0, 1, 3, 4, 2],
                                                    name='c3d_input_reshape')
            log.info('c3d_input_reshape shape : %s',
                     net['c3d_input_reshape'].get_shape().as_list())
            net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
                [B, T, 7, 7, 1024])

            # c3d_embedded: project each 1024-d feature (per 7x7 c3d conv-feature map) into dim_cnn_proj
            vars.proj_c3d_W = tf.Variable(tf.random_uniform(
                [1024, dim_cnn_proj], -0.1, 0.1),
                                          name="proj_c3d_W")
            vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj],
                                                            -0.1, 0.1),
                                          name="proj_c3d_b")

            net['c3d_embedded'] = tf.nn.xw_plus_b(
                tf.reshape(net['c3d_input_reshape'],
                           [-1, 1024]), vars.proj_c3d_W, vars.proj_c3d_b
            )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj] by applying W: 1024 -> dim_cnn_proj

            if dropout_keep_prob != 1.0:
                net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'],
                                                    dropout_keep_prob)

            # --> [B x T x 7 x 7 x dim_cnn_proj]
            net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                             [B, T, 7, 7, dim_cnn_proj])
            log.info('c3d_embedded shape : %s',
                     net['c3d_embedded'].get_shape().as_list())
            net['c3d_embedded'].get_shape().assert_is_compatible_with(
                [B, T, 7, 7, dim_cnn_proj])

            # The RNN Part.
            # -------------

            with tf.variable_scope('RCNBottom') as scope:
                vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

                state_u = vars.lstm_u.zero_state(B, tf.float32)
                log.info('RNN state shape : %s', state_u.get_shape().as_list())

                predicted_gazemaps = []
                net['rcn_outputs'] = rcn_outputs = []

                # n_lstm_step for example, 35. -> 42 has highest performance
                for i in range(T):  # T = number of timesteps
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()

                    # We use cnn embedding + ... as RNN input (as a flatted/concatenated vector)
                    rnn_input = tf.concat(
                        values=[  # 0     1  2  3
                            # (i) C3D map (embedded into 7x7x12)
                            net['c3d_embedded'][:, i, :, :, :],
                        ],
                        axis=3,  # [:, i, 7, 7, HERE]
                        name='rnn_input' + str(i))

                    # with tf.variable_scope("RNN"):
                    output_u, state_u = vars.lstm_u(rnn_input, state_u)

                    # at time t
                    output_u.get_shape().assert_is_compatible_with(
                        [B, 7, 7, rnn_state_size])  # Bx{time}x7x7x32
                    rcn_outputs.append(output_u)

            # (3) RCN output unpooling to 49x49 size
            # each of (7x7x32) maps are up-sampled to (49x49x8)
            vars.upsampling_filter1 = tf.get_variable(
                'Upsampling/weight1',
                [5, 5, 64, rnn_state_size
                 ],  # rnn_state_size bad name (indeed a channel size)
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))
            vars.upsampling_filter2 = tf.get_variable(
                'Upsampling/weight2',
                [5, 5, 32, 64
                 ],  # rnn_state_size bad name (indeed a channel size)
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))

            vars.upsampling_filter3 = tf.get_variable(
                'Upsampling/weight3',
                [7, 7, 12, 32
                 ],  # rnn_state_size bad name (indeed a channel size)
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))
            vars.out_W = tf.Variable(tf.random_uniform([12, 1], -0.1, 0.1),
                                     name="out_W")
            vars.out_b = tf.Variable(tf.random_uniform([1], -0.1, 0.1),
                                     name="out_b")

            predicted_gazemaps = []
            # Batch normalization assumption (verify if wrong): apply before each convolutional layer
            for i in range(T):
                rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x 128]

                # for now in here - later will add to base:

                # batch_mean, batch_var = tf.nn.moments(rcn_output_map, axes = [0,1,2]) #global normalization for conv_filters
                # what to do with offset and scale?
                rcn_output_map = tf.layers.batch_normalization(rcn_output_map)
                rcn_upsampled_output = tf.nn.conv2d_transpose(
                    rcn_output_map,
                    vars.upsampling_filter1,
                    output_shape=[B, 23, 23, 64],
                    strides=[1, 3, 3, 1],
                    padding='VALID',
                    name='upsampled_rcn_output_' + str(i))

                #rcn_upsampled_output.get_shape().assert_is_compatible_with([B, GH, GW, upsampling_output_channel])
                rcn_upsampled_output = tf.nn.conv2d_transpose(
                    rcn_upsampled_output,
                    vars.upsampling_filter2,
                    output_shape=[B, 49, 49, 32],
                    strides=[1, 2, 2, 1],
                    padding='VALID',
                    name='upsampled_rcn_output_' + str(i))

                input_concat = tf.concat(
                    axis=3,  # the last dimension
                    values=[
                        # [B x 49 x 49 x 8]
                        rcn_upsampled_output,
                        #                                            net['frm_sal_cubic'][:, i, :, :, :],  # [B x 49 x 49 x 1]
                        # last_output_gazemap                   # [B x 49 x 49 x 1]
                    ])

                output = tf.nn.conv2d_transpose(input_concat,
                                                vars.upsampling_filter3,
                                                output_shape=[B, 49, 49, 12],
                                                strides=[1, 1, 1, 1],
                                                padding='SAME',
                                                name='upsampled_rcn_output_' +
                                                str(i))

                output = tf.nn.xw_plus_b(tf.reshape(output, [-1, 12]),
                                         vars.out_W, vars.out_b)
                output = tf.nn.dropout(output, dropout_keep_prob)

                # [B x 49 x 49 x 1] -> [B x 49 x 49] squeeze
                predicted_gazemap = tf.reshape(output, [B, GH, GW])
                predicted_gazemaps.append(predicted_gazemap)
                # TODO should we normalize predicted_gazemap ????????????????????????????

            # pack as a tensor
            # T-list of [B x 49 x 49] --> [B x T x 49 x 49]
            net['predicted_gazemaps'] = tf.transpose(
                tf.stack(predicted_gazemaps), [1, 0, 2, 3],
                name='predicted_gazemaps')
            net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
                [B, T, GH, GW])

        return net['predicted_gazemaps']
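
The custom `GRU_RCN_Cell` behaves like a standard RNN cell with a spatial (convolutional) state. A hypothetical skeleton of the contract implied by the calls above, not the project's actual implementation:

import tensorflow as tf

class GRU_RCN_Cell(object):
    """Skeleton inferred from usage: a convolutional GRU over a
    [B, h, w, num_units] state (the gates would be convolutions in practice)."""

    def __init__(self, num_units, dim_feature,
                 spatial_shape=(7, 7), kernel_spatial_shape=(3, 3)):
        self.num_units = num_units
        self.dim_feature = dim_feature
        self.spatial_shape = list(spatial_shape)

    def zero_state(self, batch_size, dtype):
        h, w = self.spatial_shape
        return tf.zeros([batch_size, h, w, self.num_units], dtype=dtype)

    def __call__(self, inputs, state):
        # inputs: [B, h, w, dim_feature]; returns (output, new_state),
        # both [B, h, w, num_units].
        raise NotImplementedError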