Example No. 1
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     labels,
                     scope='default',
                     is_training=True,
                     **unused_params):
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            stacked_lstm_fw = tf.contrib.rnn.MultiRNNCell([
                tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
                for _ in range(number_of_layers)
            ])

            stacked_lstm_bw = tf.contrib.rnn.MultiRNNCell([
                tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
                for _ in range(number_of_layers)
            ])

            outputs, state = tf.nn.bidirectional_dynamic_rnn(
                stacked_lstm_fw,
                stacked_lstm_bw,
                model_input,
                sequence_length=num_frames,
                dtype=tf.float32,
                swap_memory=True)

            if FLAGS.lstm_pooling_method == 'last':
                # Final hidden state of the top layer, forward and backward.
                pooled = [state[i][-1].h for i in range(2)]
            else:
                pooled = [
                    utils.FramePooling(outputs[0], FLAGS.lstm_pooling_method),
                    utils.FramePooling(outputs[1], FLAGS.lstm_pooling_method)
                ]

            output = tf.concat(pooled, 1)

            aggregated_model = getattr(video_level_models,
                                       FLAGS.video_level_classifier_model)

            results = aggregated_model().create_model(model_input=output,
                                                      vocab_size=vocab_size,
                                                      is_training=is_training,
                                                      **unused_params)
            results['features'] = output
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
        return results
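
A note on the shared helper: every example in this section pools frame-level activations with utils.FramePooling. For reference, a minimal sketch of that helper, assuming the semantics of the YouTube-8M starter code (frames is a 'batch_size' x 'max_frames' x 'num_features' tensor):

    import tensorflow as tf

    def FramePooling(frames, method, **unused_params):
        """Pools over the frame (time) axis of a [batch, frames, features] tensor."""
        if method == "average":
            return tf.reduce_mean(frames, 1)
        elif method == "max":
            return tf.reduce_max(frames, 1)
        elif method == "none":
            # No pooling: flatten frames into the batch dimension.
            feature_size = frames.get_shape().as_list()[2]
            return tf.reshape(frames, [-1, feature_size])
        raise ValueError("Unrecognized pooling method: %s" % method)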
Example No. 2
    def forward(self, reshaped_input):

        feature_size = self.feature_size
        cluster_size = self.cluster_size
        add_batch_norm = self.add_batch_norm
        max_frames = self.max_frames
        is_training = self.is_training

        cluster_weights = tf.get_variable("cluster_weights",
          [feature_size, cluster_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
        
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        
        if add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [cluster_size],
            initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
          tf.summary.histogram("cluster_biases", cluster_biases)
          activation += cluster_biases

        # NOTE: the original compared the `activation` tensor itself to a
        # string, which is never true; an activation-name setting (assumed
        # here as self.activation_name) was presumably intended.
        activation_name = getattr(self, 'activation_name', 'relu')
        if activation_name == 'glu':
            # Gated linear unit: the first half of the units is the signal,
            # the second half (after a sigmoid) gates it.
            half = cluster_size // 2
            gates = tf.sigmoid(activation[:, half:])
            activation = tf.multiply(activation[:, :half], gates)
        elif activation_name == 'relu':
            activation = tf.nn.relu6(activation)
        
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])

        avg_activation = utils.FramePooling(activation, 'average')
        avg_activation = tf.nn.l2_normalize(avg_activation, 1)

        max_activation = utils.FramePooling(activation, 'max')
        max_activation = tf.nn.l2_normalize(max_activation, 1)

        return tf.concat([avg_activation, max_activation], 1)
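
The 'glu' branch above is a gated linear unit: half of the cluster units act as the signal and the other half, through a sigmoid, gate it, halving the width. A standalone sketch (the function name is illustrative, not from the original):

    import tensorflow as tf

    def glu(x):
        """Gated linear unit over the last axis: first_half * sigmoid(second_half)."""
        signal, gate = tf.split(x, num_or_size_splits=2, axis=-1)
        return signal * tf.sigmoid(gate)

    # e.g. an input of shape [4, 512] comes out with shape [4, 256].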
Example No. 3
  def forward(self, reshaped_input):

    cluster_weights = tf.get_variable("cluster_weights",
      [self.feature_size, self.cluster_size],
      initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    
    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable("cluster_biases",
        [self.cluster_size],
        initializer = tf.random_normal(stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases
    
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
    activation = utils.FramePooling(activation, self.dbof_pooling_method)

    return activation
Example No. 4
    def create_model(self, model_input, vocab_size, num_frames,
                     **unused_params):
        lstm_size = FLAGS.lstm_cells
        lstm_layers = FLAGS.lstm_layers
        dropout_keep_prob = FLAGS.dropout_keep_prob

        ## Stacked LSTM cells with layer normalization and recurrent dropout
        stacked_lnlstm = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.LayerNormBasicLSTMCell(
                lstm_size, dropout_keep_prob=dropout_keep_prob)
            for _ in range(lstm_layers)
        ])

        loss = 0.0
        with tf.variable_scope("RNN"):
            outputs, state = tf.nn.dynamic_rnn(stacked_lnlstm,
                                               model_input,
                                               sequence_length=num_frames,
                                               dtype=tf.float32)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        if FLAGS.use_lstm_output:
            agg_model_inputs = utils.FramePooling(outputs,
                                                  FLAGS.pooling_method)
        else:
            agg_model_inputs = state[-1]

        return aggregated_model().create_model(model_input=agg_model_inputs,
                                               vocab_size=vocab_size,
                                               **unused_params)
Example No. 5
    def create_model(self, model_input, vocab_size, num_frames,
                     **unused_params):
        """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
           frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
        lstm_size = FLAGS.lstm_cells
        #number_of_layers = FLAGS.lstm_layers

        with tf.variable_scope("lstm_1"):
            # GRU cells have no forget gate, so no forget_bias argument.
            lstm_1 = tf.contrib.rnn.GRUCell(lstm_size)
            outputs, state = tf.nn.dynamic_rnn(lstm_1,
                                               model_input,
                                               sequence_length=num_frames,
                                               swap_memory=True,
                                               dtype=tf.float32)

        # Time skip: subsample the first GRU's outputs along the time axis.
        skip_outputs = outputs[:, ::FLAGS.time_skip, :]

        with tf.variable_scope("lstm_2"):
            # GRU cells have no forget gate, so no forget_bias argument.
            lstm_2 = tf.contrib.rnn.GRUCell(lstm_size)
            outputs2, state2 = tf.nn.dynamic_rnn(
                lstm_2,
                skip_outputs,
                # Integer division keeps sequence_length an integer tensor.
                sequence_length=num_frames // FLAGS.time_skip,
                swap_memory=True,
                dtype=tf.float32)

        loss = 0.0

        # Aggregate GRU states and outputs.
        model_state = tf.concat([state, state2], axis=1)
        model_outputs = tf.concat([outputs, outputs2], axis=1)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        if FLAGS.use_lstm_output:
            return aggregated_model().create_model(
                model_input=utils.FramePooling(model_outputs,
                                               FLAGS.pooling_method),
                vocab_size=vocab_size,
                **unused_params)
        else:
            return aggregated_model().create_model(model_input=model_state,
                                                   vocab_size=vocab_size,
                                                   **unused_params)
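
The outputs[:, ::FLAGS.time_skip, :] slice above subsamples the first GRU's outputs along the time axis before the second GRU consumes them. A toy illustration of the stride (shapes only, values are placeholders):

    import tensorflow as tf

    frames = tf.zeros([2, 8, 3])   # batch=2, time=8, features=3
    skipped = frames[:, ::2, :]    # keep every 2nd frame
    print(skipped.shape)           # (2, 4, 3)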
Example No. 6
    def create_model(self, model_input, vocab_size, num_frames,
                     **unused_params):
        """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
           frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers

        if FLAGS.use_attention:
            stacked_lstm = tf.contrib.rnn.MultiRNNCell([
                tf.contrib.rnn.AttentionCellWrapper(
                    tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0),
                    FLAGS.attention_len) for _ in range(number_of_layers)
            ])
        elif FLAGS.use_residuals:
            stacked_lstm = tf.contrib.rnn.MultiRNNCell([
                tf.contrib.rnn.ResidualWrapper(
                    tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0))
                for _ in range(number_of_layers)
            ])
        else:
            stacked_lstm = tf.contrib.rnn.MultiRNNCell([
                tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
                for _ in range(number_of_layers)
            ])

        loss = 0.0

        outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                           model_input,
                                           sequence_length=num_frames,
                                           dtype=tf.float32)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        if FLAGS.use_lstm_output:
            agg_model_inputs = utils.FramePooling(outputs,
                                                  FLAGS.pooling_method)
        else:
            # Note: state[-1].h assumes plain LSTMStateTuple states; the
            # AttentionCellWrapper path returns a differently structured state.
            agg_model_inputs = state[-1].h

        return aggregated_model().create_model(model_input=agg_model_inputs,
                                               vocab_size=vocab_size,
                                               **unused_params)
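
For context, the tf.contrib.rnn.ResidualWrapper used above adds the cell's input to its output at every step, so the wrapped layers learn residuals. A minimal sketch of the idea (not the contrib implementation itself):

    import tensorflow as tf

    class ResidualCellSketch(object):
        """Wraps an RNN cell so output = cell_output + input.

        Requires the cell's output size to match the input feature size
        (here, lstm_size must equal the input dimension).
        """

        def __init__(self, cell):
            self._cell = cell

        def __call__(self, inputs, state):
            output, new_state = self._cell(inputs, state)
            return output + inputs, new_state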
Example No. 7
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     labels,
                     scope='default',
                     is_training=True,
                     **unused_params):

        lstm_size = FLAGS.lstm_cells
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):

            with tf.variable_scope('lstm1', reuse=tf.AUTO_REUSE):
                lstm1 = tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])

                outputs1, _ = tf.nn.dynamic_rnn(lstm1,
                                                model_input,
                                                sequence_length=num_frames,
                                                dtype=tf.float32,
                                                swap_memory=True)
            with tf.variable_scope('lstm2', reuse=tf.AUTO_REUSE):
                lstm2 = tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])

                outputs2, _ = tf.nn.dynamic_rnn(lstm2,
                                                outputs1,
                                                sequence_length=num_frames,
                                                dtype=tf.float32,
                                                swap_memory=True)
            with tf.variable_scope('lstm3', reuse=tf.AUTO_REUSE):
                lstm3 = tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])

                outputs, state = tf.nn.dynamic_rnn(lstm3,
                                                   outputs2 + outputs1,
                                                   sequence_length=num_frames,
                                                   dtype=tf.float32,
                                                   swap_memory=True)

            if FLAGS.lstm_pooling_method == 'last':
                inp = state[-1].h
            else:
                inp = utils.FramePooling(outputs, FLAGS.lstm_pooling_method)

            aggregated_model = getattr(video_level_models,
                                       FLAGS.video_level_classifier_model)

            results = aggregated_model().create_model(model_input=inp,
                                                      vocab_size=vocab_size,
                                                      is_training=is_training,
                                                      **unused_params)
            results['features'] = inp
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
        return results
Example No. 8
    def create_model(self, model_input, vocab_size, num_frames,
                     **unused_params):
        """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
           frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
        max_frame = 100
        model_input = SampleRandomFrames(model_input, num_frames, max_frame)
        # max_frame = model_input.get_shape().as_list()[1]
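        # NOTE: treating each frame as a 32x32 "image" assumes
        # num_features == 1024; other feature sizes need a different shape.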
        image = tf.reshape(model_input, [-1, 32, 32])
        image = tf.expand_dims(image, 3)
        with slim.arg_scope(
            [slim.conv2d],
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.01),
                weights_regularizer=slim.l2_regularizer(0.0005),
                normalizer_fn=slim.batch_norm):
            net = slim.conv2d(image,
                              32, [5, 5],
                              padding='VALID',
                              scope='conv1')
            net = slim.relu(net, 32, scope='relu1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.conv2d(net, 64, [5, 5], padding='VALID', scope='conv2')
            net = slim.relu(net, 64, scope='relu2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.conv2d(net, 128, [5, 5], padding='VALID', scope='conv3')
            net = slim.relu(net, 128, scope='relu3')
            net = tf.squeeze(net, [1, 2], name='squeezed')
            print(net)

        net = tf.reshape(net, [-1, max_frame, 128])
        net = utils.FramePooling(net, 'max')
        net = slim.fully_connected(net, 512, scope='fc4')
        print(net)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=net,
                                               vocab_size=vocab_size,
                                               **unused_params)
Example No. 9
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     labels,
                     scope='default',
                     is_training=True,
                     **unused_params):
        """Creates a model which uses a stack of LSTMs to represent the video.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
           frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            stacked_lstm = tf.contrib.rnn.MultiRNNCell([
                tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
                for _ in range(number_of_layers)
            ])

            outputs, state = tf.nn.dynamic_rnn(stacked_lstm,
                                               model_input,
                                               sequence_length=num_frames,
                                               dtype=tf.float32,
                                               swap_memory=True)

            aggregated_model = getattr(video_level_models,
                                       FLAGS.video_level_classifier_model)
            if FLAGS.lstm_pooling_method == 'last':
                inp = state[-1].h
            else:
                inp = utils.FramePooling(outputs, FLAGS.lstm_pooling_method)
            results = aggregated_model().create_model(model_input=inp,
                                                      vocab_size=vocab_size,
                                                      is_training=is_training,
                                                      **unused_params)
            results['features'] = inp
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
        return results
Example No. 10
    def create_model(self,
                     model_input,
                     vocab_size,
                     labels,
                     scope='default',
                     is_training=True,
                     **unused_params):
        X = FLAGS.residualcnn_x
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            fc = slim.fully_connected(
                model_input,
                X,
                weights_regularizer=tf.contrib.layers.l2_regularizer(0.01))
            reshaped_input = tf.expand_dims(fc, -1)
            reshaped_input = tf.expand_dims(reshaped_input, -1)

            conv1 = slim.convolution(reshaped_input, 64, [49, 1])
            conv1_norm = slim.batch_norm(conv1, is_training=is_training)

            module1 = self.residual_module([128, 192, 64], conv1_norm,
                                           'module1')
            module1_norm = slim.batch_norm(module1, is_training=is_training)

            conv2 = slim.convolution(module1_norm, 128, 1)
            conv2_norm = slim.batch_norm(conv2, is_training=is_training)

            module2 = self.residual_module([256, 512, 128], conv2_norm,
                                           'module2')
            module2_norm = slim.batch_norm(module2, is_training=is_training)

            conv3 = slim.convolution(module2_norm, 256, 1)
            conv3_norm = slim.batch_norm(conv3, is_training=is_training)

            module3 = self.residual_module([512, 256], conv3_norm, 'module3')
            module3_norm = slim.batch_norm(module3, is_training=is_training)

            conv4 = slim.convolution(module3_norm, X, 1)
            conv4_norm = slim.batch_norm(conv4, is_training=is_training)

            module4 = self.residual_module([512, X], conv4_norm, 'module4')

            features = tf.squeeze(module4, [2])
            features = model_utils.FramePooling(features,
                                                FLAGS.residualcnn_pooling) + fc
            results = MoeModel().create_model(features, vocab_size)
            results['features'] = features
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
            return results
Example No. 11
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     labels,
                     scope='default',
                     **unused_params):

        lstm_size = FLAGS.lstm_cells
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            cells = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])

            outputs1, _ = tf.nn.dynamic_rnn(cells,
                                            model_input,
                                            sequence_length=num_frames,
                                            dtype=tf.float32,
                                            swap_memory=True,
                                            scope='first')
            cells1 = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])

            outputs2, state2 = tf.nn.dynamic_rnn(
                cells1,
                # Every 2nd step of the first LSTM's outputs (assumes at most
                # 300 frames per video).
                outputs1[:, 0:300:2, :],
                # Integer division keeps sequence_length an integer tensor.
                sequence_length=num_frames // 2,
                dtype=tf.float32,
                swap_memory=True,
                scope='second')

            aggregated_model = getattr(video_level_models,
                                       FLAGS.video_level_classifier_model)

            if FLAGS.lstm_pooling_method == 'last':
                output = state2[-1].h
            else:
                output = utils.FramePooling(outputs2,
                                            FLAGS.lstm_pooling_method)
            results = aggregated_model().create_model(model_input=output,
                                                      vocab_size=vocab_size,
                                                      **unused_params)
            results['features'] = output
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
        return results
Example No. 12
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     l2_penalty=1e-4,
                     **unused_params):
        """Creates a model which uses a logistic classifier over the average of the
    frame-level features.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
           frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
        #    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        #    feature_size = model_input.get_shape().as_list()[2]
        #
        #    logging.info('model_input shape: {}'.format(
        #            model_input.get_shape().as_list()))
        #
        #    denominators = tf.reshape(
        #        tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
        #    avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators

        avg_pooled = utils.FramePooling(model_input, 'average')

        logging.info('avg_pooled shape: {}'.format(
            avg_pooled.get_shape().as_list()))

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        return aggregated_model().create_model(model_input=avg_pooled,
                                               vocab_size=vocab_size,
                                               num_mixtures=2,
                                               **unused_params)
Example No. 13
    def create_model(self, model_input, vocab_size, num_frames,
                     **unused_params):
        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers

        lstm_fw = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.BasicLSTMCell(lstm_size) for _ in range(number_of_layers)],
            state_is_tuple=False)

        lstm_bw = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.BasicLSTMCell(lstm_size) for _ in range(number_of_layers)],
            state_is_tuple=False)

        loss = 0.0
        with tf.variable_scope("RNN"):
            outputs1, states1 = tf.nn.bidirectional_dynamic_rnn(
                lstm_fw,
                lstm_bw,
                model_input,
                dtype=tf.float32,
                sequence_length=num_frames)
        outputs = tf.concat(outputs1, 2)
        state = tf.concat(states1, 1)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        if FLAGS.use_lstm_output:
            agg_model_inputs = utils.FramePooling(outputs,
                                                  FLAGS.pooling_method)
        else:
            # With state_is_tuple=False the concatenated state is a plain
            # tensor, so there is no .h attribute to read; use it directly.
            agg_model_inputs = state

        return aggregated_model().create_model(model_input=agg_model_inputs,
                                               vocab_size=vocab_size,
                                               **unused_params)
Example No. 14
  def forward(self, reshaped_input):

    initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.cluster_size))
    input_dim = reshaped_input.get_shape().as_list()
    circ_layer_hidden = CirculantLayerWithFactor(
                          (None, self.feature_size), 
                          self.cluster_size, 
                          k_factor=self.k_factor, 
                          initializer=initializer)
    activation = circ_layer_hidden.matmul(reshaped_input)

    # cluster_weights = tf.get_variable("cluster_weights",
    #   [self.feature_size, self.cluster_size],
    #   initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.cluster_size)))
    # tf.summary.histogram("cluster_weights", cluster_weights)
    # activation = tf.matmul(reshaped_input, cluster_weights)
    
    if self.add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=self.is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable("cluster_biases",
        [self.cluster_size],
        initializer = tf.random_normal(stddev=1 / math.sqrt(self.feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases
    
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
    activation = utils.FramePooling(activation, self.dbof_pooling_method)

    return activation
Example No. 15
    def create_model(self,
                     model_input,
                     vocab_size,
                     labels,
                     scope='default',
                     is_training=True,
                     **unused_params):

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            reshaped_input = tf.expand_dims(model_input, -1)
            reshaped_input = tf.expand_dims(reshaped_input, -1)

            conv1 = slim.convolution(reshaped_input,
                                     64, [49, 1],
                                     stride=(4, 1))
            max_pool1 = slim.max_pool2d(conv1, (9, 1), (2, 1), padding='SAME')
            norm1 = tf.nn.local_response_normalization(max_pool1)

            conv2 = slim.convolution(norm1, 64, 1, 1)
            conv3 = slim.convolution(conv2, 192, (9, 1), 1)
            norm2 = tf.nn.local_response_normalization(conv3)
            max_pool2 = slim.max_pool2d(norm2, (9, 1), (2, 1), padding='SAME')

            inception3a = self.inception_module(max_pool2,
                                                [64, 96, 128, 16, 32, 32],
                                                '3a')
            inception3b = self.inception_module(inception3a,
                                                [128, 128, 192, 32, 96, 64],
                                                '3b')

            max_pool3 = slim.max_pool2d(inception3b, (9, 1), (2, 1),
                                        padding='SAME')

            inception4a = self.inception_module(max_pool3,
                                                [192, 96, 208, 16, 48, 64],
                                                '4a')
            inception4b = self.inception_module(inception4a,
                                                [160, 112, 224, 24, 64, 64],
                                                '4b')
            inception4c = self.inception_module(inception4b,
                                                [128, 128, 256, 24, 64, 64],
                                                '4c')
            inception4d = self.inception_module(inception4c,
                                                [112, 144, 288, 32, 64, 64],
                                                '4d')
            inception4e = self.inception_module(inception4d,
                                                [256, 160, 320, 32, 128, 128],
                                                '4e')

            max_pool4 = slim.max_pool2d(inception4e, (9, 1), (2, 1),
                                        padding='SAME')

            inception5a = self.inception_module(max_pool4,
                                                [256, 160, 320, 32, 128, 128],
                                                '5a')
            inception5b = self.inception_module(inception5a,
                                                [384, 192, 384, 48, 128, 128],
                                                '5b')

            inter1 = tf.squeeze(inception4a, axis=[2])
            inter2 = tf.squeeze(inception4d, axis=[2])
            output = tf.squeeze(inception5b, axis=[2])
            inter1 = model_utils.FramePooling(inter1, FLAGS.googlenet_pooling)
            inter2 = model_utils.FramePooling(inter2, FLAGS.googlenet_pooling)
            output = model_utils.FramePooling(output, FLAGS.googlenet_pooling)

            inter_results1 = MoeModel().create_model(inter1, vocab_size,
                                                     'inter1')
            inter_results2 = MoeModel().create_model(inter2, vocab_size,
                                                     'inter2')
            results = MoeModel().create_model(output, vocab_size, 'final')
            results['features'] = output
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
                results['loss'] += losses.CrossEntropyLoss().calculate_loss(
                    inter_results1['predictions'], labels)
                results['loss'] += losses.CrossEntropyLoss().calculate_loss(
                    inter_results2['predictions'], labels)

            return results
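
The inception_module helper is not shown in this example. A plausible sketch, assuming the standard GoogLeNet four-branch layout adapted to the Nx1 "temporal image" used here (the 9x1 kernels mirror the convolutions above; the sizes list is read in the usual [1x1, reduce, 3x3, reduce, 5x5, pool-proj] order; structure is hypothetical):

    def inception_module(self, net, sizes, scope):
        # sizes = [b1, b2_reduce, b2, b3_reduce, b3, b4_proj]
        with tf.variable_scope('inception_' + scope):
            b1 = slim.conv2d(net, sizes[0], 1)
            b2 = slim.conv2d(slim.conv2d(net, sizes[1], 1), sizes[2], (9, 1))
            b3 = slim.conv2d(slim.conv2d(net, sizes[3], 1), sizes[4], (9, 1))
            pool = slim.max_pool2d(net, (3, 1), 1, padding='SAME')
            b4 = slim.conv2d(pool, sizes[5], 1)
            # SAME padding and stride 1 keep spatial dims, so the branches
            # concatenate along the channel axis.
            return tf.concat([b1, b2, b3, b4], axis=3)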
Example No. 16
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     labels,
                     scope='default',
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
            if random_frames:
                model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                       iterations)
            else:
                model_input = utils.SampleRandomSequence(
                    model_input, num_frames, iterations)
            max_frames = model_input.get_shape().as_list()[1]
            feature_size = model_input.get_shape().as_list()[2]
            reshaped_input = tf.reshape(model_input, [-1, feature_size])
            tf.summary.histogram("input_hist", reshaped_input)

            if add_batch_norm:
                reshaped_input = slim.batch_norm(reshaped_input,
                                                 center=True,
                                                 scale=True,
                                                 is_training=is_training,
                                                 scope="input_bn")

            cluster_weights = tf.get_variable(
                "cluster_weights", [feature_size, cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            tf.summary.histogram("cluster_weights", cluster_weights)
            activation = tf.matmul(reshaped_input, cluster_weights)
            if add_batch_norm:
                activation = slim.batch_norm(activation,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="cluster_bn")
            else:
                cluster_biases = tf.get_variable(
                    "cluster_biases", [cluster_size],
                    initializer=tf.random_normal_initializer(
                        stddev=1 / math.sqrt(feature_size)))
                tf.summary.histogram("cluster_biases", cluster_biases)
                activation += cluster_biases
            activation = tf.nn.relu6(activation)
            tf.summary.histogram("cluster_output", activation)

            activation = tf.reshape(activation, [-1, max_frames, cluster_size])
            activation = utils.FramePooling(activation,
                                            FLAGS.dbof_pooling_method)

            hidden1_weights = tf.get_variable(
                "hidden1_weights", [cluster_size, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(cluster_size)))
            tf.summary.histogram("hidden1_weights", hidden1_weights)
            activation = tf.matmul(activation, hidden1_weights)
            if add_batch_norm:
                activation = slim.batch_norm(activation,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="hidden1_bn")
            else:
                hidden1_biases = tf.get_variable(
                    "hidden1_biases", [hidden1_size],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                tf.summary.histogram("hidden1_biases", hidden1_biases)
                activation += hidden1_biases
            activation = tf.nn.relu6(activation)
            tf.summary.histogram("hidden1_output", activation)

            aggregated_model = getattr(video_level_models,
                                       FLAGS.video_level_classifier_model)
            results = aggregated_model().create_model(model_input=activation,
                                                      vocab_size=vocab_size,
                                                      **unused_params)
            results['features'] = activation
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
        return results
Example No. 17
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        """See base class.

    Args:
       model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
         input features.
       vocab_size: The number of classes in the dataset.
       num_frames: A vector of length 'batch' which indicates the number of
         frames for each video (before padding).
       iterations: the number of frames to be sampled.
       add_batch_norm: whether to add batch norm during training.
       sample_random_frames: whether to sample random frames or random sequences.
       cluster_size: the output neuron number of the cluster layer.
       hidden_size: the output neuron number of the hidden layer.
       is_training: whether to build the graph in training mode.

    Returns:
       A dictionary with a tensor containing the probability predictions of the
       model in the 'predictions' key. The dimensions of the tensor are
       'batch_size' x 'num_classes'.
    """
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size
        act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation)
        assert act_fn is not None, ("dbof_activation is not valid: %s." %
                                    FLAGS.dbof_activation)

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.compat.v1.summary.histogram("input_hist", reshaped_input)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(
                reshaped_input,
                center=True,
                scale=True,
                is_training=is_training,
                scope="input_bn",
            )

        cluster_weights = tf.compat.v1.get_variable(
            "cluster_weights",
            [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(feature_size)),
        )
        tf.compat.v1.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=is_training,
                scope="cluster_bn",
            )
        else:
            cluster_biases = tf.compat.v1.get_variable(
                "cluster_biases",
                [cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)),
            )
            tf.compat.v1.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases
        # The cluster layer keeps a hardcoded relu6; the configurable act_fn
        # is applied to the hidden layer below.
        activation = tf.nn.relu6(activation)
        tf.compat.v1.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

        hidden1_weights = tf.compat.v1.get_variable(
            "hidden1_weights",
            [cluster_size, hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(cluster_size)),
        )
        tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=is_training,
                scope="hidden1_bn",
            )
        else:
            hidden1_biases = tf.compat.v1.get_variable(
                "hidden1_biases",
                [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01),
            )
            tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        # The configurable activation is applied here instead of relu6.
        activation = act_fn(activation)
        tf.compat.v1.summary.histogram("hidden1_output", activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               **unused_params)
Example No. 18
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
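        # NOTE: the cluster sizes below are hardcoded; the cluster_size
        # argument is ignored in this example.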
        cluster_size = 2048
        cluster_size_2 = 512
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.EqualSpaceMeans(model_input, num_frames,
                                                iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        cluster_weights = tf.get_variable(
            "cluster_weights", [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(feature_size)))
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.get_variable(
                "cluster_biases", [cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            activation += cluster_biases
        activation = tf.nn.relu6(activation)

        # NOTE: this reshape assumes iterations == 30 sampled frames, arranged
        # as a 3x10 grid; FramePooling then collapses the first grid axis.
        activation = tf.reshape(activation, [-1, 3, 10, cluster_size])
        activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
        activation = tf.reshape(activation, [-1, 2, 5, cluster_size])
        activation = tf.transpose(activation, [0, 2, 3, 1])
        activation = tf.reshape(activation, [-1, cluster_size * 2])
        cluster_weights_2 = tf.get_variable(
            "cluster_weights2", [cluster_size * 2, cluster_size_2],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size * 2)))
        activation = tf.matmul(activation, cluster_weights_2)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn2")
        else:
            cluster_biases_2 = tf.get_variable(
                "cluster_biases2", [cluster_size_2],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(cluster_size * 2)))
            activation += cluster_biases_2
        activation = tf.nn.relu6(activation)
        activation = tf.reshape(activation, [-1, cluster_size_2 * 5])

        hidden1_weights = tf.get_variable(
            "hidden1_weights", [cluster_size_2 * 5, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size_2 * 5)))
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        activation = tf.nn.relu6(activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               **unused_params)
Example No. 19
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     labels,
                     scope='default',
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
            if random_frames:
                model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                       iterations)
            else:
                model_input = utils.SampleRandomSequence(
                    model_input, num_frames, iterations)
            max_frames = model_input.get_shape().as_list()[1]
            feature_size = model_input.get_shape().as_list()[2]
            reshaped_input = tf.reshape(model_input, [-1, feature_size])

            tf.summary.histogram("input_hist", reshaped_input)
            reshaped_input = tf.expand_dims(reshaped_input, -1)
            reshaped_input = tf.expand_dims(reshaped_input, -1)

            out1 = tf.layers.conv2d(
                reshaped_input,
                128, (32, 1),
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                padding='same')
            out1_norm = tf.layers.batch_normalization(out1,
                                                      training=is_training)
            out1_pool = tf.layers.max_pooling2d(out1_norm, (8, 1),
                                                2,
                                                padding='same')

            out2 = tf.layers.conv2d(
                out1_pool,
                256, (32, 1),
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                padding='same')
            out2_norm = tf.layers.batch_normalization(out2,
                                                      training=is_training)
            out2_pool = tf.layers.max_pooling2d(out2_norm, (8, 1),
                                                2,
                                                padding='same')

            out3 = tf.layers.conv2d(
                out2_pool,
                256, (32, 1),
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                padding='same')
            out3_norm = tf.layers.batch_normalization(out3,
                                                      training=is_training)
            out3_pool = tf.layers.max_pooling2d(out3_norm, (8, 1),
                                                2,
                                                padding='same')

            out = tf.reduce_max(out3_pool, axis=[2, 3])
            activation = tf.reshape(out, [-1, max_frames, out.shape[1]])
            cluster_size = out.shape[1]

            activation = utils.FramePooling(activation,
                                            FLAGS.dbof_pooling_method)

            activation = tf.layers.dense(
                activation,
                hidden1_size,
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer())

            tf.summary.histogram("activation", activation)

            aggregated_model = getattr(video_level_models,
                                       FLAGS.video_level_classifier_model)
            results = aggregated_model().create_model(model_input=activation,
                                                      vocab_size=vocab_size,
                                                      is_training=is_training,
                                                      **unused_params)

            results['features'] = activation
            if labels is not None:
                results['loss'] = losses.CrossEntropyLoss().calculate_loss(
                    results['predictions'], labels)
        return results
Example No. 20
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleFramesOrdered(model_input,
                                                    num_frames,
                                                    iterations,
                                                    is_training=is_training)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.summary.histogram("input_hist", reshaped_input)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        cluster_weights = tf.get_variable(
            "cluster_weights", [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(feature_size)))
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.get_variable(
                "cluster_biases", [cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

        # Add hidden layers: hidden1_size*4 -> hidden1_size*2 -> hidden1_size
        # (e.g. 4096 -> 2048 -> 1024).
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [cluster_size, hidden1_size * 4],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size * 4],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        activation = tf.nn.relu6(activation)

        hidden2_weights = tf.get_variable(
            "hidden2_weights", [hidden1_size * 4, hidden1_size * 2],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(hidden1_size * 4)))
        tf.summary.histogram("hidden2_weights", hidden2_weights)
        activation = tf.matmul(activation, hidden2_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden2_bn")
        else:
            hidden2_biases = tf.get_variable(
                "hidden2_biases", [hidden1_size * 2],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden2_biases", hidden2_biases)
            activation += hidden2_biases
        activation = tf.nn.relu6(activation)
        # Final projection down to hidden1_size.
        hidden3_weights = tf.get_variable(
            "hidden3_weights", [hidden1_size * 2, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(hidden1_size * 2)))
        tf.summary.histogram("hidden3_weights", hidden3_weights)
        activation = tf.matmul(activation, hidden3_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden3_bn")
        else:
            hidden3_biases = tf.get_variable(
                "hidden3_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden3_biases", hidden3_biases)
            activation += hidden3_biases
        activation = tf.nn.relu6(activation)

        return activation
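Example No. 20's three hidden layers repeat one pattern: a matmul, then batch norm (or a learned bias), then relu6. Purely as illustration, that block can be factored into a helper like the sketch below; the helper name and scope naming are mine, not part of the original.

import math
import tensorflow as tf
import tensorflow.contrib.slim as slim

def fc_bn_relu6(x, in_size, out_size, scope, add_batch_norm, is_training):
    """One matmul -> batch-norm/bias -> relu6 block, as repeated above."""
    weights = tf.get_variable(
        scope + "_weights", [in_size, out_size],
        initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(in_size)))
    tf.summary.histogram(scope + "_weights", weights)
    x = tf.matmul(x, weights)
    if add_batch_norm:
        x = slim.batch_norm(x, center=True, scale=True,
                            is_training=is_training, scope=scope + "_bn")
    else:
        biases = tf.get_variable(
            scope + "_biases", [out_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        x += biases
    return tf.nn.relu6(x)

With this helper the body above reduces to three calls tapering hidden1_size * 4 -> hidden1_size * 2 -> hidden1_size.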
Example No. 21
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = model_utils.SampleRandomFrames(
                model_input, num_frames, iterations)
        else:
            model_input = model_utils.SampleRandomSequence(
                model_input, num_frames, iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.summary.histogram("input_hist", reshaped_input)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        cluster_weights = tf.Variable(
            tf.random_normal([feature_size, cluster_size],
                             stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.Variable(
                tf.random_normal([cluster_size],
                                 stddev=1 / math.sqrt(feature_size)))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        activation = model_utils.FramePooling(activation,
                                              FLAGS.dbof_pooling_method)

        hidden1_weights = tf.Variable(
            tf.random_normal([cluster_size, hidden1_size],
                             stddev=1 / math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.Variable(
                tf.random_normal([hidden1_size], stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        return aggregated_model().create_model(model_input=activation,
                                               original_input=model_input,
                                               vocab_size=vocab_size,
                                               **unused_params)
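Both DBoF variants delegate temporal aggregation to FramePooling, whose implementation is not included in this listing. Assuming the usual semantics of collapsing the frame axis, a minimal stand-in for the 'average' and 'max' methods would be:

import tensorflow as tf

def frame_pooling_sketch(frames, method):
    """Assumed behavior of utils.FramePooling / model_utils.FramePooling.

    frames: [batch, max_frames, feature_size]
    returns: [batch, feature_size]
    """
    if method == "average":
        return tf.reduce_mean(frames, axis=1)
    if method == "max":
        return tf.reduce_max(frames, axis=1)
    raise ValueError("Unknown pooling method: %s" % method)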
Example No. 22
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        print("num_frames=", num_frames)

        num_frames_ = num_frames
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])

        print("model_input=", model_input)
        print("max_frames({}), feature_size({})".format(
            max_frames, feature_size))
        print("reshaped_input=", reshaped_input)

        tf.summary.histogram("input_hist", reshaped_input)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        cluster_weights = tf.get_variable(
            "cluster_weights", [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(feature_size)))
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.get_variable(
                "cluster_biases", [cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("cluster_output", activation)

        print("activation=", activation)
        print("model_input=", model_input)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        print("activation before pooling=", activation)

        lstm_size = FLAGS.lstm_cells
        number_of_layers = FLAGS.lstm_layers
        print("max_frames=", max_frames)
        # Every sampled sequence is exactly `iterations` frames long, so tile
        # that constant into a per-example sequence_length vector.
        iterations_ = tf.tile(tf.expand_dims(iterations, 0),
                              tf.shape(model_input)[0:1])
        stacked_lstm = tf.contrib.rnn.MultiRNNCell(
            [
                #tf.contrib.rnn.BasicLSTMCell(
                #    lstm_size, forget_bias=1.0, state_is_tuple=True)
                tf.contrib.rnn.GRUCell(lstm_size)
                for _ in range(number_of_layers)
            ],
            state_is_tuple=True)

        _, state = tf.nn.dynamic_rnn(stacked_lstm,
                                     activation,
                                     sequence_length=iterations_,
                                     dtype=tf.float32)

        activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

        # Concatenate each GRU layer's final state with the pooled cluster
        # activations: [batch, lstm_layers * lstm_size + cluster_size].
        stacked = tf.concat(list(state) + [activation], axis=1)

        stacked_size = stacked.get_shape().as_list()[-1]

        hidden1_weights = tf.get_variable(
            "hidden1_weights", [stacked_size, hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(stacked_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(stacked, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               **unused_params)
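In Example No. 22 the `stacked` tensor feeding hidden1_weights concatenates each GRU layer's final state with the pooled cluster activations, so its width is lstm_layers * lstm_cells + cluster_size. A quick check with illustrative values (the flag defaults below are assumptions, not from the original):

# Illustrative only: real values come from FLAGS at run time.
lstm_layers = 2
lstm_cells = 1024
cluster_size = 8192

stacked_size = lstm_layers * lstm_cells + cluster_size
print(stacked_size)  # 10240 with these assumed values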