def create_model(self, model_input, vocab_size, num_frames, labels,
                 scope='default', is_training=True, **unused_params):
  lstm_size = FLAGS.lstm_cells
  number_of_layers = FLAGS.lstm_layers

  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    stacked_lstm_fw = tf.contrib.rnn.MultiRNNCell([
        tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
        for _ in range(number_of_layers)
    ])
    stacked_lstm_bw = tf.contrib.rnn.MultiRNNCell([
        tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)
        for _ in range(number_of_layers)
    ])

    outputs, state = tf.nn.bidirectional_dynamic_rnn(stacked_lstm_fw,
                                                     stacked_lstm_bw,
                                                     model_input,
                                                     sequence_length=num_frames,
                                                     dtype=tf.float32,
                                                     swap_memory=True)

    if FLAGS.lstm_pooling_method == 'last':
      # Final hidden state of the top layer, for both directions.
      l = [state[i][-1].h for i in range(2)]
    else:
      l = [
          utils.FramePooling(outputs[0], FLAGS.lstm_pooling_method),
          utils.FramePooling(outputs[1], FLAGS.lstm_pooling_method)
      ]
    output = tf.concat(l, 1)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    results = aggregated_model().create_model(model_input=output,
                                              vocab_size=vocab_size,
                                              is_training=is_training,
                                              **unused_params)
    results['features'] = output
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
    return results
def forward(self, reshaped_input):
  feature_size = self.feature_size
  cluster_size = self.cluster_size
  add_batch_norm = self.add_batch_norm
  max_frames = self.max_frames
  is_training = self.is_training

  cluster_weights = tf.get_variable(
      "cluster_weights", [feature_size, cluster_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  tf.summary.histogram("cluster_weights", cluster_weights)
  activation = tf.matmul(reshaped_input, cluster_weights)

  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.get_variable(
        "cluster_biases", [cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_biases", cluster_biases)
    activation += cluster_biases

  # Assumed: the configured activation type ('glu' or 'relu') is stored on the instance.
  activation_type = self.activation
  if activation_type == 'glu':
    # Gated linear unit: the first half of the cluster dimension is gated by a
    # sigmoid over the second half.
    space = activation[:, :cluster_size // 2]
    gates = tf.sigmoid(activation[:, cluster_size // 2:])
    activation = tf.multiply(space, gates)
  elif activation_type == 'relu':
    activation = tf.nn.relu6(activation)
  tf.summary.histogram("cluster_output", activation)

  out_size = activation.get_shape().as_list()[-1]  # halved when GLU is used
  activation = tf.reshape(activation, [-1, max_frames, out_size])
  avg_activation = utils.FramePooling(activation, 'average')
  avg_activation = tf.nn.l2_normalize(avg_activation, 1)
  max_activation = utils.FramePooling(activation, 'max')
  max_activation = tf.nn.l2_normalize(max_activation, 1)
  return tf.concat([avg_activation, max_activation], 1)
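# The GLU branch above splits the cluster dimension in half and gates the first half with
# a sigmoid over the second half. The sketch below is illustrative only, not part of the
# model; `glu_reference` is a hypothetical helper showing the same gating in NumPy.
import numpy as np

def glu_reference(x):
  """Gated linear unit over the last axis: first_half * sigmoid(second_half)."""
  half = x.shape[-1] // 2
  values, gates = x[..., :half], x[..., half:]
  return values * (1.0 / (1.0 + np.exp(-gates)))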
def forward(self, reshaped_input):
  cluster_weights = tf.get_variable(
      "cluster_weights", [self.feature_size, self.cluster_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(self.feature_size)))
  tf.summary.histogram("cluster_weights", cluster_weights)
  activation = tf.matmul(reshaped_input, cluster_weights)

  if self.add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=self.is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.get_variable(
        "cluster_biases", [self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_biases", cluster_biases)
    activation += cluster_biases

  activation = tf.nn.relu6(activation)
  tf.summary.histogram("cluster_output", activation)

  activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
  activation = utils.FramePooling(activation, self.dbof_pooling_method)
  return activation
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
  lstm_size = FLAGS.lstm_cells
  lstm_layers = FLAGS.lstm_layers
  dropout_keep_prob = FLAGS.dropout_keep_prob

  # Layer-normalized LSTM cells with recurrent dropout.
  stacked_lnlstm = tf.contrib.rnn.MultiRNNCell([
      tf.contrib.rnn.LayerNormBasicLSTMCell(
          lstm_size, dropout_keep_prob=dropout_keep_prob)
      for _ in range(lstm_layers)
  ])

  with tf.variable_scope("RNN"):
    outputs, state = tf.nn.dynamic_rnn(stacked_lnlstm,
                                       model_input,
                                       sequence_length=num_frames,
                                       dtype=tf.float32)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  if FLAGS.use_lstm_output:
    agg_model_inputs = utils.FramePooling(outputs, FLAGS.pooling_method)
  else:
    agg_model_inputs = state[-1].h
  return aggregated_model().create_model(model_input=agg_model_inputs,
                                         vocab_size=vocab_size,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames, **unused_params): """Creates a model which uses a stack of LSTMs to represent the video. Args: model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of input features. vocab_size: The number of classes in the dataset. num_frames: A vector of length 'batch' which indicates the number of frames for each video (before padding). Returns: A dictionary with a tensor containing the probability predictions of the model in the 'predictions' key. The dimensions of the tensor are 'batch_size' x 'num_classes'. """ lstm_size = FLAGS.lstm_cells #number_of_layers = FLAGS.lstm_layers with tf.variable_scope("lstm_1"): lstm_1 = tf.contrib.rnn.GRUCell(lstm_size, forget_bias=1.0) outputs, state = tf.nn.dynamic_rnn(lstm_1, model_input, sequence_length=num_frames, swap_memory=True, dtype=tf.float32) #Adding the time skip skip_outputs = outputs[:, ::FLAGS.time_skip, :] with tf.variable_scope("lstm_2"): lstm_2 = tf.contrib.rnn.GRUCell(lstm_size, forget_bias=1.0) outputs2, state2 = tf.nn.dynamic_rnn(lstm_2, skip_outputs, sequence_length=num_frames / FLAGS.time_skip, swap_memory=True, dtype=tf.float32) loss = 0.0 #Aggregating LSTM state and outputs model_state = tf.concat([state, state2], axis=1) model_outputs = tf.concat([outputs, outputs2], axis=1) aggregated_model = getattr(video_level_models, FLAGS.video_level_classifier_model) if FLAGS.use_lstm_output: return aggregated_model().create_model( model_input=utils.FramePooling(model_outputs, FLAGS.pooling_method), vocab_size=vocab_size, **unused_params) else: return aggregated_model().create_model(model_input=model_state, vocab_size=vocab_size, **unused_params)
def create_model(self, model_input, vocab_size, num_frames, **unused_params): """Creates a model which uses a stack of LSTMs to represent the video. Args: model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of input features. vocab_size: The number of classes in the dataset. num_frames: A vector of length 'batch' which indicates the number of frames for each video (before padding). Returns: A dictionary with a tensor containing the probability predictions of the model in the 'predictions' key. The dimensions of the tensor are 'batch_size' x 'num_classes'. """ lstm_size = FLAGS.lstm_cells number_of_layers = FLAGS.lstm_layers if FLAGS.use_attention: stacked_lstm = tf.contrib.rnn.MultiRNNCell([ tf.contrib.rnn.AttentionCellWrapper( tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0), FLAGS.attention_len) for _ in range(number_of_layers) ]) elif FLAGS.use_residuals: stacked_lstm = tf.contrib.rnn.MultiRNNCell([ tf.contrib.rnn.ResidualWrapper( tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)) for _ in range(number_of_layers) ]) else: stacked_lstm = tf.contrib.rnn.MultiRNNCell([ tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0) for _ in range(number_of_layers) ]) loss = 0.0 outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, sequence_length=num_frames, dtype=tf.float32) aggregated_model = getattr(video_level_models, FLAGS.video_level_classifier_model) if FLAGS.use_lstm_output: agg_model_inputs = utils.FramePooling(outputs, FLAGS.pooling_method) else: agg_model_inputs = state[-1].h return aggregated_model().create_model(model_input=agg_model_inputs, vocab_size=vocab_size, **unused_params)
def create_model(self, model_input, vocab_size, num_frames, labels,
                 scope='default', is_training=True, **unused_params):
  lstm_size = FLAGS.lstm_cells

  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    with tf.variable_scope('lstm1', reuse=tf.AUTO_REUSE):
      lstm1 = tf.contrib.rnn.MultiRNNCell(
          [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
      outputs1, _ = tf.nn.dynamic_rnn(lstm1,
                                      model_input,
                                      sequence_length=num_frames,
                                      dtype=tf.float32,
                                      swap_memory=True)
    with tf.variable_scope('lstm2', reuse=tf.AUTO_REUSE):
      lstm2 = tf.contrib.rnn.MultiRNNCell(
          [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
      outputs2, _ = tf.nn.dynamic_rnn(lstm2,
                                      outputs1,
                                      sequence_length=num_frames,
                                      dtype=tf.float32,
                                      swap_memory=True)
    with tf.variable_scope('lstm3', reuse=tf.AUTO_REUSE):
      lstm3 = tf.contrib.rnn.MultiRNNCell(
          [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
      # Residual connection: the third layer sees the sum of the first two
      # layers' outputs.
      outputs, state = tf.nn.dynamic_rnn(lstm3,
                                         outputs2 + outputs1,
                                         sequence_length=num_frames,
                                         dtype=tf.float32,
                                         swap_memory=True)

    if FLAGS.lstm_pooling_method == 'last':
      inp = state[-1].h
    else:
      inp = utils.FramePooling(outputs, FLAGS.lstm_pooling_method)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    results = aggregated_model().create_model(model_input=inp,
                                              vocab_size=vocab_size,
                                              is_training=is_training,
                                              **unused_params)
    results['features'] = inp
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
    return results
def create_model(self, model_input, vocab_size, num_frames, **unused_params): """Creates a model which uses a stack of LSTMs to represent the video. Args: model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of input features. vocab_size: The number of classes in the dataset. num_frames: A vector of length 'batch' which indicates the number of frames for each video (before padding). Returns: A dictionary with a tensor containing the probability predictions of the model in the 'predictions' key. The dimensions of the tensor are 'batch_size' x 'num_classes'. """ max_frame = 100 model_input = SampleRandomFrames(model_input, num_frames, max_frame) # max_frame = model_input.get_shape().as_list()[1] image = tf.reshape(model_input, [-1, 32, 32]) image = tf.expand_dims(image, 3) with slim.arg_scope( [slim.conv2d], weights_initializer=tf.truncated_normal_initializer( stddev=0.01), weights_regularizer=slim.l2_regularizer(0.0005), normalizer_fn=slim.batch_norm): net = slim.conv2d(image, 32, [5, 5], padding='VALID', scope='conv1') net = slim.relu(net, 32, scope='relu1') net = slim.max_pool2d(net, [2, 2], scope='pool1') net = slim.conv2d(net, 64, [5, 5], padding='VALID', scope='conv2') net = slim.relu(net, 64, scope='relu2') net = slim.max_pool2d(net, [2, 2], scope='pool2') net = slim.conv2d(net, 128, [5, 5], padding='VALID', scope='conv3') net = slim.relu(net, 128, scope='relu3') net = tf.squeeze(net, [1, 2], name='squeezed') print(net) net = tf.reshape(net, [-1, max_frame, 128]) net = utils.FramePooling(net, 'max') net = slim.fully_connected(net, 512, scope='fc4') print(net) aggregated_model = getattr(video_level_models, FLAGS.video_level_classifier_model) return aggregated_model().create_model(model_input=net, vocab_size=vocab_size, **unused_params)
def create_model(self, model_input, vocab_size, num_frames, labels, scope='default', is_training=True, **unused_params): """Creates a model which uses a stack of LSTMs to represent the video. Args: model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of input features. vocab_size: The number of classes in the dataset. num_frames: A vector of length 'batch' which indicates the number of frames for each video (before padding). Returns: A dictionary with a tensor containing the probability predictions of the model in the 'predictions' key. The dimensions of the tensor are 'batch_size' x 'num_classes'. """ lstm_size = FLAGS.lstm_cells number_of_layers = FLAGS.lstm_layers with tf.variable_scope(scope, tf.AUTO_REUSE): stacked_lstm = tf.contrib.rnn.MultiRNNCell([ tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0) for _ in range(number_of_layers) ]) outputs, state = tf.nn.dynamic_rnn(stacked_lstm, model_input, sequence_length=num_frames, dtype=tf.float32, swap_memory=True) aggregated_model = getattr(video_level_models, FLAGS.video_level_classifier_model) if FLAGS.lstm_pooling_method == 'last': inp = state[-1].h else: inp = utils.FramePooling(outputs, FLAGS.lstm_pooling_method) results = aggregated_model().create_model(model_input=inp, vocab_size=vocab_size, is_training=is_training, **unused_params) results['features'] = inp if labels != None: results['loss'] = losses.CrossEntropyLoss().calculate_loss( results['predictions'], labels) return results
def create_model(self, model_input, vocab_size, labels, scope='default',
                 is_training=True, **unused_params):
  X = FLAGS.residualcnn_x

  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    fc = slim.fully_connected(
        model_input, X,
        weights_regularizer=tf.contrib.layers.l2_regularizer(0.01))
    reshaped_input = tf.expand_dims(fc, -1)
    reshaped_input = tf.expand_dims(reshaped_input, -1)

    conv1 = slim.convolution(reshaped_input, 64, [49, 1])
    conv1_norm = slim.batch_norm(conv1, is_training=is_training)
    module1 = self.residual_module([128, 192, 64], conv1_norm, 'module1')
    module1_norm = slim.batch_norm(module1, is_training=is_training)
    conv2 = slim.convolution(module1_norm, 128, 1)
    conv2_norm = slim.batch_norm(conv2, is_training=is_training)
    module2 = self.residual_module([256, 512, 128], conv2_norm, 'module2')
    module2_norm = slim.batch_norm(module2, is_training=is_training)
    conv3 = slim.convolution(module2_norm, 256, 1)
    conv3_norm = slim.batch_norm(conv3, is_training=is_training)
    module3 = self.residual_module([512, 256], conv3_norm, 'module3')
    module3_norm = slim.batch_norm(module3, is_training=is_training)
    conv4 = slim.convolution(module3_norm, X, 1)
    conv4_norm = slim.batch_norm(conv4, is_training=is_training)
    module4 = self.residual_module([512, X], conv4_norm, 'module4')

    features = tf.squeeze(module4, [2])
    # Residual connection around the whole convolutional stack.
    features = model_utils.FramePooling(features,
                                        FLAGS.residualcnn_pooling) + fc

    results = MoeModel().create_model(features, vocab_size)
    results['features'] = features
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
    return results
def create_model(self, model_input, vocab_size, num_frames, labels,
                 scope='default', **unused_params):
  lstm_size = FLAGS.lstm_cells

  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    cells = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
    outputs1, _ = tf.nn.dynamic_rnn(cells,
                                    model_input,
                                    sequence_length=num_frames,
                                    dtype=tf.float32,
                                    swap_memory=True,
                                    scope='first')

    cells1 = tf.contrib.rnn.MultiRNNCell(
        [tf.contrib.rnn.BasicLSTMCell(lstm_size, forget_bias=1.0)])
    # The second layer runs over every other output of the first 300 frames.
    outputs2, state2 = tf.nn.dynamic_rnn(cells1,
                                         outputs1[:, 0:300:2, :],
                                         sequence_length=num_frames // 2,
                                         dtype=tf.float32,
                                         swap_memory=True,
                                         scope='second')

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    if FLAGS.lstm_pooling_method == 'last':
      output = state2[-1].h
    else:
      output = utils.FramePooling(outputs2, FLAGS.lstm_pooling_method)

    results = aggregated_model().create_model(model_input=output,
                                              vocab_size=vocab_size,
                                              **unused_params)
    results['features'] = output
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
    return results
def create_model(self, model_input, vocab_size, num_frames, l2_penalty=1e-4,
                 **unused_params):
  """Creates a model which uses a logistic classifier over the average of the
  frame-level features.

  This class is intended to be an example for implementors of frame level
  models. If you want to train a model over averaged features it is more
  efficient to average them beforehand rather than on the fly.

  Args:
    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
      input features.
    vocab_size: The number of classes in the dataset.
    num_frames: A vector of length 'batch' which indicates the number of
      frames for each video (before padding).

  Returns:
    A dictionary with a tensor containing the probability predictions of the
    model in the 'predictions' key. The dimensions of the tensor are
    'batch_size' x 'num_classes'.
  """
  # num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  # feature_size = model_input.get_shape().as_list()[2]
  #
  # logging.info('model_input shape: {}'.format(
  #     model_input.get_shape().as_list()))
  #
  # denominators = tf.reshape(
  #     tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
  # avg_pooled = tf.reduce_sum(model_input, axis=[1]) / denominators
  avg_pooled = utils.FramePooling(model_input, 'average')
  logging.info('avg_pooled shape: {}'.format(
      avg_pooled.get_shape().as_list()))

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=avg_pooled,
                                         vocab_size=vocab_size,
                                         num_mixtures=2,
                                         **unused_params)
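# The commented-out block above averages by summing over frames and dividing by the true
# (pre-padding) frame count. A minimal NumPy sketch of that length-aware average, assuming
# padded positions are zero-filled; illustrative only, `masked_average` is a hypothetical helper.
import numpy as np

def masked_average(frames, lengths):
  """frames: [batch, max_frames, features]; lengths: [batch] true frame counts."""
  summed = frames.sum(axis=1)
  return summed / lengths[:, None].astype(np.float32)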
def create_model(self, model_input, vocab_size, num_frames, **unused_params):
  lstm_size = FLAGS.lstm_cells
  number_of_layers = FLAGS.lstm_layers

  lstm_fw = tf.contrib.rnn.MultiRNNCell(
      [tf.contrib.rnn.BasicLSTMCell(lstm_size)
       for _ in range(number_of_layers)],
      state_is_tuple=False)
  lstm_bw = tf.contrib.rnn.MultiRNNCell(
      [tf.contrib.rnn.BasicLSTMCell(lstm_size)
       for _ in range(number_of_layers)],
      state_is_tuple=False)

  with tf.variable_scope("RNN"):
    outputs1, states1 = tf.nn.bidirectional_dynamic_rnn(
        lstm_fw, lstm_bw, model_input, dtype=tf.float32,
        sequence_length=num_frames)
  # Concatenate forward and backward outputs / final states.
  outputs = tf.concat(outputs1, 2)
  state = tf.concat(states1, 1)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  if FLAGS.use_lstm_output:
    agg_model_inputs = utils.FramePooling(outputs, FLAGS.pooling_method)
  else:
    # With state_is_tuple=False the concatenated final state is already a flat tensor.
    agg_model_inputs = state
  return aggregated_model().create_model(model_input=agg_model_inputs,
                                         vocab_size=vocab_size,
                                         **unused_params)
def forward(self, reshaped_input):
  initializer = tf.random_normal_initializer(
      stddev=1 / math.sqrt(self.cluster_size))
  circ_layer_hidden = CirculantLayerWithFactor((None, self.feature_size),
                                               self.cluster_size,
                                               k_factor=self.k_factor,
                                               initializer=initializer)
  activation = circ_layer_hidden.matmul(reshaped_input)

  # cluster_weights = tf.get_variable("cluster_weights",
  #     [self.feature_size, self.cluster_size],
  #     initializer=tf.random_normal_initializer(
  #         stddev=1 / math.sqrt(self.cluster_size)))
  # tf.summary.histogram("cluster_weights", cluster_weights)
  # activation = tf.matmul(reshaped_input, cluster_weights)

  if self.add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=self.is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.get_variable(
        "cluster_biases", [self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_biases", cluster_biases)
    activation += cluster_biases

  activation = tf.nn.relu6(activation)
  tf.summary.histogram("cluster_output", activation)

  activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
  activation = utils.FramePooling(activation, self.dbof_pooling_method)
  return activation
def create_model(self, model_input, vocab_size, labels, scope='default',
                 is_training=True, **unused_params):
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    reshaped_input = tf.expand_dims(model_input, -1)
    reshaped_input = tf.expand_dims(reshaped_input, -1)

    conv1 = slim.convolution(reshaped_input, 64, [49, 1], stride=(4, 1))
    max_pool1 = slim.max_pool2d(conv1, (9, 1), (2, 1), padding='SAME')
    norm1 = tf.nn.local_response_normalization(max_pool1)
    conv2 = slim.convolution(norm1, 64, 1, 1)
    conv3 = slim.convolution(conv2, 192, (9, 1), 1)
    norm2 = tf.nn.local_response_normalization(conv3)
    max_pool2 = slim.max_pool2d(norm2, (9, 1), (2, 1), padding='SAME')

    inception3a = self.inception_module(max_pool2,
                                        [64, 96, 128, 16, 32, 32], '3a')
    inception3b = self.inception_module(inception3a,
                                        [128, 128, 192, 32, 96, 64], '3b')
    max_pool3 = slim.max_pool2d(inception3b, (9, 1), (2, 1), padding='SAME')
    inception4a = self.inception_module(max_pool3,
                                        [192, 96, 208, 16, 48, 64], '4a')
    inception4b = self.inception_module(inception4a,
                                        [160, 112, 224, 24, 64, 64], '4b')
    inception4c = self.inception_module(inception4b,
                                        [128, 128, 256, 24, 64, 64], '4c')
    inception4d = self.inception_module(inception4c,
                                        [112, 144, 288, 32, 64, 64], '4d')
    inception4e = self.inception_module(inception4d,
                                        [256, 160, 320, 32, 128, 128], '4e')
    max_pool4 = slim.max_pool2d(inception4e, (9, 1), (2, 1), padding='SAME')
    inception5a = self.inception_module(max_pool4,
                                        [256, 160, 320, 32, 128, 128], '5a')
    inception5b = self.inception_module(inception5a,
                                        [384, 192, 384, 48, 128, 128], '5b')

    # Intermediate outputs feed auxiliary classifiers, as in GoogLeNet.
    inter1 = tf.squeeze(inception4a, axis=[2])
    inter2 = tf.squeeze(inception4d, axis=[2])
    output = tf.squeeze(inception5b, axis=[2])
    inter1 = model_utils.FramePooling(inter1, FLAGS.googlenet_pooling)
    inter2 = model_utils.FramePooling(inter2, FLAGS.googlenet_pooling)
    output = model_utils.FramePooling(output, FLAGS.googlenet_pooling)

    inter_results1 = MoeModel().create_model(inter1, vocab_size, 'inter1')
    inter_results2 = MoeModel().create_model(inter2, vocab_size, 'inter2')
    results = MoeModel().create_model(output, vocab_size, 'final')
    results['features'] = output
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
      # Auxiliary losses from the intermediate classifiers.
      results['loss'] += losses.CrossEntropyLoss().calculate_loss(
          inter_results1['predictions'], labels)
      results['loss'] += losses.CrossEntropyLoss().calculate_loss(
          inter_results2['predictions'], labels)
    return results
def create_model(self, model_input, vocab_size, num_frames, labels,
                 scope='default', iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(reshaped_input,
                                       center=True,
                                       scale=True,
                                       is_training=is_training,
                                       scope="input_bn")

    cluster_weights = tf.get_variable(
        "cluster_weights", [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable(
          "cluster_biases", [cluster_size],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.get_variable(
        "hidden1_weights", [cluster_size, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)
    if add_batch_norm:
      activation = slim.batch_norm(activation,
                                   center=True,
                                   scale=True,
                                   is_training=is_training,
                                   scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable(
          "hidden1_biases", [hidden1_size],
          initializer=tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    results = aggregated_model().create_model(model_input=activation,
                                              vocab_size=vocab_size,
                                              **unused_params)
    results['features'] = activation
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
    return results
def create_model(self, model_input, vocab_size, num_frames, iterations=None, add_batch_norm=None, sample_random_frames=None, cluster_size=None, hidden_size=None, is_training=True, **unused_params): """See base class. Args: model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of input features. vocab_size: The number of classes in the dataset. num_frames: A vector of length 'batch' which indicates the number of frames for each video (before padding). iterations: the number of frames to be sampled. add_batch_norm: whether to add batch norm during training. sample_random_frames: whether to sample random frames or random sequences. cluster_size: the output neuron number of the cluster layer. hidden_size: the output neuron number of the hidden layer. is_training: whether to build the graph in training mode. Returns: A dictionary with a tensor containing the probability predictions of the model in the 'predictions' key. The dimensions of the tensor are 'batch_size' x 'num_classes'. """ iterations = iterations or FLAGS.iterations add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm random_frames = sample_random_frames or FLAGS.sample_random_frames cluster_size = cluster_size or FLAGS.dbof_cluster_size hidden1_size = hidden_size or FLAGS.dbof_hidden_size act_fn = self.ACT_FN_MAP.get(FLAGS.dbof_activation) assert act_fn is not None, ("dbof_activation is not valid: %s." % FLAGS.dbof_activation) num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) if random_frames: model_input = utils.SampleRandomFrames(model_input, num_frames, iterations) else: model_input = utils.SampleRandomSequence(model_input, num_frames, iterations) max_frames = model_input.get_shape().as_list()[1] feature_size = model_input.get_shape().as_list()[2] reshaped_input = tf.reshape(model_input, [-1, feature_size]) tf.compat.v1.summary.histogram("input_hist", reshaped_input) if add_batch_norm: reshaped_input = slim.batch_norm( reshaped_input, center=True, scale=True, is_training=is_training, scope="input_bn", ) cluster_weights = tf.compat.v1.get_variable( "cluster_weights", [feature_size, cluster_size], initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)), ) tf.compat.v1.summary.histogram("cluster_weights", cluster_weights) activation = tf.matmul(reshaped_input, cluster_weights) if add_batch_norm: activation = slim.batch_norm( activation, center=True, scale=True, is_training=is_training, scope="cluster_bn", ) else: cluster_biases = tf.compat.v1.get_variable( "cluster_biases", [cluster_size], initializer=tf.random_normal(stddev=1 / math.sqrt(feature_size)), ) tf.compat.v1.summary.histogram("cluster_biases", cluster_biases) activation += cluster_biases # activation = act_fn(activation) # xxx 2018 activation = tf.nn.relu6(activation) tf.compat.v1.summary.histogram("cluster_output", activation) activation = tf.reshape(activation, [-1, max_frames, cluster_size]) activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method) hidden1_weights = tf.compat.v1.get_variable( "hidden1_weights", [cluster_size, hidden1_size], initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)), ) tf.compat.v1.summary.histogram("hidden1_weights", hidden1_weights) activation = tf.matmul(activation, hidden1_weights) if add_batch_norm: activation = slim.batch_norm( activation, center=True, scale=True, is_training=is_training, scope="hidden1_bn", ) else: hidden1_biases = tf.compat.v1.get_variable( "hidden1_biases", [hidden1_size], 
initializer=tf.random_normal_initializer(stddev=0.01), ) tf.compat.v1.summary.histogram("hidden1_biases", hidden1_biases) activation += hidden1_biases # xxx 2018 # activation = tf.nn.relu6(activation) activation = act_fn(activation) tf.compat.v1.summary.histogram("hidden1_output", activation) aggregated_model = getattr(video_level_models, FLAGS.video_level_classifier_model) return aggregated_model().create_model(model_input=activation, vocab_size=vocab_size, **unused_params)
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = 2048
  cluster_size_2 = 512
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleRandomFrames(model_input, num_frames,
                                           iterations)
  else:
    model_input = utils.EqualSpaceMeans(model_input, num_frames, iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  cluster_weights = tf.get_variable(
      "cluster_weights", [feature_size, cluster_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  activation = tf.matmul(reshaped_input, cluster_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.get_variable(
        "cluster_biases", [cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    activation += cluster_biases
  activation = tf.nn.relu6(activation)

  # Pool the sampled frames in groups, then combine the pooled groups with a
  # second cluster layer before the hidden layer.
  activation = tf.reshape(activation, [-1, 3, 10, cluster_size])
  activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
  activation = tf.reshape(activation, [-1, 2, 5, cluster_size])
  activation = tf.transpose(activation, [0, 2, 3, 1])
  activation = tf.reshape(activation, [-1, cluster_size * 2])

  cluster_weights_2 = tf.get_variable(
      "cluster_weights2", [cluster_size * 2, cluster_size_2],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(cluster_size * 2)))
  activation = tf.matmul(activation, cluster_weights_2)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn2")
  else:
    cluster_biases_2 = tf.get_variable(
        "cluster_biases2", [cluster_size_2],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size * 2)))
    activation += cluster_biases_2
  activation = tf.nn.relu6(activation)
  activation = tf.reshape(activation, [-1, cluster_size_2 * 5])

  hidden1_weights = tf.get_variable(
      "hidden1_weights", [cluster_size_2 * 5, hidden1_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(cluster_size_2 * 5)))
  activation = tf.matmul(activation, hidden1_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  activation = tf.nn.relu6(activation)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         vocab_size=vocab_size,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames, labels,
                 scope='default', iterations=None, add_batch_norm=None,
                 sample_random_frames=None, cluster_size=None,
                 hidden_size=None, is_training=True, **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    # 1-D convolutions over the feature dimension of each sampled frame.
    reshaped_input = tf.expand_dims(reshaped_input, -1)
    reshaped_input = tf.expand_dims(reshaped_input, -1)
    out1 = tf.layers.conv2d(
        reshaped_input, 128, (32, 1),
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        padding='same')
    out1_norm = tf.layers.batch_normalization(out1, training=is_training)
    out1_pool = tf.layers.max_pooling2d(out1_norm, (8, 1), 2, padding='same')
    out2 = tf.layers.conv2d(
        out1_pool, 256, (32, 1),
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        padding='same')
    out2_norm = tf.layers.batch_normalization(out2, training=is_training)
    out2_pool = tf.layers.max_pooling2d(out2_norm, (8, 1), 2, padding='same')
    out3 = tf.layers.conv2d(
        out2_pool, 256, (32, 1),
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer(),
        padding='same')
    out3_norm = tf.layers.batch_normalization(out3, training=is_training)
    out3_pool = tf.layers.max_pooling2d(out3_norm, (8, 1), 2, padding='same')

    out = tf.reduce_max(out3_pool, axis=[2, 3])
    activation = tf.reshape(out, [-1, max_frames, out.shape[1]])
    cluster_size = out.shape[1]
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
    activation = tf.layers.dense(
        activation, hidden1_size,
        activation=tf.nn.relu,
        kernel_initializer=tf.contrib.layers.xavier_initializer())
    tf.summary.histogram("activation", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    results = aggregated_model().create_model(model_input=activation,
                                              vocab_size=vocab_size,
                                              is_training=is_training,
                                              **unused_params)
    results['features'] = activation
    if labels is not None:
      results['loss'] = losses.CrossEntropyLoss().calculate_loss(
          results['predictions'], labels)
    return results
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = utils.SampleFramesOrdered(model_input, num_frames,
                                            iterations,
                                            is_training=is_training)
  else:
    model_input = utils.SampleRandomSequence(model_input, num_frames,
                                             iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  tf.summary.histogram("input_hist", reshaped_input)

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  cluster_weights = tf.get_variable(
      "cluster_weights", [feature_size, cluster_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  tf.summary.histogram("cluster_weights", cluster_weights)
  activation = tf.matmul(reshaped_input, cluster_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.get_variable(
        "cluster_biases", [cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_biases", cluster_biases)
    activation += cluster_biases
  activation = tf.nn.relu6(activation)
  tf.summary.histogram("cluster_output", activation)

  activation = tf.reshape(activation, [-1, max_frames, cluster_size])
  activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

  # Hidden layers: cluster_size -> 4x -> 2x -> 1x hidden1_size.
  hidden1_weights = tf.get_variable(
      "hidden1_weights", [cluster_size, hidden1_size * 4],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(cluster_size)))
  tf.summary.histogram("hidden1_weights", hidden1_weights)
  activation = tf.matmul(activation, hidden1_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size * 4],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  activation = tf.nn.relu6(activation)

  hidden2_weights = tf.get_variable(
      "hidden2_weights", [hidden1_size * 4, hidden1_size * 2],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(hidden1_size * 4)))
  tf.summary.histogram("hidden2_weights", hidden2_weights)
  activation = tf.matmul(activation, hidden2_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden2_bn")
  else:
    hidden2_biases = tf.get_variable(
        "hidden2_biases", [hidden1_size * 2],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden2_biases", hidden2_biases)
    activation += hidden2_biases
  activation = tf.nn.relu6(activation)

  hidden3_weights = tf.get_variable(
      "hidden3_weights", [hidden1_size * 2, hidden1_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(hidden1_size * 2)))
  tf.summary.histogram("hidden3_weights", hidden3_weights)
  activation = tf.matmul(activation, hidden3_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden3_bn")
  else:
    hidden3_biases = tf.get_variable(
        "hidden3_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden3_biases", hidden3_biases)
    activation += hidden3_biases
  activation = tf.nn.relu6(activation)

  return activation
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  if random_frames:
    model_input = model_utils.SampleRandomFrames(model_input, num_frames,
                                                 iterations)
  else:
    model_input = model_utils.SampleRandomSequence(model_input, num_frames,
                                                   iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  tf.summary.histogram("input_hist", reshaped_input)

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  cluster_weights = tf.Variable(
      tf.random_normal([feature_size, cluster_size],
                       stddev=1 / math.sqrt(feature_size)))
  tf.summary.histogram("cluster_weights", cluster_weights)
  activation = tf.matmul(reshaped_input, cluster_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.Variable(
        tf.random_normal([cluster_size], stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_biases", cluster_biases)
    activation += cluster_biases
  activation = tf.nn.relu6(activation)
  tf.summary.histogram("cluster_output", activation)

  activation = tf.reshape(activation, [-1, max_frames, cluster_size])
  activation = model_utils.FramePooling(activation,
                                        FLAGS.dbof_pooling_method)

  hidden1_weights = tf.Variable(
      tf.random_normal([cluster_size, hidden1_size],
                       stddev=1 / math.sqrt(cluster_size)))
  tf.summary.histogram("hidden1_weights", hidden1_weights)
  activation = tf.matmul(activation, hidden1_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.Variable(
        tf.random_normal([hidden1_size], stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  activation = tf.nn.relu6(activation)
  tf.summary.histogram("hidden1_output", activation)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  return aggregated_model().create_model(model_input=activation,
                                         original_input=model_input,
                                         vocab_size=vocab_size,
                                         **unused_params)
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
  iterations = iterations or FLAGS.iterations
  add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
  random_frames = sample_random_frames or FLAGS.sample_random_frames
  cluster_size = cluster_size or FLAGS.dbof_cluster_size
  hidden1_size = hidden_size or FLAGS.dbof_hidden_size

  print("num_frames=", num_frames)
  num_frames_ = num_frames
  num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
  model_input = utils.SampleRandomSequence(model_input, num_frames,
                                           iterations)
  max_frames = model_input.get_shape().as_list()[1]
  feature_size = model_input.get_shape().as_list()[2]
  reshaped_input = tf.reshape(model_input, [-1, feature_size])
  print("model_input=", model_input)
  print("max_frames({}), feature_size({})".format(max_frames, feature_size))
  print("reshaped_input=", reshaped_input)
  tf.summary.histogram("input_hist", reshaped_input)

  if add_batch_norm:
    reshaped_input = slim.batch_norm(reshaped_input,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

  cluster_weights = tf.get_variable(
      "cluster_weights", [feature_size, cluster_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(feature_size)))
  tf.summary.histogram("cluster_weights", cluster_weights)
  activation = tf.matmul(reshaped_input, cluster_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="cluster_bn")
  else:
    cluster_biases = tf.get_variable(
        "cluster_biases", [cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_biases", cluster_biases)
    activation += cluster_biases
  activation = tf.nn.relu6(activation)
  tf.summary.histogram("cluster_output", activation)
  print("activation=", activation)
  print("model_input=", model_input)

  activation = tf.reshape(activation, [-1, max_frames, cluster_size])
  print("activation before pooling=", activation)

  lstm_size = FLAGS.lstm_cells
  number_of_layers = FLAGS.lstm_layers
  print("max_frames=", max_frames)
  # Every sampled sequence has exactly `iterations` frames.
  iterations_ = tf.tile(tf.expand_dims(iterations, 0),
                        tf.shape(model_input)[0:1])
  stacked_lstm = tf.contrib.rnn.MultiRNNCell(
      [
          # tf.contrib.rnn.BasicLSTMCell(
          #     lstm_size, forget_bias=1.0, state_is_tuple=True)
          tf.contrib.rnn.GRUCell(lstm_size) for _ in range(number_of_layers)
      ],
      state_is_tuple=True)
  _, state = tf.nn.dynamic_rnn(stacked_lstm,
                               activation,
                               sequence_length=iterations_,
                               dtype=tf.float32)

  activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)
  # Concatenate the final GRU states with the pooled cluster activations.
  stacked = tf.concat(list(state) + [activation], axis=1)
  stacked_size = stacked.get_shape().as_list()[-1]

  hidden1_weights = tf.get_variable(
      "hidden1_weights", [stacked_size, hidden1_size],
      initializer=tf.random_normal_initializer(
          stddev=1 / math.sqrt(stacked_size)))
  tf.summary.histogram("hidden1_weights", hidden1_weights)
  activation = tf.matmul(stacked, hidden1_weights)
  if add_batch_norm:
    activation = slim.batch_norm(activation,
                                 center=True,
                                 scale=True,
                                 is_training=is_training,
                                 scope="hidden1_bn")
  else:
    hidden1_biases = tf.get_variable(
        "hidden1_biases", [hidden1_size],
        initializer=tf.random_normal_initializer(stddev=0.01))
    tf.summary.histogram("hidden1_biases", hidden1_biases)
    activation += hidden1_biases
  activation = tf.nn.relu6(activation)
  tf.summary.histogram("hidden1_output", activation)

  aggregated_model = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
  # The classifier consumes the concatenated features, not the hidden layer output.
  return aggregated_model().create_model(model_input=stacked,
                                         vocab_size=vocab_size,
                                         **unused_params)