def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, batch_norm=None, random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build a NetFV (Fisher Vector) frame-level model.

    Samples frames from the input, encodes video (first 1024 features)
    and audio (remaining 128 features) with separate NetFV modules,
    projects the concatenated descriptors through a hidden layer with
    optional batch norm, ReLU6 and context gating, then hands the
    activation to the configured video-level classifier.

    Args:
      model_input: float tensor of shape [batch, frames, 1152]
        (1024 video + 128 audio features per frame) — assumed from the
        0:1024 / 1024: slicing below.
      vocab_size: number of output classes, forwarded to the
        video-level model.
      num_frames: int tensor [batch] with the valid frame count per clip.
      iterations: number of frames to sample; defaults to FLAGS.iterations.
      batch_norm: whether to batch-normalize; defaults to FLAGS.batch_norm.
      random_frames: sample frames randomly (True) or as a random
        contiguous sequence (False); defaults to FLAGS.random_frames.
      cluster_size: FV cluster count; defaults to FLAGS.fv_cluster_size.
      hidden_size: hidden layer width; defaults to FLAGS.fv_hidden_size.
      is_training: training-mode flag for batch norm.
      **unused_params: forwarded to the video-level model.

    Returns:
      Whatever the configured video-level model's create_model returns
      (typically a dict with a "predictions" tensor).
    """
    # Avoid shadowing the builtin `iter`.
    num_iterations = iterations or FLAGS.iterations
    batch_norm = batch_norm or FLAGS.batch_norm
    random_frames = random_frames or FLAGS.random_frames
    cluster_size = cluster_size or FLAGS.fv_cluster_size
    hidden1_size = hidden_size or FLAGS.fv_hidden_size
    relu = FLAGS.relu
    gating = FLAGS.gating

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = sample_random_frames(model_input, num_frames,
                                           num_iterations)
    else:
        model_input = sample_random_seq(model_input, num_frames,
                                        num_iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    # Audio gets half the clusters of video (128 vs 1024 input features).
    video_NetFV = NetFV(1024, max_frames, cluster_size, batch_norm,
                        is_training)
    audio_NetFV = NetFV(128, max_frames, cluster_size // 2, batch_norm,
                        is_training)

    if batch_norm:
        reshaped_input = slim.batch_norm(
            reshaped_input,
            center=True,
            scale=True,
            is_training=is_training,
            scope="input_bn")

    with tf.variable_scope("video_FV"):
        fv_video = video_NetFV.forward(reshaped_input[:, 0:1024])
    with tf.variable_scope("audio_FV"):
        fv_audio = audio_NetFV.forward(reshaped_input[:, 1024:])
    fv = tf.concat([fv_video, fv_audio], 1)

    fv_dim = fv.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights", [fv_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    activation = tf.matmul(fv, hidden1_weights)

    if batch_norm and relu:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="hidden1_bn")
    else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases", [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

    if relu:
        activation = tf.nn.relu6(activation)

    if gating:
        # Context gating: sigmoid gate over the hidden activation.
        gating_weights = tf.get_variable(
            "gating_weights_2", [hidden1_size, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(hidden1_size)))
        gates = tf.matmul(activation, gating_weights)
        if batch_norm:
            gates = slim.batch_norm(
                gates,
                center=True,
                scale=True,
                is_training=is_training,
                scope="gating_bn")
        else:
            # BUG FIX: the original passed tf.random_normal(stddev=...) as
            # the initializer (an op missing its required shape argument,
            # not an initializer — raises at graph build) and sized the
            # biases [cluster_size], which cannot broadcast against gates
            # of shape [batch, hidden1_size]. Use a proper initializer and
            # the matching [hidden1_size] shape.
            gating_biases = tf.get_variable(
                "gating_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            gates += gating_biases
        gates = tf.sigmoid(gates)
        activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_clf_model)
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
def create_model(self, model_input, vocab_size, num_frames,
                 iterations=None, batch_norm=None, random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build a Soft Deep Bag-of-Frames (SoftDBoF) frame-level model.

    Samples frames from the input, encodes video (first 1024 features)
    and audio (remaining 128 features) with separate SoftDBoF modules,
    optionally reduces the concatenated descriptor through a
    fully-connected hidden layer, then hands the activation to the
    configured video-level classifier.

    Args:
      model_input: float tensor of shape [batch, frames, 1152]
        (1024 video + 128 audio features per frame) — assumed from the
        0:1024 / 1024: slicing below.
      vocab_size: number of output classes, forwarded to the
        video-level model.
      num_frames: int tensor [batch] with the valid frame count per clip.
      iterations: number of frames to sample; defaults to FLAGS.iterations.
      batch_norm: whether to batch-normalize; defaults to FLAGS.batch_norm.
      random_frames: sample frames randomly (True) or as a random
        contiguous sequence (False); defaults to FLAGS.random_frames.
      cluster_size: DBoF cluster count; defaults to FLAGS.dbof_cluster_size.
      hidden_size: hidden layer width; defaults to FLAGS.dbof_hidden_size.
      is_training: training-mode flag for batch norm.
      **unused_params: forwarded to the video-level model.

    Returns:
      Whatever the configured video-level model's create_model returns
      (typically a dict with a "predictions" tensor).
    """
    # Avoid shadowing the builtin `iter`.
    num_iterations = iterations or FLAGS.iterations
    batch_norm = batch_norm or FLAGS.batch_norm
    random_frames = random_frames or FLAGS.random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
    relu = FLAGS.relu
    fc_dimred = FLAGS.fc_dimred
    max_pool = FLAGS.softdbof_maxpool

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = sample_random_frames(model_input, num_frames,
                                           num_iterations)
    else:
        model_input = sample_random_seq(model_input, num_frames,
                                        num_iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    # Audio gets one eighth of the clusters of video.
    video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                          batch_norm, is_training)
    audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                          batch_norm, is_training)

    if batch_norm:
        reshaped_input = slim.batch_norm(
            reshaped_input,
            center=True,
            scale=True,
            is_training=is_training,
            scope="input_bn")

    with tf.variable_scope("video_DBOF"):
        dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])
    with tf.variable_scope("audio_DBOF"):
        dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])
    dbof = tf.concat([dbof_video, dbof_audio], 1)

    dbof_dim = dbof.get_shape().as_list()[1]
    if fc_dimred:
        # Fully-connected dimensionality reduction before classification.
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [dbof_dim, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(dbof, hidden1_weights)

        if batch_norm and relu:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=is_training,
                scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases

        if relu:
            activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)
    else:
        # No reduction: feed the raw DBoF descriptor to the classifier.
        activation = dbof

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_clf_model)
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)