def build_model(self):
    """Build the conv/self-attention frame-scoring graph and its pointer head.

    The graph is assembled in this order:

    1. Input placeholders and the frame/question sequence masks.
    2. A frame encoder and a question encoder, each made of a linear
       mapping, normalisation, positional encoding and a number of
       ``conv_encoder_stack`` -> ``multihead_attention`` -> ``feedforward``
       blocks.
    3. A two-way attention between frames and question words.
    4. A model encoder over the fused features.
    5. An output layer producing one logit per frame, trained with sigmoid
       cross-entropy against ``gt_predict``.
    6. A pointer layer that regresses two values per example from the
       normalised frame scores, trained with a squared error against
       ``gt_windows``.

    Reads from ``self``: ``max_words``, ``max_frames``, ``input_ques_dim``,
    ``input_video_dim``, ``hidden_size``, ``regularization_beta`` and
    ``dropout``. ``dropout`` is handed to ``tf.contrib.layers.dropout`` as
    its keep probability, which is why the attention layers receive
    ``1 - self.dropout`` as their ``dropout_rate``.

    Sets on ``self``: the placeholders ``ques_vecs``, ``ques_len``,
    ``frame_vecs``, ``frame_len``, ``batch_size``, ``is_training``,
    ``gt_predict`` and ``gt_windows``; the masks ``frame_mask`` and
    ``ques_mask``; ``q_feature``, ``v2q``, ``q2v``; ``G_variables``,
    ``test_loss``, ``loss``, ``frame_score``; ``predict_start_end``,
    ``pn_variables`` and ``pn_loss``.
    """
    # Input layer; sequence inputs are (batch_size, n_steps, input_dim).
    self.ques_vecs = tf.placeholder(
        tf.float32, [None, self.max_words, self.input_ques_dim])
    self.ques_len = tf.placeholder(tf.int32, [None])
    self.frame_vecs = tf.placeholder(
        tf.float32, [None, self.max_frames, self.input_video_dim])
    self.frame_len = tf.placeholder(tf.int32, [None])
    self.batch_size = tf.placeholder(tf.int32, [])
    self.is_training = tf.placeholder(tf.bool)
    self.gt_predict = tf.placeholder(tf.float32, [None, self.max_frames])
    self.gt_windows = tf.placeholder(tf.float32, [None, 2])

    self.frame_mask = tf.sequence_mask(self.frame_len, maxlen=self.max_frames)
    self.ques_mask = tf.sequence_mask(self.ques_len, maxlen=self.max_words)

    def _dropout(tensor):
        # Shared dropout: keep probability self.dropout, gated on is_training.
        return tf.contrib.layers.dropout(
            tensor, self.dropout, is_training=self.is_training)

    def _encoder_block(tensor, num_convs):
        # One conv stack -> self-attention -> feed-forward block.
        tensor = conv_utils.conv_encoder_stack(
            tensor,
            [self.hidden_size] * num_convs,
            [3] * num_convs,
            {'src': self.dropout, 'hid': self.dropout},
            mode=self.is_training)
        tensor = transformer.multihead_attention(
            queries=tensor,
            keys=tensor,
            num_units=self.hidden_size,
            num_heads=4,
            dropout_rate=1 - self.dropout,
            is_training=self.is_training,
            causality=False)
        return transformer.feedforward(
            tensor,
            num_units=[2 * self.hidden_size, self.hidden_size],
            is_training=self.is_training)

    def _encode_sequence(inputs, num_stacks, num_convs):
        # Dropout -> linear mapping -> normalise -> add positions -> blocks.
        hidden = _dropout(inputs)
        hidden = conv_utils.linear_mapping(
            hidden,
            self.hidden_size,
            dropout=self.dropout,
            var_scope_name="linear_mapping_before_cnn")
        hidden = transformer.normalize(hidden)
        hidden = hidden + transformer.positional_encoding_v2(
            hidden,
            num_units=self.hidden_size,
            zero_pad=False,
            scale=False,
            scope="enc_pe")
        # NOTE(review): attention and feed-forward are assumed to live inside
        # each "stack_<i>" scope together with the conv stack - confirm.
        for i in range(num_stacks):
            with tf.variable_scope("stack_%s" % i):
                hidden = _encoder_block(hidden, num_convs)
        return hidden

    with tf.variable_scope("Frame_Embedding_Encoder_Layer"):
        frame_hidden = _encode_sequence(
            self.frame_vecs, num_stacks=3, num_convs=3)
        frame_embedding = _dropout(frame_hidden)

    with tf.variable_scope("Ques_Embedding_Encoder_Layer"):
        ques_hidden = _encode_sequence(
            self.ques_vecs, num_stacks=1, num_convs=2)
        ques_embedding = _dropout(ques_hidden)

        # Length-normalised sum over the word axis of the masked encoder
        # output. The pre-dropout tensor is used here, not ques_embedding.
        # NOTE(review): an entry of ques_len equal to 0 divides by zero.
        masked_ques = layers.mask_zero(
            ques_hidden, tf.expand_dims(self.ques_mask, 2))
        ques_sum = tf.reduce_sum(masked_ques, axis=1)
        ques_count = tf.expand_dims(tf.cast(self.ques_len, tf.float32), 1)
        self.q_feature = _dropout(ques_sum / ques_count)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # M*N1*K ** M*N2*K --> M*N1*N2 (frames against question words).
        att_score = tf.matmul(
            frame_embedding, ques_embedding, transpose_b=True)
        # Softmax over the default (last) axis with the question mask.
        mask_q = tf.expand_dims(self.ques_mask, 1)
        frame_to_ques = tf.nn.softmax(
            layers.mask_logits(att_score, mask=mask_q))
        # Softmax over axis 1 with the frame mask, then transposed.
        mask_v = tf.expand_dims(self.frame_mask, 2)
        ques_to_frame = tf.transpose(
            tf.nn.softmax(
                layers.mask_logits(att_score, mask=mask_v), axis=1),
            (0, 2, 1))
        self.v2q = tf.matmul(frame_to_ques, ques_embedding)
        self.q2v = tf.matmul(
            tf.matmul(frame_to_ques, ques_to_frame), frame_embedding)
        attention_outputs = tf.concat([
            frame_embedding,
            self.v2q,
            frame_embedding * self.v2q,
            frame_embedding * self.q2v,
        ], 2)

    with tf.variable_scope("Model_Encoder_Layer"):
        model_hidden = conv_utils.linear_mapping(
            attention_outputs,
            self.hidden_size,
            dropout=self.dropout,
            var_scope_name="linear_mapping_before_model_layer")
        model_hidden = transformer.normalize(model_hidden)
        for i in range(2):
            with tf.variable_scope("stack_%s" % i):
                model_hidden = _encoder_block(model_hidden, num_convs=2)
        model_outputs = model_hidden

    with tf.variable_scope("Output_Layer"):
        logit_score = layers.linear_layer_3d(
            model_outputs, 1, scope_name='output_layer')
        logit_score = tf.squeeze(logit_score, 2)
        # NOTE(review): the loss sums over every position along axis 1;
        # frame_mask is not applied here.
        logit_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logit_score, labels=self.gt_predict)
        avg_logit_loss = tf.reduce_mean(tf.reduce_sum(logit_loss, 1))

        # Snapshot taken before the pointer layer exists, so it holds only
        # the trainable variables created up to this point.
        self.G_variables = tf.trainable_variables()
        G_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.G_variables])
        self.test_loss = avg_logit_loss
        self.loss = (avg_logit_loss +
                     self.regularization_beta * G_regularization_cost)
        self.frame_score = tf.nn.sigmoid(logit_score)

    with tf.variable_scope('Pointer_Layer'):
        score_dist = tf.nn.sigmoid(logit_score)
        score_dist = conv_utils.normalize(score_dist, scope='layer_normal')
        pointer_hidden = tf.expand_dims(score_dist, 2)
        # Positional arguments are forwarded untouched to conv1d_with_bias;
        # presumably (layer index, output size, kernel width) - TODO confirm.
        for conv_args in ((1, 16, 5), (2, 32, 10), (3, 64, 20), (4, 1, 10)):
            pointer_hidden = tf.nn.relu(
                conv_utils.conv1d_with_bias(pointer_hidden, *conv_args))
        # The scope name spelling ('pointrt_output') is kept as-is because
        # it is a runtime string passed to linear_layer.
        self.predict_start_end = layers.linear_layer(
            tf.squeeze(pointer_hidden, 2), 2, scope_name='pointrt_output')

        gt_start_end = self.gt_windows
        pointer_loss = tf.reduce_mean(
            tf.square(tf.subtract(self.predict_start_end, gt_start_end)))

        # Variables created after the G_variables snapshot. Matching on the
        # variable name keeps this linear-time and independent of how
        # Variable objects implement ``==``.
        generator_names = {var.name for var in self.G_variables}
        self.pn_variables = [
            var for var in tf.trainable_variables()
            if var.name not in generator_names
        ]
        pn_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.pn_variables])
        self.pn_loss = (pointer_loss +
                        self.regularization_beta * pn_regularization_cost)
def build_model(self):
    """Build the BiLSTM frame-scoring graph.

    Creates the input placeholders and sequence masks, encodes frames and
    question words with ``layers.dynamic_origin_bilstm_layer``, fuses them
    with a two-way attention, runs a further BiLSTM over the fused features
    and emits one logit per frame, trained with sigmoid cross-entropy
    against ``gt_predict``.

    Reads from ``self``: ``max_words``, ``max_frames``, ``input_ques_dim``,
    ``input_video_dim``, ``hidden_size``, ``regularization_beta`` and
    ``dropout`` (passed to ``tf.contrib.layers.dropout`` as its second
    positional argument).

    Sets on ``self``: the placeholders ``ques_vecs``, ``ques_len``,
    ``frame_vecs``, ``frame_len``, ``batch_size``, ``is_training`` and
    ``gt_predict``; the masks ``frame_mask`` and ``ques_mask``; ``v2q``,
    ``q2v``, ``test_loss``, ``loss`` and ``frame_score``.
    """
    # Input layer; sequence inputs are (batch_size, n_steps, input_dim).
    self.ques_vecs = tf.placeholder(
        tf.float32, [None, self.max_words, self.input_ques_dim])
    self.ques_len = tf.placeholder(tf.int32, [None])
    self.frame_vecs = tf.placeholder(
        tf.float32, [None, self.max_frames, self.input_video_dim])
    self.frame_len = tf.placeholder(tf.int32, [None])
    self.batch_size = tf.placeholder(tf.int32, [])
    self.is_training = tf.placeholder(tf.bool)
    self.gt_predict = tf.placeholder(tf.float32, [None, self.max_frames])

    self.frame_mask = tf.sequence_mask(self.frame_len, maxlen=self.max_frames)
    self.ques_mask = tf.sequence_mask(self.ques_len, maxlen=self.max_words)

    def _dropout(tensor):
        # Shared dropout call, gated on the is_training placeholder.
        return tf.contrib.layers.dropout(
            tensor, self.dropout, is_training=self.is_training)

    with tf.variable_scope("Frame_Embedding_Encoder_Layer"):
        frame_inputs = _dropout(self.frame_vecs)
        frame_encoded, _ = layers.dynamic_origin_bilstm_layer(
            frame_inputs,
            self.hidden_size,
            'frame_embedding',
            input_len=self.frame_len)
        frame_embedding = _dropout(frame_encoded)

    with tf.variable_scope("Ques_Embedding_Encoder_Layer"):
        ques_inputs = _dropout(self.ques_vecs)
        ques_encoded, _ = layers.dynamic_origin_bilstm_layer(
            ques_inputs,
            self.hidden_size,
            'ques_embedding',
            input_len=self.ques_len)
        ques_embedding = _dropout(ques_encoded)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # M*N1*K ** M*N2*K --> M*N1*N2 (frames against question words).
        similarity = tf.matmul(
            frame_embedding, ques_embedding, transpose_b=True)
        # Softmax over the default (last) axis with the question mask.
        ques_axis_mask = tf.expand_dims(self.ques_mask, 1)
        frame_to_ques = tf.nn.softmax(
            layers.mask_logits(similarity, mask=ques_axis_mask))
        # Softmax over axis 1 with the frame mask, then transposed.
        frame_axis_mask = tf.expand_dims(self.frame_mask, 2)
        ques_to_frame = tf.transpose(
            tf.nn.softmax(
                layers.mask_logits(similarity, mask=frame_axis_mask),
                axis=1),
            (0, 2, 1))
        self.v2q = tf.matmul(frame_to_ques, ques_embedding)
        self.q2v = tf.matmul(
            tf.matmul(frame_to_ques, ques_to_frame), frame_embedding)
        fused = tf.concat([
            frame_embedding,
            self.v2q,
            frame_embedding * self.v2q,
            frame_embedding * self.q2v,
        ], 2)

    with tf.variable_scope("Model_Encoder_Layer"):
        fused = _dropout(fused)
        model_encoded, _ = layers.dynamic_origin_bilstm_layer(
            fused, self.hidden_size, 'model_layer', input_len=self.frame_len)
        model_outputs = _dropout(model_encoded)

    with tf.variable_scope("Output_Layer"):
        frame_logits = tf.squeeze(
            layers.linear_layer_3d(
                model_outputs, 1, scope_name='output_layer'),
            2)
        per_frame_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=frame_logits, labels=self.gt_predict)
        # Sum along axis 1, then average over the batch axis.
        mean_loss = tf.reduce_mean(tf.reduce_sum(per_frame_loss, 1))

        # L2 penalty over every trainable variable created so far.
        l2_terms = [tf.nn.l2_loss(var) for var in tf.trainable_variables()]
        l2_penalty = tf.reduce_sum(l2_terms)

        self.test_loss = mean_loss
        self.loss = mean_loss + self.regularization_beta * l2_penalty
        self.frame_score = tf.nn.sigmoid(frame_logits)
def build_model(self):
    """Build the BiLSTM frame-scoring graph and its pointer head.

    Creates the input placeholders and sequence masks, encodes frames and
    question words with ``layers.dynamic_origin_bilstm_layer``, fuses them
    with a two-way attention, runs a further BiLSTM over the fused features
    and emits one logit per frame (sigmoid cross-entropy against
    ``gt_predict``). A pointer layer then regresses two values per example
    from the normalised frame scores, trained with a squared error against
    ``gt_windows``.

    Reads from ``self``: ``max_words``, ``max_frames``, ``input_ques_dim``,
    ``input_video_dim``, ``hidden_size``, ``regularization_beta`` and
    ``dropout`` (passed to ``tf.contrib.layers.dropout`` as its second
    positional argument).

    Sets on ``self``: the placeholders ``ques_vecs``, ``ques_len``,
    ``frame_vecs``, ``frame_len``, ``batch_size``, ``is_training``,
    ``gt_predict`` and ``gt_windows``; the masks ``frame_mask`` and
    ``ques_mask``; ``q_feature``, ``v2q``, ``q2v``; ``G_variables``,
    ``test_loss``, ``loss``, ``frame_score``; ``predict_start_end``,
    ``pn_variables`` and ``pn_loss``.
    """
    # Input layer; sequence inputs are (batch_size, n_steps, input_dim).
    self.ques_vecs = tf.placeholder(
        tf.float32, [None, self.max_words, self.input_ques_dim])
    self.ques_len = tf.placeholder(tf.int32, [None])
    self.frame_vecs = tf.placeholder(
        tf.float32, [None, self.max_frames, self.input_video_dim])
    self.frame_len = tf.placeholder(tf.int32, [None])
    self.batch_size = tf.placeholder(tf.int32, [])
    self.is_training = tf.placeholder(tf.bool)
    self.gt_predict = tf.placeholder(tf.float32, [None, self.max_frames])
    self.gt_windows = tf.placeholder(tf.float32, [None, 2])

    self.frame_mask = tf.sequence_mask(self.frame_len, maxlen=self.max_frames)
    self.ques_mask = tf.sequence_mask(self.ques_len, maxlen=self.max_words)

    def _dropout(tensor):
        # Shared dropout call, gated on the is_training placeholder.
        return tf.contrib.layers.dropout(
            tensor, self.dropout, is_training=self.is_training)

    with tf.variable_scope("Frame_Embedding_Encoder_Layer"):
        input_frame_vecs = _dropout(self.frame_vecs)
        frame_embedding, _ = layers.dynamic_origin_bilstm_layer(
            input_frame_vecs,
            self.hidden_size,
            'frame_embedding',
            input_len=self.frame_len)
        frame_embedding = _dropout(frame_embedding)

    with tf.variable_scope("Ques_Embedding_Encoder_Layer"):
        input_ques_vecs = _dropout(self.ques_vecs)
        ques_embedding, ques_states = layers.dynamic_origin_bilstm_layer(
            input_ques_vecs,
            self.hidden_size,
            'ques_embedding',
            input_len=self.ques_len)
        ques_embedding = _dropout(ques_embedding)

        # Concatenates element [1] of both returned state entries;
        # presumably the forward/backward final hidden states - TODO confirm
        # against dynamic_origin_bilstm_layer.
        q_feature = tf.concat([ques_states[0][1], ques_states[1][1]], 1)
        self.q_feature = _dropout(q_feature)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # M*N1*K ** M*N2*K --> M*N1*N2 (frames against question words).
        att_score = tf.matmul(
            frame_embedding, ques_embedding, transpose_b=True)
        # Softmax over the default (last) axis with the question mask.
        mask_q = tf.expand_dims(self.ques_mask, 1)
        frame_to_ques = tf.nn.softmax(
            layers.mask_logits(att_score, mask=mask_q))
        # Softmax over axis 1 with the frame mask, then transposed.
        mask_v = tf.expand_dims(self.frame_mask, 2)
        ques_to_frame = tf.transpose(
            tf.nn.softmax(
                layers.mask_logits(att_score, mask=mask_v), axis=1),
            (0, 2, 1))
        self.v2q = tf.matmul(frame_to_ques, ques_embedding)
        self.q2v = tf.matmul(
            tf.matmul(frame_to_ques, ques_to_frame), frame_embedding)
        attention_outputs = tf.concat([
            frame_embedding,
            self.v2q,
            frame_embedding * self.v2q,
            frame_embedding * self.q2v,
        ], 2)

    with tf.variable_scope("Model_Encoder_Layer"):
        attention_outputs = _dropout(attention_outputs)
        model_outputs, _ = layers.dynamic_origin_bilstm_layer(
            attention_outputs,
            self.hidden_size,
            'model_layer',
            input_len=self.frame_len)
        model_outputs = _dropout(model_outputs)

    with tf.variable_scope("Output_Layer"):
        logit_score = layers.linear_layer_3d(
            model_outputs, 1, scope_name='output_layer')
        logit_score = tf.squeeze(logit_score, 2)
        # NOTE(review): the loss sums over every position along axis 1;
        # frame_mask is not applied here.
        logit_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logit_score, labels=self.gt_predict)
        avg_logit_loss = tf.reduce_mean(tf.reduce_sum(logit_loss, 1))

        # Snapshot taken before the pointer layer exists, so it holds only
        # the trainable variables created up to this point.
        self.G_variables = tf.trainable_variables()
        G_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.G_variables])
        self.test_loss = avg_logit_loss
        self.loss = (avg_logit_loss +
                     self.regularization_beta * G_regularization_cost)
        self.frame_score = tf.nn.sigmoid(logit_score)

    with tf.variable_scope('Pointer_Layer'):
        score_dist = tf.nn.sigmoid(logit_score)
        score_dist = conv_utils.normalize(score_dist, scope='layer_normal')
        pointer_hidden = tf.expand_dims(score_dist, 2)
        # Positional arguments are forwarded untouched to conv1d_with_bias;
        # presumably (layer index, output size, kernel width) - TODO confirm.
        for conv_args in ((1, 16, 5), (2, 32, 10), (3, 64, 20), (4, 1, 10)):
            pointer_hidden = tf.nn.relu(
                conv_utils.conv1d_with_bias(pointer_hidden, *conv_args))
        # The scope name spelling ('pointrt_output') is kept as-is because
        # it is a runtime string passed to linear_layer.
        self.predict_start_end = layers.linear_layer(
            tf.squeeze(pointer_hidden, 2), 2, scope_name='pointrt_output')

        gt_start_end = self.gt_windows
        pointer_loss = tf.reduce_mean(
            tf.square(tf.subtract(self.predict_start_end, gt_start_end)))

        # Variables created after the G_variables snapshot. Matching on the
        # variable name keeps this linear-time and independent of how
        # Variable objects implement ``==``.
        generator_names = {var.name for var in self.G_variables}
        self.pn_variables = [
            var for var in tf.trainable_variables()
            if var.name not in generator_names
        ]
        pn_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.pn_variables])
        self.pn_loss = (pointer_loss +
                        self.regularization_beta * pn_regularization_cost)
def build_model(self):
    """Build the conv/self-attention frame-scoring graph.

    The graph is assembled in this order:

    1. Input placeholders and the frame/question sequence masks.
    2. A frame encoder and a question encoder, each made of a linear
       mapping, normalisation, positional encoding and a number of
       ``conv_encoder_stack`` -> ``multihead_attention`` -> ``feedforward``
       blocks.
    3. A two-way attention between frames and question words.
    4. A model encoder over the fused features.
    5. An output layer producing one logit per frame, trained with sigmoid
       cross-entropy against ``gt_predict``.

    Reads from ``self``: ``max_words``, ``max_frames``, ``input_ques_dim``,
    ``input_video_dim``, ``hidden_size``, ``regularization_beta`` and
    ``dropout``. ``dropout`` is handed to ``tf.contrib.layers.dropout`` as
    its keep probability, which is why the attention layers receive
    ``1 - self.dropout`` as their ``dropout_rate``.

    Sets on ``self``: the placeholders ``ques_vecs``, ``ques_len``,
    ``frame_vecs``, ``frame_len``, ``batch_size``, ``is_training`` and
    ``gt_predict``; the masks ``frame_mask`` and ``ques_mask``; ``v2q``,
    ``q2v``, ``test_loss``, ``loss`` and ``frame_score``.
    """
    # Input layer; sequence inputs are (batch_size, n_steps, input_dim).
    self.ques_vecs = tf.placeholder(
        tf.float32, [None, self.max_words, self.input_ques_dim])
    self.ques_len = tf.placeholder(tf.int32, [None])
    self.frame_vecs = tf.placeholder(
        tf.float32, [None, self.max_frames, self.input_video_dim])
    self.frame_len = tf.placeholder(tf.int32, [None])
    self.batch_size = tf.placeholder(tf.int32, [])
    self.is_training = tf.placeholder(tf.bool)
    self.gt_predict = tf.placeholder(tf.float32, [None, self.max_frames])

    self.frame_mask = tf.sequence_mask(self.frame_len, maxlen=self.max_frames)
    self.ques_mask = tf.sequence_mask(self.ques_len, maxlen=self.max_words)

    def _dropout(tensor):
        # Shared dropout: keep probability self.dropout, gated on is_training.
        return tf.contrib.layers.dropout(
            tensor, self.dropout, is_training=self.is_training)

    def _encoder_block(tensor, num_convs):
        # One conv stack -> self-attention -> feed-forward block.
        tensor = conv_utils.conv_encoder_stack(
            tensor,
            [self.hidden_size] * num_convs,
            [3] * num_convs,
            {'src': self.dropout, 'hid': self.dropout},
            mode=self.is_training)
        tensor = transformer.multihead_attention(
            queries=tensor,
            keys=tensor,
            num_units=self.hidden_size,
            num_heads=4,
            dropout_rate=1 - self.dropout,
            is_training=self.is_training,
            causality=False)
        return transformer.feedforward(
            tensor,
            num_units=[2 * self.hidden_size, self.hidden_size],
            is_training=self.is_training)

    def _encode_sequence(inputs, num_stacks, num_convs):
        # Dropout -> linear mapping -> normalise -> add positions -> blocks.
        hidden = _dropout(inputs)
        hidden = conv_utils.linear_mapping(
            hidden,
            self.hidden_size,
            dropout=self.dropout,
            var_scope_name="linear_mapping_before_cnn")
        hidden = transformer.normalize(hidden)
        hidden = hidden + transformer.positional_encoding_v2(
            hidden,
            num_units=self.hidden_size,
            zero_pad=False,
            scale=False,
            scope="enc_pe")
        # NOTE(review): attention and feed-forward are assumed to live inside
        # each "stack_<i>" scope together with the conv stack - confirm.
        for i in range(num_stacks):
            with tf.variable_scope("stack_%s" % i):
                hidden = _encoder_block(hidden, num_convs)
        return hidden

    with tf.variable_scope("Frame_Embedding_Encoder_Layer"):
        frame_hidden = _encode_sequence(
            self.frame_vecs, num_stacks=3, num_convs=3)
        frame_embedding = _dropout(frame_hidden)

    with tf.variable_scope("Ques_Embedding_Encoder_Layer"):
        ques_hidden = _encode_sequence(
            self.ques_vecs, num_stacks=1, num_convs=2)
        ques_embedding = _dropout(ques_hidden)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # M*N1*K ** M*N2*K --> M*N1*N2 (frames against question words).
        att_score = tf.matmul(
            frame_embedding, ques_embedding, transpose_b=True)
        # Softmax over the default (last) axis with the question mask.
        mask_q = tf.expand_dims(self.ques_mask, 1)
        frame_to_ques = tf.nn.softmax(
            layers.mask_logits(att_score, mask=mask_q))
        # Softmax over axis 1 with the frame mask, then transposed.
        mask_v = tf.expand_dims(self.frame_mask, 2)
        ques_to_frame = tf.transpose(
            tf.nn.softmax(
                layers.mask_logits(att_score, mask=mask_v), axis=1),
            (0, 2, 1))
        self.v2q = tf.matmul(frame_to_ques, ques_embedding)
        self.q2v = tf.matmul(
            tf.matmul(frame_to_ques, ques_to_frame), frame_embedding)
        attention_outputs = tf.concat([
            frame_embedding,
            self.v2q,
            frame_embedding * self.v2q,
            frame_embedding * self.q2v,
        ], 2)

    with tf.variable_scope("Model_Encoder_Layer"):
        model_hidden = conv_utils.linear_mapping(
            attention_outputs,
            self.hidden_size,
            dropout=self.dropout,
            var_scope_name="linear_mapping_before_model_layer")
        model_hidden = transformer.normalize(model_hidden)
        for i in range(2):
            with tf.variable_scope("stack_%s" % i):
                model_hidden = _encoder_block(model_hidden, num_convs=2)
        model_outputs = model_hidden

    with tf.variable_scope("Output_Layer"):
        logit_score = layers.linear_layer_3d(
            model_outputs, 1, scope_name='output_layer')
        logit_score = tf.squeeze(logit_score, 2)
        # NOTE(review): the loss sums over every position along axis 1;
        # frame_mask is not applied here.
        logit_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logit_score, labels=self.gt_predict)
        avg_logit_loss = tf.reduce_mean(tf.reduce_sum(logit_loss, 1))

        # L2 penalty over every trainable variable created so far.
        variables = tf.trainable_variables()
        regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in variables])
        self.test_loss = avg_logit_loss
        self.loss = (avg_logit_loss +
                     self.regularization_beta * regularization_cost)
        self.frame_score = tf.nn.sigmoid(logit_score)