def discriminator(self, v_feature, q_feature, reuse_flag=False):
    with tf.variable_scope("discriminator", reuse=reuse_flag):
        # Project both modalities into the same hidden space.
        v_feature = layers.linear_layer(v_feature, self.hidden_size, scope_name='v_transfer')
        # v_feature = tf.contrib.layers.dropout(v_feature, self.dropout, is_training=self.is_training)
        q_feature = layers.linear_layer(q_feature, self.hidden_size, scope_name='q_transfer')
        # q_feature = tf.contrib.layers.dropout(q_feature, self.dropout, is_training=self.is_training)

        # Fuse by element-wise sum and product, then a joint FC transform.
        fused_add = v_feature + q_feature
        fused_mul = v_feature * q_feature
        fused_cat = tf.concat([fused_add, fused_mul], axis=1)
        fused_fc = layers.linear_layer(fused_cat, self.hidden_size, scope_name='fc_transfer')
        fused_all = tf.concat([fused_add, fused_mul, fused_fc], axis=1)
        # fused_all = tf.contrib.layers.dropout(fused_all, self.dropout, is_training=self.is_training)

        with tf.variable_scope("output"):
            # One real/fake logit per (video, question) pair.
            scores = layers.linear_layer(fused_all, 1, scope_name='output')
            scores = tf.squeeze(scores, 1)
    return scores
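# The method above assumes a `layers.linear_layer(inputs, output_dim, scope_name)`
# helper. A minimal sketch of what such a helper might look like under TF1-style
# variable scoping follows; it is hypothetical, and the project's actual
# implementation in `layers.py` may differ (e.g. in initializers or bias handling).
def _linear_layer_sketch(inputs, output_dim, scope_name):
    """Fully connected layer: [batch, in_dim] -> [batch, output_dim]."""
    with tf.variable_scope(scope_name):
        w = tf.get_variable(
            'w', shape=[int(inputs.shape[-1]), output_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.get_variable('b', shape=[output_dim], dtype=tf.float32,
                            initializer=tf.zeros_initializer())
        return tf.matmul(inputs, w) + b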
def build_model(self):
    # Input layer: (batch_size, n_steps, input_dim)
    self.ques_vecs = tf.placeholder(
        tf.float32, [None, self.max_words, self.input_ques_dim])
    self.ques_len = tf.placeholder(tf.int32, [None])
    self.frame_vecs = tf.placeholder(
        tf.float32, [None, self.max_frames, self.input_video_dim])
    self.frame_len = tf.placeholder(tf.int32, [None])
    self.batch_size = tf.placeholder(tf.int32, [])
    self.is_training = tf.placeholder(tf.bool)
    self.gt_predict = tf.placeholder(tf.float32, [None, self.max_frames])
    self.gt_windows = tf.placeholder(tf.float32, [None, 2])

    self.frame_mask = tf.sequence_mask(self.frame_len, maxlen=self.max_frames)
    self.ques_mask = tf.sequence_mask(self.ques_len, maxlen=self.max_words)

    with tf.variable_scope("Frame_Embedding_Encoder_Layer"):
        input_frame_vecs = tf.contrib.layers.dropout(
            self.frame_vecs, self.dropout, is_training=self.is_training)
        frame_embedding, _ = layers.dynamic_origin_bilstm_layer(
            input_frame_vecs, self.hidden_size, 'frame_embedding',
            input_len=self.frame_len)
        frame_embedding = tf.contrib.layers.dropout(
            frame_embedding, self.dropout, is_training=self.is_training)

    with tf.variable_scope("Ques_Embedding_Encoder_Layer"):
        input_ques_vecs = tf.contrib.layers.dropout(
            self.ques_vecs, self.dropout, is_training=self.is_training)
        ques_embedding, ques_states = layers.dynamic_origin_bilstm_layer(
            input_ques_vecs, self.hidden_size, 'ques_embedding',
            input_len=self.ques_len)
        ques_embedding = tf.contrib.layers.dropout(
            ques_embedding, self.dropout, is_training=self.is_training)
        # Sentence-level question feature: concat the final hidden states
        # of the forward and backward LSTMs.
        q_feature = tf.concat([ques_states[0][1], ques_states[1][1]], 1)
        self.q_feature = tf.contrib.layers.dropout(
            q_feature, self.dropout, is_training=self.is_training)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # Similarity matrix: M*N1*K ** M*N2*K --> M*N1*N2
        att_score = tf.matmul(frame_embedding, ques_embedding, transpose_b=True)
        mask_q = tf.expand_dims(self.ques_mask, 1)
        S_ = tf.nn.softmax(layers.mask_logits(att_score, mask=mask_q))
        mask_v = tf.expand_dims(self.frame_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(layers.mask_logits(att_score, mask=mask_v), axis=1),
            (0, 2, 1))
        self.v2q = tf.matmul(S_, ques_embedding)
        self.q2v = tf.matmul(tf.matmul(S_, S_T), frame_embedding)
        attention_outputs = tf.concat([
            frame_embedding, self.v2q, frame_embedding * self.v2q,
            frame_embedding * self.q2v
        ], 2)

    with tf.variable_scope("Model_Encoder_Layer"):
        attention_outputs = tf.contrib.layers.dropout(
            attention_outputs, self.dropout, is_training=self.is_training)
        model_outputs, _ = layers.dynamic_origin_bilstm_layer(
            attention_outputs, self.hidden_size, 'model_layer',
            input_len=self.frame_len)
        model_outputs = tf.contrib.layers.dropout(
            model_outputs, self.dropout, is_training=self.is_training)

    with tf.variable_scope("Output_Layer"):
        logit_score = layers.correlation_layer(
            model_outputs, self.q_feature, self.hidden_size,
            scope_name='output_layer')
        # logit_score = layers.linear_layer_3d(model_outputs, 1, scope_name='output_layer')
        # logit_score = tf.squeeze(logit_score, 2)
        logit_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logit_score, labels=self.gt_predict)
        avg_logit_loss = tf.reduce_mean(tf.reduce_sum(logit_loss, 1))

        # Everything built so far belongs to the generator.
        self.G_variables = tf.trainable_variables()
        G_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.G_variables])
        G_reg_loss = self.regularization_beta * G_regularization_cost

        # "Real" sample: frames pooled with the masked-softmax ground-truth
        # distribution; gt_predict serves as its own 0/1 mask.
        ground_prod = tf.nn.softmax(
            layers.mask_logits(self.gt_predict, self.gt_predict))
        ground_v_feature = tf.reduce_sum(
            tf.multiply(model_outputs, tf.expand_dims(ground_prod, 2)), 1)
        ground_out = self.discriminator(ground_v_feature, self.q_feature)

        # "Fake" sample: frames pooled with the predicted distribution
        # (sigmoid scores normalized to sum to 1 over frames).
        generated_prod = tf.nn.sigmoid(logit_score) / tf.reduce_sum(
            tf.nn.sigmoid(logit_score), keepdims=True, axis=1)
        generated_v_feature = tf.reduce_sum(
            tf.multiply(model_outputs, tf.expand_dims(generated_prod, 2)), 1)
        generated_out = self.discriminator(
            generated_v_feature, self.q_feature, reuse_flag=True)

        # Variables created by the discriminator calls above.
        all_variable = tf.trainable_variables()
        self.D_variables = [
            vv for vv in all_variable if vv not in self.G_variables
        ]
        D_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.D_variables])
        D_reg_loss = self.regularization_beta * D_regularization_cost

        # Standard GAN objectives: real -> 1, generated -> 0.
        ground_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.ones_like(ground_out), logits=ground_out))
        generated_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.zeros_like(generated_out), logits=generated_out))

        regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in all_variable])
        self.reg_loss = self.regularization_beta * regularization_cost

        self.dist_loss = avg_logit_loss
        self.G_pre_loss = avg_logit_loss + G_reg_loss
        self.D_loss = ground_loss + generated_loss + D_reg_loss
        # Generator loss: supervised term plus an adversarial term that
        # pushes generated features to be classified as real.
        scale = self.max_frames / 5
        self.G_loss = avg_logit_loss + G_reg_loss + scale * tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.ones_like(generated_out), logits=generated_out))
        self.frame_score = tf.nn.sigmoid(logit_score)

    with tf.variable_scope('Pointer_Layer'):
        # Regress the (start, end) window from the frame-score curve with
        # stacked 1-D convolutions.
        score_dist = tf.nn.sigmoid(logit_score)
        score_dist = conv_utils.normalize(score_dist, scope='layer_normal')
        output = tf.nn.relu(
            conv_utils.conv1d_with_bias(tf.expand_dims(score_dist, 2), 1, 16, 5))
        output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 2, 32, 10))
        output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 3, 64, 20))
        output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 4, 1, 10))
        output = layers.linear_layer(
            tf.squeeze(output, 2), 2, scope_name='pointer_output')
        self.predict_start_end = output
        gt_start_end = self.gt_windows
        pointer_loss = tf.reduce_mean(
            tf.square(tf.subtract(self.predict_start_end, gt_start_end)))

        all_variable = tf.trainable_variables()
        self.pn_variables = [
            vv for vv in all_variable
            if vv not in self.G_variables and vv not in self.D_variables
        ]
        pn_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.pn_variables])
        self.pn_loss = (pointer_loss +
                        self.regularization_beta * pn_regularization_cost)

    # Debug: show how the variables split across the three groups.
    print(self.G_variables)
    print(self.D_variables)
    print(self.pn_variables)
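# How the loss groups above are consumed is outside this file; a hypothetical
# training sketch follows. It alternates discriminator/generator updates in the
# usual adversarial fashion, with a supervised warm-up on G_pre_loss.
# `model`, `next_batch`, `num_steps`, and `pretrain_steps` are assumed names,
# not part of this repository.
def adversarial_train_sketch(model, next_batch, num_steps, pretrain_steps):
    g_pre_op = tf.train.AdamOptimizer(1e-3).minimize(
        model.G_pre_loss, var_list=model.G_variables)
    d_op = tf.train.AdamOptimizer(1e-3).minimize(
        model.D_loss, var_list=model.D_variables)
    g_op = tf.train.AdamOptimizer(1e-3).minimize(
        model.G_loss, var_list=model.G_variables)
    pn_op = tf.train.AdamOptimizer(1e-3).minimize(
        model.pn_loss, var_list=model.pn_variables)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(num_steps):
            feed = next_batch()  # assumed to fill the placeholders above
            if step < pretrain_steps:
                sess.run(g_pre_op, feed_dict=feed)  # supervised warm-up
            else:
                sess.run(d_op, feed_dict=feed)      # discriminator step
                sess.run(g_op, feed_dict=feed)      # generator step
            sess.run(pn_op, feed_dict=feed)         # pointer regressor step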
def build_model(self):
    # Input layer: (batch_size, n_steps, input_dim)
    self.ques_vecs = tf.placeholder(
        tf.float32, [None, self.max_words, self.input_ques_dim])
    self.ques_len = tf.placeholder(tf.int32, [None])
    self.frame_vecs = tf.placeholder(
        tf.float32, [None, self.max_frames, self.input_video_dim])
    self.frame_len = tf.placeholder(tf.int32, [None])
    self.batch_size = tf.placeholder(tf.int32, [])
    self.is_training = tf.placeholder(tf.bool)
    self.gt_predict = tf.placeholder(tf.float32, [None, self.max_frames])
    self.gt_windows = tf.placeholder(tf.float32, [None, 2])

    self.frame_mask = tf.sequence_mask(self.frame_len, maxlen=self.max_frames)
    self.ques_mask = tf.sequence_mask(self.ques_len, maxlen=self.max_words)

    with tf.variable_scope("Frame_Embedding_Encoder_Layer"):
        frame_next_layer = tf.contrib.layers.dropout(
            self.frame_vecs, self.dropout, is_training=self.is_training)
        frame_next_layer = conv_utils.linear_mapping(
            frame_next_layer, self.hidden_size, dropout=self.dropout,
            var_scope_name="linear_mapping_before_cnn")
        frame_next_layer = transformer.normalize(frame_next_layer)
        frame_next_layer += transformer.positional_encoding_v2(
            frame_next_layer, num_units=self.hidden_size, zero_pad=False,
            scale=False, scope="enc_pe")
        # Three stacks of conv encoder + multi-head self-attention + FFN.
        # Note: self.dropout is a keep probability (as consumed by
        # tf.contrib.layers.dropout), so the attention drop rate is 1 - keep.
        for i in range(3):
            with tf.variable_scope("stack_%s" % i):
                frame_next_layer = conv_utils.conv_encoder_stack(
                    frame_next_layer,
                    [self.hidden_size, self.hidden_size, self.hidden_size],
                    [3, 3, 3], {'src': self.dropout, 'hid': self.dropout},
                    mode=self.is_training)
                frame_next_layer = transformer.multihead_attention(
                    queries=frame_next_layer,
                    keys=frame_next_layer,
                    num_units=self.hidden_size,
                    num_heads=4,
                    dropout_rate=1 - self.dropout,
                    is_training=self.is_training,
                    causality=False)
                frame_next_layer = transformer.feedforward(
                    frame_next_layer,
                    num_units=[2 * self.hidden_size, self.hidden_size],
                    is_training=self.is_training)
        frame_embedding = tf.contrib.layers.dropout(
            frame_next_layer, self.dropout, is_training=self.is_training)

    with tf.variable_scope("Ques_Embedding_Encoder_Layer"):
        ques_next_layer = tf.contrib.layers.dropout(
            self.ques_vecs, self.dropout, is_training=self.is_training)
        ques_next_layer = conv_utils.linear_mapping(
            ques_next_layer, self.hidden_size, dropout=self.dropout,
            var_scope_name="linear_mapping_before_cnn")
        ques_next_layer = transformer.normalize(ques_next_layer)
        ques_next_layer += transformer.positional_encoding_v2(
            ques_next_layer, num_units=self.hidden_size, zero_pad=False,
            scale=False, scope="enc_pe")
        # A single, shallower stack for the question sequence.
        for i in range(1):
            with tf.variable_scope("stack_%s" % i):
                ques_next_layer = conv_utils.conv_encoder_stack(
                    ques_next_layer, [self.hidden_size, self.hidden_size],
                    [3, 3], {'src': self.dropout, 'hid': self.dropout},
                    mode=self.is_training)
                ques_next_layer = transformer.multihead_attention(
                    queries=ques_next_layer,
                    keys=ques_next_layer,
                    num_units=self.hidden_size,
                    num_heads=4,
                    dropout_rate=1 - self.dropout,
                    is_training=self.is_training,
                    causality=False)
                ques_next_layer = transformer.feedforward(
                    ques_next_layer,
                    num_units=[2 * self.hidden_size, self.hidden_size],
                    is_training=self.is_training)
        ques_embedding = tf.contrib.layers.dropout(
            ques_next_layer, self.dropout, is_training=self.is_training)

        # Sentence-level question feature: mean over the valid time steps only.
        # q_feature, _ = layers.weight_attention_layer(ques_embedding, self.hidden_size, scope_name='q_feature')
        ques_mask_embedding = layers.mask_zero(
            ques_next_layer, tf.expand_dims(self.ques_mask, 2))
        q_feature = tf.reduce_sum(ques_mask_embedding, axis=1) / tf.expand_dims(
            tf.cast(self.ques_len, tf.float32), 1)
        # q_feature = tf.reduce_mean(ques_next_layer, axis=1)
        print(q_feature.shape)  # debug
        self.q_feature = tf.contrib.layers.dropout(
            q_feature, self.dropout, is_training=self.is_training)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # Similarity matrix: M*N1*K ** M*N2*K --> M*N1*N2
        att_score = tf.matmul(frame_embedding, ques_embedding, transpose_b=True)
        mask_q = tf.expand_dims(self.ques_mask, 1)
        S_ = tf.nn.softmax(layers.mask_logits(att_score, mask=mask_q))
        mask_v = tf.expand_dims(self.frame_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(layers.mask_logits(att_score, mask=mask_v), axis=1),
            (0, 2, 1))
        self.v2q = tf.matmul(S_, ques_embedding)
        self.q2v = tf.matmul(tf.matmul(S_, S_T), frame_embedding)
        attention_outputs = tf.concat([
            frame_embedding, self.v2q, frame_embedding * self.v2q,
            frame_embedding * self.q2v
        ], 2)

    with tf.variable_scope("Model_Encoder_Layer"):
        model_next_layer = conv_utils.linear_mapping(
            attention_outputs, self.hidden_size, dropout=self.dropout,
            var_scope_name="linear_mapping_before_model_layer")
        model_next_layer = transformer.normalize(model_next_layer)
        for i in range(2):
            with tf.variable_scope("stack_%s" % i):
                model_next_layer = conv_utils.conv_encoder_stack(
                    model_next_layer, [self.hidden_size, self.hidden_size],
                    [3, 3], {'src': self.dropout, 'hid': self.dropout},
                    mode=self.is_training)
                model_next_layer = transformer.multihead_attention(
                    queries=model_next_layer,
                    keys=model_next_layer,
                    num_units=self.hidden_size,
                    num_heads=4,
                    dropout_rate=1 - self.dropout,
                    is_training=self.is_training,
                    causality=False)
                model_next_layer = transformer.feedforward(
                    model_next_layer,
                    num_units=[2 * self.hidden_size, self.hidden_size],
                    is_training=self.is_training)
        model_outputs = model_next_layer

    with tf.variable_scope("Output_Layer"):
        # logit_score = layers.correlation_layer(model_outputs, self.q_feature, self.hidden_size, scope_name='output_layer')
        logit_score = layers.linear_layer_3d(model_outputs, 1,
                                             scope_name='output_layer')
        logit_score = tf.squeeze(logit_score, 2)
        logit_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logit_score, labels=self.gt_predict)
        avg_logit_loss = tf.reduce_mean(tf.reduce_sum(logit_loss, 1))
        self.G_variables = tf.trainable_variables()
        G_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.G_variables])
        self.test_loss = avg_logit_loss
        self.loss = (avg_logit_loss +
                     self.regularization_beta * G_regularization_cost)
        self.frame_score = tf.nn.sigmoid(logit_score)

    with tf.variable_scope('Pointer_Layer'):
        # Regress the (start, end) window from the frame-score curve with
        # stacked 1-D convolutions.
        score_dist = tf.nn.sigmoid(logit_score)
        score_dist = conv_utils.normalize(score_dist, scope='layer_normal')
        output = tf.nn.relu(
            conv_utils.conv1d_with_bias(tf.expand_dims(score_dist, 2), 1, 16, 5))
        output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 2, 32, 10))
        output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 3, 64, 20))
        output = tf.nn.relu(conv_utils.conv1d_with_bias(output, 4, 1, 10))
        output = layers.linear_layer(
            tf.squeeze(output, 2), 2, scope_name='pointer_output')
        self.predict_start_end = output
        gt_start_end = self.gt_windows
        pointer_loss = tf.reduce_mean(
            tf.square(tf.subtract(self.predict_start_end, gt_start_end)))
        all_variable = tf.trainable_variables()
        self.pn_variables = [
            vv for vv in all_variable if vv not in self.G_variables
        ]
        pn_regularization_cost = tf.reduce_sum(
            [tf.nn.l2_loss(v) for v in self.pn_variables])
        self.pn_loss = (pointer_loss +
                        self.regularization_beta * pn_regularization_cost)
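# Both build_model variants rely on masking helpers from `layers`. A minimal
# sketch of their assumed semantics (hypothetical; the real layers.py may
# differ): `mask_logits` pushes padded positions toward -inf before a softmax,
# and `mask_zero` zeroes padded positions before a masked sum/mean.
def _mask_logits_sketch(logits, mask, mask_value=-1e30):
    mask = tf.cast(mask, tf.float32)
    return logits * mask + mask_value * (1.0 - mask)

def _mask_zero_sketch(inputs, mask):
    return inputs * tf.cast(mask, tf.float32)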
def build_train_proc(self):
    # input layer (batch_size, n_steps, input_dim)
    self.input_q = tf.placeholder(
        tf.float32, [None, self.max_n_q_words, self.input_ques_dim])
    self.input_q_len = tf.placeholder(tf.int32, [None])
    self.input_x = tf.placeholder(
        tf.float32, [None, self.input_n_frames, self.input_frame_dim])
    self.input_x_len = tf.placeholder(tf.int32, [None])
    self.y = tf.placeholder(tf.int32, [None, self.max_n_a_words])
    self.y_mask = tf.placeholder(tf.float32, [None, self.max_n_a_words])
    self.ans_vec = tf.placeholder(
        tf.float32, [None, self.max_n_a_words, self.input_ques_dim])
    self.batch_size = tf.placeholder(tf.int32, [])
    self.is_training = tf.placeholder(tf.bool)
    self.reward = tf.placeholder(tf.float32, [None])

    lstm_dim = self.lstm_dim

    # video LSTM layer, [n_steps * (batch_size, input_dim)] -> [n_steps * (batch_size, 2*lstm_dim)]
    input_x = tf.contrib.layers.dropout(
        self.input_x, self.dropout_prob, is_training=self.is_training)
    v_lstm_output, v_lstm_state = layers.dynamic_origin_lstm_layer(
        input_x, lstm_dim, 'v_lstm', input_len=self.input_x_len)

    # question LSTM layer
    q_lstm_output, q_lstm_state1 = layers.dynamic_origin_lstm_layer(
        self.input_q, lstm_dim, 'q_lstm', input_len=self.input_q_len)
    _, q_lstm_state2 = layers.dynamic_origin_lstm_layer(
        q_lstm_output, lstm_dim, 'q_lstm1', input_len=self.input_q_len)
    q_lstm_state_temp = tf.concat([q_lstm_state1[1], q_lstm_state2[1]], 1)
    q_lstm_state = layers.linear_layer(q_lstm_state_temp, self.lstm_dim,
                                       'linear0')

    qv_dot = tf.multiply(q_lstm_state, v_lstm_state[1])  # [None, 1024]

    # softmax projection [batch_size, 2*lstm_dim] -> [batch_size, n_classes]
    concat_output = tf.concat([q_lstm_state, qv_dot], axis=1)
    self.v_first_lstm_output = v_lstm_output
    self.q_last_state = q_lstm_state

    # decoder
    # output -> first_atten
    # self.decoder_cell = tf.contrib.rnn.BasicLSTMCell(self.decode_dim)
    self.decoder_cell = tf.contrib.rnn.GRUCell(self.decode_dim)
    with tf.variable_scope('linear'):
        decoder_input_W = tf.get_variable(
            'w', shape=[concat_output.shape[1], self.decode_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
            # initializer=tf.random_normal_initializer(stddev=0.03))
        decoder_input_b = tf.get_variable(
            'b', shape=[self.decode_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
            # initializer=tf.random_normal_initializer(stddev=0.03))
    self.decoder_input = tf.matmul(
        concat_output, decoder_input_W) + decoder_input_b  # [None, decode_dim]

    # answer -> word prediction
    self.embed_word_W = tf.Variable(
        tf.random_uniform([self.decode_dim, self.n_words], -0.1, 0.1),
        name='embed_word_W')
    self.embed_word_b = tf.Variable(
        tf.random_uniform([self.n_words], -0.1, 0.1), name='embed_word_b')

    # word dim -> decode_dim
    self.word_to_lstm_w = tf.Variable(
        tf.random_uniform([self.input_ques_dim, self.decode_dim], -0.1, 0.1),
        name='word_to_lstm_W')
    self.word_to_lstm_b = tf.Variable(
        tf.random_uniform([self.decode_dim], -0.1, 0.1),
        name='word_to_lstm_b')

    # decoder attention layer
    with tf.variable_scope('decoder_attention'):
        self.attention_w_q = tf.get_variable(
            'attention_w_q', shape=[self.lstm_dim, self.attention_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.attention_w_x = tf.get_variable(
            'attention_w_x', shape=[self.lstm_dim, self.attention_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.attention_w_h = tf.get_variable(
            'attention_w_h', shape=[self.decode_dim, self.attention_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.attention_b = tf.get_variable(
            'attention_b', shape=[self.attention_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.attention_a = tf.get_variable(
            'attention_a', shape=[self.attention_dim, 1], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.attention_to_decoder = tf.get_variable(
            'attention_to_decoder',
            shape=[self.lstm_dim, self.decode_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())

    # decoder
    with tf.variable_scope('decoder'):
        self.decoder_r = tf.get_variable(
            'decoder_r', shape=[self.decode_dim * 2, self.decode_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.decoder_z = tf.get_variable(
            'decoder_z', shape=[self.decode_dim * 2, self.decode_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        self.decoder_w = tf.get_variable(
            'decoder_w', shape=[self.decode_dim * 2, self.decode_dim],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())

    # embedding layer
    embeddings = load_file(self.params['word_embedding'])
    self.Wemb = tf.constant(embeddings, dtype=tf.float32)

    # generate training / testing answers
    answer_train, train_loss = self.generate_answer_on_training()
    answer_test, test_loss = self.generate_answer_on_testing()

    # final
    variables = tf.trainable_variables()
    regularization_cost = tf.reduce_sum(
        [tf.nn.l2_loss(v) for v in variables])
    self.answer_word_train = answer_train
    self.train_loss = (train_loss +
                       self.regularization_beta * regularization_cost)
    self.answer_word_test = answer_test
    self.test_loss = (test_loss +
                      self.regularization_beta * regularization_cost)
    tf.summary.scalar('training cross entropy', self.train_loss)
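# The attention parameters above are presumably consumed inside
# generate_answer_on_training()/generate_answer_on_testing(), which are not
# shown in this section. The shapes suggest additive (Bahdanau-style)
# attention over the video LSTM outputs; a hypothetical per-step sketch is
# below. The method name and argument names are assumptions.
def _decode_step_attention_sketch(self, v_outputs, q_state, h_state):
    """v_outputs: [batch, n_frames, lstm_dim]; q_state: [batch, lstm_dim];
    h_state: [batch, decode_dim]. Returns a [batch, decode_dim] context."""
    # Additive scores: a . tanh(W_x x_t + W_q q + W_h h + b) for each frame t.
    x_proj = tf.tensordot(v_outputs, self.attention_w_x, axes=[[2], [0]])
    q_proj = tf.expand_dims(tf.matmul(q_state, self.attention_w_q), 1)
    h_proj = tf.expand_dims(tf.matmul(h_state, self.attention_w_h), 1)
    scores = tf.tensordot(
        tf.tanh(x_proj + q_proj + h_proj + self.attention_b),
        self.attention_a, axes=[[2], [0]])  # [batch, n_frames, 1]
    alpha = tf.nn.softmax(scores, axis=1)   # attention weights over frames
    context = tf.reduce_sum(alpha * v_outputs, axis=1)    # [batch, lstm_dim]
    return tf.matmul(context, self.attention_to_decoder)  # [batch, decode_dim]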