def getVideoDualSemanticEmbedding(x, w2v, embedded_stories_words, T_B, pca_mat=None):
    '''
    x: input video cnn feature with size of (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix (|V|, dim)
    '''
    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert len(input_shape) == 5
    # (b, t, c, h, w) -> (b, t, h, w, c), then flatten the regions to (b*t*h*w, c)
    axis = [0, 1, 3, 4, 2]
    x = tf.transpose(x, perm=axis)
    x = tf.reshape(x, (-1, input_shape[2]))
    # x = tf.nn.l2_normalize(x,-1)
    # linearly project CNN channels into the word-embedding space
    if pca_mat is not None:
        linear_proj = tf.Variable(0.1 * pca_mat, dtype='float32', name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable((input_shape[2], w2v_shape[-1]),
                                                    init_method='uniform',
                                                    name='visual_linear_proj')
    x = tf.matmul(x, linear_proj)
    x = tf.nn.l2_normalize(x, -1)

    # re-express each region through the vocabulary covariance w2v^T w2v (dim, dim)
    w2v_cov = tf.matmul(tf.transpose(w2v, perm=[1, 0]), w2v)
    x = tf.matmul(x, w2v_cov)  # (batch_size*timesteps*height*width, dim)

    x = tf.reshape(x, (-1, input_shape[1], input_shape[3], input_shape[4], w2v_shape[-1]))
    axis = [0, 1, 4, 2, 3]
    x = tf.transpose(x, perm=axis)
    # sum-pool the spatial grid; can be extended to different architectures
    x = tf.reduce_sum(x, reduction_indices=[3, 4])
    x = tf.nn.l2_normalize(x, -1)

    # re-express each frame through the per-story covariance of subtitle embeddings
    stories_cov = batch_dot(tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
                            embedded_stories_words)
    print('stories_cov.get_shape():', stories_cov.get_shape().as_list())
    x = batch_dot(x, stories_cov)
    print('x.get_shape():', x.get_shape().as_list())

    x = tf.reshape(x, (-1, w2v_shape[-1]))
    x = tf.matmul(x, T_B)
    x = tf.reshape(x, (-1, input_shape[1], w2v_shape[-1]))
    # sum-pool over time and normalize
    x = tf.reduce_sum(x, reduction_indices=[1])
    x = tf.nn.l2_normalize(x, -1)
    return x
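# A minimal standalone sketch of the "semantic covariance" re-expression used
# above, with assumed toy shapes (flattened regions n=8, c=512, |V|=1000,
# dim=300); plain TF variables stand in for InitUtil and the real word2vec table.
def _demo_semantic_covariance_projection():
    n, c, V, dim = 8, 512, 1000, 300
    feats = tf.random_normal((n, c))                      # flattened regional CNN features
    proj = tf.get_variable('demo_visual_proj', (c, dim))  # stands in for linear_proj
    w2v = tf.random_normal((V, dim))                      # stands in for the word2vec matrix
    x = tf.nn.l2_normalize(tf.matmul(feats, proj), -1)
    w2v_cov = tf.matmul(w2v, w2v, transpose_a=True)       # (dim, dim) vocabulary covariance
    return tf.matmul(x, w2v_cov)                          # regions re-expressed in word space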
def getVideoDualSemanticEmbeddingWithQuestionAttentionForDemo(x, w2v, embedded_stories_words,
                                                              embedded_question, T_B, top_k,
                                                              pca_mat=None):
    '''
    x: input video cnn feature with size of (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix (|V|, dim)
    '''
    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert len(input_shape) == 5
    axis = [0, 1, 3, 4, 2]
    x = tf.transpose(x, perm=axis)
    x = tf.reshape(x, (-1, input_shape[2]))
    # x = tf.nn.l2_normalize(x,-1)
    if pca_mat is not None:
        linear_proj = tf.Variable(0.1 * pca_mat, dtype='float32', name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable((input_shape[2], w2v_shape[-1]),
                                                    init_method='uniform',
                                                    name='visual_linear_proj')
    x = tf.matmul(x, linear_proj)
    x = tf.nn.l2_normalize(x, -1)
    # -----------------------
    # The covariance product x @ (w2v^T w2v) is factorized into two matmuls so
    # the intermediate region-to-word weights can be returned for the demo:
    # w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v)
    # x = tf.matmul(x,w2v_cov)
    regional_weight = tf.matmul(x, tf.transpose(w2v, perm=[1, 0]))  # (batch_size*timesteps*height*width, |V|)
    x = tf.matmul(regional_weight, w2v)
    # -----------------------
    x = tf.reshape(x, (-1, input_shape[1], input_shape[3], input_shape[4], w2v_shape[-1]))
    axis = [0, 1, 4, 2, 3]
    x = tf.transpose(x, perm=axis)
    # sum-pool the spatial grid; can be extended to different architectures
    x = tf.reduce_sum(x, reduction_indices=[3, 4])
    x = tf.nn.l2_normalize(x, -1)
    # -----------------------
    # Same factorization for the story covariance, keeping frame-to-subtitle weights:
    # stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words)
    # x = batch_dot(x,stories_cov)
    frame_wise_weight = batch_dot(x, tf.transpose(embedded_stories_words, perm=[0, 2, 1]))
    x = batch_dot(frame_wise_weight, embedded_stories_words)
    # -----------------------
    x = tf.nn.l2_normalize(x, -1)
    # question-guided attention over frames
    embedded_question = tf.tile(tf.expand_dims(embedded_question, dim=1), [1, input_shape[1], 1])
    frame_weight = tf.reduce_sum(x * embedded_question, reduction_indices=-1, keep_dims=True)
    frame_weight = tf.nn.softmax(frame_weight, dim=1)
    frame_weight = tf.tile(frame_weight, [1, 1, w2v_shape[-1]])
    x = tf.reduce_sum(x * frame_weight, reduction_indices=1)
    # x = tf.nn.l2_normalize(x,-1)
    x = tf.matmul(x, T_B)
    x = tf.nn.l2_normalize(x, -1)

    # return the top-K attended words per region and the per-frame subtitle
    # weights (3660 is the hard-coded number of story sentences)
    words_top_K_weight, words_top_K_indice = tf.nn.top_k(regional_weight, k=top_k)
    subtitles_top_K_weight, subtitles_top_K_indice = tf.nn.top_k(frame_wise_weight, k=3660)
    words_top_K_weight = tf.reshape(words_top_K_weight,
                                    (-1, input_shape[1], input_shape[3], input_shape[4], top_k))
    words_top_K_indice = tf.reshape(words_top_K_indice,
                                    (-1, input_shape[1], input_shape[3], input_shape[4], top_k))
    subtitles_top_K_weight = tf.reshape(subtitles_top_K_weight, (-1, input_shape[1], 3660))
    subtitles_top_K_indice = tf.reshape(subtitles_top_K_indice, (-1, input_shape[1], 3660))
    print('words_top_K_weight:', words_top_K_weight.get_shape().as_list())
    print('words_top_K_indice:', words_top_K_indice.get_shape().as_list())
    print('subtitles_top_K_weight:', subtitles_top_K_weight.get_shape().as_list())
    print('subtitles_top_K_indice:', subtitles_top_K_indice.get_shape().as_list())
    return x, words_top_K_weight, words_top_K_indice, subtitles_top_K_weight, subtitles_top_K_indice
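# Hedged sketch of how the returned indices might be mapped back to words when
# rendering the demo output; `vocab` (an index -> word list) and the fetched
# numpy array are hypothetical stand-ins, not part of this repo.
import numpy as np

def _demo_top_words(regional_weight_np, vocab, top_k=5):
    # regional_weight_np: (num_regions, |V|) array fetched via session.run
    top_idx = np.argsort(-regional_weight_np, axis=-1)[:, :top_k]
    return [[vocab[i] for i in row] for row in top_idx]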
def init_parameters(self):
    print('init_parameters ...')
    # encoder parameters: two stacked GRU layers; each input-to-hidden weight
    # produces the fused r/z/h pre-activations of size 3*output_dim
    # print(self.encoder_input_shape)
    encoder_i2h_shape = (self.encoder_input_shape[-1], 3 * self.output_dim)
    encoder_h2h_shape = (self.output_dim, self.output_dim)
    self.W_e_1 = InitUtil.init_weight_variable(encoder_i2h_shape, init_method='glorot_uniform', name="W_e_1")
    self.U_e_r_1 = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_r_1")
    self.U_e_z_1 = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_z_1")
    self.U_e_h_1 = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_h_1")
    self.b_e_1 = InitUtil.init_bias_variable((3 * self.output_dim,), name="b_e_1")

    encoder_i2h_shape = (self.output_dim, 3 * self.output_dim)
    encoder_h2h_shape = (self.output_dim, self.output_dim)
    self.W_e_2 = InitUtil.init_weight_variable(encoder_i2h_shape, init_method='glorot_uniform', name="W_e_2")
    self.U_e_r_2 = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_r_2")
    self.U_e_z_2 = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_z_2")
    self.U_e_h_2 = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_h_2")
    self.b_e_2 = InitUtil.init_bias_variable((3 * self.output_dim,), name="b_e_2")

    # decoder parameters (per-gate GRU weights)
    self.T_w2v, self.T_mask = self.init_embedding_matrix()
    decoder_i2h_shape = (self.d_w2v, self.output_dim)
    decoder_h2h_shape = (self.output_dim, self.output_dim)
    self.W_d_r = InitUtil.init_weight_variable(decoder_i2h_shape, init_method='glorot_uniform', name="W_d_r")
    self.W_d_z = InitUtil.init_weight_variable(decoder_i2h_shape, init_method='glorot_uniform', name="W_d_z")
    self.W_d_h = InitUtil.init_weight_variable(decoder_i2h_shape, init_method='glorot_uniform', name="W_d_h")
    self.U_d_r = InitUtil.init_weight_variable(decoder_h2h_shape, init_method='orthogonal', name="U_d_r")
    self.U_d_z = InitUtil.init_weight_variable(decoder_h2h_shape, init_method='orthogonal', name="U_d_z")
    self.U_d_h = InitUtil.init_weight_variable(decoder_h2h_shape, init_method='orthogonal', name="U_d_h")
    self.b_d_r = InitUtil.init_bias_variable((self.output_dim,), name="b_d_r")
    self.b_d_z = InitUtil.init_bias_variable((self.output_dim,), name="b_d_z")
    self.b_d_h = InitUtil.init_bias_variable((self.output_dim,), name="b_d_h")

    # attention parameters
    self.W_a = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.attention_dim), init_method='glorot_uniform', name="W_a")
    self.U_a = InitUtil.init_weight_variable((self.output_dim, self.attention_dim), init_method='orthogonal', name="U_a")
    self.b_a = InitUtil.init_bias_variable((self.attention_dim,), name="b_a")
    self.W = InitUtil.init_weight_variable((self.attention_dim, 1), init_method='glorot_uniform', name="W")
    self.A_z = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_z")
    self.A_r = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_r")
    self.A_h = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_h")

    # classification parameters
    self.W_c = InitUtil.init_weight_variable((self.output_dim, self.voc_size), init_method='uniform', name='W_c')
    self.b_c = InitUtil.init_bias_variable((self.voc_size,), name="b_c")
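# A minimal sketch (helper not in this repo) of how a fused (input_dim, 3*output_dim)
# weight such as W_e_1 is typically consumed: the pre-activation is sliced into the
# r/z/h thirds of a GRU step, matching the (1 - z) * hh + z * h_tm1 convention used
# elsewhere in this file.
def _demo_fused_gru_step(x_t, h_tm1, W, U_r, U_z, U_h, b, dim):
    pre = tf.nn.xw_plus_b(x_t, W, b)                 # (batch, 3*dim) fused pre-activation
    x_r, x_z, x_h = pre[:, :dim], pre[:, dim:2 * dim], pre[:, 2 * dim:]
    r = tf.sigmoid(x_r + tf.matmul(h_tm1, U_r))      # reset gate
    z = tf.sigmoid(x_z + tf.matmul(h_tm1, U_z))      # update gate
    hh = tf.tanh(x_h + tf.matmul(r * h_tm1, U_h))    # candidate state
    return (1 - z) * hh + z * h_tm1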
def getVideoHierarchicalSemanticWithAttendQuestionExe(x, w2v, embedded_stories_words,
                                                      embedded_question, T_B, mask_q,
                                                      pca_mat=None, return_sequences=False):
    '''
    x: input video cnn feature with size of (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix (|V|, dim)
    '''
    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert len(input_shape) == 5
    axis = [0, 1, 3, 4, 2]
    x = tf.transpose(x, perm=axis)
    x = tf.reshape(x, (-1, input_shape[2]))
    # x = tf.nn.l2_normalize(x,-1)
    if pca_mat is not None:
        linear_proj = tf.Variable(0.1 * pca_mat, dtype='float32', name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable((input_shape[2], w2v_shape[-1]),
                                                    init_method='uniform',
                                                    name='visual_linear_proj')
    x = tf.matmul(x, linear_proj)
    x = tf.nn.l2_normalize(x, -1)
    # -----------------------
    w2v_cov = tf.matmul(tf.transpose(w2v, perm=[1, 0]), w2v)
    x = tf.matmul(x, w2v_cov)  # (batch_size*timesteps*height*width, dim)
    # -----------------------
    x = tf.reshape(x, (-1, input_shape[1], input_shape[3], input_shape[4], w2v_shape[-1]))
    axis = [0, 1, 4, 2, 3]
    x = tf.transpose(x, perm=axis)
    # sum-pool the spatial grid; can be extended to different architectures
    x = tf.reduce_sum(x, reduction_indices=[3, 4])
    x = tf.nn.l2_normalize(x, -1)
    # -----------------------
    # embedded_stories_words = tf.nn.l2_normalize(embedded_stories_words,-1)  # test
    stories_cov = batch_dot(tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
                            embedded_stories_words)
    x = batch_dot(x, stories_cov)
    # -----------------------
    # x = tf.nn.l2_normalize(x,-1)
    input_shape = x.get_shape().as_list()
    assert len(input_shape) == 3
    timesteps = input_shape[1]
    input_dims = input_shape[2]
    output_dims = input_dims

    # get initial state
    initial_state = get_init_state(x, output_dims)

    # recurrent parameters; the input projections are commented out because
    # input_dims == output_dims, so x_t feeds the gates directly
    # W_r = InitUtil.init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_v_r")
    # W_z = InitUtil.init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_v_z")
    # W_h = InitUtil.init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_v_h")
    U_r = InitUtil.init_weight_variable((output_dims, output_dims), init_method='orthogonal', name="U_v_r")
    U_z = InitUtil.init_weight_variable((output_dims, output_dims), init_method='orthogonal', name="U_v_z")
    U_h = InitUtil.init_weight_variable((output_dims, output_dims), init_method='orthogonal', name="U_v_h")
    # b_r = InitUtil.init_bias_variable((output_dims,),name="b_v_r")
    # b_z = InitUtil.init_bias_variable((output_dims,),name="b_v_z")
    # b_h = InitUtil.init_bias_variable((output_dims,),name="b_v_h")

    # attention parameters
    attention_dim = 30
    W_a = InitUtil.init_weight_variable((input_dims, attention_dim), init_method='glorot_uniform', name="W_a")
    U_a = InitUtil.init_weight_variable((output_dims, attention_dim), init_method='orthogonal', name="U_a")
    b_a = InitUtil.init_bias_variable((attention_dim,), name="b_a")
    W = InitUtil.init_weight_variable((attention_dim, 1), init_method='glorot_uniform', name="W")
    A_z = InitUtil.init_weight_variable((input_dims, output_dims), init_method='orthogonal', name="A_z")
    A_r = InitUtil.init_weight_variable((input_dims, output_dims), init_method='orthogonal', name="A_r")
    A_h = InitUtil.init_weight_variable((input_dims, output_dims), init_method='orthogonal', name="A_h")

    # batch_size x timesteps x dim -> timesteps x batch_size x dim
    axis = [1, 0, 2]
    x = tf.transpose(x, perm=axis)

    input_x = tf.TensorArray(dtype=x.dtype, size=timesteps, tensor_array_name='input_x')
    if hasattr(input_x, 'unstack'):
        input_x = input_x.unstack(x)
    else:
        input_x = input_x.unpack(x)
    hidden_state_x = tf.TensorArray(dtype=tf.float32, size=timesteps,
                                    tensor_array_name='hidden_state_x')
    timesteps_q = embedded_question.get_shape().as_list()[1]

    def step(time, hidden_state_x, h_tm1):
        x_t = input_x.read(time)  # (batch_size, dim)
        # attend over the question words, conditioned on the previous hidden state
        ori_feature = tf.reshape(embedded_question, (-1, input_dims))
        attend_wx = tf.reshape(tf.nn.xw_plus_b(ori_feature, W_a, b_a),
                               (-1, timesteps_q, attention_dim))
        attend_uh_tm1 = tf.tile(tf.expand_dims(tf.matmul(h_tm1, U_a), dim=1),
                                [1, timesteps_q, 1])
        attend_e = tf.nn.tanh(attend_wx + attend_uh_tm1)
        attend_e = tf.matmul(tf.reshape(attend_e, (-1, attention_dim)), W)
        # zero out padded question positions before the softmax
        attend_e = tf.where(tf.reshape(mask_q, (-1,)), attend_e, tf.zeros_like(attend_e))
        attend_e = tf.nn.softmax(tf.reshape(attend_e, (-1, timesteps_q, 1)), dim=1)
        attend_fea = embedded_question * tf.tile(attend_e, [1, 1, input_dims])
        attend_fea = tf.reduce_sum(attend_fea, reduction_indices=1)
        # GRU update conditioned on the attended question feature
        # preprocess_x_r = tf.nn.xw_plus_b(x_t, W_r, b_r)
        # preprocess_x_z = tf.nn.xw_plus_b(x_t, W_z, b_z)
        # preprocess_x_h = tf.nn.xw_plus_b(x_t, W_h, b_h)
        r = hard_sigmoid(x_t + tf.matmul(h_tm1, U_r) + tf.matmul(attend_fea, A_r))
        z = hard_sigmoid(x_t + tf.matmul(h_tm1, U_z) + tf.matmul(attend_fea, A_z))
        hh = tf.nn.tanh(x_t + tf.matmul(r * h_tm1, U_h) + tf.matmul(attend_fea, A_h))
        h = (1 - z) * hh + z * h_tm1
        hidden_state_x = hidden_state_x.write(time, h)
        return (time + 1, hidden_state_x, h)

    time = tf.constant(0, dtype='int32', name='time')
    ret_out = tf.while_loop(cond=lambda time, *_: time < timesteps,
                            body=step,
                            loop_vars=(time, hidden_state_x, initial_state),
                            parallel_iterations=32,
                            swap_memory=True)
    hidden_state_x = ret_out[1]
    last_output = ret_out[-1]
    if hasattr(hidden_state_x, 'stack'):
        outputs = hidden_state_x.stack()
        print('stack')
    else:
        outputs = hidden_state_x.pack()
    # timesteps x batch_size x dim -> batch_size x timesteps x dim
    outputs = tf.transpose(outputs, perm=[1, 0, 2])
    if return_sequences:
        return outputs
    else:
        return last_output
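# A standalone sketch of the TensorArray + tf.while_loop scan pattern used above,
# reduced to a cumulative-sum "RNN" on assumed toy shapes (batch=2, T=5, dim=3)
# so the mechanics are visible without the attention machinery.
def _demo_tensorarray_scan():
    b, T, d = 2, 5, 3
    seq = tf.transpose(tf.random_normal((b, T, d)), perm=[1, 0, 2])  # time-major
    inputs = tf.TensorArray(dtype=tf.float32, size=T).unstack(seq)
    states = tf.TensorArray(dtype=tf.float32, size=T)

    def body(t, states, h):
        h = h + inputs.read(t)               # stand-in for the GRU update
        return t + 1, states.write(t, h), h

    _, states, last = tf.while_loop(lambda t, *_: t < T, body,
                                    (tf.constant(0), states, tf.zeros((b, d))))
    # stack back to batch-major, mirroring the transpose at the end of the function
    return tf.transpose(states.stack(), perm=[1, 0, 2]), last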
def init_parameters(self):
    print('init_parameters ...')
    # encoder parameters (LSTM-style: the i2h/h2h weights are fused into a
    # single 4*output_dim block covering the four gates)
    # print(self.encoder_input_shape)
    encoder_i2h_shape = (self.encoder_input_shape[-1], 4 * self.output_dim)
    encoder_h2h_shape = (self.output_dim, 4 * self.output_dim)
    self.W_e = InitUtil.init_weight_variable(encoder_i2h_shape, init_method='glorot_uniform', name="W_e")
    self.U_e = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e")
    self.b_e = InitUtil.init_bias_variable((4 * self.output_dim,), name="b_e")

    # decoder parameters
    self.T_w2v, self.T_mask = self.init_embedding_matrix()
    decoder_i2h_shape = (self.d_w2v, 4 * self.output_dim)
    decoder_h2h_shape = (self.output_dim, 4 * self.output_dim)
    self.W_d = InitUtil.init_weight_variable(decoder_i2h_shape, init_method='glorot_uniform', name="W_d")
    self.U_d = InitUtil.init_weight_variable(decoder_h2h_shape, init_method='orthogonal', name="U_d")
    self.b_d = InitUtil.init_bias_variable((4 * self.output_dim,), name="b_d")

    # attention parameters
    self.W_a = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.attention_dim), init_method='glorot_uniform', name="W_a")
    self.U_a = InitUtil.init_weight_variable((self.output_dim, self.attention_dim), init_method='orthogonal', name="U_a")
    self.b_a = InitUtil.init_bias_variable((self.attention_dim,), name="b_a")
    self.W = InitUtil.init_weight_variable((self.attention_dim, 1), init_method='glorot_uniform', name="W")
    self.A = InitUtil.init_weight_variable((self.encoder_input_shape[-1], 4 * self.output_dim), init_method='orthogonal', name="A")

    # multirate: hidden units are partitioned into len(self.T_k) blocks
    self.block_length = int(math.ceil(self.output_dim/len(self.T_k)))
    print('block_length:%d' % self.block_length)

    # classification parameters
    self.W_c = InitUtil.init_weight_variable((self.output_dim, self.voc_size), init_method='uniform', name='W_c')
    self.b_c = InitUtil.init_bias_variable((self.voc_size,), name="b_c")
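# Hedged sketch (helper not in this repo) of how a fused 4*output_dim LSTM
# pre-activation such as W_e/U_e/b_e is usually consumed: split into the four
# gates (the i/f/o/g ordering here is an assumption, not taken from this repo).
def _demo_fused_lstm_step(x_t, h_tm1, c_tm1, W, U, b, dim):
    pre = tf.nn.xw_plus_b(x_t, W, b) + tf.matmul(h_tm1, U)  # (batch, 4*dim)
    i, f, o, g = tf.split(pre, 4, axis=1)                   # one slice per gate
    c = tf.sigmoid(f) * c_tm1 + tf.sigmoid(i) * tf.tanh(g)  # new cell state
    h = tf.sigmoid(o) * tf.tanh(c)                          # new hidden state
    return h, c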
def getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(
        embeded_stories, d_lproj, x, w2v, embedded_stories_words,
        embedded_question, T_B, pca_mat=None, return_sequences=True):
    '''
    x: input video cnn feature with size of (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix (|V|, dim)
    '''
    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert len(input_shape) == 5
    axis = [0, 1, 3, 4, 2]
    x = tf.transpose(x, perm=axis)
    x = tf.reshape(x, (-1, input_shape[2]))
    if pca_mat is not None:
        linear_proj = tf.Variable(0.1 * pca_mat, dtype='float32', name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable((input_shape[2], w2v_shape[-1]),
                                                    init_method='uniform',
                                                    name='visual_linear_proj')
    x = tf.matmul(x, linear_proj)
    x = tf.nn.l2_normalize(x, -1)
    # -----------------------
    w2v_cov = tf.matmul(tf.transpose(w2v, perm=[1, 0]), w2v)
    x = tf.matmul(x, w2v_cov)  # (batch_size*timesteps*height*width, dim)
    # -----------------------
    x = tf.reshape(x, (-1, input_shape[1], input_shape[3], input_shape[4], w2v_shape[-1]))
    axis = [0, 1, 4, 2, 3]
    x = tf.transpose(x, perm=axis)
    # sum-pool the spatial grid; can be extended to different architectures
    x = tf.reduce_sum(x, reduction_indices=[3, 4])
    x = tf.nn.l2_normalize(x, -1)
    # -----------------------
    stories_cov = batch_dot(tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
                            embedded_stories_words)
    x_out = batch_dot(x, stories_cov)
    # -----------------------
    x = tf.nn.l2_normalize(x_out, -1)
    # question-guided frame attention
    embedded_question_use = tf.tile(tf.expand_dims(embedded_question, dim=1), [1, input_shape[1], 1])
    frame_weight = tf.reduce_sum(x * embedded_question_use, reduction_indices=-1, keep_dims=True)
    frame_weight = tf.nn.softmax(frame_weight, dim=1)
    frame_weight = tf.tile(frame_weight, [1, 1, w2v_shape[-1]])
    x_weight_new = tf.reduce_sum(x * frame_weight, reduction_indices=1)
    x_weight_use = tf.expand_dims(x_weight_new, dim=1)
    # reweight the story sentences by their relevance to the attended video feature
    story_weight = tf.matmul(x_weight_use, tf.transpose(embedded_stories_words, perm=[0, 2, 1]))
    story_weight = tf.nn.relu(story_weight)
    embedded_stories_words = tf.multiply(tf.transpose(story_weight, perm=[0, 2, 1]),
                                         embedded_stories_words)
    stories_cov = batch_dot(tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
                            embedded_stories_words)
    x = batch_dot(x, stories_cov)
    # NOTE: as in the original code, the next line re-normalizes x_out, so the
    # batch_dot result on the line above is discarded here.
    x = tf.nn.l2_normalize(x_out, -1)
    # -------------------------------------------------------------------------
    # question-guided sentence attention over the story
    stories_shape = embeded_stories.get_shape().as_list()
    embeded_question_shape = embedded_question.get_shape().as_list()
    num_of_sentence = stories_shape[-3]
    input_dims = stories_shape[-1]
    output_dims = embeded_question_shape[-1]
    print('embeded_question_shape', embeded_question_shape)
    print('num_of_sentence', num_of_sentence)
    print('output_dims', output_dims)
    print('stories_shape', stories_shape)
    embeded_question = tf.tile(tf.expand_dims(embedded_question, dim=1), [1, num_of_sentence, 1])
    sen_weight = tf.reduce_sum(embeded_question * embedded_stories_words,
                               reduction_indices=-1, keep_dims=True)
    sen_weight = tf.nn.relu(sen_weight)
    sen_weight = tf.tile(sen_weight, [1, 1, output_dims])
    if return_sequences:
        embeded_stories_used = embedded_stories_words * sen_weight
    else:
        embeded_stories_used = tf.reduce_sum(embedded_stories_words * sen_weight,
                                             reduction_indices=1)
    # -------------------------------------------------------------------------
    stories_cov = batch_dot(tf.transpose(embeded_stories_used, perm=[0, 2, 1]),
                            embeded_stories_used)
    x = batch_dot(x, stories_cov)
    # -----------------------
    x = tf.nn.l2_normalize(x, -1)
    frame_weight = tf.reduce_sum(x * embedded_question_use, reduction_indices=-1, keep_dims=True)
    frame_weight = tf.nn.softmax(frame_weight, dim=1)
    frame_weight = tf.tile(frame_weight, [1, 1, w2v_shape[-1]])
    x = tf.reduce_sum(x * frame_weight, reduction_indices=1)
    # -----------------------------------------------
    x = tf.matmul(x, T_B)
    x = tf.nn.l2_normalize(x, -1)
    return x
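# Minimal sketch of the question-guided story reweighting used above, on assumed
# shapes (batch, frames=32, sentences=3660, dim=300); plain tf.matmul (batched
# for 3-D tensors) stands in for the repo's batch_dot, and broadcasting replaces
# the explicit tf.tile calls.
def _demo_question_guided_story(x, question, stories):
    # x: (b, 32, 300) frame features; question: (b, 300); stories: (b, 3660, 300)
    q = tf.expand_dims(question, 1)                                      # (b, 1, 300)
    frame_w = tf.nn.softmax(tf.reduce_sum(x * q, reduction_indices=-1,
                                          keep_dims=True), dim=1)        # (b, 32, 1)
    pooled = tf.reduce_sum(x * frame_w, reduction_indices=1,
                           keep_dims=True)                               # (b, 1, 300)
    story_w = tf.nn.relu(tf.matmul(pooled, stories, transpose_b=True))   # (b, 1, 3660)
    stories = tf.transpose(story_w, [0, 2, 1]) * stories                 # reweighted sentences
    cov = tf.matmul(stories, stories, transpose_a=True)                  # (b, 300, 300)
    return tf.matmul(x, cov)                                             # (b, 32, 300)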
def init_parameters(self):
    print('init_parameters ...')
    # encoder parameters (per-gate GRU weights)
    # print(self.encoder_input_shape)
    encoder_i2h_shape = (self.encoder_input_shape[-1], self.output_dim)
    encoder_h2h_shape = (self.output_dim, self.output_dim)
    self.W_e_r = InitUtil.init_weight_variable(encoder_i2h_shape, init_method='glorot_uniform', name="W_e_r")
    self.W_e_z = InitUtil.init_weight_variable(encoder_i2h_shape, init_method='glorot_uniform', name="W_e_z")
    self.W_e_h = InitUtil.init_weight_variable(encoder_i2h_shape, init_method='glorot_uniform', name="W_e_h")
    self.U_e_r = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_r")
    self.U_e_z = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_z")
    self.U_e_h = InitUtil.init_weight_variable(encoder_h2h_shape, init_method='orthogonal', name="U_e_h")
    self.b_e_r = InitUtil.init_bias_variable((self.output_dim,), name="b_e_r")
    self.b_e_z = InitUtil.init_bias_variable((self.output_dim,), name="b_e_z")
    self.b_e_h = InitUtil.init_bias_variable((self.output_dim,), name="b_e_h")

    # decoder parameters
    self.T_w2v, self.T_mask = self.init_embedding_matrix()
    decoder_i2h_shape = (self.d_w2v, self.output_dim)
    decoder_h2h_shape = (self.output_dim, self.output_dim)
    self.W_d_r = InitUtil.init_weight_variable(decoder_i2h_shape, init_method='glorot_uniform', name="W_d_r")
    self.W_d_z = InitUtil.init_weight_variable(decoder_i2h_shape, init_method='glorot_uniform', name="W_d_z")
    self.W_d_h = InitUtil.init_weight_variable(decoder_i2h_shape, init_method='glorot_uniform', name="W_d_h")
    self.U_d_r = InitUtil.init_weight_variable(decoder_h2h_shape, init_method='orthogonal', name="U_d_r")
    self.U_d_z = InitUtil.init_weight_variable(decoder_h2h_shape, init_method='orthogonal', name="U_d_z")
    self.U_d_h = InitUtil.init_weight_variable(decoder_h2h_shape, init_method='orthogonal', name="U_d_h")
    self.b_d_r = InitUtil.init_bias_variable((self.output_dim,), name="b_d_r")
    self.b_d_z = InitUtil.init_bias_variable((self.output_dim,), name="b_d_z")
    self.b_d_h = InitUtil.init_bias_variable((self.output_dim,), name="b_d_h")

    # attention parameters
    self.W_a = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.attention_dim), init_method='glorot_uniform', name="W_a")
    self.U_a = InitUtil.init_weight_variable((self.output_dim, self.attention_dim), init_method='orthogonal', name="U_a")
    self.b_a = InitUtil.init_bias_variable((self.attention_dim,), name="b_a")
    self.W = InitUtil.init_weight_variable((self.attention_dim, 1), init_method='glorot_uniform', name="W")
    self.A_z = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_z")
    self.A_r = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_r")
    self.A_h = InitUtil.init_weight_variable((self.encoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_h")

    # predict-feature decoder parameters
    predict_decoder_i2h_shape = (self.unsup_decoder_input_shape[-1], self.output_dim)
    predict_decoder_h2h_shape = (self.output_dim, self.output_dim)
    predict_decoder_h2o_shape = (self.output_dim, self.unsup_decoder_input_shape[-1])
    self.W_p_r = InitUtil.init_weight_variable(predict_decoder_i2h_shape, init_method='glorot_uniform', name="W_p_r")
    self.W_p_z = InitUtil.init_weight_variable(predict_decoder_i2h_shape, init_method='glorot_uniform', name="W_p_z")
    self.W_p_h = InitUtil.init_weight_variable(predict_decoder_i2h_shape, init_method='glorot_uniform', name="W_p_h")
    self.W_p_o = InitUtil.init_weight_variable(predict_decoder_h2o_shape, init_method='glorot_uniform', name='W_p_o')
    self.U_p_r = InitUtil.init_weight_variable(predict_decoder_h2h_shape, init_method='orthogonal', name="U_p_r")
    self.U_p_z = InitUtil.init_weight_variable(predict_decoder_h2h_shape, init_method='orthogonal', name="U_p_z")
    self.U_p_h = InitUtil.init_weight_variable(predict_decoder_h2h_shape, init_method='orthogonal', name="U_p_h")
    self.b_p_r = InitUtil.init_bias_variable((self.output_dim,), name="b_p_r")
    self.b_p_z = InitUtil.init_bias_variable((self.output_dim,), name="b_p_z")
    self.b_p_h = InitUtil.init_bias_variable((self.output_dim,), name="b_p_h")
    self.W_p_a = InitUtil.init_weight_variable((self.unsup_decoder_input_shape[-1], self.attention_dim), init_method='glorot_uniform', name="W_p_a")
    self.U_p_a = InitUtil.init_weight_variable((self.output_dim, self.attention_dim), init_method='orthogonal', name="U_p_a")
    self.b_p_a = InitUtil.init_bias_variable((self.attention_dim,), name="b_p_a")
    self.W_p = InitUtil.init_weight_variable((self.attention_dim, 1), init_method='glorot_uniform', name="W_p")
    self.A_p_z = InitUtil.init_weight_variable((self.unsup_decoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_p_z")
    self.A_p_r = InitUtil.init_weight_variable((self.unsup_decoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_p_r")
    self.A_p_h = InitUtil.init_weight_variable((self.unsup_decoder_input_shape[-1], self.output_dim), init_method='orthogonal', name="A_p_h")

    # multirate: hidden units are partitioned into len(self.T_k) blocks
    self.block_length = int(math.ceil(self.output_dim/len(self.T_k)))
    print('block_length:%d' % self.block_length)

    # classification parameters
    self.W_c = InitUtil.init_weight_variable((self.output_dim, self.voc_size), init_method='uniform', name='W_c')
    self.b_c = InitUtil.init_bias_variable((self.voc_size,), name="b_c")
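# Hedged sketch of the additive attention that parameter sets like
# (W_a, U_a, b_a, W) implement: e_t = W^T tanh(W_a v_t + U_a h + b_a), with a
# softmax over timesteps. The helper name and argument layout are illustrative,
# not taken from this repo.
def _demo_additive_attention(v, h, W_a, U_a, b_a, W):
    # v: (b, T, feat) encoder features; h: (b, out) current decoder state
    T = v.get_shape().as_list()[1]
    feat = v.get_shape().as_list()[-1]
    att_dim = b_a.get_shape().as_list()[0]
    wx = tf.reshape(tf.nn.xw_plus_b(tf.reshape(v, (-1, feat)), W_a, b_a),
                    (-1, T, att_dim))                              # W_a v_t + b_a
    uh = tf.expand_dims(tf.matmul(h, U_a), 1)                      # (b, 1, att_dim)
    e = tf.matmul(tf.reshape(tf.tanh(wx + uh), (-1, att_dim)), W)  # (b*T, 1)
    alpha = tf.nn.softmax(tf.reshape(e, (-1, T, 1)), dim=1)        # attention weights
    return tf.reduce_sum(v * alpha, reduction_indices=1)           # (b, feat) context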
def getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(
        embeded_stories, d_lproj, x, w2v, embedded_stories_words,
        embedded_question, T_B, pca_mat=None, return_sequences=True):
    '''
    x: input video cnn feature with size of (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix (|V|, dim)

    Shapes in the demo configuration:
      embeded_stories          (?, 3660, 40, 300)
      d_lproj                  300
      x                        (?, 32, 512, 7, 7)
      w2v                      (26033, 300)
      embedded_stories_words   (?, 3660, 300)
      embedded_question        (?, 300)
      T_B                      (300, 300)
      pca_mat                  (512, 300)
    '''
    # pdb.set_trace()
    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert len(input_shape) == 5
    axis = [0, 1, 3, 4, 2]
    x = tf.transpose(x, perm=axis)
    x = tf.reshape(x, (-1, input_shape[2]))
    if pca_mat is not None:
        linear_proj = tf.Variable(0.1 * pca_mat, dtype='float32', name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable((input_shape[2], w2v_shape[-1]),
                                                    init_method='uniform',
                                                    name='visual_linear_proj')
    x = tf.matmul(x, linear_proj)
    x = tf.nn.l2_normalize(x, -1)
    # -----------------------
    w2v_cov = tf.matmul(tf.transpose(w2v, perm=[1, 0]), w2v)
    x = tf.matmul(x, w2v_cov)  # (batch_size*timesteps*height*width, dim)
    # -----------------------
    x = tf.reshape(x, (-1, input_shape[1], input_shape[3], input_shape[4], w2v_shape[-1]))
    axis = [0, 1, 4, 2, 3]
    x = tf.transpose(x, perm=axis)
    x = tf.reduce_sum(x, reduction_indices=[3, 4])
    x = tf.nn.l2_normalize(x, -1)
    # pdb.set_trace()
    # Up to here this matches the base embedding, except that the 7x7 grid is
    # pooled into one region per frame while the 32 frames are kept:
    # x -> shape=(?, 32, 300)
    # -----------------------
    # embedded_stories_words -> shape=(?, 3660, 300)
    stories_cov = batch_dot(tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
                            embedded_stories_words)  # shape=(?, 300, 300)
    x_out = batch_dot(x, stories_cov)  # (?, 32, 300) x (?, 300, 300) -> (?, 32, 300)
    # -----------------------
    x = tf.nn.l2_normalize(x_out, -1)  # (?, 32, 300)
    # replicate the question embedding from (?, 300) to (?, 32, 300)
    embedded_question_use = tf.tile(tf.expand_dims(embedded_question, dim=1),
                                    [1, input_shape[1], 1])
    frame_weight = tf.reduce_sum(x * embedded_question_use,
                                 reduction_indices=-1, keep_dims=True)  # (?, 32, 1)
    frame_weight = tf.nn.softmax(frame_weight, dim=1)  # weight of each frame
    frame_weight = tf.tile(frame_weight, [1, 1, w2v_shape[-1]])  # (?, 32, 300)
    x_weight_new = tf.reduce_sum(x * frame_weight, reduction_indices=1)
    x_weight_use = tf.expand_dims(x_weight_new, dim=1)  # (?, 1, 300)
    story_weight = tf.matmul(x_weight_use,
                             tf.transpose(embedded_stories_words, perm=[0, 2, 1]))
    story_weight = tf.nn.relu(story_weight)  # (?, 1, 3660)
    embedded_stories_words = tf.multiply(tf.transpose(story_weight, perm=[0, 2, 1]),
                                         embedded_stories_words)  # (?, 3660, 300)
    stories_cov = batch_dot(tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
                            embedded_stories_words)  # (?, 300, 300)
    # The block above builds a subtitle-weighted (300, 300) covariance and
    # multiplies the video features with it again.
    x = batch_dot(x, stories_cov)
    # NOTE: as in the original code, the next line re-normalizes x_out, so the
    # batch_dot result on the line above is discarded here.
    x = tf.nn.l2_normalize(x_out, -1)  # (?, 32, 300)
    # The projected video features with subtitle-based attention keep the shape
    # (?, 32, 300), so the temporal context is preserved.
    # -------------------------------------------------------------------------
    # So far: subtitle-based attention over the projected visual features.
    # Next: question-based attention -- first attend over the subtitles with the
    # question, then apply the result to the visual features.
    stories_shape = embeded_stories.get_shape().as_list()
    embeded_question_shape = embedded_question.get_shape().as_list()
    num_of_sentence = stories_shape[-3]
    input_dims = stories_shape[-1]
    output_dims = embeded_question_shape[-1]
    print('embeded_question_shape', embeded_question_shape)  # [None, 300]
    print('num_of_sentence', num_of_sentence)  # 3660
    print('output_dims', output_dims)  # 300
    print('stories_shape', stories_shape)  # [None, 3660, 40, 300]
    embeded_question = tf.tile(tf.expand_dims(embedded_question, dim=1),
                               [1, num_of_sentence, 1])  # (?, 300) -> (?, 3660, 300)
    sen_weight = tf.reduce_sum(embeded_question * embedded_stories_words,
                               reduction_indices=-1, keep_dims=True)  # (?, 3660, 1) sentence weights
    sen_weight = tf.nn.relu(sen_weight)  # (?, 3660, 1)
    sen_weight = tf.tile(sen_weight, [1, 1, output_dims])  # (?, 3660, 300)
    if return_sequences:  # True in this configuration
        embeded_stories_used = embedded_stories_words * sen_weight  # (?, 3660, 300)
    else:
        embeded_stories_used = tf.reduce_sum(embedded_stories_words * sen_weight,
                                             reduction_indices=1)
    # -------------------------------------------------------------------------
    stories_cov = batch_dot(tf.transpose(embeded_stories_used, perm=[0, 2, 1]),
                            embeded_stories_used)  # (?, 300, 300)
    x = batch_dot(x, stories_cov)  # (?, 32, 300)
    # -----------------------
    x = tf.nn.l2_normalize(x, -1)
    frame_weight = tf.reduce_sum(x * embedded_question_use,
                                 reduction_indices=-1, keep_dims=True)  # (?, 32, 1)
    frame_weight = tf.nn.softmax(frame_weight, dim=1)
    frame_weight = tf.tile(frame_weight, [1, 1, w2v_shape[-1]])  # (?, 32, 300)
    x = tf.reduce_sum(x * frame_weight, reduction_indices=1)  # (?, 300)
    # -----------------------------------------------
    x = tf.matmul(x, T_B)  # (?, 300)
    x = tf.nn.l2_normalize(x, -1)
    return x  # final output is again of shape (?, 300)
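# Hedged end-to-end usage sketch of the function above, wiring placeholders with
# the shapes documented in its docstring; placeholder names are illustrative and
# pca_mat is omitted, so the uniform visual projection branch is used.
def _demo_question_guided_embedding():
    embeded_stories = tf.placeholder(tf.float32, (None, 3660, 40, 300))
    video = tf.placeholder(tf.float32, (None, 32, 512, 7, 7))
    w2v = tf.placeholder(tf.float32, (26033, 300))
    embedded_stories_words = tf.placeholder(tf.float32, (None, 3660, 300))
    embedded_question = tf.placeholder(tf.float32, (None, 300))
    T_B = tf.placeholder(tf.float32, (300, 300))
    return getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(
        embeded_stories, 300, video, w2v, embedded_stories_words,
        embedded_question, T_B)  # -> (batch_size, 300)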