Example No. 1
def getVideoDualSemanticEmbedding(x,
                                  w2v,
                                  embedded_stories_words,
                                  T_B,
                                  pca_mat=None):
    '''
    x: input video CNN feature of shape (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix of shape (|V|, dim)
    '''
    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert (len(input_shape) == 5)
    axis = [0, 1, 3, 4, 2]
    x = tf.transpose(x, perm=axis)
    x = tf.reshape(x, (-1, input_shape[2]))
    # x = tf.nn.l2_normalize(x,-1)

    if pca_mat is not None:
        linear_proj = tf.Variable(0.1 * pca_mat,
                                  dtype='float32',
                                  name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable(
            (input_shape[2], w2v_shape[-1]),
            init_method='uniform',
            name='visual_linear_proj')

    x = tf.matmul(x, linear_proj)
    x = tf.nn.l2_normalize(x, -1)

    w2v_cov = tf.matmul(tf.transpose(w2v, perm=[1, 0]), w2v)

    x = tf.matmul(x, w2v_cov)  # (batch_size*timesteps*height*width, dim)

    x = tf.reshape(
        x, (-1, input_shape[1], input_shape[3], input_shape[4], w2v_shape[-1]))
    axis = [0, 1, 4, 2, 3]
    x = tf.transpose(x, perm=axis)

    # can be extended to different architecture
    x = tf.reduce_sum(x, reduction_indices=[3, 4])
    x = tf.nn.l2_normalize(x, -1)

    stories_cov = batch_dot(
        tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
        embedded_stories_words)
    print('stories_cov.get_shape():', stories_cov.get_shape().as_list())
    x = batch_dot(x, stories_cov)
    print('x.get_shape():', x.get_shape().as_list())
    x = tf.reshape(x, (-1, w2v_shape[-1]))

    x = tf.matmul(x, T_B)
    x = tf.reshape(x, (-1, input_shape[1], w2v_shape[-1]))
    x = tf.reduce_sum(x, reduction_indices=[1])
    x = tf.nn.l2_normalize(x, -1)
    return x
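All of the examples on this page target TensorFlow 1.x (note the deprecated reduction_indices, keep_dims, and dim= arguments) and import helpers such as InitUtil, batch_dot, get_init_state, and hard_sigmoid from elsewhere in their projects. Those definitions are not shown here; the following is a minimal sketch of plausible stand-ins so the snippets can be exercised in isolation, not the projects' actual implementations:

import math

import numpy as np
import tensorflow as tf  # TensorFlow 1.x APIs (reduction_indices, keep_dims, dim=)


def batch_dot(a, b):
    # batched matrix product over the leading axis,
    # e.g. (B, M, K) x (B, K, N) -> (B, M, N)
    return tf.matmul(a, b)


def hard_sigmoid(x):
    # piecewise-linear sigmoid approximation (Keras-style)
    return tf.clip_by_value(0.2 * x + 0.5, 0.0, 1.0)


def get_init_state(x, output_dims):
    # zero initial hidden state with the same dynamic batch size as x
    state = tf.reduce_sum(tf.zeros_like(x), axis=[1, 2])  # (batch,)
    state = tf.expand_dims(state, 1)                      # (batch, 1)
    return tf.tile(state, [1, output_dims])               # (batch, output_dims)


class InitUtil(object):
    # stand-in for the repo's initializer module

    @staticmethod
    def init_weight_variable(shape, init_method='glorot_uniform', name=None):
        if init_method == 'glorot_uniform':
            limit = math.sqrt(6.0 / (shape[0] + shape[-1]))
            value = np.random.uniform(-limit, limit, shape)
        elif init_method == 'orthogonal':
            u, _, vt = np.linalg.svd(np.random.randn(*shape), full_matrices=False)
            value = u if u.shape == tuple(shape) else vt
        else:  # 'uniform'
            value = np.random.uniform(-0.05, 0.05, shape)
        return tf.Variable(value.astype('float32'), name=name)

    @staticmethod
    def init_bias_variable(shape, name=None):
        return tf.Variable(np.zeros(shape, dtype='float32'), name=name)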
def getVideoDualSemanticEmbeddingWithQuestionAttentionForDemo(x,w2v,embedded_stories_words,embedded_question,T_B,top_k,pca_mat=None):
	'''
	x: input video CNN feature of shape (batch_size, timesteps, channels, height, width)
	w2v: word2vec embedding matrix of shape (|V|, dim)
	'''
	input_shape = x.get_shape().as_list()
	w2v_shape = w2v.get_shape().as_list()
	assert(len(input_shape)==5)
	axis = [0,1,3,4,2]
	x = tf.transpose(x,perm=axis)
	x = tf.reshape(x,(-1,input_shape[2]))
	# x = tf.nn.l2_normalize(x,-1)

	if pca_mat is not None:
		linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj')
	else:
		linear_proj = InitUtil.init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj')

	x = tf.matmul(x,linear_proj) 
	x = tf.nn.l2_normalize(x,-1)


	#-----------------------
	# w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v)
	# x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|)

	regional_weight = tf.matmul(x,tf.transpose(w2v,perm=[1,0])) # (batch*timesteps*height*width, |V|): region-to-word similarities
	x = tf.matmul(regional_weight,w2v)
	
	#-----------------------

	x = tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1]))
	axis = [0,1,4,2,3]
	x = tf.transpose(x,perm=axis)
	
	# can be extended to different architecture
	x = tf.reduce_sum(x,reduction_indices=[3,4])
	x = tf.nn.l2_normalize(x,-1)

	#-----------------------

	# stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words)
	# x = batch_dot(x,stories_cov)

	frame_wise_weight = batch_dot(x,tf.transpose(embedded_stories_words,perm=[0,2,1])) # (batch, timesteps, num_sentences): frame-to-sentence similarities
	x = batch_dot(frame_wise_weight,embedded_stories_words)

	#-----------------------
	x = tf.nn.l2_normalize(x,-1)

	embedded_question = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,input_shape[1],1])

	frame_weight = tf.reduce_sum(x*embedded_question,reduction_indices=-1,keep_dims=True)
	frame_weight = tf.nn.softmax(frame_weight,dim=1)

	frame_weight = tf.tile(frame_weight,[1,1,w2v_shape[-1]])

	x = tf.reduce_sum(x*frame_weight,reduction_indices=1)
	# x = tf.nn.l2_normalize(x,-1)
	x = tf.matmul(x,T_B)

	x = tf.nn.l2_normalize(x,-1)


	# return the top-K vocabulary words per region and the per-sentence subtitle weights
	words_top_K_weight, words_top_K_indice = tf.nn.top_k(regional_weight,k=top_k)
	# 3660 is the (hardcoded) number of subtitle sentences in the story matrix
	subtitles_top_K_weight, subtitles_top_K_indice = tf.nn.top_k(frame_wise_weight,k=3660)

	words_top_K_weight = tf.reshape(words_top_K_weight,(-1,input_shape[1],input_shape[3],input_shape[4],top_k))
	words_top_K_indice = tf.reshape(words_top_K_indice,(-1,input_shape[1],input_shape[3],input_shape[4],top_k))

	subtitles_top_K_weight = tf.reshape(subtitles_top_K_weight,(-1,input_shape[1],3660))
	subtitles_top_K_indice = tf.reshape(subtitles_top_K_indice,(-1,input_shape[1],3660))

	print('words_top_K_weight:',words_top_K_weight.get_shape().as_list())
	print('words_top_K_indice:',words_top_K_indice.get_shape().as_list())

	print('subtitles_top_K_weight:',subtitles_top_K_weight.get_shape().as_list())
	print('subtitles_top_K_indice:',subtitles_top_K_indice.get_shape().as_list())

	
	return x,words_top_K_weight, words_top_K_indice,subtitles_top_K_weight, subtitles_top_K_indice
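To exercise the demo variant above, feed placeholders shaped as the docstring describes. The driver below is hypothetical: the sizes are borrowed from the shape annotations in Example No. 8, and the story matrix must contain 3660 sentences to match the hardcoded top_k above.

T, C, H, W_sp, D, V, S = 32, 512, 7, 7, 300, 26033, 3660   # illustrative sizes
x_ph     = tf.placeholder(tf.float32, (None, T, C, H, W_sp))
w2v_ph   = tf.placeholder(tf.float32, (V, D))
story_ph = tf.placeholder(tf.float32, (None, S, D))
q_ph     = tf.placeholder(tf.float32, (None, D))
T_B_ph   = tf.placeholder(tf.float32, (D, D))

emb, w_weight, w_indice, s_weight, s_indice = \
    getVideoDualSemanticEmbeddingWithQuestionAttentionForDemo(
        x_ph, w2v_ph, story_ph, q_ph, T_B_ph, top_k=10)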
Example No. 3
	def init_parameters(self):
		print('init_parameters ...')

		# encoder parameters
		# print(self.encoder_input_shape)
		encoder_i2h_shape = (self.encoder_input_shape[-1],3*self.output_dim)
		encoder_h2h_shape = (self.output_dim,self.output_dim)
		self.W_e_1 = InitUtil.init_weight_variable(encoder_i2h_shape,init_method='glorot_uniform',name="W_e_1")
		

		self.U_e_r_1 = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_r_1")
		self.U_e_z_1 = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_z_1")
		self.U_e_h_1 = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_h_1")

		self.b_e_1 = InitUtil.init_bias_variable((3*self.output_dim,),name="b_e_1")
		

		encoder_i2h_shape = (self.output_dim,3*self.output_dim)
		encoder_h2h_shape = (self.output_dim,self.output_dim)
		self.W_e_2 = InitUtil.init_weight_variable(encoder_i2h_shape,init_method='glorot_uniform',name="W_e_2")
		
		self.U_e_r_2 = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_r_2")
		self.U_e_z_2 = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_z_2")
		self.U_e_h_2 = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_h_2")

		self.b_e_2 = InitUtil.init_bias_variable((3*self.output_dim,),name="b_e_2")

		# decoder parameters
		self.T_w2v, self.T_mask = self.init_embedding_matrix()

		decoder_i2h_shape = (self.d_w2v,self.output_dim)
		decoder_h2h_shape = (self.output_dim,self.output_dim)
		self.W_d_r = InitUtil.init_weight_variable(decoder_i2h_shape,init_method='glorot_uniform',name="W_d_r")
		self.W_d_z = InitUtil.init_weight_variable(decoder_i2h_shape,init_method='glorot_uniform',name="W_d_z")
		self.W_d_h = InitUtil.init_weight_variable(decoder_i2h_shape,init_method='glorot_uniform',name="W_d_h")

		self.U_d_r = InitUtil.init_weight_variable(decoder_h2h_shape,init_method='orthogonal',name="U_d_r")
		self.U_d_z = InitUtil.init_weight_variable(decoder_h2h_shape,init_method='orthogonal',name="U_d_z")
		self.U_d_h = InitUtil.init_weight_variable(decoder_h2h_shape,init_method='orthogonal',name="U_d_h")

		self.b_d_r = InitUtil.init_bias_variable((self.output_dim,),name="b_d_r")
		self.b_d_z = InitUtil.init_bias_variable((self.output_dim,),name="b_d_z")
		self.b_d_h = InitUtil.init_bias_variable((self.output_dim,),name="b_d_h")

		self.W_a = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.attention_dim),init_method='glorot_uniform',name="W_a")
		self.U_a = InitUtil.init_weight_variable((self.output_dim,self.attention_dim),init_method='orthogonal',name="U_a")
		self.b_a = InitUtil.init_bias_variable((self.attention_dim,),name="b_a")

		self.W = InitUtil.init_weight_variable((self.attention_dim,1),init_method='glorot_uniform',name="W")

		self.A_z = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_z")

		self.A_r = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_r")

		self.A_h = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_h")

		# classification parameters
		self.W_c = InitUtil.init_weight_variable((self.output_dim,self.voc_size),init_method='uniform',name='W_c')
		self.b_c = InitUtil.init_bias_variable((self.voc_size,),name="b_c")
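The encoder weights above pack all three GRU gate projections into a single (input_dim, 3*output_dim) matrix W_e_1, so one matmul per step yields the reset, update, and candidate pre-activations, which are then sliced apart. The repo's actual step function is not shown on this page; the following is a minimal sketch consistent with these shapes, reusing hard_sigmoid from the stand-ins after Example No. 1:

def gru_step_fused(x_t, h_tm1, W, U_r, U_z, U_h, b, output_dim):
	pre = tf.nn.xw_plus_b(x_t, W, b)           # (batch, 3*output_dim), fused projection
	x_r = pre[:, :output_dim]                  # reset-gate slice
	x_z = pre[:, output_dim:2 * output_dim]    # update-gate slice
	x_h = pre[:, 2 * output_dim:]              # candidate slice
	r = hard_sigmoid(x_r + tf.matmul(h_tm1, U_r))
	z = hard_sigmoid(x_z + tf.matmul(h_tm1, U_z))
	hh = tf.nn.tanh(x_h + tf.matmul(r * h_tm1, U_h))
	return (1 - z) * hh + z * h_tm1            # same GRU interpolation as in Example No. 4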
Example No. 4
def getVideoHierarchicalSemanticWithAttendQuestionExe(x,
                                                      w2v,
                                                      embedded_stories_words,
                                                      embedded_question,
                                                      T_B,
                                                      mask_q,
                                                      pca_mat=None,
                                                      return_sequences=False):
    '''
    x: input video CNN feature of shape (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix of shape (|V|, dim)
    '''
    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert (len(input_shape) == 5)
    axis = [0, 1, 3, 4, 2]
    x = tf.transpose(x, perm=axis)
    x = tf.reshape(x, (-1, input_shape[2]))
    # x = tf.nn.l2_normalize(x,-1)

    if pca_mat is not None:
        linear_proj = tf.Variable(0.1 * pca_mat,
                                  dtype='float32',
                                  name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable(
            (input_shape[2], w2v_shape[-1]),
            init_method='uniform',
            name='visual_linear_proj')

    x = tf.matmul(x, linear_proj)
    x = tf.nn.l2_normalize(x, -1)

    #-----------------------
    w2v_cov = tf.matmul(tf.transpose(w2v, perm=[1, 0]), w2v)
    x = tf.matmul(x, w2v_cov)  # (batch_size*timesteps*height*width, dim)

    #-----------------------

    x = tf.reshape(
        x, (-1, input_shape[1], input_shape[3], input_shape[4], w2v_shape[-1]))
    axis = [0, 1, 4, 2, 3]
    x = tf.transpose(x, perm=axis)

    # can be extended to different architecture
    x = tf.reduce_sum(x, reduction_indices=[3, 4])
    x = tf.nn.l2_normalize(x, -1)

    #-----------------------

    # embedded_stories_words = tf.nn.l2_normalize(embedded_stories_words,-1)### test

    stories_cov = batch_dot(
        tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
        embedded_stories_words)
    x = batch_dot(x, stories_cov)
    #-----------------------
    # x = tf.nn.l2_normalize(x,-1)

    input_shape = x.get_shape().as_list()
    assert len(input_shape) == 3

    timesteps = input_shape[1]
    input_dims = input_shape[2]
    output_dims = input_dims
    # get initial state
    initial_state = get_init_state(x, output_dims)

    # initialize the parameters
    # W_r,U_r,b_r; W_z, U_z, b_z; W_h, U_h, b_h
    # W_r = InitUtil.init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_v_r")
    # W_z = InitUtil.init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_v_z")
    # W_h = InitUtil.init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_v_h")

    U_r = InitUtil.init_weight_variable((output_dims, output_dims),
                                        init_method='orthogonal',
                                        name="U_v_r")
    U_z = InitUtil.init_weight_variable((output_dims, output_dims),
                                        init_method='orthogonal',
                                        name="U_v_z")
    U_h = InitUtil.init_weight_variable((output_dims, output_dims),
                                        init_method='orthogonal',
                                        name="U_v_h")

    # b_r = InitUtil.init_bias_variable((output_dims,),name="b_v_r")
    # b_z = InitUtil.init_bias_variable((output_dims,),name="b_v_z")
    # b_h = InitUtil.init_bias_variable((output_dims,),name="b_v_h")

    # attention
    attention_dim = 30
    W_a = InitUtil.init_weight_variable((input_dims, attention_dim),
                                        init_method='glorot_uniform',
                                        name="W_a")
    U_a = InitUtil.init_weight_variable((output_dims, attention_dim),
                                        init_method='orthogonal',
                                        name="U_a")
    b_a = InitUtil.init_bias_variable((attention_dim, ), name="b_a")

    W = InitUtil.init_weight_variable((attention_dim, 1),
                                      init_method='glorot_uniform',
                                      name="W")

    A_z = InitUtil.init_weight_variable((input_dims, output_dims),
                                        init_method='orthogonal',
                                        name="A_z")

    A_r = InitUtil.init_weight_variable((input_dims, output_dims),
                                        init_method='orthogonal',
                                        name="A_r")

    A_h = InitUtil.init_weight_variable((input_dims, output_dims),
                                        init_method='orthogonal',
                                        name="A_h")

    # batch_size x timesteps x dim -> timesteps x batch_size x dim
    axis = [1, 0] + list(range(2, 3))  # axis = [1,0,2]
    x = tf.transpose(
        x, perm=axis)  # permute input x to (timesteps, batch_size, input_dims)

    input_x = tf.TensorArray(dtype=x.dtype,
                             size=timesteps,
                             tensor_array_name='input_x')

    if hasattr(input_x, 'unstack'):
        input_x = input_x.unstack(x)
    else:
        input_x = input_x.unpack(x)

    hidden_state_x = tf.TensorArray(dtype=tf.float32,
                                    size=timesteps,
                                    tensor_array_name='hidden_state_x')

    timesteps_q = embedded_question.get_shape().as_list()[1]

    def step(time, hidden_state_x, h_tm1):
        x_t = input_x.read(time)  # batch_size * dim

        ori_feature = tf.reshape(embedded_question, (-1, input_dims))
        attend_wx = tf.reshape(tf.nn.xw_plus_b(ori_feature, W_a, b_a),
                               (-1, timesteps_q, attention_dim))
        attend_uh_tm1 = tf.tile(tf.expand_dims(tf.matmul(h_tm1, U_a), dim=1),
                                [1, timesteps_q, 1])

        attend_e = tf.nn.tanh(attend_wx + attend_uh_tm1)
        attend_e = tf.matmul(tf.reshape(attend_e, (-1, attention_dim)),
                             W)  # (batch_size*timesteps_q, 1)
        # attend_e = tf.reshape(attend_e,(-1,attention_dim))
        attend_e = tf.where(tf.reshape(mask_q, (-1, )), attend_e,
                            tf.zeros_like(attend_e))
        attend_e = tf.nn.softmax(tf.reshape(attend_e, (-1, timesteps_q, 1)),
                                 dim=1)

        attend_fea = embedded_question * tf.tile(attend_e, [1, 1, input_dims])
        attend_fea = tf.reduce_sum(attend_fea, reduction_indices=1)

        # preprocess_x_r = tf.nn.xw_plus_b(x_t, W_r, b_r)
        # preprocess_x_z = tf.nn.xw_plus_b(x_t, W_z, b_z)
        # preprocess_x_h = tf.nn.xw_plus_b(x_t, W_h, b_h)

        r = hard_sigmoid(x_t + tf.matmul(h_tm1, U_r) +
                         tf.matmul(attend_fea, A_r))
        z = hard_sigmoid(x_t + tf.matmul(h_tm1, U_z) +
                         tf.matmul(attend_fea, A_z))
        hh = tf.nn.tanh(x_t + tf.matmul(r * h_tm1, U_h) +
                        tf.matmul(attend_fea, A_h))

        h = (1 - z) * hh + z * h_tm1

        hidden_state_x = hidden_state_x.write(time, h)

        return (time + 1, hidden_state_x, h)

    time = tf.constant(0, dtype='int32', name='time')

    ret_out = tf.while_loop(cond=lambda time, *_: time < timesteps,
                            body=step,
                            loop_vars=(time, hidden_state_x, initial_state),
                            parallel_iterations=32,
                            swap_memory=True)

    hidden_state_x = ret_out[1]
    last_output = ret_out[-1]

    if hasattr(hidden_state_x, 'stack'):
        outputs = hidden_state_x.stack()
        print('stack')
    else:
        outputs = hidden_state_x.pack()

    axis = [1, 0] + list(range(2, 3))
    outputs = tf.transpose(outputs, perm=axis)

    if return_sequences:
        return outputs
    else:
        return last_output
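A hypothetical driver for the function above: the sizes are illustrative, the question is supplied per token, and mask_q flags the valid (non-padding) question positions.

T, C, H, W_sp, D, S, T_q = 32, 512, 7, 7, 300, 3660, 25    # illustrative sizes
x_ph     = tf.placeholder(tf.float32, (None, T, C, H, W_sp))
w2v_ph   = tf.placeholder(tf.float32, (26033, D))
story_ph = tf.placeholder(tf.float32, (None, S, D))
q_ph     = tf.placeholder(tf.float32, (None, T_q, D))      # per-token question embedding
T_B_ph   = tf.placeholder(tf.float32, (D, D))
mask_ph  = tf.placeholder(tf.bool, (None, T_q))            # True at real tokens

h_last = getVideoHierarchicalSemanticWithAttendQuestionExe(
    x_ph, w2v_ph, story_ph, q_ph, T_B_ph, mask_ph)         # (batch, 300)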
Example No. 5
	def init_parameters(self):
		print('init_parameters ...')

		# encoder parameters
		# print(self.encoder_input_shape)
		encoder_i2h_shape = (self.encoder_input_shape[-1],4*self.output_dim)
		encoder_h2h_shape = (self.output_dim,4*self.output_dim)


		self.W_e = InitUtil.init_weight_variable(encoder_i2h_shape,init_method='glorot_uniform',name="W_e")
		self.U_e = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e")
		self.b_e = InitUtil.init_bias_variable((4*self.output_dim,),name="b_e")


		# decoder parameters
		self.T_w2v, self.T_mask = self.init_embedding_matrix()

		decoder_i2h_shape = (self.d_w2v,4*self.output_dim)
		decoder_h2h_shape = (self.output_dim,4*self.output_dim)
		self.W_d = InitUtil.init_weight_variable(decoder_i2h_shape,init_method='glorot_uniform',name="W_d")
		self.U_d = InitUtil.init_weight_variable(decoder_h2h_shape,init_method='orthogonal',name="U_d")
		self.b_d = InitUtil.init_bias_variable((4*self.output_dim,),name="b_d")

		self.W_a = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.attention_dim),init_method='glorot_uniform',name="W_a")
		self.U_a = InitUtil.init_weight_variable((self.output_dim,self.attention_dim),init_method='orthogonal',name="U_a")
		self.b_a = InitUtil.init_bias_variable((self.attention_dim,),name="b_a")

		self.W = InitUtil.init_weight_variable((self.attention_dim,1),init_method='glorot_uniform',name="W")

		self.A = InitUtil.init_weight_variable((self.encoder_input_shape[-1],4*self.output_dim),init_method='orthogonal',name="A")

		# multirate
		self.block_length = int(math.ceil(self.output_dim/len(self.T_k)))
		print('block_length:%d'%self.block_length)


		# classification parameters
		self.W_c = InitUtil.init_weight_variable((self.output_dim,self.voc_size),init_method='uniform',name='W_c')
		self.b_c = InitUtil.init_bias_variable((self.voc_size,),name="b_c")
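Here the 4*output_dim shapes signal a fused LSTM parameterization: one input matmul plus one hidden-state matmul produce all four gate pre-activations at once. The step function itself is not shown on this page; a minimal sketch consistent with these shapes, with an assumed (i, f, o, g) gate ordering and the hard_sigmoid stand-in from Example No. 1:

def lstm_step_fused(x_t, h_tm1, c_tm1, W, U, b, output_dim):
	pre = tf.nn.xw_plus_b(x_t, W, b) + tf.matmul(h_tm1, U)   # (batch, 4*output_dim)
	i = hard_sigmoid(pre[:, :output_dim])                    # input gate
	f = hard_sigmoid(pre[:, output_dim:2 * output_dim])      # forget gate
	o = hard_sigmoid(pre[:, 2 * output_dim:3 * output_dim])  # output gate
	g = tf.nn.tanh(pre[:, 3 * output_dim:])                  # candidate cell state
	c = f * c_tm1 + i * g                                    # new cell state
	h = o * tf.nn.tanh(c)                                    # new hidden state
	return h, c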
Example No. 6
def getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(embeded_stories, d_lproj, x,w2v,embedded_stories_words,embedded_question,T_B,pca_mat=None,return_sequences=True):
    '''
    x: input video CNN feature of shape (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix of shape (|V|, dim)
    '''
    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert(len(input_shape)==5)
    axis = [0,1,3,4,2]
    x = tf.transpose(x,perm=axis)
    x = tf.reshape(x,(-1,input_shape[2]))

    if pca_mat is not None:
        linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj')

    x = tf.matmul(x,linear_proj) 
    x = tf.nn.l2_normalize(x,-1)


    
    #-----------------------
    w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v)
    x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, dim)

    #-----------------------

    x = tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1]))
    axis = [0,1,4,2,3]
    x = tf.transpose(x,perm=axis)
    
    # can be extended to different architecture
    x = tf.reduce_sum(x,reduction_indices=[3,4])
    x = tf.nn.l2_normalize(x,-1)

    #-----------------------

    stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words)
    x_out = batch_dot(x,stories_cov)

    #-----------------------
    x = tf.nn.l2_normalize(x_out,-1)

    embedded_question_use = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,input_shape[1],1])

    frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True)
    frame_weight = tf.nn.softmax(frame_weight,dim=1)

    frame_weight = tf.tile(frame_weight,[1,1,w2v_shape[-1]])

    x_weight_new = tf.reduce_sum(x*frame_weight,reduction_indices=1)

    x_weight_use = tf.expand_dims(x_weight_new, dim=1)

    story_weight = tf.matmul(x_weight_use,tf.transpose(embedded_stories_words,perm=[0,2,1]))
    story_weight = tf.nn.relu(story_weight)

    embedded_stories_words = tf.multiply(tf.transpose(story_weight,perm=[0,2,1]), embedded_stories_words)
    stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words)
    x = batch_dot(x,stories_cov)
    # note: the line below re-normalizes x_out, so the batch_dot result above is discarded
    x = tf.nn.l2_normalize(x_out,-1)

    #-------------------------------------------------------------------------------------------------------------
    stories_shape = embeded_stories.get_shape().as_list()
    embeded_question_shape = embedded_question.get_shape().as_list()
    num_of_sentence = stories_shape[-3]
    input_dims = stories_shape[-1]
    output_dims = embeded_question_shape[-1]
    
    print('embeded_question_shape', embeded_question_shape)
    print('num_of_sentence', num_of_sentence)
    
    print('output_dims', output_dims)
    print('stories_shape', stories_shape)

    
    embeded_question = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,num_of_sentence,1])

    sen_weight = tf.reduce_sum(embeded_question*embedded_stories_words,reduction_indices=-1,keep_dims=True)
    sen_weight = tf.nn.relu(sen_weight)
    sen_weight = tf.tile(sen_weight,[1,1,output_dims])
    if return_sequences:
        embeded_stories_used = embedded_stories_words*sen_weight
    else:
        embeded_stories_used = tf.reduce_sum(embedded_stories_words*sen_weight,reduction_indices=1)
 
    
    #-------------------------------------------------------------------------------------------------------------
    stories_cov = batch_dot(tf.transpose(embeded_stories_used,perm=[0,2,1]),embeded_stories_used)
    x = batch_dot(x,stories_cov)

    #-----------------------
    x = tf.nn.l2_normalize(x,-1)

    frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True)
    frame_weight = tf.nn.softmax(frame_weight,dim=1)

    frame_weight = tf.tile(frame_weight,[1,1,w2v_shape[-1]])

    x = tf.reduce_sum(x*frame_weight,reduction_indices=1)

    #-----------------------------------------------

    x = tf.matmul(x,T_B)

    x = tf.nn.l2_normalize(x,-1)
    
    return x
Example No. 7
	def init_parameters(self):
		print('init_parameters ...')

		# encoder parameters
		# print(self.encoder_input_shape)
		encoder_i2h_shape = (self.encoder_input_shape[-1],self.output_dim)
		encoder_h2h_shape = (self.output_dim,self.output_dim)
		self.W_e_r = InitUtil.init_weight_variable(encoder_i2h_shape,init_method='glorot_uniform',name="W_e_r")
		self.W_e_z = InitUtil.init_weight_variable(encoder_i2h_shape,init_method='glorot_uniform',name="W_e_z")
		self.W_e_h = InitUtil.init_weight_variable(encoder_i2h_shape,init_method='glorot_uniform',name="W_e_h")


		self.U_e_r = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_r")
		self.U_e_z = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_z")
		self.U_e_h = InitUtil.init_weight_variable(encoder_h2h_shape,init_method='orthogonal',name="U_e_h")

		self.b_e_r = InitUtil.init_bias_variable((self.output_dim,),name="b_e_r")
		self.b_e_z = InitUtil.init_bias_variable((self.output_dim,),name="b_e_z")
		self.b_e_h = InitUtil.init_bias_variable((self.output_dim,),name="b_e_h")


		# decoder parameters
		self.T_w2v, self.T_mask = self.init_embedding_matrix()

		decoder_i2h_shape = (self.d_w2v,self.output_dim)
		decoder_h2h_shape = (self.output_dim,self.output_dim)
		self.W_d_r = InitUtil.init_weight_variable(decoder_i2h_shape,init_method='glorot_uniform',name="W_d_r")
		self.W_d_z = InitUtil.init_weight_variable(decoder_i2h_shape,init_method='glorot_uniform',name="W_d_z")
		self.W_d_h = InitUtil.init_weight_variable(decoder_i2h_shape,init_method='glorot_uniform',name="W_d_h")

		self.U_d_r = InitUtil.init_weight_variable(decoder_h2h_shape,init_method='orthogonal',name="U_d_r")
		self.U_d_z = InitUtil.init_weight_variable(decoder_h2h_shape,init_method='orthogonal',name="U_d_z")
		self.U_d_h = InitUtil.init_weight_variable(decoder_h2h_shape,init_method='orthogonal',name="U_d_h")

		self.b_d_r = InitUtil.init_bias_variable((self.output_dim,),name="b_d_r")
		self.b_d_z = InitUtil.init_bias_variable((self.output_dim,),name="b_d_z")
		self.b_d_h = InitUtil.init_bias_variable((self.output_dim,),name="b_d_h")

		self.W_a = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.attention_dim),init_method='glorot_uniform',name="W_a")
		self.U_a = InitUtil.init_weight_variable((self.output_dim,self.attention_dim),init_method='orthogonal',name="U_a")
		self.b_a = InitUtil.init_bias_variable((self.attention_dim,),name="b_a")

		self.W = InitUtil.init_weight_variable((self.attention_dim,1),init_method='glorot_uniform',name="W")

		self.A_z = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_z")

		self.A_r = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_r")

		self.A_h = InitUtil.init_weight_variable((self.encoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_h")

		# predict feature decoder
		predict_decoder_i2h_shape = (self.unsup_decoder_input_shape[-1],self.output_dim)
		predict_decoder_h2h_shape = (self.output_dim,self.output_dim)
		predict_decoder_h2o_shape = (self.output_dim,self.unsup_decoder_input_shape[-1])
		self.W_p_r = InitUtil.init_weight_variable(predict_decoder_i2h_shape,init_method='glorot_uniform',name="W_p_r")
		self.W_p_z = InitUtil.init_weight_variable(predict_decoder_i2h_shape,init_method='glorot_uniform',name="W_p_z")
		self.W_p_h = InitUtil.init_weight_variable(predict_decoder_i2h_shape,init_method='glorot_uniform',name="W_p_h")

		self.W_p_o = InitUtil.init_weight_variable(predict_decoder_h2o_shape,init_method='glorot_uniform',name='W_p_o')

		self.U_p_r = InitUtil.init_weight_variable(predict_decoder_h2h_shape,init_method='orthogonal',name="U_p_r")
		self.U_p_z = InitUtil.init_weight_variable(predict_decoder_h2h_shape,init_method='orthogonal',name="U_p_z")
		self.U_p_h = InitUtil.init_weight_variable(predict_decoder_h2h_shape,init_method='orthogonal',name="U_p_h")

		self.b_p_r = InitUtil.init_bias_variable((self.output_dim,),name="b_p_r")
		self.b_p_z = InitUtil.init_bias_variable((self.output_dim,),name="b_p_z")
		self.b_p_h = InitUtil.init_bias_variable((self.output_dim,),name="b_p_h")

		self.W_p_a = InitUtil.init_weight_variable((self.unsup_decoder_input_shape[-1],self.attention_dim),init_method='glorot_uniform',name="W_p_a")
		self.U_p_a = InitUtil.init_weight_variable((self.output_dim,self.attention_dim),init_method='orthogonal',name="U_p_a")
		self.b_p_a = InitUtil.init_bias_variable((self.attention_dim,),name="b_p_a")

		self.W_p = InitUtil.init_weight_variable((self.attention_dim,1),init_method='glorot_uniform',name="W_p")

		self.A_p_z = InitUtil.init_weight_variable((self.unsup_decoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_p_z")

		self.A_p_r = InitUtil.init_weight_variable((self.unsup_decoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_p_r")

		self.A_p_h = InitUtil.init_weight_variable((self.unsup_decoder_input_shape[-1],self.output_dim),init_method='orthogonal',name="A_p_h")


		# multirate
		self.block_length = int(math.ceil(self.output_dim/len(self.T_k)))
		print('block_length:%d'%self.block_length)

		# classification parameters
		self.W_c = InitUtil.init_weight_variable((self.output_dim,self.voc_size),init_method='uniform',name='W_c')
		self.b_c = InitUtil.init_bias_variable((self.voc_size,),name="b_c")
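self.block_length splits the output_dim hidden units into len(self.T_k) equal blocks, one per rate in T_k, presumably so that block k is updated only every T_k[k] steps (a clockwork-RNN-style multirate scheme; the update logic itself is not shown on this page). With illustrative values:

import math

output_dim, T_k = 512, [1, 2, 4, 8]                   # illustrative values
block_length = int(math.ceil(output_dim / float(len(T_k))))
print(block_length)                                   # -> 128: four 128-unit blocks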
Example No. 8
def getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(
        embeded_stories,
        d_lproj,
        x,
        w2v,
        embedded_stories_words,
        embedded_question,
        T_B,
        pca_mat=None,
        return_sequences=True):
    '''
    x: input video CNN feature of shape (batch_size, timesteps, channels, height, width)
    w2v: word2vec embedding matrix of shape (|V|, dim)
    '''

    # embeded_stories             shape=(?, 3660, 40, 300)
    # d_lproj                     300
    # x                           shape=(?, 32, 512, 7, 7)
    # w2v                         shape=(26033, 300)
    # embedded_stories_words      shape=(?, 3660, 300)
    # embedded_question           shape=(?, 300)
    # T_B                         shape=(300, 300)
    # pca_mat                     (512, 300)

    # pdb.set_trace()

    input_shape = x.get_shape().as_list()
    w2v_shape = w2v.get_shape().as_list()
    assert (len(input_shape) == 5)
    axis = [0, 1, 3, 4, 2]
    x = tf.transpose(x, perm=axis)
    x = tf.reshape(x, (-1, input_shape[2]))

    if pca_mat is not None:
        linear_proj = tf.Variable(0.1 * pca_mat,
                                  dtype='float32',
                                  name='visual_linear_proj')
    else:
        linear_proj = InitUtil.init_weight_variable(
            (input_shape[2], w2v_shape[-1]),
            init_method='uniform',
            name='visual_linear_proj')

    x = tf.matmul(x, linear_proj)
    x = tf.nn.l2_normalize(x, -1)

    #-----------------------
    w2v_cov = tf.matmul(tf.transpose(w2v, perm=[1, 0]), w2v)
    x = tf.matmul(x, w2v_cov)  # (batch_size*timesteps*height*width, dim)

    #-----------------------

    x = tf.reshape(
        x, (-1, input_shape[1], input_shape[3], input_shape[4], w2v_shape[-1]))
    axis = [0, 1, 4, 2, 3]
    x = tf.transpose(x, perm=axis)

    # can be extended to different architecture
    x = tf.reduce_sum(x, reduction_indices=[3, 4])
    x = tf.nn.l2_normalize(x, -1)

    # pdb.set_trace()

    # Up to this point the computation matches the plain variant, except that
    # the 7x7 regions are summed into one vector per frame while the 32
    # timesteps are kept.
    # x --> shape=(?, 32, 300)

    #-----------------------

    # embedded_stories_words --> shape=(?, 3660, 300)

    stories_cov = batch_dot(
        tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
        embedded_stories_words)  #  shape=(?, 300, 300)
    x_out = batch_dot(
        x, stories_cov
    )  # shape=(?, 32, 300) dot with  shape=(?, 300, 300) to get again  shape=(?, 32, 300)

    #-----------------------
    x = tf.nn.l2_normalize(x_out, -1)  # (?, 32, 300)

    # embedded_question: (?, 300)

    embedded_question_use = tf.tile(
        tf.expand_dims(embedded_question,
                       dim=1), [1, input_shape[1], 1])  # shape=(?, 32, 300)
    # the tile above replicates (?, 300) into (?, 32, 300)

    frame_weight = tf.reduce_sum(x * embedded_question_use,
                                 reduction_indices=-1,
                                 keep_dims=True)  # (?, 32, 1)

    frame_weight = tf.nn.softmax(frame_weight, dim=1)

    # frame_weight holds the attention weight of each frame

    frame_weight = tf.tile(frame_weight,
                           [1, 1, w2v_shape[-1]])  #  shape=(?, 32, 300)

    x_weight_new = tf.reduce_sum(x * frame_weight, reduction_indices=1)

    x_weight_use = tf.expand_dims(x_weight_new, dim=1)  # (?, 1, 300)

    story_weight = tf.matmul(
        x_weight_use, tf.transpose(embedded_stories_words, perm=[0, 2, 1]))

    story_weight = tf.nn.relu(story_weight)  # shape=(?, 1, 3660)

    embedded_stories_words = tf.multiply(
        tf.transpose(story_weight, perm=[0, 2, 1]),
        embedded_stories_words)  # shape=(?, 3660, 300)

    stories_cov = batch_dot(
        tf.transpose(embedded_stories_words, perm=[0, 2, 1]),
        embedded_stories_words)  # ? x 300 x 300

    # The block above builds a story-weighted (subtitle-weighted) 300x300
    # covariance and multiplies the video features by it once more.

    x = batch_dot(x, stories_cov)

    # note: the line below re-normalizes x_out, so the batch_dot result above is discarded
    x = tf.nn.l2_normalize(x_out, -1)  # (?, 32, 300)

    # The projected video features (built from the static word embeddings),
    # after subtitle-based attention (the dynamic part, since the matching
    # subtitles are used), keep shape (?, 32, 300), preserving temporal context.

    #-------------------------------------------------------------------------------------------------------------

    stories_shape = embeded_stories.get_shape().as_list()
    embeded_question_shape = embedded_question.get_shape().as_list()
    num_of_sentence = stories_shape[-3]
    input_dims = stories_shape[-1]
    output_dims = embeded_question_shape[-1]

    print('embeded_question_shape', embeded_question_shape)  # [None, 300]
    print('num_of_sentence', num_of_sentence)  # 3660

    print('output_dims', output_dims)  # 300
    print('stories_shape', stories_shape)  # [None, 3660, 40, 300]

    # So far: subtitle-based attention over the 300-d projected visual features.
    # Next: question-based attention. First attend over the subtitles with the
    # question, then apply the result to the visual features.

    embeded_question = tf.tile(
        tf.expand_dims(embedded_question, dim=1),
        [1, num_of_sentence, 1
         ])  # from shape=(?, 300) is expanded to  shape=(?, 3660, 300)

    sen_weight = tf.reduce_sum(
        embeded_question * embedded_stories_words,
        reduction_indices=-1,
        keep_dims=True)  # (?, 3660, 1): per-sentence weights

    sen_weight = tf.nn.relu(sen_weight)  #  shape=(?, 3660, 1)
    sen_weight = tf.tile(sen_weight,
                         [1, 1, output_dims])  #  shape=(?, 3660, 300)
    if return_sequences:  # True in this configuration
        embeded_stories_used = embedded_stories_words * sen_weight  # shape=(?, 3660, 300)
    else:
        embeded_stories_used = tf.reduce_sum(embedded_stories_words *
                                             sen_weight,
                                             reduction_indices=1)

    #-------------------------------------------------------------------------------------------------------------
    stories_cov = batch_dot(tf.transpose(embeded_stories_used, perm=[0, 2, 1]),
                            embeded_stories_used)  # shape=(?, 300, 300)

    x = batch_dot(x, stories_cov)  # shape=(?, 32, 300)

    #-----------------------
    x = tf.nn.l2_normalize(x, -1)

    frame_weight = tf.reduce_sum(x * embedded_question_use,
                                 reduction_indices=-1,
                                 keep_dims=True)  # shape=(?, 32, 1)

    frame_weight = tf.nn.softmax(frame_weight, dim=1)

    frame_weight = tf.tile(frame_weight,
                           [1, 1, w2v_shape[-1]])  # shape=(?, 32, 300)

    x = tf.reduce_sum(x * frame_weight, reduction_indices=1)  # shape=(?, 300)

    #-----------------------------------------------

    x = tf.matmul(x, T_B)  # shape=(?, 300)

    x = tf.nn.l2_normalize(x, -1)

    return x  # final output, shape (?, 300)
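A hypothetical driver wired to the shapes annotated at the top of this function, using a random stand-in for the PCA matrix and the helper stand-ins sketched after Example No. 1:

stories_ph = tf.placeholder(tf.float32, (None, 3660, 40, 300))  # embeded_stories
x_ph       = tf.placeholder(tf.float32, (None, 32, 512, 7, 7))
w2v_ph     = tf.placeholder(tf.float32, (26033, 300))
story_w_ph = tf.placeholder(tf.float32, (None, 3660, 300))      # embedded_stories_words
q_ph       = tf.placeholder(tf.float32, (None, 300))
T_B_ph     = tf.placeholder(tf.float32, (300, 300))
pca        = np.random.randn(512, 300).astype('float32')        # stand-in PCA projection

emb = getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(
    stories_ph, 300, x_ph, w2v_ph, story_w_ph, q_ph, T_B_ph, pca_mat=pca)
# emb: (?, 300)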