def step(prev_state_c, prev_state_h, tokens, seq_length, stop_indicator):
    input = tf.gather(tokens, tf.shape(tokens)[0] - 1)

    # Look for newly finished dialogues
    is_stop_token = tf.equal(input, stop_token)
    is_stop_dialogue_token = tf.equal(input, stop_dialogue_token)
    is_stop = tf.logical_or(is_stop_token, is_stop_dialogue_token)
    stop_indicator = tf.logical_or(stop_indicator, is_stop)  # flag newly finished dialogues

    # increment seq_length when the dialogue is not over
    seq_length = tf.where(stop_indicator, seq_length, tf.add(seq_length, 1))

    # compute the next words. TODO: factorize with qgen.. but how?!
    with tf.variable_scope(self.scope_name, reuse=True):
        word_emb = utils.get_embedding(
            input,
            n_words=tokenizer.no_words,
            n_dim=config['word_embedding_size'],
            scope="word_embedding",
            reuse=True)

        inp_emb = tf.concat([word_emb, self.image_emb], axis=1)

        with tf.variable_scope("word_decoder"):
            lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                config['num_lstm_units'],
                layer_norm=False,
                dropout_keep_prob=1.0,
                reuse=True)

            state = tf.contrib.rnn.LSTMStateTuple(c=prev_state_c, h=prev_state_h)
            out, state = lstm_cell(inp_emb, state)

            # store/update the state when the dialogue is not finished (after sampling the <?> token)
            cond = tf.greater_equal(seq_length, tf.subtract(tf.reduce_max(seq_length), 1))
            state_c = tf.where(cond, state.c, prev_state_c)
            state_h = tf.where(cond, state.h, prev_state_h)

        with tf.variable_scope('decoder_output'):
            output = utils.fully_connected(state_h, tokenizer.no_words, reuse=True)

            sampled_tokens = tf.cond(
                self.greedy,
                lambda: tf.argmax(output, 1),
                lambda: tf.reshape(tf.multinomial(output, 1), [-1]))
            sampled_tokens = tf.cast(sampled_tokens, tf.int32)

        tokens = tf.concat([tokens, tf.expand_dims(sampled_tokens, 0)], axis=0)  # check axis!

    return state_c, state_h, tokens, seq_length, stop_indicator
def compute_attention(feature_maps, context, no_mlp_units, reuse=False):
    with tf.variable_scope("attention"):
        if len(feature_maps.get_shape()) == 3:
            h = tf.shape(feature_maps)[1]  # when the shape is dynamic (attention over lstm)
            w = 1
            c = int(feature_maps.get_shape()[2])
        else:
            h = int(feature_maps.get_shape()[1])
            w = int(feature_maps.get_shape()[2])
            c = int(feature_maps.get_shape()[3])
        s = int(context.get_shape()[1])

        feature_maps = tf.reshape(feature_maps, shape=[-1, h * w, c])

        context = tf.expand_dims(context, axis=1)
        context = tf.tile(context, [1, h * w, 1])

        embedding = tf.concat([feature_maps, context], axis=2)
        embedding = tf.reshape(embedding, shape=[-1, s + c])

        # compute the evidence from the embedding
        with tf.variable_scope("mlp"):
            e = utils.fully_connected(embedding, no_mlp_units, scope='hidden_layer',
                                      activation="relu", reuse=reuse)
            e = utils.fully_connected(e, 1, scope='out', reuse=reuse)

        e = tf.reshape(e, shape=[-1, h * w, 1])

        # compute the softmax over the evidence
        alpha = tf.nn.softmax(e, dim=1)

        # apply soft attention
        soft_attention = feature_maps * alpha
        soft_attention = tf.reduce_sum(soft_attention, axis=1)

    return soft_attention
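# --- Illustrative sketch (not part of the model): how the soft attention above
# collapses an h x w feature map into a single vector. Pure NumPy, with
# hypothetical shapes (batch=2, h=w=7, c=2048); the evidence MLP is replaced by
# random scores since only the softmax / weighted-sum mechanics are shown here.
import numpy as np

def soft_attention_sketch(feature_maps, evidence):
    # feature_maps: [batch, h*w, c], evidence: [batch, h*w, 1]
    e = evidence - evidence.max(axis=1, keepdims=True)        # subtract max for stability
    alpha = np.exp(e) / np.exp(e).sum(axis=1, keepdims=True)  # softmax over spatial locations
    return (feature_maps * alpha).sum(axis=1)                 # weighted sum -> [batch, c]

features = np.random.rand(2, 7 * 7, 2048)
scores = np.random.rand(2, 7 * 7, 1)
assert soft_attention_sketch(features, scores).shape == (2, 2048)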
def create_cbn_input(self, feature_maps):
    no_features = int(feature_maps.get_shape()[3])
    batch_size = tf.shape(feature_maps)[0]

    if self.use_betas:
        h_betas = utils.fully_connected(
            self.lstm_state,
            self.cbn_embedding_size,
            weight_initializer=RandomUniform(-1e-4, 1e-4),
            scope="hidden_betas",
            activation='relu')
        delta_betas = utils.fully_connected(
            h_betas,
            no_features,
            scope="delta_beta",
            weight_initializer=RandomUniform(-1e-4, 1e-4),
            use_bias=False)
    else:
        delta_betas = tf.tile(tf.constant(0.0, shape=[1, no_features]),
                              tf.stack([batch_size, 1]))

    if self.use_gammas:
        h_gammas = utils.fully_connected(
            self.lstm_state,
            self.cbn_embedding_size,
            weight_initializer=RandomUniform(-1e-4, 1e-4),
            scope="hidden_gammas",
            activation='relu')
        delta_gammas = utils.fully_connected(
            h_gammas,
            no_features,
            scope="delta_gamma",
            weight_initializer=RandomUniform(-1e-4, 1e-4))
    else:
        delta_gammas = tf.tile(tf.constant(0.0, shape=[1, no_features]),
                               tf.stack([batch_size, 1]))

    return delta_betas, delta_gammas
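# --- Illustrative sketch (an assumption about how the deltas above are consumed,
# following the usual Conditional Batch Norm formulation): the predicted
# delta_beta / delta_gamma are added to the batch-norm's beta / gamma before the
# feature map is normalised. NumPy only, with made-up shapes; this is not this
# repository's actual CBN layer.
import numpy as np

def cbn_sketch(x, mean, var, beta, gamma, delta_beta, delta_gamma, eps=1e-5):
    # x: [batch, h, w, c]; mean/var/beta/gamma: [c]; delta_*: [batch, c]
    x_hat = (x - mean) / np.sqrt(var + eps)
    gamma_c = (gamma + delta_gamma)[:, None, None, :]   # language-conditioned scale
    beta_c = (beta + delta_beta)[:, None, None, :]      # language-conditioned shift
    return gamma_c * x_hat + beta_c

x = np.random.rand(2, 7, 7, 4)
out = cbn_sketch(x, x.mean((0, 1, 2)), x.var((0, 1, 2)),
                 beta=np.zeros(4), gamma=np.ones(4),
                 delta_beta=np.zeros((2, 4)), delta_gamma=0.1 * np.random.rand(2, 4))
assert out.shape == x.shape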
def __init__(self, config, num_words, policy_gradient, device='', reuse=False):
    AbstractNetwork.__init__(self, "qgen", device=device)

    # Create the scope for this graph
    with tf.variable_scope(self.scope_name, reuse=reuse):

        mini_batch_size = None

        # Picture
        self.images = tf.placeholder(tf.float32, [mini_batch_size] + config['image']["dim"], name='images')

        # Question
        self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None], name='dialogues')
        self.answer_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='answer_mask')  # 1 if keep and (1 q/a 1) for (START q/a STOP)
        self.padding_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='padding_mask')
        self.seq_length = tf.placeholder(tf.int32, [mini_batch_size], name='seq_length')

        # Rewards
        self.cum_rewards = tf.placeholder(tf.float32, shape=[mini_batch_size, None], name='cum_reward')

        # DECODER hidden state (for beam search)
        zero_state = tf.zeros([1, config['num_lstm_units']])  # default LSTM state is a zero vector
        zero_state = tf.tile(zero_state, [tf.shape(self.images)[0], 1])  # trick to create a dynamically-sized zero tensor

        self.decoder_zero_state_c = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_c")
        self.decoder_zero_state_h = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_h")
        decoder_initial_state = tf.contrib.rnn.LSTMStateTuple(c=self.decoder_zero_state_c, h=self.decoder_zero_state_h)

        # Misc
        self.is_training = tf.placeholder(tf.bool, name='is_training')
        self.greedy = tf.placeholder_with_default(False, shape=(), name="greedy")  # used by the sampling graph
        self.samples = None

        # remove last token
        input_dialogues = self.dialogues[:, :-1]
        input_seq_length = self.seq_length - 1

        # remove first token (= start token)
        rewards = self.cum_rewards[:, 1:]
        target_words = self.dialogues[:, 1:]

        # to understand the padding:
        # input
        #   <start>  is  it  a  blue  <?>  <yes>  is  it  a  car  <?>  <no>  <stop_dialogue>
        # target
        #   is       it  a   blue <?>  -   is     it  a   car <?>  -    <stop_dialogue>  -

        # image processing
        if len(config["image"]["dim"]) == 1:
            self.image_out = self.images
        else:
            self.image_out = get_attention(self.images, None, "none")  # TODO: improve by using the previous lstm state?

        # Reduce the embedding size of the image
        with tf.variable_scope('picture_embedding'):
            self.picture_emb = utils.fully_connected(self.image_out, config['picture_embedding_size'])
            picture_emb = tf.expand_dims(self.picture_emb, 1)
            picture_emb = tf.tile(picture_emb, [1, tf.shape(input_dialogues)[1], 1])

        # Compute the question embedding
        input_words = utils.get_embedding(
            input_dialogues,
            n_words=num_words,
            n_dim=config['word_embedding_size'],
            scope="word_embedding")

        # concat word embedding and picture embedding
        decoder_input = tf.concat([input_words, picture_emb], axis=2, name="concat_full_embedding")

        # encode one word + picture
        decoder_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
            config['num_lstm_units'],
            layer_norm=False,
            dropout_keep_prob=1.0,
            reuse=reuse)

        self.decoder_output, self.decoder_state = tf.nn.dynamic_rnn(
            cell=decoder_lstm_cell,
            inputs=decoder_input,
            dtype=tf.float32,
            initial_state=decoder_initial_state,
            sequence_length=input_seq_length,
            scope="word_decoder")  # TODO: use multi-layer RNN

        max_sequence = tf.reduce_max(self.seq_length)

        # compute the softmax for evaluation
        with tf.variable_scope('decoder_output'):
            flat_decoder_output = tf.reshape(self.decoder_output, [-1, decoder_lstm_cell.output_size])
            flat_mlp_output = utils.fully_connected(flat_decoder_output, num_words)

            # retrieve the batch/dialogue format
            mlp_output = tf.reshape(flat_mlp_output, [tf.shape(self.seq_length)[0], max_sequence - 1, num_words])  # ignore the STOP token

            self.softmax_output = tf.nn.softmax(mlp_output, name="softmax")
            self.argmax_output = tf.argmax(mlp_output, axis=2)

            self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=mlp_output, labels=target_words)

        # compute the maximum likelihood loss
        with tf.variable_scope('ml_loss'):
            ml_loss = tf.identity(self.cross_entropy_loss)
            ml_loss *= self.answer_mask[:, 1:]   # remove answers (ignore the <stop> token)
            ml_loss *= self.padding_mask[:, 1:]  # remove padding (ignore the <start> token)

            # count the number of unmasked elements
            count = tf.reduce_sum(self.padding_mask) - tf.reduce_sum(1 - self.answer_mask[:, :-1]) - 1  # no_unpad - no_qa - START token

            ml_loss = tf.reduce_sum(ml_loss, axis=1)  # reduce over dialogue dimension
            ml_loss = tf.reduce_sum(ml_loss, axis=0)  # reduce over minibatch dimension
            self.ml_loss = ml_loss / count            # normalize

            self.loss = self.ml_loss

        # Compute policy gradient
        if policy_gradient:

            with tf.variable_scope('rl_baseline'):
                decoder_out = tf.stop_gradient(self.decoder_output)  # take the LSTM output (and stop the gradient!)

                flat_decoder_output = tf.reshape(decoder_out, [-1, decoder_lstm_cell.output_size])
                flat_h1 = utils.fully_connected(flat_decoder_output, n_out=100, activation='relu', scope='baseline_hidden')
                flat_baseline = utils.fully_connected(flat_h1, 1, activation='relu', scope='baseline_out')

                self.baseline = tf.reshape(flat_baseline, [tf.shape(self.seq_length)[0], max_sequence - 1])
                self.baseline *= self.answer_mask[:, 1:]
                self.baseline *= self.padding_mask[:, 1:]

            with tf.variable_scope('policy_gradient_loss'):

                # Compute log_prob
                self.log_of_policy = tf.identity(self.cross_entropy_loss)
                self.log_of_policy *= self.answer_mask[:, 1:]  # remove answers (<=> predicted answer has maximum reward) (ignore the START token in the mask)
                # No need to use the padding mask as the discounted reward is already zero once the episode has terminated

                # Policy gradient loss
                rewards *= self.answer_mask[:, 1:]
                self.score_function = tf.multiply(self.log_of_policy, rewards - self.baseline)  # score function

                self.baseline_loss = tf.reduce_sum(tf.square(rewards - self.baseline))

                self.policy_gradient_loss = tf.reduce_sum(self.score_function, axis=1)  # sum over the dialogue trajectory
                self.policy_gradient_loss = tf.reduce_mean(self.policy_gradient_loss, axis=0)  # reduce over minibatch dimension

                self.loss = self.policy_gradient_loss
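# --- Illustrative sketch (not part of the model): the REINFORCE-with-baseline
# surrogate used above, written in NumPy for a single trajectory. The per-token
# cross entropy already equals -log pi(token), so multiplying it by
# (reward - baseline) and minimising the sum maximises the expected return,
# while the baseline is trained separately with a squared error. All numbers
# below are made up.
import numpy as np

neg_log_pi = np.array([0.7, 1.2, 0.4])   # -log pi(a_t) for 3 generated tokens
rewards = np.array([1.0, 1.0, 1.0])      # cumulative (discounted) reward per step
baseline = np.array([0.6, 0.5, 0.55])    # value predicted from the frozen LSTM states

policy_gradient_loss = np.sum(neg_log_pi * (rewards - baseline))  # minimised by the policy
baseline_loss = np.sum((rewards - baseline) ** 2)                 # minimised by the baseline only

print(policy_gradient_loss, baseline_loss)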
def compute_glimpse(feature_maps, context, no_glimpse, glimpse_embedding_size, keep_dropout, reuse=False):
    with tf.variable_scope("glimpse"):
        h = int(feature_maps.get_shape()[1])
        w = int(feature_maps.get_shape()[2])
        c = int(feature_maps.get_shape()[3])

        # reshape state to perform batch operation
        context = tf.nn.dropout(context, keep_dropout)
        projected_context = utils.fully_connected(context, glimpse_embedding_size,
                                                  scope='hidden_layer', activation="tanh",
                                                  use_bias=False, reuse=reuse)

        projected_context = tf.expand_dims(projected_context, axis=1)
        projected_context = tf.tile(projected_context, [1, h * w, 1])
        projected_context = tf.reshape(projected_context, [-1, glimpse_embedding_size])

        feature_maps = tf.reshape(feature_maps, shape=[-1, h * w, c])

        glimpses = []
        with tf.variable_scope("glimpse"):
            g_feature_maps = tf.reshape(feature_maps, shape=[-1, c])  # linearise the feature map as a single batch
            g_feature_maps = tf.nn.dropout(g_feature_maps, keep_dropout)
            g_feature_maps = utils.fully_connected(g_feature_maps, glimpse_embedding_size,
                                                   scope='image_projection', activation="tanh",
                                                   use_bias=False, reuse=reuse)

            hadamard = g_feature_maps * projected_context
            hadamard = tf.nn.dropout(hadamard, keep_dropout)

            e = utils.fully_connected(hadamard, no_glimpse, scope='hadamard_projection', reuse=reuse)
            e = tf.reshape(e, shape=[-1, h * w, no_glimpse])

            for i in range(no_glimpse):
                ev = e[:, :, i]
                alpha = tf.nn.softmax(ev)

                # apply soft attention
                soft_glimpses = feature_maps * tf.expand_dims(alpha, -1)
                soft_glimpses = tf.reduce_sum(soft_glimpses, axis=1)

                glimpses.append(soft_glimpses)

        full_glimpse = tf.concat(glimpses, axis=1)

    return full_glimpse
def compute_all_attention(question_states, caption, history_states, image_feature, no_mlp_units, reuse=False, config=None):
    print("image_feature = {}", image_feature)
    print("Question = {}", question_states)
    print("caption = {}", caption)
    print("history_states = {}", history_states)

    #### 1 ####
    # retrieve the data
    #### 2 ####
    # build the step dictionary, then loop over it:
    #   feature_input = mlp(x, g1, g2)          [1 .. feature_shape]
    #   soft_feature  = softmax(feature_input)  [1 .. feature_input_shape]
    #   x             = soft_feature * x        [1 .. feature_input_shape]

    with tf.variable_scope("coattention"):

        if image_feature is not None:
            if len(image_feature.get_shape()) == 3:
                h = tf.shape(image_feature)[1]  # when the shape is dynamic (attention over lstm)
                w = 1
                c = int(image_feature.get_shape()[2])
            else:
                h = int(image_feature.get_shape()[1])
                w = int(image_feature.get_shape()[2])
                c = int(image_feature.get_shape()[3])

            s = int(question_states.get_shape()[2])

            image_feature = tf.reshape(image_feature, shape=[-1, h * w, c])  # image_feature ?,7,7,2048 => ?,49,2048
            print("******************** B Image_feature = {} ".format(image_feature))
            image_feature = tf.reduce_sum(image_feature, axis=1)
            print("******************** A Image_feature = {} ".format(image_feature))
            set_img(image_feature)

        question_shape = question_states.get_shape()
        question_states = tf.reshape(question_states,
                                     shape=[-1, int(question_shape[1]) * int(question_shape[2])])
        set_question(question_states)

        if history_states is not None:
            if caption is not None:
                caption = tf.expand_dims(caption, axis=1)

            if caption is not None:
                print("caption = {} ,history_states = {} ".format(caption, history_states))
                history_states = tf.reshape(history_states, [-1, 6, 1024])
                hist = tf.concat([caption, history_states], axis=1)
                print("hist B= {} ".format(hist))
                hist = tf.reshape(history_states, [-1, 7 * 1024])
            else:
                # hist = tf.reshape(history_states, [-1, 6 * 1024])
                hist = history_states
                # print("hist = {} ".format(hist))

            set_history(hist)
            # return question_states, hist, image_feature

        dict_step = {0: "img", 1: "question", 3: "hist"}
        step_attention = {0: [0, 1, None],
                          1: [3, 0, 1],
                          2: [1, 0, 3],
                          3: [0, 3, 1]}
        # step_attention = {0: [0, 1, None]}

        for key, value in step_attention.items():
            input_data, g1, g2 = get_input_g1_g2(value[0], value[1], value[2])
            dimension_two = int(input_data.get_shape()[1])

            print("---- input_shape = {} ".format(input_data.get_shape()))
            print("---- g1_shape = {} ".format(g1.get_shape()))
            # print("g2_shape = {} ".format(g2.get_shape()))

            hidden_mlp, weight = utils.fully_connected(
                input_data, no_mlp_units,
                scope='hidden_layer_256_{}'.format(key),
                activation="tanh",
                reuse=reuse,
                co_attention=True,
                g1=g1, g2=g2,
                key_input=key)

            hidden_mlp = utils.fully_connected(
                hidden_mlp, 1,
                scope='hidden_layer_1_{}'.format(key),
                reuse=reuse,
                co_attention=False)

            alpha = tf.nn.softmax(hidden_mlp, axis=1)
            input_data = input_data * alpha

            if value[0] == 0:
                set_img(input_data)
            elif value[0] == 1:
                set_question(input_data)
            elif value[0] == 2:
                set_history(input_data)

            # print("-- {} -- hidden = {} ,ALPHA= {} ,INPUT_DATA = {}".format(key, hidden_mlp, alpha, input_data))
            # print("Data_output = ", get_img(), get_question(), get_history())

        # img_shape = get_img().get_shape()
        # question_shape = get_question().get_shape()
        # history_shape = get_history().get_shape()
        # img = tf.reshape(get_img(), shape=[-1, int(img_shape[1]) * int(img_shape[2])])
        # question = tf.reshape(get_question(), shape=[-1, int(question_shape[1]) * int(question_shape[2])])
        # history = tf.reshape(get_history(), shape=[-1, int(history_shape[1]) * int(history_shape[2])])

        question_states = get_question()
        history = get_history()
        image_feature = get_img()
        # print(" history = {} , image_feature = {} , question = {}".format(history, image_feature, question_states))

    return question_states, history, image_feature
def compute_attention(feature_maps, context, no_mlp_units, reuse=False):
    with tf.variable_scope("attention"):
        if len(feature_maps.get_shape()) == 3:
            h = tf.shape(feature_maps)[1]  # when the shape is dynamic (attention over lstm)
            w = 1
            c = int(feature_maps.get_shape()[2])
        else:
            h = int(feature_maps.get_shape()[1])
            w = int(feature_maps.get_shape()[2])
            c = int(feature_maps.get_shape()[3])
        s = int(context.get_shape()[1])

        feature_maps = tf.reshape(feature_maps, shape=[-1, h * w, c])  # e.g. shape=(?, 49, 2048)

        context = tf.expand_dims(context, axis=1)  # e.g. shape=(?, 1, 6144)
        context = tf.tile(context, [1, h * w, 1])  # tf.tile([a,b,c], [2]) => [a,b,c,a,b,c]; e.g. shape=(?, 49, 6144)

        embedding = tf.concat([feature_maps, context], axis=2)
        embedding = tf.reshape(embedding, shape=[-1, s + c])  # e.g. shape=(?, 8192)

        # compute the evidence from the embedding
        with tf.variable_scope("mlp"):
            e = utils.fully_connected(embedding, no_mlp_units, scope='hidden_layer',
                                      activation="relu", reuse=reuse)  # e.g. shape=(?, 256)
            e = utils.fully_connected(e, 1, scope='out', reuse=reuse)  # e.g. shape=(?, 1)

        e = tf.reshape(e, shape=[-1, h * w, 1])  # e.g. shape=(?, 49, 1)

        # compute the softmax over the evidence
        alpha = tf.nn.softmax(e, dim=1)  # e.g. shape=(?, 49, 1)

        # apply soft attention
        soft_attention = feature_maps * alpha                    # e.g. shape=(?, 49, 2048)
        soft_attention = tf.reduce_sum(soft_attention, axis=1)   # e.g. shape=(?, 2048)

    return soft_attention
def __init__(self, config, num_words, device='', reuse=False):
    AbstractNetwork.__init__(self, "guesser", device=device)

    mini_batch_size = None

    with tf.variable_scope(self.scope_name, reuse=reuse):

        # Dialogues
        self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None], name='dialogues')
        self.seq_length = tf.placeholder(tf.int32, [mini_batch_size], name='seq_length')

        # Objects
        self.obj_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='obj_mask')
        self.obj_cats = tf.placeholder(tf.int32, [mini_batch_size, None], name='obj_cats')
        self.obj_spats = tf.placeholder(tf.float32, [mini_batch_size, None, config['spat_dim']], name='obj_spats')

        # Targets
        self.targets = tf.placeholder(tf.int32, [mini_batch_size], name="targets_index")

        self.object_cats_emb = utils.get_embedding(
            self.obj_cats,
            config['no_categories'] + 1,
            config['cat_emb_dim'],
            scope='cat_embedding')

        self.objects_input = tf.concat([self.object_cats_emb, self.obj_spats], axis=2)
        self.flat_objects_inp = tf.reshape(self.objects_input, [-1, config['cat_emb_dim'] + config['spat_dim']])

        with tf.variable_scope('obj_mlp'):
            h1 = utils.fully_connected(self.flat_objects_inp,
                                       n_out=config['obj_mlp_units'],
                                       activation='relu',
                                       scope='l1')
            h2 = utils.fully_connected(h1,
                                       n_out=config['dialog_emb_dim'],
                                       activation='relu',
                                       scope='l2')

        obj_embs = tf.reshape(h2, [-1, tf.shape(self.obj_cats)[1], config['dialog_emb_dim']])

        # Compute the word embedding
        input_words = utils.get_embedding(self.dialogues,
                                          n_words=num_words,
                                          n_dim=config['word_emb_dim'],
                                          scope="input_word_embedding")

        last_states, _ = rnn.variable_length_LSTM(input_words,
                                                  num_hidden=config['num_lstm_units'],
                                                  seq_length=self.seq_length)

        last_states = tf.reshape(last_states, [-1, config['num_lstm_units'], 1])
        scores = tf.matmul(obj_embs, last_states)
        scores = tf.reshape(scores, [-1, tf.shape(self.obj_cats)[1]])

        def masked_softmax(scores, mask):
            # subtract max for stability
            scores = scores - tf.tile(tf.reduce_max(scores, axis=(1,), keep_dims=True),
                                      [1, tf.shape(scores)[1]])
            # compute padded softmax
            exp_scores = tf.exp(scores)
            exp_scores *= mask
            exp_sum_scores = tf.reduce_sum(exp_scores, axis=1, keep_dims=True)
            return exp_scores / tf.tile(exp_sum_scores, [1, tf.shape(exp_scores)[1]])

        self.softmax = masked_softmax(scores, self.obj_mask)
        self.selected_object = tf.argmax(self.softmax, axis=1)

        self.loss = tf.reduce_mean(utils.cross_entropy(self.softmax, self.targets))
        self.error = tf.reduce_mean(utils.error(self.softmax, self.targets))
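# --- Illustrative sketch (not part of the model): the masked softmax used by the
# guesser, in NumPy. Padded object slots get exactly zero probability while the
# remaining scores are renormalised; the max is subtracted first for numerical
# stability. The scores and mask below are made up.
import numpy as np

def masked_softmax_sketch(scores, mask):
    scores = scores - scores.max(axis=1, keepdims=True)
    exp_scores = np.exp(scores) * mask                        # zero out padded objects
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)

scores = np.array([[2.0, 0.5, -1.0, 0.0]])   # 4 candidate objects, the last one is padding
mask = np.array([[1.0, 1.0, 1.0, 0.0]])
probs = masked_softmax_sketch(scores, mask)
assert probs[0, 3] == 0.0 and abs(probs.sum() - 1.0) < 1e-9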
def __init__(self, config, no_words, no_answers, reuse=False, device=''):
    ResnetModel.__init__(self, "vqa", device=device)

    with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
        self.batch_size = None

        #####################
        #   QUESTION
        #####################

        self._question = tf.placeholder(tf.int32, [self.batch_size, None], name='question')
        self._seq_length = tf.placeholder(tf.int32, [self.batch_size], name='seq_length')
        self._answer_count = tf.placeholder(tf.float32, [self.batch_size, no_answers], name='answer_count')

        self._is_training = tf.placeholder(tf.bool, name="is_training")

        dropout_keep = float(config.get("dropout_keep_prob", 1.0))
        dropout_keep = tf.cond(self._is_training,
                               lambda: tf.constant(dropout_keep),
                               lambda: tf.constant(1.0))

        word_emb = utils.get_embedding(self._question,
                                       n_words=no_words,
                                       n_dim=int(config["word_embedding_dim"]),
                                       scope="word_embedding",
                                       reuse=reuse)

        if config['glove']:
            self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove")
            word_emb = tf.concat([word_emb, self._glove], axis=2)

        self.question_lstm, self.all_lstm_states = rnn.variable_length_LSTM(
            word_emb,
            num_hidden=int(config["no_hidden_LSTM"]),
            dropout_keep_prob=dropout_keep,
            seq_length=self._seq_length,
            depth=int(config["no_LSTM_cell"]),
            scope="question_lstm",
            reuse=reuse)

        #####################
        #   IMAGES
        #####################

        self._image = tf.placeholder(tf.float32, [self.batch_size] + config['image']["dim"], name='image')

        self.image_out = get_image_features(image=self._image,
                                            question=self.question_lstm,
                                            is_training=self._is_training,
                                            scope_name=scope.name,
                                            config=config['image'],
                                            dropout_keep=dropout_keep)

        #####################
        #   COMBINE
        #####################

        activation_name = config["activation"]
        with tf.variable_scope('final_mlp'):

            self.question_embedding = utils.fully_connected(self.question_lstm,
                                                            config["no_question_mlp"],
                                                            activation=activation_name,
                                                            scope='question_mlp')
            self.image_embedding = utils.fully_connected(self.image_out,
                                                         config["no_image_mlp"],
                                                         activation=activation_name,
                                                         scope='image_mlp')

            full_embedding = self.image_embedding * self.question_embedding
            full_embedding = tf.nn.dropout(full_embedding, dropout_keep)

            out = utils.fully_connected(full_embedding,
                                        config["no_hidden_final_mlp"],
                                        scope='layer1',
                                        activation=activation_name)
            out = tf.nn.dropout(out, dropout_keep)
            out = utils.fully_connected(out, no_answers, activation='linear', scope='layer2')

        # improve soft loss
        answer_count = tf.minimum(self._answer_count, 3)
        normalizing_sum = tf.maximum(1.0, tf.reduce_sum(answer_count, 1, keep_dims=True))
        self.answer_prob = answer_count / normalizing_sum

        self.soft_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=out,
                                                                          labels=self.answer_prob,
                                                                          name='soft_cross_entropy')
        self.soft_loss = self.soft_cross_entropy

        self.target_answer = tf.argmax(self._answer_count, axis=1)
        # unnorm_log_prob = tf.log(self._answer_count)
        # self.target_answer = tf.multinomial(unnorm_log_prob, num_samples=1)
        # self.target_answer = tf.reshape(self.target_answer, shape=[-1])

        self.hard_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=out,
                                                                                 labels=self.target_answer,
                                                                                 name='hard_cross_entropy')
        self.hard_loss = self.hard_cross_entropy

        if config['loss'] == 'soft':
            self.loss = self.soft_loss
        else:
            self.loss = self.hard_loss

        self.loss = tf.reduce_mean(self.loss)

        self.softmax = tf.nn.softmax(out, name='answer_prob')
        self.prediction = tf.argmax(out, axis=1, name='predicted_answer')  # no need to compute the softmax

        with tf.variable_scope('accuracy'):
            ind = tf.range(tf.shape(self.prediction)[0]) * no_answers + tf.cast(self.prediction, tf.int32)
            pred_count = tf.gather(tf.reshape(self._answer_count, [-1]), ind)

            self.extended_accuracy = tf.minimum(pred_count / 3.0, 1.0, name="extended_accuracy")
            self.accuracy = tf.reduce_mean(self.extended_accuracy)

        tf.summary.scalar('soft_loss', self.soft_loss)
        tf.summary.scalar('hard_loss', self.hard_loss)
        tf.summary.scalar('accuracy', self.accuracy)

        print('Model... build!')
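# --- Illustrative sketch (not part of the model): the VQA "extended accuracy"
# computed above, in NumPy. A predicted answer scores min(#annotators who gave it / 3, 1),
# so agreeing with at least 3 of the 10 annotators counts as fully correct.
# The counts below are made up.
import numpy as np

answer_count = np.array([[6, 3, 1, 0],    # 10 annotator answers spread over 4 candidate answers
                         [1, 2, 0, 7]], dtype=np.float32)
prediction = np.array([1, 0])             # predicted answer index per example

pred_count = answer_count[np.arange(len(prediction)), prediction]
extended_accuracy = np.minimum(pred_count / 3.0, 1.0)   # [1.0, 0.333...]
accuracy = extended_accuracy.mean()
print(extended_accuracy, accuracy)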
def __init__(self, config, num_words, device='', reuse=False):
    ResnetModel.__init__(self, "oracle", device=device)

    with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
        embeddings = []
        self.batch_size = None

        # QUESTION
        self._is_training = tf.placeholder(tf.bool, name="is_training")
        self._question = tf.placeholder(tf.int32, [self.batch_size, None], name='question')
        self._seq_length = tf.placeholder(tf.int32, [self.batch_size], name='seq_length')

        word_emb = utils.get_embedding(
            self._question,
            n_words=num_words,
            n_dim=int(config['model']['question']["embedding_dim"]),
            scope="word_embedding")

        lstm_states, _ = rnn.variable_length_LSTM(
            word_emb,
            num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
            seq_length=self._seq_length)
        embeddings.append(lstm_states)

        # CATEGORY
        if config['inputs']['category']:
            self._category = tf.placeholder(tf.int32, [self.batch_size], name='category')

            cat_emb = utils.get_embedding(
                self._category,
                int(config['model']['category']["n_categories"]) + 1,  # we add the unknown category
                int(config['model']['category']["embedding_dim"]),
                scope="cat_embedding")
            embeddings.append(cat_emb)
            print("Input: Category")

        # SPATIAL
        if config['inputs']['spatial']:
            self._spatial = tf.placeholder(tf.float32, [self.batch_size, 8], name='spatial')
            embeddings.append(self._spatial)
            print("Input: Spatial")

        # IMAGE
        if config['inputs']['image']:
            self._image = tf.placeholder(tf.float32, [self.batch_size] + config['model']['image']["dim"], name='image')
            self.image_out = get_image_features(
                image=self._image,
                question=lstm_states,
                is_training=self._is_training,
                scope_name=scope.name,
                config=config['model']['image'])
            embeddings.append(self.image_out)
            print("Input: Image")

        # CROP
        if config['inputs']['crop']:
            self._crop = tf.placeholder(tf.float32, [self.batch_size] + config['model']['crop']["dim"], name='crop')
            self.crop_out = get_image_features(
                image=self._crop,
                question=lstm_states,
                is_training=self._is_training,
                scope_name=scope.name,
                config=config["model"]['crop'])
            embeddings.append(self.crop_out)
            print("Input: Crop")

        # Compute the final embedding
        emb = tf.concat(embeddings, axis=1)

        # OUTPUT
        num_classes = 3
        self._answer = tf.placeholder(tf.float32, [self.batch_size, num_classes], name='answer')

        with tf.variable_scope('mlp'):
            num_hiddens = config['model']['MLP']['num_hiddens']
            l1 = utils.fully_connected(emb, num_hiddens, activation='relu', scope='l1')
            self.pred = utils.fully_connected(l1, num_classes, activation='softmax', scope='softmax')

            self.best_pred = tf.argmax(self.pred, axis=1)

        self.loss = tf.reduce_mean(utils.cross_entropy(self.pred, self._answer))
        self.error = tf.reduce_mean(utils.error(self.pred, self._answer))

        print('Model... Oracle build!')
def __init__(self, config, num_words_question, num_words_description=None, device='', reuse=False):
    ResnetModel.__init__(self, "oracle", device=device)

    with open("data/dict_word_embedding_{}_{}.pickle".format("fasttext", config["model"]["question"]["embedding_type"]), "rb") as f:
        dict_all_embedding = pickle.load(f)

    with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
        embeddings = []
        co_attention = [None, None, None, None]
        self.batch_size = None
        max_seq_length = 12

        # QUESTION
        if config['inputs']['question']:
            self._is_training = tf.placeholder(tf.bool, name="is_training")
            # self._question_word = tf.placeholder(tf.int32, [self.batch_size], name='question_word')
            self._question = tf.placeholder(tf.int32, [self.batch_size, 14], name='question')
            self.seq_length_question = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_question')

            if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                print("****** WITH EMBEDDING ******")
                word_emb = utils.get_embedding(self._question,
                                               n_words=num_words_question,
                                               n_dim=int(config["model"]["word_embedding_dim"]),
                                               scope="word_embedding",
                                               dict_all_embedding=dict_all_embedding)
            else:
                print("****** WITHOUT EMBEDDING ******")
                word_emb = utils.get_embedding(self._question,
                                               n_words=num_words_question,
                                               n_dim=int(config["model"]["word_embedding_dim"]),
                                               scope="word_embedding",
                                               dict_all_embedding=[])

            print(".... word_emb 1 = {} ".format(word_emb))

            self.out_question = None
            if config['model']['question']['lstm']:
                self.lstm_states_question, self.lstm_all_state_ques = rnn.variable_length_LSTM(
                    word_emb,
                    num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                    seq_length=self.seq_length_question)
                self.out_question = self.lstm_all_state_ques
                # print("out_question = {} ".format(self.lstm_states_question))
                # self.out_question = tf.reshape(self.out_question, [-1, self.out_question.get_shape()[1] * self.out_question.get_shape()[2]])
            else:
                self.out_question = word_emb

            if config["model"]["attention"]["co-attention"]:
                co_attention[0] = self.out_question  # Tensor("oracle/lstm/lstmcell0/concat:0", shape=(?, 14, 1024), dtype=float32)
                embeddings.append(self.lstm_states_question)
                # print("question_lstm = {} ".format(self.out_question))
            else:
                embeddings.append(self.lstm_states_question)

        # QUESTION-POS
        if config['model']['question']['pos']:
            print("----------------------------------------")
            print("**** Oracle_network | input = question-pos ")
            self._question_pos = tf.placeholder(tf.int32, [self.batch_size, None], name='question_pos')
            self.seq_length_pos = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_ques_pos')

            word_emb = utils.get_embedding(self._question_pos,
                                           n_words=num_words_question,
                                           n_dim=100,
                                           scope="word_embedding_pos")

            if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                self._glove = tf.placeholder(tf.float32, [None, None, int(config["model"]["word_embedding_dim"])], name="embedding_vector_ques_pos")
                word_emb = tf.concat([word_emb, self._glove], axis=2)
            else:
                print("None ****************")

            lstm_states, _ = rnn.variable_length_LSTM(word_emb,
                                                      num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                      seq_length=self.seq_length_pos,
                                                      scope="lstm2")
            # embeddings.append(lstm_states)

        # DESCRIPTION
        if config['inputs']['description']:
            print("**** Oracle_network | input = Description ")
            self._description = tf.placeholder(tf.int32, [self.batch_size, None], name='description')
            self.seq_length_description = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_description')

            word_emb = utils.get_embedding(self._description,
                                           n_words=num_words_question,
                                           n_dim=100,
                                           reuse=True,
                                           scope="word_embedding")
            # print("word_emb = {} ".format(word_emb))

            if config['model']['question']['lstm']:
                self.lstm_states_des, self.lstm_all_state_des = rnn.variable_length_LSTM(
                    word_emb,
                    num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                    seq_length=self.seq_length_description,
                    scope="lstm3")
                self.out_question = self.lstm_states_des
                # print("self.out_question_emb = {} ".format(self.out_question))
                # self.out_question = tf.reshape(self.out_question, [-1, self.out_question.get_shape()[1] * self.out_question.get_shape()[2]])
            else:
                self.out_question = word_emb
                # print("self.out_question = {} ".format(self.out_question))

            if config["model"]["attention"]["co-attention"]:
                # co_attention[1] = self.out_question
                # embeddings.append(self.lstm_all_state_ques)
                embeddings.append(self.lstm_states_des)
            else:
                embeddings.append(self.lstm_states_des)

        # HISTORY OF QUESTIONS
        if config['inputs']['history_question']:
            placeholders_lstmQuestion = []
            placeholders_lstmLength = []
            for i in range(6):
                self._embWord = tf.placeholder(tf.int32, [self.batch_size, 14], name="ques_hist_H{}".format(i))
                self.seq_length_question_history = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_question_history_H{}'.format(i))
                self.word_emb = utils.get_embedding(self._embWord,
                                                    n_words=num_words_question,
                                                    n_dim=100,
                                                    reuse=True,
                                                    scope="word_embedding")
                placeholders_lstmQuestion.append(self.word_emb)
                placeholders_lstmLength.append(self.seq_length_question_history)

            self.lstm_states, self.lstm_all_state_ques_hist = rnn.variable_length_LSTM(
                placeholders_lstmQuestion,
                num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                seq_length=placeholders_lstmLength,
                scope="lstm4",
                dim_4=True)

            if config["model"]["attention"]["co-attention"]:
                co_attention[2] = self.lstm_states
            else:
                embeddings.append(self.lstm_states)

        # DESCRIPTION-POS
        if config['model']['description']['pos']:
            print("----------------------------------------")
            print("**** Oracle_network | input = description-pos ")
            self._question_pos = tf.placeholder(tf.int32, [self.batch_size, None], name='des_pos')
            self.seq_length_pos = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_des_pos')

            word_emb = utils.get_embedding(self._question_pos,
                                           n_words=num_words_question,
                                           n_dim=300,
                                           scope="word_embedding_pos")

            if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                self._glove = tf.placeholder(tf.float32, [None, None, int(config["model"]["word_embedding_dim"])], name="embedding_vector_des_pos")
                word_emb = tf.concat([word_emb, self._glove], axis=2)
            else:
                print("None ****************")

            lstm_states, _ = rnn.variable_length_LSTM(word_emb,
                                                      num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                      seq_length=self.seq_length_pos,
                                                      scope="lstm5")
            # embeddings.append(lstm_states)

        # CATEGORY
        if config['inputs']['category']:
            print("**** Oracle_network | input = category ")
            if config["model"]["category"]["use_embedding"]:
                self._category = tf.placeholder(tf.float32, [self.batch_size, int(config["model"]["word_embedding_dim"])], name='category')
                cat_emb = self._category
                # cat_emb = utils.get_embedding(self._category,
                #                               int(config['model']['category']["n_categories"]) + 1,
                #                               n_dim=int(config["model"]["word_embedding_dim"]),
                #                               scope="cat_embedding",
                #                               dict_all_embedding=dict_all_embedding)
            else:
                self._category = tf.placeholder(tf.int32, [self.batch_size], name='category')
                cat_emb = utils.get_embedding(self._category,
                                              int(config['model']['category']["n_categories"]) + 1,  # we add the unknown category
                                              int(config["model"]["word_embedding_dim"]),
                                              scope="cat_embedding")
            # cat_emb = tf.expand_dims(cat_emb, 1)
            embeddings.append(cat_emb)
            print("Input: Category")

        # ALL CATEGORIES
        if config['inputs']['allcategory']:
            print("**** Oracle_network | input = allcategory ")
            self._allcategory = tf.placeholder(tf.float32, [self.batch_size, 90], name='allcategory')
            # self.seq_length_allcategory = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_allcategory')
            # word_emb = utils.get_embedding(self._allcategory,
            #                                n_words=int(config['model']['category']["n_categories"]) + 1,
            #                                n_dim=int(config['model']['description']["embedding_dim"]),
            #                                scope="word_embedding_allcategory")
            # print(" SeqDescription = ", self.seq_length_description)
            # lstm_states, _ = rnn.variable_length_LSTM(word_emb,
            #                                           num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
            #                                           seq_length=self.seq_length_allcategory, scope="lstm3")
            print(" Oracle_network | embedding all_cate=", word_emb)
            # embeddings.append(self._allcategory)
            print("Input: allcategory")

        # SPATIAL
        if config['inputs']['spatial']:
            print("**** Oracle_network | input = spatial ")
            self._spatial = tf.placeholder(tf.float32, [self.batch_size, 8], name='spatial')
            embeddings.append(self._spatial)
            print("Input: Spatial")

        # IMAGE
        if config['inputs']['image']:
            print("**** Oracle_network | input = image ")
            self._image_id = tf.placeholder(tf.float32, [self.batch_size], name='image_id')
            self._image = tf.placeholder(tf.float32, [self.batch_size] + config['model']['image']["dim"], name='image')
            # self.image_out = tf.reshape(self._image, shape=[224 * 224 * 3])
            # print("question = {} ".format(self.lstm_states_question))

            self.image_out = get_image_features(
                image=self._image,
                question=self.lstm_states_question,
                is_training=self._is_training,
                scope_name=scope.name,
                scope_feature="Image/",
                config=config['model']['image'])
            # embeddings.append(self.image_out)
            print("Input: Image")

            co_attention[3] = self.image_out
            print(" -- image_in = {}".format(self._image))

            image_feature = tf.reshape(self.image_out, shape=[-1, (7 * 7) * 2048])  # ?,7,7,2048 => ?, 7*7*2048
            embeddings.append(image_feature)
            # print("... Image Features = {}".format(self.image_out))

        # CROP
        if config['inputs']['crop']:
            print("**** Oracle_network | input = crop ")
            self._image_id = tf.placeholder(tf.float32, [self.batch_size], name='image_id')
            # self._crop_id = tf.placeholder(tf.float32, [self.batch_size], name='crop_id')
            self._crop = tf.placeholder(tf.float32, [self.batch_size] + config['model']['crop']["dim"], name='crop')

            if config["model"]["attention"]["co-attention"]:
                self.crop_out = get_image_features(
                    image=self._crop,
                    question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Crop/",
                    config=config["model"]['crop'])
                co_attention[3] = self.crop_out
            else:
                self.crop_out = get_image_features(
                    image=self._crop,
                    question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Crop/",
                    co_attention=False,
                    config=config["model"]['crop'])
                embeddings.append(self.crop_out)

            if config["model"]["crop"]["segment_crop"]["use"]:
                all_segment_crop = []
                # for i in range(10):
                self._segment_crop = tf.placeholder(tf.float32, [self.batch_size] + config['model']['crop']["dim"], name='crop_segment'.format(0))
                self.crop_out = get_image_features(
                    image=self._segment_crop,
                    question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name="test",
                    scope_feature="Segment/",
                    config=config["model"]['crop'])
                print("self.crop_out = {} ".format(self.crop_out))
                # all_segment_crop.append(self.crop_out)

        # print("-- crop = {}, image_features = {} ".format(self.crop_out, image_feature))

        if config["model"]["attention"]["co-attention"]:
            question_feature, history_feature, image_feature = compute_all_attention(
                question_states=co_attention[0],
                caption=co_attention[1],
                history_states=co_attention[2],
                image_feature=co_attention[3],
                no_mlp_units=config['model']['attention']['no_attention_mlp'],
                config=config)

            embeddings.append(history_feature)
            embeddings.append(question_feature)
            embeddings.append(image_feature)

        print("*** All Embedding = ", embeddings)

        # Compute the final embedding
        self.emb = tf.concat(embeddings, axis=1)
        print("*** self.emb = ", self.emb)

        # OUTPUT
        num_classes = 3
        self._answer = tf.placeholder(tf.float32, [self.batch_size, num_classes], name='answer')

        with tf.variable_scope('mlp'):
            num_hiddens = config['model']['MLP']['num_hiddens']
            # emb = tf.print(emb, [emb], "input: ")
            l1 = utils.fully_connected(self.emb, num_hiddens, activation='relu', scope='l1')
            self.pred = utils.fully_connected(l1, num_classes, activation='softmax', scope='softmax')

            self.best_pred = tf.argmax(self.pred, axis=1)
            # self.best_pred = tf.reduce_mean(self.best_pred)

        print("--- predict = {} ,answer = {} ".format(self.pred, self._answer))

        self.loss = tf.reduce_mean(utils.cross_entropy(self.pred, self._answer))
        self.error = tf.reduce_mean(utils.error(self.pred, self._answer))
        print("loss = {} ,error = {} ".format(self.loss, self.error))

        print('Model... Oracle build!')
def __init__(self, config, num_words, policy_gradient, device='', reuse=False):
    # AbstractNetwork.__init__(self, "qgen_guesser", device=device)
    ResnetModel.__init__(self, "qgen_guesser", device=device)

    # Create the scope for this graph
    with tf.variable_scope(self.scope_name, reuse=reuse):

        # We set the batch size to None because the train and validation sets use different batch sizes
        # mini_batch_size = config['batch_size']
        mini_batch_size = None

        self.guesser_loss_weight = tf.constant(config["guesser_loss_weight"], dtype=tf.float32, name="guesser_loss_weight")
        self.qgen_loss_weight = tf.constant(config["qgen_loss_weight"], dtype=tf.float32, name="qgen_loss_weight")
        self.loss = 0

        # *********************************************************
        # Placeholders specific to the guesser and their processing
        # *********************************************************

        # Objects
        self.obj_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='obj_mask')
        self.obj_cats = tf.placeholder(tf.int32, [mini_batch_size, None], name='obj_cats')
        self.obj_spats = tf.placeholder(tf.float32, [mini_batch_size, None, config['spat_dim']], name='obj_spats')

        # Targets
        self.targets = tf.placeholder(tf.int32, [mini_batch_size], name="targets_index")

        self.object_cats_emb = utils.get_embedding(
            self.obj_cats,
            config['no_categories'] + 1,
            config['cat_emb_dim'],
            scope='cat_embedding')

        self.objects_input = tf.concat([self.object_cats_emb, self.obj_spats], axis=2)
        self.flat_objects_inp = tf.reshape(self.objects_input, [-1, config['cat_emb_dim'] + config['spat_dim']])

        with tf.variable_scope('obj_mlp'):
            h1 = utils.fully_connected(self.flat_objects_inp,
                                       n_out=config['obj_mlp_units'],
                                       activation='relu',
                                       scope='l1')
            h2 = utils.fully_connected(h1,
                                       n_out=config['no_hidden_final_mlp'],
                                       activation='relu',
                                       scope='l2')
            # print(h2)

        # TODO: Object embeddings do not include image features right now
        obj_embs = tf.reshape(h2, [-1, tf.shape(self.obj_cats)[1], config['no_hidden_final_mlp']])

        # *********************************************************
        # Placeholders for the QGen and placeholders shared with the guesser
        # *********************************************************

        # Image
        self.images = tf.placeholder(tf.float32, [mini_batch_size] + config['image']["dim"], name='images')

        # Question
        self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None], name='dialogues')
        self.answer_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='answer_mask')  # 1 if keep and (1 q/a 1) for (START q/a STOP)
        self.padding_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='padding_mask')
        self.seq_length = tf.placeholder(tf.int32, [mini_batch_size], name='seq_length')

        # Rewards
        self.cum_rewards = tf.placeholder(tf.float32, shape=[mini_batch_size, None], name='cum_reward')

        # DECODER hidden state (for beam search)
        zero_state = tf.zeros([1, config['num_lstm_units']])  # default LSTM state is a zero vector
        zero_state = tf.tile(zero_state, [tf.shape(self.images)[0], 1])  # trick to create a dynamically-sized zero tensor
        self.decoder_zero_state_c = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_c")
        self.decoder_zero_state_h = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_h")
        decoder_initial_state = tf.contrib.rnn.LSTMStateTuple(c=self.decoder_zero_state_c, h=self.decoder_zero_state_h)

        # *******
        # Misc
        # *******
        self.is_training = tf.placeholder(tf.bool, name='is_training')
        self.greedy = tf.placeholder_with_default(False, shape=(), name="greedy")  # used by the sampling graph
        self.samples = None

        # For each length of the answer, we are finding the next token

        # remove last token
        input_dialogues = self.dialogues[:, :-1]
        input_seq_length = self.seq_length - 1

        # remove first token (= start token)
        rewards = self.cum_rewards[:, 1:]
        target_words = self.dialogues[:, 1:]

        # to understand the padding:
        # input
        #   <start>  is  it  a  blue  <?>  <yes>  is  it  a  car  <?>  <no>  <stop_dialogue>
        # target
        #   is       it  a   blue <?>  -   is     it  a   car <?>  -    <stop_dialogue>  -

        # TODO:
        # 1. Include FiLM in the guesser (check whether film or cbn is used)
        #    Add finetuning to the training (see the training and config file of CLEVR)
        #    Check the use of finetuning (should we input a pretrained model), normalization, etc.
        #    See in the config file whether the attention has to be put inside the image block
        #    As of now, in the first part where we get the image embedding, we only flatten the image.
        #    Use RCNN or another method to get the image features.
        #    Include attention on the image given the dialogue embedding in the guesser part
        #    Include dropout on the lstm (option for inside and outside) and image
        #    Include attention on words given the image features
        # 2. Use tf.gather and use all the lstm states where there was yes or no (-) in the target and the stop dialogue
        # 3. Make the code run
        #    Check how the is_training flag works

        # image processing
        with tf.variable_scope('image_feature') as img_scope:
            if len(config["image"]["dim"]) == 1:
                self.image_out = self.images
            else:
                # TODO: Create a different config for this attention
                # Log the input images
                tf.summary.image("image", self.images)

                self.image_out = get_image_features(
                    image=self.images,
                    question=None,
                    is_training=self.is_training,
                    scope_name=img_scope.name,
                    config=config['image'],
                    att=False)

                image_pooling_size = [int(self.image_out.get_shape()[1]), int(self.image_out.get_shape()[2])]
                image_feature_depth = int(self.image_out.get_shape()[3])

                self.image_out = tf.layers.max_pooling2d(self.image_out,
                                                         image_pooling_size,
                                                         1,
                                                         padding='valid',
                                                         data_format='channels_last',
                                                         name='max_pooling_image_out')
                self.image_out = tf.reshape(self.image_out, [-1, image_feature_depth])

                # self.filmed_picture_out = tf.layers.average_pooling2d(self.filmed_picture_out,
                #                                                       final_pooling_size,
                #                                                       1,
                #                                                       padding='valid',
                #                                                       data_format='channels_last',
                #                                                       name='average_pooling_filmed_picture_out')
                # self.image_out = get_attention(self.images, None, config["image"]["attention"])  # TODO: improve by using the previous lstm state?
                # self.image_out = tf.contrib.layers.flatten(self.image_out)

        print(self.image_out)

        # Reduce the embedding size of the image
        with tf.variable_scope('image_embedding'):
            self.image_emb = utils.fully_connected(self.image_out, config['image_embedding_size'])
            image_emb = tf.expand_dims(self.image_emb, 1)
            image_emb = tf.tile(image_emb, [1, tf.shape(input_dialogues)[1], 1])

        # Compute the question embedding
        input_words = utils.get_embedding(
            input_dialogues,
            n_words=num_words,
            n_dim=config['word_embedding_size'],
            scope="word_embedding")

        # concat word embedding and image embedding
        # TODO: Check the size (see if input_seq_length is increased or not)
        decoder_input = tf.concat([input_words, image_emb], axis=2, name="concat_full_embedding")

        # encode one word + image
        decoder_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
            config['num_lstm_units'],
            layer_norm=False,
            dropout_keep_prob=1.0,
            reuse=reuse)

        # TODO: Since we have concatenated the image, check whether input_seq_length should be increased by one
        # Decode the states to generate questions
        self.decoder_output, self.decoder_state = tf.nn.dynamic_rnn(
            cell=decoder_lstm_cell,
            inputs=decoder_input,
            dtype=tf.float32,
            initial_state=decoder_initial_state,
            sequence_length=input_seq_length,
            scope="word_decoder")  # TODO: use multi-layer RNN

        max_sequence = tf.reduce_max(self.seq_length)

        # For the Guesser
        # Adding extra LSTM layers:
        # TODO: There are several default parameters in the function; try using them
        # TODO: As of now, not using it
        # TODO: As of now only the hidden state is used; the cell state could be included too
        last_states = self.decoder_state.h
        # last_states, _ = rnn.variable_length_LSTM_extension(
        #     self.decoder_output,
        #     self.decoder_state,
        #     num_hidden=config['num_lstm_units'],
        #     seq_length=input_seq_length)

        last_states = tf.reshape(last_states, [-1, config['num_lstm_units']])

        # TODO: Can be moved to utils
        def masked_softmax(scores, mask):
            # subtract max for stability
            scores = scores - tf.tile(tf.reduce_max(scores, axis=(1,), keepdims=True),
                                      [1, tf.shape(scores)[1]])
            # compute padded softmax
            exp_scores = tf.exp(scores)
            exp_scores *= mask
            exp_sum_scores = tf.reduce_sum(exp_scores, axis=1, keepdims=True)
            return exp_scores / tf.tile(exp_sum_scores, [1, tf.shape(exp_scores)[1]])

        # compute the softmax for evaluation (over all the words of the dialogue)
        with tf.variable_scope('decoder_output'):
            flat_decoder_output = tf.reshape(self.decoder_output, [-1, decoder_lstm_cell.output_size])
            flat_mlp_output = utils.fully_connected(flat_decoder_output, num_words)

            # retrieve the batch/dialogue format
            mlp_output = tf.reshape(flat_mlp_output, [tf.shape(self.seq_length)[0], max_sequence - 1, num_words])  # ignore the STOP token

            self.softmax_output = tf.nn.softmax(mlp_output, name="softmax")
            self.argmax_output = tf.argmax(mlp_output, axis=2)

            self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=mlp_output, labels=target_words)

        # compute the maximum likelihood loss for the dialogues (for valid words)
        with tf.variable_scope('ml_loss'):
            ml_loss = tf.identity(self.cross_entropy_loss)
            ml_loss *= self.answer_mask[:, 1:]   # remove answers (ignore the <stop> token)
            ml_loss *= self.padding_mask[:, 1:]  # remove padding (ignore the <start> token)

            # count the number of unmasked elements
            count = tf.reduce_sum(self.padding_mask) - tf.reduce_sum(1 - self.answer_mask[:, :-1]) - 1  # no_unpad - no_qa - START token

            ml_loss = tf.reduce_sum(ml_loss, axis=1)  # reduce over dialogue dimension
            ml_loss = tf.reduce_sum(ml_loss, axis=0)  # reduce over minibatch dimension
            self.ml_loss = ml_loss / count            # normalize

            self.qgen_loss = self.ml_loss
            self.loss += self.qgen_loss_weight * self.qgen_loss
            tf.summary.scalar("qgen_loss", self.qgen_loss)

        # NOTE: in the config file, under the image section, cbn must be set to true
        with tf.variable_scope('guesser_input') as scope:
            # Getting the CBN image features
            self.CBN_picture_out = get_image_features(
                image=self.images,
                question=last_states,
                is_training=self.is_training,
                scope_name=scope.name,
                config=config['image'])

            # FiLMing the features
            # self.filmed_picture_out = film_layer(ft=self.CBN_picture_out, context=last_states)

            # TODO: Make n a hyperparameter and add it to the network parameters
            self.filmed_picture_out = self.CBN_picture_out
            n = 1
            for i in range(n):
                with tf.variable_scope('film_layer_' + str(i)):
                    self.filmed_picture_out = FiLMResblock(features=self.filmed_picture_out,
                                                           context=last_states,
                                                           is_training=self.is_training).get()
            # self.filmed_picture_out = film_layer(features=self.CBN_picture_out, context=last_states)

            # TODO: Do a convolution over the feature maps (before the classifier)

            # Pool over the feature maps
            final_pooling_size = [int(self.filmed_picture_out.get_shape()[1]), int(self.filmed_picture_out.get_shape()[2])]
            final_feature_depth = int(self.filmed_picture_out.get_shape()[3])

            if str(config["pooling"]).lower() == 'max':
                self.filmed_picture_out = tf.layers.max_pooling2d(self.filmed_picture_out,
                                                                  final_pooling_size,
                                                                  1,
                                                                  padding='valid',
                                                                  data_format='channels_last',
                                                                  name='max_pooling_filmed_picture_out')
            elif str(config["pooling"]).lower() == 'avg':
                self.filmed_picture_out = tf.layers.average_pooling2d(self.filmed_picture_out,
                                                                      final_pooling_size,
                                                                      1,
                                                                      padding='valid',
                                                                      data_format='channels_last',
                                                                      name='average_pooling_filmed_picture_out')
            else:
                print("No pooling defined")
                sys.exit()

            self.filmed_picture_out = tf.reshape(self.filmed_picture_out, [-1, final_feature_depth])

            #####################
            # Combine the filmed image and dialogue features
            #####################
            activation_name = config["activation"]

            self.question_embedding = utils.fully_connected(last_states,
                                                            config["no_question_mlp"],
                                                            activation=activation_name,
                                                            scope='question_mlp')
            self.picture_embedding = utils.fully_connected(self.filmed_picture_out,
                                                           config["no_picture_mlp"],
                                                           activation=activation_name,
                                                           scope='picture_mlp')

            self.full_embedding = self.picture_embedding * self.question_embedding
            # self.full_embedding = tf.nn.dropout(full_embedding, dropout_keep)

            # self.guesser_out_0 = utils.fully_connected(self.full_embedding, config["no_hidden_prefinal_mlp"], scope='hidden_prefinal', activation=activation_name)
            self.guesser_out_0 = self.full_embedding
            # out = tf.nn.dropout(out, dropout_keep)
            # out = utils.fully_connected(out, no_answers, activation='linear', scope='layer_softmax')

            self.guesser_out = utils.fully_connected(self.guesser_out_0,
                                                     config["no_hidden_final_mlp"],
                                                     scope='hidden_final',
                                                     activation=activation_name)
            self.guesser_out = tf.reshape(self.guesser_out, [-1, config["no_hidden_final_mlp"], 1])

        # TODO DONE: Add all these losses to tensorboard
        with tf.variable_scope('guesser_output'):
            # TODO: In the paper they use a dot product, but in the code they use matmul!!
            scores = tf.matmul(obj_embs, self.guesser_out)
            scores = tf.reshape(scores, [-1, tf.shape(self.obj_cats)[1]])

            self.softmax = masked_softmax(scores, self.obj_mask)
            self.selected_object = tf.argmax(self.softmax, axis=1)

            self.guesser_error = tf.reduce_mean(utils.error(self.softmax, self.targets))
            self.guesser_loss = tf.reduce_mean(utils.cross_entropy(self.softmax, self.targets))
            self.loss += self.guesser_loss_weight * self.guesser_loss
            tf.summary.scalar("guesser_loss", self.guesser_loss)

        # Compute policy gradient
        if policy_gradient:

            with tf.variable_scope('rl_baseline'):
                decoder_out = tf.stop_gradient(self.decoder_output)  # take the LSTM output (and stop the gradient!)

                flat_decoder_output = tf.reshape(decoder_out, [-1, decoder_lstm_cell.output_size])
                flat_h1 = utils.fully_connected(flat_decoder_output, n_out=100, activation='relu', scope='baseline_hidden')
                flat_baseline = utils.fully_connected(flat_h1, 1, activation='relu', scope='baseline_out')

                self.baseline = tf.reshape(flat_baseline, [tf.shape(self.seq_length)[0], max_sequence - 1])
                self.baseline *= self.answer_mask[:, 1:]
                self.baseline *= self.padding_mask[:, 1:]

            with tf.variable_scope('policy_gradient_loss'):

                # Compute log_prob
                self.log_of_policy = tf.identity(self.cross_entropy_loss)
                self.log_of_policy *= self.answer_mask[:, 1:]  # remove answers (<=> predicted answer has maximum reward) (ignore the START token in the mask)
                # No need to use the padding mask as the discounted reward is already zero once the episode has terminated

                # Policy gradient loss
                rewards *= self.answer_mask[:, 1:]
                self.score_function = tf.multiply(self.log_of_policy, rewards - self.baseline)  # score function

                self.baseline_loss = tf.reduce_sum(tf.square(rewards - self.baseline))

                self.policy_gradient_loss = tf.reduce_sum(self.score_function, axis=1)  # sum over the dialogue trajectory
                self.policy_gradient_loss = tf.reduce_mean(self.policy_gradient_loss, axis=0)  # reduce over minibatch dimension

                self.loss = self.policy_gradient_loss

        tf.summary.scalar("total_network_loss", self.loss)
        self.summary = tf.summary.merge_all()

        print('Model... build!')