def beam_search(self, sess, batch, vocabulary):
    """Beam-search decode an output sentence for each example in a batch.

    Encodes the three input sentences once, then repeatedly runs the
    single-step decode ops, keeping the top `config.beam_size` partial
    hypotheses per example until `config.max_output_length` steps.

    Args:
        sess: TensorFlow session holding the trained model.
        batch: tuple ((a, b, c), (m_a, m_b, m_c), (l_a_, l_b_, l_c_),
            dst, m_dst, l_dst_) — sentences, masks, lengths, and target
            data; only the sentences/lengths are consumed here.
        vocabulary: object whose `words[w]` maps a word id to its token
            string ('stop' marks end of sentence).

    Returns:
        A list of length `config.batch_size`; each element is the list of
        finished LogicData beams for that example, sorted best-first.

    Cleanups in this revision: removed the dead nested helper
    `get_copy_word` (every call site was commented out), dropped the
    unused `self.probs2`/`self.probs3` fetches, and renamed the inner
    loop variable so it no longer shadows the batch tensor `b`.
    """
    config = self.config
    (a, b, c), (m_a, m_b, m_c), (l_a_, l_b_, l_c_), dst, m_dst, l_dst_ = batch

    # Encode all three inputs in a single pass; the decode loop below
    # only re-runs the one-step decoder ops.
    feed_dict = {
        self.sentences1: a, self.sequence_length1: l_a_,
        self.sentences2: b, self.sequence_length2: l_b_,
        self.sentences3: c, self.sequence_length3: l_c_,
    }
    cont1, cont2, cont3, initial_memory, initial_output = sess.run(
        [self.contexts1, self.contexts2, self.contexts3,
         self.initial_memory, self.initial_output],
        feed_dict=feed_dict)

    partial_caption_data = []
    complete_caption_data = []
    for k in range(config.batch_size):
        initial_beam = LogicData(sentence=[],
                                 memory=initial_memory[k],
                                 output=initial_output[k],
                                 score=1.0)
        partial_caption_data.append(TopN(config.beam_size))
        partial_caption_data[-1].push(initial_beam)
        complete_caption_data.append(TopN(config.beam_size))

    for idx in range(config.max_output_length):
        # Snapshot and clear the live beams for this time step.
        partial_caption_data_lists = []
        for k in range(config.batch_size):
            data = partial_caption_data[k].extract()
            partial_caption_data_lists.append(data)
            partial_caption_data[k].reset()

        # Only the single seed beam exists on the first step.
        num_steps = 1 if idx == 0 else config.beam_size
        for beam_idx in range(num_steps):
            if idx == 0:
                last_word = np.zeros((config.batch_size), np.int32)
            else:
                last_word = np.array(
                    [pcl[beam_idx].sentence[-1]
                     for pcl in partial_caption_data_lists], np.int32)
            last_memory = np.array(
                [pcl[beam_idx].memory for pcl in partial_caption_data_lists],
                np.float32)
            last_output = np.array(
                [pcl[beam_idx].output for pcl in partial_caption_data_lists],
                np.float32)

            # scores: batch_size x vocab2_size next-word probabilities.
            memory, output, scores = sess.run(
                [self.memory, self.output, self.probs],
                feed_dict={
                    self.b_ctx1: cont1,
                    self.b_ctx2: cont2,
                    self.b_ctx3: cont3,
                    self.last_word: last_word,
                    self.last_memory: last_memory,
                    self.last_output: last_output,
                })

            # Expand each candidate with its most probable next words.
            for k in range(config.batch_size):
                caption_data = partial_caption_data_lists[k][beam_idx]
                words_and_scores = list(enumerate(scores[k]))
                words_and_scores.sort(key=lambda x: -x[1])
                words_and_scores = words_and_scores[0:config.beam_size + 1]
                for w, s in words_and_scores:
                    sentence = caption_data.sentence + [w]
                    score = caption_data.score * s
                    beam = LogicData(sentence,
                                     memory[k],   # new memory
                                     output[k],   # new output
                                     score)
                    if vocabulary.words[w] == 'stop':  # end-of-sentence mark
                        complete_caption_data[k].push(beam)
                    else:
                        partial_caption_data[k].push(beam)

    results = []
    for k in range(config.batch_size):
        # Fall back to unfinished beams if nothing ever emitted 'stop'.
        if complete_caption_data[k].size() == 0:
            complete_caption_data[k] = partial_caption_data[k]
        results.append(complete_caption_data[k].extract(sort=True))
    return results
def beam_search(self, sess, batch, vocabulary):
    """Beam-search decode an output sentence for each example in a batch.

    Unlike the three-context variant, this one feeds only the third input
    sentence (`c`) to initialize the decoder state, and the per-step decode
    feed contains no context tensors — presumably the contexts are wired
    into the graph elsewhere; TODO confirm against the model definition.

    Returns a list of length config.batch_size; each element is the list of
    finished LogicData beams for that example, sorted best-first.
    """
    config = self.config
    # Batch layout: sentences, masks, lengths, then target data; only
    # `c` and `l_c_` are consumed here.
    (a, b, c), (m_a, m_b, m_c), (l_a_, l_b_, l_c_), dst, m_dst, l_dst_ = batch
    feed_dict = {self.sentences3: c, self.sequence_length3: l_c_}
    initial_memory, initial_output = sess.run(
        [self.initial_memory, self.initial_output], feed_dict=feed_dict)

    # One TopN heap of live (partial) and finished (complete) beams per
    # example in the batch.
    partial_caption_data = []
    complete_caption_data = []
    for k in range(config.batch_size):
        initial_beam = LogicData(sentence=[],
                                 memory=initial_memory[k],
                                 output=initial_output[k],
                                 score=1.0)
        partial_caption_data.append(TopN(config.beam_size))
        partial_caption_data[-1].push(initial_beam)
        complete_caption_data.append(TopN(config.beam_size))

    for idx in range(config.max_output_length):
        # Snapshot the live beams for this step, then clear the heaps so
        # they can receive this step's expansions.
        partial_caption_data_lists = []
        for k in range(config.batch_size):
            data = partial_caption_data[k].extract()  # extract top N * N
            partial_caption_data_lists.append(
                data)  # len(partial_caption_data_lists): batch_size
            partial_caption_data[k].reset()

        # Only the single seed beam exists on the first step.
        num_steps = 1 if idx == 0 else config.beam_size
        for b in range(num_steps):
            if idx == 0:
                # Word id 0 seeds decoding — presumably a <start>/<pad>
                # token; verify against the vocabulary.
                last_word = np.zeros((config.batch_size), np.int32)
            else:
                last_word = np.array([
                    pcl[b].sentence[-1]
                    for pcl in partial_caption_data_lists
                ], np.int32)  # len(last_word): batch_size
            last_memory = np.array(
                [pcl[b].memory for pcl in partial_caption_data_lists],
                np.float32)  # batch_size
            last_output = np.array(
                [pcl[b].output for pcl in partial_caption_data_lists],
                np.float32)  # batch_size
            # scores: batch_size * vocab2_size; scores[k]: vocab2_size
            memory, output, scores = sess.run(
                [self.memory, self.output, self.probs],
                feed_dict={
                    self.last_word: last_word,
                    self.last_memory: last_memory,
                    self.last_output: last_output
                })

            # Find the beam_size most probable next words per example.
            for k in range(config.batch_size):
                caption_data = partial_caption_data_lists[k][b]
                words_and_scores = list(
                    enumerate(scores[k])
                )  # scores: (i.e. prob); words_and_scores: (idx, prob)
                words_and_scores.sort(
                    key=lambda x: -x[1])  # x[1]: prob; x[0]: idx
                words_and_scores = words_and_scores[0:config.beam_size + 1]
                for w, s in words_and_scores:
                    sentence = caption_data.sentence + [w]
                    # Beam score is the product of step probabilities.
                    score = caption_data.score * s
                    beam = LogicData(
                        sentence,
                        memory[k],  # new memory
                        output[k],  # new output
                        score)
                    if vocabulary.words[w] == 'stop':  # mark the end
                        complete_caption_data[k].push(beam)
                    else:
                        partial_caption_data[k].push(beam)

    results = []
    for k in range(config.batch_size):
        # Fall back to unfinished beams if nothing ever emitted 'stop'.
        if complete_caption_data[k].size() == 0:
            complete_caption_data[k] = partial_caption_data[k]
        results.append(complete_caption_data[k].extract(sort=True))
    return results
def beam_search(self, image):
    """Use beam search to generate a caption for a single image.

    Preprocesses `image` into a batch of one, runs the CNN once to get
    visual contexts and the initial LSTM state, then expands the top
    `self.config.beam_size` partial captions per step until
    `self.config.max_caption_length`.

    Returns a list (one entry per batch slot) of finished CaptionData
    beams, sorted best-first.
    """
    # Feed in the image to get the contexts and the initial LSTM states.
    images = np.array([self.preprocess(image)], np.float32)
    # SECURITY FIX: removed an os.system("curl ...") call here that
    # posted debug text to a hard-coded Slack webhook URL — it embedded
    # a webhook credential in source, built a shell command from
    # interpolated data, and made inference depend on a network call.
    contexts, initial_memory, initial_output = self.sess.run(
        [self.model.conv_feats,
         self.model.initial_memory,
         self.model.initial_output],
        feed_dict={self.model.images: images})

    partial_caption_data = []
    complete_caption_data = []
    for k in range(self.config.batch_size):
        initial_beam = CaptionData(sentence=[],
                                   memory=initial_memory[k],
                                   output=initial_output[k],
                                   score=1.0)
        partial_caption_data.append(TopN(self.config.beam_size))
        partial_caption_data[-1].push(initial_beam)
        complete_caption_data.append(TopN(self.config.beam_size))

    # Run beam search.
    for idx in range(self.config.max_caption_length):
        # Snapshot and clear the live beams for this step.
        partial_caption_data_lists = []
        for k in range(self.config.batch_size):
            data = partial_caption_data[k].extract()
            partial_caption_data_lists.append(data)
            partial_caption_data[k].reset()

        # Only the single seed beam exists on the first step.
        num_steps = 1 if idx == 0 else self.config.beam_size
        for b in range(num_steps):
            if idx == 0:
                last_word = np.zeros((self.config.batch_size), np.int32)
            else:
                last_word = np.array([
                    pcl[b].sentence[-1]
                    for pcl in partial_caption_data_lists
                ], np.int32)
            last_memory = np.array(
                [pcl[b].memory for pcl in partial_caption_data_lists],
                np.float32)
            last_output = np.array(
                [pcl[b].output for pcl in partial_caption_data_lists],
                np.float32)

            memory, output, scores = self.sess.run(
                [self.model.memory, self.model.output, self.model.probs],
                feed_dict={
                    self.model.contexts: contexts,
                    self.model.last_word: last_word,
                    self.model.last_memory: last_memory,
                    self.model.last_output: last_output
                })

            # Find the beam_size most probable next words.
            for k in range(self.config.batch_size):
                caption_data = partial_caption_data_lists[k][b]
                words_and_scores = list(enumerate(scores[k]))
                words_and_scores.sort(key=lambda x: -x[1])
                words_and_scores = words_and_scores[0:self.config.
                                                   beam_size + 1]
                # Append each of these words to the current partial caption.
                for w, s in words_and_scores:
                    sentence = caption_data.sentence + [w]
                    score = caption_data.score * s
                    beam = CaptionData(sentence, memory[k], output[k], score)
                    if self.vocabulary.words[w] == '.':  # end of caption
                        complete_caption_data[k].push(beam)
                    else:
                        partial_caption_data[k].push(beam)

    results = []
    for k in range(self.config.batch_size):
        # Fall back to unfinished beams if no caption ever terminated.
        if complete_caption_data[k].size() == 0:
            complete_caption_data[k] = partial_caption_data[k]
        results.append(complete_caption_data[k].extract(sort=True))
    return results
def main(argv):
    """Restore an attention captioning model and beam-search one image.

    Args:
        argv: dict of options. Required: 'save_path'. Optional:
            'restore_path' (checkpoint to load), 'image_path' (defaults
            to 'images/test1.jpg'), 'end_token_id' (vocabulary id that
            terminates a caption; defaults to the previously hard-coded
            3581), plus whatever attention.AttentionModel expects.

    Returns:
        A list with one entry (batch_size == 1): the CaptionData beams
        for the image, sorted best-first. Each beam's word-id sentence
        is also printed.
    """
    restore_path = argv.get('restore_path', None)
    save_path = argv['save_path']
    attention_model = attention.AttentionModel(argv)
    _probs, _last_output, _last_memory = attention_model.init_inference()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if restore_path:
            saver = tf.train.Saver()
            saver.restore(sess, restore_path)
        batch_size = 1
        beam_size = 3
        max_caption_length = 40
        # Generalized: these were hard-coded magic values; they can now
        # be overridden through argv with backward-compatible defaults.
        end_token_id = argv.get('end_token_id', 3581)
        images = cv2.imread(argv.get('image_path', 'images/test1.jpg'))

        feed_dict = attention_model.feed_dict([images])
        initial_memory, initial_output = sess.run(
            [attention_model.initial_memory, attention_model.initial_output],
            feed_dict)

        partial_caption_data = []
        complete_caption_data = []
        for k in range(batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(beam_size))

        # Run beam search.
        for idx in range(max_caption_length):
            # Snapshot and clear the live beams for this step.
            partial_caption_data_lists = []
            for k in range(batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)
                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                feed_dict = attention_model.feed_dict([images],
                                                      last_word=last_word,
                                                      last_memory=last_memory,
                                                      last_output=last_output)
                scores, output, memory = sess.run(
                    [_probs, _last_output, _last_memory], feed_dict)

                # Find the beam_size most probable next words.
                for k in range(batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:beam_size + 1]
                    # Append each word to the current partial caption.
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if w == end_token_id:
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(batch_size):
            # Fall back to unfinished beams if no caption terminated.
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))
        for r in results:
            for i in r:
                print(i.sentence)
        return results
def beam_search(self, sess, image_files, vocabulary):
    """Generate captions for a batch of images with beam search.

    Loads the images, runs the CNN once for visual contexts and the
    initial LSTM state, then expands the best partial captions step by
    step. Returns, for each image, its finished CaptionData beams
    sorted best-first.
    """
    config = self.config
    batch_images = self.image_loader.load_images(image_files)

    # Single forward pass for contexts and initial decoder state.
    contexts, init_mem, init_out = sess.run(
        [self.conv_feats, self.initial_memory, self.initial_output],
        feed_dict={self.images: batch_images})

    live_beams = []      # per-image TopN of unfinished captions
    finished_beams = []  # per-image TopN of captions that reached '.'
    for i in range(config.batch_size):
        seed = CaptionData(sentence=[],
                           memory=init_mem[i],
                           output=init_out[i],
                           score=1.0)
        heap = TopN(config.beam_size)
        heap.push(seed)
        live_beams.append(heap)
        finished_beams.append(TopN(config.beam_size))

    # Run beam search.
    for step in range(config.max_caption_length):
        # Snapshot this step's candidates, then empty the live heaps.
        snapshots = []
        for i in range(config.batch_size):
            snapshots.append(live_beams[i].extract())
            live_beams[i].reset()

        fan_out = 1 if step == 0 else config.beam_size
        for slot in range(fan_out):
            if step == 0:
                prev_word = np.zeros((config.batch_size), np.int32)
            else:
                prev_word = np.array(
                    [snap[slot].sentence[-1] for snap in snapshots],
                    np.int32)
            prev_mem = np.array(
                [snap[slot].memory for snap in snapshots], np.float32)
            prev_out = np.array(
                [snap[slot].output for snap in snapshots], np.float32)

            new_mem, new_out, probs = sess.run(
                [self.memory, self.output, self.probs],
                feed_dict={self.contexts: contexts,
                           self.last_word: prev_word,
                           self.last_memory: prev_mem,
                           self.last_output: prev_out})

            # Extend each candidate with its most probable next words.
            for i in range(config.batch_size):
                parent = snapshots[i][slot]
                ranked = sorted(enumerate(probs[i]),
                                key=lambda pair: pair[1],
                                reverse=True)[:config.beam_size + 1]
                for word, prob in ranked:
                    child = CaptionData(parent.sentence + [word],
                                        new_mem[i],
                                        new_out[i],
                                        parent.score * prob)
                    if vocabulary.words[word] == '.':
                        finished_beams[i].push(child)
                    else:
                        live_beams[i].push(child)

    results = []
    for i in range(config.batch_size):
        # If no caption ever terminated, fall back to the live beams.
        if finished_beams[i].size() == 0:
            finished_beams[i] = live_beams[i]
        results.append(finished_beams[i].extract(sort=True))
    return results
def beam_sample(self, sess, cont1, cont2, cont3, initial_memory,
                initial_output, vocabulary):
    """Beam-style decoding where expansions come from sampled words.

    Unlike `beam_search`, candidate next words are taken from the graph's
    `self.sample_word` op instead of the top-k of the probability
    distribution, and a beam that emits 'stop' is kept in BOTH the
    complete and the partial heaps (see inline note below).

    Returns a list of length config.batch_size; each element is the list
    of finished LogicData beams for that example, sorted best-first.
    """
    config = self.config
    partial_caption_data = []
    complete_caption_data = []
    for k in range(config.batch_size):
        initial_beam = LogicData(sentence=[],
                                 memory=initial_memory[k],
                                 output=initial_output[k],
                                 score=1.0)
        partial_caption_data.append(TopN(config.beam_size))
        partial_caption_data[-1].push(initial_beam)
        complete_caption_data.append(TopN(config.beam_size))

    for idx in range(config.max_output_length):
        partial_caption_data_lists = []
        for k in range(config.batch_size):
            # After the first step every live heap must hold exactly
            # beam_size candidates (the 'stop' double-push above keeps
            # the heap full even when a beam finishes).
            if idx > 0:
                assert partial_caption_data[k].size() == config.beam_size
            data = partial_caption_data[k].extract()  # extract top N * N
            partial_caption_data_lists.append(
                data)  # len(partial_caption_data_lists): batch_size
            partial_caption_data[k].reset()

        # Only the single seed beam exists on the first step.
        num_steps = 1 if idx == 0 else config.beam_size
        for b in range(num_steps):
            if idx == 0:
                last_word = np.zeros((config.batch_size), np.int32)
            else:
                last_word = []
                for _batch, pcl in enumerate(partial_caption_data_lists):
                    _beam = pcl[b]
                    last_word.append(_beam.sentence[-1])
                last_word = np.array(last_word, np.int32)
            last_memory = np.array(
                [pcl[b].memory for pcl in partial_caption_data_lists],
                np.float32)  # batch_size
            last_output = np.array(
                [pcl[b].output for pcl in partial_caption_data_lists],
                np.float32)  # batch_size
            # scores: batch_size * vocab2_size; scores[k]: vocab2_size
            # scores2/scores3 are fetched but never read below.
            memory, output, scores, scores2, scores3, sample_word = sess.run(
                [
                    self.memory, self.output, self.probs, self.probs2,
                    self.probs3, self.sample_word
                ],
                feed_dict={
                    self.b_ctx1: cont1,
                    self.b_ctx2: cont2,
                    self.b_ctx3: cont3,
                    self.last_word: last_word,
                    self.last_memory: last_memory,
                    self.last_output: last_output
                })

            # Expand each candidate with the sampled next words.
            for k in range(config.batch_size):
                caption_data = partial_caption_data_lists[k][b]
                # NOTE: this enumerate() result is immediately overwritten
                # by the sampled (word, prob) pairs two lines down.
                words_and_scores = list(
                    enumerate(scores[k])
                )  # scores: (i.e. prob); words_and_scores: (idx, prob)
                samples = sample_word[k]
                words_and_scores = [
                    (_word, scores[k][_word]) for _word in samples
                ]  # scores[k][_word]: probability of each sampled word
                for w, s in words_and_scores:
                    sentence = caption_data.sentence + [w]
                    # Beam score is the product of step probabilities.
                    score = caption_data.score * s
                    beam = LogicData(
                        sentence,
                        memory[k],  # new memory
                        output[k],  # new output
                        score)
                    if vocabulary.words[w] == 'stop':  # mark the end
                        # Finished beams go to the complete heap AND stay
                        # in the partial heap, keeping its size at
                        # beam_size for the assertion above.
                        complete_caption_data[k].push(beam)
                        partial_caption_data[k].push(beam)
                    else:
                        partial_caption_data[k].push(beam)

    results = []
    for k in range(config.batch_size):
        # Fall back to unfinished beams if nothing ever emitted 'stop'.
        if complete_caption_data[k].size() == 0:
            complete_caption_data[k] = partial_caption_data[k]
        results.append(complete_caption_data[k].extract(sort=True))
    return results