def init_search_sentence(self, initial_memory, initial_output):
    """Reset the beam-search state for a new batch.

    Builds, for every image in the batch, a partial-caption priority queue
    seeded with a single beam containing only the start token, and an empty
    queue for completed captions. Both queues keep at most `self.beam_size`
    entries.

    Args:
        initial_memory: per-image initial LSTM memory states (indexable by
            batch position).
        initial_output: per-image initial LSTM outputs (indexable by batch
            position).
    """
    partial, complete = [], []
    for img in range(self.batch_size):
        seed_beam = CaptionData(sentence=[self.start_id],
                                memory=initial_memory[img],
                                output=initial_output[img],
                                score=0,
                                alphas=[])
        queue = TopN(self.beam_size)
        queue.push(seed_beam)
        partial.append(queue)
        complete.append(TopN(self.beam_size))
    self.partial_caption_data = partial
    self.complete_caption_data = complete
def add_result(self, beam_idx, memory, output, scores, alpha):
    """Expand beam `beam_idx` of every image with its best next words.

    For each image, ranks the vocabulary probabilities in `scores` and
    extends the parent beam with the top `beam_size + 1` candidates
    (one extra so the beam is not starved if the end token is among them
    — presumably; confirm against the original design). Scores accumulate
    as negative log2 probabilities, so lower is better.

    Args:
        beam_idx: which beam of each image's extracted beam list to extend.
        memory: per-image new LSTM memory states.
        output: per-image new LSTM outputs.
        scores: per-image next-word probability distributions.
        alpha: per-image attention weights for this step.
    """
    # Find the beam_size most probable next words
    for img in range(self.batch_size):
        parent = self.partial_caption_data_lists[img][beam_idx]
        ranked = sorted(enumerate(scores[img]),
                        key=lambda ws: ws[1],
                        reverse=True)
        # Append each of these words to the current partial caption
        for word, prob in ranked[:self.beam_size + 1]:
            candidate = CaptionData(parent.sentence + [word],
                                    memory[img],
                                    output[img],
                                    parent.score - np.log2(prob),
                                    parent.alphas + [alpha[img]])
            if word == self.end_id:
                self.complete_caption_data[img].push(candidate)
            else:
                self.partial_caption_data[img].push(candidate)
def beam_search(self, sess, vocabulary):
    """Use beam search to generate the captions for a batch of images.

    Returns a list of length `config.batch_size`; each element is the
    sorted list of best CaptionData beams for that image.
    """
    # Feed in the images to get the contexts and the initial LSTM states
    config = self.config
    # NOTE(review): no feed_dict here — presumably self.conv_feats is driven
    # by an input queue/pipeline already attached to the graph; confirm.
    contexts, initial_memory, initial_output = sess.run(
        [self.conv_feats, self.initial_memory, self.initial_output])

    # One TopN queue of in-progress beams and one of finished beams per image.
    partial_caption_data = []
    complete_caption_data = []
    for k in range(config.batch_size):
        # Seed beam: empty sentence, score 1.0 (scores multiply below, so
        # the running score is a product of word probabilities).
        initial_beam = CaptionData(sentence=[],
                                   memory=initial_memory[k],
                                   output=initial_output[k],
                                   score=1.0)
        partial_caption_data.append(TopN(config.beam_size))
        partial_caption_data[-1].push(initial_beam)
        complete_caption_data.append(TopN(config.beam_size))

    # Run beam search
    for idx in range(config.max_caption_length):
        # Snapshot and clear the current beams; new extensions are pushed
        # back into partial_caption_data during this step.
        partial_caption_data_lists = []
        for k in range(config.batch_size):
            data = partial_caption_data[k].extract()
            partial_caption_data_lists.append(data)
            partial_caption_data[k].reset()

        # On the first step there is only the single seed beam per image.
        num_steps = 1 if idx == 0 else config.beam_size
        for b in range(num_steps):
            if idx == 0:
                # Every image starts from the start token.
                last_word = vocabulary.start_id * np.ones(
                    (config.batch_size), np.int32)
            else:
                # Continue each image's b-th beam from its last word.
                last_word = np.array([
                    pcl[b].sentence[-1]
                    for pcl in partial_caption_data_lists
                ], np.int32)

            last_memory = np.array(
                [pcl[b].memory for pcl in partial_caption_data_lists],
                np.float32)
            last_output = np.array(
                [pcl[b].output for pcl in partial_caption_data_lists],
                np.float32)

            # One decoder step for beam b of every image at once.
            memory, output, scores = sess.run(
                [self.memory, self.output, self.probs],
                feed_dict={
                    self.contexts: contexts,
                    self.last_word: last_word,
                    self.last_memory: last_memory,
                    self.last_output: last_output
                })

            # Find the beam_size most probable next words
            for k in range(config.batch_size):
                caption_data = partial_caption_data_lists[k][b]
                words_and_scores = list(enumerate(scores[k]))
                words_and_scores.sort(key=lambda x: -x[1])
                # Keeps beam_size + 1 candidates — presumably one extra in
                # case the end token is among them; confirm intent.
                words_and_scores = words_and_scores[0:config.beam_size + 1]

                # Append each of these words to the current partial caption
                for w, s in words_and_scores:
                    sentence = caption_data.sentence + [w]
                    # Product of probabilities; no length normalization, so
                    # shorter captions are implicitly favored.
                    score = caption_data.score * s
                    beam = CaptionData(sentence, memory[k], output[k], score)
                    if w == vocabulary.end_id:
                        complete_caption_data[k].push(beam)
                    else:
                        partial_caption_data[k].push(beam)

    results = []
    for k in range(config.batch_size):
        # If no caption ever emitted the end token, fall back to the best
        # partial captions.
        if complete_caption_data[k].size() == 0:
            complete_caption_data[k] = partial_caption_data[k]
        results.append(complete_caption_data[k].extract(sort=True))

    return results
def beam_search(self, sess, image_files, vocabulary):
    """Use beam search to generate the captions for a batch of images.

    Variant that loads the images itself from `image_files`, treats the
    literal word '.' as the caption terminator, and dumps the attention
    alphas of the final decoding state to 'ResultProcess/alpha.pickle'.
    Returns a list of length `config.batch_size`; each element is the
    sorted list of best CaptionData beams for that image.
    """
    # Feed in the images to get the contexts and the initial LSTM states
    config = self.config
    images = self.image_loader.load_images(image_files)
    contexts, initial_memory, initial_output = sess.run(
        [self.conv_feats, self.initial_memory, self.initial_output],
        feed_dict={self.images: images})

    # One TopN queue of in-progress beams and one of finished beams per image.
    partial_caption_data = []
    complete_caption_data = []
    alpha_all = []          # collected attention maps (pickled below)
    last_output_all = []    # collected LSTM outputs (pickling commented out)
    for k in range(config.batch_size):
        # Seed beam: empty sentence, score 1.0 (scores multiply below, so
        # the running score is a product of word probabilities).
        initial_beam = CaptionData(sentence=[],
                                   memory=initial_memory[k],
                                   output=initial_output[k],
                                   score=1.0)
        partial_caption_data.append(TopN(config.beam_size))
        partial_caption_data[-1].push(initial_beam)
        complete_caption_data.append(TopN(config.beam_size))

    # Run beam search
    for idx in range(config.max_caption_length):
        # Snapshot and clear the current beams; new extensions are pushed
        # back into partial_caption_data during this step.
        partial_caption_data_lists = []
        for k in range(config.batch_size):
            data = partial_caption_data[k].extract()
            partial_caption_data_lists.append(data)
            partial_caption_data[k].reset()

        # On the first step there is only the single seed beam per image.
        num_steps = 1 if idx == 0 else config.beam_size
        for b in range(num_steps):
            if idx == 0:
                # Starts from word id 0 — presumably 0 is the start token
                # in this vocabulary; TODO confirm.
                last_word = np.zeros((config.batch_size), np.int32)
            else:
                # Continue each image's b-th beam from its last word.
                last_word = np.array([
                    pcl[b].sentence[-1]
                    for pcl in partial_caption_data_lists
                ], np.int32)

            last_memory = np.array(
                [pcl[b].memory for pcl in partial_caption_data_lists],
                np.float32)
            last_output = np.array(
                [pcl[b].output for pcl in partial_caption_data_lists],
                np.float32)

            # One decoder step for beam b of every image at once.
            memory, output, scores = sess.run(
                [self.memory, self.output, self.probs],
                feed_dict={
                    self.contexts: contexts,
                    self.last_word: last_word,
                    self.last_memory: last_memory,
                    self.last_output: last_output
                })

            # Find the beam_size most probable next words
            for k in range(config.batch_size):
                caption_data = partial_caption_data_lists[k][b]
                words_and_scores = list(enumerate(scores[k]))
                words_and_scores.sort(key=lambda x: -x[1])
                # Keeps beam_size + 1 candidates — presumably one extra in
                # case the terminator is among them; confirm intent.
                words_and_scores = words_and_scores[0:config.beam_size + 1]

                # Append each of these words to the current partial caption
                for w, s in words_and_scores:
                    sentence = caption_data.sentence + [w]
                    # Product of probabilities; no length normalization.
                    score = caption_data.score * s
                    beam = CaptionData(sentence, memory[k], output[k], score)
                    # A caption is "complete" when it emits the period word
                    # (string comparison, unlike the end_id check in the
                    # sibling beam_search).
                    if vocabulary.words[w] == '.':
                        complete_caption_data[k].push(beam)
                    else:
                        partial_caption_data[k].push(beam)

    results = []
    # NOTE(review): placement reconstructed from a collapsed source line —
    # this runs once after the search loop, so it captures alphas only for
    # the FINAL last_memory/last_output; if per-step alphas were intended,
    # this belongs inside the idx loop. Confirm against the original layout.
    alpha_status = sess.run(self.alpha,
                            feed_dict={
                                self.contexts: contexts,
                                self.last_memory: last_memory,
                                self.last_output: last_output
                            })
    alpha_all.append(alpha_status)
    last_output_all.append(last_output)
    # NOTE(review): hard-coded relative path, builtin `file` shadowed, and
    # no `with` block — the dump overwrites on every call.
    file = open('ResultProcess/alpha.pickle', 'wb')
    pickle.dump(alpha_all, file)
    file.close()
    # file = open('ResultProcess/last_output_all.pickle', 'wb')
    # pickle.dump(last_output_all, file)
    # file.close()
    for k in range(config.batch_size):
        # If no caption ever emitted '.', fall back to the best partials.
        if complete_caption_data[k].size() == 0:
            complete_caption_data[k] = partial_caption_data[k]
        results.append(complete_caption_data[k].extract(sort=True))

    return results