def get_MC_result(self, chunk):
    """Build a Multiple-Choice batch for the video ids in `chunk`.

    The ground-truth candidate is always swapped into slot 0, so the
    returned 'answer' array is all zeros by construction; the candidate
    matrices, masks and raw sentences are swapped consistently.

    Args:
        chunk: sequence of dataset keys (one per batch row).

    Returns:
        dict with batched video features, 5-way candidate matrices,
        masks, raw sentences, answers (all 0) and row indices.
    """
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()),
        dtype=np.float32)
    batch_candidates = np.zeros([batch_size, 5, self.max_length],
                                dtype=np.uint32)
    batch_answer = np.zeros([batch_size], dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_vid_length],
                                dtype=np.uint32)
    batch_candidates_mask = np.zeros([batch_size, 5, self.max_length],
                                     dtype=np.uint32)
    batch_raw_sentences = np.asarray([[None] * 5 for _ in range(batch_size)])
    batch_row_indices = np.asarray([-1] * batch_size)
    # NOTE: a `batch_debug_sent` buffer existed here but was never written
    # to nor returned — removed as dead code.
    for k in xrange(batch_size):
        key = chunk[k]
        MC_dict = self.get_MC_dict(key)
        candidates = MC_dict['candidates']
        raw_sentences = MC_dict['raw_sentences']
        answer = MC_dict['answer']
        video_feature = self.get_video_feature(key)
        candidates_matrix = self.get_MC_matrix(candidates)
        video_mask = self.get_video_mask(video_feature)
        candidates_mask = self.get_MC_mask(candidates)
        batch_video_feature_convmap[k] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_candidates[k] = candidates_matrix
        batch_raw_sentences[k, :] = raw_sentences
        # Label is always 0: the true candidate is moved to slot 0 below.
        batch_answer[k] = 0
        batch_video_mask[k] = video_mask
        batch_candidates_mask[k] = candidates_mask
        batch_row_indices[k] = MC_dict['row_indices']
        if answer != 0:
            # Swap candidate `answer` with candidate 0 (tokens, mask, text)
            # using NumPy fancy indexing so all three stay consistent.
            batch_candidates[k, [0, answer], :] = \
                batch_candidates[k, [answer, 0], :]
            batch_candidates_mask[k, [0, answer], :] = \
                batch_candidates_mask[k, [answer, 0], :]
            batch_raw_sentences[k, [0, answer]] = \
                batch_raw_sentences[k, [answer, 0]]
    ret = {
        'ids': chunk,
        'video_features': batch_video_feature_convmap,
        'candidates': batch_candidates,
        'raw_sentences': batch_raw_sentences,
        'answer': batch_answer,
        'video_mask': batch_video_mask,
        'candidates_mask': batch_candidates_mask,
        'row_indices': batch_row_indices
    }
    return ret
def get_FIB_result(self, chunk):
    """Build a Fill-In-the-Blank batch for the keys in `chunk`.

    Returns padded blank sentences, one-hot-style answer rows sized by
    the vocabulary (`self.word_matrix.shape[0]`), video/sentence masks
    (forward and reversed) and the raw sentence for debugging.
    """
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()),
        dtype=np.float32)
    batch_blank_sent = np.zeros([batch_size, self.max_length],
                                dtype=np.uint32)
    batch_answer = np.zeros([batch_size, self.word_matrix.shape[0]],
                            dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_vid_length],
                                dtype=np.uint32)
    batch_blank_sent_mask = np.zeros([batch_size, self.max_length],
                                     dtype=np.uint32)
    batch_reverse_blank_sent_mask = np.zeros([batch_size, self.max_length],
                                             dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    for k in xrange(batch_size):
        key = chunk[k]
        video_feature = self.get_video_feature(key)
        blank_sent = self.get_blank_sentence(key)
        answer = self.get_blank_answer(key)
        video_mask = self.get_video_mask(video_feature)
        blank_sent_mask = self.get_blank_sent_mask(blank_sent)
        reverse_blank_sent_mask = self.get_reverse_blank_sent_mask(
            blank_sent)
        batch_video_feature_convmap[k] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        # Left-align the sentence tokens; the tail stays zero-padded.
        batch_blank_sent[k, :len(blank_sent)] = blank_sent
        batch_answer[k] = answer
        batch_video_mask[k] = video_mask
        batch_blank_sent_mask[k] = blank_sent_mask
        batch_reverse_blank_sent_mask[k] = reverse_blank_sent_mask
        batch_debug_sent[k] = self.data_df.iloc[key]['sentence']
    # NOTE: the original dict listed 'answer' twice; the duplicate key
    # (same value) has been removed.
    ret = {
        'ids': chunk,
        'video_features': batch_video_feature_convmap,
        'blank_sent': batch_blank_sent,
        'answer': batch_answer,
        'video_mask': batch_video_mask,
        'blank_sent_mask': batch_blank_sent_mask,
        'debug_sent': batch_debug_sent,
        'reverse_blank_sent_mask': batch_reverse_blank_sent_mask
    }
    return ret
def get_Count_result(self, chunk):
    """Build a Counting-task batch for the keys in `chunk`.

    Produces left- and right-aligned question token matrices, the video
    features/masks, and a scalar count answer (clamped to be >= 1).
    """
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()),
        dtype=np.float32)
    # Question tokens, kept in both left- and right-aligned layouts.
    batch_question = np.zeros([batch_size, self.max_length],
                              dtype=np.uint32)
    batch_question_right = np.zeros([batch_size, self.max_length],
                                    dtype=np.uint32)
    # NOTE(review): sized with max_length here, while sibling methods use
    # max_vid_length for the video mask — confirm these are equal for
    # this dataset.
    batch_video_mask = np.zeros([batch_size, self.max_length],
                                dtype=np.uint32)
    batch_question_mask = np.zeros([batch_size, self.max_length],
                                   dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    batch_answer = np.zeros([batch_size, 1])
    for k, key in enumerate(chunk):
        video_feature = self.get_video_feature(key)
        batch_video_feature_convmap[k, :] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_video_mask[k] = self.get_video_mask(video_feature)
        # Counts below 1 are clamped to 1.
        batch_answer[k] = max(self.get_Count_answer(key), 1)
        question = self.get_Count_question(key)
        batch_question_mask[k] = self.get_Count_question_mask(question)
        n_tokens = len(question)
        batch_question[k, :n_tokens] = question        # left-aligned
        batch_question_right[k, -n_tokens:] = question  # right-aligned
        batch_debug_sent[k] = self.data_df.loc[key, 'question']
    return {
        'ids': chunk,
        'video_features': batch_video_feature_convmap,
        'question_words': batch_question,
        'question_words_right': batch_question_right,
        'video_mask': batch_video_mask,
        'question_mask': batch_question_mask,
        'answer': batch_answer,
        'debug_sent': batch_debug_sent
    }
def get_CAP_result(self, chunk):
    """Build a Captioning batch for the keys in `chunk`.

    For the 'blind' dataset no captions exist, so the caption arrays
    stay zero-filled and only video features/masks are populated.
    """
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()),
        dtype=np.float32)
    batch_caption = np.zeros([batch_size, self.max_length],
                             dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_vid_length],
                                dtype=np.uint32)
    batch_caption_mask = np.zeros([batch_size, self.max_length],
                                  dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    for k in xrange(batch_size):
        key = chunk[k]
        video_feature = self.get_video_feature(key)
        video_mask = self.get_video_mask(video_feature)
        batch_video_feature_convmap[k] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_video_mask[k] = video_mask
        if self.dataset_name != 'blind':
            try:
                caption = self.get_description(key)
                caption_mask = self.get_sentence_mask(caption)
            # Narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit are not swallowed; still aborts on bad keys.
            except Exception:
                print(key)
                sys.exit()
            batch_caption[k, :len(caption)] = caption
            batch_caption_mask[k] = caption_mask
            batch_debug_sent[k] = self.data_df.iloc[key]['description']
    ret = {
        'ids': chunk,
        'video_features': batch_video_feature_convmap,
        'caption_words': batch_caption,
        'video_mask': batch_video_mask,
        'caption_mask': batch_caption_mask,
        'debug_sent': batch_debug_sent
    }
    return ret
def get_RET_result(self, y_keys, x_keys, neg=True):
    """Build a Retrieval batch pairing videos (`y_keys`) with captions
    (`x_keys`).

    Args:
        y_keys: keys of the videos in this batch.
        x_keys: keys of the captions paired with each video.
        neg: kept for interface compatibility; not used in this body.

    Returns:
        dict of batched features, caption tokens and masks.
    """
    batch_size = len(y_keys)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()),
        dtype=np.float32)
    batch_caption = np.zeros([batch_size, self.max_length],
                             dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_length],
                                dtype=np.uint32)
    batch_caption_mask = np.zeros([batch_size, self.max_length],
                                  dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    for k in xrange(batch_size):
        x_key = x_keys[k]
        y_key = y_keys[k]
        video_feature = self.get_video_feature(y_key)
        video_mask = self.get_video_mask(video_feature)
        caption = self.get_description(x_key)
        caption_mask = self.get_sentence_mask(caption)
        batch_video_feature_convmap[k, :] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_caption[k, :len(caption)] = caption
        batch_video_mask[k] = video_mask
        batch_caption_mask[k] = caption_mask
        batch_debug_sent[k] = self.data_df.iloc[x_key]['description']
    ret = {
        # BUG FIX: was `y_key` (the leaked loop variable — last element
        # only); every sibling method returns the full id list.
        'ids': y_keys,
        'video_features': batch_video_feature_convmap,
        'caption_words': batch_caption,
        'video_mask': batch_video_mask,
        'caption_mask': batch_caption_mask,
        'debug_sent': batch_debug_sent
    }
    return ret
def get_Trans_result(self, chunk):
    """Build a Transition (5-way multiple choice) batch for `chunk`.

    Collects left- and right-aligned candidate matrices, their masks,
    the raw sentences, answer indices, questions and debug sentences.
    """
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()),
        dtype=np.float32)
    batch_candidates = np.zeros([batch_size, 5, self.max_length],
                                dtype=np.uint32)
    batch_candidates_right = np.zeros([batch_size, 5, self.max_length],
                                      dtype=np.uint32)
    batch_answer = np.zeros([batch_size], dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_length],
                                dtype=np.uint32)
    batch_candidates_mask = np.zeros([batch_size, 5, self.max_length],
                                     dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    # [batch_size, 5] object array of candidate sentence strings.
    batch_raw_sentences = np.asarray(
        [[None] * 5 for _ in range(batch_size)])
    batch_row_indices = np.asarray([-1] * batch_size)
    batch_questions = []
    for k, key in enumerate(chunk):
        trans_dict = self.get_Trans_dict(key)
        cands = trans_dict['candidates']
        answer = int(trans_dict['answer'])
        video_feature = self.get_video_feature(key)
        batch_video_feature_convmap[k, :] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_video_mask[k] = self.get_video_mask(video_feature)
        batch_candidates[k] = self.get_Trans_matrix(cands)
        batch_candidates_right[k] = self.get_Trans_matrix(
            cands, is_left=False)
        batch_candidates_mask[k] = self.get_Trans_mask(cands)
        batch_raw_sentences[k, :] = trans_dict['raw_sentences']
        batch_answer[k] = answer
        batch_row_indices[k] = trans_dict['row_indices']
        batch_questions.append(trans_dict['question'])
        # Answer columns in the dataframe are named 'a1'..'a5'.
        batch_debug_sent[k] = self.data_df.loc[
            key, 'a' + str(int(answer + 1))]
    return {
        'ids': chunk,
        'video_features': batch_video_feature_convmap,
        'candidates': batch_candidates,
        'candidates_right': batch_candidates_right,
        'answer': batch_answer,
        'raw_sentences': batch_raw_sentences,
        'video_mask': batch_video_mask,
        'candidates_mask': batch_candidates_mask,
        'debug_sent': batch_debug_sent,
        'row_indices': batch_row_indices,
        'question': batch_questions,
    }
def get_FrameQA_result(self, chunk):
    """Build a FrameQA batch for the keys in `chunk`.

    NOTE(review): an identical `get_FrameQA_result` is defined again
    later in this file and shadows this one — the two should be merged.

    Answers not found in `self.ans2idx` map to index 1 (unknown token).
    `answer_type` is a per-question category id from the dataset CSV.
    """
    batch_size = len(chunk)
    # [batch_size] + feature dimension, e.g. (batch_size, 7, 7, 2048).
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()),
        dtype=np.float32)
    # Question tokens, kept in both left- and right-aligned layouts.
    batch_question = np.zeros([batch_size, self.max_length],
                              dtype=np.uint32)
    batch_question_right = np.zeros([batch_size, self.max_length],
                                    dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_length],
                                dtype=np.uint32)
    batch_question_mask = np.zeros([batch_size, self.max_length],
                                   dtype=np.uint32)
    # Raw question text, for debugging only.
    batch_debug_sent = np.asarray([None] * batch_size)
    # Multi-word answers are not tokenized here; each row holds one id.
    batch_answer = np.zeros([batch_size, 1])
    batch_answer_type = np.zeros([batch_size, 1])
    for k in xrange(batch_size):
        key = chunk[k]
        # Per-video feature of shape [video_length, 7, 7, 2048];
        # video_length varies per video until padded below.
        video_feature = self.get_video_feature(key)
        video_mask = self.get_video_mask(video_feature)
        batch_video_feature_convmap[k, :] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        # Records whether each video was padded or cut to fixed length.
        batch_video_mask[k] = video_mask
        answer, answer_type = self.get_answer(key)
        if str(answer) in self.ans2idx:
            answer = self.ans2idx[answer]
        else:
            # Unknown token — check later.
            answer = 1
        question = self.get_question(key)
        question_mask = self.get_question_mask(question)
        # Left align.
        batch_question[k, :len(question)] = question
        # Right align.
        batch_question_right[k, -len(question):] = question
        batch_question_mask[k] = question_mask
        # (A `questions` list of padded copies was built here but never
        # used — removed as dead code.)
        batch_answer[k] = answer
        batch_answer_type[k] = float(int(answer_type))
        batch_debug_sent[k] = self.data_df.loc[key, 'question']
    ret = {
        'ids': chunk,
        'video_features': batch_video_feature_convmap,
        'question_words': batch_question,
        'question_words_right': batch_question_right,
        'video_mask': batch_video_mask,
        'question_mask': batch_question_mask,
        'answer': batch_answer,
        'answer_type': batch_answer_type,
        'debug_sent': batch_debug_sent
    }
    return ret
def get_FrameQA_result(self, chunk):
    """Build a FrameQA batch for the keys in `chunk`.

    NOTE(review): this is a duplicate of an earlier, identically-named
    definition (which it shadows) — the two should be merged.

    Answers not found in `self.ans2idx` map to index 1 (unknown token).
    """
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()),
        dtype=np.float32)
    # Question tokens, kept in both left- and right-aligned layouts.
    batch_question = np.zeros([batch_size, self.max_length],
                              dtype=np.uint32)
    batch_question_right = np.zeros([batch_size, self.max_length],
                                    dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_length],
                                dtype=np.uint32)
    batch_question_mask = np.zeros([batch_size, self.max_length],
                                   dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    batch_answer = np.zeros([batch_size, 1])
    batch_answer_type = np.zeros([batch_size, 1])
    for k in xrange(batch_size):
        key = chunk[k]
        video_feature = self.get_video_feature(key)
        video_mask = self.get_video_mask(video_feature)
        batch_video_feature_convmap[k, :] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_video_mask[k] = video_mask
        answer, answer_type = self.get_answer(key)
        if str(answer) in self.ans2idx:
            answer = self.ans2idx[answer]
        else:
            # Unknown token — check later.
            answer = 1
        question = self.get_question(key)
        question_mask = self.get_question_mask(question)
        # Left align.
        batch_question[k, :len(question)] = question
        # Right align.
        batch_question_right[k, -len(question):] = question
        batch_question_mask[k] = question_mask
        # (A `questions` list of padded copies was built here but never
        # used — removed as dead code.)
        batch_answer[k] = answer
        batch_answer_type[k] = float(int(answer_type))
        batch_debug_sent[k] = self.data_df.loc[key, 'question']
    ret = {
        'ids': chunk,
        'video_features': batch_video_feature_convmap,
        'question_words': batch_question,
        'question_words_right': batch_question_right,
        'video_mask': batch_video_mask,
        'question_mask': batch_question_mask,
        'answer': batch_answer,
        'answer_type': batch_answer_type,
        'debug_sent': batch_debug_sent
    }
    return ret