def get_Count_result(self, chunk):
    """Build a Count-task batch (video features, question tokens, numeric answer) for the given chunk of keys."""
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()), dtype=np.float32)
    # Question tensors: both left-aligned and right-aligned copies are kept.
    batch_question = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_question_right = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_question_mask = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    batch_answer = np.zeros([batch_size, 1])

    video_lengths = []
    question_lengths = []

    for k in xrange(batch_size):
        key = chunk[k]

        video_feature = self.get_video_feature(key)
        video_mask = self.get_video_mask(video_feature)
        vl = min(self.max_length, video_feature.shape[0])
        video_lengths.append(vl)
        batch_video_feature_convmap[k, :] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_video_mask[k] = video_mask

        # Count answers are clipped to be at least 1.
        answer = max(self.get_Count_answer(key), 1)
        question = self.get_Count_question(key)
        question_mask = self.get_Count_question_mask(question)

        # Left align
        batch_question[k, :len(question)] = question
        # Right align
        batch_question_right[k, -len(question):] = question
        batch_question_mask[k] = question_mask
        question_lengths.append(len(question))

        batch_answer[k] = answer
        batch_debug_sent[k] = self.data_df.loc[key, 'question']

    ret = {
        'ids': chunk,
        'video_lengths': video_lengths,
        'video_features': batch_video_feature_convmap,
        'question_words': batch_question,
        'question_words_right': batch_question_right,
        'question_lengths': question_lengths,
        'video_mask': batch_video_mask,
        'question_mask': batch_question_mask,
        'answer': batch_answer,
        'debug_sent': batch_debug_sent
    }
    return ret
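# A minimal usage sketch for the Count batch built above (illustrative only --
# `dataset` stands for an instance of this class and `chunk` for a list of
# valid keys; neither name is defined in this file):
#
#   batch = dataset.get_Count_result(chunk)
#   # batch['video_features'] : [batch_size] + video_feature_dimension, float32
#   # batch['question_words'] : [batch_size, max_length], left-aligned word ids
#   # batch['answer']         : [batch_size, 1], counts clipped to >= 1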
def get_Trans_result(self, chunk):
    """Build a Transition multiple-choice batch (five candidate sentences per question) for the given chunk of keys."""
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()), dtype=np.float32)
    batch_candidates = np.zeros([batch_size, 5, self.max_length], dtype=np.uint32)
    batch_candidates_right = np.zeros([batch_size, 5, self.max_length], dtype=np.uint32)
    batch_answer = np.zeros([batch_size], dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_candidates_mask = np.zeros([batch_size, 5, self.max_length], dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    batch_raw_sentences = np.asarray([[None] * 5 for _ in range(batch_size)])  # [batch_size, 5]
    batch_row_indices = np.asarray([-1] * batch_size)

    video_lengths = []
    candidate_lengths = []
    batch_questions = []
    question_word_nums = []

    for k in xrange(batch_size):
        key = chunk[k]
        MC_dict = self.get_Trans_dict(key)

        # Each entry of `candidates` is a word-index sequence of the form
        # "<question tokens> <candidate answer tokens> <EOS>",
        # e.g. "what does the woman do 4 times ? flap hands <EOS>".
        candidates = MC_dict['candidates']

        question = MC_dict['question']
        qnum = len(question.split())
        question_word_nums.append(qnum)

        # `raw_sentences` holds the five candidate answers as plain strings,
        # e.g. ['swing jacket', 'jump', 'shake fingers', 'paw the air', 'hit man on right'].
        raw_sentences = MC_dict['raw_sentences']

        # The answer is a choice index in [0, 4] over the five candidates.
        answer = int(MC_dict['answer'])

        video_feature = self.get_video_feature(key)
        candidates_matrix = self.get_Trans_matrix(candidates)
        candidates_matrix_right = self.get_Trans_matrix(candidates, is_left=False)

        vl = min(self.max_length, video_feature.shape[0])
        video_lengths.append(vl)

        # Candidate lengths, clipped to max_length.
        cand_lens = [min(self.max_length, len(cand)) for cand in candidates]
        candidate_lengths.append(cand_lens)

        video_mask = self.get_video_mask(video_feature)
        candidates_mask = self.get_Trans_mask(candidates)

        batch_video_feature_convmap[k, :] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_candidates[k] = candidates_matrix
        batch_candidates_right[k] = candidates_matrix_right
        batch_raw_sentences[k, :] = raw_sentences
        batch_answer[k] = answer
        batch_video_mask[k] = video_mask
        batch_candidates_mask[k] = candidates_mask
        batch_row_indices[k] = MC_dict['row_indices']
        batch_questions.append(question)
        batch_debug_sent[k] = self.data_df.loc[key, 'a' + str(int(answer + 1))]

    ret = {
        'ids': chunk,
        'video_lengths': video_lengths,
        'video_features': batch_video_feature_convmap,
        'candidates': batch_candidates,
        'candidates_right': batch_candidates_right,
        'candidate_lengths': candidate_lengths,
        'answer': batch_answer,
        'raw_sentences': batch_raw_sentences,
        'video_mask': batch_video_mask,
        'candidates_mask': batch_candidates_mask,
        'debug_sent': batch_debug_sent,
        'row_indices': batch_row_indices,
        'question': batch_questions,
        'num_mult_choices': 5,
        'question_word_nums': question_word_nums,
    }
    return ret
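# A minimal usage sketch for the Transition (multiple-choice) batch built above
# (illustrative only -- `dataset` and `chunk` are assumed names, as before):
#
#   batch = dataset.get_Trans_result(chunk)
#   # batch['candidates']       : [batch_size, 5, max_length] word ids per choice
#   # batch['answer']           : [batch_size] index of the correct choice (0-4)
#   # batch['num_mult_choices'] : always 5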
def get_FrameQA_result(self, chunk):
    """Build a FrameQA batch (open-ended answers mapped through ans2idx) for the given chunk of keys."""
    batch_size = len(chunk)
    batch_video_feature_convmap = np.zeros(
        [batch_size] + list(self.get_video_feature_dimension()), dtype=np.float32)
    # Question tensors: both left-aligned and right-aligned copies are kept.
    batch_question = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_question_right = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_video_mask = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_question_mask = np.zeros([batch_size, self.max_length], dtype=np.uint32)
    batch_debug_sent = np.asarray([None] * batch_size)
    batch_answer = np.zeros([batch_size, 1])
    batch_answer_type = np.zeros([batch_size, 1])

    questions = []
    video_lengths = []
    question_lengths = []

    for k in xrange(batch_size):
        key = chunk[k]

        video_feature = self.get_video_feature(key)
        video_mask = self.get_video_mask(video_feature)
        vl = min(self.max_length, video_feature.shape[0])
        video_lengths.append(vl)
        batch_video_feature_convmap[k, :] = data_util.pad_video(
            video_feature, self.get_video_feature_dimension())
        batch_video_mask[k] = video_mask

        answer, answer_type = self.get_answer(key)
        if str(answer) in self.ans2idx:
            answer = self.ans2idx[str(answer)]
        else:
            # Unknown answer token; map to index 1 (check later).
            answer = 1

        question = self.get_question(key)
        question_mask = self.get_question_mask(question)
        question_lengths.append(len(question))

        # Left align
        batch_question[k, :len(question)] = question
        # Right align
        batch_question_right[k, -len(question):] = question
        batch_question_mask[k] = question_mask

        question_pad = np.zeros([self.max_length])
        question_pad[:len(question)] = question
        questions.append(question_pad)

        batch_answer[k] = answer
        batch_answer_type[k] = float(int(answer_type))
        batch_debug_sent[k] = self.data_df.loc[key, 'question']

    ret = {
        'ids': chunk,
        'video_lengths': video_lengths,
        'video_features': batch_video_feature_convmap,
        'question_words': batch_question,
        'question_words_right': batch_question_right,
        'question_lengths': question_lengths,
        'video_mask': batch_video_mask,
        'question_mask': batch_question_mask,
        'answer': batch_answer,
        'answer_type': batch_answer_type,
        'debug_sent': batch_debug_sent
    }
    return ret
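# A minimal usage sketch for the FrameQA batch built above (illustrative only --
# `dataset` and `chunk` are assumed names, as before):
#
#   batch = dataset.get_FrameQA_result(chunk)
#   # batch['question_words'] : [batch_size, max_length] left-aligned word ids
#   # batch['answer']         : [batch_size, 1] answer index from ans2idx (1 = unknown)
#   # batch['answer_type']    : [batch_size, 1] question-type id stored as float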