Exemplo n.º 1
0
    def get_Count_result(self, chunk):
        """Assemble the batch dictionary for the Count task.

        Args:
            chunk: Sequence of sample keys. Each key is handed to the
                per-sample getters on ``self`` and is used as a row label
                of ``self.data_df``.

        Returns:
            dict with the following entries:
                ids: ``chunk`` itself.
                video_lengths: list with, per sample, the first dimension
                    of the video feature clipped to ``self.max_length``.
                video_features: float32 array shaped
                    ``[batch_size] + get_video_feature_dimension()``.
                question_words: uint32 ``[batch_size, max_length]`` array,
                    question ids written from the left, zero elsewhere.
                question_words_right: same ids written flush to the right.
                question_lengths: list of ``len(question)`` per sample.
                video_mask / question_mask: uint32
                    ``[batch_size, max_length]`` arrays filled from
                    ``get_video_mask`` / ``get_Count_question_mask``.
                answer: ``[batch_size, 1]`` array; values below 1 are
                    raised to 1.
                debug_sent: object array holding the ``'question'`` column
                    of ``self.data_df`` for each key.
        """
        batch_size = len(chunk)
        # The feature dimension does not depend on the sample; fetch it once.
        feature_dim = self.get_video_feature_dimension()
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(feature_dim), dtype=np.float32)

        text_shape = [batch_size, self.max_length]
        batch_question = np.zeros(text_shape, dtype=np.uint32)
        batch_question_right = np.zeros(text_shape, dtype=np.uint32)
        batch_video_mask = np.zeros(text_shape, dtype=np.uint32)
        batch_question_mask = np.zeros(text_shape, dtype=np.uint32)

        batch_debug_sent = np.asarray([None] * batch_size)
        batch_answer = np.zeros([batch_size, 1])

        video_lengths = []
        question_lengths = []

        # enumerate() instead of xrange(): identical iteration on Python 2,
        # and it does not raise NameError on Python 3.
        for k, key in enumerate(chunk):
            video_feature = self.get_video_feature(key)
            video_mask = self.get_video_mask(video_feature)

            video_lengths.append(min(self.max_length, video_feature.shape[0]))

            batch_video_feature_convmap[k, :] = data_util.pad_video(
                video_feature, feature_dim)
            batch_video_mask[k] = video_mask

            # Counts smaller than 1 are clamped to 1.
            answer = max(self.get_Count_answer(key), 1)

            question = self.get_Count_question(key)
            question_mask = self.get_Count_question_mask(question)
            q_len = len(question)
            # NOTE(review): questions longer than self.max_length still fail
            # on assignment; presumably get_Count_question limits the
            # length - confirm.
            # Left align.
            batch_question[k, :q_len] = question
            # Right align. Guard the empty case: ``-0:`` would select the
            # whole row and the assignment would fail to broadcast.
            if q_len:
                batch_question_right[k, -q_len:] = question
            batch_question_mask[k] = question_mask
            question_lengths.append(q_len)
            batch_answer[k] = answer
            batch_debug_sent[k] = self.data_df.loc[key, 'question']

        return {
            'ids': chunk,
            'video_lengths': video_lengths,
            'video_features': batch_video_feature_convmap,
            'question_words': batch_question,
            'question_words_right': batch_question_right,
            'question_lengths': question_lengths,
            'video_mask': batch_video_mask,
            'question_mask': batch_question_mask,
            'answer': batch_answer,
            'debug_sent': batch_debug_sent,
        }
Exemplo n.º 2
0
    def get_Trans_result(self, chunk):
        """Assemble the batch dictionary for the Trans multiple-choice task.

        Args:
            chunk: Sequence of sample keys. Each key is handed to
                ``get_Trans_dict`` / ``get_video_feature`` and is used as a
                row label of ``self.data_df``.

        Returns:
            dict with the following entries:
                ids: ``chunk`` itself.
                video_lengths: list with, per sample, the first dimension
                    of the video feature clipped to ``self.max_length``.
                video_features: float32 array shaped
                    ``[batch_size] + get_video_feature_dimension()``.
                candidates / candidates_right: uint32
                    ``[batch_size, 5, max_length]`` arrays from
                    ``get_Trans_matrix`` (default and ``is_left=False``).
                candidate_lengths: per sample, a list with each candidate's
                    length clipped to ``self.max_length``.
                answer: uint32 ``[batch_size]`` array of the correct choice.
                raw_sentences: object ``[batch_size, 5]`` array.
                video_mask: uint32 ``[batch_size, max_length]`` array.
                candidates_mask: uint32 ``[batch_size, 5, max_length]``.
                debug_sent: object array holding column ``'a<answer+1>'``
                    of ``self.data_df`` for each key.
                row_indices: array filled from ``MC_dict['row_indices']``.
                question: list of the raw question values.
                num_mult_choices: the constant 5.
                question_word_nums: list with the whitespace-separated word
                    count of each question.
        """
        num_choices = 5  # every sample carries five candidate answers
        batch_size = len(chunk)
        # The feature dimension does not depend on the sample; fetch it once.
        feature_dim = self.get_video_feature_dimension()
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(feature_dim), dtype=np.float32)

        cand_shape = [batch_size, num_choices, self.max_length]
        batch_candidates = np.zeros(cand_shape, dtype=np.uint32)
        batch_candidates_right = np.zeros(cand_shape, dtype=np.uint32)
        batch_answer = np.zeros([batch_size], dtype=np.uint32)

        batch_video_mask = np.zeros([batch_size, self.max_length], dtype=np.uint32)
        batch_candidates_mask = np.zeros(cand_shape, dtype=np.uint32)

        batch_debug_sent = np.asarray([None] * batch_size)
        # [batch_size, num_choices]
        batch_raw_sentences = np.asarray(
            [[None] * num_choices for _ in range(batch_size)])
        batch_row_indices = np.asarray([-1] * batch_size)

        video_lengths = []
        candidate_lengths = []
        batch_questions = []
        question_word_nums = []

        # enumerate() instead of xrange(): identical iteration on Python 2,
        # and it does not raise NameError on Python 3.
        for k, key in enumerate(chunk):
            MC_dict = self.get_Trans_dict(key)

            # Per the original author's notes, each candidate decodes to the
            # question followed by one option and an end marker, e.g.
            #   what does the woman do 4 times ? flap hands <EOS>
            candidates = MC_dict['candidates']

            # e.g. "What does the man do 2 times ?"
            question = MC_dict['question']
            question_word_nums.append(len(question.split()))

            # e.g. ['swing jacket', 'jump', 'shake fingers', 'paw the air',
            #       'hit man on right']
            raw_sentences = MC_dict['raw_sentences']

            # A choice from 0-4, since there are five candidate answers.
            answer = int(MC_dict['answer'])

            video_feature = self.get_video_feature(key)
            candidates_matrix = self.get_Trans_matrix(candidates)
            candidates_matrix_right = self.get_Trans_matrix(candidates, is_left=False)

            video_lengths.append(min(self.max_length, video_feature.shape[0]))
            candidate_lengths.append(
                [min(self.max_length, len(cand)) for cand in candidates])

            video_mask = self.get_video_mask(video_feature)
            candidates_mask = self.get_Trans_mask(candidates)

            batch_video_feature_convmap[k, :] = data_util.pad_video(
                video_feature, feature_dim)

            batch_candidates[k] = candidates_matrix
            batch_candidates_right[k] = candidates_matrix_right
            batch_raw_sentences[k, :] = raw_sentences
            batch_answer[k] = answer
            batch_video_mask[k] = video_mask
            batch_candidates_mask[k] = candidates_mask
            batch_row_indices[k] = MC_dict['row_indices']
            batch_questions.append(question)

            # Option columns are read as 'a1'..'a5', i.e. 1-based.
            batch_debug_sent[k] = self.data_df.loc[key, 'a' + str(answer + 1)]

        return {
            'ids': chunk,
            'video_lengths': video_lengths,
            'video_features': batch_video_feature_convmap,
            'candidates': batch_candidates,
            'candidates_right': batch_candidates_right,
            'candidate_lengths': candidate_lengths,
            'answer': batch_answer,
            'raw_sentences': batch_raw_sentences,
            'video_mask': batch_video_mask,
            'candidates_mask': batch_candidates_mask,
            'debug_sent': batch_debug_sent,
            'row_indices': batch_row_indices,
            'question': batch_questions,
            'num_mult_choices': num_choices,
            'question_word_nums': question_word_nums,
        }
Exemplo n.º 3
0
    def get_FrameQA_result(self, chunk):
        """Assemble the batch dictionary for the FrameQA task.

        Args:
            chunk: Sequence of sample keys. Each key is handed to the
                per-sample getters on ``self`` and is used as a row label
                of ``self.data_df``.

        Returns:
            dict with the following entries:
                ids: ``chunk`` itself.
                video_lengths: list with, per sample, the first dimension
                    of the video feature clipped to ``self.max_length``.
                video_features: float32 array shaped
                    ``[batch_size] + get_video_feature_dimension()``.
                question_words: uint32 ``[batch_size, max_length]`` array,
                    question ids written from the left, zero elsewhere.
                question_words_right: same ids written flush to the right.
                question_lengths: list of ``len(question)`` per sample.
                video_mask / question_mask: uint32
                    ``[batch_size, max_length]`` arrays filled from
                    ``get_video_mask`` / ``get_question_mask``.
                answer: ``[batch_size, 1]`` array of ``self.ans2idx``
                    values; answers missing from ``ans2idx`` become 1.
                answer_type: ``[batch_size, 1]`` array of
                    ``float(int(answer_type))``.
                debug_sent: object array holding the ``'question'`` column
                    of ``self.data_df`` for each key.
        """
        batch_size = len(chunk)
        # The feature dimension does not depend on the sample; fetch it once.
        feature_dim = self.get_video_feature_dimension()
        batch_video_feature_convmap = np.zeros(
            [batch_size] + list(feature_dim), dtype=np.float32)

        text_shape = [batch_size, self.max_length]
        batch_question = np.zeros(text_shape, dtype=np.uint32)
        batch_question_right = np.zeros(text_shape, dtype=np.uint32)
        batch_video_mask = np.zeros(text_shape, dtype=np.uint32)
        batch_question_mask = np.zeros(text_shape, dtype=np.uint32)

        batch_debug_sent = np.asarray([None] * batch_size)
        batch_answer = np.zeros([batch_size, 1])
        batch_answer_type = np.zeros([batch_size, 1])

        video_lengths = []
        question_lengths = []

        # enumerate() instead of xrange(): identical iteration on Python 2,
        # and it does not raise NameError on Python 3.
        for k, key in enumerate(chunk):
            video_feature = self.get_video_feature(key)
            video_mask = self.get_video_mask(video_feature)

            video_lengths.append(min(self.max_length, video_feature.shape[0]))

            batch_video_feature_convmap[k, :] = data_util.pad_video(
                video_feature, feature_dim)
            batch_video_mask[k] = video_mask

            answer, answer_type = self.get_answer(key)
            # Test and look up with the same (string) key: the membership
            # check already used str(answer), so indexing with the raw value
            # could raise KeyError for non-string answers.
            answer_key = str(answer)
            if answer_key in self.ans2idx:
                answer = self.ans2idx[answer_key]
            else:
                # unknown token, check later
                answer = 1

            question = self.get_question(key)
            question_mask = self.get_question_mask(question)
            q_len = len(question)
            question_lengths.append(q_len)

            # NOTE(review): questions longer than self.max_length still fail
            # on assignment; presumably get_question limits the length -
            # confirm.
            # Left align.
            batch_question[k, :q_len] = question
            # Right align. Guard the empty case: ``-0:`` would select the
            # whole row and the assignment would fail to broadcast.
            if q_len:
                batch_question_right[k, -q_len:] = question
            batch_question_mask[k] = question_mask
            batch_answer[k] = answer
            batch_answer_type[k] = float(int(answer_type))
            batch_debug_sent[k] = self.data_df.loc[key, 'question']

        return {
            'ids': chunk,
            'video_lengths': video_lengths,
            'video_features': batch_video_feature_convmap,
            'question_words': batch_question,
            'question_words_right': batch_question_right,
            'question_lengths': question_lengths,
            'video_mask': batch_video_mask,
            'question_mask': batch_question_mask,
            'answer': batch_answer,
            'answer_type': batch_answer_type,
            'debug_sent': batch_debug_sent,
        }