def forward(self, feed_dict):
    feed_dict = GView(feed_dict)
    monitors, outputs = {}, {}

    # Convolutional scene features: [batch_size=32, n_channels=256, h=16, w=24].
    f_scene = self.resnet(feed_dict.image)
    # Object-centric scene representation; optionally conditioned on the
    # ground-truth object masks.
    f_sng = self.scene_graph(
        f_scene, feed_dict.image,
        feed_dict.objects_mask if self.true_mask else None)

    programs = feed_dict.program_qsseq
    # Execute the symbolic programs against the scene representation.
    programs, buffers, answers = self.reasoning(f_sng, programs, fd=feed_dict)
    outputs['buffers'] = buffers
    outputs['answer'] = answers

    update_from_loss_module(monitors, outputs, self.scene_graph.get_monitor())
    update_from_loss_module(monitors, outputs, self.qa_loss(feed_dict, answers))
    canonize_monitors(monitors)

    if self.training:
        loss = monitors['loss/qa'] + monitors['loss/monet'] * self.loss_ratio
        return loss, monitors, outputs
    else:
        outputs['monitors'] = monitors
        outputs['buffers'] = buffers
        return outputs
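# All of the forward variants in this file rely on two small helpers,
# `update_from_loss_module` and `canonize_monitors`, defined elsewhere in the
# codebase. The following is a minimal sketch of the assumed contract, not the
# actual implementation: each loss module returns a (monitors, outputs) pair
# that is merged in, and canonization collapses list-valued monitors to scalars.

def update_from_loss_module(monitors, outputs, loss_result):
    # Assumed contract: loss modules return a (monitor_dict, output_dict) pair.
    loss_monitors, loss_outputs = loss_result
    monitors.update(loss_monitors)
    outputs.update(loss_outputs)


def canonize_monitors(monitors):
    # Assumed behavior: average list-valued monitor entries so every monitor
    # becomes a scalar that can be logged or summed into the total loss.
    for k, v in monitors.items():
        if isinstance(v, list):
            monitors[k] = sum(v) / max(len(v), 1)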
def forward(self, feed_dict):
    feed_dict = GView(feed_dict)
    monitors, outputs = {}, {}

    # Fold time into the batch dimension so the CNN sees individual frames:
    # [B, T, C, H, W] -> [B * T, C, H, W].
    B, T = feed_dict.video.size(0), feed_dict.video.size(1)
    video_frames = feed_dict.video.reshape(B * T, *feed_dict.video.shape[2:])
    f_scene = self.resnet(video_frames)

    # Aggregate per-frame features over time with an LSTM; the last hidden
    # state summarizes the whole clip.
    f_scene = f_scene.reshape(B, T, -1)
    f_scene, _ = self.lstm_video(f_scene)
    f_scene = f_scene[:, -1, :]
    # Restore the trailing spatial dims expected by the scene graph module.
    f_scene = f_scene.unsqueeze(-1).unsqueeze(-1)

    f_sng = self.scene_graph(f_scene, feed_dict.objects, feed_dict.objects_length)

    programs = feed_dict.program_qsseq
    programs, buffers, answers = self.reasoning(f_sng, programs, fd=feed_dict)
    outputs["buffers"] = buffers
    outputs["answer"] = answers

    update_from_loss_module(
        monitors, outputs,
        self.scene_loss(feed_dict, f_sng,
                        self.reasoning.embedding_attribute,
                        self.reasoning.embedding_relation))
    update_from_loss_module(monitors, outputs, self.qa_loss(feed_dict, answers))
    canonize_monitors(monitors)

    if self.training:
        loss = monitors["loss/qa"]
        if configs.train.scene_add_supervision:
            loss = loss + monitors["loss/scene"]
        return loss, monitors, outputs
    else:
        outputs["monitors"] = monitors
        outputs["buffers"] = buffers
        return outputs
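# A self-contained sketch of the temporal aggregation used above, with a dummy
# frame encoder standing in for the ResNet backbone (all sizes here are
# illustrative assumptions, not the real configuration):

import torch
import torch.nn as nn

B, T, C, H, W = 2, 8, 3, 64, 96                 # two videos, eight frames each
video = torch.randn(B, T, C, H, W)

frame_encoder = nn.Sequential(                  # stand-in for `self.resnet`
    nn.Conv2d(C, 16, 3, stride=2, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten())
lstm = nn.LSTM(input_size=16, hidden_size=32, batch_first=True)

frames = video.reshape(B * T, C, H, W)          # fold time into the batch dim
f = frame_encoder(frames)                       # [B*T, 16]
f = f.reshape(B, T, -1)                         # one feature vector per frame
out, _ = lstm(f)                                # [B, T, 32]
f_video = out[:, -1, :]                         # last hidden state = clip summary
f_video = f_video.unsqueeze(-1).unsqueeze(-1)   # [B, 32, 1, 1] pseudo-spatial map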
def forward(self, feed_dict):
    feed_dict = GView(feed_dict)
    monitors, outputs = {}, {}

    # Squash the depth map into (-0.5, 0.5) and append it to the RGB image as
    # a fourth input channel.
    depth = torch.tanh(feed_dict.depth) * 0.5
    inp = torch.cat((feed_dict.image, depth.unsqueeze(1)), dim=1)
    f_scene = self.resnet(inp)
    f_sng = self.scene_graph(f_scene, feed_dict.objects, feed_dict.objects_length)

    programs = feed_dict.program_qsseq
    programs, buffers, answers = self.reasoning(f_sng, programs, fd=feed_dict)
    outputs["buffers"] = buffers
    outputs["answer"] = answers

    update_from_loss_module(
        monitors, outputs,
        self.scene_loss(feed_dict, f_sng,
                        self.reasoning.embedding_attribute,
                        self.reasoning.embedding_relation))
    update_from_loss_module(monitors, outputs, self.qa_loss(feed_dict, answers))
    canonize_monitors(monitors)

    if self.training:
        loss = monitors["loss/qa"]
        if configs.train.full_scene_supervision:
            loss = loss + monitors["loss/scene"]
        return loss, monitors, outputs
    else:
        outputs["monitors"] = monitors
        outputs["buffers"] = buffers
        return outputs
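# The depth-augmented variant above feeds a 4-channel tensor (RGB + depth)
# into `self.resnet`, so the backbone's first convolution must accept four
# input channels. One way to adapt a torchvision ResNet for this (the choice
# of resnet34 is an assumption; the actual backbone may differ):

import torch.nn as nn
import torchvision

backbone = torchvision.models.resnet34(weights=None)
backbone.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)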
def forward(self, feed_dict):
    feed_dict = GView(feed_dict)
    monitors, outputs = {}, {}

    # Convolutional scene features: [batch_size=32, n_channels=256, h=16, w=24].
    f_scene = self.resnet(feed_dict.image)
    f_sng = self.scene_graph(f_scene, feed_dict.objects, feed_dict.objects_length)

    programs = feed_dict.program_qsseq
    programs, buffers, answers = self.reasoning(f_sng, programs, fd=feed_dict)
    outputs['buffers'] = buffers
    outputs['answer'] = answers

    update_from_loss_module(
        monitors, outputs,
        self.scene_loss(feed_dict, f_sng,
                        self.reasoning.embedding_attribute,
                        self.reasoning.embedding_relation))
    update_from_loss_module(monitors, outputs, self.qa_loss(feed_dict, answers))
    canonize_monitors(monitors)

    if self.training:
        loss = monitors['loss/qa']
        if configs.train.scene_add_supervision:
            loss = loss + monitors['loss/scene']
        return loss, monitors, outputs
    else:
        outputs['monitors'] = monitors
        outputs['buffers'] = buffers
        return outputs
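# The single-video variants above share one calling convention: a
# (loss, monitors, outputs) triple in training mode and a bare outputs dict in
# eval mode. A runnable toy module demonstrating that convention (the module
# itself is illustrative and not part of this codebase):

import torch
import torch.nn as nn

class TwoModeModel(nn.Module):
    # Minimal stand-in that mirrors the train/eval return convention above.
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 2)

    def forward(self, feed_dict):
        logits = self.fc(feed_dict['image'])
        monitors = {'loss/qa': logits.pow(2).mean()}
        outputs = {'answer': logits.argmax(dim=-1)}
        if self.training:
            return monitors['loss/qa'], monitors, outputs
        outputs['monitors'] = monitors
        return outputs

model = TwoModeModel()
model.train()
loss, monitors, outputs = model({'image': torch.randn(3, 4)})
loss.backward()

model.eval()
with torch.no_grad():
    outputs = model({'image': torch.randn(3, 4)})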
def forward(self, feed_dict_list):
    # Batched variant: one scene graph per video in the list.
    f_sng_list = []
    for feed_dict in feed_dict_list:
        f_scene = self.resnet(feed_dict['img'])
        f_sng_list.append(self.scene_graph(f_scene, feed_dict))

    # Collect the executable programs per video, skipping questions that lack
    # a ground-truth answer or a parsed program; record answers and question
    # types (the final op of each program) alongside.
    programs = []
    for feed_dict in feed_dict_list:
        tmp_prog = []
        feed_dict['answer'] = []
        feed_dict['question_type'] = []
        for ques in feed_dict['meta_ann']['questions']:
            if 'answer' not in ques or 'program_cl' not in ques:
                continue
            tmp_prog.append(ques['program_cl'])
            feed_dict['answer'].append(ques['answer'])
            feed_dict['question_type'].append(ques['program_cl'][-1]['op'])
        programs.append(tmp_prog)

    programs_list, buffers_list, answers_list = self.reasoning(
        f_sng_list, programs, fd=feed_dict_list)

    # Per-video monitors and outputs.
    monitors_list = []
    output_list = []
    for idx, buffers in enumerate(buffers_list):
        monitors, outputs = {}, {}
        outputs['buffers'] = buffers
        outputs['answer'] = answers_list[idx]
        feed_dict = feed_dict_list[idx]
        f_sng = [f_sng_list[idx]]
        answers = answers_list[idx]
        update_from_loss_module(
            monitors, outputs,
            self.scene_loss(feed_dict, f_sng,
                            self.reasoning.embedding_attribute,
                            self.reasoning.embedding_relation,
                            self.reasoning.embedding_temporal))
        update_from_loss_module(monitors, outputs, self.qa_loss(feed_dict, answers))
        canonize_monitors(monitors)
        monitors_list.append(monitors)
        output_list.append(outputs)

    if self.training:
        # Average the QA loss (plus optional weighted scene supervision) over
        # all videos in the batch.
        loss = 0
        for monitors in monitors_list:
            loss += monitors['loss/qa']
            if self.args.scene_add_supervision:
                loss = loss + self.args.scene_supervision_weight * monitors['loss/scene']
        return loss / len(monitors_list), monitors, outputs
    else:
        outputs['monitors'] = monitors_list
        outputs['buffers'] = buffers_list
        outputs['answer'] = answers_list
        return outputs
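# The per-video program collection above skips questions that lack either a
# ground-truth answer or a parsed program. A standalone sketch of that
# filtering on a toy annotation (the structure follows the code above; the
# field contents are made up for illustration):

meta_ann = {'questions': [
    {'program_cl': [{'op': 'filter'}, {'op': 'query_color'}], 'answer': 'red'},
    {'program_cl': [{'op': 'exist'}]},   # no answer -> skipped
    {'answer': 'yes'},                   # no program -> skipped
]}

programs, answers, question_types = [], [], []
for ques in meta_ann['questions']:
    if 'answer' not in ques or 'program_cl' not in ques:
        continue
    programs.append(ques['program_cl'])
    answers.append(ques['answer'])
    question_types.append(ques['program_cl'][-1]['op'])  # last op = question type

assert question_types == ['query_color'] and answers == ['red']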