    def forward(self, feed_dict):
        feed_dict = GView(feed_dict)
        monitors, outputs = {}, {}

        f_scene = self.resnet(
            feed_dict.image)  # [batch_size=32,n_channels=256,h=16,w=24]
        f_sng = self.scene_graph(
            f_scene, feed_dict.image,
            feed_dict.objects_mask if self.true_mask else None)

        programs = feed_dict.program_qsseq
        programs, buffers, answers = self.reasoning(f_sng,
                                                    programs,
                                                    fd=feed_dict)
        outputs['buffers'] = buffers
        outputs['answer'] = answers

        update_from_loss_module(monitors, outputs,
                                self.scene_graph.get_monitor())
        update_from_loss_module(monitors, outputs,
                                self.qa_loss(feed_dict, answers))
        canonize_monitors(monitors)

        if self.training:
            loss = monitors['loss/qa'] + monitors['loss/monet'] * self.loss_ratio
            return loss, monitors, outputs
        else:
            outputs['monitors'] = monitors
            outputs['buffers'] = buffers
            return outputs
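
All of the forward variants on this page share the same monitor bookkeeping through update_from_loss_module and canonize_monitors, which are defined elsewhere in the codebase. A minimal sketch of the contract they appear to implement (the exact merging and averaging behavior here is an assumption, not the verbatim implementation):

import torch

def update_from_loss_module(monitors, outputs, loss_result):
    # Each loss module returns a (monitors, outputs) pair of dicts;
    # merge both into the running dicts of the forward pass.
    loss_monitors, loss_outputs = loss_result
    monitors.update(loss_monitors)
    outputs.update(loss_outputs)

def canonize_monitors(monitors):
    # Collapse list-valued monitors into scalars so they can be logged
    # and combined into a single training loss.
    for key, value in monitors.items():
        if isinstance(value, list):
            monitors[key] = sum(value) / max(len(value), 1)
        if isinstance(monitors[key], float):
            monitors[key] = torch.tensor(monitors[key])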
Example #2
    def forward(self, feed_dict):
        feed_dict = GView(feed_dict)
        monitors, outputs = {}, {}

        vid_shape = feed_dict.video.size()

        B = vid_shape[0]
        N_frames = vid_shape[1]
        video_frames = feed_dict.video.reshape(B * N_frames, vid_shape[2],
                                               vid_shape[3], vid_shape[4])
        f_scene = self.resnet(video_frames)
        f_scene = f_scene.reshape(B, N_frames, -1)

        f_scene, _ = self.lstm_video(f_scene)
        f_scene = f_scene[:, -1, :]  # keep only the last time step: (B, hidden_size)
        # A bare .squeeze() here would also drop the batch dimension when B == 1,
        # so the 2-D tensor is expanded directly into a (B, hidden, 1, 1) feature map.
        f_scene = f_scene.unsqueeze(-1).unsqueeze(-1)

        f_sng = self.scene_graph(f_scene, feed_dict.objects,
                                 feed_dict.objects_length)

        programs = feed_dict.program_qsseq
        programs, buffers, answers = self.reasoning(f_sng,
                                                    programs,
                                                    fd=feed_dict)
        outputs["buffers"] = buffers
        outputs["answer"] = answers

        update_from_loss_module(
            monitors,
            outputs,
            self.scene_loss(
                feed_dict,
                f_sng,
                self.reasoning.embedding_attribute,
                self.reasoning.embedding_relation,
            ),
        )
        update_from_loss_module(monitors, outputs,
                                self.qa_loss(feed_dict, answers))

        canonize_monitors(monitors)

        if self.training:
            loss = monitors["loss/qa"]
            if configs.train.scene_add_supervision:
                loss = loss + monitors["loss/scene"]
            return loss, monitors, outputs
        else:
            outputs["monitors"] = monitors
            outputs["buffers"] = buffers
            return outputs
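
The video branch in Example #2 folds time into the batch dimension for the CNN and unfolds it again for the recurrent pass. A standalone shape walk-through with dummy tensors (the sizes and the flat 256-d frame feature are assumptions; the real resnet output is flattened by the surrounding code):

import torch
import torch.nn as nn

B, T, C, H, W = 2, 8, 3, 64, 96          # hypothetical video batch
video = torch.randn(B, T, C, H, W)

frames = video.reshape(B * T, C, H, W)   # fold time into the batch for the CNN
feats = torch.randn(B * T, 256)          # stand-in for the flattened resnet features
feats = feats.reshape(B, T, -1)          # one feature vector per frame: (B, T, 256)

lstm = nn.LSTM(input_size=256, hidden_size=256, batch_first=True)
out, _ = lstm(feats)                     # (B, T, 256)
clip = out[:, -1, :]                     # last step summarizes the clip: (B, 256)
clip = clip.unsqueeze(-1).unsqueeze(-1)  # (B, 256, 1, 1), a 1x1 pseudo feature map
print(clip.shape)                        # torch.Size([2, 256, 1, 1])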
Example #3
    def forward(self, feed_dict):
        feed_dict = GView(feed_dict)
        monitors, outputs = {}, {}

        depth = feed_dict.depth
        depth = torch.tanh(depth) * 0.5  # squash depth to (-0.5, 0.5); F.tanh is deprecated
        inp = torch.cat((feed_dict.image, depth.unsqueeze(1)), dim=1)

        f_scene = self.resnet(inp)
        f_sng = self.scene_graph(f_scene, feed_dict.objects,
                                 feed_dict.objects_length)

        programs = feed_dict.program_qsseq
        programs, buffers, answers = self.reasoning(f_sng,
                                                    programs,
                                                    fd=feed_dict)
        outputs["buffers"] = buffers
        outputs["answer"] = answers

        update_from_loss_module(
            monitors,
            outputs,
            self.scene_loss(
                feed_dict,
                f_sng,
                self.reasoning.embedding_attribute,
                self.reasoning.embedding_relation,
            ),
        )
        update_from_loss_module(monitors, outputs,
                                self.qa_loss(feed_dict, answers))

        canonize_monitors(monitors)

        if self.training:
            loss = monitors["loss/qa"]
            if configs.train.full_scene_supervision:
                loss = loss + monitors["loss/scene"]
            return loss, monitors, outputs
        else:
            outputs["monitors"] = monitors
            outputs["buffers"] = buffers
            return outputs
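
Example #3 concatenates the tanh-squashed depth map onto the RGB image, so self.resnet must accept a 4-channel input. One common way to widen a stock torchvision backbone for this (a sketch under the assumption that a standard ResNet is used; the actual self.resnet is constructed elsewhere):

import torch
import torchvision

resnet = torchvision.models.resnet34()
old = resnet.conv1                       # Conv2d(3, 64, kernel_size=7, stride=2, ...)
resnet.conv1 = torch.nn.Conv2d(4, old.out_channels,
                               kernel_size=old.kernel_size,
                               stride=old.stride,
                               padding=old.padding,
                               bias=False)
with torch.no_grad():
    resnet.conv1.weight[:, :3] = old.weight                        # reuse the RGB filters
    resnet.conv1.weight[:, 3:] = old.weight.mean(1, keepdim=True)  # init the depth channel

image = torch.randn(2, 3, 256, 384)      # hypothetical RGB batch
depth = torch.randn(2, 256, 384)         # hypothetical depth maps
inp = torch.cat((image, (torch.tanh(depth) * 0.5).unsqueeze(1)), dim=1)
print(inp.shape)                         # torch.Size([2, 4, 256, 384])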
Example #4
    def forward(self, feed_dict):
        feed_dict = GView(feed_dict)

        monitors, outputs = {}, {}

        f_scene = self.resnet(
            feed_dict.image)  # [batch_size=32,n_channels=256,h=16,w=24]
        f_sng = self.scene_graph(f_scene, feed_dict.objects,
                                 feed_dict.objects_length)

        programs = feed_dict.program_qsseq
        programs, buffers, answers = self.reasoning(f_sng,
                                                    programs,
                                                    fd=feed_dict)
        outputs['buffers'] = buffers
        outputs['answer'] = answers

        update_from_loss_module(
            monitors, outputs,
            self.scene_loss(feed_dict, f_sng,
                            self.reasoning.embedding_attribute,
                            self.reasoning.embedding_relation))
        update_from_loss_module(monitors, outputs,
                                self.qa_loss(feed_dict, answers))

        canonize_monitors(monitors)

        if self.training:
            loss = monitors['loss/qa']
            if configs.train.scene_add_supervision:
                loss = loss + monitors['loss/scene']
            return loss, monitors, outputs
        else:
            outputs['monitors'] = monitors
            outputs['buffers'] = buffers
            return outputs
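
Every variant above returns (loss, monitors, outputs) in training mode but a single outputs dict in eval mode, so callers must branch on the mode. A hypothetical consumer (model, feed_dict, and optimizer are stand-ins for the surrounding training harness, not names from this codebase):

import torch

model.train()
loss, monitors, outputs = model(feed_dict)  # training contract: a 3-tuple
optimizer.zero_grad()
loss.backward()
optimizer.step()

model.eval()
with torch.no_grad():
    outputs = model(feed_dict)              # eval contract: a single dict
predictions = outputs['answer']
eval_monitors = outputs['monitors']         # monitors travel inside outputs in eval mode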
Example #5
    def forward(self, feed_dict_list):
        # Encode each video in the list into its scene-graph representation.
        f_sng_list = []
        for feed_dict in feed_dict_list:
            f_scene = self.resnet(feed_dict['img'])
            f_sng = self.scene_graph(f_scene, feed_dict)
            f_sng_list.append(f_sng)

        programs = []
        for feed_dict in feed_dict_list:
            tmp_prog = []
            feed_dict['answer'] = []
            feed_dict['question_type'] = []
            for ques in feed_dict['meta_ann']['questions']:
                # Keep only questions that carry both an answer and a parsed program.
                if 'answer' not in ques or 'program_cl' not in ques:
                    continue
                tmp_prog.append(ques['program_cl'])
                feed_dict['answer'].append(ques['answer'])
                feed_dict['question_type'].append(ques['program_cl'][-1]['op'])
            programs.append(tmp_prog)
        programs_list, buffers_list, answers_list = self.reasoning(
            f_sng_list, programs, fd=feed_dict_list)
        monitors_list = []
        output_list = []
        for idx, buffers in enumerate(buffers_list):
            monitors, outputs = {}, {}

            outputs['buffers'] = buffers
            outputs['answer'] = answers_list[idx]
            feed_dict = feed_dict_list[idx]
            f_sng = [f_sng_list[idx]]
            answers = answers_list[idx]

            update_from_loss_module(
                monitors, outputs,
                self.scene_loss(feed_dict, f_sng,
                                self.reasoning.embedding_attribute,
                                self.reasoning.embedding_relation,
                                self.reasoning.embedding_temporal))
            update_from_loss_module(monitors, outputs,
                                    self.qa_loss(feed_dict, answers))
            canonize_monitors(monitors)
            monitors_list.append(monitors)
            output_list.append(outputs)

        if self.training:
            loss = 0
            for monitors in monitors_list:
                loss = loss + monitors['loss/qa']
                if self.args.scene_add_supervision:
                    loss = loss + self.args.scene_supervision_weight * monitors['loss/scene']
            # Average the loss over videos; monitors/outputs refer to the last video.
            return loss / len(monitors_list), monitors, outputs
        else:
            outputs['monitors'] = monitors_list
            outputs['buffers'] = buffers_list
            outputs['answer'] = answers_list
            return outputs
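
Example #5 filters questions by the presence of 'answer' and 'program_cl' in each video's meta_ann. The per-video annotation it expects looks roughly like this (field values and op names are hypothetical; only the keys the loop touches are shown):

feed_dict = {
    'img': None,  # image tensor for this video, omitted here
    'meta_ann': {
        'questions': [
            {
                'question': 'What color is the moving object?',
                'answer': 'red',
                'program_cl': [{'op': 'scene'},
                               {'op': 'filter_moving'},
                               {'op': 'query_color'}],  # last op doubles as question_type
            },
            {'question': 'Unannotated question'},  # no answer/program: skipped by the loop
        ],
    },
}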