Example #1
    def __init__(self,
                 txt_enc={},
                 self_q_att=False,
                 n_step=3,
                 shared=False,
                 cell={},
                 agg={},
                 classif={},
                 wid_to_word={},
                 word_to_wid={},
                 aid_to_ans=[],
                 ans_to_aid={}):
        super(MuRelNet, self).__init__()
        self.self_q_att = self_q_att
        self.n_step = n_step
        self.shared = shared
        self.cell = cell
        self.agg = agg
        assert self.agg['type'] in ['max', 'mean']
        self.classif = classif
        self.wid_to_word = wid_to_word
        self.word_to_wid = word_to_wid
        self.aid_to_ans = aid_to_ans
        self.ans_to_aid = ans_to_aid
        # Modules
        self.txt_enc = factory_text_enc(self.wid_to_word, txt_enc)
        if self.self_q_att:
            self.q_att_linear0 = nn.Linear(2400, 512)
            self.q_att_linear1 = nn.Linear(512, 2)

        if self.shared:
            self.cell = MuRelCell(**cell)
        else:
            self.cells = nn.ModuleList(
                [MuRelCell(**cell) for i in range(self.n_step)])

        if 'fusion' in self.classif:
            self.classif_module = block.factory_fusion(self.classif['fusion'])
        elif 'mlp' in self.classif:
            self.classif_module = MLP(self.classif['mlp'])
        else:
            raise ValueError(self.classif.keys())

        Logger().log_value('nparams',
                           sum(p.numel() for p in self.parameters()
                               if p.requires_grad),
                           should_print=True)

        Logger().log_value('nparams_txt_enc',
                           self.get_nparams_txt_enc(),
                           should_print=True)

        self.buffer = None
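The constructor above only builds the cells; below is a minimal sketch of how the shared flag is typically consumed over n_step reasoning iterations. MuRelCell's interface is not shown in this snippet, so a toy stand-in cell and all shapes are illustrative assumptions.

import torch
import torch.nn as nn

# Toy stand-in for MuRelCell; the real cell fuses question and region features.
class DummyCell(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.lin = nn.Linear(dim, dim)

    def forward(self, q, mm):
        return torch.relu(self.lin(mm + q))

n_step, shared, dim = 3, False, 8
if shared:
    cells = [DummyCell(dim)] * n_step                         # one cell, parameters reused
else:
    cells = nn.ModuleList(DummyCell(dim) for _ in range(n_step))

q = torch.randn(2, 5, dim)    # question features broadcast over 5 regions
mm = torch.randn(2, 5, dim)   # multimodal region features
for step in range(n_step):    # iterative reasoning loop
    mm = cells[step](q, mm)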
Example #2
    def __init__(
        self,
        txt_enc={},
        dim_q=2400,
        max_ans=15,
        self_q_att=False,
        wid_to_word={},
        word_to_wid={},
        aid_to_ans=[],
        ans_to_aid={},
        output="classification",
    ):
        super().__init__()
        self.self_q_att = self_q_att
        self.wid_to_word = wid_to_word
        self.word_to_wid = word_to_wid
        self.aid_to_ans = aid_to_ans
        self.ans_to_aid = ans_to_aid
        self.max_ans = max_ans
        self.output = output

        if self.output == "classification":
            self.classifier = nn.Sequential(
                nn.Linear(2048 + dim_q, 1024),
                nn.ReLU(),
                nn.Linear(1024, len(aid_to_ans)),
            )

        elif self.output == "regression":
            self.classifier = nn.Sequential(
                nn.Linear(2048 + dim_q, 1024),
                nn.ReLU(),
                nn.Linear(1024, 1),
            )

        # Modules
        self.txt_enc = factory_text_enc(self.wid_to_word, txt_enc)
        if self.self_q_att:
            self.q_att_linear0 = nn.Linear(dim_q, 512)
            self.q_att_linear1 = nn.Linear(512, 2)

        Logger().log_value(
            "nparams",
            sum(p.numel() for p in self.parameters() if p.requires_grad),
            should_print=True,
        )

        Logger().log_value("nparams_txt_enc",
                           self.get_nparams_txt_enc(),
                           should_print=True)

        self.buffer = None
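Both heads above take a single vector of size 2048 + dim_q. A minimal sketch of feeding concatenated image and question features to such a classifier; the batch size, tensor names and the 3000-answer vocabulary are illustrative assumptions, not taken from the snippet.

import torch
import torch.nn as nn

batch_size, dim_v, dim_q, n_answers = 4, 2048, 2400, 3000
classifier = nn.Sequential(
    nn.Linear(dim_v + dim_q, 1024),
    nn.ReLU(),
    nn.Linear(1024, n_answers),
)
v = torch.randn(batch_size, dim_v)               # pooled image features
q = torch.randn(batch_size, dim_q)               # question embedding from txt_enc
logits = classifier(torch.cat([v, q], dim=1))    # [batch_size, n_answers]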
Example #3
    def __init__(self,
                 txt_enc={},
                 q_max_length=14,
                 glimpse=2,
                 objects=36,
                 feat_dims={},
                 biattention={},
                 wid_to_word={},
                 word_to_wid={},
                 aid_to_ans=[],
                 ans_to_aid={}):
        super(BanNet, self).__init__()
        # self.self_q_att = self_q_att
        self.glimpse = glimpse
        self.q_max_length = q_max_length
        self.objects = objects
        # self.classif = classif
        self.wid_to_word = wid_to_word
        self.word_to_wid = word_to_wid
        self.aid_to_ans = aid_to_ans
        self.ans_to_aid = ans_to_aid
        # Modules

        self.txt_enc = factory_text_enc(self.wid_to_word, txt_enc)
        self.v_att = BilinearAttentionMap(**feat_dims, glimpse=glimpse)

        self.b_net = []
        self.q_prj = []
        self.c_prj = []

        for i in range(glimpse):
            # self.b_net.append(LowLankBilinearPooling(
            #                     feat_dims['v_dim'], feat_dims['q_dim'], feat_dims['h_dim'],
            #                     None, k=1))
            self.b_net.append(
                LowLankBilinearPooling(**feat_dims, h_out=None, k=1))
            self.q_prj.append(
                FCLayer(feat_dims['h_dim'], feat_dims['h_dim'], '', .2))
            self.c_prj.append(
                FCLayer(objects + 1, feat_dims['h_dim'], 'ReLU', .0))

        self.b_net = nn.ModuleList(self.b_net)
        self.q_prj = nn.ModuleList(self.q_prj)
        self.c_prj = nn.ModuleList(self.c_prj)

        self.classifier = Classifier(feat_dims['h_dim'],
                                     feat_dims['h_dim'] * 2, 3000, 0.5)

        # skipthoughts is applied here
        # if self.self_q_att:
        #     self.q_att_linear0 = nn.Linear(2400, 512)
        #     self.q_att_linear1 = nn.Linear(512, 2)

        # if self.shared:
        #     self.cell = MuRelCell(**cell)
        # else:
        #     self.cells = nn.ModuleList([MuRelCell(**cell) for i in range(self.n_step)])

        # self.slp = SLP(**self.classif['slp'])

        # if 'fusion' in self.classif:
        #     self.classif_module = block.factory_fusion(self.classif['fusion'])
        # elif 'mlp' in self.classif:
        #     self.classif_module = MLP(self.classif['mlp'])
        # else:
        #     raise ValueError(self.classif.keys())

        Logger().log_value('nparams',
                           sum(p.numel() for p in self.parameters()
                               if p.requires_grad),
                           should_print=True)
Example #4
    def get_text_enc(self, vocab_words, options):
        """
        Returns the text encoding network.
        """
        return factory_text_enc(self.wid_to_word, options)
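Every example builds its encoder through factory_text_enc(wid_to_word, txt_enc). Below is a hedged sketch of a call site, assuming factory_text_enc is already imported from the surrounding project; the option keys are assumptions modeled on skipthoughts-style encoders, and the data directory is hypothetical.

wid_to_word = {1: 'what', 2: 'color', 3: 'is', 4: 'the', 5: 'cat'}  # toy vocabulary

txt_enc_opt = {
    'name': 'skipthoughts',          # assumed encoder family
    'type': 'BayesianUniSkip',       # assumed skip-thoughts variant
    'dropout': 0.25,
    'fixed_emb': False,
    'dir_st': 'data/skip-thoughts',  # hypothetical path to pretrained vectors
}

# Returns an nn.Module that embeds word ids and encodes the question.
txt_enc = factory_text_enc(wid_to_word, txt_enc_opt)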
Example #5
    def __init__(
        self,
        txt_enc={},
        self_q_att=False,
        max_ans=None,
        # answer_scaling=False,
        # model
        add_coords=True,
        self_attention_vision=True,
        fusion_vision=True,
        num_heads_self_att=None,
        intermediate_dim_self_att=None,
        hidden_dim=2048,
        # fusion
        fusion_mm_dim=1200,
        fusion_activ="relu",
        attention_scaling=1.0,
        residual_fusion=False,
        # output
        output=None,
        output_on="final",
        output_params={},
        wid_to_word={},
        word_to_wid={},
        aid_to_ans=[],
        ans_to_aid={},
    ):
        super().__init__()
        self.self_q_att = self_q_att
        self.wid_to_word = wid_to_word
        self.word_to_wid = word_to_wid
        self.aid_to_ans = aid_to_ans
        self.ans_to_aid = ans_to_aid
        self.max_ans = max_ans
        self.attention_scaling = attention_scaling
        self.add_coords = add_coords
        # self.

        # self.branches = branches

        self.self_attention_vision = self_attention_vision
        self.fusion_vision = fusion_vision
        self.residual_fusion = residual_fusion

        self.output_on = output_on
        self.output = output
        self.output_params = output_params

        # if self.add_lvis:
        #     self.lin_lvis = nn.Linear(1024, 2048)

        # Modules
        self.txt_enc = factory_text_enc(self.wid_to_word, txt_enc)
        if self.self_q_att:
            self.q_att_linear0 = nn.Linear(2400, 512)
            self.q_att_linear1 = nn.Linear(512, 2)

        self.q_projection = nn.Linear(2400, hidden_dim)

        self.hidden_dim = hidden_dim
        if hidden_dim != 2048:
            self.v_projection = nn.Linear(2048, hidden_dim)

        def transformer_block():
            return TransformerBlock(
                hidden_dim,
                hidden_dim,
                hidden_dim,
                num_heads=num_heads_self_att,
                intermediate_dim=intermediate_dim_self_att,
            )

        def fusion_block():
            return MLB(
                [hidden_dim, hidden_dim],
                hidden_dim,
                mm_dim=fusion_mm_dim,
                activ_input=fusion_activ,
                activ_output=fusion_activ,
                normalize=True,
                dropout_input=0.1,
                dropout_pre_lin=0.0,
                dropout_output=0.0,
            )

            # different modules

        if self.self_attention_vision:
            self.self_attention_vision_block = nn.ModuleList(
                [transformer_block()])
            Logger().log_value(
                "self-att-vision.nparams",
                self.get_nparams(self.self_attention_vision_block),
                should_print=True,
            )

        if self.fusion_vision:
            self.vi_fusion_blocks = nn.ModuleList([fusion_block()])
            Logger().log_value(
                "vi_fusion_blocks.nparams",
                self.get_nparams(self.vi_fusion_blocks),
                should_print=True,
            )

        # text
        self.question_cells = nn.ModuleList(
            [nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU())])

        self.output_module = RegressionFusionOutput(**output_params)

        if self.add_coords:
            self.lin_coords = nn.Linear(4, hidden_dim)

        Logger().log_value(
            "output.nparams",
            self.get_nparams(self.output_module),
            should_print=True,
        )

        Logger().log_value(
            "nparams",
            self.get_nparams(self),
            should_print=True,
        )

        Logger().log_value("nparams_txt_enc",
                           self.get_nparams_txt_enc(),
                           should_print=True)
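Several of the constructors above define q_att_linear0/q_att_linear1 (2400 -> 512 -> 2), i.e. two attention glimpses over the question words. A minimal sketch of the self-attention this implies; the shapes and the unmasked softmax are simplifying assumptions, not code from the repository.

import torch
import torch.nn as nn
import torch.nn.functional as F

batch, n_words, dim_q = 4, 14, 2400
q = torch.randn(batch, n_words, dim_q)                 # per-word question features

q_att_linear0 = nn.Linear(dim_q, 512)
q_att_linear1 = nn.Linear(512, 2)                      # 2 attention glimpses

alpha = q_att_linear1(torch.relu(q_att_linear0(q)))    # [batch, n_words, 2]
alpha = F.softmax(alpha, dim=1)                         # attention over word positions

# One attended vector per glimpse, concatenated -> [batch, 2 * dim_q]
glimpses = [(alpha[:, :, g:g + 1] * q).sum(dim=1) for g in range(2)]
q_pooled = torch.cat(glimpses, dim=1)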