Example #1
    def __init__(self, num_answers, fn_type="softmax"):
        super().__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)

        hid_dim = self.lxrt_encoder.dim
        print("Size of Hidden Dimension:", hid_dim)
        fc_dim = int(hid_dim)
        print("Size of Hidden Dimension:", fc_dim)

        # Type Predictor
        self.type_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                     BertLayerNorm(hid_dim * 2, eps=1e-12),
                                     nn.Linear(hid_dim * 2, 3))

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=-1)

        if fn_type == "tanh":
            self.fn = self.tanh
            print("FN: TANH")
        elif fn_type == "softmax":
            self.fn = self.softmax
            print("FN: SOFTMAX")
        else:
            self.fn = self.sigmoid
            print("FN: SIGMOID")

        # YESNO feedforward
        self.yesno_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(2 * hid_dim, fc_dim), GeLU(),
                                      BertLayerNorm(fc_dim, eps=1e-12))

        # NUMBER feedforward
        self.number_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                       BertLayerNorm(hid_dim * 2, eps=1e-12),
                                       nn.Linear(2 * hid_dim, fc_dim), GeLU(),
                                       BertLayerNorm(fc_dim, eps=1e-12))

        # OTHER feedforward
        self.other_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(2 * hid_dim, fc_dim), GeLU(),
                                      BertLayerNorm(fc_dim, eps=1e-12))

        # Fusion head: combine the pooled output with the gated type branches
        self.logit_fc1 = nn.Sequential(nn.Linear(4 * fc_dim, hid_dim * 2),
                                       GeLU(),
                                       BertLayerNorm(hid_dim * 2, eps=1e-12),
                                       nn.Linear(hid_dim * 2, hid_dim))

        # Answering head
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_answers))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
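The forward pass is not shown on this page. A minimal sketch of how these modules could chain together, assuming the type_fc output is turned into per-branch gates by self.fn and the gated branches are concatenated with the pooled encoder output (the argument names and the gating scheme are assumptions, not the original author's code):

import torch

def forward(self, feat, pos, sent):
    x = self.lxrt_encoder(sent, (feat, pos))   # pooled cross-modal vector: (batch, hid_dim)
    type_logits = self.type_fc(x)              # yes/no vs. number vs. other: (batch, 3)
    gates = self.fn(type_logits)               # per-type gate weights
    fused = torch.cat([x,
                       gates[:, 0:1] * self.yesno_fc(x),
                       gates[:, 1:2] * self.number_fc(x),
                       gates[:, 2:3] * self.other_fc(x)], dim=-1)  # (batch, 4 * fc_dim)
    x = self.logit_fc1(fused)                  # fuse back down to hid_dim
    return self.logit_fc(x), type_logits       # answer logits + type logits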
Example #2
    def __init__(self, num_answers, model_type='full'):
        super().__init__()
        self.model_type = model_type
        self.lxrt_encoder = LXRTEncoder(args,
                                        max_seq_length=MAX_CLF_LENGTH,
                                        model_type=args.model_type)

        hid_dim = self.lxrt_encoder.dim

        if num_answers == 2:
            output_dim = 1
        else:
            output_dim = num_answers

        if self.model_type != 'concat':
            self.logit_fc = nn.Sequential(
                nn.Dropout(args.dropout), nn.Linear(hid_dim, hid_dim * 2),
                GeLU(), BertLayerNorm(hid_dim * 2, eps=1e-12),
                nn.Dropout(args.dropout), nn.Linear(hid_dim * 2, output_dim))
        else:
            linear = nn.Linear(hid_dim, output_dim)
            self.logit_fc = nn.Sequential(
                nn.Dropout(args.dropout),
                linear,
            )

        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
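Since the head emits a single logit when num_answers == 2 and a full distribution otherwise, the loss has to match the output shape. A hedged sketch of the pairing (logits and labels are placeholders):

import torch.nn as nn

if num_answers == 2:
    criterion = nn.BCEWithLogitsLoss()   # labels: floats in {0., 1.}, shape (batch, 1)
else:
    criterion = nn.CrossEntropyLoss()    # labels: long class ids, shape (batch,)
loss = criterion(logits, labels)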
Example #3
    def __init__(self, num_answers):
        super().__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
        hid_dim = self.lxrt_encoder.dim

        # Build decoder with attention
        self.decoder = DecoderWithAttention(attention_dim=hid_dim,
                                            embed_dim=hid_dim,
                                            decoder_dim=hid_dim,
                                            vocab_size=vocab_size,
                                            features_dim=hid_dim,
                                            dropout=0.5)

        # VQA Answer heads
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_answers))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

        self.lstm = nn.LSTM(input_size=hid_dim,
                            hidden_size=hid_dim,
                            num_layers=1,
                            batch_first=True)
        self.linear = nn.Linear(hid_dim,
                                vocab_size)  # BERT's vocab size is 30522
Example #4
    def __init__(self, num_answers):
        super().__init__()
        
        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(
            args,
            max_seq_length=MAX_VQA_LENGTH
        )
        hid_dim = self.lxrt_encoder.dim
        
        # VQA Answer heads
        self.logit_fc = nn.Sequential(
            nn.Linear(hid_dim, hid_dim * 2),
            GeLU(),
            BertLayerNorm(hid_dim * 2, eps=1e-12),
            nn.Linear(hid_dim * 2, num_answers)
        )
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
        # from https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.pth
        # https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.yaml

        self.args = SimpleNamespace(
                                # model_file= 'data/faster-rcnn-r101.pth',
                                model_file='data/R-50-FPN.pth',
                                config_file='data/R-50-FPN.yaml',
                                # config_file='../vqa-faster-rcnn/configs/visual_genome_vqa/e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512_vqa_test.yaml',
                                batch_size=args.batch_size,
                                num_features=36,
                                feature_name="fc6",
                                confidence_threshold=0,
                                background=True,
                                partition=0)
        self.detection_model = self._build_detection_model()
Example #5
 def __init__(self, num_answers):
     super().__init__()
     self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_GQA_LENGTH)
     hid_dim = self.lxrt_encoder.dim
     self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                   BertLayerNorm(hid_dim * 2, eps=1e-12),
                                   nn.Linear(hid_dim * 2, num_answers))
     self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
Example #6
 def __init__(self, metric):
     super(MatchingDecoderLV, self).__init__()
     HIDDEN_DECODER_SIZE = 256
     hid_dim = 768
     self.lang_proj = nn.Sequential(
         nn.Linear(hid_dim, hid_dim * 2),
         GeLU(),
         BertLayerNorm(hid_dim * 2, eps=1e-12),
         nn.Linear(hid_dim * 2, HIDDEN_DECODER_SIZE)
     )
     self.vis_proj = nn.Sequential(
         nn.Linear(hid_dim, hid_dim * 2),
         GeLU(),
         BertLayerNorm(hid_dim * 2, eps=1e-12),
         nn.Linear(hid_dim * 2, HIDDEN_DECODER_SIZE)
     )
     self.metric = metric
     assert metric in ['sdp', 'cosine']
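The matching forward for MatchingDecoderLV is not included on this page. A sketch of how the two projections might be compared under each metric, reading 'sdp' as scaled dot product; that reading is an assumption based only on the assert:

import torch.nn.functional as F

def score(self, lang_feat, vis_feat):
    l = self.lang_proj(lang_feat)   # (batch, HIDDEN_DECODER_SIZE)
    v = self.vis_proj(vis_feat)     # (batch, HIDDEN_DECODER_SIZE)
    if self.metric == 'sdp':
        return (l * v).sum(dim=-1) / l.size(-1) ** 0.5   # scaled dot product
    return F.cosine_similarity(l, v, dim=-1)             # cosine similarity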
Example #7
 def __init__(self):
     super().__init__()
     self.lxrt_encoder = LXRTEncoder(args, max_seq_length=20)
     self.hid_dim = hid_dim = self.lxrt_encoder.dim
     self.logit_fc = nn.Sequential(nn.Linear(hid_dim * 2, hid_dim * 2),
                                   GeLU(),
                                   BertLayerNorm(hid_dim * 2, eps=1e-12),
                                   nn.Linear(hid_dim * 2, 2))
     self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
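The hid_dim * 2 input to the head suggests an NLVR2-style setup in which the encoder runs once per image and the two pooled vectors of a pair are concatenated. A hedged forward sketch under that assumption:

def forward(self, feat, pos, sent):
    x = self.lxrt_encoder(sent, (feat, pos))   # two images per example: (batch * 2, hid_dim)
    x = x.view(-1, self.hid_dim * 2)           # re-pair the images: (batch, hid_dim * 2)
    return self.logit_fc(x)                    # two-way logits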
Example #8
    def __init__(self, num_answers):
        super().__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
        hid_dim = self.lxrt_encoder.dim

        # VQA Answer heads
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, 300))

        self.logit_fc_ans = nn.Sequential(
            nn.Linear(hid_dim, hid_dim * 2), GeLU(),
            BertLayerNorm(hid_dim * 2, eps=1e-12),
            nn.Linear(hid_dim * 2, num_answers))

        self.emb_proj = nn.Sequential(nn.Linear(300, hid_dim), GeLU(),
                                      BertLayerNorm(hid_dim, eps=1e-12),
                                      nn.Linear(hid_dim, 300))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
Example #9
    def __init__(self, num_blocks):
        super().__init__()

        # Build LXRT encoder
        #TODO: Make a new class in entry file
        self.policy_lxrt_encoder = PolicyLXRTEncoder(
            args, max_seq_length=MAX_VQA_LENGTH)
        hid_dim = self.policy_lxrt_encoder.dim

        # VQA Answer heads
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_blocks))
        self.logit_fc.apply(self.policy_lxrt_encoder.model.init_bert_weights)
Example #10
 def __init__(self, model_type='full'):
     super().__init__()
     self.model_type = model_type
     self.lxrt_encoder = LXRTEncoder(args,
                                     max_seq_length=MAX_RANK_LENGTH,
                                     model_type=args.model_type)
     self.hid_dim = hid_dim = self.lxrt_encoder.dim
     if self.model_type != 'concat':
         self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim), GeLU(),
                                       BertLayerNorm(hid_dim, eps=1e-12),
                                       nn.Linear(hid_dim, 1))
     else:
          self.logit_fc = nn.Sequential(nn.Linear(hid_dim, 1))
     self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
Example #11
    def __init__(self, num_answers):
        super().__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
        hid_dim = self.lxrt_encoder.dim

        # VQA Answer heads
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_answers))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
        device = torch.device('cpu')
        detection_model = torch.load(args.detection_model, map_location=device)
        self.detection_model = detection_model['model'].float().fuse().eval()
Example #12
    def __init__(self, num_answers):
        super().__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
        hid_dim = self.lxrt_encoder.dim

        # VQA Answer heads
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_answers))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                                       do_lower_case=True)
Example #13
    def __init__(self, num_answers, attention=False):
        super().__init__()
        print(f"Making {__name__}")
        self.flag = True
        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args,
                                        max_seq_length=MAX_VQA_LENGTH,
                                        attention=attention)
        hid_dim = self.lxrt_encoder.dim

        # VQA Answer heads
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_answers))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
Example #14
    def __init__(self, args, width=100, height=100):
        super().__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_SEQ_LENGTH)
        hid_dim = self.lxrt_encoder.dim
        num_logits = width * height
        self.width = width
        self.height = height
        self.n_actions = num_logits

        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_logits))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

        self.use_detectron = args.use_detectron

        if self.use_detectron:
            print('Detectron will be used.')
            data_path = DATA_PATH

            vg_classes = []
            with open(os.path.join(data_path, 'objects_vocab.txt')) as f:
                for line in f:
                    vg_classes.append(line.split(',')[0].lower().strip())

            MetadataCatalog.get("vg").thing_classes = vg_classes
            yaml_file = DETECTRON2_YAML
            cfg = get_cfg()
            cfg.merge_from_file(yaml_file)
            cfg.MODEL.RPN.POST_NMS_TOPK_TEST = 300
            cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.6
            cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6
            # VG Weight
            cfg.MODEL.WEIGHTS = "http://nlp.cs.unc.edu/models/faster_rcnn_from_caffe.pkl"
            self.predictor = DefaultPredictor(cfg)
        else:
            print('Resnet will be used.')
            self.cnn = nn.Sequential(
                *(list(models.resnet18(
                    pretrained=True).children())[:-3])).cuda().eval()
            self.cnn2box = nn.Linear(256, 2048)
            self.preprocess = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),
            ])
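In the ResNet fallback, self.cnn keeps resnet18 up to layer3 (256 channels) and cnn2box lifts each spatial cell to the 2048-d feature size LXMERT-style encoders expect. A hedged sketch of turning one image into grid "region" features; the PIL loading and the cells-as-regions step are assumptions:

import torch
from PIL import Image

img = self.preprocess(Image.open(img_path).convert('RGB')).unsqueeze(0).cuda()
with torch.no_grad():
    fmap = self.cnn(img)                    # layer3 feature map: (1, 256, H', W')
feats = fmap.flatten(2).transpose(1, 2)     # grid cells as regions: (1, H' * W', 256)
feats = self.cnn2box(feats)                 # LXMERT-sized features: (1, H' * W', 2048)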
Example #15
    def __init__(self, num_classes=2):
        super().__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args,
                                        max_seq_length=MAX_UTTERANCE_LENGTH)
        hid_dim = self.lxrt_encoder.dim

        # VCSD image features dimensions adjuster
        self.adaptive_pool = nn.AdaptiveAvgPool2d((36, 2048))

        # VCSD Classification head
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_classes))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
Example #16
    def __init__(self, num_answers, finetune_strategy='standard'):
        super().__init__()

        #self.finetune_strategy = finetune_strategy

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args,
                                        max_seq_length=MAX_VQA_LENGTH,
                                        finetune_strategy=finetune_strategy)
        hid_dim = self.lxrt_encoder.dim

        # VQA Answer heads
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_answers))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
Example #17
 def create_head(self, num_answers):
     hid_dim = self.lxrt_encoder.dim
     if self.logit_fc is None:
         if self.encoder_type == 'lxrt':
             self.logit_fc = nn.Sequential(
                 nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                 BertLayerNorm(hid_dim * 2, eps=1e-12),
                 nn.Linear(hid_dim * 2, num_answers))
         else:
             self.logit_fc = nn.Linear(1024, num_answers)
         init_weights = (
             self.lxrt_encoder.model.init_bert_weights
             if not isinstance(self.lxrt_encoder.model, nn.DataParallel)
             else self.lxrt_encoder.model.module.init_bert_weights)
         self.logit_fc.apply(init_weights)
         return
     self.logit_fc[-1] = nn.Linear(hid_dim * 2, num_answers)
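Hypothetical usage, e.g. when transferring a checkpoint to a new answer vocabulary; note that the replacement path (logit_fc[-1] = ...) assumes the Sequential head from the 'lxrt' branch. The model, answer counts, and checkpoint below are placeholders:

model.create_head(num_answers=3129)              # first call: build and init the head
model.load_state_dict(state_dict, strict=False)  # load a pretrained checkpoint
model.create_head(num_answers=1842)              # later call: swap only the final layer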
Example #18
    def __init__(self, num_answers):
        super().__init__()

        self.lxrt_encoder = LXRTEncoder(
            args,
            max_seq_length=MAX_GQA_LENGTH
        )
        hid_dim = self.lxrt_encoder.dim
        self.logit_fc = nn.Sequential(
            nn.Linear(hid_dim, hid_dim * 2),
            GeLU(),
            BertLayerNorm(hid_dim * 2, eps=1e-12),
            nn.Linear(hid_dim * 2, num_answers)
        )
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
        if args.task_pointer != 'none':
            self.matching_decoder = MatchingDecoderLV(metric='sdp')
Example #19
    def __init__(self, num_answers):
        super().__init__()
        self.lxrt_encoder = LXRTEncoder(
            args,
            max_seq_length=MAX_GQA_LENGTH,
            mode='xl'
        )
        hid_dim = self.lxrt_encoder.dim
        self.logit_fc = nn.Sequential(
            nn.Linear(hid_dim, hid_dim * 2),
            GeLU(),
            BertLayerNorm(hid_dim * 2, eps=1e-12),
            nn.Linear(hid_dim * 2, num_answers)
        )
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

        if args.task_nsp_qfpm or args.task_mlm_qfpm:
            self.qfpm = BertPreTrainingHeads(
                BertConfig(vocab_size_or_config_json_file=30522),
                self.lxrt_encoder.model.bert.embeddings.word_embeddings.weight)
Example #20
    def __init__(self, num_answers):
        super().__init__()

        # Build LXRT encoder
        # lxrt.entry.LXRTEncoder -> LXRTFeatureExtraction -> LXRTModel
        self.lxrt_encoder = LXRTEncoder(
            args,
            max_seq_length=MAX_PVQA_LENGTH
        )
        hid_dim = self.lxrt_encoder.dim

        # VQA Answer heads
        self.logit_fc = nn.Sequential(
            nn.Linear(hid_dim, hid_dim * 2),
            GeLU(),
            BertLayerNorm(hid_dim * 2, eps=1e-12),
            nn.Linear(hid_dim * 2, num_answers)
        )
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
Example #21
    def __init__(self):
        super(KDDModel, self).__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args, mode='lx')
        hid_dim = self.lxrt_encoder.dim
        self.config = self.lxrt_encoder.model.config

        # Image-text heads
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, 2))
        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

        # AMSoftmax loss heads
        self.logit_W = torch.nn.Parameter(torch.randn(hid_dim, 2),
                                          requires_grad=True)
        nn.init.xavier_normal_(self.logit_W, gain=1)

        # MLM heads
        self.cls = BertPreTrainingHeads(
            self.config,
            self.lxrt_encoder.model.bert.embeddings.word_embeddings.weight)
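A minimal AMSoftmax scoring sketch using logit_W above; the scale s and margin m values are illustrative assumptions:

import torch.nn.functional as F

def am_logits(self, x, labels, s=30.0, m=0.35):
    w = F.normalize(self.logit_W, dim=0)           # unit-norm class columns: (hid_dim, 2)
    x = F.normalize(x, dim=-1)                     # unit-norm features: (batch, hid_dim)
    cos = x @ w                                    # per-class cosine scores: (batch, 2)
    margin = F.one_hot(labels, num_classes=2) * m  # subtract the margin at the target class
    return s * (cos - margin)                      # feed into nn.CrossEntropyLoss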
Example #22
    def __init__(self, num_answers, fn_type="softmax"):
        super().__init__()

        # Build LXRT encoder
        self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)

        hid_dim = self.lxrt_encoder.dim
        print("Size of Hidden Dimension:", hid_dim)
        fc_dim = int(hid_dim)
        print("Size of Hidden Dimension:", fc_dim)

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=-1)

        if fn_type == "tanh":
            self.fn = self.tanh
            print("FN: TANH")
        elif fn_type == "softmax":
            self.fn = self.softmax
            print("FN: SOFTMAX")
        else:
            self.fn = self.sigmoid
            print("FN: SIGMOID")

        # YN:AND/OR/NOT/NONE Type Predictor
        self.yn_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                   BertLayerNorm(hid_dim * 2, eps=1e-12),
                                   nn.Linear(hid_dim * 2, 4))

        # AND FF
        self.and_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                    BertLayerNorm(hid_dim * 2, eps=1e-12),
                                    nn.Linear(2 * hid_dim, fc_dim), GeLU(),
                                    BertLayerNorm(fc_dim, eps=1e-12))

        # OR FF
        self.or_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                   BertLayerNorm(hid_dim * 2, eps=1e-12),
                                   nn.Linear(2 * hid_dim, fc_dim), GeLU(),
                                   BertLayerNorm(fc_dim, eps=1e-12))

        # NOT FF
        self.not_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                    BertLayerNorm(hid_dim * 2, eps=1e-12),
                                    nn.Linear(2 * hid_dim, fc_dim), GeLU(),
                                    BertLayerNorm(fc_dim, eps=1e-12))

        # NONE FF
        self.none_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                     BertLayerNorm(hid_dim * 2, eps=1e-12),
                                     nn.Linear(2 * hid_dim, fc_dim), GeLU(),
                                     BertLayerNorm(fc_dim, eps=1e-12))

        # Answering Heads
        self.logit_fc1 = nn.Sequential(nn.Linear(6 * fc_dim, hid_dim * 2),
                                       GeLU(),
                                       BertLayerNorm(hid_dim * 2, eps=1e-12),
                                       nn.Linear(hid_dim * 2, hid_dim))

        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, hid_dim * 2), GeLU(),
                                      BertLayerNorm(hid_dim * 2, eps=1e-12),
                                      nn.Linear(hid_dim * 2, num_answers))

        self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
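Heads like logit_fc above are commonly trained the way the original LXMERT VQA code does it: binary cross-entropy against soft answer scores, scaled by the answer-vocabulary size. A sketch with placeholder logits and target:

import torch.nn as nn

bce_loss = nn.BCEWithLogitsLoss()
loss = bce_loss(logits, target)   # target: soft scores in [0, 1], shape (batch, num_answers)
loss = loss * logits.size(1)      # LXMERT's scaling by the answer-vocab size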