示例#1
0
文件: m4c.py 项目: zhangshengHust/mmf
    def _build_ocr_encoding(self):
        """Create the layers used to encode OCR inputs.

        Copies the optional ``remove_ocr_*`` switches from ``self.config.ocr``
        onto ``self`` (each falls back to ``False`` when absent), builds the
        Faster R-CNN fc7 feature encoder and registers it in
        ``self.finetune_modules`` together with ``self.config.lr_scale_frcn``,
        then creates the linear projections, layer norms and dropout for the
        OCR feature and bounding-box inputs.
        """
        ocr_config = self.config.ocr
        for flag_name in (
            "remove_ocr_fasttext",
            "remove_ocr_phoc",
            "remove_ocr_frcn",
            "remove_ocr_semantics",
            "remove_ocr_bbox",
        ):
            setattr(self, flag_name, getattr(ocr_config, flag_name, False))

        # OCR appearance feature: Faster R-CNN
        frcn_encoder = ImageFeatureEncoder(
            encoder_type="finetune_faster_rcnn_fpn_fc7",
            in_dim=2048,
            weights_file="models/detectron.defaults/fc7_w.pkl",
            bias_file="models/detectron.defaults/fc7_b.pkl",
            model_data_dir=self.config.model_data_dir,
        )
        self.ocr_faster_rcnn_fc7 = frcn_encoder
        # The fc7 encoder is tracked separately with its own lr scale.
        self.finetune_modules.append(
            {"module": frcn_encoder, "lr_scale": self.config.lr_scale_frcn}
        )

        feat_in_dim = ocr_config.mmt_in_dim
        hidden_size = self.mmt_config.hidden_size
        self.linear_ocr_feat_to_mmt_in = nn.Linear(feat_in_dim, hidden_size)

        # OCR location feature: relative bounding box coordinates (4-dim)
        self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, hidden_size)

        self.ocr_feat_layer_norm = BertLayerNorm(hidden_size)
        self.ocr_bbox_layer_norm = BertLayerNorm(hidden_size)
        self.ocr_drop = nn.Dropout(ocr_config.dropout_prob)
示例#2
0
文件: build.py 项目: zpppy/mmf
def build_image_encoder(config, direct_features=False, **kwargs):
    """Construct an image encoder from ``config`` and return its inner module.

    Args:
        config: Encoder configuration. When ``direct_features`` is truthy its
            ``type`` and ``params`` entries are forwarded to
            ``ImageFeatureEncoder``; otherwise the whole object is handed to
            ``ImageEncoder``.
        direct_features: Selects ``ImageFeatureEncoder`` when truthy.
        **kwargs: Accepted but not used by this function.

    Returns:
        The ``module`` attribute of the constructed encoder.
    """
    # Resolved at call time rather than at module import.
    from mmf.modules.encoders import ImageEncoder, ImageFeatureEncoder

    if not direct_features:
        return ImageEncoder(config).module
    return ImageFeatureEncoder(config.type, **config.params).module
示例#3
0
    def _init_feature_encoders(self, attr: str):
        """Build the single feature encoder configured for ``attr``.

        Reads ``<attr>_feature_encodings`` and ``<attr>_feature_dim`` from
        ``self.config``, constructs one ``ImageFeatureEncoder`` and stores it
        as ``self.<attr>_feature_encoders``. ``self.<attr>_feature_dim`` is
        first set to the configured dim and finally to the encoder's
        ``out_dim``.

        Args:
            attr: Prefix used to form the config keys and attribute names.
        """
        dim_attr = attr + "_feature_dim"
        encoder_config = self.config[attr + "_feature_encodings"]
        input_dim = self.config[dim_attr]
        setattr(self, dim_attr, input_dim)

        kind = encoder_config.type
        # Deep-copy so the extra entries below do not leak into the config.
        params = copy.deepcopy(encoder_config.params)
        params.model_data_dir = self.config.model_data_dir
        params.cond_features = self.text_embeddings_out_dim

        encoder = ImageFeatureEncoder(kind, input_dim, **params)

        # Replace the configured dim with what the encoder actually emits.
        setattr(self, dim_attr, encoder.out_dim)
        setattr(self, attr + "_feature_encoders", encoder)
示例#4
0
    def _init_feature_projectors(self, attr):
        """Build the feature projectors listed in the config for ``attr``.

        One ``ImageFeatureEncoder`` is created per entry of
        ``self.config[<attr>_feature_projections]``; every one is constructed
        with the value ``self.<attr>_feature_dim`` had on entry. After each
        construction ``self.<attr>_feature_dim`` is overwritten with that
        projector's ``out_dim``. The projectors are stored as an
        ``nn.ModuleList`` in ``self.<attr>_feature_projectors``.

        Args:
            attr: Prefix used to form the config key and attribute names.
        """
        dim_attr = attr + "_feature_dim"
        projector_configs = self.config[attr + "_feature_projections"]
        input_dim = getattr(self, dim_attr)

        projectors = []
        for projector_config in projector_configs:
            projector = ImageFeatureEncoder(
                projector_config.type, input_dim, **projector_config.params
            )
            projectors.append(projector)
            setattr(self, dim_attr, projector.out_dim)

        setattr(self, attr + "_feature_projectors", nn.ModuleList(projectors))
示例#5
0
    def _init_feature_encoders(self, attr):
        """Build the feature encoders listed in the config for ``attr``.

        One ``ImageFeatureEncoder`` is created per entry of
        ``self.config[<attr>_feature_encodings]``, each constructed with the
        configured ``<attr>_feature_dim``. ``self.<attr>_feature_dim`` starts
        as that configured value and is overwritten with each new encoder's
        ``out_dim``. The encoders are stored as an ``nn.ModuleList`` in
        ``self.<attr>_feature_encoders``.

        Args:
            attr: Prefix used to form the config keys and attribute names.
        """
        encoders = []
        dim_attr = attr + "_feature_dim"
        encoder_configs = self.config[attr + "_feature_encodings"]
        input_dim = self.config[dim_attr]
        setattr(self, dim_attr, input_dim)

        for encoder_config in encoder_configs:
            kind = encoder_config.type
            # Deep-copy so adding model_data_dir does not touch the config.
            params = copy.deepcopy(encoder_config.params)
            params.model_data_dir = self.config.model_data_dir

            encoder = ImageFeatureEncoder(kind, input_dim, **params)
            encoders.append(encoder)
            setattr(self, dim_attr, encoder.out_dim)

        setattr(self, attr + "_feature_encoders", nn.ModuleList(encoders))
示例#6
0
    def _build_obj_encoding(self):
        """Create the layers used to encode object inputs.

        Builds the Faster R-CNN fc7 feature encoder and registers it in
        ``self.finetune_modules`` together with ``self.config.lr_scale_frcn``,
        then creates the linear projections, layer norms and dropout for the
        object feature and bounding-box inputs.
        """
        # object appearance feature: Faster R-CNN
        frcn_encoder = ImageFeatureEncoder(
            encoder_type="finetune_faster_rcnn_fpn_fc7",
            in_dim=2048,
            weights_file="models/detectron.defaults/fc7_w.pkl",
            bias_file="models/detectron.defaults/fc7_b.pkl",
            model_data_dir=self.config.model_data_dir,
        )
        self.obj_faster_rcnn_fc7 = frcn_encoder
        # apply smaller lr to pretrained Faster R-CNN fc7
        finetune_entry = {
            "module": frcn_encoder,
            "lr_scale": self.config.lr_scale_frcn,
        }
        self.finetune_modules.append(finetune_entry)

        feat_in_dim = self.config.obj.mmt_in_dim
        hidden_size = self.mmt_config.hidden_size
        self.linear_obj_feat_to_mmt_in = nn.Linear(feat_in_dim, hidden_size)

        # object location feature: relative bounding box coordinates (4-dim)
        self.linear_obj_bbox_to_mmt_in = nn.Linear(4, hidden_size)

        self.obj_feat_layer_norm = BertLayerNorm(hidden_size)
        self.obj_bbox_layer_norm = BertLayerNorm(hidden_size)
        self.obj_drop = nn.Dropout(self.config.obj.dropout_prob)