Example #1
File: encoders.py  Project: naykun/mmf
    def __init__(self, config: Config, *args, **kwargs):
        super().__init__()
        encoder_type = config.type
        if isinstance(encoder_type, ImageFeatureEncoderTypes):
            encoder_type = encoder_type.value

        assert ("in_dim" in config.params
                ), "ImageFeatureEncoder require 'in_dim' param in config"
        params = config.params

        if encoder_type == "default" or encoder_type == "identity":
            self.module = Identity()
            self.module.in_dim = params.in_dim
            self.module.out_dim = params.in_dim
        elif encoder_type == "projection":
            if "module" not in params:
                params = deepcopy(params)
                params.module = "linear"
            self.module = ProjectionEmbedding(**params)
        elif encoder_type == "finetune_faster_rcnn_fpn_fc7":
            self.module = FinetuneFasterRcnnFpnFc7(params)
        elif encoder_type == "spatial":
            self.module = VisionSpatialEmbedding(**params)
        else:
            raise NotImplementedError("Unknown Image Encoder: %s" %
                                      encoder_type)

        self.out_dim = self.module.out_dim
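
A minimal usage sketch for this dispatcher, assuming the enclosing class is
ImageFeatureEncoder (the assert message suggests as much) and that configs are
OmegaConf objects, as elsewhere in mmf; the dims are made-up values:

    from omegaconf import OmegaConf

    # "module" is left out on purpose: the "projection" branch above
    # deep-copies params and defaults it to "linear".
    config = OmegaConf.create({
        "type": "projection",
        "params": {"in_dim": 2048, "out_dim": 512},  # dims are assumptions
    })
    encoder = ImageFeatureEncoder(config)
    print(encoder.out_dim)  # 512, taken from the wrapped ProjectionEmbedding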
Example #2
    def _init_feature_embeddings(self, attr):
        feature_embeddings_list = []
        num_feature_feat = len(
            getattr(self.config, f"{attr}_feature_encodings"))

        self.image_feature_projection = ProjectionEmbedding(
            **self.config.image_feature_projection)
        self.feature_embeddings_out_dim = 0

        if self.config.image_intra_attention:
            self.image_feature_intra_attention = nn.MultiheadAttention(
                **self.config.image_feature_attentions[0])

        for _ in range(num_feature_feat):
            feature_embeddings = []
            feature_attn_model_list = self.config[attr + "_feature_embeddings"]

            for feature_attn_model_params in feature_attn_model_list:
                feature_embedding = nn.MultiheadAttention(
                    **feature_attn_model_params)
                feature_embeddings.append(feature_embedding)
                self.feature_embeddings_out_dim += feature_attn_model_params[
                    "embed_dim"]

            feature_embeddings = nn.ModuleList(feature_embeddings)
            feature_embeddings_list.append(feature_embeddings)

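        # Copy the accumulated out_dim to an attr-specific attribute and drop
        # the temporary counter, so a later call with another attr (e.g.
        # "context") starts its own sum from zero.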
        setattr(self, attr + "_feature_embeddings_out_dim",
                self.feature_embeddings_out_dim)
        del self.feature_embeddings_out_dim
        setattr(
            self,
            attr + "_feature_embeddings_list",
            nn.ModuleList(feature_embeddings_list),
        )
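
A hypothetical config slice this method could consume for attr == "image";
the key names come from the lookups above, and every value is an assumption:

    from omegaconf import OmegaConf

    config = OmegaConf.create({
        # Two feature sources: the outer loop runs twice, so the summed
        # out_dim ends up as 2 * 768 here.
        "image_feature_encodings": [{}, {}],
        "image_feature_projection": {"module": "linear", "in_dim": 2048, "out_dim": 768},
        "image_intra_attention": False,
        # Each entry holds kwargs for nn.MultiheadAttention.
        "image_feature_embeddings": [{"embed_dim": 768, "num_heads": 8}],
        "image_feature_attentions": [{"embed_dim": 768, "num_heads": 8}],
    })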
Example #3
    def build(self):
        self.vision_module = build_image_encoder(self.config.image_encoder)
        self.classifier = build_classifier_layer(self.config.classifier)
        self.language_module = ProjectionEmbedding(
            **self.config.text_encoder.params)
        self.dropout = torch.nn.Dropout(self.config.dropout)
        self.fusion = torch.nn.Linear(**self.config.fusion.params)
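
Since self.config.fusion.params is unpacked straight into torch.nn.Linear and
self.config.text_encoder.params into ProjectionEmbedding, a matching config
slice might look like this sketch (all values are assumptions):

    from omegaconf import OmegaConf

    config = OmegaConf.create({
        "text_encoder": {"params": {"module": "linear", "in_dim": 300, "out_dim": 512}},
        "dropout": 0.5,
        # nn.Linear(**params) expects its own argument names:
        "fusion": {"params": {"in_features": 1024, "out_features": 512}},
    })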
Example #4
    def build(self):
        """
       Config's image_encoder attribute will used to build an MMF image
       encoder. This config in yaml will look like:
 
       # "type" parameter specifies the type of encoder we are using here.
       # In this particular case, we are using resnet152
       type: resnet152
    
       # Parameters are passed to underlying encoder class by
       # build_image_encoder
       params:
         # Specifies whether to use a pretrained version
         pretrained: true
         # Pooling type, use max to use AdaptiveMaxPool2D
         pool_type: avg
    
         # Number of output features from the encoder, -1 for original
         # otherwise, supports between 1 to 9
         num_output_features: 1
       """
        self.vision_module = build_image_encoder(self.config.image_encoder)
        """
       For classifer, configuration would look like:
       # Specifies the type of the classifier, in this case mlp
       type: mlp
       # Parameter to the classifier passed through build_classifier_layer
       params:
         # Dimension of the tensor coming into the classifier
         in_dim: 512
         # Dimension of the tensor going out of the classifier
         out_dim: 2
         # Number of MLP layers in the classifier
         num_layers: 0
       """
        self.classifier = build_classifier_layer(self.config.classifier)

        # ProjectionEmbedding takes in params directly as it is a module.
        # So, pass in kwargs, which are in_dim, out_dim and module,
        # whose value would be "linear" as we want a linear layer
        self.language_module = ProjectionEmbedding(
            **self.config.text_encoder.params)
        # Dropout value will come from config now
        self.dropout = torch.nn.Dropout(self.config.dropout)
        # As with ProjectionEmbedding, the fusion layer's params (which are
        # arguments for a linear layer) will come from the config now
        self.fusion = torch.nn.Linear(**self.config.fusion.params)
        self.relu = torch.nn.ReLU()
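
The two docstrings above, restated as one runnable config: the image_encoder
and classifier values come from the docstrings themselves, while text_encoder,
dropout, and fusion reuse the assumed values from the sketch after example #3:

    from omegaconf import OmegaConf

    config = OmegaConf.create({
        "image_encoder": {
            "type": "resnet152",
            "params": {"pretrained": True, "pool_type": "avg",
                       "num_output_features": 1},
        },
        "classifier": {
            "type": "mlp",
            "params": {"in_dim": 512, "out_dim": 2, "num_layers": 0},
        },
        "text_encoder": {"params": {"module": "linear", "in_dim": 300, "out_dim": 512}},
        "dropout": 0.5,
        "fusion": {"params": {"in_features": 1024, "out_features": 512}},
    })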
Example #5
    def __init__(self, encoder_type, in_dim, **kwargs):
        super().__init__()

        if encoder_type == "default" or encoder_type == "identity":
            self.module = Identity()
            self.module.in_dim = in_dim
            self.module.out_dim = in_dim
        elif encoder_type == "projection":
            module_type = kwargs.pop("module", "linear")
            self.module = ProjectionEmbedding(module_type, in_dim, **kwargs)
        elif encoder_type == "finetune_faster_rcnn_fpn_fc7":
            self.module = FinetuneFasterRcnnFpnFc7(in_dim, **kwargs)
        else:
            raise NotImplementedError("Unknown Image Encoder: %s" % encoder_type)

        self.out_dim = self.module.out_dim
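
A usage sketch for this kwargs-based variant; the class name is an assumption
(example #1's assert suggests ImageFeatureEncoder), and extra kwargs such as
out_dim flow through to ProjectionEmbedding:

    encoder = ImageFeatureEncoder("projection", in_dim=2048, out_dim=512)
    # module was not passed, so kwargs.pop defaults it to "linear".
    assert encoder.out_dim == 512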