def __init__(self, config: Config, *args, **kwargs):
    """Create the wrapped image feature encoder selected by ``config.type``.

    Args:
        config: Encoder configuration. ``config.type`` is either an
            ``ImageFeatureEncoderTypes`` member or its plain value, and
            ``config.params`` must contain ``in_dim``.
        *args: Accepted but not used by this constructor.
        **kwargs: Accepted but not used by this constructor.

    Raises:
        AssertionError: If ``in_dim`` is absent from ``config.params``.
        NotImplementedError: If the encoder type is not one of the
            supported names.
    """
    super().__init__()

    # Normalise enum members to their underlying value before dispatching.
    kind = config.type
    if isinstance(kind, ImageFeatureEncoderTypes):
        kind = kind.value

    assert (
        "in_dim" in config.params
    ), "ImageFeatureEncoder require 'in_dim' param in config"
    params = config.params

    if kind in ("default", "identity"):
        # Pass-through encoder: output width equals input width.
        module = Identity()
        module.in_dim = params.in_dim
        module.out_dim = params.in_dim
    elif kind == "projection":
        if "module" not in params:
            # Work on a copy so the default is not written into the
            # caller's config object.
            params = deepcopy(params)
            params.module = "linear"
        module = ProjectionEmbedding(**params)
    elif kind == "finetune_faster_rcnn_fpn_fc7":
        module = FinetuneFasterRcnnFpnFc7(params)
    elif kind == "spatial":
        module = VisionSpatialEmbedding(**params)
    else:
        raise NotImplementedError("Unknown Image Encoder: %s" % kind)

    self.module = module
    self.out_dim = module.out_dim
def _init_feature_embeddings(self, attr):
    """Build the projection and attention modules for the ``attr`` features.

    Reads from ``self.config``:
      * ``{attr}_feature_encodings`` - its length decides how many groups
        of attention modules are created.
      * ``image_feature_projection`` - kwargs for ``ProjectionEmbedding``.
      * ``image_intra_attention`` / ``image_feature_attentions[0]`` -
        optional intra-attention layer and its kwargs.
      * ``{attr}_feature_embeddings`` - list of kwargs dicts, one
        ``nn.MultiheadAttention`` per entry, repeated for every encoding.

    Sets on ``self``:
      * ``image_feature_projection``
      * ``image_feature_intra_attention`` (only when enabled in config)
      * ``{attr}_feature_embeddings_out_dim`` - sum of ``embed_dim`` over
        every attention module created here.
      * ``{attr}_feature_embeddings_list`` - ``nn.ModuleList`` of
        ``nn.ModuleList`` objects, one inner list per encoding.

    Args:
        attr: Prefix used to look up config entries and to name the
            attributes stored on ``self``.
    """
    groups = []
    encoding_count = len(getattr(self.config, f"{attr}_feature_encodings"))

    self.image_feature_projection = ProjectionEmbedding(
        **self.config.image_feature_projection
    )
    # Scratch accumulator kept on the instance; it is moved to the
    # attr-specific name and removed again at the end of this method.
    self.feature_embeddings_out_dim = 0

    if self.config.image_intra_attention:
        self.image_feature_intra_attention = nn.MultiheadAttention(
            **self.config.image_feature_attentions[0]
        )

    for _ in range(encoding_count):
        attn_param_sets = self.config[attr + "_feature_embeddings"]
        # Modules are created in config order, one per parameter set.
        attn_layers = [
            nn.MultiheadAttention(**attn_params)
            for attn_params in attn_param_sets
        ]
        self.feature_embeddings_out_dim += sum(
            attn_params["embed_dim"] for attn_params in attn_param_sets
        )
        groups.append(nn.ModuleList(attn_layers))

    setattr(
        self,
        attr + "_feature_embeddings_out_dim",
        self.feature_embeddings_out_dim,
    )
    del self.feature_embeddings_out_dim

    setattr(self, attr + "_feature_embeddings_list", nn.ModuleList(groups))
def build(self):
    """Instantiate this model's submodules from ``self.config``.

    Creates, in this order: the image encoder, the classifier layer, the
    text projection, the dropout layer and the linear fusion layer. The
    order is kept fixed so that submodule registration is unchanged.
    """
    cfg = self.config

    # Vision and classification heads come from the project's builders.
    self.vision_module = build_image_encoder(cfg.image_encoder)
    self.classifier = build_classifier_layer(cfg.classifier)

    # Text side: the encoder params are forwarded as keyword arguments.
    self.language_module = ProjectionEmbedding(**cfg.text_encoder.params)

    # Regularisation and fusion layers, both configured from ``cfg``.
    self.dropout = torch.nn.Dropout(cfg.dropout)
    self.fusion = torch.nn.Linear(**cfg.fusion.params)
def build(self):
    """Instantiate this model's submodules from ``self.config``.

    The ``image_encoder`` entry of the config is used to build an MMF
    image encoder. In yaml that entry looks like:

        # "type" specifies which encoder to use; here it is resnet152.
        type: resnet152
        # "params" are passed to the underlying encoder class by
        # build_image_encoder.
        params:
          # Whether to use a pretrained version.
          pretrained: true
          # Pooling type; use max to get AdaptiveMaxPool2D.
          pool_type: avg
          # Number of output features from the encoder, -1 for the
          # original; otherwise values from 1 to 9 are supported.
          num_output_features: 1
    """
    cfg = self.config

    self.vision_module = build_image_encoder(cfg.image_encoder)

    # The ``classifier`` entry of the config looks like:
    #
    #     # Type of the classifier, in this case mlp.
    #     type: mlp
    #     # Parameters passed to the classifier through
    #     # build_classifier_layer.
    #     params:
    #       # Dimension of the tensor coming into the classifier.
    #       in_dim: 512
    #       # Dimension of the tensor going out of the classifier.
    #       out_dim: 2
    #       # Number of MLP layers in the classifier.
    #       num_layers: 0
    self.classifier = build_classifier_layer(cfg.classifier)

    # ProjectionEmbedding is a module that takes its params directly, so
    # the config entries (in_dim, out_dim and module, where module is
    # "linear" for a linear layer) are forwarded as keyword arguments.
    self.language_module = ProjectionEmbedding(**cfg.text_encoder.params)

    # The dropout probability is read from the config.
    self.dropout = torch.nn.Dropout(cfg.dropout)

    # As with the projection above, the fusion layer's params (the
    # arguments of a linear layer) are read from the config.
    self.fusion = torch.nn.Linear(**cfg.fusion.params)
    self.relu = torch.nn.ReLU()
def __init__(self, encoder_type, in_dim, **kwargs):
    """Create the wrapped image feature encoder named by ``encoder_type``.

    Args:
        encoder_type: One of ``"default"``, ``"identity"``,
            ``"projection"`` or ``"finetune_faster_rcnn_fpn_fc7"``.
        in_dim: Input feature dimension handed to the chosen encoder.
        **kwargs: Extra arguments forwarded to the chosen encoder. For
            ``"projection"`` an optional ``module`` entry selects the
            projection type and defaults to ``"linear"``.

    Raises:
        NotImplementedError: If ``encoder_type`` is not supported.
    """
    super().__init__()

    if encoder_type in ("default", "identity"):
        # Pass-through encoder: output width equals input width.
        module = Identity()
        module.in_dim = in_dim
        module.out_dim = in_dim
    elif encoder_type == "projection":
        # ``module`` is removed from kwargs so it is passed positionally
        # and not a second time as a keyword.
        projection_kind = kwargs.pop("module", "linear")
        module = ProjectionEmbedding(projection_kind, in_dim, **kwargs)
    elif encoder_type == "finetune_faster_rcnn_fpn_fc7":
        module = FinetuneFasterRcnnFpnFc7(in_dim, **kwargs)
    else:
        raise NotImplementedError("Unknown Image Encoder: %s" % encoder_type)

    self.module = module
    self.out_dim = module.out_dim