def build(self):
    self._is_direct_features_input = self.config.direct_features_input

    # Encoders
    self.text_encoder = build_text_encoder(self.config.text_encoder)
    self.image_encoder = build_image_encoder(
        self.config.image_encoder, self._is_direct_features_input
    )

    # Projectors
    image_proj_config = deepcopy(self.config.image_projection)
    self.image_proj = build_classifier_layer(image_proj_config)

    text_proj_config = deepcopy(self.config.text_projection)
    self.text_proj = build_classifier_layer(text_proj_config)

    # Aggregators
    self.image_pool = AttnPool1d(self.config.final_hidden_size, 1)
    self.text_pool = AttnPool1d(self.config.final_hidden_size, 1)

    # Shared transformer
    transformer_layer = torch.nn.TransformerEncoderLayer(
        self.config.final_hidden_size, 4, 2048, dropout=0.1, activation="relu"
    )
    self.shared_transformer = torch.nn.TransformerEncoder(
        transformer_layer, num_layers=2
    )

    # Position embeddings - Image
    self.image_pos_emb = PositionEmbeddingSine(self.config.final_hidden_size // 2)

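# --- Illustration: attention pooling ---
# A minimal sketch of what an AttnPool1d-style aggregator does here: reduce a
# (batch, seq_len, hidden) sequence of encoder states to one pooled vector via
# learned attention weights. SimpleAttnPool1d is an illustrative stand-in,
# not MMF's actual AttnPool1d implementation.
import torch


class SimpleAttnPool1d(torch.nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        # One learned scorer assigns a weight to every sequence position
        self.scorer = torch.nn.Linear(hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, hidden) -> weights: (batch, seq_len, 1)
        weights = torch.softmax(self.scorer(x), dim=1)
        # Weighted sum over the sequence -> (batch, hidden)
        return (weights * x).sum(dim=1)
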
def build(self):
    self.base = FusionBase(self.config)
    # _is_direct_features_input is set in the model's __init__ from
    # config.direct_features_input (as in MMF's fusion models)
    num_features = self.config.num_features
    if not self._is_direct_features_input:
        num_features = self.config.modal_encoder.params.num_output_features

    # As the in_dim is dynamically calculated we need to copy classifier_config
    modal_classifier_config = deepcopy(self.config.modal_classifier)
    modal_classifier_config.params.in_dim = (
        num_features * self.config.modal_hidden_size
    )
    self.modal_classifier = build_classifier_layer(modal_classifier_config)

    text_classifier_config = deepcopy(self.config.text_classifier)
    text_classifier_config.params.in_dim = self.config.text_hidden_size
    self.text_classifier = build_classifier_layer(text_classifier_config)

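# --- Illustration: late-fusion forward pass ---
# A hedged sketch of how the two classifiers built above are plausibly used:
# each modality is classified separately and the decisions are merged
# afterwards ("late" fusion). The base's return signature and the averaging
# rule are assumptions for illustration, not the verified forward().
def forward(self, sample_list):
    text_embedding, modal_embedding = self.base(sample_list)
    text_logits = self.text_classifier(text_embedding)
    modal_logits = self.modal_classifier(modal_embedding)
    # Combine per-modality decisions rather than features
    return {"scores": (text_logits + modal_logits) / 2}
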
def build(self):
    self.vision_module = build_image_encoder(self.config.image_encoder)
    self.classifier = build_classifier_layer(self.config.classifier)
    self.language_module = ProjectionEmbedding(**self.config.text_encoder.params)
    self.dropout = torch.nn.Dropout(self.config.dropout)
    self.fusion = torch.nn.Linear(**self.config.fusion.params)

def build(self):
    self.base = UnimodalBase(self.config)
    self._is_direct_features_input = self.config.direct_features_input
    num_features = self.config.modal_encoder.params.num_output_features

    # As the in_dim is dynamically calculated we need to copy classifier_config
    classifier_config = deepcopy(self.config.classifier)
    classifier_config.params.in_dim = num_features * self.config.modal_hidden_size
    self.classifier = build_classifier_layer(classifier_config)

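# --- Illustration: why deepcopy the classifier config ---
# Config nodes are typically shared (e.g. reused across model instances), so
# writing the dynamically computed in_dim into self.config.classifier
# directly would leak the value into every other consumer of that config.
# A minimal sketch with plain dicts; the numbers are illustrative assumptions:
from copy import deepcopy

shared_config = {"params": {"in_dim": -1}}
local_config = deepcopy(shared_config)
local_config["params"]["in_dim"] = 1 * 2048  # num_output_features * modal_hidden_size
assert shared_config["params"]["in_dim"] == -1  # shared config stays untouched
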
def build(self):
    self.base = UnimodalBase(self.config)
    self._is_direct_features_input = self.config.direct_features_input
    if self.config.get("freeze_base", False):
        for param in self.base.parameters():
            param.requires_grad = False

    num_features = self.config.modal_encoder.params.num_output_features

    # As the in_dim is dynamically calculated we need to copy classifier_config
    classifier_config = deepcopy(self.config.classifier)
    classifier_config.params.in_dim = num_features * self.config.modal_hidden_size
    self.classifier = build_classifier_layer(classifier_config)

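# --- Illustration: training with a frozen base ---
# With freeze_base enabled, only the classifier head should receive gradient
# updates. A hedged usage sketch, assuming `model` is a built instance of
# this class; the optimizer choice and learning rate are illustrative:
import torch

trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable, lr=1e-4)
# Every parameter under model.base had requires_grad set to False above,
# so only the freshly built classifier is updated during training.
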
def build(self): """ Config's image_encoder attribute will used to build an MMF image encoder. This config in yaml will look like: # "type" parameter specifies the type of encoder we are using here. # In this particular case, we are using resnet152 type: resnet152 # Parameters are passed to underlying encoder class by # build_image_encoder params: # Specifies whether to use a pretrained version pretrained: true # Pooling type, use max to use AdaptiveMaxPool2D pool_type: avg # Number of output features from the encoder, -1 for original # otherwise, supports between 1 to 9 num_output_features: 1 """ self.vision_module = build_image_encoder(self.config.image_encoder) """ For classifer, configuration would look like: # Specifies the type of the classifier, in this case mlp type: mlp # Parameter to the classifier passed through build_classifier_layer params: # Dimension of the tensor coming into the classifier in_dim: 512 # Dimension of the tensor going out of the classifier out_dim: 2 # Number of MLP layers in the classifier num_layers: 0 """ self.classifier = build_classifier_layer(self.config.classifier) # ProjectionEmbeddings takes in params directly as it is module # So, pass in kwargs, which are in_dim, out_dim and module # whose value would be "linear" as we want linear layer self.language_module = ProjectionEmbedding( **self.config.text_encoder.params) # Dropout value will come from config now self.dropout = torch.nn.Dropout(self.config.dropout) # Same as Projection Embedding, fusion's layer params (which are param # for linear layer) will come from config now self.fusion = torch.nn.Linear(**self.config.fusion.params) self.relu = torch.nn.ReLU()
def build(self):
    self.base = FusionBase(self.config)
    # _is_direct_features_input is set in the model's __init__ from
    # config.direct_features_input (as in MMF's fusion models)
    num_features = self.config.num_features
    if not self._is_direct_features_input:
        num_features = self.config.modal_encoder.params.num_output_features

    # As the in_dim is dynamically calculated we need to copy classifier_config
    classifier_config = deepcopy(self.config.classifier)
    classifier_config.params.in_dim = num_features * self.config.modal_hidden_size
    classifier_config.params.in_dim += self.config.text_hidden_size
    self.classifier = build_classifier_layer(classifier_config)

    if self.config.freeze_text or self.config.freeze_complete_base:
        for p in self.base.text.parameters():
            p.requires_grad = False

    if self.config.freeze_modal or self.config.freeze_complete_base:
        for p in self.base.modal.parameters():
            p.requires_grad = False

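# --- Illustration: the concatenated in_dim arithmetic ---
# The classifier consumes the flattened modal features concatenated with the
# text features. With illustrative sizes (assumptions, not config defaults):
num_features = 100        # e.g. region features from a detector
modal_hidden_size = 2048  # per-feature modal embedding size
text_hidden_size = 768    # text embedding size
in_dim = num_features * modal_hidden_size + text_hidden_size  # 205568
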
def build(self):
    self.base = UnimodalBase(self.config)
    # As the in_dim is dynamically calculated we need to copy classifier_config
    classifier_config = deepcopy(self.config.classifier)
    classifier_config.params.in_dim = self.config.text_hidden_size
    self.classifier = build_classifier_layer(classifier_config)