def build(self):
    """Create every sub-component of the model from ``self.config``.

    The following attributes are set on ``self`` (nothing is returned):
    ``_is_direct_features_input``, ``text_encoder``, ``image_encoder``,
    ``image_proj``, ``text_proj``, ``image_pool``, ``text_pool``,
    ``shared_transformer`` and ``image_pos_emb``.
    """
    # NOTE(review): components are created in the same order as before;
    # presumably that order matters for parameter registration/initialisation,
    # so do not shuffle these statements without confirming.
    cfg = self.config

    direct_features = cfg.direct_features_input
    self._is_direct_features_input = direct_features

    # --- Encoders -------------------------------------------------------
    self.text_encoder = build_text_encoder(cfg.text_encoder)
    self.image_encoder = build_image_encoder(cfg.image_encoder, direct_features)

    # --- Projectors -----------------------------------------------------
    # Each projection config is deep-copied before being handed to the
    # layer builder, so the builder never receives the original object.
    self.image_proj = build_classifier_layer(deepcopy(cfg.image_projection))
    self.text_proj = build_classifier_layer(deepcopy(cfg.text_projection))

    # --- Aggregators ----------------------------------------------------
    hidden_size = cfg.final_hidden_size
    self.image_pool = AttnPool1d(hidden_size, 1)
    self.text_pool = AttnPool1d(hidden_size, 1)

    # --- Shared transformer ---------------------------------------------
    # Two stacked encoder layers built from a single layer definition.
    encoder_layer = torch.nn.TransformerEncoderLayer(
        d_model=hidden_size,
        nhead=4,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
    )
    self.shared_transformer = torch.nn.TransformerEncoder(
        encoder_layer, num_layers=2
    )

    # --- Position embeddings (image) ------------------------------------
    self.image_pos_emb = PositionEmbeddingSine(hidden_size // 2)
def _build_encoders(self, config): text_encoder = None if config.get("text_encoder", None): text_encoder = build_text_encoder(config.text_encoder) modal_encoder = None if config.get("modal_encoder", None): modal_encoder = self._build_modal_encoder(config.modal_encoder) return (text_encoder, modal_encoder)