def _build_ocr_encoding(self):
    self.remove_ocr_fasttext = getattr(
        self.config.ocr, "remove_ocr_fasttext", False
    )
    self.remove_ocr_phoc = getattr(self.config.ocr, "remove_ocr_phoc", False)
    self.remove_ocr_frcn = getattr(self.config.ocr, "remove_ocr_frcn", False)
    self.remove_ocr_semantics = getattr(
        self.config.ocr, "remove_ocr_semantics", False
    )
    self.remove_ocr_bbox = getattr(self.config.ocr, "remove_ocr_bbox", False)

    # OCR appearance feature: Faster R-CNN
    self.ocr_faster_rcnn_fc7 = build_image_encoder(
        self._build_encoder_config(), direct_features=True
    )
    self.finetune_modules.append(
        {"module": self.ocr_faster_rcnn_fc7, "lr_scale": self.config.lr_scale_frcn}
    )

    self.linear_ocr_feat_to_mmt_in = nn.Linear(
        self.config.ocr.mmt_in_dim, self.mmt_config.hidden_size
    )

    # OCR location feature: relative bounding box coordinates (4-dim)
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, self.mmt_config.hidden_size)

    self.ocr_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_drop = nn.Dropout(self.config.ocr.dropout_prob)
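# A minimal sketch of how the OCR modules built above are typically combined
# in a forward pass: project appearance features and bbox coordinates into the
# transformer hidden size, layer-norm each, sum, then apply dropout. The
# method name and tensor shapes below are illustrative assumptions, not part
# of the original module.
def _forward_ocr_encoding_sketch(self, ocr_fc7, ocr_bbox):
    # ocr_fc7: (batch, num_ocr, mmt_in_dim); ocr_bbox: (batch, num_ocr, 4)
    ocr_feat = self.ocr_feat_layer_norm(self.linear_ocr_feat_to_mmt_in(ocr_fc7))
    ocr_pos = self.ocr_bbox_layer_norm(self.linear_ocr_bbox_to_mmt_in(ocr_bbox))
    return self.ocr_drop(ocr_feat + ocr_pos)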
def build(self):
    self._is_direct_features_input = self.config.direct_features_input

    # Encoders
    self.text_encoder = build_text_encoder(self.config.text_encoder)
    self.image_encoder = build_image_encoder(
        self.config.image_encoder, self._is_direct_features_input
    )

    # Projectors
    image_proj_config = deepcopy(self.config.image_projection)
    self.image_proj = build_classifier_layer(image_proj_config)

    text_proj_config = deepcopy(self.config.text_projection)
    self.text_proj = build_classifier_layer(text_proj_config)

    # Aggregators
    self.image_pool = AttnPool1d(self.config.final_hidden_size, 1)
    self.text_pool = AttnPool1d(self.config.final_hidden_size, 1)

    # Shared transformer
    transformer_layer = torch.nn.TransformerEncoderLayer(
        self.config.final_hidden_size, 4, 2048, dropout=0.1, activation="relu"
    )
    self.shared_transformer = torch.nn.TransformerEncoder(
        transformer_layer, num_layers=2
    )

    # Position embeddings - Image
    self.image_pos_emb = PositionEmbeddingSine(self.config.final_hidden_size // 2)
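# Illustrative note (an assumption, not taken from the build above):
# torch.nn.TransformerEncoder built without batch_first expects input shaped
# (seq_len, batch, hidden), so batch-first image/text sequences need a
# transpose before and after the shared transformer. A minimal sketch:
def _shared_encode_sketch(self, seq_features):
    # seq_features: (batch, seq_len, final_hidden_size)
    out = self.shared_transformer(seq_features.transpose(0, 1))
    return out.transpose(0, 1)  # back to (batch, seq_len, hidden)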
def build(self):
    self.vision_module = build_image_encoder(self.config.image_encoder)
    self.classifier = build_classifier_layer(self.config.classifier)
    self.language_module = ProjectionEmbedding(**self.config.text_encoder.params)
    self.dropout = torch.nn.Dropout(self.config.dropout)
    self.fusion = torch.nn.Linear(**self.config.fusion.params)
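# A minimal forward sketch for this late-fusion model, assuming SampleList
# fields "text" and "image" as in the MMF tutorials; the field names and the
# concatenation order are assumptions for illustration:
def _forward_sketch(self, sample_list):
    text = self.language_module(sample_list["text"])
    image = self.vision_module(sample_list["image"])
    # flatten both modalities and fuse with a single linear layer
    combined = torch.cat([text, image.flatten(start_dim=1)], dim=1)
    fused = self.dropout(self.fusion(combined))
    return {"scores": self.classifier(fused)}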
def build(self):
    # to be further set
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True
    )
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)

    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased"
        )
    elif self.config.base_model_name == "2layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.max_position_embeddings = 1090
        config_encoder.num_hidden_layers = 2
        config_decoder.num_hidden_layers = 2
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 3
        config_decoder.num_hidden_layers = 3
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)

    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method
        )
    if (
        hasattr(self.config, "pretrans_attention")
        and self.config.pretrans_attention
    ):
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
    self.BOS_ID = 101

    self.vae = OpenAIDiscreteVAE()
    image_code_dim = 768
    image_fmap_size = self.vae.image_size // (2 ** self.vae.num_layers)
    self.image_seq_len = image_fmap_size ** 2
    self.image_emb = torch.nn.Embedding(self.vae.num_tokens, image_code_dim)
    self.image_pos_emb = AxialPositionalEmbedding(
        image_code_dim, axial_shape=(image_fmap_size, image_fmap_size)
    )
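# Sketch of how the discrete VAE pieces above could embed an image into a
# token sequence for the encoder-decoder. get_codebook_indices is part of the
# DALLE-pytorch OpenAIDiscreteVAE API; the wiring below is an assumption, not
# taken from the original build:
def _embed_image_tokens_sketch(self, images):
    # images: (batch, 3, H, W) -> discrete codebook ids: (batch, image_seq_len)
    image_tokens = self.vae.get_codebook_indices(images)
    emb = self.image_emb(image_tokens)   # (batch, image_seq_len, image_code_dim)
    emb = emb + self.image_pos_emb(emb)  # add axial position embeddings
    return emb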
def _init_feature_encoders(self, attr: str):
    feat_encoder = self.config[attr + "_feature_encodings"]
    feature_dim = self.config[attr + "_feature_dim"]
    setattr(self, attr + "_feature_dim", feature_dim)

    feat_encoder_config = copy.deepcopy(feat_encoder)
    feat_encoder_config.params.model_data_dir = self.config.model_data_dir
    feat_encoder_config.params.in_dim = feature_dim
    feat_model = build_image_encoder(feat_encoder_config, direct_features=True)

    # the encoder may change the feature dimensionality, so overwrite it
    setattr(self, attr + "_feature_dim", feat_model.out_dim)
    setattr(self, attr + "_feature_encoders", feat_model)
def _init_feature_encoders(self, attr):
    feat_encoders = []
    feat_encoders_list_config = self.config[attr + "_feature_encodings"]
    feature_dim = self.config[attr + "_feature_dim"]
    setattr(self, attr + "_feature_dim", feature_dim)

    for feat_encoder in feat_encoders_list_config:
        feat_encoder_config = copy.deepcopy(feat_encoder)
        feat_encoder_config.params.model_data_dir = self.config.model_data_dir
        feat_model = build_image_encoder(feat_encoder_config, direct_features=True)
        feat_encoders.append(feat_model)
        # track the output dimensionality of the last encoder
        setattr(self, attr + "_feature_dim", feat_model.out_dim)

    setattr(self, attr + "_feature_encoders", nn.ModuleList(feat_encoders))
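# Hypothetical usage sketch for the list variant above: each encoder in the
# ModuleList is applied to its corresponding feature tensor. The method name
# and argument shapes are assumptions for illustration:
def _encode_features_sketch(self, attr, feature_list):
    encoders = getattr(self, attr + "_feature_encoders")
    return [encoder(feat) for encoder, feat in zip(encoders, feature_list)]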
def build(self): """ Config's image_encoder attribute will used to build an MMF image encoder. This config in yaml will look like: # "type" parameter specifies the type of encoder we are using here. # In this particular case, we are using resnet152 type: resnet152 # Parameters are passed to underlying encoder class by # build_image_encoder params: # Specifies whether to use a pretrained version pretrained: true # Pooling type, use max to use AdaptiveMaxPool2D pool_type: avg # Number of output features from the encoder, -1 for original # otherwise, supports between 1 to 9 num_output_features: 1 """ self.vision_module = build_image_encoder(self.config.image_encoder) """ For classifer, configuration would look like: # Specifies the type of the classifier, in this case mlp type: mlp # Parameter to the classifier passed through build_classifier_layer params: # Dimension of the tensor coming into the classifier in_dim: 512 # Dimension of the tensor going out of the classifier out_dim: 2 # Number of MLP layers in the classifier num_layers: 0 """ self.classifier = build_classifier_layer(self.config.classifier) # ProjectionEmbeddings takes in params directly as it is module # So, pass in kwargs, which are in_dim, out_dim and module # whose value would be "linear" as we want linear layer self.language_module = ProjectionEmbedding( **self.config.text_encoder.params) # Dropout value will come from config now self.dropout = torch.nn.Dropout(self.config.dropout) # Same as Projection Embedding, fusion's layer params (which are param # for linear layer) will come from config now self.fusion = torch.nn.Linear(**self.config.fusion.params) self.relu = torch.nn.ReLU()
def _init_feature_projectors(self, attr):
    feature_projectors = []
    feat_projectors_list_config = self.config[attr + "_feature_projections"]
    feat_dim = getattr(self, attr + "_feature_dim")

    for feat_projector in feat_projectors_list_config:
        feat_projector_config = copy.deepcopy(feat_projector)
        feat_projector_config.params.in_dim = feat_dim
        feat_model = build_image_encoder(feat_projector_config, direct_features=True)
        feature_projectors.append(feat_model)
        setattr(self, attr + "_feature_dim", feat_model.out_dim)

    setattr(self, attr + "_feature_projectors", nn.ModuleList(feature_projectors))
def _build_obj_encoding(self):
    # object appearance feature: Faster R-CNN
    self.obj_faster_rcnn_fc7 = build_image_encoder(
        self._build_encoder_config(), direct_features=True
    )
    # apply smaller lr to pretrained Faster R-CNN fc7
    self.finetune_modules.append(
        {"module": self.obj_faster_rcnn_fc7, "lr_scale": self.config.lr_scale_frcn}
    )
    self.linear_obj_feat_to_mmt_in = nn.Linear(
        self.config.obj.mmt_in_dim, self.mmt_config.hidden_size
    )

    # object location feature: relative bounding box coordinates (4-dim)
    self.linear_obj_bbox_to_mmt_in = nn.Linear(4, self.mmt_config.hidden_size)

    self.obj_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.obj_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.obj_drop = nn.Dropout(self.config.obj.dropout_prob)
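# Sketch of how finetune_modules entries like the one above are typically
# consumed: building optimizer parameter groups where pretrained modules get a
# scaled learning rate. The function below is an illustrative assumption, not
# part of the model:
def get_optimizer_parameters_sketch(model, base_lr):
    finetune_param_ids = set()
    groups = []
    for entry in model.finetune_modules:
        params = list(entry["module"].parameters())
        finetune_param_ids.update(id(p) for p in params)
        groups.append({"params": params, "lr": base_lr * entry["lr_scale"]})
    # all remaining parameters train at the base learning rate
    remaining = [p for p in model.parameters() if id(p) not in finetune_param_ids]
    groups.append({"params": remaining, "lr": base_lr})
    return groups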
def build(self):
    # to be further set
    self.image_feature_module = build_image_encoder(
        self.config.image_feature_processor, direct_features=True
    )
    if self.config.concate_trace:
        self.trace_feature_module = build_encoder(self.config.trace_feature_encoder)

    if self.config.base_model_name == "bert-base-uncased":
        self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "bert-base-uncased", "bert-base-uncased"
        )
    elif self.config.base_model_name == "2layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 2
        config_decoder.num_hidden_layers = 2
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
    elif self.config.base_model_name == "3layer-base":
        config_encoder = BertConfig()
        config_decoder = BertConfig()
        config_encoder.num_hidden_layers = 3
        config_decoder.num_hidden_layers = 3
        self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
            config_encoder, config_decoder
        )
        self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)

    if self.config.loop_contrastive:
        self.trace_caption_contrastive = TraceCaptionContrastiveModel(
            self.config.tc_contrastive_aggregate_method
        )
    if (
        hasattr(self.config, "pretrans_attention")
        and self.config.pretrans_attention
    ):
        tempconf = self.encoderdecoder.config.encoder
        num_heads = tempconf.num_attention_heads
        num_layers = tempconf.num_hidden_layers
        self.attention_trans = AttentionTransform(num_layers, num_heads, 100)
    self.BOS_ID = 101
def _build_modal_encoder(self, config):
    return build_image_encoder(
        config, direct_features=self._is_direct_features_input
    )
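# Usage sketch for the helper above (the "modal_encoder" config key is an
# assumption; any encoder config with the MMF type/params shape would work):
#     self.modal_encoder = self._build_modal_encoder(self.config.modal_encoder)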