def __init__(self, config):
    """Bi-directional attention between the vision and text streams.

    Each stream is projected into a shared ``bi_hidden_size`` space split
    across ``bi_num_attention_heads`` heads: stream 1 (built from
    ``config.v_config``) and stream 2 (built from ``config.t_config``)
    each get their own query/key/value projections and attention dropout.

    Raises:
        ValueError: if ``bi_hidden_size`` is not divisible by
            ``bi_num_attention_heads``.
    """
    super(BertBiAttention, self).__init__()
    if config.bi_hidden_size % config.bi_num_attention_heads != 0:
        raise ValueError(
            'The hidden size (%d) is not a multiple of the number of attention '
            'heads (%d)' % (config.bi_hidden_size, config.bi_num_attention_heads))
    self.num_attention_heads = config.bi_num_attention_heads
    # Divisibility was checked above, so use integer floor division instead
    # of the float round-trip int(a / b).
    self.attention_head_size = config.bi_hidden_size // config.bi_num_attention_heads
    self.all_head_size = self.num_attention_heads * self.attention_head_size

    # Stream 1 (vision config) projections into the shared bi-attention space.
    v_config = BertConfig.from_dict(config.v_config)
    self.query1 = nn.Linear(v_config.hidden_size, self.all_head_size)
    self.key1 = nn.Linear(v_config.hidden_size, self.all_head_size)
    self.value1 = nn.Linear(v_config.hidden_size, self.all_head_size)
    self.dropout1 = nn.Dropout(v_config.attention_probs_dropout_prob)

    # Stream 2 (text config) projections into the shared bi-attention space.
    t_config = BertConfig.from_dict(config.t_config)
    self.query2 = nn.Linear(t_config.hidden_size, self.all_head_size)
    self.key2 = nn.Linear(t_config.hidden_size, self.all_head_size)
    self.value2 = nn.Linear(t_config.hidden_size, self.all_head_size)
    self.dropout2 = nn.Dropout(t_config.attention_probs_dropout_prob)
def __init__(self, config):
    """Build the two-stream encoder: text layers, vision layers, and the
    co-attention (connection) layers that bridge them.

    Three ModuleLists are created: ``layer`` (text BertLayers),
    ``v_layer`` (vision BertLayers), and ``c_layer`` (one
    BertConnectionLayer per bi-attention site).
    """
    super(BertEncoder, self).__init__()
    text_cfg = BertConfig.from_dict(config.t_config)
    vis_cfg = BertConfig.from_dict(config.v_config)

    self.FAST_MODE = config.fast_mode
    self.with_coattention = config.with_coattention
    self.v_biattention_id = vis_cfg.biattention_id
    self.t_biattention_id = text_cfg.biattention_id
    self.in_batch_pairs = config.in_batch_pairs
    self.fixed_t_layer = config.fixed_t_layer
    self.fixed_v_layer = config.fixed_v_layer

    # Build one prototype of each layer kind, then deep-copy it to the
    # required depth so every copy starts from the same initialization.
    text_proto = BertLayer(text_cfg)
    vis_proto = BertLayer(vis_cfg)
    coattn_proto = BertConnectionLayer(config)

    self.layer = nn.ModuleList(
        [copy.deepcopy(text_proto) for _ in range(text_cfg.num_hidden_layers)])
    self.v_layer = nn.ModuleList(
        [copy.deepcopy(vis_proto) for _ in range(vis_cfg.num_hidden_layers)])
    self.c_layer = nn.ModuleList(
        [copy.deepcopy(coattn_proto) for _ in range(len(vis_cfg.biattention_id))])
def __init__(self, config, bert_model_embedding_weights):
    """Pre-training heads: text masked-LM, pair relationship, and image.

    The masked-LM head is tied to ``bert_model_embedding_weights``.
    """
    super(BertPreTrainingHeads, self).__init__()
    text_cfg = BertConfig.from_dict(config.t_config)
    # Masked-LM head over the text stream, weight-tied to the embeddings.
    self.predictions = BertLMPredictionHead(text_cfg, bert_model_embedding_weights)
    # Binary head over the fused bi-modal representation.
    self.bi_seq_relationship = nn.Linear(config.bi_hidden_size, 2)
    vis_cfg = BertConfig.from_dict(config.v_config)
    # Prediction head over the image stream.
    self.imagePredictions = BertImagePredictionHead(vis_cfg)
    self.fusion_method = config.fusion_method
    self.dropout = nn.Dropout(0.1)
def __init__(self, config):
    """Co-attention block: bi-attention + output, then a per-stream FFN."""
    super(BertConnectionLayer, self).__init__()
    self.biattention = BertBiAttention(config)
    self.biOutput = BertBiOutput(config)

    # Vision-stream feed-forward sublayer.
    vis_cfg = BertConfig.from_dict(config.v_config)
    self.v_intermediate = BertIntermediate(vis_cfg)
    self.v_output = BertOutput(vis_cfg)

    # Text-stream feed-forward sublayer.
    txt_cfg = BertConfig.from_dict(config.t_config)
    self.t_intermediate = BertIntermediate(txt_cfg)
    self.t_output = BertOutput(txt_cfg)
def __init__(self, config):
    """Project bi-attention context back into each stream's hidden size."""
    super(BertBiOutput, self).__init__()
    # Vision side: dense projection + LayerNorm + dropout.
    vis_cfg = BertConfig.from_dict(config.v_config)
    self.dense1 = nn.Linear(config.bi_hidden_size, vis_cfg.hidden_size)
    self.LayerNorm1 = BertLayerNorm(vis_cfg.hidden_size, eps=1e-12)
    self.dropout1 = nn.Dropout(vis_cfg.hidden_dropout_prob)

    # Text side: dense projection + LayerNorm + dropout.
    txt_cfg = BertConfig.from_dict(config.t_config)
    self.dense2 = nn.Linear(config.bi_hidden_size, txt_cfg.hidden_size)
    self.LayerNorm2 = BertLayerNorm(txt_cfg.hidden_size, eps=1e-12)
    self.dropout2 = nn.Dropout(txt_cfg.hidden_dropout_prob)
def __init__(self, config):
    """Two-stream BERT: word + image embeddings, joint encoder, poolers."""
    super(BertModel, self).__init__(config)
    txt_cfg = BertConfig.from_dict(config.t_config)
    vis_cfg = BertConfig.from_dict(config.v_config)

    # Word-embedding table for the text stream.
    self.embeddings = BertEmbeddings(txt_cfg)
    # Region-feature embeddings for the vision stream.
    self.v_embeddings = BertImageEmbeddings(vis_cfg)

    self.encoder = BertEncoder(config)
    self.t_pooler = BertTextPooler(config)
    self.v_pooler = BertImagePooler(config)
    self.init_weights()
def __init__(self, config, extra_config):
    """VisualBERT model with optional graph-node input.

    Builds an optional graph-embedding layer, the VisualBERT backbone
    (fresh or loaded from a pretrained checkpoint), and a classifier head
    sized from the backbone's (possibly doubled) hidden size.
    """
    super().__init__()
    self.config = config
    self.output_attentions = self.config.output_attentions
    self.output_hidden_states = self.config.output_hidden_states
    self.pooler_strategy = self.config.get("pooler_strategy", "default")

    # Graph input params
    self.feed_graph_to_vb = extra_config["feed_graph_to_vb"]
    self.graph_node_hid_dim = extra_config["node_hid_dim"]
    self.graph_feed_mode = extra_config["feed_mode"]
    self.graph_topk = extra_config["topk_ans_feed"]

    # If doing graph, make a graph embedding layer that maps graph-node
    # features into BERT's hidden space.
    if self.feed_graph_to_vb:
        self.graph_embedding = nn.Sequential(
            nn.Linear(self.graph_node_hid_dim, config.hidden_size),
            nn.LayerNorm(config.hidden_size, eps=1e-12),
            nn.Dropout(config.hidden_dropout_prob),  # hidden_dropout_prb
        )

    # If bert_model_name is not specified, you will need to specify
    # all of the required parameters for BERTConfig and a pretrained
    # model won't be loaded
    self.bert_model_name = self.config.get("bert_model_name", None)
    self.bert_config = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True)
    )
    if self.bert_model_name is None or self.bert_model_name == "nopretrain":
        # Randomly initialized backbone.
        self.bert = VisualBERTBase(
            self.bert_config,
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )
    else:
        # Backbone initialized from a pretrained checkpoint.
        self.bert = VisualBERTBase.from_pretrained(
            self.config.bert_model_name,
            config=self.bert_config,
            cache_dir=os.path.join(
                get_mmf_cache_dir(), "distributed_{}".format(-1)
            ),
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )

    self.training_head_type = self.config.training_head_type
    self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
    # Order matters: the hidden-size doubling must happen BEFORE the
    # classifier is built so the head sees the doubled width.
    # NOTE(review): presumably nlvr2 concatenates two representations —
    # confirm against the forward() implementation.
    if self.config.training_head_type == "nlvr2":
        self.bert.config.hidden_size *= 2
    self.classifier = nn.Sequential(BertPredictionHeadTransform(self.bert.config))
    self.init_weights()
def __init__(self, config):
    """VisualBERT pre-training model: backbone plus masked-LM/NSP heads.

    The ``cls`` heads are deep-copied out of a (possibly pretrained)
    ``BertForPreTraining`` so their weights match the loaded checkpoint.
    """
    super().__init__()
    self.config = config
    self.output_attentions = self.config.output_attentions
    self.output_hidden_states = self.config.output_hidden_states

    # If bert_model_name is not specified, you will need to specify
    # all of the required parameters for BERTConfig and a pretrained
    # model won't be loaded
    self.bert_model_name = getattr(self.config, "bert_model_name", None)
    self.bert_config = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True)
    )
    if self.bert_model_name is None:
        # Randomly initialized backbone.
        self.bert = VisualBERTBase(
            self.bert_config,
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )
    else:
        # Backbone initialized from a pretrained checkpoint.
        self.bert = VisualBERTBase.from_pretrained(
            self.config.bert_model_name,
            config=self.bert_config,
            cache_dir=os.path.join(
                get_mmf_cache_dir(), "distributed_{}".format(-1)
            ),
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )

    self.vocab_size = self.bert.config.vocab_size

    # TODO: Once omegaconf fixes int keys issue, bring this back
    # See https://github.com/omry/omegaconf/issues/149
    # with omegaconf.open_dict(self.config):
    #     # Add bert config such as hidden_state to our main config
    #     self.config.update(self.bert.config.to_dict())

    # Build a throwaway BertForPreTraining just to harvest its heads.
    if self.bert_model_name is None:
        bert_masked_lm = BertForPreTraining(self.bert.config)
    else:
        bert_masked_lm = BertForPreTraining.from_pretrained(
            self.config.bert_model_name,
            config=self.bert.config,
            cache_dir=os.path.join(
                get_mmf_cache_dir(), "distributed_{}".format(-1)
            ),
        )
    # deepcopy so the heads do not share storage with the discarded model.
    self.cls = deepcopy(bert_masked_lm.cls)
    # ignore_index=-1 skips unmasked positions in the LM loss.
    self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    self.init_weights()
def __init__(self, config):
    """Build the scan encoder + pooler from a config dict.

    ``config`` is a plain dict and is converted to a BertConfig first.
    Note the embeddings layer is intentionally not created here.
    """
    bert_cfg = BertConfig.from_dict(config)
    super().__init__(bert_cfg)
    #self.embeddings = BertEmbeddings(config)
    self.encoder = BertScanEncoder(bert_cfg)
    self.pooler = BertPooler(bert_cfg)
    self.apply(self.init_bert_weights)
def __init__(self, config):
    """Two-stream model with a selectable text backbone.

    ``config.model`` chooses the word-embedding flavour ('bert' or
    'roberta'); vision embeddings, joint encoder, and poolers follow.

    Raises:
        ValueError: if ``config.model`` is neither 'bert' nor 'roberta'.
    """
    super(BertModel, self).__init__(config)
    self.task_specific_tokens = config.task_specific_tokens

    t_config = BertConfig.from_dict(config.t_config)
    v_config = BertConfig.from_dict(config.v_config)

    # Initialize word embeddings for the chosen text backbone.
    if config.model == 'bert':
        self.embeddings = BertEmbeddings(t_config)
    elif config.model == 'roberta':
        self.embeddings = RobertaEmbeddings(t_config)
    else:
        # Fail fast: previously an unrecognized model name silently left
        # self.embeddings unset and crashed later with an AttributeError.
        raise ValueError(
            "Unsupported config.model %r; expected 'bert' or 'roberta'"
            % (config.model,))

    # Initialize the vision embedding.
    self.v_embeddings = BertImageEmbeddings(v_config)
    self.encoder = BertEncoder(config)
    self.t_pooler = BertTextPooler(config)
    self.v_pooler = BertImagePooler(config)
    self.init_weights()
def __init__(self, config, bert_model_embedding_weights):
    """Pre-training heads with extra causal predictors on both streams.

    Text heads are tied to ``bert_model_embedding_weights``; image heads
    are built from the vision config. Construction order of the image
    heads is significant (see inline note).
    """
    super(BertPreTrainingHeads, self).__init__()
    t_config = BertConfig.from_dict(config.t_config)
    # Causal prediction heads on the text side (768-dim inputs).
    self.causal_predictor_t2v = BertLMPredictionHead(
        t_config, bert_model_embedding_weights, 768)
    self.causal_predictor_t = BertLMPredictionHead(
        t_config, bert_model_embedding_weights, 768)
    # Standard masked-LM head.
    self.predictions = BertLMPredictionHead(t_config,
                                            bert_model_embedding_weights, 768)
    # Binary head over the fused bi-modal representation.
    self.bi_seq_relationship = nn.Linear(config.bi_hidden_size, 2)
    v_config = BertConfig.from_dict(config.v_config)
    self.causal_predictor_v2t = BertImagePredictionHead(v_config, 1024)
    self.causal_predictor_v = BertImagePredictionHead(
        v_config, 2048)  # causal loss; must come first — it modifies config.v_hidden_size
    self.imagePredictions = BertImagePredictionHead(
        v_config, 1024)  # analogous to the earlier mask_loss_v
    self.fusion_method = config.fusion_method
    self.dropout = nn.Dropout(0.1)
    # KL-divergence for image targets; CE (ignore_index=-1 skips
    # unmasked positions) for text targets.
    self.criterion_v = nn.KLDivLoss(reduction='none')
    self.criterion_t = CrossEntropyLoss(ignore_index=-1)
def __init__(self, config):
    """VisualBERT classification model: backbone, dropout, and a
    transform + linear classifier head, plus a VQA index-select pooler.
    """
    super().__init__()
    self.config = config
    self.output_attentions = self.config.output_attentions
    self.output_hidden_states = self.config.output_hidden_states
    self.pooler_strategy = self.config.get("pooler_strategy", "default")

    # If bert_model_name is not specified, you will need to specify
    # all of the required parameters for BERTConfig and a pretrained
    # model won't be loaded
    self.bert_model_name = getattr(self.config, "bert_model_name", None)
    self.bert_config = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True)
    )
    if self.bert_model_name is None:
        # Randomly initialized backbone.
        self.bert = VisualBERTBase(
            self.bert_config,
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )
    else:
        # Backbone initialized from a pretrained checkpoint.
        self.bert = VisualBERTBase.from_pretrained(
            self.config.bert_model_name,
            config=self.bert_config,
            cache_dir=os.path.join(
                get_mmf_cache_dir(), "distributed_{}".format(-1)
            ),
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )

    self.training_head_type = self.config.training_head_type
    self.num_labels = self.config.num_labels
    self.dropout = Dropout(self.bert.config.hidden_dropout_prob)
    # Order matters: double the hidden size BEFORE building the classifier
    # so the head sees the doubled width.
    # NOTE(review): presumably nlvr2 concatenates two representations —
    # confirm against the forward() implementation.
    if self.config.training_head_type == "nlvr2":
        self.bert.config.hidden_size *= 2
    self.classifier = Sequential(
        BertPredictionHeadTransform(self.bert.config),
        Linear(self.bert.config.hidden_size, self.config.num_labels),
    )
    self.vqa_pooler = IndexSelect()
    self.init_weights()
def __init__(self, **kwargs):
    """VisualBERT classification model configured from plain kwargs.

    Same structure as the OmegaConf variant, but ``self.config`` is a
    plain dict and the pretrained cache lives under ~/.cache/torch.
    """
    super().__init__()
    self.config = kwargs
    self.output_attentions = self.config['output_attentions']
    self.output_hidden_states = self.config['output_hidden_states']
    self.pooler_strategy = self.config.get('pooler_strategy', 'default')

    # If bert_model_name is not specified, you will need to specify
    # all of the required parameters for BERTConfig and a pretrained
    # model won't be loaded
    self.bert_model_name = self.config['bert_model_name']
    self.bert_config = BertConfig.from_dict(self.config)
    if self.bert_model_name is None:
        # Randomly initialized backbone.
        self.bert = VisualBERTBase(
            self.bert_config,
            visual_embedding_dim=self.config['visual_embedding_dim'],
            embedding_strategy=self.config['embedding_strategy'],
            bypass_transformer=self.config['bypass_transformer'],
            output_attentions=self.config['output_attentions'],
            output_hidden_states=self.config['output_hidden_states'],
        )
    else:
        # Backbone initialized from a pretrained checkpoint cached locally.
        from imix.utils.config import ToExpanduser
        cache_dir = os.path.join('~/.cache/torch', 'transformers')
        cache_dir = ToExpanduser.modify_path(cache_dir)
        self.bert = VisualBERTBase.from_pretrained(
            self.config['bert_model_name'],
            config=self.bert_config,
            cache_dir=cache_dir,
            visual_embedding_dim=self.config['visual_embedding_dim'],
            embedding_strategy=self.config['embedding_strategy'],
            bypass_transformer=self.config['bypass_transformer'],
            output_attentions=self.config['output_attentions'],
            output_hidden_states=self.config['output_hidden_states'],
        )

    self.training_head_type = self.config['training_head_type']
    self.num_labels = self.config['num_labels']
    self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
    # Order matters: double the hidden size BEFORE building the classifier
    # so the head sees the doubled width.
    if self.config['training_head_type'] == 'nlvr2':
        self.bert.config.hidden_size *= 2
    self.classifier = nn.Sequential(
        BertPredictionHeadTransform(self.bert.config),
        nn.Linear(self.bert.config.hidden_size, self.config['num_labels']),
    )
    self.init_weights()
def __init__(self, config, mode="lxr"):
    """LXMERT answering model: pretrained backbone + visual-answer head."""
    super().__init__()
    self.config = config
    self.num_labels = config.num_labels
    self.gqa_labels = config.gqa_labels
    self.mode = config.mode

    # Load the LXMERT backbone from the named pretrained checkpoint.
    bert_cfg = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True))
    cache_dir = os.path.join(get_mmf_cache_dir(), "distributed_{}".format(-1))
    self.bert = LXMERTBase.from_pretrained(
        self.config.bert_model_name,
        config=bert_cfg,
        cache_dir=cache_dir,
    )

    # Answer head covering both label spaces.
    self.classifier = BertVisualAnswerHead(
        config, [self.num_labels, self.gqa_labels])
    self.init_weights()
def __init__(self, args, mode='x'): super().__init__() # Build LXRT Model self.config = args self.max_seq_length = self.config['max_seq_length'] # set_visual_config(args) # Using the bert tokenizer self.tokenizer = BertTokenizer.from_pretrained( 'bert-base-uncased', do_lower_case=True, ) # Build LXRT Model self.model = LXRTFeatureExtraction.from_pretrained( self.config['bert_model_name'], config=BertConfig.from_dict(self.config), cache_dir=ToExpanduser.modify_path( os.path.join('~/.cache/torch', 'transformers')), ) '''
def __init__(self, config):
    """LXMERT pre-training model: backbone, task heads, and loss functions."""
    super().__init__()
    self.config = config

    # LXMERT backbone, loaded from the named pretrained checkpoint.
    bert_cfg = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True))
    cache_dir = os.path.join(get_mmf_cache_dir(), "distributed_{}".format(-1))
    self.bert = LXMERTBase.from_pretrained(
        self.config.bert_model_name,
        config=bert_cfg,
        cache_dir=cache_dir,
    )

    # Task configuration flags and label sizes.
    self.num_labels = config.num_labels
    self.gqa_labels = config.gqa_labels
    self.task_mask_lm = config.task_mask_lm
    self.task_obj_predict = config.task_obj_predict
    self.task_matched = config.task_matched
    self.task_qa = config.task_qa
    self.visual_losses = config.visual_losses
    self.visual_loss_config = config.visual_loss_config

    # Pre-training heads, tied to the backbone's word embeddings.
    self.cls = BertPreTrainingHeads(
        config, self.bert.embeddings.word_embeddings.weight)
    if self.task_obj_predict:
        self.obj_predict_head = BertVisualObjHead(config)
    if self.task_qa:
        self.answer_head = BertVisualAnswerHead(
            config, [self.num_labels, self.gqa_labels])

    # Per-objective loss functions (ignore_index=-1 skips unlabeled items).
    self.loss_fcts = {
        "l2": SmoothL1Loss(reduction="none"),
        "ce": CrossEntropyLoss(ignore_index=-1, reduction="none"),
        "ce_lang": CrossEntropyLoss(ignore_index=-1),
    }
def __init__(self, config):
    """LXMERT pre-training model (no cache dir): backbone, heads, losses."""
    super().__init__()
    self.config = config

    # LXMERT backbone, loaded from the named pretrained checkpoint.
    bert_cfg = BertConfig.from_dict(self.config)
    self.bert = LXMERTBase.from_pretrained(
        self.config.bert_model_name,
        config=bert_cfg,
    )

    # Task configuration flags and label sizes.
    self.num_labels = config.num_labels
    self.gqa_labels = config.gqa_labels
    self.task_mask_lm = config.task_mask_lm
    self.task_obj_predict = config.task_obj_predict
    self.task_matched = config.task_matched
    self.task_qa = config.task_qa
    self.visual_losses = config.visual_losses
    self.visual_loss_config = config.visual_loss_config

    # Pre-training heads, tied to the backbone's word embeddings.
    self.cls = BertPreTrainingHeads(
        config, self.bert.embeddings.word_embeddings.weight)
    if self.task_obj_predict:
        self.obj_predict_head = BertVisualObjHead(config)
    if self.task_qa:
        self.answer_head = BertVisualAnswerHead(
            config, [self.num_labels, self.gqa_labels])

    # Per-objective loss functions (ignore_index=-1 skips unlabeled items).
    self.loss_fcts = {
        'l2': SmoothL1Loss(reduction='none'),
        'ce': CrossEntropyLoss(ignore_index=-1, reduction='none'),
        'ce_lang': CrossEntropyLoss(ignore_index=-1),
    }
def __init__(self, **kwargs):
    """Multi-task DeVLBert trainer setup.

    Parses the dash-separated task list from the config, computes
    per-task start iterations, loads the pretrained model, optionally
    freezes lower BERT layers, and initializes EMA / bookkeeping state.
    """
    super().__init__()
    self.config = config = kwargs['params']
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.root_path = os.path.dirname(__file__)

    # Collect the 'TASK<n>' keys named in config.tasks (e.g. '1-2-3').
    # task_lr = []
    task_ids = []
    for i, task_id in enumerate(config.tasks.split('-')):
        task = 'TASK' + task_id
        cfg = config.TASKS[task]
        name = cfg.name
        task_ids.append(task)
        # task_lr.append(cfg.lr)
    # base_lr = min(task_lr)
    # loss_scale = {}
    # for i, task_id in enumerate(config.tasks.split('-')):
    #     task = 'TASK' + task_id
    #     loss_scale[task] = task_lr[i] / base_lr

    # Longest task (in optimizer steps) determines the training horizon.
    train_steps = max(
        [config.TASKS[k]['num_training_steps'] for k in task_ids]
    ) // config.gradient_accumulation_steps
    num_labels = max([config.TASKS[k]['num_labels'] for k in task_ids])

    # Shorter tasks start late so all tasks finish together.
    self.task_start_iter = {}
    if len(task_ids) == 1:
        self.task_start_iter[task_ids[0]] = 0
    else:
        for task_id in task_ids:
            self.task_start_iter[task_id] = train_steps - (
                config.TASKS[task_id]['num_epoch'] *
                config.TASKS[task_id]['iters_in_epoch'] //
                config.gradient_accumulation_steps)
    # task_ave_iter_list = sorted(task_ave_iter.values())
    # median_num_iter = task_ave_iter_list[-1]
    # num_train_optimization_steps = (
    #     median_num_iter * \
    #     config.total_epochs // config.gradient_accumulation_steps
    # )

    bertconfig = BertConfig.from_dict(config)
    self.model = DeVLBertForVLTasks.from_pretrained(
        config.from_pretrained,
        config=bertconfig,
        num_labels=num_labels,
    )

    # Optionally freeze embeddings and encoder layers up to config.freeze.
    if config.freeze != -1:
        bert_weight_name = json.load(
            open(
                self.root_path + '/config/' + config.bert_model +
                '_weight_name.json', 'r'))
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if 'embeddings' in name:
                bert_weight_name_filtered.append(name)
            elif 'encoder' in name:
                # Name layout: encoder.layer.<n>....
                layer_num = name.split('.')[2]
                if int(layer_num) <= config.freeze:
                    bert_weight_name_filtered.append(name)
        # NOTE(review): key[12:] strips a fixed-length module prefix —
        # verify it matches the model's parameter naming.
        for key, value in dict(self.model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False
        logger.info('filtered weight')
        logger.info(bert_weight_name_filtered)

    # Training-loop bookkeeping and EMA state.
    self.lr_reduce_list = [5, 7]
    self.global_step = 0
    self.task_iter_train = {name: None for name in task_ids}
    self.task_count = {name: 0 for name in task_ids}
    self.task_ids = task_ids
    self.is_ema_state = False
    self.bkp_state_dict = None
    self.use_ema = config.TASKS[task_ids[0]]['use_ema']
    self.ema_decay_ratio = config.TASKS[task_ids[0]]['ema_decay_ratio']
    self.ema_state_dict = {}
def __init__(self, **kwargs):
    """Multi-task VILBert trainer setup.

    Parses the task list, builds per-task stop controllers, adjusts the
    vision target size for the chosen visual objective, loads the
    pretrained model, and optionally freezes lower BERT layers.
    """
    super().__init__()
    self.config = config = kwargs['params']
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.root_path = os.path.dirname(__file__)

    # Collect the 'TASK<n>' keys named in config.tasks (e.g. '1-2-3').
    # task_lr = []
    task_ids = []
    for i, task_id in enumerate(config.tasks.split('-')):
        task = 'TASK' + task_id
        cfg = config.TASKS[task]
        name = cfg.name
        task_ids.append(task)
        # task_lr.append(cfg.lr)
    # base_lr = min(task_lr)
    # loss_scale = {}
    # for i, task_id in enumerate(config.tasks.split('-')):
    #     task = 'TASK' + task_id
    #     loss_scale[task] = task_lr[i] / base_lr

    # One plateau-based stop controller per task.
    # task_ave_iter = {}
    self.task_stop_controller = {}
    for task_id in task_ids:
        # task_ave_iter[task_id] = int(config.TASKS[task]['num_epoch'] * num_iter *
        #                              config.train_iter_multiplier /
        #                              config.TASKS[task]['num_epoch'])
        #                              config.total_epochs)
        self.task_stop_controller[task_id] = MultiTaskStopOnPlateau(
            mode='max',
            patience=1,
            continue_threshold=0.005,
            cooldown=1,
            threshold=0.001,
        )
    # task_ave_iter_list = sorted(task_ave_iter.values())
    # median_num_iter = task_ave_iter_list[-1]
    # num_train_optimization_steps = (
    #     median_num_iter * \
    #     config.total_epochs // config.gradient_accumulation_steps
    # )

    num_labels = max([config.TASKS[k]['num_labels'] for k in task_ids])
    bertconfig = BertConfig.from_dict(config)
    # visual_target 0 predicts a 1601-way distribution; otherwise the
    # raw 2048-dim feature is the regression target.
    if bertconfig.visual_target == 0:
        bertconfig.v_config.target_size = 1601
    else:
        bertconfig.v_config.target_size = 2048
    if 'roberta' in config.bert_model:
        bertconfig.model = 'roberta'

    self.model = VILBertForVLTasks.from_pretrained(
        config.from_pretrained,
        config=bertconfig,
        num_labels=num_labels,
    )

    # Optionally freeze embeddings and encoder layers up to config.freeze.
    if config.freeze != -1:
        bert_weight_name = json.load(
            open(
                self.root_path + '/config/' + config.bert_model +
                '_weight_name.json', 'r'))
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if 'embeddings' in name:
                bert_weight_name_filtered.append(name)
            elif 'encoder' in name:
                # Name layout: encoder.layer.<n>....
                layer_num = name.split('.')[2]
                if int(layer_num) <= config.freeze:
                    bert_weight_name_filtered.append(name)
        # NOTE(review): key[12:] strips a fixed-length module prefix —
        # verify it matches the model's parameter naming.
        for key, value in dict(self.model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False
        logger.info('filtered weight')
        logger.info(bert_weight_name_filtered)

    # Training-loop bookkeeping.
    self.lr_reduce_list = [5, 7]
    self.global_step = 0
    self.task_iter_train = {name: None for name in task_ids}
    self.task_count = {name: 0 for name in task_ids}
    self.task_ids = task_ids
if __name__ == '__main__':
    # Load the parameters from json file
    args = parser.parse_args()
    # json_path = os.path.join(args.model_dir, 'params.json')
    # assert os.path.isfile(
    #     json_path), "No json configuration file found at {}".format(json_path)
    # params = utils.Params("/Volumes/Coding/HM_caompettion/Our_Own_Code/params.json")
    # NOTE(review): hard-coded absolute path — should likely be derived
    # from args.model_dir like the commented-out json_path above.
    with open("/Volumes/Coding/HM_caompettion/Our_Own_Code/params.json",
              "r") as read_file:
        data = json.load(read_file)
    # json1_data = json.loads("/Volumes/Coding/HM_caompettion/Our_Own_Code/params.json")
    print(type(data))  # debug: confirm the JSON parsed into a dict
    # Wrap the raw dict so parameters are attribute-accessible.
    params = BertConfig.from_dict(data)
    # print(params)
    # exit(0)

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(230)
    if params.cuda:
        torch.cuda.manual_seed(230)

    # Set the logger
    utils.set_logger(os.path.join(args.model_dir, 'train.log'))

    # Create the input data pipeline
def __init__(self, config):
    """VisualBERT model with a multi-part attention classifier.

    Builds the backbone, a ModuleList classifier (transform + two linear
    heads of different input widths), two multi-head attention modules,
    a fusion FC, and an attention pooler.
    """
    super().__init__()
    self.config = config
    self.output_attentions = self.config.output_attentions
    self.output_hidden_states = self.config.output_hidden_states
    self.pooler_strategy = self.config.get("pooler_strategy", "default")

    # If bert_model_name is not specified, you will need to specify
    # all of the required parameters for BERTConfig and a pretrained
    # model won't be loaded
    self.bert_model_name = getattr(self.config, "bert_model_name", None)
    self.bert_config = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True))
    if self.bert_model_name is None:
        # Randomly initialized backbone.
        self.bert = VisualBERTBase(
            self.bert_config,
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )
    else:
        # Backbone initialized from a pretrained checkpoint.
        self.bert = VisualBERTBase.from_pretrained(
            self.config.bert_model_name,
            config=self.bert_config,
            cache_dir=os.path.join(get_mmf_cache_dir(),
                                   "distributed_{}".format(-1)),
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )

    self.training_head_type = self.config.training_head_type
    self.num_labels = self.config.num_labels
    self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
    # Order matters: double the hidden size BEFORE building the classifier
    # so the heads see the doubled width.
    if self.config.training_head_type == "nlvr2":
        self.bert.config.hidden_size *= 2
    # # original classifier layer
    # self.classifier = nn.Sequential(
    #     BertPredictionHeadTransform(self.bert.config),
    #     nn.Linear(self.bert.config.hidden_size, self.config.num_labels),
    # )
    # ModuleList (not Sequential): the pieces are applied individually in
    # forward() with different input widths (3x and 2x hidden size).
    self.classifier = nn.ModuleList([
        BertPredictionHeadTransform(self.bert.config),
        nn.Linear(3 * self.bert.config.hidden_size, self.config.num_labels),
        nn.Linear(2 * self.bert.config.hidden_size, 2),
    ])

    # add the attention
    self.attn1 = nn.MultiheadAttention(
        self.bert.config.hidden_size,
        self.bert.config.num_attention_heads,
        self.bert.config.attention_probs_dropout_prob)
    self.attn2 = nn.MultiheadAttention(
        self.bert.config.hidden_size,
        self.bert.config.num_attention_heads,
        self.bert.config.attention_probs_dropout_prob)
    # Fuse two concatenated representations back to hidden size.
    self.fc = nn.Sequential(
        nn.Linear(2 * self.bert.config.hidden_size,
                  self.bert.config.hidden_size),
        nn.ReLU(),
        nn.Dropout(self.bert.config.hidden_dropout_prob))
    self.attn_pool = AttentionPool(
        self.bert.config.hidden_size,
        self.bert.config.attention_probs_dropout_prob)
    self.init_weights()