def init_data(self, use_cuda):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)
    self.cfg.output_attentions = True

    torch_attention = BertAttention(self.cfg)
    torch_attention.eval()
    if use_cuda:
        torch_attention.to(test_device)

    # Get FT Attention
    turbo_attention = turbo_transformers.BertAttention.from_torch(
        torch_attention)
    turbo_decoder_attention = turbo_transformers.MultiHeadedAttention.from_torch(
        torch_attention, is_trans_weight=False)

    hidden_size = self.cfg.hidden_size
    input_tensor = torch.rand(size=(batch_size, seq_length, hidden_size),
                              dtype=torch.float32,
                              device=test_device)
    attention_mask = torch.ones((batch_size, seq_length),
                                dtype=torch.float32,
                                device=test_device)
    attention_mask = attention_mask[:, None, None, :]
    attention_mask = (1.0 - attention_mask) * -10000.0

    return torch_attention, turbo_attention, turbo_decoder_attention, \
        input_tensor, attention_mask
def _build_txt_encoding(self):
    TEXT_BERT_HIDDEN_SIZE = 768

    self.text_bert_config = BertConfig(**self.config.text_bert)
    if self.config.text_bert_init_from_bert_base:
        self.text_bert = TextBert.from_pretrained(
            "bert-base-uncased", config=self.text_bert_config)
        # Use a smaller learning rate on text bert when initializing
        # from BERT_BASE
        self.finetune_modules.append({
            "module": self.text_bert,
            "lr_scale": self.config.lr_scale_text_bert,
        })
    else:
        logger.info("NOT initializing text_bert from BERT_BASE")
        self.text_bert = TextBert(self.text_bert_config)

    # if the text bert output dimension doesn't match the
    # multimodal transformer (mmt) hidden dimension,
    # add a linear projection layer between the two
    if self.mmt_config.hidden_size != TEXT_BERT_HIDDEN_SIZE:
        logger.info(
            f"Projecting text_bert output to {self.mmt_config.hidden_size} dim"
        )
        self.text_bert_out_linear = nn.Linear(TEXT_BERT_HIDDEN_SIZE,
                                              self.mmt_config.hidden_size)
    else:
        self.text_bert_out_linear = nn.Identity()
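# Hedged, standalone sketch of the dimension-matching pattern above; 768 and 512
# are illustrative stand-ins for TEXT_BERT_HIDDEN_SIZE and the mmt hidden size,
# not values taken from any particular config.
import torch
import torch.nn as nn

text_bert_dim, mmt_dim = 768, 512
text_bert_out_linear = (nn.Linear(text_bert_dim, mmt_dim)
                        if mmt_dim != text_bert_dim else nn.Identity())
txt_emb = torch.randn(2, 20, text_bert_dim)          # (batch, tokens, text bert dim)
print(text_bert_out_linear(txt_emb).shape)           # torch.Size([2, 20, 512])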
def init_data(self, use_cuda) -> None:
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig()

    self.torch_encoder_layer = BertEncoder(self.cfg)
    self.torch_encoder_layer.eval()
    if use_cuda:
        self.torch_encoder_layer.to(test_device)

    self.batch_size = 1
    self.seq_length = 40
    self.hidden_size = self.cfg.hidden_size
    self.input_tensor = torch.rand(size=(self.batch_size, self.seq_length,
                                         self.hidden_size),
                                   dtype=torch.float32,
                                   device=test_device)

    self.attention_mask = torch.ones((self.batch_size, self.seq_length),
                                     dtype=torch.float32,
                                     device=test_device)
    self.attention_mask = self.attention_mask[:, None, None, :]
    self.attention_mask = (1.0 - self.attention_mask) * -10000.0

    self.turbo_bert_encoder = turbo_transformers.BertEncoder.from_torch(
        self.torch_encoder_layer)
def init_data(self, use_cuda) -> None:
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig()
    self.intermediate_size = self.cfg.intermediate_size  # 3072
    self.hidden_size = self.cfg.hidden_size  # 768
    self.torch_bertout = BertOutput(self.cfg)
    self.torch_bertout.eval()
    if use_cuda:
        self.torch_bertout.to(test_device)

    self.turbo_bertout = turbo_transformers.BertOutput.from_torch(
        self.torch_bertout)

    self.intermediate_output = torch.rand(
        size=(batch_size, seq_length, self.intermediate_size),
        dtype=torch.float32,
        device=test_device)
    self.attention_output = torch.rand(size=(batch_size, seq_length,
                                             self.hidden_size),
                                       dtype=torch.float32,
                                       device=test_device)
def init_data(self, use_cuda: bool):
    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    torch.set_grad_enabled(False)
    cfg = BertConfig()

    self.torch_embedding = BertEmbeddings(cfg)
    self.torch_embedding.eval()
    if use_cuda:
        self.torch_embedding.to(test_device)

    self.turbo_embedding = turbo_transformers.BertEmbeddings.from_torch(
        self.torch_embedding)

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_length),
                              dtype=torch.long,
                              device=test_device)
    position_ids = torch.arange(seq_length,
                                dtype=torch.long,
                                device=input_ids.device)
    position_ids = position_ids.repeat(batch_size, 1)
    token_type_ids = torch.zeros_like(input_ids, dtype=torch.long)
    return input_ids, position_ids, token_type_ids
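# Hedged, self-contained sketch (not part of the original test) of the PyTorch
# reference path that the ids returned above feed into. batch_size and seq_length
# here are illustrative; the turbo_transformers embedding is checked against this
# output in the library's own tests and its call is not reproduced here.
import torch
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertEmbeddings

cfg = BertConfig()
emb = BertEmbeddings(cfg).eval()
batch_size, seq_length = 2, 8
input_ids = torch.randint(0, cfg.vocab_size - 1, (batch_size, seq_length))
position_ids = torch.arange(seq_length).repeat(batch_size, 1)
token_type_ids = torch.zeros_like(input_ids)
with torch.no_grad():
    out = emb(input_ids=input_ids,
              position_ids=position_ids,
              token_type_ids=token_type_ids)
print(out.shape)  # torch.Size([2, 8, 768])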
def __init__(self, model_name: str) -> None:
    super().__init__()
    config = BertConfig.from_pretrained(model_name)
    self.input_dim = config.hidden_size
    self.output_dim = config.vocab_size

    # TODO(mattg): It's possible that we could use some kind of cache like we have in
    # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel. That way, we
    # would only load the BERT weights once. Though, it's not clear how to do that here, as we
    # need to load `BertForMaskedLM`, not just `BertModel`...
    bert_model = BertForMaskedLM.from_pretrained(model_name)
    self.bert_lm_head = bert_model.cls
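# Hedged sketch (an illustration, not part of the original module): the `cls`
# head stored above maps hidden states of size hidden_size (input_dim) to
# vocabulary logits (output_dim). Random hidden states are used purely to show
# the shapes; in practice they would come from a BertModel forward pass.
import torch
from transformers import BertForMaskedLM

lm = BertForMaskedLM.from_pretrained("bert-base-uncased")
head = lm.cls  # the same object the module keeps as self.bert_lm_head
hidden_states = torch.randn(1, 5, lm.config.hidden_size)
logits = head(hidden_states)
print(logits.shape)  # torch.Size([1, 5, 30522]) for bert-base-uncased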
def _build_word_embedding(self):
    self.bert_config = BertConfig.from_pretrained(self.config.bert_model_name)
    if self.config.pretrained_bert:
        bert_model = BertForPreTraining.from_pretrained(self.config.bert_model_name)
        self.word_embedding = bert_model.bert.embeddings
        self.pooler = bert_model.bert.pooler
        self.pooler.apply(self.init_weights)
    else:
        self.pooler = BertPooler(self.bert_config)
        self.word_embedding = BertEmbeddings(self.bert_config)
def __init__(self, config):
    super().__init__()
    self.config = config
    self.output_attentions = self.config.output_attentions
    self.output_hidden_states = self.config.output_hidden_states

    # If bert_model_name is not specified, you will need to specify
    # all of the required parameters for BERTConfig and a pretrained
    # model won't be loaded
    self.bert_model_name = getattr(self.config, "bert_model_name", None)
    self.bert_config = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True))
    if self.bert_model_name is None:
        self.bert = VisualBERTBase(
            self.bert_config,
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )
    else:
        self.bert = VisualBERTBase.from_pretrained(
            self.config.bert_model_name,
            config=self.bert_config,
            cache_dir=os.path.join(get_mmf_cache_dir(),
                                   "distributed_{}".format(-1)),
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )

    self.vocab_size = self.bert.config.vocab_size

    # TODO: Once omegaconf fixes int keys issue, bring this back
    # See https://github.com/omry/omegaconf/issues/149
    # with omegaconf.open_dict(self.config):
    #     # Add bert config such as hidden_state to our main config
    #     self.config.update(self.bert.config.to_dict())
    if self.bert_model_name is None:
        bert_masked_lm = BertForPreTraining(self.bert.config)
    else:
        bert_masked_lm = BertForPreTraining.from_pretrained(
            self.config.bert_model_name,
            config=self.bert.config,
            cache_dir=os.path.join(get_mmf_cache_dir(),
                                   "distributed_{}".format(-1)),
        )
    self.cls = deepcopy(bert_masked_lm.cls)
    self.loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    self.init_weights()
def __init__(self, embedding_matrix, opt):
    super(LCA_GLOVE, self).__init__()
    # Only a few of the parameters are needed in config.json,
    # e.g. hidden_size and num_attention_heads
    self.config = BertConfig.from_json_file("modules/utils/bert_config.json")
    self.opt = opt
    self.embed = nn.Embedding.from_pretrained(
        torch.tensor(embedding_matrix, dtype=torch.float))
    self.lc_embed = nn.Embedding(2, opt.embed_dim)
    self.global_encoder1 = SelfAttention(self.config, opt)
    self.local_encoder1 = SelfAttention(self.config, opt)
    self.local_encoder2 = SelfAttention(self.config, opt)
    self.mha = SelfAttention(self.config, opt)
    self.pool = BertPooler(self.config)
    self.dropout = nn.Dropout(opt.dropout)
    self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
    self.classifier = nn.Linear(opt.embed_dim, 2)
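# Hedged illustration of the BertConfig.from_json_file pattern used above: only the
# fields the attention/pooler layers read need to appear in the JSON file, and the
# concrete values below are assumptions, not the contents of modules/utils/bert_config.json.
import json
import tempfile
from transformers import BertConfig

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"hidden_size": 300, "num_attention_heads": 6}, f)
cfg = BertConfig.from_json_file(f.name)  # unspecified fields fall back to BERT defaults
print(cfg.hidden_size, cfg.num_attention_heads)  # 300 6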
def init_data(self, use_cuda) -> None:
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)

    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    self.cfg = BertConfig()
    self.torch_model = BertModel(self.cfg)
    self.torch_model.eval()

    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)

    self.turbo_model = turbo_transformers.BertModel.from_torch(
        self.torch_model, self.test_device, "turbo")
def init_data(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig()
    self.torch_pooler = BertPooler(self.cfg)
    if torch.cuda.is_available():
        self.torch_pooler.to(self.test_device)
    self.torch_pooler.eval()
    self.turbo_pooler = turbo_transformers.BertPooler.from_torch(
        self.torch_pooler)
def __init__(self, config):
    super().__init__()
    self.config = config
    self.output_attentions = self.config.output_attentions
    self.output_hidden_states = self.config.output_hidden_states
    self.pooler_strategy = self.config.get("pooler_strategy", "default")

    # If bert_model_name is not specified, you will need to specify
    # all of the required parameters for BERTConfig and a pretrained
    # model won't be loaded
    self.bert_model_name = getattr(self.config, "bert_model_name", None)
    self.bert_config = BertConfig.from_dict(
        OmegaConf.to_container(self.config, resolve=True))
    if self.bert_model_name is None:
        self.bert = VisualBERTBase(
            self.bert_config,
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )
    else:
        self.bert = VisualBERTBase.from_pretrained(
            self.config.bert_model_name,
            config=self.bert_config,
            cache_dir=os.path.join(get_mmf_cache_dir(),
                                   "distributed_{}".format(-1)),
            visual_embedding_dim=self.config.visual_embedding_dim,
            embedding_strategy=self.config.embedding_strategy,
            bypass_transformer=self.config.bypass_transformer,
            output_attentions=self.config.output_attentions,
            output_hidden_states=self.config.output_hidden_states,
        )

    self.training_head_type = self.config.training_head_type
    self.num_labels = self.config.num_labels
    self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob)
    if self.config.training_head_type == "nlvr2":
        self.bert.config.hidden_size *= 2

    self.classifier = nn.Sequential(
        BertPredictionHeadTransform(self.bert.config),
        nn.Linear(self.bert.config.hidden_size, self.config.num_labels),
    )

    self.init_weights()
def setup_method(self):
    self.monkeypatch = MonkeyPatch()

    # monkeypatch the PretrainedBertModel to return the tiny test fixture model
    config_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "config.json"
    vocab_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "vocab.txt"
    config = BertConfig.from_json_file(config_path)
    self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
    self.monkeypatch.setattr(
        BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path)
    )

    super().setup_method()
    self.set_up_model(
        FIXTURES_ROOT / "structured_prediction" / "srl" / "bert_srl.jsonnet",
        FIXTURES_ROOT / "structured_prediction" / "srl" / "conll_2012",
    )
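# Hedged, standalone sketch of the same monkeypatch trick: replace from_pretrained
# with a lambda that returns a tiny locally built model, so tests never download
# weights. The tiny config values here are illustrative, not the fixture's.
from pytest import MonkeyPatch
from transformers import BertConfig, BertModel

mp = MonkeyPatch()
tiny_config = BertConfig(hidden_size=32, num_hidden_layers=1,
                         num_attention_heads=2, intermediate_size=64)
mp.setattr(BertModel, "from_pretrained", lambda _: BertModel(tiny_config))
model = BertModel.from_pretrained("any-name-is-ignored")  # returns the tiny local model
print(model.config.hidden_size)  # 32
mp.undo()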
def __init__(
    self,
    vocab: Vocabulary,
    embedding_dim: int,
    feedforward_dim: int,
    num_layers: int,
    num_attention_heads: int,
    position_embedding_dim: int,
    tokenizer_path: str,
    position_embedding_type: str = "absolute",
    activation: str = "gelu",
    hidden_dropout: float = 0.1,
) -> None:
    super().__init__()

    # TODO:
    # - Need to apply corrections in pretrained_transformer_mismatched_embedder

    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    vocab.add_transformer_vocab(tokenizer, "tokens")
    # "tokens" is padded by default--undo that
    del vocab._token_to_index["tokens"]["@@PADDING@@"]
    del vocab._token_to_index["tokens"]["@@UNKNOWN@@"]
    assert len(vocab._token_to_index["tokens"]) == len(vocab._index_to_token["tokens"])

    cfg = BertConfig(
        vocab_size=vocab.get_vocab_size("tokens"),
        hidden_size=embedding_dim,
        num_hidden_layers=num_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=feedforward_dim,
        hidden_act=activation,
        hidden_dropout_prob=hidden_dropout,
        max_position_embeddings=position_embedding_dim,
        position_embedding_type=position_embedding_type,
        use_cache=True,
    )
    self.cfg = cfg
    self._vocab = vocab
    self._namespace = "tokens"
    self.bert = BertModel(cfg)

    self.masking_collator = DataCollatorForWholeWordMask(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
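# Hedged, self-contained sketch of the BertConfig -> BertModel pattern used above.
# The concrete values are illustrative assumptions, not the module's defaults; the
# real vocab_size comes from the AllenNLP Vocabulary, and the other numbers map to
# embedding_dim, num_layers, num_attention_heads, feedforward_dim, position_embedding_dim.
import torch
from transformers import BertConfig, BertModel

cfg = BertConfig(
    vocab_size=1000,
    hidden_size=64,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=128,
    max_position_embeddings=64,
)
bert = BertModel(cfg)
ids = torch.randint(0, cfg.vocab_size, (2, 16))
out = bert(input_ids=ids, attention_mask=torch.ones_like(ids))
print(out.last_hidden_state.shape)  # torch.Size([2, 16, 64])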
def init_bert_models(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)

    self.torch_model = BertModel(self.cfg)
    self.torch_model.eval()
    if use_cuda:
        self.torch_model.to(self.test_device)

    self.hidden_size = self.cfg.hidden_size

    self.turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
        self.torch_model)
def __init__(self, embedding_matrix, opt):
    super(LCF_GLOVE, self).__init__()
    self.config = BertConfig.from_json_file("modules/utils/bert_config.json")
    self.opt = opt
    self.embed = nn.Embedding.from_pretrained(
        torch.tensor(embedding_matrix, dtype=torch.float))
    self.mha_global = SelfAttention(self.config, opt)
    self.mha_local = SelfAttention(self.config, opt)
    self.ffn_global = PositionwiseFeedForward(self.opt.embed_dim,
                                              dropout=self.opt.dropout)
    self.ffn_local = PositionwiseFeedForward(self.opt.embed_dim,
                                             dropout=self.opt.dropout)
    self.mha_local_SA = SelfAttention(self.config, opt)
    self.mha_global_SA = SelfAttention(self.config, opt)
    self.pool = BertPooler(self.config)
    self.dropout = nn.Dropout(opt.dropout)
    self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
def init_attn_models(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)

    # An ONMT-style MultiHeadedAttention could also serve as the torch reference:
    # self.torch_model = MultiHeadedAttention(self.cfg.num_attention_heads,
    #                                         self.cfg.hidden_size)
    self.torch_model = BertAttention(self.cfg)
    self.torch_model.eval()
    if use_cuda:
        self.torch_model.to(self.test_device)

    self.hidden_size = self.cfg.hidden_size

    # self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_onmt(
    #     self.torch_model)
    self.turbo_model = turbo_transformers.MultiHeadedAttentionSmartBatch.from_torch(
        self.torch_model)
def __init__(self, config):
    super().__init__(config)
    self.mmt_config = BertConfig(**self.config.mmt)
    self._datasets = registry.get("config").datasets.split(",")
def __init__(self, config):
    super().__init__()
    self.save_hyperparameters()

    bert_config = BertConfig(
        vocab_size=config["vocab_size"],
        hidden_size=config["hidden_size"],
        num_hidden_layers=config["num_layers"],
        num_attention_heads=config["num_heads"],
        intermediate_size=config["hidden_size"] * config["mlp_ratio"],
        max_position_embeddings=config["max_text_len"],
        hidden_dropout_prob=config["drop_rate"],
        attention_probs_dropout_prob=config["drop_rate"],
    )
    self.tempeture_max_OT = config['tempeture_max_OT']

    self.text_embeddings = BertEmbeddings(bert_config)
    self.text_embeddings.apply(objectives.init_weights)

    self.token_type_embeddings = nn.Embedding(2, config["hidden_size"])
    self.token_type_embeddings.apply(objectives.init_weights)

    import vilt.modules.vision_transformer as vit

    if self.hparams.config["load_path"] == "":
        self.transformer = getattr(vit, self.hparams.config["vit"])(
            pretrained=config["pretrained_flag"], config=self.hparams.config)
    else:
        self.transformer = getattr(vit, self.hparams.config["vit"])(
            pretrained=False, config=self.hparams.config
        )

    self.pooler = heads.Pooler(config["hidden_size"])
    self.pooler.apply(objectives.init_weights)

    if config["loss_names"]["mlm"] > 0:
        self.mlm_score = heads.MLMHead(bert_config)
        self.mlm_score.apply(objectives.init_weights)

    if config["loss_names"]["itm"] > 0:
        self.itm_score = heads.ITMHead(config["hidden_size"])
        self.itm_score.apply(objectives.init_weights)

    if config["loss_names"]["mpp"] > 0:
        self.mpp_score = heads.MPPHead(bert_config)
        self.mpp_score.apply(objectives.init_weights)

    # ===================== Downstream ===================== #
    if (
        self.hparams.config["load_path"] != ""
        and not self.hparams.config["test_only"]
    ):
        ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
        state_dict = ckpt["state_dict"]
        self.load_state_dict(state_dict, strict=False)
        print(f'Loading checkpoint from {self.hparams.config["load_path"]}')

    hs = self.hparams.config["hidden_size"]

    if self.hparams.config["loss_names"]["vqa"] > 0:
        vs = self.hparams.config["vqav2_label_size"]
        self.vqa_classifier = nn.Sequential(
            nn.Linear(hs, hs * 2),
            nn.LayerNorm(hs * 2),
            nn.GELU(),
            nn.Linear(hs * 2, vs),
        )
        self.vqa_classifier.apply(objectives.init_weights)

    if self.hparams.config["loss_names"]["nlvr2"] > 0:
        self.nlvr2_classifier = nn.Sequential(
            nn.Linear(hs * 2, hs * 2),
            nn.LayerNorm(hs * 2),
            nn.GELU(),
            nn.Linear(hs * 2, 2),
        )
        self.nlvr2_classifier.apply(objectives.init_weights)
        emb_data = self.token_type_embeddings.weight.data
        self.token_type_embeddings = nn.Embedding(3, hs)
        self.token_type_embeddings.apply(objectives.init_weights)
        self.token_type_embeddings.weight.data[0, :] = emb_data[0, :]
        self.token_type_embeddings.weight.data[1, :] = emb_data[1, :]
        self.token_type_embeddings.weight.data[2, :] = emb_data[1, :]

    if self.hparams.config["loss_names"]["irtr"] > 0:
        self.rank_output = nn.Linear(hs, 1)
        self.rank_output.weight.data = self.itm_score.fc.weight.data[1:, :]
        self.rank_output.bias.data = self.itm_score.fc.bias.data[1:]
        self.margin = 0.2
        for p in self.itm_score.parameters():
            p.requires_grad = False

    vilt_utils.set_metrics(self)
    self.current_tasks = list()

    # ===================== load downstream (test_only) ======================
    if self.hparams.config["load_path"] != "" and self.hparams.config["test_only"]:
        ckpt = torch.load(self.hparams.config["load_path"], map_location="cpu")
        state_dict = ckpt["state_dict"]
        self.load_state_dict(state_dict, strict=False)
        print(f'Loading checkpoint from {self.hparams.config["load_path"]}')
params = {
    'tokenizer_config': {
        'type': 'bert-base-uncased',
        'params': {
            'do_lower_case': True
        }
    },
    'mask_probability': 0,
    'max_seq_length': 128
}
mmf_tok = MMFTokenizer(OmegaConf.create(params))
mmf_tok._tokenizer = BertTokenizer(vocab_file="vocabulary.txt")

config = BertConfig.from_pretrained('bert-large-uncased',
                                    num_labels=2,
                                    vocab_size=len(vocabulary),
                                    num_hidden_layers=3)
net = VisualBertModel(config, visual_embedding_dim=2048).cuda()

out_txt = mmf_tok({'text': report})
input_ids = torch.tensor(out_txt['input_ids']).unsqueeze(0)
input_mask = torch.tensor(out_txt['input_mask']).unsqueeze(0)
img = torch.zeros(1, 14, 2048)

out = net(input_ids=input_ids.cuda(),
          text_mask=input_mask.cuda(),
          visual_embeddings=img.cuda())
print(net.config.add_cross_attention)
print(out.keys())