# Requires BertConfig, BertForPreTraining, BertEmbeddings, and BertPooler from
# transformers (the exact import path for BertEmbeddings/BertPooler depends on
# the installed transformers version).
def _build_word_embedding(self):
    self.bert_config = BertConfig.from_pretrained(self.config.bert_model_name)
    if self.config.pretrained_bert:
        # Reuse the embeddings and pooler from a pretrained BERT checkpoint,
        # then re-initialize the pooler weights.
        bert_model = BertForPreTraining.from_pretrained(self.config.bert_model_name)
        self.word_embedding = bert_model.bert.embeddings
        self.pooler = bert_model.bert.pooler
        self.pooler.apply(self.init_weights)
    else:
        # Build the embeddings and pooler from scratch using only the config.
        self.pooler = BertPooler(self.bert_config)
        self.word_embedding = BertEmbeddings(self.bert_config)
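A minimal sketch of how the modules built above are typically exercised. `model` stands for an instance of the surrounding class after `_build_word_embedding` has run (the name is an assumption), and the token ids are toy values; only 101/102 ([CLS]/[SEP] in the standard uncased vocabulary) are meaningful.

import torch

# Sketch only: `model` is assumed to own `word_embedding` and `pooler`.
input_ids = torch.tensor([[101, 1037, 3231, 102]])   # toy ids: [CLS] ... [SEP]
token_type_ids = torch.zeros_like(input_ids)

# BertEmbeddings: token ids -> (batch, seq_len, hidden_size)
embedded = model.word_embedding(input_ids, token_type_ids=token_type_ids)

# BertPooler maps a hidden-state sequence to a (batch, hidden_size) vector taken
# from the first position. (In the full model it runs on the encoder output,
# not directly on the embeddings; this call only checks shapes.)
pooled = model.pooler(embedded)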
def __init__(self, model_name: str) -> None:
    super().__init__()
    config = BertConfig.from_pretrained(model_name)
    self.input_dim = config.hidden_size
    self.output_dim = config.vocab_size
    # TODO(mattg): It's possible that we could use some kind of cache like we have in
    # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel. That way, we
    # would only load the BERT weights once. Though, it's not clear how to do that here, as we
    # need to load `BertForMaskedLM`, not just `BertModel`...
    bert_model = BertForMaskedLM.from_pretrained(model_name)
    # Keep only the pretrained masked-LM head (hidden_size -> vocab_size); the
    # encoder inside BertForMaskedLM is not stored on this module.
    self.bert_lm_head = bert_model.cls
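A sketch of how the retained LM head is applied; `head` stands for an instance of the class defined above (the name is an assumption), and the shapes follow the `input_dim`/`output_dim` attributes set in `__init__`.

import torch

# Sketch only: `head` is an instance of the module defined above.
batch_size, seq_len = 2, 7
hidden_states = torch.randn(batch_size, seq_len, head.input_dim)

# BertForMaskedLM's `cls` head maps encoder hidden states to vocabulary logits:
# (batch, seq_len, hidden_size) -> (batch, seq_len, vocab_size).
logits = head.bert_lm_head(hidden_states)
assert logits.shape == (batch_size, seq_len, head.output_dim)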
# Build the tokenizer from an MMF-style config, then swap in a BertTokenizer
# backed by a custom vocabulary file.
params = {
    'tokenizer_config': {
        'type': 'bert-base-uncased',
        'params': {'do_lower_case': True}
    },
    'mask_probability': 0,
    'max_seq_length': 128
}
mmf_tok = MMFTokenizer(OmegaConf.create(params))
mmf_tok._tokenizer = BertTokenizer(vocab_file="vocabulary.txt")

# A 3-layer VisualBERT built from a bert-large-uncased config, with the vocab
# size matched to the custom vocabulary (`vocabulary` and `report` are defined
# elsewhere).
config = BertConfig.from_pretrained(
    'bert-large-uncased',
    num_labels=2,
    vocab_size=len(vocabulary),
    num_hidden_layers=3,
)
net = VisualBertModel(config, visual_embedding_dim=2048).cuda()

# Tokenize one report and pair it with a dummy batch of 14 zero image features.
out_txt = mmf_tok({'text': report})
input_ids = torch.tensor(out_txt['input_ids']).unsqueeze(0)
input_mask = torch.tensor(out_txt['input_mask']).unsqueeze(0)
img = torch.zeros(1, 14, 2048)

out = net(
    input_ids=input_ids.cuda(),
    text_mask=input_mask.cuda(),
    visual_embeddings=img.cuda(),
)
print(net.config.add_cross_attention)
print(out.keys())
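A CPU-only sanity check, sketched under the assumption that `VisualBertModel` accepts the same call signature used above; random token ids stand in for tokenizer output, so no vocabulary file or CUDA device is needed.

# Sketch only: reuse `config` from above and feed random ids instead of
# tokenizer output.
cpu_net = VisualBertModel(config, visual_embedding_dim=2048)
dummy_ids = torch.randint(0, config.vocab_size, (1, 128))
dummy_mask = torch.ones(1, 128, dtype=torch.long)
dummy_img = torch.zeros(1, 14, 2048)

cpu_out = cpu_net(
    input_ids=dummy_ids,
    text_mask=dummy_mask,
    visual_embeddings=dummy_img,
)
print(cpu_out.keys())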