def setUp(self):
    super().setUp()
    self.monkeypatch = MonkeyPatch()

    # monkeypatch the PretrainedBertModel to return the tiny test fixture model
    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig.from_json_file(str(config_path))
    self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
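# A matching tearDown is needed so the patch does not leak into other tests;
# a minimal sketch, assuming pytest's MonkeyPatch API (undo() reverses the
# setattr calls made above):
def tearDown(self):
    self.monkeypatch.undo()
    super().tearDown()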
def setUp(self):
    super().setUp()

    # build the indexer and embedder from the tiny BERT test fixtures
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    self.token_indexer = PretrainedBertIndexer(str(vocab_path))

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig.from_json_file(str(config_path))
    self.bert_model = BertModel(config)
    self.token_embedder = BertEmbedder(self.bert_model)
import torch
# imports added for self-containment, assuming pytorch_pretrained_bert,
# whose load_tf_weights_in_bert takes (model, tf_checkpoint_path)
from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
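# A minimal usage sketch for the converter above; all three paths are
# hypothetical placeholders, not paths from the original project:
if __name__ == "__main__":
    convert_tf_checkpoint_to_pytorch(
        "checkpoints/bert_model.ckpt",    # TF checkpoint prefix (hypothetical)
        "checkpoints/bert_config.json",   # model config file (hypothetical)
        "checkpoints/pytorch_model.bin",  # output file (hypothetical)
    )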
def test_sliding_window(self):
    tokenizer = BertPreTokenizer()

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig.from_json_file(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    # 16 = [CLS], 17 = [SEP]
    # three half-overlapping windows, each wrapped in [CLS]/[SEP]
    assert tokens["input_ids"].tolist() == [
        [16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2, 14, 12, 17]
    ]
    assert tokens["offsets"].tolist() == [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

    bert_vectors = token_embedder(tokens["input_ids"])
    # 23 window pieces, minus the 4 extra [CLS]/[SEP] in windows 2 and 3 and
    # the 6 duplicated overlap pieces, leaves 13 recombined positions
    assert list(bert_vectors.shape) == [1, 13, 12]

    # Testing without token_type_ids
    bert_vectors = token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    # one vector per original word (the sentence has 10 words)
    assert list(bert_vectors.shape) == [1, 10, 12]

    # Testing with token_type_ids
    bert_vectors = token_embedder(
        tokens["input_ids"], offsets=tokens["offsets"], token_type_ids=tokens["token_type_ids"]
    )
    assert list(bert_vectors.shape) == [1, 10, 12]
def __init__(self, embedding_matrix, opt):
    super(LCA_GLOVE, self).__init__()
    # Only a few of the fields in bert_config.json are actually used here,
    # e.g. hidden_size and num_attention_heads
    self.config = BertConfig.from_json_file("utils/bert_config.json")
    self.opt = opt
    self.embed = nn.Embedding.from_pretrained(
        torch.tensor(embedding_matrix, dtype=torch.float))
    self.lc_embed = nn.Embedding(2, opt.embed_dim)
    self.global_encoder1 = SelfAttention(self.config, opt)
    self.local_encoder1 = SelfAttention(self.config, opt)
    self.local_encoder2 = SelfAttention(self.config, opt)
    self.mha = SelfAttention(self.config, opt)
    self.pool = BertPooler(self.config)
    self.dropout = nn.Dropout(opt.dropout)
    self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
    self.classifier = nn.Linear(opt.embed_dim, 2)
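# For reference, a minimal sketch of the handful of fields such a
# utils/bert_config.json would need, built in code instead; the values are
# illustrative assumptions (transformers-style BertConfig constructor assumed),
# not the original repository's settings:
minimal_config = BertConfig(
    hidden_size=300,        # assumed to match opt.embed_dim
    num_attention_heads=6,  # must divide hidden_size evenly
)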
def setUp(self):
    self.monkeypatch = MonkeyPatch()
    # monkeypatch the PretrainedBertModel to return the tiny test fixture model
    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    config = BertConfig.from_json_file(config_path)
    self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
    self.monkeypatch.setattr(BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path))

    super().setUp()
    self.set_up_model(
        self.FIXTURES_ROOT / "bert_srl" / "experiment.jsonnet",
        self.FIXTURES_ROOT / "conll_2012",
    )
def __init__(self, embedding_matrix, opt):
    super(LCF_GLOVE, self).__init__()
    self.config = BertConfig.from_json_file("utils/bert_config.json")
    self.opt = opt
    self.embed = nn.Embedding.from_pretrained(
        torch.tensor(embedding_matrix, dtype=torch.float))
    self.mha_global = SelfAttention(self.config, opt)
    self.mha_local = SelfAttention(self.config, opt)
    self.ffn_global = PositionwiseFeedForward(self.opt.embed_dim, dropout=self.opt.dropout)
    self.ffn_local = PositionwiseFeedForward(self.opt.embed_dim, dropout=self.opt.dropout)
    self.mha_local_SA = SelfAttention(self.config, opt)
    self.mha_global_SA = SelfAttention(self.config, opt)
    self.pool = BertPooler(self.config)
    self.dropout = nn.Dropout(opt.dropout)
    self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim)
    self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
def test_sliding_window_with_batch(self):
    tokenizer = BertPreTokenizer()

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig.from_json_file(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

    batch = Batch([instance, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]["bert"]

    # Testing without token_type_ids
    bert_vectors = token_embedder(tokens["input_ids"], offsets=tokens["offsets"])
    assert bert_vectors is not None

    # Testing with token_type_ids
    bert_vectors = token_embedder(
        tokens["input_ids"], offsets=tokens["offsets"], token_type_ids=tokens["token_type_ids"]
    )
    assert bert_vectors is not None
import torch
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
from transformers.modeling_bert import BertForMaskedLM, BertConfig
import MeCab

# Load the models
tokenizer = BertJapaneseTokenizer.from_pretrained('model/')
config = BertConfig.from_json_file('model/bert_base_32k_config.json')
model = BertForMaskedLM.from_pretrained('model/model.ckpt-580000_pytorch.bin', config=config)
m = MeCab.Tagger("-Ochasen")


def sent_emb(text):
    print('text:', text)
    input_ids = tokenizer.encode(text, return_tensors='pt')
    print('tokenizer.convert:', tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))
    # position of the first [MASK] token
    masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    print('masked index:', masked_index)
    result = model(input_ids)
    # top-10 predicted token ids for the masked position
    pred_ids = result[0][:, masked_index].topk(10).indices.tolist()[0]
    output = []
    for pred_id in pred_ids:
        output_ids = input_ids.tolist()[0]
        output_ids[masked_index] = pred_id
        text = tokenizer.decode(output_ids)
        # print(text)
        # assumed continuation: the original snippet is truncated here
        output.append(text)
    return output
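# A minimal usage sketch; the example sentence is an illustrative assumption
# and must contain the tokenizer's [MASK] token:
if __name__ == '__main__':
    print(sent_emb('今日の天気は[MASK]です。'))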
""" This file contains implementation of transformation tensorflow Bert model to pytorch representation. """ import torch from transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert # This script is used to convert tensorflow bert model to pytorch representation publicly known # path to dictionary bert_dir='/mnt/data/xkloco00_pc5/external/multi_cased_L-12_H-768_A-12' # important files tf_checkpoint_path=bert_dir+'/'+"bert_model.ckpt" bert_config_file = bert_dir+'/'+"bert_config.json" pytorch_dump_path=bert_dir+'/'+"pytorch_model.bin" config = BertConfig.from_json_file(bert_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = BertForPreTraining(config) # Load weights from tf checkpoint load_tf_weights_in_bert(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path)
def __init__(self, config: Config, output_encoded_layers: bool, *args, **kwargs) -> None:
    super().__init__(config, output_encoded_layers=output_encoded_layers)
    # Load config
    config_file = os.path.join(config.bert_cpt_dir, "bert_config.json")
    bert_config = BertConfig.from_json_file(config_file)
    print("Bert model config {}".format(bert_config))
    # Instantiate model.
    model = BertModel(bert_config)
    weights_path = os.path.join(config.bert_cpt_dir, "pytorch_model.bin")
    # load pre-trained weights if weights_path exists
    if config.load_weights and PathManager.isfile(weights_path):
        state_dict = torch.load(weights_path)

        missing_keys: List[str] = []
        unexpected_keys: List[str] = []
        error_msgs: List[str] = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, "_metadata", None)
        # use an OrderedDict (from collections) so the _metadata attribute can
        # be re-attached below; plain dicts don't accept attribute assignment
        tmp_state_dict = OrderedDict()
        for key, value in state_dict.items():
            if key.endswith("LayerNorm.gamma"):  # compatibility with v0.5 models
                key = key.replace("LayerNorm.gamma", "LayerNorm.weight")
            if key.endswith("LayerNorm.beta"):  # compatibility with v0.5 models
                key = key.replace("LayerNorm.beta", "LayerNorm.bias")
            tmp_state_dict[key] = value
        state_dict = tmp_state_dict
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=""):
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(
                state_dict,
                prefix,
                local_metadata,
                True,
                missing_keys,
                unexpected_keys,
                error_msgs,
            )
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + ".")

        load(model, prefix="" if hasattr(model, "bert") else "bert.")
        if len(missing_keys) > 0:
            print("Weights of {} not initialized from pretrained model: {}".format(
                model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            print("Weights from pretrained model not used in {}: {}".format(
                model.__class__.__name__, unexpected_keys))

    self.bert = model
    log_class_usage(__class__)
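# A tiny standalone illustration of the v0.5 LayerNorm key renaming performed
# above; the key is a made-up example, not taken from a real checkpoint:
old_key = "bert.encoder.layer.0.attention.output.LayerNorm.gamma"
new_key = old_key.replace("LayerNorm.gamma", "LayerNorm.weight").replace(
    "LayerNorm.beta", "LayerNorm.bias")
assert new_key == "bert.encoder.layer.0.attention.output.LayerNorm.weight"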