def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    attention_mask = None
    if self.use_attention_mask:
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    config = BigBirdConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        is_decoder=False,
        initializer_range=self.initializer_range,
        attention_type=self.attention_type,
        block_size=self.block_size,
        num_random_blocks=self.num_random_blocks,
        use_bias=self.use_bias,
        rescale_embeddings=self.rescale_embeddings,
    )

    return config, input_ids, token_type_ids, attention_mask
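# A minimal sketch of how the returned tuple is typically consumed in a
# ModelTester-style check (assumptions: BigBirdModel and torch_device are
# available, as in the transformers test suite):
def create_and_check_model(self, config, input_ids, token_type_ids, attention_mask):
    model = BigBirdModel(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    self.parent.assertEqual(
        result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)
    )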
import csv
import traceback
from itertools import product

import torch

from transformers import BigBirdConfig, BigBirdForSequenceClassification, BigBirdTokenizer


def pytorch_benchmark(batch_sizes, sequence_lengths, nums_random_blocks, output_path, attention_type="block_sparse"):
    # Benchmark forward/backward times for every (batch size, sequence length,
    # number of random blocks) combination and write the measurements to a CSV.
    device = torch.device("cuda")
    fp = open(output_path, "w")
    writer = csv.writer(fp)
    writer.writerow(["batch_size", "seq_length", "r", "forward time (ms)", "backward time (ms)"])

    tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
    for b, n, r in product(batch_sizes, sequence_lengths, nums_random_blocks):
        print(b, n, r)
        # `input_text` is assumed to be defined at module scope.
        inputs = tokenizer([input_text for _ in range(b)], max_length=n, truncation=True, return_tensors="pt")
        # Pass the current number of random blocks through to the config so
        # that `r` actually affects the sparse attention pattern.
        config = BigBirdConfig.from_pretrained(
            "google/bigbird-roberta-base", attention_type=attention_type, num_random_blocks=r
        )
        model = BigBirdForSequenceClassification.from_pretrained("google/bigbird-roberta-base", config=config)
        model.to(device)
        try:
            torch.cuda.synchronize()
            forward_time = 0
            backward_time = 0
            # Average the timings over 10 runs.
            for _ in range(10):
                forward_elapse, backward_elapse = time_foward_backward(model, inputs)
                forward_time += forward_elapse
                backward_time += backward_elapse
            forward_time /= 10
            backward_time /= 10
            print(forward_time, backward_time)
            writer.writerow([b, n, r, forward_time, backward_time])
        except Exception as e:
            print("Error:", e)
            traceback.print_exc()
    fp.close()
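# `time_foward_backward` is called above but not defined in this snippet (the
# name is kept exactly as referenced). A minimal sketch, assuming CUDA-event
# timing and the sum of the logits as a stand-in loss for the backward pass:
def time_foward_backward(model, inputs):
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    start, mid, end = (torch.cuda.Event(enable_timing=True) for _ in range(3))
    start.record()
    logits = model(**inputs).logits
    mid.record()
    logits.sum().backward()  # proxy loss; a real run might supply labels instead
    end.record()
    torch.cuda.synchronize()
    # elapsed_time returns milliseconds, matching the CSV header above.
    return start.elapsed_time(mid), mid.elapsed_time(end)

# Hypothetical invocation over a small grid:
# pytorch_benchmark([1, 2], [1024, 4096], [3], "bigbird_benchmark.csv")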
from transformers import (
    BigBirdConfig,
    BigBirdForPreTraining,
    BigBirdForQuestionAnswering,
    load_tf_weights_in_big_bird,
)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, pytorch_dump_path, is_trivia_qa):
    # Initialise the PyTorch model from the JSON configuration.
    config = BigBirdConfig.from_json_file(big_bird_config_file)
    print(f"Building PyTorch model from configuration: {config}")

    if is_trivia_qa:
        model = BigBirdForQuestionAnswering(config)
    else:
        model = BigBirdForPreTraining(config)

    # Load weights from the TensorFlow checkpoint.
    load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=is_trivia_qa)

    # Save the PyTorch model.
    print(f"Saving PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)
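# A minimal CLI wrapper sketch around the function above, mirroring its
# argument names (assumption: this snippet is run as a standalone script):
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow checkpoint.")
    parser.add_argument("--big_bird_config_file", type=str, required=True, help="Path to the BigBird config JSON file.")
    parser.add_argument("--pytorch_dump_path", type=str, required=True, help="Where to save the PyTorch model.")
    parser.add_argument("--is_trivia_qa", action="store_true", help="Convert a TriviaQA (question answering) checkpoint.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        args.tf_checkpoint_path, args.big_bird_config_file, args.pytorch_dump_path, args.is_trivia_qa
    )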
def test_torch_encode_plus_sent_to_model(self):
    import torch

    from transformers import BigBirdConfig, BigBirdModel

    # Build sequence
    first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10]
    sequence = " ".join(first_ten_tokens)
    encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt", return_token_type_ids=False)
    batch_encoded_sequence = self.big_tokenizer.batch_encode_plus(
        [sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False
    )

    config = BigBirdConfig(attention_type="original_full")
    model = BigBirdModel(config)

    assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size

    with torch.no_grad():
        model(**encoded_sequence)
        model(**batch_encoded_sequence)
def get_config(self):
    return BigBirdConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        is_encoder_decoder=False,
        initializer_range=self.initializer_range,
        attention_type=self.attention_type,
        use_bias=self.use_bias,
        rescale_embeddings=self.rescale_embeddings,
        block_size=self.block_size,
        num_random_blocks=self.num_rand_blocks,
        position_embedding_type=self.position_embedding_type,
    )
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BigBirdConfig

# ReaderOntonotes, ChunksPlusDocumentsDataset and LongAttentionBERT are
# project-local classes, assumed to be importable from this repository.

BATCH_SIZE = 4
SHUFFLE = False
SEQ_LENGTH = 128
PATH = "data/ontonotes/train"

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

reader = ReaderOntonotes(include_document_ids=True)
sentences, labels, documents_masks, document2sentences, sentence2position = reader.read(PATH)

dataset = ChunksPlusDocumentsDataset(
    sentences, labels, SEQ_LENGTH, document2sentences, sentence2position, tokenizer, "Bert"
)
dataloader = DataLoader(dataset, BATCH_SIZE, shuffle=SHUFFLE, collate_fn=dataset.paddings)
input_ids, label_ids, attention_mask, word_ids = next(iter(dataloader))

# Number of classes for NER.
classes = len(dataset.entity_tags)

# BigBird attention config; the special-token ids follow the BERT vocabulary.
config = BigBirdConfig()
config.pad_token_id = 0
config.bos_token_id = 101
config.sep_token_id = 102
config.eos_token_id = -1
config.attention_type = "original_full"

model = LongAttentionBERT(model_name="bert-base-cased", classes=classes, attention_config=config)
print(model.forward(input_ids=input_ids, attention_mask=attention_mask))