    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size)

        attention_mask = None
        if self.use_attention_mask:
            attention_mask = random_attention_mask(
                [self.batch_size, self.seq_length])

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                        self.type_vocab_size)

        config = BigBirdConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
            attention_type=self.attention_type,
            block_size=self.block_size,
            num_random_blocks=self.num_random_blocks,
            use_bias=self.use_bias,
            rescale_embeddings=self.rescale_embeddings,
        )

        return config, input_ids, token_type_ids, attention_mask
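A minimal sketch of how the returned values are typically consumed in a test. The tester instance and the BigBirdModel forward call below are assumptions for illustration, not part of the snippet above:

import torch
from transformers import BigBirdModel

config, input_ids, token_type_ids, attention_mask = tester.prepare_config_and_inputs()
model = BigBirdModel(config)
model.eval()
with torch.no_grad():
    # Forward pass with the randomly generated inputs; the output hidden states have
    # shape (batch_size, seq_length, hidden_size).
    outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
print(outputs.last_hidden_state.shape)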
Example #2
import csv
import traceback
from itertools import product

import torch
from transformers import BigBirdConfig, BigBirdForSequenceClassification, BigBirdTokenizer


def pytorch_benchmark(batch_sizes, sequence_lengths, nums_random_blocks, output_path, attention_type="block_sparse"):
    # Benchmark every (batch_size, seq_length, num_random_blocks) combination and write the
    # averaged forward/backward times to a CSV file at output_path.
    # `input_text` and `time_foward_backward` are expected to be defined elsewhere in the script.
    device = torch.device("cuda")

    fp = open(output_path, "w")
    writer = csv.writer(fp)
    writer.writerow(["batch_size", "seq_length", "r", "forward time (ms)", "backward time (ms)"])
    tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
    for b, n, r in product(batch_sizes, sequence_lengths, nums_random_blocks):
        print(b, n, r)
        inputs = tokenizer([input_text for _ in range(b)], max_length=n, truncation=True, return_tensors="pt")
        config = BigBirdConfig.from_pretrained("google/bigbird-roberta-base", attention_type=attention_type)
        model = BigBirdForSequenceClassification.from_pretrained("google/bigbird-roberta-base", config=config)
        model.to(device)
        try:
            torch.cuda.synchronize()
            forward_time = 0
            backward_time = 0
            for _ in range(10):
                forward_elapse, backward_elapse = time_foward_backward(model, inputs)

                forward_time += forward_elapse
                backward_time += backward_elapse
            forward_time /= 10
            backward_time /= 10
            print(forward_time, backward_time)
            writer.writerow([b, n, r, forward_time, backward_time])
        except Exception as e:
            print("Error:", e)
            traceback.print_exc()

    fp.close()
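The helper time_foward_backward is not defined in this snippet. A plausible sketch of it (the dummy labels and millisecond return values are assumptions), timing one forward and one backward pass with CUDA events:

import torch

def time_foward_backward(model, inputs):
    # Move inputs to the GPU and attach dummy labels so the classification head produces a loss.
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    labels = torch.zeros(inputs["input_ids"].shape[0], dtype=torch.long, device="cuda")
    start, mid, end = (torch.cuda.Event(enable_timing=True) for _ in range(3))
    start.record()
    output = model(**inputs, labels=labels)
    mid.record()
    output.loss.backward()
    end.record()
    torch.cuda.synchronize()
    model.zero_grad()
    # elapsed_time returns the time between two recorded events in milliseconds.
    return start.elapsed_time(mid), mid.elapsed_time(end)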
Example #3
from transformers import BigBirdConfig, BigBirdForPreTraining, BigBirdForQuestionAnswering, load_tf_weights_in_big_bird


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, pytorch_dump_path, is_trivia_qa):
    # Initialise PyTorch model
    config = BigBirdConfig.from_json_file(big_bird_config_file)
    print(f"Building PyTorch model from configuration: {config}")

    if is_trivia_qa:
        model = BigBirdForQuestionAnswering(config)
    else:
        model = BigBirdForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=is_trivia_qa)

    # Save pytorch-model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)
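A minimal command-line wrapper for this converter (the argument names are assumptions modeled on the function signature):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", required=True, help="Path to the TensorFlow checkpoint.")
    parser.add_argument("--big_bird_config_file", required=True, help="Path to the BigBird config JSON file.")
    parser.add_argument("--pytorch_dump_path", required=True, help="Where to save the converted PyTorch model.")
    parser.add_argument("--is_trivia_qa", action="store_true", help="Set if the checkpoint is a TriviaQA (question answering) model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        args.tf_checkpoint_path, args.big_bird_config_file, args.pytorch_dump_path, args.is_trivia_qa
    )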
Example #4
    def test_torch_encode_plus_sent_to_model(self):
        import torch

        from transformers import BigBirdConfig, BigBirdModel

        # Build sequence
        first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10]
        sequence = " ".join(first_ten_tokens)
        encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt", return_token_type_ids=False)
        batch_encoded_sequence = self.big_tokenizer.batch_encode_plus(
            [sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False
        )

        config = BigBirdConfig(attention_type="original_full")
        model = BigBirdModel(config)

        assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size

        with torch.no_grad():
            model(**encoded_sequence)
            model(**batch_encoded_sequence)
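attention_type="original_full" is used above because the test sequence is only a handful of tokens; block-sparse attention is meant for long inputs, and the implementation switches back to full attention when the sequence is too short for the sparsity pattern. A sketch of a block-sparse setup for long sequences (the specific values are illustrative assumptions):

import torch
from transformers import BigBirdConfig, BigBirdModel, BigBirdTokenizer

tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
# block_size and num_random_blocks control the sparsity pattern; the input should be
# much longer than block_size for block-sparse attention to actually be used.
config = BigBirdConfig(attention_type="block_sparse", block_size=64, num_random_blocks=3)
model = BigBirdModel(config)
inputs = tokenizer("a long document " * 512, return_tensors="pt")
with torch.no_grad():
    model(**inputs)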
Example #5
    def get_config(self):
        return BigBirdConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            is_encoder_decoder=False,
            initializer_range=self.initializer_range,
            attention_type=self.attention_type,
            use_bias=self.use_bias,
            rescale_embeddings=self.rescale_embeddings,
            block_size=self.block_size,
            num_random_blocks=self.num_rand_blocks,
            position_embedding_type=self.position_embedding_type,
        )
Example #6
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BigBirdConfig

BATCH_SIZE = 4
SHUFFLE = False
SEQ_LENGTH = 128 
PATH = "data/ontonotes/train"

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# ReaderOntonotes, ChunksPlusDocumentsDataset and LongAttentionBERT are project-specific
# classes assumed to be importable from the surrounding codebase.
reader = ReaderOntonotes(include_document_ids=True)
sentences, labels, documents_masks, document2sentences, sentence2position = reader.read(PATH)
dataset = ChunksPlusDocumentsDataset(sentences, labels, SEQ_LENGTH, document2sentences, sentence2position, tokenizer, 'Bert')
dataloader = DataLoader(dataset, BATCH_SIZE, shuffle=SHUFFLE, collate_fn=dataset.paddings)

input_ids, label_ids, attention_mask, word_ids = next(iter(dataloader))

# number of classes for NER
classes = len(dataset.entity_tags)

config = BigBirdConfig()
# Align BigBird's special-token ids with the BERT tokenizer: [PAD]=0, [CLS]=101, [SEP]=102.
config.pad_token_id = 0
config.bos_token_id = 101
config.sep_token_id = 102
config.eos_token_id = -1
config.attention_type = 'original_full'

model = LongAttentionBERT(model_name='bert-base-cased', classes=classes, attention_config=config)

print(model.forward(input_ids=input_ids, attention_mask=attention_mask))
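The special-token ids assigned to the config above are meant to line up with bert-base-cased, whose [PAD], [CLS] and [SEP] ids are 0, 101 and 102. A quick sanity check against the tokenizer already loaded above:

# Verify that the hard-coded ids match the BERT tokenizer's special tokens.
assert tokenizer.pad_token_id == config.pad_token_id == 0
assert tokenizer.cls_token_id == config.bos_token_id == 101
assert tokenizer.sep_token_id == config.sep_token_id == 102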