def create_and_check_attention_mask_determinism(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = LongformerModel(config=config)
    model.to(torch_device)
    model.eval()

    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
    output_with_mask = model(input_ids, attention_mask=attention_mask)["last_hidden_state"]
    output_without_mask = model(input_ids)["last_hidden_state"]
    self.parent.assertTrue(
        torch.allclose(output_with_mask[0, 0, :5], output_without_mask[0, 0, :5], atol=1e-4)
    )
def create_and_check_model_with_global_attention_mask(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = LongformerModel(config=config)
    model.to(torch_device)
    model.eval()

    global_attention_mask = input_mask.clone()
    global_attention_mask[:, input_mask.shape[-1] // 2] = 0
    global_attention_mask = global_attention_mask.to(torch_device)

    result = model(
        input_ids,
        attention_mask=input_mask,
        global_attention_mask=global_attention_mask,
        token_type_ids=token_type_ids,
    )
    result = model(input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask)
    result = model(input_ids, global_attention_mask=global_attention_mask)

    self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_longformer_model(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = LongformerModel(config=config)
    model.to(torch_device)
    model.eval()

    sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
    sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
    sequence_output, pooled_output = model(input_ids)

    result = {
        "sequence_output": sequence_output,
        "pooled_output": pooled_output,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
    )
    self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def __init__(self, model_name: str = "allenai/longformer-base-4096"):
    self.model = LongformerModel.from_pretrained(model_name)
    self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
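# Hedged usage sketch (not part of the snippet above): a hypothetical encode() helper
# for this wrapper, showing how the tokenizer/model pair is typically used together.
# The method name, truncation settings, and pooling choice are assumptions; it also
# assumes `import torch` at module level.
def encode(self, text: str) -> torch.Tensor:
    inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=4096)
    with torch.no_grad():
        outputs = self.model(**inputs)
    # Use the hidden state of the <s> token as a single document embedding.
    return outputs.last_hidden_state[:, 0, :]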
def __init__(self, config, project_dim: int = 0, seq_project=True):
    LongformerModel.__init__(self, config)
    assert config.hidden_size > 0, "Encoder hidden_size can't be zero"
    self.encode_proj = nn.Linear(config.hidden_size, project_dim) if project_dim != 0 else None
    self.seq_project = seq_project
    self.init_weights()
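# Hedged sketch (an assumption, not the original class): one plausible way the optional
# projection defined above could be applied on top of the Longformer outputs. Assumes
# the class subclasses LongformerModel, matching the __init__ call above.
def encode(self, input_ids, attention_mask=None):
    outputs = super().forward(input_ids, attention_mask=attention_mask)
    sequence_output, pooled_output = outputs[0], outputs[1]
    if self.encode_proj is not None:
        pooled_output = self.encode_proj(pooled_output)
    return sequence_output, pooled_output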
print(df.target.value_counts())

import torch
from transformers import DistilBertTokenizerFast, DistilBertModel, DistilBertConfig
from transformers import LongformerTokenizerFast, LongformerModel, LongformerConfig

# model_name = 'distilbert-base-uncased'
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

df["vecs"] = df.text.map(lambda x: torch.LongTensor(tokenizer.encode(x)).unsqueeze(0))

config = LongformerConfig.from_pretrained(model_name, output_hidden_states=True)
model = LongformerModel.from_pretrained(model_name, config=config)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
model = model.to(device)

input_tf = tokenizer.batch_encode_plus(df.text.to_list(), return_tensors='pt', padding=True)
# vecs = input_tf['input_ids'].to(device)
# granola_ids = granola_ids.to(device)

model.eval()
with torch.no_grad():
    print("and GO!!!!")
import torch
from transformers import ElectraForMaskedLM, ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
print(prediction_scores)

## Longformer
from transformers import LongformerModel, LongformerTokenizer

model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(
    input_ids.shape, dtype=torch.long, device=input_ids.device)  # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
def test_all(args):
    # Currently the Longformer attention operator can only run on GPU (no CPU implementation yet).
    device = torch.device('cuda:0')

    results = []
    for model_name in args.models:
        # Here we run an example input
        from transformers import LongformerModel
        torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[model_name]
        model = LongformerModel.from_pretrained(torch_model_name_or_dir)  # pretrained model name or directory
        model.to(device)

        # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
        # TODO: call convert_longformer_to_onnx to export onnx instead.
        import os.path
        optimized = False
        precision = 'fp32'
        onnx_model_path = os.path.join(args.onnx_dir, model_name + ".onnx")
        optimized_fp32_model = os.path.join(args.onnx_dir, model_name + "_fp32.onnx")
        optimized_fp16_model = os.path.join(args.onnx_dir, model_name + "_fp16.onnx")
        if os.path.isfile(optimized_fp16_model):
            onnx_model_path = optimized_fp16_model
            optimized = True
            precision = 'fp16'
        elif os.path.isfile(optimized_fp32_model):
            onnx_model_path = optimized_fp32_model
            optimized = True
        print("ONNX model path:", onnx_model_path)

        for num_threads in args.num_threads:
            if "torch" in args.engines:
                results += test_torch_latency(device, model, model_name, args.batch_sizes,
                                              args.sequence_lengths, args.global_lengths,
                                              args.test_times, num_threads, args.verbose)

            if "onnxruntime" in args.engines:
                if args.memory:
                    test_ort_memory(device, onnx_model_path, args.batch_sizes[0],
                                    args.sequence_lengths[0], args.global_lengths[0],
                                    args.test_times, num_threads)
                else:  # test latency
                    session = benchmark_helper.create_onnxruntime_session(
                        onnx_model_path, use_gpu=True, enable_all_optimization=True, num_threads=num_threads)
                    if session is None:
                        raise RuntimeError(f"Failed to create ORT session from ONNX file {onnx_model_path}")

                    results += test_ort_latency(device, model, model_name, session, args.batch_sizes,
                                                args.sequence_lengths, args.global_lengths, args.test_times,
                                                num_threads, optimized, precision, args.validate_onnx,
                                                args.disable_io_binding, args.verbose)

    return results
def __init__(self):
    super(Model, self).__init__()
    self.model = LongformerModel.from_pretrained(model_config.pretrain_model_path,
                                                 gradient_checkpointing=True)
    self.config = self.model.config
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.classifier = nn.Linear(self.config.hidden_size, config.num_labels)
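# Hedged sketch of a matching forward pass (an assumption, not part of the original
# snippet): pool the <s> token from the Longformer output, apply dropout, and classify.
# The pooling choice and argument names are assumptions.
def forward(self, input_ids, attention_mask=None):
    sequence_output = self.model(input_ids, attention_mask=attention_mask)[0]
    pooled = sequence_output[:, 0, :]  # <s> token representation
    return self.classifier(self.dropout(pooled))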
import en_core_web_sm
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import gensim
from torch import nn as nn

from config import SBERT_MODEL_NAME
from utils.types import FolkLoreData, FolkLoreEmb, FolkLoreEmbCoarse

nlp = en_core_web_sm.load()
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)

from transformers import LongformerModel, LongformerTokenizerFast, LongformerConfig

LFconfig = LongformerConfig.from_pretrained('allenai/longformer-base-4096')
LF_model = LongformerModel.from_pretrained('allenai/longformer-base-4096', config=LFconfig)
LF_tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
LF_tokenizer.model_max_length = LF_model.config.max_position_embeddings


class MatrixVectorScaledDotProductAttention(nn.Module):

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, q, k, v, mask=None):
        """
        q: tensor of shape (n*b, d_k)
def __init__(self, config):
    super(LongformerQA, self).__init__(config)
    self.longformer = LongformerModel(config)
    self.qa_outputs = torch.nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()
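# Hedged sketch of the usual extractive-QA forward pass (an assumption; the original
# forward is not shown): project each token to start/end logits and split the last
# dimension, mirroring the standard *ForQuestionAnswering heads. Assumes config.num_labels == 2.
def forward(self, input_ids, attention_mask=None):
    sequence_output = self.longformer(input_ids, attention_mask=attention_mask)[0]
    logits = self.qa_outputs(sequence_output)          # (batch, seq_len, 2)
    start_logits, end_logits = logits.split(1, dim=-1)
    return start_logits.squeeze(-1), end_logits.squeeze(-1)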
def __init__(self, config_path):
    config = configparser.ConfigParser()
    config.read(config_path)

    self.n_epoch = config.getint("general", "n_epoch")
    self.batch_size = config.getint("general", "batch_size")
    self.train_bert = config.getboolean("general", "train_bert")
    self.lr = config.getfloat("general", "lr")
    self.cut_frac = config.getfloat("general", "cut_frac")
    self.log_dir = Path(config.get("general", "log_dir"))
    if not self.log_dir.exists():
        self.log_dir.mkdir(parents=True)
    self.model_save_freq = config.getint("general", "model_save_freq")
    self.device = "cuda" if torch.cuda.is_available() else "cpu"

    # bert_config_path = config.get("bert", "config_path")
    # bert_tokenizer_path = config.get("bert", "tokenizer_path")
    # bert_model_path = config.get("bert", "model_path")
    self.bert_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    # self.bert_tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_path)
    tkzer_save_dir = self.log_dir / "tokenizer"
    if not tkzer_save_dir.exists():
        tkzer_save_dir.mkdir()
    self.bert_tokenizer.save_pretrained(tkzer_save_dir)

    self.bert_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
    self.bert_config = self.bert_model.config
    # self.bert_config = BertConfig.from_pretrained(bert_config_path)
    # self.bert_model = BertModel.from_pretrained(bert_model_path, config=self.bert_config)
    self.max_seq_length = self.bert_config.max_position_embeddings - 2
    # self.max_seq_length = self.bert_config.max_position_embeddings
    self.bert_model.to(self.device)
    if self.train_bert:
        self.bert_model.train()
    else:
        self.bert_model.eval()

    train_conll_path = config.get("data", "train_path")
    print("train path", train_conll_path)
    assert Path(train_conll_path).exists()
    dev_conll_path = config.get("data", "dev_path")
    print("dev path", dev_conll_path)
    assert Path(dev_conll_path).exists()
    dev1_conll_path = Path(dev_conll_path) / "1"
    print("dev1 path", dev1_conll_path)
    assert dev1_conll_path.exists()
    dev2_conll_path = Path(dev_conll_path) / "2"
    print("dev2 path", dev2_conll_path)
    assert dev2_conll_path.exists()

    self.train_dataset = ConllDataset(train_conll_path)
    # self.dev_dataset = ConllDataset(dev_conll_path)
    self.dev1_dataset = ConllDataset(dev1_conll_path)
    self.dev2_dataset = ConllDataset(dev2_conll_path)
    if self.batch_size == -1:
        self.batch_size = len(self.train_dataset)

    self.scaler = torch.cuda.amp.GradScaler()
    tb_cmt = f"lr_{self.lr}_cut-frac_{self.cut_frac}"
    self.writer = SummaryWriter(log_dir=self.log_dir, comment=tb_cmt)
def __init__(self, task_configs=[], device='cpu', finetuning=True, lm='bert', bert_pt=None, bert_path=None):
    super().__init__()

    assert len(task_configs) > 0

    # load the model or model checkpoint
    if bert_path == None:
        if lm == 'bert':
            self.bert = BertModel.from_pretrained(model_ckpts[lm])
        elif lm == 'distilbert':
            self.bert = DistilBertModel.from_pretrained(model_ckpts[lm])
        elif lm == 'albert':
            self.bert = AlbertModel.from_pretrained(model_ckpts[lm])
        elif lm == 'xlnet':
            self.bert = XLNetModel.from_pretrained(model_ckpts[lm])
        elif lm == 'roberta':
            self.bert = RobertaModel.from_pretrained(model_ckpts[lm])
        elif lm == 'longformer':
            self.bert = LongformerModel.from_pretrained(model_ckpts[lm])
    else:
        output_model_file = bert_path
        model_state_dict = torch.load(output_model_file,
                                      map_location=lambda storage, loc: storage)
        if lm == 'bert':
            self.bert = BertModel.from_pretrained(model_ckpts[lm], state_dict=model_state_dict)
        elif lm == 'distilbert':
            self.bert = DistilBertModel.from_pretrained(model_ckpts[lm], state_dict=model_state_dict)
        elif lm == 'albert':
            self.bert = AlbertModel.from_pretrained(model_ckpts[lm], state_dict=model_state_dict)
        elif lm == 'xlnet':
            self.bert = XLNetModel.from_pretrained(model_ckpts[lm], state_dict=model_state_dict)
        elif lm == 'roberta':
            self.bert = RobertaModel.from_pretrained(model_ckpts[lm], state_dict=model_state_dict)

    self.device = device
    self.finetuning = finetuning
    self.task_configs = task_configs
    self.module_dict = nn.ModuleDict({})
    self.lm = lm

    # hard coded for now
    hidden_size = 768
    hidden_dropout_prob = 0.1

    for config in task_configs:
        name = config['name']
        task_type = config['task_type']
        vocab = config['vocab']

        if task_type == 'tagging':
            # for tagging
            vocab_size = len(vocab)
            # 'O' and '<PAD>'
            if 'O' not in vocab:
                vocab_size += 1
            if '<PAD>' not in vocab:
                vocab_size += 1
        else:
            # for pairing and classification
            vocab_size = len(vocab)

        self.module_dict['%s_dropout' % name] = nn.Dropout(hidden_dropout_prob)
        self.module_dict['%s_fc' % name] = nn.Linear(hidden_size, vocab_size)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)  # TODO: use random word ID.
    # TODO: simulate masked word
    global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device)
    if num_global_tokens > 0:
        global_token_index = list(range(num_global_tokens))
        global_attention_mask[:, global_token_index] = 1

    # TODO: support more inputs like token_type_ids, position_ids
    return input_ids, attention_mask, global_attention_mask


args = parse_arguments()

model_name = args.model
onnx_model_path = model_name + ".onnx"

from transformers import LongformerModel
model = LongformerModel.from_pretrained(MODELS[model_name])  # pretrained model name or directory

input_ids, attention_mask, global_attention_mask = get_dummy_inputs(sequence_length=args.sequence_length,
                                                                    num_global_tokens=args.global_length,
                                                                    device=torch.device('cpu'))
example_outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)


# A new function to replace LongformerSelfAttention.forward
# For transformers 4.0
def my_longformer_self_attention_forward_4(self,
                                           hidden_states,
                                           attention_mask=None,
                                           is_index_masked=None,
                                           is_index_global_attn=None,
                                           is_global_attn=None):
    # TODO: move mask calculation to LongformerModel class to avoid calculating it again and again in each layer.
    global_mask = is_index_global_attn.int()
    torch.masked_fill(attention_mask, is_index_global_attn, 0.0)

    weight = torch.stack((self.query.weight.transpose(0, 1),
                          self.key.weight.transpose(0, 1),
                          self.value.weight.transpose(0, 1)), dim=1)
    weight = weight.reshape(self.embed_dim, 3 * self.embed_dim)
def __init__(self, config):
    super().__init__(config)
    self.longformer = LongformerModel(config)
    self.init_weights()
def test_layer_attn_probs(self):
    model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
    model.eval()
    layer = model.encoder.layer[0].attention.self.to(torch_device)
    hidden_states = torch.cat([self._get_hidden_states(), self._get_hidden_states() - 0.5], dim=0)
    batch_size, seq_length, hidden_size = hidden_states.size()
    attention_mask = torch.zeros((batch_size, seq_length), dtype=torch.float32, device=torch_device)

    # create attn mask
    attention_mask[0, -2:] = 10000.0
    attention_mask[0, -1:] = -10000.0
    attention_mask[1, 1:] = 10000.0

    is_index_masked = attention_mask < 0
    is_index_global_attn = attention_mask > 0
    is_global_attn = is_index_global_attn.flatten().any().item()

    output_hidden_states, local_attentions, global_attentions = layer(
        hidden_states,
        attention_mask=attention_mask,
        is_index_masked=is_index_masked,
        is_index_global_attn=is_index_global_attn,
        is_global_attn=is_global_attn,
        output_attentions=True,
    )

    self.assertEqual(local_attentions.shape, (2, 4, 2, 8))
    self.assertEqual(global_attentions.shape, (2, 2, 3, 4))

    # All tokens with global attention have weight 0 in local attentions.
    self.assertTrue(torch.all(local_attentions[0, 2:4, :, :] == 0))
    self.assertTrue(torch.all(local_attentions[1, 1:4, :, :] == 0))

    # The weight of all tokens with local attention must sum to 1.
    self.assertTrue(torch.all(torch.abs(global_attentions[0, :, :2, :].sum(dim=-1) - 1) < 1e-6))
    self.assertTrue(torch.all(torch.abs(global_attentions[1, :, :1, :].sum(dim=-1) - 1) < 1e-6))

    self.assertTrue(
        torch.allclose(
            local_attentions[0, 0, 0, :],
            torch.tensor(
                [0.3328, 0.0000, 0.0000, 0.0000, 0.0000, 0.3355, 0.3318, 0.0000],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )

    self.assertTrue(
        torch.allclose(
            local_attentions[1, 0, 0, :],
            torch.tensor(
                [0.2492, 0.2502, 0.2502, 0.0000, 0.0000, 0.2505, 0.0000, 0.0000],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )

    # All the global attention weights must sum to 1.
    self.assertTrue(torch.all(torch.abs(global_attentions.sum(dim=-1) - 1) < 1e-6))

    self.assertTrue(
        torch.allclose(
            global_attentions[0, 0, 1, :],
            torch.tensor(
                [0.2500, 0.2500, 0.2500, 0.2500],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )

    self.assertTrue(
        torch.allclose(
            global_attentions[1, 0, 0, :],
            torch.tensor(
                [0.2497, 0.2500, 0.2499, 0.2504],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels

    self.roberta = LongformerModel(config)
    self.classifier = RobertaClassificationHead(config)
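# Hedged sketch of a matching forward pass (an assumption; the original forward is not
# shown): RobertaClassificationHead consumes the full sequence output and pools the <s>
# token internally, so only the hidden states need to be passed through it.
def forward(self, input_ids, attention_mask=None):
    sequence_output = self.roberta(input_ids, attention_mask=attention_mask)[0]
    logits = self.classifier(sequence_output)
    return logits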