def test_longformer(self):
    config = LongformerConfig()
    config.attention_mode = "n2"
    config.attention_window = [256] * 12
    config.attention_dilation = [1] * 12
    longformer = Longformer(config)
    encoder = TransformerEncoderBuilder.from_kwargs(
        n_layers=12,
        n_heads=12,
        query_dimensions=64,
        value_dimensions=64,
        feed_forward_dimensions=3072,
        attention_type="full",
        final_normalization=False,
        activation="gelu"
    ).get()
    longformer.eval()
    encoder.eval()

    # Before the weight copy they should be different
    x = torch.rand(3, 10, 768)
    o1 = longformer.encoder(x, head_mask=[None] * 12)[0]
    o2 = encoder(x)
    self.assertGreater(torch.abs(o1 - o2).max().item(), 1)

    # And after the copy they should be exactly the same
    encoder.load_state_dict(LongformerMapper().map(
        longformer.encoder.state_dict()))
    o1 = longformer.encoder(x, head_mask=[None] * 12)[0]
    o2 = encoder(x)
    self.assertLess(torch.abs(o1 - o2).max().item(), 1e-4)
def test_selfattention(self):
    np.random.seed(1)
    random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    torch.cuda.manual_seed_all(1)

    seqlen = 1024
    embed_dim = 60
    num_heads = 3
    bsz = 3

    config = LongformerConfig()
    config.num_attention_heads = num_heads
    config.hidden_size = embed_dim
    config.attention_probs_dropout_prob = 0.0
    config.attention_window = [256]
    config.attention_dilation = [1]
    config.attention_mode = 'sliding_chunks'
    config.autoregressive = False

    attn = LongformerSelfAttention(config=config, layer_id=0)
    attn = attn.cuda()

    hidden_state = torch.randn(bsz, seqlen, embed_dim)
    attention_mask = torch.zeros((bsz, 1, 1, seqlen), dtype=torch.int)  # local attention everywhere

    # test None attention_mask (default which is local attention everywhere)
    output_nonemask = self._run_test(attn, hidden_state, None)
    output = self._run_test(attn, hidden_state, attention_mask)
    self.assertTrue(torch.allclose(output, output_nonemask, atol=1e-7))

    # test padding
    attention_mask[:, :, :, -10:] = -1
    self._run_test(attn, hidden_state, attention_mask)

    # test same global attention on all examples
    attention_mask[:, :, :, :10] = 1
    self._run_test(attn, hidden_state, attention_mask)

    # test same number of global attention but different locations
    attention_mask[:] = 0
    attention_mask[:, :, :, -10:] = -1
    attention_mask[0, :, :, :10] = 1
    attention_mask[1, :, :, 5:15] = 1
    attention_mask[2, :, :, 10:20] = 1
    self._run_test(attn, hidden_state, attention_mask)

    # test variable number of global attention
    attention_mask[:] = 0
    attention_mask[:, :, :, -10:] = -1
    attention_mask[0, :, :, 5:15] = 1
    attention_mask[2, :, :, 13:17] = 1
    self._run_test(attn, hidden_state, attention_mask)
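# The test above relies on a `_run_test` helper that is not part of this snippet.
# A minimal sketch of such a helper is given below (hypothetical, assuming the
# LongformerSelfAttention forward signature that accepts `hidden_states` and an
# `attention_mask` of shape (bsz, 1, 1, seqlen) and returns a tuple whose first
# element is the context layer):
def _run_test(self, attn, hidden_state, attention_mask):
    hidden_state = hidden_state.cuda()
    if attention_mask is not None:
        attention_mask = attention_mask.cuda()
    # forward pass through the sliding-window self-attention layer
    output = attn(hidden_state, attention_mask=attention_mask)[0]
    # the output keeps the input shape: (bsz, seqlen, embed_dim)
    self.assertEqual(output.shape, hidden_state.shape)
    return output.cpu()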
def __init__(self):
    super(Model, self).__init__()
    self.config = LongformerConfig.from_pretrained('./longformer_pretrain')
    self.config.attention_mode = 'sliding_chunks'
    self.longformer = Longformer.from_pretrained('./longformer_pretrain', config=self.config)
    self.output = nn.Linear(self.config.hidden_size, 2)
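# A forward pass is not included in the snippet above; a possible sketch is shown
# below (hypothetical), assuming the common pattern of classifying from the
# Longformer representation of the first (<s>) token:
def forward(self, input_ids, attention_mask=None):
    # last hidden states of shape (batch, seq_len, hidden_size)
    sequence_output = self.longformer(input_ids, attention_mask=attention_mask)[0]
    # binary classification head on the first-token representation
    return self.output(sequence_output[:, 0, :])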
def __init__(self, num_labels, args):
    super().__init__()
    config = LongformerConfig.from_pretrained(args.model + '-4096/')
    config.attention_mode = 'sliding_chunks'
    self.longformer = Longformer.from_pretrained(args.model + '-4096/', config=config)
    self.dropout = nn.Dropout(self.longformer.config.hidden_dropout_prob)
    self.classifier = nn.Linear(self.longformer.config.hidden_size, num_labels)
def __init__(self, args):
    super(EncoderLayer, self).__init__()
    # self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    config = LongformerConfig()
    config.num_attention_heads = args.num_heads
    config.hidden_size = args.encoder_hidden_size
    config.attention_probs_dropout_prob = args.dropout
    config.attention_window = [args.window_size]
    config.attention_dilation = [1]  # No dilation
    config.attention_mode = 'tvm'
    config.output_attentions = True
    config.autoregressive = False
    self.self_attn = LongformerSelfAttention(config=config, layer_id=0)
    self.pos_ffn = PositionwiseFeedForward(
        args.encoder_hidden_size, args.encoder_hidden_size, dropout=args.dropout)
    self.layer_norm = nn.LayerNorm(args.encoder_hidden_size)
    self.dropout = nn.Dropout(args.dropout)
    self.fc = nn.Linear(args.encoder_hidden_size, args.encoder_hidden_size)
    nn.init.xavier_normal_(self.fc.weight)
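# The snippet above only builds the sub-modules; one plausible way to compose them
# in a forward pass is sketched below. This is hypothetical: the ordering of the
# residual connection, projection, layer norm and feed-forward is an assumption,
# not taken from the original code.
def forward(self, enc_input, attention_mask=None):
    residual = enc_input
    # sliding-window self-attention; the first tuple element is the context layer
    attn_output = self.self_attn(enc_input, attention_mask=attention_mask)[0]
    output = self.layer_norm(residual + self.dropout(self.fc(attn_output)))
    # position-wise feed-forward
    return self.pos_ffn(output)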
def _run_test(self, device, dtype, attention_mode):
    config = LongformerConfig.from_pretrained(
        '/net/s3/s2-research/beltagy/longformer/model_release/longformer-base-4096/config.json')
    config.attention_mode = attention_mode
    model = Longformer.from_pretrained(
        '/net/s3/s2-research/beltagy/longformer/model_release/longformer-base-4096/pytorch_model.bin',
        config=config)
    model = model.eval()

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenizer.model_max_length = 4096

    SAMPLE_TEXT = ' '.join(['Hello world! '] * 1025)  # long input document
    token_ids = tokenizer.encode(SAMPLE_TEXT)
    token_ids = token_ids[:4095] + token_ids[-1:]
    input_ids = torch.tensor(token_ids).unsqueeze(0)
    input_ids = input_ids.to(device=device)
    model = model.to(device=device, dtype=dtype)

    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    attention_mask[:, [1, 4, 21]] = 2

    output = model(input_ids, attention_mask=attention_mask)[0]
    output = output.float().sum()

    expected_output_sum = torch.tensor(
        76193.671875, device=device)  # with no padding needed, and fixed roberta-tokenizer
    print(f'device: {device}, dtype: {dtype}, attention_mode: {attention_mode} '
          f'Expected: {expected_output_sum}, Given: {output.sum()}')
    atol = 1e-2 if dtype == torch.half else 1e-4
    self.assertTrue(torch.allclose(output.sum(), expected_output_sum, atol=atol))
def __init__(self, init_args):
    super().__init__()
    if isinstance(init_args, dict):
        # for loading the checkpoint, pl passes a dict (hparams are saved as dict)
        init_args = Namespace(**init_args)
    config_path = init_args.config_path or init_args.model_dir
    checkpoint_path = init_args.checkpoint_path or init_args.model_dir
    logger.info(
        f'loading model from config: {config_path}, checkpoint: {checkpoint_path}')
    config = LongformerConfig.from_pretrained(config_path)
    config.attention_mode = init_args.attention_mode
    logger.info(f'attention mode set to {config.attention_mode}')
    self.model_config = config
    self.model = Longformer.from_pretrained(checkpoint_path, config=config)
    self.tokenizer = BertTokenizer.from_pretrained(init_args.tokenizer)
    self.tokenizer.model_max_length = self.model.config.max_position_embeddings
    self.hparams = init_args
    self.hparams.seqlen = self.model.config.max_position_embeddings
    self.classifier = nn.Linear(config.hidden_size, init_args.num_labels)
def test_something(self):
    config = LongformerConfig.from_pretrained(self.model_dir)
    # choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
    # 'n2': regular n2 attention
    # 'tvm': a custom CUDA kernel implementation of our sliding window attention
    # 'sliding_chunks': a PyTorch implementation of our sliding window attention
    config.attention_mode = 'sliding_chunks'

    model = Longformer.from_pretrained(self.model_dir, config=config)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenizer.model_max_length = model.config.max_position_embeddings

    SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
    input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

    # TVM code doesn't work on CPU. Uncomment this if `config.attention_mode = 'tvm'`
    # model = model.cuda(); input_ids = input_ids.cuda()

    # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
    attention_mask = torch.ones(
        input_ids.shape, dtype=torch.long, device=input_ids.device)  # initialize to local attention
    attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
                                       # classification: the <s> token
                                       # QA: question tokens

    # padding seqlen to the nearest multiple of 512. Needed for the 'sliding_chunks' attention
    input_ids, attention_mask = pad_to_window_size(
        input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

    output = model(input_ids, attention_mask=attention_mask)[0]
    # could have done more here....
    self.assertIsNotNone(output)
import torch
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer

config = LongformerConfig.from_pretrained('downloads/longformer-base-4096/')
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': regular n2 attention
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
# config.attention_mode = 'n2'
config.attention_mode = 'tvm'
# config.attention_mode = 'sliding_chunks'

model = Longformer.from_pretrained('downloads/longformer-base-4096/', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# TVM code doesn't work on CPU; move the model and inputs to the GPU for `config.attention_mode = 'tvm'`
model = model.cuda(); input_ids = input_ids.cuda()

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)  # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
                                   # classification: the <s> token
                                   # QA: question tokens
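# The snippet stops here. Following the same pattern as the other examples in this
# collection, the remaining steps would pad the sequence to a multiple of the
# attention window and run the model (sketch, not part of the original snippet):
input_ids, attention_mask = pad_to_window_size(
    input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)
output = model(input_ids, attention_mask=attention_mask)[0]  # last hidden states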
# model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
#
# input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
# print(input_ids.shape)
# outputs = model(input_ids)
#
# pooled_output = torch.mean(outputs[0], dim=1)
#
# last_hidden_states = outputs[0]

import torch
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaTokenizer

config = LongformerConfig.from_pretrained('longformer-base-4096/')
# choose the attention mode 'n2', 'tvm' or 'sliding_chunks'
# 'n2': regular n2 attention
# 'tvm': a custom CUDA kernel implementation of our sliding window attention
# 'sliding_chunks': a PyTorch implementation of our sliding window attention
config.attention_mode = 'sliding_chunks'

model = Longformer.from_pretrained('longformer-base-4096/', config=config)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.model_max_length = model.config.max_position_embeddings

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
SAMPLE_TEXT = f'{tokenizer.cls_token}{SAMPLE_TEXT}{tokenizer.eos_token}'
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
# -*- coding: utf-8 -*-
"""
@Time : 2020/11/12 15:59
@Auth : xiaolu
@File : test2.py
@IDE  : PyCharm
@Email: [email protected]
"""
import torch
from pdb import set_trace
from transformers import BertTokenizer, AdamW
from longformer.longformer import Longformer, LongformerConfig
from longformer.sliding_chunks import pad_to_window_size

config = LongformerConfig.from_pretrained('./longformer_pretrain')
config.attention_mode = 'sliding_chunks'
model = Longformer.from_pretrained('./longformer_pretrain', config=config)

tokenizer = BertTokenizer.from_pretrained('./longformer_pretrain/vocab.txt')
tokenizer.model_max_length = model.config.max_position_embeddings

# a Chinese sentence ("You are the dream I am afraid to gain and lose") repeated
# 200 times to build a long sample input
input_text = '你是我患得患失的梦' * 200
input_ids = torch.tensor(tokenizer.encode(input_text)).unsqueeze(0)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
print(input_ids.size())

# pad to a multiple of the attention window; the original line was truncated here,
# so the last argument is assumed to be the tokenizer's pad token id, as in the
# other examples in this collection
input_ids, attention_mask = pad_to_window_size(
    input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, MafiaDataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    num_labels = 2

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = LongformerConfig.from_pretrained('longformer-base-4096/')
    config.num_labels = num_labels
    config.attention_mode = 'sliding_chunks'
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenizer.model_max_length = config.max_position_embeddings
    longformer = Longformer.from_pretrained('longformer-base-4096/', config=config)
    model = LongformerForSequenceClassification(config, longformer)

    # Get datasets
    train_dataset = MafiascumDataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = MafiascumDataset(
        data_args, tokenizer=tokenizer, mode="dev") if training_args.do_eval else None
    test_dataset = MafiascumDataset(
        data_args, tokenizer=tokenizer, mode="test") if training_args.do_predict else None

    def compute_metrics(p: EvalPrediction) -> Dict:
        def simple_accuracy(preds, labels):
            return (preds == labels).mean()

        def acc_and_f1(preds, labels):
            acc = simple_accuracy(preds, labels)
            f1 = f1_score(y_true=labels, y_pred=preds)
            return {
                "acc": acc,
                "f1": f1,
                "acc_and_f1": (acc + f1) / 2,
            }

        preds = np.argmax(p.predictions, axis=1)
        return acc_and_f1(preds, p.label_ids)

    # import torch_xla.core.xla_model as xm
    # device = xm.xla_device(n=1)
    device = 'cuda'

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        device=device,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_datasets = [eval_dataset]
        for eval_dataset in eval_datasets:
            eval_result = trainer.evaluate(eval_dataset=eval_dataset).metrics
            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info(" %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))
            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            # For classification
            predictions = np.argmax(predictions, axis=1)
            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(
                        test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        item = test_dataset.get_labels()[item]
                        writer.write("%d\t%s\n" % (index, item))

    return eval_results
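# The script's entry point is not included in this excerpt; it presumably ends with
# the usual guard (assumed, not shown in the original):
if __name__ == "__main__":
    main()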
def __init__(
    self,
    input_size: int,
    output_size: int = 256,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    attention_dropout_rate: float = 0.0,
    input_layer: str = "conv2d",
    normalize_before: bool = True,
    concat_after: bool = False,
    positionwise_layer_type: str = "linear",
    positionwise_conv_kernel_size: int = 3,
    macaron_style: bool = False,
    rel_pos_type: str = "legacy",
    pos_enc_layer_type: str = "abs_pos",
    selfattention_layer_type: str = "lf_selfattn",
    activation_type: str = "swish",
    use_cnn_module: bool = True,
    zero_triu: bool = False,
    cnn_module_kernel: int = 31,
    padding_idx: int = -1,
    interctc_layer_idx: List[int] = [],
    interctc_use_conditioning: bool = False,
    attention_windows: list = [100, 100, 100, 100, 100, 100],
    attention_dilation: list = [1, 1, 1, 1, 1, 1],
    attention_mode: str = "sliding_chunks",
):
    assert check_argument_types()
    super().__init__(input_size)
    self._output_size = output_size

    activation = get_activation(activation_type)

    if pos_enc_layer_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    else:
        raise ValueError("incorrect or unknown pos_enc_layer: " +
                         pos_enc_layer_type + "Use abs_pos")

    if len(attention_dilation) != num_blocks:
        raise ValueError("incorrect attention_dilation parameter of length" +
                         str(len(attention_dilation)) +
                         " does not match num_blocks" + str(num_blocks))

    if len(attention_windows) != num_blocks:
        raise ValueError("incorrect attention_windows parameter of length" +
                         str(len(attention_windows)) +
                         " does not match num_blocks" + str(num_blocks))

    if attention_mode != "tvm" and max(attention_dilation) != 1:
        raise ValueError("incorrect attention mode for dilation: " +
                         attention_mode +
                         "Use attention_mode=tvm with Cuda Kernel")

    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(input_size, output_size),
            torch.nn.LayerNorm(output_size),
            torch.nn.Dropout(dropout_rate),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d2":
        self.embed = Conv2dSubsampling2(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d6":
        self.embed = Conv2dSubsampling6(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "conv2d8":
        self.embed = Conv2dSubsampling8(
            input_size,
            output_size,
            dropout_rate,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(output_size, positional_dropout_rate),
        )
    elif input_layer is None:
        self.embed = torch.nn.Sequential(
            pos_enc_class(output_size, positional_dropout_rate))
    else:
        raise ValueError("unknown input_layer: " + input_layer)

    self.normalize_before = normalize_before

    if positionwise_layer_type == "linear":
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
    elif positionwise_layer_type == "conv1d":
        positionwise_layer = MultiLayeredConv1d
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    elif positionwise_layer_type == "conv1d-linear":
        positionwise_layer = Conv1dLinear
        positionwise_layer_args = (
            output_size,
            linear_units,
            positionwise_conv_kernel_size,
            dropout_rate,
        )
    else:
        raise NotImplementedError("Support only linear or conv1d.")

    self.selfattention_layer_type = selfattention_layer_type
    if selfattention_layer_type == "lf_selfattn":
        assert pos_enc_layer_type == "abs_pos"
        from longformer.longformer import LongformerConfig
        from espnet.nets.pytorch_backend.transformer.longformer_attention import (
            LongformerAttention,
        )

        encoder_selfattn_layer = LongformerAttention
        config = LongformerConfig(
            attention_window=attention_windows,
            attention_dilation=attention_dilation,
            autoregressive=False,
            num_attention_heads=attention_heads,
            hidden_size=output_size,
            attention_probs_dropout_prob=dropout_rate,
            attention_mode=attention_mode,
        )
        encoder_selfattn_layer_args = (config,)
    else:
        raise ValueError("incompatible or unknown encoder_attn_layer: " +
                         selfattention_layer_type + " Use lf_selfattn")

    convolution_layer = ConvolutionModule
    convolution_layer_args = (output_size, cnn_module_kernel, activation)

    self.encoders = repeat(
        num_blocks,
        lambda layer_id: EncoderLayer(
            output_size,
            encoder_selfattn_layer(*(encoder_selfattn_layer_args + (layer_id,))),
            positionwise_layer(*positionwise_layer_args),
            positionwise_layer(*positionwise_layer_args) if macaron_style else None,
            convolution_layer(*convolution_layer_args) if use_cnn_module else None,
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(output_size)

    self.interctc_layer_idx = interctc_layer_idx
    if len(interctc_layer_idx) > 0:
        assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
    self.interctc_use_conditioning = interctc_use_conditioning
    self.conditioning_layer = None
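# ESPnet encoders normally also expose the configured output dimension; a minimal
# sketch of the accompanying accessor (assumed, not part of the original excerpt):
def output_size(self) -> int:
    return self._output_size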