import copy
import logging

import torch
from transformers import (
    AutoTokenizer,
    BertConfig,
    BertForMaskedLM,
    BertTokenizerFast,
    RobertaForMaskedLM,
    RobertaTokenizer,
    RobertaTokenizerFast,
)
from transformers.modeling_longformer import LongformerSelfAttention

logger = logging.getLogger(__name__)


def convert_to_long_model(model_name, tokenizer_name, save_model_to, attention_window, max_pos):
    """
    Starting from the roberta-base checkpoint, convert the model into an instance of RobertaLong.

    Args:
        model_name (str): name or path of the pretrained RoBERTa checkpoint
        tokenizer_name (str): name or path of the matching tokenizer
        save_model_to (str): path to output dir
        attention_window (int): size of the sliding attention window used by LongformerSelfAttention
        max_pos (int): max model position before adding the extra 2 positions reserved by RoBERTa

    Returns:
        (transformers.RobertaForMaskedLM, transformers.RobertaTokenizerFast):
            converted RoBERTa model (with LM head on top) and its tokenizer
    """
    model = RobertaForMaskedLM.from_pretrained(model_name)
    tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_name, model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer

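
# A minimal usage sketch for the converter above (not part of the original code).
# The output directory name is a hypothetical placeholder; attention_window=512
# and max_pos=4096 match the defaults used throughout these variants.
long_model, long_tokenizer = convert_to_long_model(
    model_name='roberta-base',
    tokenizer_name='roberta-base',
    save_model_to='roberta-base-4096',  # hypothetical output path
    attention_window=512,
    max_pos=4096,
)
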
def create_long_model(model_type, model, tokenizer, config, attention_window=512, max_pos=4096):
    """Convert RoBERTa to Longformer.

    For other model types such as BERT, replacing `model.encoder.layer.attention.self`
    with `LongformerSelfAttention()` is not supported at this time.
    """
    from transformers.modeling_longformer import LongformerSelfAttention

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.embeddings.position_embeddings.weight.shape
    if model_type in ['roberta']:
        max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 0
    step = current_max_pos
    b = 0
    if model_type in ['roberta']:
        # NOTE: RoBERTa has positions 0,1 reserved
        k = 2
        step = current_max_pos - 2
        b = 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.embeddings.position_embeddings.weight[b:]
        k += step
    model.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    return model, tokenizer, config

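
# A minimal usage sketch for the variant above, which takes already-loaded objects
# instead of checkpoint names (not part of the original code). Because it indexes
# `model.embeddings` and `model.encoder` directly, it expects a base encoder such
# as RobertaModel rather than a *ForMaskedLM wrapper; the checkpoint name and the
# variable names are illustrative assumptions.
from transformers import RobertaModel

base_model = RobertaModel.from_pretrained('roberta-base')
base_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
long_model, long_tokenizer, long_config = create_long_model(
    'roberta', base_model, base_tokenizer, base_model.config,
    attention_window=512, max_pos=4096)
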
def create_long_model(save_model_to, attention_window, max_pos):
    model = BertForMaskedLM.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
    config = model.config
    tokenizer = BertTokenizerFast.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", model_max_length=max_pos)

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.bert.embeddings.position_embeddings.weight.shape
    # max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved; BERT does not, so no offset is added here
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.bert.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)
    # re-register the position_ids buffer so it matches the extended embedding table
    model.bert.embeddings.register_buffer(
        "position_ids",
        torch.arange(config.max_position_embeddings).expand((1, -1)),
    )

    # copy position embeddings over and over to initialize the new position embeddings
    k = 0
    step = current_max_pos
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.bert.embeddings.position_embeddings.weight
        k += step
    model.bert.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.bert.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer

def create_long_model(model_name, save_model_to, attention_window, max_pos):
    model = BertForMaskedLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.bert.embeddings.position_embeddings.weight.shape
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.bert.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 0
    step = current_max_pos
    while k < max_pos - 1:
        # copy only as many rows as still fit; with the original `k + step < max_pos`
        # check, the final block of the new (uninitialized) matrix was never filled
        end = min(k + step, max_pos)
        new_pos_embed[k:end] = model.bert.embeddings.position_embeddings.weight[:end - k]
        k += step
    model.bert.embeddings.position_embeddings.weight.data = new_pos_embed

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.bert.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer

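
# A minimal usage sketch for the BERT-flavoured variant above (not part of the
# original code). The checkpoint name and output directory are illustrative;
# any BERT-style checkpoint exposing `model.bert` should behave the same way.
bert_long, bert_long_tokenizer = create_long_model(
    model_name='bert-base-uncased',
    save_model_to='bert-base-uncased-4096',  # hypothetical output path
    attention_window=512,
    max_pos=4096,
)
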
def create_long_model(save_model_to, attention_window, max_pos):
    model = RobertaForMaskedLM.from_pretrained('roberta-base')
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed
    # model.roberta.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer

def create_long_model(save_model_to, attention_window, max_pos, pretrained_config,
                      pretrained_checkpoint, pretrained_tokenizer):
    """
    Convert a BERT-based RoBERTa checkpoint (e.g. RoBERTa_zh) into a long version.

    :param save_model_to: the model save path
    :param attention_window: the local attention window size defined above
    :param max_pos: extend the position embeddings to max_pos=4096
    :param pretrained_config: name or path of the pretrained config
    :param pretrained_checkpoint: name or path of the pretrained weights
    :param pretrained_tokenizer: name or path of the pretrained tokenizer
    :return: modified model and tokenizer
    """
    config = BertConfig.from_pretrained(pretrained_config)
    model = BertForMaskedLM.from_pretrained(pretrained_checkpoint, config=config)
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_tokenizer, model_max_length=max_pos)

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.bert.embeddings.position_embeddings.weight.shape
    # RoBERTa reserves positions 0 and 1 (embedding size = max_pos + 2), but the
    # BERT-based RoBERTa_zh checkpoint does not, so the offset is not added here:
    # max_pos += 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.bert.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # initialize by duplicating the original position embeddings
    k = 0
    step = current_max_pos
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.bert.embeddings.position_embeddings.weight[0:]
        k += step
    model.bert.embeddings.position_embeddings.weight.data = new_pos_embed
    # NOTE: BERT-based RoBERTa has no `position_ids` attribute on `bert.embeddings`,
    # so the RoBERTa-style update below is not applicable:
    # model.bert.embeddings.position_ids.data = torch.tensor([i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.bert.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer

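
# A minimal usage sketch for the variant above (not part of the original code);
# the local checkpoint directory and output directory are hypothetical placeholders
# for a downloaded BERT-based RoBERTa_zh checkpoint.
zh_long_model, zh_long_tokenizer = create_long_model(
    save_model_to='roberta_zh_long_4096',  # hypothetical output dir
    attention_window=512,
    max_pos=4096,
    pretrained_config='./roberta_zh',      # hypothetical local checkpoint dir
    pretrained_checkpoint='./roberta_zh',
    pretrained_tokenizer='./roberta_zh',
)
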
def create_long_model(model_specified, attention_window, max_pos, save_model_to):
    """Starting from the `roberta-base` (or similar) checkpoint, convert the model into an
    instance of `RobertaLong`. It makes the following changes:

    1) extend the position embeddings from `512` positions to `max_pos`. In Longformer,
       we set `max_pos=4096`.
    2) initialize the additional position embeddings by copying the embeddings of the first
       `512` positions. This initialization is crucial for model performance (check table 6
       in [the paper](https://arxiv.org/pdf/2004.05150.pdf) for performance without it).
    3) replace the `modeling_bert.BertSelfAttention` objects with
       `modeling_longformer.LongformerSelfAttention` using an attention window of size
       `attention_window`.

    The output of this function works for long documents even without pretraining. Check
    tables 6 and 11 in [the paper](https://arxiv.org/pdf/2004.05150.pdf) to get a sense of
    the expected performance of this model before pretraining.
    """
    model = RobertaForMaskedLM.from_pretrained(
        model_specified)  # ,gradient_checkpointing=True
    tokenizer = RobertaTokenizer.from_pretrained(model_specified, model_max_length=max_pos)
    config = model.config

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    current_max_pos, embed_size = model.roberta.embeddings.position_embeddings.weight.shape
    max_pos += 2  # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    config.max_position_embeddings = max_pos
    assert max_pos > current_max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = model.roberta.embeddings.position_embeddings.weight.new_empty(
        max_pos, embed_size)

    # copy position embeddings over and over to initialize the new position embeddings
    k = 2
    step = current_max_pos - 2
    while k < max_pos - 1:
        new_pos_embed[k:(k + step)] = model.roberta.embeddings.position_embeddings.weight[2:]
        k += step
    model.roberta.embeddings.position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_embeddings.num_embeddings = len(new_pos_embed.data)

    # # first, check that model.roberta.embeddings.position_embeddings.weight.data.shape is
    # # correct - has to be 4096 (default) or your desired length
    # model.roberta.embeddings.position_ids = torch.arange(
    #     0, model.roberta.embeddings.position_embeddings.num_embeddings
    # )[None]
    model.roberta.embeddings.position_ids.data = torch.tensor(
        [i for i in range(max_pos)]).reshape(1, max_pos)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value = copy.deepcopy(layer.attention.self.value)

        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer

def create_long_model(
    save_model_to, model, tokenizer, attention_window, model_max_length
):
    config = model.config
    position_embeddings = model.roberta.embeddings.position_embeddings

    tokenizer.model_max_length = model_max_length
    tokenizer.init_kwargs['model_max_length'] = model_max_length
    current_model_max_length, embed_size = position_embeddings.weight.shape

    # NOTE: RoBERTa has positions 0,1 reserved
    # embedding size is max position + 2
    model_max_length += 2
    config.max_position_embeddings = model_max_length
    assert model_max_length > current_model_max_length, \
        "New model max_length must be longer than current max_length"

    # BUG for XLM: need to make it all zeros since the base model is too large
    new_pos_embed = position_embeddings.weight.new_zeros(
        model_max_length, embed_size
    )

    k = 2
    step = current_model_max_length - 2
    while k < model_max_length - 1:
        new_pos_embed[k:(k + step)] = position_embeddings.weight[2:]
        k += step

    # HACK for Huggingface transformers >=3.4.0 and < 4.0
    # https://github.com/huggingface/transformers/issues/6465#issuecomment-719042969
    position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_embeddings.num_embeddings = len(new_pos_embed.data)
    num_model_embeddings = position_embeddings.num_embeddings
    model.roberta.embeddings.position_ids = torch.arange(0, num_model_embeddings)[None]

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        longformer_self_attn.query_global = layer.attention.self.query
        longformer_self_attn.key_global = layer.attention.self.key
        longformer_self_attn.value_global = layer.attention.self.value

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer

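
# The converters above save a checkpoint whose self-attention weights are
# LongformerSelfAttention parameters but whose config is still RoBERTa-style, so
# loading it back generally needs a small wrapper class, as in the Longformer
# conversion recipe referenced in the docstrings. This is a minimal sketch (not
# part of the original code): the class name is an assumption, and the import
# path matches the transformers 3.x version used in these variants.
from transformers import RobertaForMaskedLM
from transformers.modeling_longformer import LongformerSelfAttention


class RobertaLongForMaskedLM(RobertaForMaskedLM):
    """RobertaForMaskedLM whose self-attention layers are LongformerSelfAttention,
    so that a checkpoint produced by the converters above can be reloaded."""

    def __init__(self, config):
        super().__init__(config)
        for i, layer in enumerate(self.roberta.encoder.layer):
            # config.attention_window was stored in the saved config by the converter
            layer.attention.self = LongformerSelfAttention(config, layer_id=i)


# usage (output directory is hypothetical):
# long_model = RobertaLongForMaskedLM.from_pretrained('roberta-base-4096')
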