def __init__(self, args, dictionary):
    super().__init__()
    embedding_dim = 768
    self.padding_idx = 1
    self.dense = nn.Linear(embedding_dim, embedding_dim)
    self.layer_norm = LayerNorm(embedding_dim)
    init_bert_params(self.dense)
    self.encoder = TransformerSentenceEncoder(
        padding_idx=1,
        vocab_size=50265,
        num_encoder_layers=12,
        embedding_dim=768,
        ffn_embedding_dim=3072,
        num_attention_heads=12,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        layerdrop=0.0,
        max_seq_len=512,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn="gelu",
        q_noise=0.0,
        qn_block_size=8,
    )
    embed_tokens = self.encoder.embed_tokens
    self.lm_head = RobertaLMHead(
        embed_dim=embedding_dim,
        output_dim=50265,
        activation_fn="gelu",
        weight=embed_tokens.weight,
    )
def __init__(self, dropout: float = 0.1):
    super(RobertaEncoderFast, self).__init__()
    self.fairseq_roberta = TransformerSentenceEncoder(
        padding_idx=1,
        vocab_size=32769,
        num_encoder_layers=12,
        embedding_dim=768,
        ffn_embedding_dim=3072,
        num_attention_heads=12,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        layerdrop=0.0,
        max_seq_len=512,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn="gelu",
        q_noise=0.0,
        qn_block_size=8,
    )
    # NOTE: the `dropout` argument is currently unused; the projection and
    # dropout wiring below is left commented out.
    # self.encode_proj = (
    #     nn.Linear(config.hidden_size, project_dim) if project_dim != 0 else None
    # )
    # if dropout != 0:
    #     self.fairseq_roberta.encoder.sentence_encoder = dropout
    from fairseq.modules.transformer_sentence_encoder import init_bert_params

    self.apply(init_bert_params)
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.args = args

    # RoBERTa is a sentence encoder model, so users will intuitively trim
    # encoder layers. However, the implementation uses the fairseq decoder,
    # so we fix here.
    if args.encoder_layers_to_keep:
        args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
        args.decoder_layers_to_keep = args.encoder_layers_to_keep
        args.encoder_layers_to_keep = None

    self.sentence_encoder = TransformerSentenceEncoder(
        padding_idx=dictionary.pad(),
        vocab_size=len(dictionary),
        num_encoder_layers=args.encoder_layers,
        embedding_dim=args.encoder_embed_dim,
        ffn_embedding_dim=args.encoder_ffn_embed_dim,
        num_attention_heads=args.encoder_attention_heads,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        layerdrop=args.encoder_layerdrop,
        max_seq_len=args.max_positions,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn=args.activation_fn,
    )
    self.lm_head = RobertaLMHead(
        embed_dim=args.encoder_embed_dim,
        output_dim=len(dictionary),
        activation_fn=args.activation_fn,
        weight=self.sentence_encoder.embed_tokens.weight,
    )
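# Tiny illustration (hypothetical values, not from the snippet above) of the
# layer-trimming logic: keeping encoder layers "0,2,4,6" of a 12-layer model
# leaves 4 layers, and the kept-layer spec is handed to the decoder-side option.
from argparse import Namespace

toy_args = Namespace(
    encoder_layers=12, encoder_layers_to_keep="0,2,4,6", decoder_layers_to_keep=None
)
if toy_args.encoder_layers_to_keep:
    toy_args.encoder_layers = len(toy_args.encoder_layers_to_keep.split(","))
    toy_args.decoder_layers_to_keep = toy_args.encoder_layers_to_keep
    toy_args.encoder_layers_to_keep = None
assert toy_args.encoder_layers == 4 and toy_args.decoder_layers_to_keep == "0,2,4,6"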
def __init__(self, args, dictionary):
    super().__init__()
    embedding_dim = 768
    self.padding_idx = 1
    self.dense = nn.Linear(embedding_dim, embedding_dim)
    self.layer_norm = LayerNorm(embedding_dim)
    init_bert_params(self.dense)
    self.encoder = TransformerSentenceEncoder(
        padding_idx=1,
        vocab_size=50265,
        num_encoder_layers=12,
        embedding_dim=768,
        ffn_embedding_dim=3072,
        num_attention_heads=12,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        layerdrop=0.0,
        max_seq_len=512,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn="gelu",
        q_noise=0.0,
        qn_block_size=8,
    )
    embed_tokens = self.encoder.embed_tokens
    self.lm_head = RobertaLMHead(
        embed_dim=embedding_dim,
        output_dim=50265,
        activation_fn="gelu",
        weight=embed_tokens.weight,
    )

    # args = base_architecture(args)
    if args.encoder_layers_to_keep:
        args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
    if args.decoder_layers_to_keep:
        args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
    if getattr(args, "max_source_positions", None) is None:
        args.max_source_positions = 512
    if getattr(args, "max_target_positions", None) is None:
        args.max_target_positions = 512

    print('???', embed_tokens.embedding_dim)
    self.decoder = TransformerDecoder(
        args,
        dictionary,
        embed_tokens,
        no_encoder_attn=getattr(args, "no_cross_attention", False),
    )
    self.class_num = args.num_classes
    self.classification_heads = RobertaClassificationHead(
        768,
        768,
        self.class_num,
        'tanh',
        0.0,
        0.0,
        8,
    )
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.padding_idx = dictionary.pad()
    self.vocab_size = dictionary.__len__()
    self.max_positions = args.max_positions

    self.sentence_encoder = TransformerSentenceEncoder(
        padding_idx=self.padding_idx,
        vocab_size=self.vocab_size,
        num_encoder_layers=args.encoder_layers,
        embedding_dim=args.encoder_embed_dim,
        ffn_embedding_dim=args.encoder_ffn_embed_dim,
        num_attention_heads=args.encoder_attention_heads,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.act_dropout,
        max_seq_len=self.max_positions,
        num_segments=args.num_segment,
        use_position_embeddings=not args.no_token_positional_embeddings,
        encoder_normalize_before=args.encoder_normalize_before,
        apply_bert_init=args.apply_bert_init,
        activation_fn=args.activation_fn,
        learned_pos_embedding=args.encoder_learned_pos,
        add_bias_kv=args.bias_kv,
        add_zero_attn=args.zero_attn,
    )

    self.share_input_output_embed = args.share_encoder_input_output_embed
    self.embed_out = None
    self.sentence_projection_layer = None
    self.sentence_out_dim = args.sentence_class_num
    self.lm_output_learned_bias = None

    # Remove head is set to true during fine-tuning
    self.load_softmax = not getattr(args, 'remove_head', False)

    self.masked_lm_pooler = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
    self.pooler_activation = utils.get_activation_fn(args.pooler_activation_fn)

    self.lm_head_transform_weight = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
    self.activation_fn = utils.get_activation_fn(args.activation_fn)
    self.layer_norm = LayerNorm(args.encoder_embed_dim)

    self.lm_output_learned_bias = None
    if self.load_softmax:
        self.lm_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size))

        if not self.share_input_output_embed:
            self.embed_out = nn.Linear(args.encoder_embed_dim, self.vocab_size, bias=False)

        if args.sent_loss:
            self.sentence_projection_layer = nn.Linear(
                args.encoder_embed_dim, self.sentence_out_dim, bias=False
            )
def __init__(self, args, dictionary, share_embed_tokens):
    super().__init__(dictionary)
    self.args = args
    self.sentence_encoder = TransformerSentenceEncoder(
        padding_idx=dictionary.pad(),
        vocab_size=len(dictionary),
        num_encoder_layers=args.encoder_layers,
        embedding_dim=int(args.encoder_embed_dim / args.generator_size_divider),
        ffn_embedding_dim=int(args.encoder_ffn_embed_dim / args.generator_size_divider),
        num_attention_heads=int(args.encoder_attention_heads / args.generator_size_divider),
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        max_seq_len=args.max_positions,
        num_segments=0,
        encoder_normalize_before=args.encoder_normalize_before,
        embedding_normalize=args.embedding_normalize,
        apply_bert_init=True,
        activation_fn=args.activation_fn,
        share_embed_tokens=share_embed_tokens,
        shared_embedding_dim=args.encoder_embed_dim,
    )
    self.lm_head = GeneratorLMHead(
        embed_dim=int(args.encoder_embed_dim / args.generator_size_divider),
        output_dim=len(dictionary),
        activation_fn=args.activation_fn,
        weight=self.sentence_encoder.embed_tokens.weight,
        share_emb_pro=self.sentence_encoder.embed_linear,
    )
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.args = args
    self.sentence_encoder = TransformerSentenceEncoder(
        padding_idx=dictionary.pad(),
        vocab_size=len(dictionary),
        num_encoder_layers=args.encoder_layers,
        embedding_dim=args.encoder_embed_dim,
        ffn_embedding_dim=args.encoder_ffn_embed_dim,
        num_attention_heads=args.encoder_attention_heads,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        max_seq_len=args.max_positions,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn=args.activation_fn,
    )
    self.lm_head = RobertaLMHead(
        embed_dim=args.encoder_embed_dim,
        output_dim=len(dictionary),
        activation_fn=args.activation_fn,
        weight=self.sentence_encoder.embed_tokens.weight,
    )
def __init__(self, args):
    super().__init__()
    embedding_dim = 768
    self.dense = nn.Linear(embedding_dim, embedding_dim)
    self.layer_norm = LayerNorm(embedding_dim)
    init_bert_params(self.dense)
    self.encoder = TransformerSentenceEncoder(
        padding_idx=1,
        vocab_size=32769,
        num_encoder_layers=12,
        embedding_dim=768,
        ffn_embedding_dim=3072,
        num_attention_heads=12,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        layerdrop=0.0,
        max_seq_len=512,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn="gelu",
        q_noise=0.0,
        qn_block_size=8,
    )
def __init__(self, config, model_argobj=None):
    nn.Module.__init__(self)
    NLL.__init__(self, model_argobj)
    self.encoder = TransformerSentenceEncoder(
        padding_idx=1,
        vocab_size=32769,
        num_encoder_layers=12,
        embedding_dim=768,
        ffn_embedding_dim=3072,
        num_attention_heads=12,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        layerdrop=0.0,
        max_seq_len=512,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn="gelu",
        q_noise=0.0,
        qn_block_size=8,
    )
    self.embeddingHead = nn.Linear(config.hidden_size, 768)
    self.norm = nn.LayerNorm(768)
    self.apply(self._init_weights)
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.args = args

    if args.encoder_layers_to_keep:
        args.encoder_layers = len(args.encoder_layers_to_keep.split(","))

    self.sentence_encoder = TransformerSentenceEncoder(
        padding_idx=dictionary.pad(),
        vocab_size=len(dictionary),
        num_encoder_layers=args.encoder_layers,
        embedding_dim=args.encoder_embed_dim,
        ffn_embedding_dim=args.encoder_ffn_embed_dim,
        num_attention_heads=args.encoder_attention_heads,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        layerdrop=args.encoder_layerdrop,
        max_seq_len=args.max_positions,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn=args.activation_fn,
        q_noise=args.quant_noise_pq,
        qn_block_size=args.quant_noise_pq_block_size,
    )
    args.untie_weights_roberta = getattr(args, 'untie_weights_roberta', False)

    self.lm_head = RobertaLMHead(
        embed_dim=args.encoder_embed_dim,
        output_dim=len(dictionary),
        activation_fn=args.activation_fn,
        weight=(
            self.sentence_encoder.embed_tokens.weight
            if not args.untie_weights_roberta
            else None
        ),
    )
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.padding_idx = dictionary.pad()
    self.vocab_size = dictionary.__len__()
    self.max_positions = args.max_target_positions
    self.embed_tokens = embed_tokens

    use_position_embeddings = not getattr(args, 'no_token_positional_embeddings', False)
    encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
    use_bert_layer_norm = getattr(args, 'bert_layer_norm', False)
    use_gelu = getattr(args, 'use_gelu', False)
    apply_bert_init = getattr(args, 'apply_bert_init', False)

    self.sentence_encoder = TransformerSentenceEncoder(
        padding_idx=self.padding_idx,
        vocab_size=self.vocab_size,
        num_encoder_layers=args.encoder_layers,
        embedding_dim=args.encoder_embed_dim,
        ffn_embedding_dim=args.encoder_ffn_embed_dim,
        num_attention_heads=args.encoder_attention_heads,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.act_dropout,
        max_seq_len=self.max_positions,
        num_segments=args.num_segment,
        use_position_embeddings=use_position_embeddings,
        encoder_normalize_before=encoder_normalize_before,
        use_bert_layer_norm=use_bert_layer_norm,
        use_gelu=use_gelu,
        apply_bert_init=apply_bert_init,
    )

    self.share_input_output_embed = getattr(args, 'share_encoder_input_output_embed', False)
    self.embed_out = None
    self.sentence_projection_layer = None
    self.sentence_out_dim = args.sentence_class_num

    # Remove head is set to true during fine-tuning
    self.load_softmax = not getattr(args, 'remove_head', False)
    if self.load_softmax:
        if not self.share_input_output_embed:
            self.embed_out = nn.Linear(args.encoder_embed_dim, self.vocab_size, bias=False)

        if args.sent_loss:
            self.sentence_projection_layer = nn.Linear(
                args.encoder_embed_dim, self.sentence_out_dim, bias=False
            )
def build_encoder(self, args, dictionary):
    return TransformerSentenceEncoder(
        padding_idx=dictionary.pad(),
        vocab_size=len(dictionary),
        num_encoder_layers=args.encoder_layers,
        embedding_dim=args.encoder_embed_dim,
        ffn_embedding_dim=args.encoder_ffn_embed_dim,
        num_attention_heads=args.encoder_attention_heads,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        layerdrop=args.encoder_layerdrop,
        max_seq_len=args.max_positions,
        num_segments=args.num_segments,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn=args.activation_fn,
        q_noise=args.quant_noise_pq,
        qn_block_size=args.quant_noise_pq_block_size,
    )
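# Hedged usage sketch (not taken from any of the projects above): how an
# encoder built as in build_encoder() is typically invoked. The toy dictionary,
# the `toy_args` namespace, and its hyperparameter values are placeholder
# assumptions; fairseq's TransformerSentenceEncoder.forward returns
# (inner_states, sentence_rep), where each inner state is T x B x C and
# sentence_rep is the first-token representation, as also used further below.
import torch
from argparse import Namespace
from fairseq.data import Dictionary
from fairseq.modules import TransformerSentenceEncoder

toy_dictionary = Dictionary()
for sym in ["hello", "world"]:
    toy_dictionary.add_symbol(sym)

toy_args = Namespace(
    encoder_layers=2, encoder_embed_dim=64, encoder_ffn_embed_dim=128,
    encoder_attention_heads=4, dropout=0.1, attention_dropout=0.1,
    activation_dropout=0.0, encoder_layerdrop=0.0, max_positions=128,
    num_segments=0, activation_fn="gelu", quant_noise_pq=0.0,
    quant_noise_pq_block_size=8,
)

encoder = TransformerSentenceEncoder(
    padding_idx=toy_dictionary.pad(),
    vocab_size=len(toy_dictionary),
    num_encoder_layers=toy_args.encoder_layers,
    embedding_dim=toy_args.encoder_embed_dim,
    ffn_embedding_dim=toy_args.encoder_ffn_embed_dim,
    num_attention_heads=toy_args.encoder_attention_heads,
    dropout=toy_args.dropout,
    attention_dropout=toy_args.attention_dropout,
    activation_dropout=toy_args.activation_dropout,
    layerdrop=toy_args.encoder_layerdrop,
    max_seq_len=toy_args.max_positions,
    num_segments=toy_args.num_segments,
    encoder_normalize_before=True,
    apply_bert_init=True,
    activation_fn=toy_args.activation_fn,
    q_noise=toy_args.quant_noise_pq,
    qn_block_size=toy_args.quant_noise_pq_block_size,
)

tokens = torch.randint(4, len(toy_dictionary), (2, 16))  # B x T dummy token ids
inner_states, sentence_rep = encoder(tokens, last_state_only=True)
last_hidden = inner_states[-1].transpose(0, 1)  # T x B x C -> B x T x C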
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.args = args
    self.sentence_encoder = TransformerSentenceEncoder(
        padding_idx=dictionary.pad(),
        vocab_size=len(dictionary),
        num_encoder_layers=args.encoder_layers,
        embedding_dim=args.encoder_embed_dim,
        ffn_embedding_dim=args.encoder_ffn_embed_dim,
        num_attention_heads=args.encoder_attention_heads,
        embedding_noise=args.embedding_noise,
        dropout=args.dropout,
        attention_dropout=args.attention_dropout,
        activation_dropout=args.activation_dropout,
        max_seq_len=args.max_positions,
        num_segments=0,
        encoder_normalize_before=True,
        apply_bert_init=True,
        activation_fn=args.activation_fn,
    )
    self.span_logits = nn.Linear(args.encoder_embed_dim, 2)
    if not args.no_pooler:
        self.answer_class = MNLIPoolerClass(args.encoder_embed_dim, args.pooler_dropout)
class OnmtRobertaEncoder(EncoderBase):
    """
    Returns:
        (torch.FloatTensor, torch.FloatTensor):

        * embeddings ``(src_len, batch_size, model_dim)``
        * memory_bank ``(src_len, batch_size, model_dim)``
    """

    def __init__(self, model_path, padding_idx, vocab_size):
        super(OnmtRobertaEncoder, self).__init__()

        # The pretrained checkpoint is required: its saved `args` provides the
        # encoder hyperparameters used below.
        model_ckpt_file = os.path.join(model_path, "model.pt")
        assert os.path.exists(model_ckpt_file), model_ckpt_file
        ckpt = torch.load(model_ckpt_file, map_location='cpu')
        args = ckpt["args"]

        self.roberta_encoder = TransformerSentenceEncoder(
            padding_idx=padding_idx,
            vocab_size=vocab_size,
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.encoder_embed_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            max_seq_len=args.max_positions,
            num_segments=0,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn=args.activation_fn,
        )
        print(self.roberta_encoder)
        print("defined the roberta network!")

        # Copy the pretrained weights, dropping the "decoder.sentence_encoder." prefix.
        model_dict = {}
        for k, v in ckpt["model"].items():
            if "decoder.sentence_encoder." in k:
                k = k.replace("decoder.sentence_encoder.", "")
            if k not in self.roberta_encoder.state_dict().keys():
                print("skip", k)
                continue
            model_dict[k] = v
            print("{}:{}".format(k, v.size()))
        self.roberta_encoder.load_state_dict(model_dict)
        print("loaded {}/{} weights".format(
            len(model_dict.keys()), len(self.roberta_encoder.state_dict().keys())))

        self.roberta_encoder.embed_tokens = expandEmbeddingByN(
            self.roberta_encoder.embed_tokens, 4)
        print("*" * 50)

    def forward(self, src, lengths=None):
        """See :func:`EncoderBase.forward()`"""
        self._check_args(src, lengths)
        src = src.squeeze(2).transpose(0, 1).contiguous()
        # outs, sent_out = self.roberta_encoder(src)
        # `forwad1` is a helper assumed to be defined elsewhere in the original project.
        emb, outs, sent_out = self.forwad1(self.roberta_encoder, src)
        # emb = outs[0]
        out = outs[-1]
        # print("src--> outs", src.size(), out.size(), emb.size())
        # return emb.transpose(0, 1).contiguous(), out.transpose(0, 1).contiguous(), lengths
        return emb, out, lengths
class BertRanker(BaseRanker):
    def __init__(self, args, task):
        super(BertRanker, self).__init__(args, task)

        init_model = getattr(args, "pretrained_model", "")
        self.joint_layers = nn.ModuleList()
        if os.path.isfile(init_model):
            print(f"initialize weight from {init_model}")

            from fairseq import hub_utils

            x = hub_utils.from_pretrained(
                os.path.dirname(init_model),
                checkpoint_file=os.path.basename(init_model),
            )

            in_state_dict = x["models"][0].state_dict()
            init_args = x["args"].model

            num_positional_emb = init_args.max_positions + task.dictionary.pad() + 1  # follow the setup in roberta

            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=getattr(args, "encoder_layers", init_args.encoder_layers),
                embedding_dim=init_args.encoder_embed_dim,
                ffn_embedding_dim=init_args.encoder_ffn_embed_dim,
                num_attention_heads=init_args.encoder_attention_heads,
                dropout=init_args.dropout,
                attention_dropout=init_args.attention_dropout,
                activation_dropout=init_args.activation_dropout,
                num_segments=2,  # add language embeddings
                max_seq_len=num_positional_emb,
                offset_positions_by_padding=False,
                encoder_normalize_before=True,
                apply_bert_init=True,
                activation_fn=init_args.activation_fn,
                freeze_embeddings=args.freeze_embeddings,
                n_trans_layers_to_freeze=args.n_trans_layers_to_freeze,
            )

            # still need to learn segment embeddings as we added a second language embedding
            if args.freeze_embeddings:
                for p in self.model.segment_embeddings.parameters():
                    p.requires_grad = False

            update_init_roberta_model_state(in_state_dict)
            print("loading weights from the pretrained model")
            self.model.load_state_dict(
                in_state_dict, strict=False
            )  # ignore mismatch in language embeddings

            ffn_embedding_dim = init_args.encoder_ffn_embed_dim
            num_attention_heads = init_args.encoder_attention_heads
            dropout = init_args.dropout
            attention_dropout = init_args.attention_dropout
            activation_dropout = init_args.activation_dropout
            activation_fn = init_args.activation_fn

            classifier_embed_dim = getattr(args, "embed_dim", init_args.encoder_embed_dim)
            if classifier_embed_dim != init_args.encoder_embed_dim:
                self.transform_layer = nn.Linear(
                    init_args.encoder_embed_dim, classifier_embed_dim
                )
        else:
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=args.encoder_layers,
                embedding_dim=args.embed_dim,
                ffn_embedding_dim=args.ffn_embed_dim,
                num_attention_heads=args.attention_heads,
                dropout=args.dropout,
                attention_dropout=args.attention_dropout,
                activation_dropout=args.activation_dropout,
                max_seq_len=task.max_positions()
                if task.max_positions()
                else args.tokens_per_sample,
                num_segments=2,
                offset_positions_by_padding=False,
                encoder_normalize_before=args.encoder_normalize_before,
                apply_bert_init=args.apply_bert_init,
                activation_fn=args.activation_fn,
            )

            classifier_embed_dim = args.embed_dim
            ffn_embedding_dim = args.ffn_embed_dim
            num_attention_heads = args.attention_heads
            dropout = args.dropout
            attention_dropout = args.attention_dropout
            activation_dropout = args.activation_dropout
            activation_fn = args.activation_fn

        self.joint_classification = args.joint_classification
        if args.joint_classification == "sent":
            if args.joint_normalize_before:
                self.joint_layer_norm = LayerNorm(classifier_embed_dim)
            else:
                self.joint_layer_norm = None

            self.joint_layers = nn.ModuleList(
                [
                    TransformerSentenceEncoderLayer(
                        embedding_dim=classifier_embed_dim,
                        ffn_embedding_dim=ffn_embedding_dim,
                        num_attention_heads=num_attention_heads,
                        dropout=dropout,
                        attention_dropout=attention_dropout,
                        activation_dropout=activation_dropout,
                        activation_fn=activation_fn,
                    )
                    for _ in range(args.num_joint_layers)
                ]
            )

        self.classifier = RobertaClassificationHead(
            classifier_embed_dim,
            classifier_embed_dim,
            1,  # num_classes
            "tanh",
            args.classifier_dropout,
        )

    def forward(self, src_tokens, src_lengths):
        segment_labels = self.get_segment_labels(src_tokens)
        positions = self.get_positions(src_tokens, segment_labels)

        inner_states, _ = self.model(
            tokens=src_tokens,
            segment_labels=segment_labels,
            last_state_only=True,
            positions=positions,
        )

        return inner_states[-1].transpose(0, 1)  # T x B x C -> B x T x C

    def sentence_forward(self, encoder_out, src_tokens=None, sentence_rep="head"):
        # encoder_out: B x T x C
        if sentence_rep == "head":
            x = encoder_out[:, :1, :]
        else:  # 'meanpool', 'maxpool'
            assert src_tokens is not None, "meanpool requires src_tokens input"
            segment_labels = self.get_segment_labels(src_tokens)
            padding_mask = src_tokens.ne(self.padding_idx)
            encoder_mask = segment_labels * padding_mask.type_as(segment_labels)

            if sentence_rep == "meanpool":
                ntokens = torch.sum(encoder_mask, dim=1, keepdim=True)
                x = torch.sum(
                    encoder_out * encoder_mask.unsqueeze(2), dim=1, keepdim=True
                ) / ntokens.unsqueeze(2).type_as(encoder_out)
            else:  # 'maxpool'
                encoder_out[
                    (encoder_mask == 0).unsqueeze(2).repeat(1, 1, encoder_out.shape[-1])
                ] = -float("inf")
                x, _ = torch.max(encoder_out, dim=1, keepdim=True)

        if hasattr(self, "transform_layer"):
            x = self.transform_layer(x)

        return x  # B x 1 x C

    def joint_forward(self, x):
        # x: T x B x C
        if self.joint_layer_norm:
            x = self.joint_layer_norm(x.transpose(0, 1))
            x = x.transpose(0, 1)

        for layer in self.joint_layers:
            x, _ = layer(x, self_attn_padding_mask=None)
        return x

    def classification_forward(self, x):
        # x: B x T x C
        return self.classifier(x)
class TransformerSentenceEncoder(TransformerSentenceEncoderBase):
    """
    Implementation of the Transformer Sentence Encoder. This directly makes
    use of the TransformerSentenceEncoder module in Fairseq.

    A few interesting config options:
        - encoder_normalize_before determines whether the layer norm is applied
          before or after self_attention. This is similar to the original
          implementation from Google.
        - activation_fn can be set to 'gelu' instead of the default of 'relu'.
    """

    class Config(TransformerSentenceEncoderBase.Config, ConfigBase):
        # Dropout parameters
        dropout: float = 0.1
        attention_dropout: float = 0.1
        activation_dropout: float = 0.1

        # Parameters related to hidden states and self-attention
        embedding_dim: int = 768
        ffn_embedding_dim: int = 3072
        num_encoder_layers: int = 6
        num_attention_heads: int = 8
        num_segments: int = 2

        # Parameters related to positions
        use_position_embeddings: bool = True
        # the fairseq module for position embeddings offsets all position
        # ids by the padding index. Disable this offset by setting this flag
        # to False. This will work correctly since we mask out the embeddings
        # associated with padding in the encoder
        offset_positions_by_padding: bool = True

        # Model Initialization parameters
        apply_bert_init: bool = True

        # Misc. Params
        encoder_normalize_before: bool = True
        activation_fn: str = "relu"
        max_seq_len: int = 128
        # multilingual is set to true for cross-lingual LM training
        multilingual: bool = False

        # Flags for freezing parameters (e.g. during fine-tuning)
        freeze_embeddings: bool = False
        n_trans_layers_to_freeze: int = 0

        # Use of TorchScript and optimizations
        use_torchscript: bool = False

        # Fine-tune bias parameters only (https://nlp.biu.ac.il/~yogo/bitfit.pdf)
        use_bias_finetuning: bool = False

    def __init__(
        self,
        config: Config,
        output_encoded_layers: bool,
        padding_idx: int,
        vocab_size: int,
        *args,
        **kwarg,
    ) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        self.multilingual = config.multilingual
        self.offset_positions_by_padding = config.offset_positions_by_padding
        self.use_torchscript = config.use_torchscript
        self.use_bias_finetuning = config.use_bias_finetuning
        self.traced_encoder = None

        self.sentence_encoder = TransformerSentenceEncoderModule(
            padding_idx=padding_idx,
            vocab_size=vocab_size,
            num_encoder_layers=config.num_encoder_layers,
            embedding_dim=config.embedding_dim,
            ffn_embedding_dim=config.ffn_embedding_dim,
            num_attention_heads=config.num_attention_heads,
            dropout=config.dropout,
            attention_dropout=config.attention_dropout,
            activation_dropout=config.activation_dropout,
            max_seq_len=config.max_seq_len,
            num_segments=config.num_segments,
            use_position_embeddings=config.use_position_embeddings,
            offset_positions_by_padding=config.offset_positions_by_padding,
            encoder_normalize_before=config.encoder_normalize_before,
            apply_bert_init=config.apply_bert_init,
            activation_fn=config.activation_fn,
            freeze_embeddings=config.freeze_embeddings,
            n_trans_layers_to_freeze=config.n_trans_layers_to_freeze,
            export=self.export,
        )
        if self.use_torchscript:
            assert hasattr(self.sentence_encoder, "traceable")
            self.sentence_encoder.traceable = self.use_torchscript
        if self.use_bias_finetuning:
            for (n, p) in self.sentence_encoder.named_parameters():
                # "sentence_encoder.layers.0.self_attn.k_proj.weight" -> false
                # "sentence_encoder.layers.0.self_attn.k_proj.bias" -> true
                if n.split(".")[-1] != "bias":
                    p.requires_grad_(False)
        log_class_usage(__class__)

    def load_state_dict(self, state_dict):
        self.upgrade_state_dict_named(state_dict)
        # "projection" must be in sync with the name of member variable projection.
        has_projection = any("projection" in key for key in state_dict.keys())
        if self.projection is not None and not has_projection:
            projection_temp = self.projection
            self.projection = None
            super().load_state_dict(state_dict)
            self.projection = projection_temp
        else:
            super().load_state_dict(state_dict)

    def _encoder(
        self, input_tuple: Tuple[torch.Tensor, ...]
    ) -> Tuple[torch.Tensor, ...]:
        tokens, _, segment_labels, positions = input_tuple
        if self.offset_positions_by_padding or (not self.multilingual):
            positions = None
        if self.use_torchscript and self.traced_encoder is None:
            self.traced_encoder = TracedTransformerEncoder(
                self.sentence_encoder, tokens, segment_labels, positions
            )
            del self.sentence_encoder
            self.sentence_encoder = self.traced_encoder
            print("Using traced transformer sentence encoder")
        encoded_layers, pooled_output = self.sentence_encoder(
            tokens, segment_labels, positions=positions
        )

        # Each tensor in encoded_layers output by the Fairseq module has
        # the shape: T x B x C. Convert this to B x T x C
        encoded_layers = [x.transpose(0, 1) for x in encoded_layers]
        return encoded_layers, pooled_output

    def _embedding(self):
        # used to tie weights in MaskedLM model
        return self.sentence_encoder.embed_tokens

    def upgrade_state_dict_named(self, state_dict):
        # We convert in_proj_weight to individual q, k, v weights
        items_to_add = {}
        keys_to_remove = []
        for k in state_dict.keys():
            if k.endswith("in_proj_weight"):
                # in_proj_weight used to be q + k + v with same dimensions
                dim = int(state_dict[k].shape[0] / 3)
                items_to_add[k.replace("in_proj_weight", "q_proj.weight")] = state_dict[k][:dim]
                items_to_add[k.replace("in_proj_weight", "k_proj.weight")] = state_dict[k][dim : 2 * dim]
                items_to_add[k.replace("in_proj_weight", "v_proj.weight")] = state_dict[k][2 * dim :]
                keys_to_remove.append(k)
            if k.endswith("in_proj_bias"):
                dim = int(state_dict[k].shape[0] / 3)
                items_to_add[k.replace("in_proj_bias", "q_proj.bias")] = state_dict[k][:dim]
                items_to_add[k.replace("in_proj_bias", "k_proj.bias")] = state_dict[k][dim : 2 * dim]
                items_to_add[k.replace("in_proj_bias", "v_proj.bias")] = state_dict[k][2 * dim :]
                keys_to_remove.append(k)

        for k in keys_to_remove:
            del state_dict[k]

        for key, value in items_to_add.items():
            state_dict[key] = value

        return state_dict
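# Small, self-contained illustration (added for clarity, not part of the original
# source) of the in_proj_weight -> q/k/v conversion performed by
# upgrade_state_dict_named above: the fused projection is split into three equal
# chunks along dimension 0. The key name and embed_dim are hypothetical.
import torch

embed_dim = 8
old_state = {"layers.0.self_attn.in_proj_weight": torch.randn(3 * embed_dim, embed_dim)}
fused = old_state["layers.0.self_attn.in_proj_weight"]
dim = fused.shape[0] // 3
q_w, k_w, v_w = fused[:dim], fused[dim : 2 * dim], fused[2 * dim :]
assert q_w.shape == k_w.shape == v_w.shape == (embed_dim, embed_dim)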