Example #1
    def __init__(self, args, dictionary):
        super().__init__()
        embedding_dim = 768
        self.padding_idx = 1

        self.dense = nn.Linear(embedding_dim, embedding_dim)
        self.layer_norm = LayerNorm(embedding_dim)
        init_bert_params(self.dense)
        self.encoder = TransformerSentenceEncoder(
            padding_idx=1,
            vocab_size=50265,
            num_encoder_layers=12,
            embedding_dim=768,
            ffn_embedding_dim=3072,
            num_attention_heads=12,
            dropout=0.1,
            attention_dropout=0.1,
            activation_dropout=0.0,
            layerdrop=0.0,
            max_seq_len=512,
            num_segments=0,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn="gelu",
            q_noise=0.0,
            qn_block_size=8,
        )
        embed_tokens = self.encoder.embed_tokens
        self.lm_head = RobertaLMHead(
            embed_dim=embedding_dim,
            output_dim=50265,
            activation_fn="gelu",
            weight=embed_tokens.weight,
        )
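
A minimal forward-pass sketch for the module built above, assuming a fairseq version that still ships TransformerSentenceEncoder and the convention (also noted in Example #19) that the encoder returns a list of T x B x C inner states plus a pooled B x C sentence representation; the reduced layer count is only to keep the sketch cheap.

import torch
from fairseq.modules import TransformerSentenceEncoder

# Sketch: build a small encoder directly and run one forward pass.
encoder = TransformerSentenceEncoder(
    padding_idx=1,
    vocab_size=50265,
    num_encoder_layers=2,      # RoBERTa-base uses 12; 2 keeps the sketch cheap
    embedding_dim=768,
    ffn_embedding_dim=3072,
    num_attention_heads=12,
    max_seq_len=512,
    num_segments=0,
    encoder_normalize_before=True,
    activation_fn="gelu",
)

tokens = torch.randint(4, 50265, (2, 16))    # B x T batch of token ids
tokens[:, -2:] = 1                           # trailing padding (padding_idx=1)

inner_states, pooled = encoder(tokens)
features = inner_states[-1].transpose(0, 1)  # T x B x C -> B x T x C
print(features.shape, pooled.shape)          # (2, 16, 768) and (2, 768)
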
Example #2
 def __init__(self, dropout: float = 0.1):
     super(RobertaEncoderFast, self).__init__()
     self.fairseq_roberta = TransformerSentenceEncoder(
         padding_idx=1,
         vocab_size=32769,
         num_encoder_layers=12,
         embedding_dim=768,
         ffn_embedding_dim=3072,
         num_attention_heads=12,
         dropout=0.1,
         attention_dropout=0.1,
         activation_dropout=0.0,
         layerdrop=0.0,
         max_seq_len=512,
         num_segments=0,
         encoder_normalize_before=True,
         apply_bert_init=True,
         activation_fn="gelu",
         q_noise=0.0,
         qn_block_size=8,
     )
     # self.encode_proj = (
     #     nn.Linear(config.hidden_size, project_dim) if project_dim != 0 else None
     # )
     # if dropout != 0:
     #     self.fairseq_roberta.encoder.sentence_encoder=dropout
     from fairseq.modules.transformer_sentence_encoder import init_bert_params
     self.apply(init_bert_params)
Example #3
    def __init__(self, args, dictionary):
        super().__init__(dictionary)
        self.args = args

        # RoBERTa is a sentence encoder model, so users will intuitively trim
        # encoder layers. However, the implementation uses the fairseq decoder,
        # so we fix here.
        if args.encoder_layers_to_keep:
            args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
            args.decoder_layers_to_keep = args.encoder_layers_to_keep
            args.encoder_layers_to_keep = None

        self.sentence_encoder = TransformerSentenceEncoder(
            padding_idx=dictionary.pad(),
            vocab_size=len(dictionary),
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.encoder_embed_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            layerdrop=args.encoder_layerdrop,
            max_seq_len=args.max_positions,
            num_segments=0,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn=args.activation_fn,
        )
        self.lm_head = RobertaLMHead(
            embed_dim=args.encoder_embed_dim,
            output_dim=len(dictionary),
            activation_fn=args.activation_fn,
            weight=self.sentence_encoder.embed_tokens.weight,
        )
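
Example #3 pulls every hyperparameter from args. A hypothetical argparse.Namespace covering the fields this constructor touches is sketched below; the values mirror the RoBERTa-base numbers hard-coded in Examples #1, #4 and #8.

from argparse import Namespace

# Hypothetical args object for the constructor above (RoBERTa-base sized).
args = Namespace(
    encoder_layers=12,
    encoder_embed_dim=768,
    encoder_ffn_embed_dim=3072,
    encoder_attention_heads=12,
    dropout=0.1,
    attention_dropout=0.1,
    activation_dropout=0.0,
    encoder_layerdrop=0.0,
    max_positions=512,
    activation_fn="gelu",
    encoder_layers_to_keep=None,  # e.g. "0,2,4" to keep only a subset of layers
)
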
Example #4
    def __init__(self, args, dictionary):
        super().__init__()
        embedding_dim = 768
        self.padding_idx = 1

        self.dense = nn.Linear(embedding_dim, embedding_dim)
        self.layer_norm = LayerNorm(embedding_dim)
        init_bert_params(self.dense)
        self.encoder = TransformerSentenceEncoder(
                padding_idx=1,
                vocab_size=50265,
                num_encoder_layers=12,
                embedding_dim=768,
                ffn_embedding_dim=3072,
                num_attention_heads=12,
                dropout=0.1,
                attention_dropout=0.1,
                activation_dropout=0.0,
                layerdrop=0.0,
                max_seq_len=512,
                num_segments=0,
                encoder_normalize_before=True,
                apply_bert_init=True,
                activation_fn="gelu",
                q_noise=0.0,
                qn_block_size=8,
        )
        embed_tokens = self.encoder.embed_tokens
        self.lm_head = RobertaLMHead(
            embed_dim=embedding_dim,
            output_dim=50265,
            activation_fn="gelu",
            weight=embed_tokens.weight,
        )

        #args=base_architecture(args)

        if args.encoder_layers_to_keep:
            args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
        if args.decoder_layers_to_keep:
            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))

        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = 512
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = 512
        print('embed_tokens.embedding_dim:', embed_tokens.embedding_dim)

        self.decoder = TransformerDecoder(
            args, dictionary, embed_tokens,
            no_encoder_attn=getattr(args, "no_cross_attention", False),
        )

        self.class_num = args.num_classes
        # Positional arguments map to RobertaClassificationHead's input_dim,
        # inner_dim, num_classes, activation_fn, pooler_dropout, q_noise and
        # qn_block_size parameters.
        self.classification_heads = RobertaClassificationHead(
            768,
            768,
            self.class_num,
            'tanh',
            0.0,
            0.0,
            8,
        )
Example #5
    def __init__(self, args, dictionary):
        super().__init__(dictionary)

        self.padding_idx = dictionary.pad()
        self.vocab_size = dictionary.__len__()
        self.max_positions = args.max_positions

        self.sentence_encoder = TransformerSentenceEncoder(
            padding_idx=self.padding_idx,
            vocab_size=self.vocab_size,
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.encoder_embed_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.act_dropout,
            max_seq_len=self.max_positions,
            num_segments=args.num_segment,
            use_position_embeddings=not args.no_token_positional_embeddings,
            encoder_normalize_before=args.encoder_normalize_before,
            apply_bert_init=args.apply_bert_init,
            activation_fn=args.activation_fn,
            learned_pos_embedding=args.encoder_learned_pos,
            add_bias_kv=args.bias_kv,
            add_zero_attn=args.zero_attn,
        )

        self.share_input_output_embed = args.share_encoder_input_output_embed
        self.embed_out = None
        self.sentence_projection_layer = None
        self.sentence_out_dim = args.sentence_class_num
        self.lm_output_learned_bias = None

        # Remove head is set to true during fine-tuning
        self.load_softmax = not getattr(args, 'remove_head', False)

        self.masked_lm_pooler = nn.Linear(args.encoder_embed_dim,
                                          args.encoder_embed_dim)
        self.pooler_activation = utils.get_activation_fn(
            args.pooler_activation_fn)

        self.lm_head_transform_weight = nn.Linear(args.encoder_embed_dim,
                                                  args.encoder_embed_dim)
        self.activation_fn = utils.get_activation_fn(args.activation_fn)
        self.layer_norm = LayerNorm(args.encoder_embed_dim)

        if self.load_softmax:
            self.lm_output_learned_bias = nn.Parameter(
                torch.zeros(self.vocab_size))

            if not self.share_input_output_embed:
                self.embed_out = nn.Linear(args.encoder_embed_dim,
                                           self.vocab_size,
                                           bias=False)

            if args.sent_loss:
                self.sentence_projection_layer = nn.Linear(
                    args.encoder_embed_dim, self.sentence_out_dim, bias=False)
Example #6
 def __init__(self, args, dictionary, share_embed_tokens):
     super().__init__(dictionary)
     self.args = args
     self.sentence_encoder = TransformerSentenceEncoder(
         padding_idx=dictionary.pad(),
         vocab_size=len(dictionary),
         num_encoder_layers=args.encoder_layers,
         embedding_dim=int(args.encoder_embed_dim / args.generator_size_divider),
         ffn_embedding_dim=int(args.encoder_ffn_embed_dim / args.generator_size_divider),
         num_attention_heads=int(args.encoder_attention_heads / args.generator_size_divider),
         dropout=args.dropout,
         attention_dropout=args.attention_dropout,
         activation_dropout=args.activation_dropout,
         max_seq_len=args.max_positions,
         num_segments=0,
         encoder_normalize_before=args.encoder_normalize_before,
         embedding_normalize=args.embedding_normalize,
         apply_bert_init=True,
         activation_fn=args.activation_fn,
         share_embed_tokens=share_embed_tokens,
         shared_embedding_dim=args.encoder_embed_dim,
     )
     self.lm_head = GeneratorLMHead(
         embed_dim=int(args.encoder_embed_dim / args.generator_size_divider),
         output_dim=len(dictionary),
         activation_fn=args.activation_fn,
         weight=self.sentence_encoder.embed_tokens.weight,
         share_emb_pro=self.sentence_encoder.embed_linear,
     )
Example #7
 def __init__(self, args, dictionary):
     super().__init__(dictionary)
     self.args = args
     self.sentence_encoder = TransformerSentenceEncoder(
         padding_idx=dictionary.pad(),
         vocab_size=len(dictionary),
         num_encoder_layers=args.encoder_layers,
         embedding_dim=args.encoder_embed_dim,
         ffn_embedding_dim=args.encoder_ffn_embed_dim,
         num_attention_heads=args.encoder_attention_heads,
         dropout=args.dropout,
         attention_dropout=args.attention_dropout,
         activation_dropout=args.activation_dropout,
         max_seq_len=args.max_positions,
         num_segments=0,
         encoder_normalize_before=True,
         apply_bert_init=True,
         activation_fn=args.activation_fn,
     )
     self.lm_head = RobertaLMHead(
         embed_dim=args.encoder_embed_dim,
         output_dim=len(dictionary),
         activation_fn=args.activation_fn,
         weight=self.sentence_encoder.embed_tokens.weight,
     )
Example #8
    def __init__(self, args):
        super().__init__()
        embedding_dim = 768

        self.dense = nn.Linear(embedding_dim, embedding_dim)
        self.layer_norm = LayerNorm(embedding_dim)
        init_bert_params(self.dense)
        self.encoder = TransformerSentenceEncoder(
            padding_idx=1,
            vocab_size=32769,
            num_encoder_layers=12,
            embedding_dim=768,
            ffn_embedding_dim=3072,
            num_attention_heads=12,
            dropout=0.1,
            attention_dropout=0.1,
            activation_dropout=0.0,
            layerdrop=0.0,
            max_seq_len=512,
            num_segments=0,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn="gelu",
            q_noise=0.0,
            qn_block_size=8,
        )
Example #9
    def __init__(self, config, model_argobj=None):
        nn.Module.__init__(self)
        NLL.__init__(self, model_argobj)

        self.encoder = TransformerSentenceEncoder(
            padding_idx=1,
            vocab_size=32769,
            num_encoder_layers=12,
            embedding_dim=768,
            ffn_embedding_dim=3072,
            num_attention_heads=12,
            dropout=0.1,
            attention_dropout=0.1,
            activation_dropout=0.0,
            layerdrop=0.0,
            max_seq_len=512,
            num_segments=0,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn="gelu",
            q_noise=0.0,
            qn_block_size=8,
        )
        self.embeddingHead = nn.Linear(config.hidden_size, 768)
        self.norm = nn.LayerNorm(768)
        self.apply(self._init_weights)
Example #10
    def __init__(self, args, dictionary):
        super().__init__(dictionary)
        self.args = args

        if args.encoder_layers_to_keep:
            args.encoder_layers = len(args.encoder_layers_to_keep.split(","))

        self.sentence_encoder = TransformerSentenceEncoder(
            padding_idx=dictionary.pad(),
            vocab_size=len(dictionary),
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.encoder_embed_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            layerdrop=args.encoder_layerdrop,
            max_seq_len=args.max_positions,
            num_segments=0,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn=args.activation_fn,
            q_noise=args.quant_noise_pq,
            qn_block_size=args.quant_noise_pq_block_size,
        )
        args.untie_weights_roberta = getattr(args, 'untie_weights_roberta', False)

        self.lm_head = RobertaLMHead(
            embed_dim=args.encoder_embed_dim,
            output_dim=len(dictionary),
            activation_fn=args.activation_fn,
            weight=self.sentence_encoder.embed_tokens.weight if not args.untie_weights_roberta else None,
        )
Example #11
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)

        self.padding_idx = dictionary.pad()
        self.vocab_size = dictionary.__len__()
        self.max_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        use_position_embeddings = (
            not getattr(args, 'no_token_positional_embeddings', False)
        )
        encoder_normalize_before = getattr(args, 'encoder_normalize_before', False)
        use_bert_layer_norm = getattr(args, 'bert_layer_norm', False)
        use_gelu = getattr(args, 'use_gelu', False)
        apply_bert_init = getattr(args, 'apply_bert_init', False)

        self.sentence_encoder = TransformerSentenceEncoder(
            padding_idx=self.padding_idx,
            vocab_size=self.vocab_size,
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.encoder_embed_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.act_dropout,
            max_seq_len=self.max_positions,
            num_segments=args.num_segment,
            use_position_embeddings=use_position_embeddings,
            encoder_normalize_before=encoder_normalize_before,
            use_bert_layer_norm=use_bert_layer_norm,
            use_gelu=use_gelu,
            apply_bert_init=apply_bert_init,
        )

        self.share_input_output_embed = getattr(
            args, 'share_encoder_input_output_embed', False)
        self.embed_out = None
        self.sentence_projection_layer = None
        self.sentence_out_dim = args.sentence_class_num

        # Remove head is set to true during fine-tuning
        self.load_softmax = not getattr(args, 'remove_head', False)

        if self.load_softmax:
            if not self.share_input_output_embed:
                self.embed_out = nn.Linear(
                    args.encoder_embed_dim,
                    self.vocab_size,
                    bias=False
                )

            if args.sent_loss:
                self.sentence_projection_layer = nn.Linear(
                    args.encoder_embed_dim,
                    self.sentence_out_dim,
                    bias=False
                )
Example #12
    def __init__(self, model_path, padding_idx, vocab_size):
        super(OnmtRobertaEncoder, self).__init__()
        

        # NOTE: `args` is only assigned from the checkpoint further down, so these
        # hyperparameters are read before `args` exists; see the reordering sketch
        # after this example.
        self.roberta_encoder = TransformerSentenceEncoder(
            padding_idx=padding_idx,
            vocab_size=vocab_size,
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.encoder_embed_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            max_seq_len=args.max_positions,
            num_segments=0,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn=args.activation_fn,
        )
        print(self.roberta_encoder)
        print("defined the roberta network!")
        model_ckpt_file = os.path.join(model_path, "model.pt")
        if os.path.exists(model_ckpt_file):
            ckpt = torch.load(model_ckpt_file, map_location='cpu')
            args = ckpt["args"]
            model_dict = {}
            for k, v in ckpt["model"].items():
                if "decoder.sentence_encoder." in k:
                    k = k.replace("decoder.sentence_encoder.", "")
                    if k not in self.roberta_encoder.state_dict().keys():
                        print("skip", k)
                        continue
                    model_dict[k] = v
                    print("{}:{}".format(k, v.size()))

            self.roberta_encoder.load_state_dict(model_dict)
            print("loaded {}/{} weights".format(len(model_dict.keys()), len(self.roberta_encoder.state_dict().keys())))

        self.roberta_encoder.embed_tokens = expandEmbeddingByN(self.roberta_encoder.embed_tokens, 4)
        print("*"*50)
Example #13
    def __init__(
        self,
        config: Config,
        output_encoded_layers: bool,
        padding_idx: int,
        vocab_size: int,
        *args,
        **kwarg,
    ) -> None:

        super().__init__(config, output_encoded_layers=output_encoded_layers)
        self.multilingual = config.multilingual
        self.offset_positions_by_padding = config.offset_positions_by_padding
        self.use_torchscript = config.use_torchscript
        self.use_bias_finetuning = config.use_bias_finetuning
        self.traced_encoder = None

        self.sentence_encoder = TransformerSentenceEncoderModule(
            padding_idx=padding_idx,
            vocab_size=vocab_size,
            num_encoder_layers=config.num_encoder_layers,
            embedding_dim=config.embedding_dim,
            ffn_embedding_dim=config.ffn_embedding_dim,
            num_attention_heads=config.num_attention_heads,
            dropout=config.dropout,
            attention_dropout=config.attention_dropout,
            activation_dropout=config.activation_dropout,
            max_seq_len=config.max_seq_len,
            num_segments=config.num_segments,
            use_position_embeddings=config.use_position_embeddings,
            offset_positions_by_padding=config.offset_positions_by_padding,
            encoder_normalize_before=config.encoder_normalize_before,
            apply_bert_init=config.apply_bert_init,
            activation_fn=config.activation_fn,
            freeze_embeddings=config.freeze_embeddings,
            n_trans_layers_to_freeze=config.n_trans_layers_to_freeze,
            export=self.export,
        )
        if self.use_torchscript:
            assert hasattr(self.sentence_encoder, "traceable")
            self.sentence_encoder.traceable = self.use_torchscript
        if self.use_bias_finetuning:
            for (n, p) in self.sentence_encoder.named_parameters():
                # "sentence_encoder.layers.0.self_attn.k_proj.weight" -> false
                # "sentence_encoder.layers.0.self_attn.k_proj.bias" -> true
                if n.split(".")[-1] != "bias":
                    p.requires_grad_(False)

        log_class_usage(__class__)
Example #14
 def build_encoder(self, args, dictionary):
     return TransformerSentenceEncoder(
         padding_idx=dictionary.pad(),
         vocab_size=len(dictionary),
         num_encoder_layers=args.encoder_layers,
         embedding_dim=args.encoder_embed_dim,
         ffn_embedding_dim=args.encoder_ffn_embed_dim,
         num_attention_heads=args.encoder_attention_heads,
         dropout=args.dropout,
         attention_dropout=args.attention_dropout,
         activation_dropout=args.activation_dropout,
         layerdrop=args.encoder_layerdrop,
         max_seq_len=args.max_positions,
         num_segments=args.num_segments,
         encoder_normalize_before=True,
         apply_bert_init=True,
         activation_fn=args.activation_fn,
         q_noise=args.quant_noise_pq,
         qn_block_size=args.quant_noise_pq_block_size,
     )
Example #15
 def __init__(self, args, dictionary):
     super().__init__(dictionary)
     self.args = args
     self.sentence_encoder = TransformerSentenceEncoder(
         padding_idx=dictionary.pad(),
         vocab_size=len(dictionary),
         num_encoder_layers=args.encoder_layers,
         embedding_dim=args.encoder_embed_dim,
         ffn_embedding_dim=args.encoder_ffn_embed_dim,
         num_attention_heads=args.encoder_attention_heads,
         embedding_noise=args.embedding_noise,
         dropout=args.dropout,
         attention_dropout=args.attention_dropout,
         activation_dropout=args.activation_dropout,
         max_seq_len=args.max_positions,
         num_segments=0,
         encoder_normalize_before=True,
         apply_bert_init=True,
         activation_fn=args.activation_fn,
     )
     self.span_logits = nn.Linear(args.encoder_embed_dim, 2)
     
     if not args.no_pooler:
         self.answer_class = MNLIPoolerClass(args.encoder_embed_dim, args.pooler_dropout)
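
Example #15 adds a two-channel span_logits head, which in the usual extractive-QA pattern is split into per-token start and end logits. A hedged sketch of that use, assuming model is an instance of the class above and the T x B x C output convention from the other examples:

import torch

# Hedged sketch; `model` is assumed to be an instance of the class above.
tokens = torch.randint(4, 1000, (2, 16))      # B x T toy batch of token ids
inner_states, _ = model.sentence_encoder(tokens)
features = inner_states[-1].transpose(0, 1)   # B x T x C

span = model.span_logits(features)            # B x T x 2
start_logits, end_logits = span.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)       # B x T
end_logits = end_logits.squeeze(-1)           # B x T
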
Example #16
class OnmtRobertaEncoder(EncoderBase):
    '''
    Returns:
        (torch.FloatTensor, torch.FloatTensor):

        * embeddings ``(src_len, batch_size, model_dim)``
        * memory_bank ``(src_len, batch_size, model_dim)``
    '''

    def __init__(self, model_path, padding_idx, vocab_size):
        super(OnmtRobertaEncoder, self).__init__()
        

        # NOTE: as in Example #12, `args` is referenced here before it is assigned
        # from the checkpoint loaded below.
        self.roberta_encoder = TransformerSentenceEncoder(
            padding_idx=padding_idx,
            vocab_size=vocab_size,
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.encoder_embed_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            max_seq_len=args.max_positions,
            num_segments=0,
            encoder_normalize_before=True,
            apply_bert_init=True,
            activation_fn=args.activation_fn,
        )
        print(self.roberta_encoder)
        print("defined the roberta network!")
        model_ckpt_file = os.path.join(model_path, "model.pt")
        if os.path.exists(model_ckpt_file):
            ckpt = torch.load(model_ckpt_file, map_location='cpu')
            args = ckpt["args"]
            model_dict = {}
            for k, v in ckpt["model"].items():
                if "decoder.sentence_encoder." in k:
                    k = k.replace("decoder.sentence_encoder.", "")
                    if k not in self.roberta_encoder.state_dict().keys():
                        print("skip", k)
                        continue
                    model_dict[k] = v
                    print("{}:{}".format(k, v.size()))

            self.roberta_encoder.load_state_dict(model_dict)
            print("loaded {}/{} weights".format(len(model_dict.keys()), len(self.roberta_encoder.state_dict().keys())))

        self.roberta_encoder.embed_tokens = expandEmbeddingByN(self.roberta_encoder.embed_tokens, 4)
        print("*"*50)


    def forward(self, src, lengths=None):
        """See :func:`EncoderBase.forward()`"""
        self._check_args(src, lengths)
        src = src.squeeze(2).transpose(0, 1).contiguous()

        #outs, sent_out=self.roberta_encoder(src)
        emb, outs, sent_out = self.forwad1(self.roberta_encoder, src)

        #emb=outs[0]

        out=outs[-1]
        #print("src--> outs", src.size(), out.size(), emb.size())
        #return emb.transpose(0,1).contiguous(), out.transpose(0, 1).contiguous(), lengths
        return emb, out, lengths
Example #17
    def __init__(self, args, task):
        super(BertRanker, self).__init__(args, task)

        init_model = getattr(args, "pretrained_model", "")
        self.joint_layers = nn.ModuleList()
        if os.path.isfile(init_model):
            print(f"initialize weight from {init_model}")

            from fairseq import hub_utils

            x = hub_utils.from_pretrained(
                os.path.dirname(init_model),
                checkpoint_file=os.path.basename(init_model),
            )

            in_state_dict = x["models"][0].state_dict()
            init_args = x["args"].model

            num_positional_emb = init_args.max_positions + task.dictionary.pad() + 1

            # follow the setup in roberta
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=getattr(args, "encoder_layers",
                                           init_args.encoder_layers),
                embedding_dim=init_args.encoder_embed_dim,
                ffn_embedding_dim=init_args.encoder_ffn_embed_dim,
                num_attention_heads=init_args.encoder_attention_heads,
                dropout=init_args.dropout,
                attention_dropout=init_args.attention_dropout,
                activation_dropout=init_args.activation_dropout,
                num_segments=2,  # add language embeddings
                max_seq_len=num_positional_emb,
                offset_positions_by_padding=False,
                encoder_normalize_before=True,
                apply_bert_init=True,
                activation_fn=init_args.activation_fn,
                freeze_embeddings=args.freeze_embeddings,
                n_trans_layers_to_freeze=args.n_trans_layers_to_freeze,
            )

            # still need to learn segment embeddings as we added a second language embedding
            if args.freeze_embeddings:
                for p in self.model.segment_embeddings.parameters():
                    p.requires_grad = False

            update_init_roberta_model_state(in_state_dict)
            print("loading weights from the pretrained model")
            self.model.load_state_dict(
                in_state_dict,
                strict=False)  # ignore mismatch in language embeddings

            ffn_embedding_dim = init_args.encoder_ffn_embed_dim
            num_attention_heads = init_args.encoder_attention_heads
            dropout = init_args.dropout
            attention_dropout = init_args.attention_dropout
            activation_dropout = init_args.activation_dropout
            activation_fn = init_args.activation_fn

            classifier_embed_dim = getattr(args, "embed_dim",
                                           init_args.encoder_embed_dim)
            if classifier_embed_dim != init_args.encoder_embed_dim:
                self.transform_layer = nn.Linear(init_args.encoder_embed_dim,
                                                 classifier_embed_dim)
        else:
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=args.encoder_layers,
                embedding_dim=args.embed_dim,
                ffn_embedding_dim=args.ffn_embed_dim,
                num_attention_heads=args.attention_heads,
                dropout=args.dropout,
                attention_dropout=args.attention_dropout,
                activation_dropout=args.activation_dropout,
                max_seq_len=task.max_positions()
                if task.max_positions() else args.tokens_per_sample,
                num_segments=2,
                offset_positions_by_padding=False,
                encoder_normalize_before=args.encoder_normalize_before,
                apply_bert_init=args.apply_bert_init,
                activation_fn=args.activation_fn,
            )

            classifier_embed_dim = args.embed_dim
            ffn_embedding_dim = args.ffn_embed_dim
            num_attention_heads = args.attention_heads
            dropout = args.dropout
            attention_dropout = args.attention_dropout
            activation_dropout = args.activation_dropout
            activation_fn = args.activation_fn

        self.joint_classification = args.joint_classification
        if args.joint_classification == "sent":
            if args.joint_normalize_before:
                self.joint_layer_norm = LayerNorm(classifier_embed_dim)
            else:
                self.joint_layer_norm = None

            self.joint_layers = nn.ModuleList([
                TransformerSentenceEncoderLayer(
                    embedding_dim=classifier_embed_dim,
                    ffn_embedding_dim=ffn_embedding_dim,
                    num_attention_heads=num_attention_heads,
                    dropout=dropout,
                    attention_dropout=attention_dropout,
                    activation_dropout=activation_dropout,
                    activation_fn=activation_fn,
                ) for _ in range(args.num_joint_layers)
            ])

        self.classifier = RobertaClassificationHead(
            classifier_embed_dim,
            classifier_embed_dim,
            1,  # num_classes
            "tanh",
            args.classifier_dropout,
        )
Example #18
class BertRanker(BaseRanker):
    def __init__(self, args, task):
        super(BertRanker, self).__init__(args, task)

        init_model = getattr(args, "pretrained_model", "")
        self.joint_layers = nn.ModuleList()
        if os.path.isfile(init_model):
            print(f"initialize weight from {init_model}")

            from fairseq import hub_utils

            x = hub_utils.from_pretrained(
                os.path.dirname(init_model),
                checkpoint_file=os.path.basename(init_model),
            )

            in_state_dict = x["models"][0].state_dict()
            init_args = x["args"].model

            num_positional_emb = init_args.max_positions + task.dictionary.pad() + 1

            # follow the setup in roberta
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=getattr(args, "encoder_layers",
                                           init_args.encoder_layers),
                embedding_dim=init_args.encoder_embed_dim,
                ffn_embedding_dim=init_args.encoder_ffn_embed_dim,
                num_attention_heads=init_args.encoder_attention_heads,
                dropout=init_args.dropout,
                attention_dropout=init_args.attention_dropout,
                activation_dropout=init_args.activation_dropout,
                num_segments=2,  # add language embeddings
                max_seq_len=num_positional_emb,
                offset_positions_by_padding=False,
                encoder_normalize_before=True,
                apply_bert_init=True,
                activation_fn=init_args.activation_fn,
                freeze_embeddings=args.freeze_embeddings,
                n_trans_layers_to_freeze=args.n_trans_layers_to_freeze,
            )

            # still need to learn segment embeddings as we added a second language embedding
            if args.freeze_embeddings:
                for p in self.model.segment_embeddings.parameters():
                    p.requires_grad = False

            update_init_roberta_model_state(in_state_dict)
            print("loading weights from the pretrained model")
            self.model.load_state_dict(
                in_state_dict,
                strict=False)  # ignore mismatch in language embeddings

            ffn_embedding_dim = init_args.encoder_ffn_embed_dim
            num_attention_heads = init_args.encoder_attention_heads
            dropout = init_args.dropout
            attention_dropout = init_args.attention_dropout
            activation_dropout = init_args.activation_dropout
            activation_fn = init_args.activation_fn

            classifier_embed_dim = getattr(args, "embed_dim",
                                           init_args.encoder_embed_dim)
            if classifier_embed_dim != init_args.encoder_embed_dim:
                self.transform_layer = nn.Linear(init_args.encoder_embed_dim,
                                                 classifier_embed_dim)
        else:
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=args.encoder_layers,
                embedding_dim=args.embed_dim,
                ffn_embedding_dim=args.ffn_embed_dim,
                num_attention_heads=args.attention_heads,
                dropout=args.dropout,
                attention_dropout=args.attention_dropout,
                activation_dropout=args.activation_dropout,
                max_seq_len=task.max_positions()
                if task.max_positions() else args.tokens_per_sample,
                num_segments=2,
                offset_positions_by_padding=False,
                encoder_normalize_before=args.encoder_normalize_before,
                apply_bert_init=args.apply_bert_init,
                activation_fn=args.activation_fn,
            )

            classifier_embed_dim = args.embed_dim
            ffn_embedding_dim = args.ffn_embed_dim
            num_attention_heads = args.attention_heads
            dropout = args.dropout
            attention_dropout = args.attention_dropout
            activation_dropout = args.activation_dropout
            activation_fn = args.activation_fn

        self.joint_classification = args.joint_classification
        if args.joint_classification == "sent":
            if args.joint_normalize_before:
                self.joint_layer_norm = LayerNorm(classifier_embed_dim)
            else:
                self.joint_layer_norm = None

            self.joint_layers = nn.ModuleList([
                TransformerSentenceEncoderLayer(
                    embedding_dim=classifier_embed_dim,
                    ffn_embedding_dim=ffn_embedding_dim,
                    num_attention_heads=num_attention_heads,
                    dropout=dropout,
                    attention_dropout=attention_dropout,
                    activation_dropout=activation_dropout,
                    activation_fn=activation_fn,
                ) for _ in range(args.num_joint_layers)
            ])

        self.classifier = RobertaClassificationHead(
            classifier_embed_dim,
            classifier_embed_dim,
            1,  # num_classes
            "tanh",
            args.classifier_dropout,
        )

    def forward(self, src_tokens, src_lengths):
        segment_labels = self.get_segment_labels(src_tokens)
        positions = self.get_positions(src_tokens, segment_labels)

        inner_states, _ = self.model(
            tokens=src_tokens,
            segment_labels=segment_labels,
            last_state_only=True,
            positions=positions,
        )

        return inner_states[-1].transpose(0, 1)  # T x B x C -> B x T x C

    def sentence_forward(self,
                         encoder_out,
                         src_tokens=None,
                         sentence_rep="head"):
        # encoder_out: B x T x C
        if sentence_rep == "head":
            x = encoder_out[:, :1, :]
        else:  # 'meanpool', 'maxpool'
            assert src_tokens is not None, "meanpool requires src_tokens input"
            segment_labels = self.get_segment_labels(src_tokens)
            padding_mask = src_tokens.ne(self.padding_idx)
            encoder_mask = segment_labels * padding_mask.type_as(
                segment_labels)

            if sentence_rep == "meanpool":
                ntokens = torch.sum(encoder_mask, dim=1, keepdim=True)
                x = torch.sum(
                    encoder_out * encoder_mask.unsqueeze(2),
                    dim=1,
                    keepdim=True) / ntokens.unsqueeze(2).type_as(encoder_out)
            else:  # 'maxpool'
                encoder_out[(encoder_mask == 0).unsqueeze(2).repeat(
                    1, 1, encoder_out.shape[-1])] = -float("inf")
                x, _ = torch.max(encoder_out, dim=1, keepdim=True)

        if hasattr(self, "transform_layer"):
            x = self.transform_layer(x)

        return x  # B x 1 x C

    def joint_forward(self, x):
        # x: T x B x C
        if self.joint_layer_norm:
            x = self.joint_layer_norm(x.transpose(0, 1))
            x = x.transpose(0, 1)

        for layer in self.joint_layers:
            x, _ = layer(x, self_attn_padding_mask=None)
        return x

    def classification_forward(self, x):
        # x: B x T x C
        return self.classifier(x)
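
Putting the BertRanker pieces together, a hedged end-to-end sketch, assuming ranker is a constructed instance with joint_classification == "sent" and src_tokens is a B x T batch in the two-segment layout that get_segment_labels expects:

# Hedged usage sketch for the ranker above.
encoder_out = ranker(src_tokens, src_lengths=None)                 # B x T x C
sent_rep = ranker.sentence_forward(encoder_out, src_tokens)        # B x 1 x C
sent_rep = ranker.joint_forward(sent_rep.transpose(0, 1))          # 1 x B x C
scores = ranker.classification_forward(sent_rep.transpose(0, 1))   # B x 1 ranking scores
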
Example #19
class TransformerSentenceEncoder(TransformerSentenceEncoderBase):
    """
    Implementation of the Transformer Sentence Encoder. This directly makes
    use of the TransformerSentenceEncoder module in Fairseq.

    A few interesting config options:
        - encoder_normalize_before determines whether the layer norm is applied
          before or after self-attention. This is similar to the original
          implementation from Google.
        - activation_fn can be set to 'gelu' instead of the default of 'relu'.
    """
    class Config(TransformerSentenceEncoderBase.Config, ConfigBase):
        # Dropout parameters
        dropout: float = 0.1
        attention_dropout: float = 0.1
        activation_dropout: float = 0.1

        # Parameters related to hidden states and self-attention
        embedding_dim: int = 768
        ffn_embedding_dim: int = 3072
        num_encoder_layers: int = 6
        num_attention_heads: int = 8
        num_segments: int = 2

        # Parameters related to positions
        use_position_embeddings: bool = True
        # the fairseq module for position embeddings offsets all position
        # ids by the padding index. Disable this offset by setting this flag
        # to False. This will work correctly since we mask out the embeddings
        # associated with padding in the encoder
        offset_positions_by_padding: bool = True

        # Model Initialization parameters
        apply_bert_init: bool = True

        # Misc. Params
        encoder_normalize_before: bool = True
        activation_fn: str = "relu"
        max_seq_len: int = 128

        # multilingual is set to true for cross-lingual LM training
        multilingual: bool = False

        # Flags for freezing parameters (e.g. during fine-tuning)
        freeze_embeddings: bool = False
        n_trans_layers_to_freeze: int = 0

        # Use of TorchScript and optimizations
        use_torchscript: bool = False

        # Fine-tune bias parameters only (https://nlp.biu.ac.il/~yogo/bitfit.pdf)
        use_bias_finetuning: bool = False

    def __init__(
        self,
        config: Config,
        output_encoded_layers: bool,
        padding_idx: int,
        vocab_size: int,
        *args,
        **kwarg,
    ) -> None:

        super().__init__(config, output_encoded_layers=output_encoded_layers)
        self.multilingual = config.multilingual
        self.offset_positions_by_padding = config.offset_positions_by_padding
        self.use_torchscript = config.use_torchscript
        self.use_bias_finetuning = config.use_bias_finetuning
        self.traced_encoder = None

        self.sentence_encoder = TransformerSentenceEncoderModule(
            padding_idx=padding_idx,
            vocab_size=vocab_size,
            num_encoder_layers=config.num_encoder_layers,
            embedding_dim=config.embedding_dim,
            ffn_embedding_dim=config.ffn_embedding_dim,
            num_attention_heads=config.num_attention_heads,
            dropout=config.dropout,
            attention_dropout=config.attention_dropout,
            activation_dropout=config.activation_dropout,
            max_seq_len=config.max_seq_len,
            num_segments=config.num_segments,
            use_position_embeddings=config.use_position_embeddings,
            offset_positions_by_padding=config.offset_positions_by_padding,
            encoder_normalize_before=config.encoder_normalize_before,
            apply_bert_init=config.apply_bert_init,
            activation_fn=config.activation_fn,
            freeze_embeddings=config.freeze_embeddings,
            n_trans_layers_to_freeze=config.n_trans_layers_to_freeze,
            export=self.export,
        )
        if self.use_torchscript:
            assert hasattr(self.sentence_encoder, "traceable")
            self.sentence_encoder.traceable = self.use_torchscript
        if self.use_bias_finetuning:
            for (n, p) in self.sentence_encoder.named_parameters():
                # "sentence_encoder.layers.0.self_attn.k_proj.weight" -> false
                # "sentence_encoder.layers.0.self_attn.k_proj.bias" -> true
                if n.split(".")[-1] != "bias":
                    p.requires_grad_(False)

        log_class_usage(__class__)

    def load_state_dict(self, state_dict):
        self.upgrade_state_dict_named(state_dict)
        # "projection" must be be in sync with the name of member variable projection.
        has_projection = any("projection" in key for key in state_dict.keys())
        if self.projection is not None and not has_projection:
            projection_temp = self.projection
            self.projection = None
            super().load_state_dict(state_dict)
            self.projection = projection_temp
        else:
            super().load_state_dict(state_dict)

    def _encoder(
            self, input_tuple: Tuple[torch.Tensor,
                                     ...]) -> Tuple[torch.Tensor, ...]:
        tokens, _, segment_labels, positions = input_tuple
        if self.offset_positions_by_padding or (not self.multilingual):
            positions = None

        if self.use_torchscript and self.traced_encoder is None:
            self.traced_encoder = TracedTransformerEncoder(
                self.sentence_encoder, tokens, segment_labels, positions)
            del self.sentence_encoder
            self.sentence_encoder = self.traced_encoder
            print("Using traced transformer sentence encoder")

        encoded_layers, pooled_output = self.sentence_encoder(
            tokens, segment_labels, positions=positions)
        # Each tensor in encoded_layers output by the Fairseq module has
        # the shape: T x B x C. Convert this to B x T x C
        encoded_layers = [x.transpose(0, 1) for x in encoded_layers]
        return encoded_layers, pooled_output

    def _embedding(self):
        # used to tie weights in MaskedLM model
        return self.sentence_encoder.embed_tokens

    def upgrade_state_dict_named(self, state_dict):
        # We convert in_proj_weight to individual q,k,v weights
        items_to_add = {}
        keys_to_remove = []
        for k in state_dict.keys():
            if k.endswith("in_proj_weight"):
                # in_proj_weight used to be q + k + v with same dimensions
                dim = int(state_dict[k].shape[0] / 3)
                items_to_add[k.replace("in_proj_weight",
                                       "q_proj.weight")] = state_dict[k][:dim]
                items_to_add[k.replace(
                    "in_proj_weight",
                    "k_proj.weight")] = state_dict[k][dim:2 * dim]
                items_to_add[k.replace("in_proj_weight",
                                       "v_proj.weight")] = state_dict[k][2 *
                                                                         dim:]
                keys_to_remove.append(k)

            if k.endswith("in_proj_bias"):
                dim = int(state_dict[k].shape[0] / 3)
                items_to_add[k.replace("in_proj_bias",
                                       "q_proj.bias")] = state_dict[k][:dim]
                items_to_add[k.replace("in_proj_bias",
                                       "k_proj.bias")] = state_dict[k][dim:2 *
                                                                       dim]
                items_to_add[k.replace(
                    "in_proj_bias", "v_proj.bias")] = state_dict[k][2 * dim:]
                keys_to_remove.append(k)

        for k in keys_to_remove:
            del state_dict[k]

        for key, value in items_to_add.items():
            state_dict[key] = value

        return state_dict
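
The q/k/v split performed in upgrade_state_dict_named can be checked in isolation; here is a small sketch with a toy in_proj_weight tensor (hypothetical shapes, not tied to any real checkpoint):

import torch

# Toy demonstration of the in_proj_weight -> q/k/v split above.
embed_dim = 4
in_proj_weight = torch.randn(3 * embed_dim, embed_dim)  # stacked q, k, v rows

dim = in_proj_weight.shape[0] // 3
q_proj_weight = in_proj_weight[:dim]
k_proj_weight = in_proj_weight[dim:2 * dim]
v_proj_weight = in_proj_weight[2 * dim:]

assert q_proj_weight.shape == k_proj_weight.shape == v_proj_weight.shape == (embed_dim, embed_dim)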