Example #1
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        self.embed_positions = (
            PositionalEmbedding(
                args.max_source_positions,
                embed_dim,
                self.padding_idx,
                learned=args.encoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])

        for i in range(args.encoder_layers):
            if i == 0:
                base_encoder_layer = self.build_encoder_layer(args)
                self.layers.extend([base_encoder_layer])
            else:
                encoder_layer = self.build_encoder_layer(args)
                encoder_layer = copy_params(base_encoder_layer, encoder_layer, args)
                self.layers.extend([encoder_layer])

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
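
The loop above builds one base encoder layer and initialises every subsequent layer from it via copy_params(), which is not shown in the snippet. Below is a minimal sketch of what such a helper might look like, under the assumption that it simply copies the base layer's weights; the name and any behaviour beyond that are guesses.

import torch
import torch.nn as nn

def copy_params(base_layer: nn.Module, new_layer: nn.Module, args) -> nn.Module:
    # Hypothetical helper: start new_layer from the base layer's weights so
    # all encoder layers begin from the same initialisation.
    with torch.no_grad():
        new_layer.load_state_dict(base_layer.state_dict())
    return new_layer

If true cross-layer weight sharing (ALBERT-style) were intended instead, appending the very same module instance to self.layers would tie the parameters rather than copy them.
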
Example #2
    def __init__(
            self,
            args,  # Passing just this in supplies all the required command-line parameters; exactly what it contains is not listed here.
            dictionary,
            embed_tokens):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim  # Dimensionality of the token embeddings after encoding
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        # self.src_word_emb = embed_tokens  # edited

        self.embed_positions = (PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_encoder_layer(args)
            for _ in range(args.encoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
Example #3
 def __init__(self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout, q_noise=0, qn_block_size=8):
     super().__init__()
     self.dense = nn.Linear(input_dim, inner_dim)
     self.activation_fn = utils.get_activation_fn(activation_fn)
     self.dropout = nn.Dropout(p=pooler_dropout)
     self.out_proj = apply_quant_noise_(
         nn.Linear(inner_dim, num_classes), q_noise, qn_block_size
     )
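
This constructor mirrors a RoBERTa-style classification head (dense, activation, dropout, output projection with optional quant-noise). As a rough illustration only, here is how such a head is typically applied to encoder output by pooling the first time step; the class name, the tanh default, and the sizes are assumptions, not the project's code.

import torch
import torch.nn as nn

class PoolerHeadSketch(nn.Module):
    # Minimal stand-in showing the usual forward pass of such a head.
    def __init__(self, input_dim, inner_dim, num_classes, pooler_dropout=0.0):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.activation_fn = torch.tanh
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, features):             # features: [batch, seq_len, input_dim]
        x = features[:, 0, :]                # pool the first ("<s>"/CLS) token
        x = self.dropout(x)
        x = self.dense(x)
        x = self.activation_fn(x)
        x = self.dropout(x)
        return self.out_proj(x)

head = PoolerHeadSketch(input_dim=768, inner_dim=768, num_classes=2, pooler_dropout=0.1)
logits = head(torch.randn(4, 128, 768))      # -> [4, 2]
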
Example #4
    def __init__(self, args, dictionary, embed_tokens, embed_bytes):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_bytes.embedding_dim

        # assume the padding index will be the same for all sequences
        self.padding_idx = embed_bytes.padding_idx

        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens

        if not self.args.input_combine == 'drop_bytes':
            self.embed_bytes = embed_bytes

        if self.args.input_combine == 'cnn':
            self.byte_combine = ByteCombineCNN(embed_dim, embed_dim)
        elif self.args.input_combine == 'drop_bytes':
            self.byte_combine = None
        else:  # input_combine == 'sum'
            self.byte_combine = ByteCombineSUM()

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_encoder_layer(args) for _ in range(args.encoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
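
Every encoder in these examples switches between nn.ModuleList and LayerDropModuleList depending on encoder_layerdrop. The following is a simplified sketch of what LayerDrop (Fan et al., 2020) does when the list is iterated; it is a stand-in for fairseq's LayerDropModuleList, not its exact code.

import torch
import torch.nn as nn

class LayerDropListSketch(nn.ModuleList):
    # During training, each layer is skipped with probability p when the
    # list is iterated; at inference all layers are kept.
    def __init__(self, p, modules=None):
        super().__init__(modules)
        self.p = p

    def __iter__(self):
        dropout_probs = torch.empty(len(self)).uniform_()
        for i, layer in enumerate(super().__iter__()):
            if not self.training or dropout_probs[i] > self.p:
                yield layer
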
Example #5
    def __init__(self, cfg, dictionary, embed_tokens, return_fc=False):
        self.cfg = cfg
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
        )
        self.encoder_layerdrop = cfg.encoder.layerdrop
        self.return_fc = return_fc

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = cfg.max_source_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim)

        self.embed_positions = (
            PositionalEmbedding(
                cfg.max_source_positions,
                embed_dim,
                self.padding_idx,
                learned=cfg.encoder.learned_pos,
            )
            if not cfg.no_token_positional_embeddings
            else None
        )
        if cfg.layernorm_embedding:
            self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layernorm_embedding = None

        if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                cfg.quant_noise.pq,
                cfg.quant_noise.pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend(
            [self.build_encoder_layer(cfg) for i in range(cfg.encoder.layers)]
        )
        self.num_layers = len(self.layers)

        if cfg.encoder.normalize_before:
            self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layer_norm = None
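
The constructors above only build the embedding-related modules; they are typically combined in a forward_embedding step roughly like the one below, a sketch patterned on fairseq's TransformerEncoder and simplified here.

    def forward_embedding(self, src_tokens):
        # Scale token embeddings, add positions, then apply embedding
        # LayerNorm, dropout and quant-noise if those modules were built.
        x = embed = self.embed_scale * self.embed_tokens(src_tokens)
        if self.embed_positions is not None:
            x = embed + self.embed_positions(src_tokens)
        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)
        x = self.dropout_module(x)
        if self.quant_noise is not None:
            x = self.quant_noise(x)
        return x, embed
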
Example #6
    def __init__(self,
                 args,
                 conv_layers_before=None,
                 input_size=83,
                 transformer_context=None):
        self.args = args
        super(TransformerEncoder, self).__init__(None)  # no src dictionary
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = args.encoder_embed_dim
        self.max_source_positions = args.max_source_positions

        self.conv_layers_before = conv_layers_before
        self.fc0 = Linear(input_size,
                          embed_dim) if input_size != embed_dim else None

        self.embed_positions = (PositionalEmbedding(
            self.output_lengths(self.max_source_positions),
            embed_dim,
            0,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_encoder_layer(args) for i in range(args.encoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.transformer_context = transformer_context
Example #7
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout = args.dropout
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        self.embed_positions = (
            PositionalEmbedding(
                args.max_source_positions,
                embed_dim,
                self.padding_idx,
                learned=args.encoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_encoder_layer(args, i)
            for i in range(args.encoder_layers)
        ])
        self.num_layers = len(self.layers)
        self.attention_window = args.attention_window

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None
Example #8
 def __init__(self, input_dim, inner_dim, num_classes, num_heads, activation_fn, pooler_dropout, q_noise=0, qn_block_size=8):
     super().__init__()
     self.attn = MultiheadAttention(
         input_dim,
         num_heads,
         q_noise=q_noise,
         qn_block_size=qn_block_size,
     )
     self.layernorm = LayerNorm(input_dim)
     self.dense = nn.Linear(input_dim, inner_dim)
     self.activation_fn = utils.get_activation_fn(activation_fn)
     self.dropout = nn.Dropout(p=pooler_dropout)
     self.out_proj = apply_quant_noise_(
         nn.Linear(inner_dim, num_classes), q_noise, qn_block_size
     )
Example #9
File: model.py Project: BwRy/trex
 def __init__(
     self,
     input_dim,
     activation_fn,
     pooler_dropout,
     q_noise=0,
     qn_block_size=8,
     do_spectral_norm=False,
 ):
     super().__init__()
     self.dense = nn.Linear(input_dim, input_dim)
     self.activation_fn = utils.get_activation_fn(activation_fn)
     self.dropout = nn.Dropout(p=pooler_dropout)
     self.out_proj = apply_quant_noise_(nn.Linear(input_dim, input_dim),
                                        q_noise, qn_block_size)
     if do_spectral_norm:
         if q_noise != 0:
             raise NotImplementedError(
                 "Attempting to use Spectral Normalization with Quant Noise. This is not officially supported"
             )
         self.out_proj = torch.nn.utils.spectral_norm(self.out_proj)
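
For reference, torch.nn.utils.spectral_norm reparameterises the wrapped layer's weight as W / sigma(W), keeping its largest singular value close to 1. A small standalone illustration, not tied to this project, is shown here.

import torch
import torch.nn as nn

proj = nn.utils.spectral_norm(nn.Linear(256, 256))
y = proj(torch.randn(8, 256))                       # a forward pass runs one power-iteration step
sigma = torch.linalg.svdvals(proj.weight.detach())[0]
print(float(sigma))                                  # approximately 1.0
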
Example #10
    def __init__(
        self,
        padding_idx: int,
        vocab_size: int,
        num_encoder_layers: int = 6,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        layerdrop: float = 0.0,
        max_seq_len: int = 256,
        num_segments: int = 2,
        use_position_embeddings: bool = True,
        offset_positions_by_padding: bool = True,
        encoder_normalize_before: bool = False,
        apply_bert_init: bool = False,
        activation_fn: str = "relu",
        learned_pos_embedding: bool = True,
        embed_scale: float = None,
        freeze_embeddings: bool = False,
        n_trans_layers_to_freeze: int = 0,
        export: bool = False,
        traceable: bool = False,
        q_noise: float = 0.0,
        qn_block_size: int = 8,
    ) -> None:

        super().__init__()
        self.padding_idx = padding_idx
        self.vocab_size = vocab_size
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)
        self.layerdrop = layerdrop
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim
        self.num_segments = num_segments
        self.use_position_embeddings = use_position_embeddings
        self.apply_bert_init = apply_bert_init
        self.learned_pos_embedding = learned_pos_embedding
        self.traceable = traceable

        self.embed_tokens = self.build_embedding(self.vocab_size,
                                                 self.embedding_dim,
                                                 self.padding_idx)
        self.embed_scale = embed_scale

        if q_noise > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
                q_noise,
                qn_block_size,
            )
        else:
            self.quant_noise = None

        self.segment_embeddings = (nn.Embedding(
            self.num_segments, self.embedding_dim, padding_idx=None)
                                   if self.num_segments > 0 else None)

        self.embed_positions = (PositionalEmbedding(
            self.max_seq_len,
            self.embedding_dim,
            padding_idx=(
                self.padding_idx if offset_positions_by_padding else None),
            learned=self.learned_pos_embedding,
        ) if self.use_position_embeddings else None)

        if encoder_normalize_before:
            self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
        else:
            self.emb_layer_norm = None

        if self.layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_transformer_sentence_encoder_layer(
                embedding_dim=self.embedding_dim,
                ffn_embedding_dim=ffn_embedding_dim,
                num_attention_heads=num_attention_heads,
                dropout=self.dropout_module.p,
                attention_dropout=attention_dropout,
                activation_dropout=activation_dropout,
                activation_fn=activation_fn,
                export=export,
                q_noise=q_noise,
                qn_block_size=qn_block_size,
            ) for _ in range(num_encoder_layers)
        ])

        # Apply initialization of model params after building the model
        if self.apply_bert_init:
            self.apply(init_bert_params)

        def freeze_module_params(m):
            if m is not None:
                for p in m.parameters():
                    p.requires_grad = False

        if freeze_embeddings:
            freeze_module_params(self.embed_tokens)
            freeze_module_params(self.segment_embeddings)
            freeze_module_params(self.embed_positions)
            freeze_module_params(self.emb_layer_norm)

        for layer in range(n_trans_layers_to_freeze):
            freeze_module_params(self.layers[layer])
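
build_embedding is not included in the snippet; in fairseq's TransformerSentenceEncoder it essentially wraps a plain nn.Embedding, roughly as follows.

    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
        # Sketch: a standard embedding table; subclasses may override this,
        # e.g. to use quantisation-aware variants.
        return nn.Embedding(vocab_size, embedding_dim, padding_idx)
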
Example #11
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
        self.encoder_layerdrop = args.encoder_layerdrop
        
        self.encoder_num_pos = args.encoder_num_pos      # number of unique POS tags
        self.pos_embed_dim = args.encoder_pos_embed_dim  # POS vector dimension

        self.embed_tokens = embed_tokens
        if self.encoder_num_pos != 0:
            self.pos_tokens = self.build_pos_embed(dictionary, self.embed_tokens, args.encoder_embed_dim)
            total_dim = self.embed_tokens.embedding_dim + self.pos_tokens.embedding_dim
        else:
            self.pos_tokens = None
            total_dim = self.embed_tokens.embedding_dim

        embed_dim = self.embed_tokens.embedding_dim 
        self.padding_idx = self.embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        #self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        self.embed_positions = (
            PositionalEmbedding(
                args.max_source_positions,
                embed_dim,
                self.padding_idx,
                learned=args.encoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        self.train_en_file = self.load_train()
        self.pos_tag = self.load_pos_tag()

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(total_dim)
        else:
            self.layernorm_embedding = None

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(total_dim, total_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend(
            [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
        )
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(total_dim)
        else:
            self.layer_norm = None
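
When POS tags are enabled, total_dim is the concatenation of the token and POS embedding widths. Below is a standalone illustration of that concatenation; the vocabulary sizes and dimensions are made up for the example.

import torch
import torch.nn as nn

embed_tokens = nn.Embedding(32000, 512, padding_idx=1)    # token embeddings
pos_tokens = nn.Embedding(50, 64, padding_idx=0)           # POS-tag embeddings (assumed sizes)

src = torch.randint(2, 32000, (4, 20))
pos = torch.randint(1, 50, (4, 20))
x = torch.cat([embed_tokens(src), pos_tokens(pos)], dim=-1)   # [4, 20, 512 + 64]
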
Example #12
    def __init__(self, args, dictionary, feature_dict, embed_tokens, feature_embed_tokens):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        
        self.src_dict = dictionary
        self.feature_dict = feature_dict
        self.merging_method = args.feature_merge
        
        self.dropout = args.dropout
        self.encoder_layerdrop = args.encoder_layerdrop
        
        if self.merging_method == "concat":
            embed_dim = embed_tokens.embedding_dim + feature_embed_tokens.embedding_dim
        else:
            embed_dim = embed_tokens.embedding_dim
            
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens
        self.feature_embed_tokens = feature_embed_tokens

        # self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
        self.embed_scale = 1.0
        
        self.embed_positions = (
            PositionalEmbedding(
                args.max_source_positions,
                embed_dim,
                self.padding_idx,
                learned=args.encoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend(
            [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
        )
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None
            
            
        if args.feature_merge == "gate":
            self.gate_layer = Linear(embed_dim*2, embed_dim)
            self.gate_sigmoid = torch.nn.Sigmoid()
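
The gate built at the end (gate_layer plus a sigmoid) is the usual way to merge two embedding streams. The forward pass is not shown, so the blend formula below is an assumption, given only as a sketch.

import torch
import torch.nn as nn

def gate_merge(x, f, gate_layer):
    # x: token embeddings, f: feature embeddings, both [batch, seq, dim].
    g = torch.sigmoid(gate_layer(torch.cat([x, f], dim=-1)))   # gate values in (0, 1)
    return g * x + (1.0 - g) * f

dim = 512
gate_layer = nn.Linear(dim * 2, dim)
x, f = torch.randn(4, 20, dim), torch.randn(4, 20, dim)
merged = gate_merge(x, f, gate_layer)                           # [4, 20, 512]
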
Example #13
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions
        self.device_ = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # creating constant positional encoding table with the max source positions
        pe = torch.zeros(self.max_source_positions, embed_dim)
        position = torch.arange(0, self.max_source_positions).unsqueeze(1)
        div_term = torch.exp((torch.arange(0, embed_dim, 2, dtype=torch.float) * -(math.log(10000.0) / embed_dim)))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        self.constant_positional_encoding = pe.to(self.device_)

        # position layers
        self.position_layers = args.position_layers

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        self.embed_positions = (
            PositionalEmbedding(
                args.max_source_positions,
                embed_dim,
                self.padding_idx,
                learned=args.encoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend(
            [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
        )
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        self.pos_weight = nn.Linear(embed_dim, embed_dim, bias=True).to(self.device_)
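
The constant table built above is the standard sinusoidal positional encoding. One common way such a table is consumed is sketched below, standalone; the broadcast-add over the batch dimension is an assumption about this model's forward pass, not taken from it.

import math
import torch

max_positions, embed_dim = 1024, 512
pe = torch.zeros(max_positions, embed_dim)
position = torch.arange(0, max_positions).unsqueeze(1).float()
div_term = torch.exp(torch.arange(0, embed_dim, 2, dtype=torch.float)
                     * -(math.log(10000.0) / embed_dim))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)

x = torch.randn(2, 100, embed_dim)           # [batch, seq, dim] token embeddings
x = x + pe[: x.size(1)].unsqueeze(0)         # add positions up to the sequence length
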
Example #14
    def __init__(
        self,
        cfg,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        output_projection=None,
        scheduled_sampling_rate_scheduler=None,
    ):
        is_no_token_positional_embeddings_changed = False
        if (not cfg.no_token_positional_embeddings
                and cfg.decoder.relative_positional_embeddings):
            cfg.no_token_positional_embeddings = True
            is_no_token_positional_embeddings_changed = True
            logger.info(
                "disabled decoder's absolute positional embeddings as decoder_relative_positional_embeddings is True."
            )

        self.cfg = cfg
        super(TransformerDecoderBase, self).__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout_module = FairseqDropout(
            cfg.dropout,
            module_name=module_name_fordropout(self.__class__.__name__))
        self.decoder_layerdrop = cfg.decoder.layerdrop
        self.share_input_output_embed = cfg.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = cfg.decoder.embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = cfg.decoder.output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = cfg.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(
            embed_dim)

        if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                cfg.quant_noise.pq,
                cfg.quant_noise.pq_block_size,
            )
        else:
            self.quant_noise = None

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)
        self.embed_positions = (PositionalEmbedding(
            self.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=cfg.decoder.learned_pos,
        ) if not cfg.no_token_positional_embeddings else None)
        if cfg.layernorm_embedding:
            self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = cfg.cross_self_attention

        if cfg.decoder.relative_positional_embeddings:
            if cfg.decoder.learned_pos:
                rel_pos_embed_list = [
                    RelativePositionalEmbedding(
                        cfg.decoder.embed_dim,
                        padding_idx=None,
                        max_size=cfg.max_target_positions,
                        learned=True,
                    ) for _ in range(cfg.decoder.layers)
                ]
            else:
                rel_pos_embed = RelativePositionalEmbedding(
                    cfg.decoder.embed_dim,
                    padding_idx=None,
                    max_size=None,
                    learned=False,
                )
                # single instance referenced across layers
                rel_pos_embed_list = [rel_pos_embed] * cfg.decoder.layers
        else:
            rel_pos_embed_list = [None] * cfg.decoder.layers

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(
                cfg,
                no_encoder_attn,
                positional_embedding=rel_pos_embed_list[i])
            for i in range(cfg.decoder.layers)
        ])
        self.num_layers = len(self.layers)

        if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm:
            self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layer_norm = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False)
                                if embed_dim != self.output_embed_dim
                                and not cfg.tie_adaptive_weights else None)

        self.adaptive_softmax = None
        self.output_projection = output_projection
        if self.output_projection is None:
            self.build_output_projection(cfg, dictionary, embed_tokens)

        if is_no_token_positional_embeddings_changed:
            cfg.no_token_positional_embeddings = not cfg.no_token_positional_embeddings

        self.scheduled_sampling_rate_scheduler = scheduled_sampling_rate_scheduler
        for layer in self.layers:
            if isinstance(
                    layer,
                    TransformerWithRelativePositionalEmbeddingDecoderLayerBase
            ):
                layer.need_attn = False  # make validation fast
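
The decoder registers self._future_mask = torch.empty(0), but the masking code lives elsewhere. The usual companion method, sketched after fairseq's TransformerDecoder and simplified here, looks roughly like this.

    def buffered_future_mask(self, tensor):
        # Lazily build and cache a causal mask with -inf above the diagonal,
        # rebuilt whenever a longer sequence or a new device is seen.
        dim = tensor.size(0)
        if (
            self._future_mask.size(0) == 0
            or self._future_mask.device != tensor.device
            or self._future_mask.size(0) < dim
        ):
            self._future_mask = torch.triu(
                torch.full([dim, dim], float("-inf")), 1
            )
        self._future_mask = self._future_mask.to(tensor)
        return self._future_mask[:dim, :dim]
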
Example #15
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout = args.dropout
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        self.embed_positions = (PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None)

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_encoder_layer(args) for i in range(args.encoder_layers)
        ])

        # My config

        class my_net_config:
            dropout = 0.1
            dropatt = 0.1
            d_head = 64
            n_head = 12
            n_block = 3
            d_model = 768
            d_inner = 3072
            dropact = 0.0
            block_param = [2, 2, 2]

        class my_args:
            truncate_seq = False
            attn_type = "rel_shift"
            seg_id_cls = 2

        cls_target = True

        self.funnel = False
        if self.funnel:
            self.net_config = my_net_config()
            self.args = my_args()

            self.attn_info = AttentionStructure(self.net_config, self.args)
            self.attn_layers = nn.ModuleList()
            self.pffn_layers = nn.ModuleList()

            for block_idx in range(self.net_config.n_block):
                for _ in range(self.net_config.block_param[block_idx]):
                    self.attn_layers.append(
                        RelMultiheadAttention(
                            self.net_config,
                            self.args,
                            self.net_config.d_model,
                            self.net_config.n_head,
                            self.net_config.d_head,
                            self.net_config.dropout,
                            self.net_config.dropatt,
                            block_idx,
                        ))

                    self.pffn_layers.append(
                        PositionwiseFFN(
                            self.net_config.d_model,
                            self.net_config.d_inner,
                            self.net_config.dropout,
                            self.net_config.dropact,
                        ))
            # if cls_target:
            # self.build_cls_head()

        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None
Example #16
    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        output_projection=None,
    ):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        self.project_in_dim = (
            Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )
        self.embed_positions = (
            PositionalEmbedding(
                self.max_target_positions,
                embed_dim,
                self.padding_idx,
                learned=args.decoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = getattr(args, "cross_self_attention", False)

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend(
            [
                self.build_decoder_layer(args, no_encoder_attn)
                for _ in range(args.decoder_layers)
            ]
        )
        self.num_layers = len(self.layers)

        if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False
        ):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.project_out_dim = (
            Linear(embed_dim, self.output_embed_dim, bias=False)
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
            else None
        )

        self.adaptive_softmax = None
        self.output_projection = output_projection
        if self.output_projection is None:
            self.build_output_projection(args, dictionary, embed_tokens)
Example #17
    def __init__(
        self,
        padding_idx: int,
        vocab_size: int,
        projection_length: int,
        num_encoder_layers: int = 6,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        num_projected_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        layerdrop: float = 0.0,
        max_seq_len: int = 256,
        num_segments: int = 0,
        use_position_embeddings: bool = True,
        offset_positions_by_padding: bool = True,
        layernorm_embedding: bool = False,
        normalize_before: bool = False,
        dynamic_projection: bool = True,
        tie_kv=True,
        tie_layer_weights: bool = False,
        apply_bert_init: bool = False,
        activation_fn: str = "relu",
        learned_pos_embedding: bool = True,
        embed_scale: float = None,
        freeze_embeddings: bool = False,
        n_trans_layers_to_freeze: int = 0,
        export: bool = False,
        traceable: bool = False,
        q_noise: float = 0.0,
        qn_block_size: int = 8,
        sen_rep_type: str = 'cls',
    ) -> None:

        super().__init__()
        self.padding_idx = padding_idx
        self.vocab_size = vocab_size
        self.proj_len = projection_length
        self.dynamic_projection = dynamic_projection
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)
        self.layerdrop = layerdrop
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim
        self.num_segments = num_segments
        self.use_position_embeddings = use_position_embeddings
        self.apply_bert_init = apply_bert_init
        self.learned_pos_embedding = learned_pos_embedding
        self.traceable = traceable
        self.tie_layer_weights = tie_layer_weights
        self.tpu = False  # whether we're on TPU
        self.sen_rep_type = sen_rep_type

        self.embed_tokens = self.build_embedding(self.vocab_size,
                                                 self.embedding_dim,
                                                 self.padding_idx)
        self.embed_scale = embed_scale

        if q_noise > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
                q_noise, qn_block_size)
        else:
            self.quant_noise = None

        if self.num_segments > 0:
            self.segment_embeddings = nn.Embedding(self.num_segments,
                                                   self.embedding_dim,
                                                   padding_idx=None)
            nn.init.normal_(self.segment_embeddings.weight,
                            mean=0.0,
                            std=self.embedding_dim**-0.5)
        else:
            self.segment_embeddings = None

        self.embed_positions = (PositionalEmbedding(
            self.max_seq_len,
            self.embedding_dim,
            padding_idx=(
                self.padding_idx if offset_positions_by_padding else None),
            learned=self.learned_pos_embedding,
        ) if self.use_position_embeddings else None)

        self.projected_embeddings = Parameter(
            torch.Tensor(self.proj_len, self.embedding_dim))
        nn.init.normal_(self.projected_embeddings,
                        mean=0.0,
                        std=self.embedding_dim**-0.5)
        if self.use_position_embeddings and not self.learned_pos_embedding:
            projected_positions = get_sinusoidal_positional_embedding(
                self.proj_len, self.embedding_dim)
            if self.embed_scale is None:
                self.embed_scale = math.sqrt(self.embedding_dim)
        else:
            projected_positions = None
        self.register_buffer('projected_positions', projected_positions)

        if self.layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.num_layers = num_encoder_layers
        real_num_layers = 1 if self.tie_layer_weights else num_encoder_layers
        self.layers.extend([
            self.build_luna_sentence_encoder_layer(
                embedding_dim=self.embedding_dim,
                ffn_embedding_dim=ffn_embedding_dim,
                num_attention_heads=num_attention_heads,
                num_projected_attention_heads=num_projected_attention_heads,
                dropout=self.dropout_module.p,
                attention_dropout=attention_dropout,
                activation_dropout=activation_dropout,
                activation_fn=activation_fn,
                normalize_before=normalize_before,
                tie_kv=tie_kv,
                export=export,
                q_noise=q_noise,
                qn_block_size=qn_block_size,
            ) for _ in range(real_num_layers)
        ])

        assert not layernorm_embedding or not normalize_before

        if layernorm_embedding:
            self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export)
            self.proj_emb_layer_norm = LayerNorm(self.embedding_dim,
                                                 export=export)
        else:
            self.emb_layer_norm = None
            self.proj_emb_layer_norm = None

        if normalize_before:
            self.layer_norm = LayerNorm(self.embedding_dim, export=export)
            self.proj_layer_norm = LayerNorm(self.embedding_dim, export=export)
        else:
            self.layer_norm = None
            self.proj_layer_norm = None

        # Apply initialization of model params after building the model
        if self.apply_bert_init:
            self.apply(init_bert_params)

        def freeze_module_params(m):
            if m is not None:
                for p in m.parameters():
                    p.requires_grad = False

        if freeze_embeddings:
            self.projected_embeddings.requires_grad = False
            freeze_module_params(self.embed_tokens)
            freeze_module_params(self.segment_embeddings)
            freeze_module_params(self.embed_positions)
            freeze_module_params(self.emb_layer_norm)
            freeze_module_params(self.proj_emb_layer_norm)

        for layer in range(n_trans_layers_to_freeze):
            freeze_module_params(self.layers[layer])
Example #18
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout = args.dropout
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        # self.embed_tokens = embed_tokens
        self.output_embed_dim = args.decoder_output_dim

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        # self.embed_positions = (
        #     PositionalEmbedding(
        #         args.max_source_positions,
        #         embed_dim,
        #         self.padding_idx,
        #         learned=args.encoder_learned_pos,
        #     )
        #     if not args.no_token_positional_embeddings
        #     else None
        # )

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_encoder_layer(args)
            for i in range(args.encoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.project_out_dim = (
            Linear(embed_dim, self.output_embed_dim, bias=False)
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
            else None
        )

        self.output_projection = nn.Linear(
            embed_tokens.weight.shape[1],
            embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = embed_tokens.weight
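
The final lines tie the output projection to the embedding matrix so that logits are computed against the same vectors used to embed the input. A standalone illustration of that tying follows; the sizes are arbitrary.

import torch
import torch.nn as nn

vocab, dim = 32000, 512
embed_tokens = nn.Embedding(vocab, dim, padding_idx=1)
output_projection = nn.Linear(dim, vocab, bias=False)
output_projection.weight = embed_tokens.weight       # shared Parameter, not a copy

hidden = torch.randn(4, 20, dim)
logits = output_projection(hidden)                    # [4, 20, vocab]
assert output_projection.weight.data_ptr() == embed_tokens.weight.data_ptr()
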
Example #19
    def __init__(
        self,
        cfg,
        return_fc=False,
        pre_encoder=None,
        input_size=83,
        transformer_context=None,
    ):
        self.cfg = cfg
        super(TransformerEncoderBase, self).__init__(None)  # no src dictionary
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
        )
        self.encoder_layerdrop = cfg.encoder.layerdrop
        self.return_fc = return_fc

        embed_dim = cfg.encoder.embed_dim
        self.max_source_positions = cfg.max_source_positions

        self.pre_encoder = pre_encoder
        self.fc0 = Linear(input_size, embed_dim) if input_size != embed_dim else None

        self.embed_scale = (
            1.0
            if cfg.no_scale_embedding
            or self.fc0 is not None  # always disable scaling if fc0 is present
            else math.sqrt(embed_dim)
        )

        if (
            not cfg.no_token_positional_embeddings
            and cfg.encoder.relative_positional_embeddings
        ):
            logger.info(
                "disabled encoder's absolute positional embeddings as encoder_relative_positional_embeddings is True."
            )
        self.embed_positions = (
            PositionalEmbedding(
                self.output_lengths(self.max_source_positions),
                embed_dim,
                0,
                learned=cfg.encoder.learned_pos,
            )
            if not cfg.no_token_positional_embeddings
            and not cfg.encoder.relative_positional_embeddings
            else None
        )

        if cfg.layernorm_embedding:
            self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layernorm_embedding = None

        if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                cfg.quant_noise.pq,
                cfg.quant_noise.pq_block_size,
            )
        else:
            self.quant_noise = None

        if cfg.encoder.relative_positional_embeddings:
            if cfg.encoder.learned_pos:
                rel_pos_embed_list = [
                    RelativePositionalEmbedding(
                        cfg.encoder.embed_dim,
                        padding_idx=None,
                        max_size=self.output_lengths(cfg.max_source_positions),
                        learned=True,
                    )
                    for _ in range(cfg.encoder.layers)
                ]
            else:
                rel_pos_embed = RelativePositionalEmbedding(
                    cfg.encoder.embed_dim,
                    padding_idx=None,
                    max_size=None,
                    learned=False,
                )
                # single instance referenced across layers
                rel_pos_embed_list = [rel_pos_embed] * cfg.encoder.layers
        else:
            rel_pos_embed_list = [None] * cfg.encoder.layers

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend(
            [
                self.build_encoder_layer(
                    cfg, positional_embedding=rel_pos_embed_list[i]
                )
                for i in range(cfg.encoder.layers)
            ]
        )
        self.num_layers = len(self.layers)

        if cfg.encoder.normalize_before and cfg.encoder.layer_type != "conformer":
            self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
        else:
            self.layer_norm = None

        self.transformer_context = transformer_context
Example #20
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        self.args = args
        super().__init__(args, dictionary, embed_tokens)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout = args.dropout
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        self.project_in_dim = (
            Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )

        self.embed_positions = (
            PositionalEmbedding(
                args.max_target_positions,
                embed_dim,
                self.padding_idx,
                learned=args.decoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = getattr(args, "cross_self_attention", False)

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(args, no_encoder_attn)
            for _ in range(args.decoder_layers)
        ])
        self.num_layers = len(self.layers)

        if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.project_out_dim = (
            Linear(embed_dim, self.output_embed_dim, bias=False)
            if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
            else None
        )

        self.adaptive_softmax = None
        self.output_projection = None
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                self.output_embed_dim, len(dictionary), bias=False
            )
            nn.init.normal_(
                self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
            )
Example #21
    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__)
        self.encoder_layerdrop = args.encoder_layerdrop

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(
            embed_dim)

        self.embed_positions = (
            PositionalEmbedding(
                args.max_source_positions,
                embed_dim,
                self.padding_idx,
                learned=args.encoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        if not args.adaptive_input and args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        if self.encoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])

        self.layers.extend(
            [self.build_encoder_layer(args)
             for i in range(args.encoder_layers)]
        )
        self.num_layers = len(self.layers)

        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.layer_wise_attention = args.layer_wise_attn

        # for T5 attention
        self.rel_pos = args.rel_pos
        self.relative_attention_num_buckets = args.relative_attention_num_buckets
        self.max_rel_dist = args.max_rel_pos
        self.ignore_direction = args.ignore_direction
        self.num_heads = args.encoder_attention_heads
        if self.rel_pos:
            max_dist = (self.relative_attention_num_buckets + 1 if self.ignore_direction
                        else self.relative_attention_num_buckets * 2 + 1)
            self.relative_attention_bias = nn.Embedding(max_dist, self.num_heads)
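
relative_attention_bias is indexed with bucketed relative distances, T5-style. The sketch below is the standard T5 bucketing, not this encoder's own helper (which additionally handles ignore_direction and max_rel_pos); it is included only to show how such an nn.Embedding of shape (buckets, heads) is typically addressed.

import math
import torch
import torch.nn as nn

def relative_position_bucket(relative_position, bidirectional=True,
                             num_buckets=32, max_distance=128):
    # Map signed relative positions to a small set of buckets: exact buckets
    # for nearby offsets, logarithmically growing buckets farther out.
    ret = 0
    n = -relative_position
    if bidirectional:
        num_buckets //= 2
        ret += (n < 0).long() * num_buckets
        n = torch.abs(n)
    else:
        n = torch.clamp(n, min=0)
    max_exact = num_buckets // 2
    is_small = n < max_exact
    val_if_large = max_exact + (
        torch.log(n.float().clamp(min=1) / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).long()
    val_if_large = torch.minimum(val_if_large,
                                 torch.full_like(val_if_large, num_buckets - 1))
    return ret + torch.where(is_small, n, val_if_large)

# Bias lookup: one value per head for each (query, key) pair, added to the
# attention logits before the softmax.
rel = torch.arange(10)[None, :] - torch.arange(10)[:, None]
buckets = relative_position_bucket(rel)                 # [10, 10] bucket ids
bias = nn.Embedding(32, 8)(buckets)                     # -> [10, 10, 8]
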