Example #1
    def __init__(
        self,
        vocab_size,
        input_dim,
        cutoff,
        dropout,
        factor=4.0,
        adaptive_inputs=None,
        tie_proj=False,
        q_noise=0,
        qn_block_size=8,
    ):
        super().__init__()

        if vocab_size > cutoff[-1]:
            cutoff = cutoff + [vocab_size]
        else:
            assert (vocab_size == cutoff[-1]
                    ), "cannot specify cutoff larger than vocab size"

        output_dim = cutoff[0] + len(cutoff) - 1

        self.vocab_size = vocab_size
        self.cutoff = cutoff
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)
        self.input_dim = input_dim
        self.factor = factor
        self.q_noise = q_noise
        self.qn_block_size = qn_block_size

        self.lsm = nn.LogSoftmax(dim=1)

        if adaptive_inputs is not None:
            self.head = TiedHeadModule(
                adaptive_inputs.weights_for_band(0),
                input_dim,
                len(cutoff) - 1,
                self.q_noise,
                self.qn_block_size,
            )
        else:
            self.head = quant_noise(
                nn.Linear(input_dim, output_dim, bias=False),
                self.q_noise,
                self.qn_block_size,
            )

        self._make_tail(adaptive_inputs, tie_proj)

        def init_weights(m):
            if (hasattr(m, "weight") and not isinstance(m, TiedLinear)
                    and not isinstance(m, TiedHeadModule)):
                nn.init.xavier_uniform_(m.weight)

        self.apply(init_weights)

        self.register_buffer("version", torch.LongTensor([1]))
Example #2
    def _make_tail(self, adaptive_inputs=None, tie_proj=False):
        self.tail = nn.ModuleList()
        for i in range(len(self.cutoff) - 1):
            dim = int(self.input_dim // self.factor**(i + 1))

            tied_emb, tied_proj = (adaptive_inputs.weights_for_band(i + 1)
                                   if adaptive_inputs is not None else
                                   (None, None))

            if tied_proj is not None:
                if tie_proj:
                    proj = quant_noise(
                        TiedLinear(tied_proj, transpose=True),
                        self.q_noise,
                        self.qn_block_size,
                    )
                else:
                    proj = quant_noise(
                        nn.Linear(tied_proj.size(0),
                                  tied_proj.size(1),
                                  bias=False),
                        self.q_noise,
                        self.qn_block_size,
                    )
            else:
                proj = quant_noise(
                    nn.Linear(self.input_dim, dim, bias=False),
                    self.q_noise,
                    self.qn_block_size,
                )

            if tied_emb is None:
                out_proj = nn.Linear(dim,
                                     self.cutoff[i + 1] - self.cutoff[i],
                                     bias=False)
            else:
                out_proj = TiedLinear(tied_emb, transpose=False)

            m = nn.Sequential(
                proj,
                nn.Dropout(self.dropout_module.p),
                quant_noise(out_proj, self.q_noise, self.qn_block_size),
            )

            self.tail.append(m)
Example #3
    def __init__(
        self,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = 'relu',
        export: bool = False,
        q_noise: float = 0.0,
        qn_block_size: int = 8,
    ) -> None:

        super().__init__()
        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        # Initialize blocks
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            add_bias_kv=False,
            add_zero_attn=False,
            self_attention=True,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim,
                                              export=export)
        self.fc1 = quant_noise(
            nn.Linear(self.embedding_dim, ffn_embedding_dim), q_noise,
            qn_block_size)
        self.fc2 = quant_noise(
            nn.Linear(ffn_embedding_dim, self.embedding_dim), q_noise,
            qn_block_size)

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)
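For orientation, the signature above matches fairseq's TransformerSentenceEncoderLayer; treating it as that class is an assumption about the snippet's origin. A hedged construction sketch showing how q_noise and qn_block_size propagate into both the attention projections and the FFN:

    # Hypothetical instantiation; fc1, fc2 and the q/k/v/out projections inside
    # MultiheadAttention all end up wrapped in quant_noise with these settings.
    layer = TransformerSentenceEncoderLayer(
        embedding_dim=768,
        ffn_embedding_dim=3072,
        num_attention_heads=8,
        q_noise=0.1,
        qn_block_size=8,
    )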
Example #4
    def _prune_fc_layer(self, remove_index: List[int]):
        new_fc1_weight = []
        new_fc1_bias = []
        for i in range(self.fc1.out_features):
            if i not in remove_index:
                new_fc1_weight.append(self.fc1.weight[i])
                new_fc1_bias.append(self.fc1.bias[i])

        new_fc1_weight = torch.stack(new_fc1_weight).detach()
        new_fc1_weight.requires_grad = True

        new_fc1_bias = torch.stack(new_fc1_bias).detach()
        new_fc1_bias.requires_grad = True

        self.fc1 = quant_noise(
            nn.Linear(self.fc1.in_features,
                      self.fc1.out_features - len(remove_index)),
            p=self.quant_noise,
            block_size=self.quant_noise_block_size,
        )
        self.fc1.weight = torch.nn.Parameter(new_fc1_weight)
        self.fc1.bias = torch.nn.Parameter(new_fc1_bias)

        new_fc2_weight = []
        new_fc2_bias = []
        for i in range(self.fc2.in_features):
            if i not in remove_index:
                new_fc2_weight.append(self.fc2.weight[:, i])

        new_fc2_weight = torch.stack(new_fc2_weight, dim=-1).detach()
        new_fc2_weight.requires_grad = True

        new_fc2_bias = self.fc2.bias.detach()
        new_fc2_bias.requires_grad = True

        self.fc2 = quant_noise(
            nn.Linear(self.fc2.in_features - len(remove_index),
                      self.fc2.out_features),
            p=self.quant_noise,
            block_size=self.quant_noise_block_size,
        )
        self.fc2.weight = torch.nn.Parameter(new_fc2_weight)
        self.fc2.bias = torch.nn.Parameter(new_fc2_bias)
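The pruning above keeps the fc1 output rows that are not listed in remove_index and drops the matching input columns of fc2, so the two layers stay shape-compatible. A self-contained sketch of that bookkeeping on hypothetical dimensions:

    import torch
    import torch.nn as nn

    fc1 = nn.Linear(16, 8)
    fc2 = nn.Linear(8, 16)
    remove_index = [1, 5]  # hidden units to prune (made-up indices)
    keep = [i for i in range(fc1.out_features) if i not in remove_index]

    new_fc1_weight = fc1.weight[keep].detach()     # drop rows of fc1
    new_fc1_bias = fc1.bias[keep].detach()         # and the matching biases
    new_fc2_weight = fc2.weight[:, keep].detach()  # drop the same columns of fc2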
Example #5
    def __init__(self, weights, input_dim, num_classes, q_noise,
                 qn_block_size):
        super().__init__()
        tied_emb, _ = weights
        self.num_words, emb_dim = tied_emb.size()

        self.word_proj = quant_noise(TiedLinear(tied_emb, transpose=False),
                                     q_noise, qn_block_size)
        if input_dim != emb_dim:
            self.word_proj = nn.Sequential(
                quant_noise(nn.Linear(input_dim, emb_dim, bias=False), q_noise,
                            qn_block_size),
                self.word_proj,
            )

        self.class_proj = quant_noise(
            nn.Linear(input_dim, num_classes, bias=False), q_noise,
            qn_block_size)
        self.out_dim = self.num_words + num_classes

        self.register_buffer('_float_tensor', torch.FloatTensor(1))
Example #6
    def __init__(
        self,
        vocab_size: int,
        padding_idx: int,
        initial_dim: int,
        factor: float,
        output_dim: int,
        cutoff: List[int],
        q_noise: float = 0,
        qn_block_size: int = 8,
    ):
        super().__init__()

        if vocab_size > cutoff[-1]:
            cutoff = cutoff + [vocab_size]
        else:
            assert (vocab_size == cutoff[-1]
                    ), "cannot specify cutoff larger than vocab size"

        self.cutoff = cutoff
        self.embedding_dim = output_dim
        self.padding_idx = padding_idx

        self.embeddings = nn.ModuleList()
        for i in range(len(self.cutoff)):
            prev = self.cutoff[i - 1] if i > 0 else 0
            size = self.cutoff[i] - prev
            dim = int(initial_dim // (factor**i))
            seq = nn.Sequential(
                nn.Embedding(size, dim, self.padding_idx),
                quant_noise(nn.Linear(dim, output_dim, bias=False), q_noise,
                            qn_block_size),
            )

            self.embeddings.append(seq)
            self.padding_idx = None
        self.padding_idx = padding_idx

        def init_weights(m):
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1]**-0.5)
                nn.init.constant_(m.weight[padding_idx], 0)
            elif hasattr(m, "weight"):
                nn.init.xavier_uniform_(m.weight)

        self.apply(init_weights)

        self.register_buffer("_float_tensor", torch.FloatTensor(1))
Example #7
    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        tie_kv=True,
        q_noise=0.0,
        qn_block_size=8,
        parallel=True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.parallel = parallel
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.pq_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                   q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.pc_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                   q_noise, qn_block_size)
        if tie_kv:
            self.c_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            self.k_proj = self.v_proj = None
        else:
            self.k_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            self.v_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            self.c_proj = None

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)
        self.reset_parameters()

        self.onnx_trace = False
        self.tpu = False
Example #8
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
        # True when kdim and vdim both default to embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(
            nn.Linear(2 * self.vdim, embed_dim, bias=bias), q_noise,
            qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False
        self.tpu = False
Example #9
    def build_domain_projection(self, input_dim, output_dim, q_noise,
                                qn_block_size):
        return quant_noise(nn.Linear(input_dim, output_dim),
                           p=q_noise,
                           block_size=qn_block_size)
Example #10
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        compressed=1,
        max_seq_len=256,
        shared_kv_compressed=0,
        shared_compress_layer=None,
        freeze_compress=0,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        # used for compress sequence to subsequence
        if shared_compress_layer is None:
            self.compress_seq_len = max_seq_len // compressed
            self.compress_k = nn.Linear(max_seq_len,
                                        self.compress_seq_len,
                                        bias=False)
            if shared_kv_compressed == 0:
                self.compress_v = nn.Linear(max_seq_len,
                                            self.compress_seq_len,
                                            bias=False)
            self.layerwise_sharing = False
        else:
            self.compress_k = shared_compress_layer
            if shared_kv_compressed == 0:
                self.compress_v = shared_compress_layer
            self.layerwise_sharing = True
        self.shared_kv_compressed = shared_kv_compressed

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        if freeze_compress == 1:
            self.compress_k.weight.requires_grad = False
            if shared_kv_compressed == 0:
                self.compress_v.weight.requires_grad = False

        self.onnx_trace = False
        self.tpu = False
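This variant additionally compresses keys and values along the sequence axis before attention, in the spirit of Linformer's low-rank projection (naming it so is an inference from the code, not stated in the snippet): compress_seq_len = max_seq_len // compressed. With hypothetical numbers:

    import torch.nn as nn

    max_seq_len, compressed = 256, 4
    compress_seq_len = max_seq_len // compressed   # 64 retained positions
    compress_k = nn.Linear(max_seq_len, compress_seq_len, bias=False)
    # key/value tensors with 256 positions along the sequence axis are
    # projected down to 64 positions before the attention scores are formed.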
Example #11
    def __init__(self,
                 embed_dim,
                 num_heads,
                 kdim=None,
                 vdim=None,
                 dropout=0.0,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False,
                 self_attention=False,
                 encoder_decoder_attention=False,
                 q_noise=0.0,
                 qn_block_size=8,
                 nblocks=1,
                 top_k_ratio=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.nblocks = nblocks
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        print('heads', num_heads)
        print('num blocks', nblocks)

        if top_k_ratio is None:
            self.sa = None
            print('no topk')
        else:
            top_k = int(top_k_ratio * nblocks * num_heads)
            self.sa = SparseAttention(top_k=top_k)
            print('using topk', top_k)

        self.head_dim = 128  #embed_dim // num_heads

        #assert (
        #    self.head_dim * num_heads == self.embed_dim
        #), "embed_dim must be divisible by num_heads"

        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(
            nn.Linear(self.kdim, self.head_dim * num_heads, bias=bias),
            q_noise, qn_block_size)
        self.v_proj = quant_noise(
            nn.Linear(self.vdim, self.head_dim * num_heads, bias=bias),
            q_noise, qn_block_size)
        self.q_proj = quant_noise(
            nn.Linear(embed_dim, self.head_dim * num_heads, bias=bias),
            q_noise, qn_block_size)

        self.out_proj = quant_noise(
            nn.Linear(self.head_dim * num_heads, embed_dim, bias=bias),
            q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False
        self.tpu = False
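Here head_dim is pinned to 128 regardless of embed_dim, so the q/k/v projections expand to head_dim * num_heads and out_proj maps back to the model width. With hypothetical sizes:

    import torch.nn as nn

    embed_dim, num_heads, head_dim = 512, 8, 128
    inner_dim = head_dim * num_heads        # 1024, independent of embed_dim
    q_proj = nn.Linear(embed_dim, inner_dim)
    out_proj = nn.Linear(inner_dim, embed_dim)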
Example #12
    def __init__(self,
                 embed_dim,
                 num_heads,
                 kdim=None,
                 vdim=None,
                 dropout=0.0,
                 bias=True,
                 add_bias_kv=False,
                 add_zero_attn=False,
                 self_attention=False,
                 encoder_decoder_attention=False,
                 q_noise=0.0,
                 qn_block_size=8,
                 relative_pos_type=None,
                 max_relative_pos=None,
                 heads_share_embeddings=False,
                 add_pos_embeddings_to_values=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False
        self.tpu = False

        self.positional_embedding_layer = None
        self.unmasked_attention = None
        self.add_pos_embeddings_to_values = add_pos_embeddings_to_values
        if relative_pos_type is not None:
            if add_pos_embeddings_to_values:
                self.enable_torch_version = False
            self.unmasked_attention = (relative_pos_type == "unmasked")
            self.heads_share_embeddings = heads_share_embeddings
            self.positional_embedding_layer = RelativePositionalEmbedding(
                max_relative_pos=max_relative_pos,
                num_heads=num_heads,
                embedding_dim=self.head_dim,
                unmasked=self.unmasked_attention,
                heads_share_embeddings=heads_share_embeddings,
                add_to_values=add_pos_embeddings_to_values)
Example #13
    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        return quant_noise(
            GroupLinearLayer(input_dim // self.nb, output_dim // self.nb,
                             self.nb), q_noise, qn_block_size)
Example #14
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        relaxed_attention_weight=0.0,
        q_noise=0.0,
        qn_block_size=8,
        # TODO: pass in config rather than string.
        # config defined in xformers.components.attention.AttentionConfig
        xformers_att_config: Optional[str] = None,
        xformers_blocksparse_layout: Optional[
            torch.Tensor] = None,  # This should be part of the config
        xformers_blocksparse_blocksize: Optional[
            int] = 16,  # This should be part of the config
        positional_embedding=None,
    ):
        super().__init__()

        xformers_att_config = utils.eval_str_dict(xformers_att_config)
        self.use_xformers = xformers_att_config is not None
        if self.use_xformers and not _xformers_available:
            raise ImportError("\n\n  Please install xFormers.")
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention
        self.relaxed_attention_weight = relaxed_attention_weight

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn
        self.beam_size = 1

        self.positional_embedding = positional_embedding
        if (self.positional_embedding is not None
                and not self.positional_embedding.learnable):
            self.pos_bias_u = nn.Parameter(torch.Tensor(embed_dim))
            self.pos_bias_v = nn.Parameter(torch.Tensor(embed_dim))
            self.pos_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=False), q_noise,
                qn_block_size)
        else:
            self.pos_bias_u = self.pos_bias_v = self.pos_proj = None

        self.reset_parameters()

        if self.use_xformers:
            xformers_att_config["dropout"] = xformers_att_config.get(
                "dropout", dropout)
            xformers_att_config["num_heads"] = xformers_att_config.get(
                "num_heads", num_heads)

            if xformers_blocksparse_layout is not None:
                # Could be part of a single config passed only once
                xformers_att_config[
                    "block_size"] = xformers_blocksparse_blocksize
                xformers_att_config["layout"] = xformers_blocksparse_layout
                xformers_att_config["name"] = "blocksparse"

            self.attention = build_attention(xformers_att_config)

        self.onnx_trace = False
        self.skip_embed_dim_check = False
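A hedged usage note for the xformers path: xformers_att_config arrives as a string and is parsed with utils.eval_str_dict, so callers typically pass a dict literal; the attention name below comes from xformers' registry and is an assumption, not something stated in the snippet:

    # Hypothetical config string selecting xformers' scaled-dot-product attention.
    xformers_att_config = '{"name": "scaled_dot_product"}'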
Example #15
    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        return quant_noise(nn.Linear(input_dim, output_dim),
                           p=q_noise,
                           block_size=qn_block_size)
Example #16
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        sigsoftmax=False,
        mix_softmax=False,
        mix_type=-1,
        temperature=1.0,
        pre_drop_mix=False,
        pre_mix=False,
        fix_head_dim=-1,
        use_div_reg=False,
        synth_attn_type='vanilla',
        synth_hidden_dim=-1,
        synth_factor_dim=-1,
        synth_trainable_random=True,
        synth_max_len_seq=-1,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        # fix head dim
        self.fix_head_dim = fix_head_dim
        self.embed_dim2 = embed_dim
        if self.fix_head_dim != -1:
            self.head_dim = fix_head_dim
            print('fix head dim:', self.head_dim)
            self.embed_dim2 = self.head_dim * self.num_heads
        else:
            self.head_dim = embed_dim // num_heads
            assert (self.head_dim * num_heads == self.embed_dim
                    ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(
            nn.Linear(self.kdim, self.embed_dim2, bias=bias), q_noise,
            qn_block_size)
        self.v_proj = quant_noise(
            nn.Linear(self.vdim, self.embed_dim2, bias=bias), q_noise,
            qn_block_size)
        self.q_proj = quant_noise(
            nn.Linear(self.embed_dim, self.embed_dim2, bias=bias), q_noise,
            qn_block_size)

        self.out_proj = quant_noise(
            nn.Linear(self.embed_dim2, self.embed_dim, bias=bias), q_noise,
            qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False
        self.tpu = False

        # custom code
        # div reg
        self.use_div_reg = use_div_reg
        if self.use_div_reg:
            self.div_reg = 0
            print('using div regularization')
        else:
            self.div_reg = None

        # Mix-softmax paras
        self.sigsoftmax = sigsoftmax
        self.mix_softmax = mix_softmax
        self.mix_type = mix_type
        self.temperature = temperature
        self.pre_drop_mix = pre_drop_mix
        self.pre_mix = pre_mix

        for attr in ['sigsoftmax', 'pre_drop_mix', 'pre_mix']:
            if getattr(self, attr) is True:
                print('using {}.'.format(attr))

        if self.mix_softmax:
            print('using mix type: {}, pre drop mix: {}, pre mix: {}.'.format(
                self.mix_type, self.pre_drop_mix, self.pre_mix))
            self.bias = None
            self.stochastic_w = None
            if self.mix_type == 9:  # fixed randn projection (allow negative values)
                self.stochastic_w = torch.randn(self.num_heads, self.num_heads)
                self.stochastic_w = nn.Parameter(self.stochastic_w / (
                    self.stochastic_w.sum(dim=0, keepdim=True) +
                    self.stochastic_w.sum(dim=0, keepdim=True).sign() * 1e-12),
                                                 requires_grad=False)
            elif self.mix_type == 10:  # fixed rand projection
                self.stochastic_w = torch.rand(self.num_heads, self.num_heads)
                self.stochastic_w = nn.Parameter(
                    self.stochastic_w /
                    self.stochastic_w.sum(dim=0, keepdim=True),
                    requires_grad=False)
            elif self.mix_type in [11, 12, 13]:  # data dependent parameter
                self.stochastic_w = nn.Parameter(
                    torch.randn(self.head_dim, self.num_heads) / self.head_dim)
            elif self.mix_type in [14, 15, 16]:
                self.bias = nn.Parameter(
                    torch.eye(self.num_heads, self.num_heads))
                self.stochastic_w = nn.Parameter(
                    torch.randn(self.head_dim, self.num_heads) / self.head_dim)
            elif self.mix_type in [17, 18, 19]:
                self.bias = nn.Parameter(
                    torch.eye(self.num_heads, self.num_heads))
                self.stochastic_w = nn.Parameter(
                    torch.zeros(self.head_dim, self.num_heads))
            else:
                self.stochastic_w = nn.Parameter(
                    torch.eye(self.num_heads, self.num_heads))
            self.r_temperature = 1.0 / self.temperature

        # Synthesizer initializer starts here
        self.synth_attn_type = synth_attn_type
        self.synth_hidden_dim = synth_hidden_dim  # For dense variant
        self.synth_factor_dim = synth_factor_dim  # 0 for not using, else enables factorization
        self.synth_trainable_random = synth_trainable_random  # True for trainable random variant
        self.synth_max_len_seq = synth_max_len_seq
        if self.synth_attn_type == 'vanilla':
            pass
        else:
            print('Synthesizer attn_type: ', self.synth_attn_type,
                  'hidden dim: ', self.synth_hidden_dim, 'factor dim: ',
                  self.synth_factor_dim, 'trainable_random: ',
                  self.synth_trainable_random, 'max_len_seq: ',
                  self.synth_max_len_seq)
            if self.synth_attn_type == 'dense':
                self.synth_attn = DenseAttention(self.synth_max_len_seq,
                                                 self.head_dim,
                                                 self.synth_hidden_dim)
            elif self.synth_attn_type == 'random':
                self.synth_attn = RandomAttention(num_heads,
                                                  self.synth_max_len_seq,
                                                  self.synth_trainable_random)
            elif self.synth_attn_type == 'dense_factorized':
                self.synth_attn = FactorizedDenseAttention(
                    self.synth_max_len_seq, self.head_dim,
                    self.synth_factor_dim)
            elif self.synth_attn_type == 'random_factorized':
                self.synth_attn = FactorizedRandomAttention(
                    num_heads, self.synth_factor_dim, self.synth_max_len_seq,
                    self.synth_trainable_random)
            else:
                print('unknown attn_type: ', self.synth_attn_type)
                exit(1)
Example #17
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        biased_attn_weight=True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None
        self.kk_bias_r = nn.Parameter(torch.zeros(num_heads),
                                      requires_grad=biased_attn_weight)

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False

        self.enable_torch_version = hasattr(F, "multi_head_attention_forward")
Example #18
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        normalized_attention=False,
        normalized_attention_logsoftmax=False,
        normalized_attention_by_entropy=False,
        positional_embeddings_in_attention=False,
        symmetric_kv_context_params=False,
        symmetric_kv_positional_params=False,
        #normalized_attention_by_positional_score=False,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)
        self.positional_embeddings_in_attention = positional_embeddings_in_attention
        self.symmetric_kv_context_params = symmetric_kv_context_params
        self.symmetric_kv_positional_params = symmetric_kv_positional_params
        #self.normalized_attention_by_positional_score=normalized_attention_by_positional_score

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        if self.symmetric_kv_context_params:
            assert self.kdim == embed_dim, (
                "Symmetric context attention requires kdim == embed_dim")
            self.q_proj.weight = self.k_proj.weight

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.normalized_attention = normalized_attention
        self.normalized_attention_logsoftmax = normalized_attention_logsoftmax
        self.normalized_attention_by_entropy = normalized_attention_by_entropy
        if self.normalized_attention:
            self.attention_gain = quant_noise(
                nn.Linear(embed_dim, num_heads, bias=True), q_noise,
                qn_block_size)

        if self.positional_embeddings_in_attention:
            self.pos_k_proj = quant_noise(
                nn.Linear(self.kdim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            self.pos_q_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            if self.symmetric_kv_positional_params:
                assert self.kdim == embed_dim, (
                    "Symmetric positional attention requires kdim == embed_dim"
                )
                self.pos_q_proj.weight = self.pos_k_proj.weight
            self.pos_embeddings = SinusoidalPositionalEmbedding(
                embed_dim, None)

        self.reset_parameters()

        self.onnx_trace = False
Example #19
    def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0,
                 bias=True, add_bias_kv=False, add_zero_attn=False,
                 self_attention=False, encoder_decoder_attention=False,
                 q_noise=0, qn_block_size=8):
        super().__init__(embed_dim, num_heads, kdim=kdim, vdim=vdim,
                         dropout=dropout, bias=bias, add_bias_kv=add_bias_kv,
                         add_zero_attn=add_zero_attn,
                         self_attention=self_attention,
                         encoder_decoder_attention=encoder_decoder_attention,
                         q_noise=q_noise, qn_block_size=qn_block_size)

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=True),
                                  q_noise, qn_block_size)
        self.k_proj.bias = nn.Parameter(
            torch.zeros_like(self.k_proj.bias, requires_grad=False))
Example #20
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        capsule_proj_weight=None,
        capsule_proj_bias=None,
        dynamic_routing_weights=None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        # Added capsule weights
        self.dynamic_routing_weights = dynamic_routing_weights  #[nn.Parameter(torch.ones( self.head_dim, self.num_heads, self.num_heads, device='cuda', dtype= torch.half, requires_grad=True)) for _ in range (0, self.num_heads)]

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False