Example #1
    def __init__(self, heads: int, d_model: int, dropout_prob: float,
                 sigma: DPFP):
        super().__init__()

        # Number of features per head
        self.d_k = d_model // heads
        # Number of heads
        self.heads = heads

        # This transforms the `query` for multi-headed attention.
        self.query = PrepareForMultiHeadAttention(d_model,
                                                  heads,
                                                  self.d_k,
                                                  bias=False)
        # These transform the `key` and `value` for multi-headed attention.
        self.key = PrepareForMultiHeadAttention(d_model,
                                                heads,
                                                self.d_k,
                                                bias=False)
        self.value = PrepareForMultiHeadAttention(d_model,
                                                  heads,
                                                  self.d_k,
                                                  bias=False)

        # Gate (interpolation weight) for each head
        self.gate = nn.Sequential(
            PrepareForMultiHeadAttention(d_model, heads, 1, bias=False),
            nn.Sigmoid())

        # DPFP feature map
        self.sigma = sigma

        # Output layer
        self.output = nn.Linear(d_model, d_model)
        # Dropout
        self.dropout = nn.Dropout(dropout_prob)
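
Every example here relies on `PrepareForMultiHeadAttention` to project the input into per-head features. The following is a minimal sketch consistent with how it is called above, a single linear projection split into `heads` vectors of size `d_k`; the actual class in the project may differ in detail.

import torch
from torch import nn

class PrepareForMultiHeadAttention(nn.Module):
    """Project `[..., d_model]` inputs to `[..., heads, d_k]` (sketch)."""

    def __init__(self, d_model: int, heads: int, d_k: int, bias: bool):
        super().__init__()
        # One linear projection that produces all heads at once
        self.linear = nn.Linear(d_model, heads * d_k, bias=bias)
        self.heads = heads
        self.d_k = d_k

    def forward(self, x: torch.Tensor):
        # Keep the leading dimensions (e.g. sequence and batch) unchanged
        head_shape = x.shape[:-1]
        x = self.linear(x)
        # Split the projected features into separate heads
        return x.view(*head_shape, self.heads, self.d_k)
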
Example #2
File: __init__.py  Project: wx-b/nn
    def __init__(self, heads: int, d_model: int, dropout_prob: float,
                 phi: DPFP):
        super().__init__()

        # Number of features per head $d_k$
        self.d_k = d_model // heads
        # Number of heads
        self.heads = heads

        # These transform the `query`, `key` and `value` for multi-headed attention.
        self.query = PrepareForMultiHeadAttention(d_model,
                                                  heads,
                                                  self.d_k,
                                                  bias=False)
        self.key = PrepareForMultiHeadAttention(d_model,
                                                heads,
                                                self.d_k,
                                                bias=False)
        self.value = PrepareForMultiHeadAttention(d_model,
                                                  heads,
                                                  self.d_k,
                                                  bias=False)

        # Interpolation weight function $\sigma \Big(\color{orange}{W_\beta} x^{(i)} \Big)$ for each head
        self.interpolation_weight = nn.Sequential(
            PrepareForMultiHeadAttention(d_model, heads, 1, bias=False),
            nn.Sigmoid())

        # $\color{lightgreen}{\phi'}$
        self.phi = phi

        # Output layer
        self.output = nn.Linear(d_model, d_model)
        # Dropout
        self.dropout = nn.Dropout(dropout_prob)
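
Here `phi` (and `sigma` in Example #1) is a DPFP feature map, the deterministic parameter-free projection used by fast weight programmers. The sketch below shows one common form, assuming `nu` rolled copies and sum normalization; the `DPFP` class used by the project may differ.

import torch
from torch import nn

class DPFPSketch(nn.Module):
    """Deterministic parameter-free projection feature map (sketch)."""

    def __init__(self, nu: int = 1, eps: float = 1e-6):
        super().__init__()
        self.nu = nu
        self.eps = eps

    def forward(self, k: torch.Tensor):
        # rho(k) = [relu(k), relu(-k)] doubles the feature dimension
        x = torch.relu(torch.cat([k, -k], dim=-1))
        # Element-wise products with `nu` rolled copies give 2 * d_k * nu features
        rolled = torch.cat([x.roll(shifts=i, dims=-1) for i in range(1, self.nu + 1)], dim=-1)
        repeated = torch.cat([x] * self.nu, dim=-1)
        phi = repeated * rolled
        # Normalize so the linear-attention weights sum to one
        return phi / (phi.sum(dim=-1, keepdim=True) + self.eps)
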
Example #3
    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1):
        super().__init__()

        self.d_k = d_model // heads
        self.heads = heads

        # These transform the `query`, `key` and `value` vectors for multi-headed attention.
        self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, False)
        self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, False)
        self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, False)

        # Output layer
        self.output = nn.Linear(d_model, d_model)
        # Dropout
        self.dropout = nn.Dropout(dropout_prob)
        # Scaling factor before the softmax
        self.scale = 1 / math.sqrt(self.d_k)

        # Softmax for attention along the time dimension of `key`
        self.softmax = nn.Softmax(dim=0)

        # Number of relative positions
        self.P = 2 ** 12

        # Relative positional embeddings for key relative to the query.
        self.key_pos_embeddings = nn.Parameter(torch.zeros((self.P, heads, self.d_k)), requires_grad=True)
        # Relative positional embedding bias for key relative to the query.
        self.key_pos_bias = nn.Parameter(torch.zeros((self.P, heads)), requires_grad=True)
        # Positional bias for the query, which is independent of the position of the query
        self.query_pos_bias = nn.Parameter(torch.zeros((heads, self.d_k)), requires_grad=True)

        # We store the attention weights so that they can be used for logging or other computations if needed
        self.attn = None
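
The relative positional parameters above enter the attention scores in the Transformer-XL style: a content term built from the query plus a global query bias, and a position term built from the relative key embeddings plus a global key-position bias. A simplified sketch of that combination, omitting the relative-shift indexing and the slicing of the `P` positions, assuming `query` and `key` shaped `[seq_len, batch, heads, d_k]`:

import torch

def relative_attention_scores(query, key, key_pos_emb, key_pos_bias, query_pos_bias):
    # query:           [q_len, batch, heads, d_k]
    # key:             [k_len, batch, heads, d_k]
    # key_pos_emb:     [k_len, heads, d_k]
    # key_pos_bias:    [k_len, heads]
    # query_pos_bias:  [heads, d_k]
    # Content-based score plus a bias that is independent of the query position
    ac = torch.einsum('ibhd,jbhd->ijbh', query + query_pos_bias[None, None], key)
    # Position-based score plus a global positional bias
    bd = torch.einsum('ibhd,jhd->ijbh', query, key_pos_emb) + key_pos_bias[None, :, None, :]
    return ac + bd
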
Example #4
File: __init__.py  Project: wx-b/nn
    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, *,
                 is_kv_precomputed: bool = False):
        """
        * `heads` is the number of attention heads
        * `d_model` is the number of features in the transformer
        * `dropout_prob` is the attention dropout probability
        * `is_kv_precomputed` is whether the key and value tensors are already computed
        """

        super().__init__()

        # Number of features per head
        self.d_k = d_model // heads
        # Number of heads
        self.heads = heads

        # This transforms the `query` for multi-headed attention.
        self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False)
        # These transform the `key` and `value` for multi-headed attention.
        if not is_kv_precomputed:
            self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False)
            self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=True)
        # Keys and values are already calculated
        else:
            self.key = None
            self.value = None

        # Output layer
        self.output = nn.Linear(d_model, d_model)
        # Dropout
        self.dropout = nn.Dropout(dropout_prob)
        # Scaling factor before the softmax
        self.scale = 1 / math.sqrt(self.d_k)

        # Softmax for attention along the time dimension of `key`
        self.softmax = nn.Softmax(dim=0)

        # Number of relative positions
        self.P = 2 ** 12

        # Relative positional embeddings for key relative to the query.
        self.key_pos_embeddings = nn.Parameter(torch.zeros((self.P, heads, self.d_k)), requires_grad=True)
        # Relative positional embedding bias for key relative to the query.
        self.key_pos_bias = nn.Parameter(torch.zeros((self.P, heads)), requires_grad=True)
        # Positional bias for the query, which is independent of the position of the query
        self.query_pos_bias = nn.Parameter(torch.zeros((heads, self.d_k)), requires_grad=True)

        # We store the attention weights so that they can be used for logging or other computations if needed
        self.attn = None
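
When `is_kv_precomputed` is true, `self.key` and `self.value` stay `None` and the key/value tensors handed to the attention call are used as-is. A hypothetical helper illustrating that branch; the name and signature are illustrative, not the library's API:

def project_key_value(attn, key, value):
    # Project only when the module owns key/value transformations;
    # otherwise the caller has already computed them.
    k = attn.key(key) if attn.key is not None else key
    v = attn.value(value) if attn.value is not None else value
    return k, v
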
Example #5
    def __init__(self, layer: FeedbackTransformerLayer, n_layers: int,
                 d_model: int, heads: int):
        """
        * `layer` is the feedback transformer layer, which we clone for each layer
        * `n_layers` is the number of layers in the transformer
        * `d_model` is the number of features in the transformer
        * `heads` is the number of attention heads
        """

        super().__init__()
        # Make copies of the transformer layer
        self.layers = clone_module_list(layer, n_layers)
        # Final normalization layer
        self.norm = nn.LayerNorm([layer.size])
        # Memory vectors are computed as a weighted sum of representations of each layer.
        # This is the weights parameter for that.
        self.weights = nn.Parameter(torch.ones(n_layers + 1),
                                    requires_grad=True)
        # Softmax for weights before taking the weighted sum
        self.softmax = nn.Softmax(0)

        # Number of features in a head
        d_k = d_model // heads
        # Module to transform embeddings (memory) to get keys
        self.key = PrepareForMultiHeadAttention(d_model,
                                                heads,
                                                d_k,
                                                bias=False)
        # Module to transform embeddings (memory) to get values
        self.value = PrepareForMultiHeadAttention(d_model,
                                                  heads,
                                                  d_k,
                                                  bias=False)

        # Memory for stacked keys
        self.mem_key = Stack(512)
        # Memory for stacked values
        self.mem_value = Stack(512)
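
The `weights` parameter and softmax above combine the input embedding and every layer's output into a single memory vector per step. A minimal sketch of that weighted sum, assuming one tensor of shape `[batch, d_model]` per layer (including the embedding):

import torch

def compute_memory(layer_outputs, weights, softmax):
    # layer_outputs: list of n_layers + 1 tensors, each [batch, d_model]
    w = softmax(weights)                  # [n_layers + 1]
    stacked = torch.stack(layer_outputs)  # [n_layers + 1, batch, d_model]
    # Weighted sum over the layer dimension
    return torch.einsum('l,lbd->bd', w, stacked)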