Example No. 1
  def __init__(self,
               num_heads,
               num_units,
               dropout=0.1,
               return_attention=False,
               **kwargs):
    """Initializes this layer.

    Args:
      num_heads: The number of attention heads.
      num_units: The number of hidden units.
      dropout: The probability to drop units from the inputs.
      return_attention: If ``True``, also return the attention weights of the
        first head.
      kwargs: Additional layer arguments.
    """
    super(MultiHeadAttention, self).__init__(**kwargs)
    if num_units % num_heads != 0:
      raise ValueError("Multi head attention requires that num_units is a"
                       " multiple of %s" % num_heads)
    self.num_heads = num_heads
    self.num_units = num_units
    self.linear_queries = common.Dense(num_units)
    self.linear_keys = common.Dense(num_units)
    self.linear_values = common.Dense(num_units)
    self.linear_output = common.Dense(num_units)
    self.dropout = dropout
    self.return_attention = return_attention
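
A minimal construction sketch based only on the constructor above; `MultiHeadAttention` is the class this `__init__` belongs to, and the values 8 and 512 are illustrative:

# num_units must be divisible by num_heads, otherwise __init__ raises ValueError.
attention = MultiHeadAttention(num_heads=8, num_units=512, dropout=0.1)
# MultiHeadAttention(num_heads=8, num_units=500)  # would raise ValueError (500 % 8 != 0)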
Example No. 2
    def __init__(self,
                 num_heads,
                 num_units,
                 dropout=0.1,
                 return_attention=False,
                 maximum_relative_position=None,
                 **kwargs):
        """Initializes this layer.

        Args:
          num_heads: The number of attention heads.
          num_units: The number of hidden units.
          dropout: The probability to drop units from the inputs.
          return_attention: If ``True``, also return the attention weights of the
            first head.
          maximum_relative_position: Maximum relative position representation
            (from https://arxiv.org/abs/1803.02155).
          kwargs: Additional layer arguments.
        """
        super(MultiHeadAttention, self).__init__(**kwargs)
        if num_units % num_heads != 0:
            raise ValueError(
                "Multi head attention requires that num_units is a"
                " multiple of %s" % num_heads)
        self.num_heads = num_heads
        self.num_units_per_head = num_units // num_heads
        self.linear_queries = common.Dense(num_units)
        self.linear_keys = common.Dense(num_units)
        self.linear_values = common.Dense(num_units)
        self.linear_output = common.Dense(num_units)
        self.dropout = dropout
        self.return_attention = return_attention
        self.maximum_relative_position = maximum_relative_position
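
As above, a hedged construction sketch; the window size of 20 is illustrative:

# Same divisibility constraint; maximum_relative_position caps the relative
# position representations (https://arxiv.org/abs/1803.02155) at 20 positions.
attention = MultiHeadAttention(num_heads=8, num_units=512, maximum_relative_position=20)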
Example No. 3
  def testDense(self, weight_shape, input_shape, transpose):
    weight = tf.zeros(weight_shape)
    layer = common.Dense(10, weight=weight, transpose=transpose)
    x = tf.ones(input_shape)
    y = layer(x)
    self.assertEqual(layer.kernel.experimental_ref(), weight.experimental_ref())
    self.assertEqual(self.evaluate(tf.reduce_sum(y)), 0)
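
The test above asserts that `common.Dense` reuses the provided `weight` tensor as its kernel (same reference) instead of creating a new one. A hypothetical weight-tying sketch built on that behavior, assuming `transpose=True` applies the kernel transposed; the shapes and names are illustrative:

# Tie the output projection to an existing [vocab_size, depth] embedding matrix.
embedding = tf.Variable(tf.random.normal([32000, 512]))
output_layer = common.Dense(32000, weight=embedding, transpose=True)
logits = output_layer(tf.random.normal([4, 512]))  # expected shape: [4, 32000]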
Example No. 4
    def __init__(self,
                 num_heads,
                 num_units,
                 dropout=0.1,
                 return_attention=False,
                 maximum_relative_position=None,
                 attention_span=None,
                 num_attended_heads=1,
                 **kwargs):
        """Initializes this layer.

        Args:
          num_heads: The number of attention heads.
          num_units: The number of hidden units.
          dropout: The probability to drop units from the inputs.
          return_attention: If ``True``, also return the attention weights of the
            first head.
          maximum_relative_position: Maximum relative position representation
            (from https://arxiv.org/abs/1803.02155).
          attention_span: Maximum relative position to attend to
            (from https://arxiv.org/abs/1904.03107).
          num_attended_heads: How many heads should be attended. Defaults to 1
            as each head only attends to itself in the vanilla Transformer. Increase
            to an odd number < `num_heads` to also model head interaction
            (from https://arxiv.org/abs/1904.03107).
          kwargs: Additional layer arguments.
        """
        super(MultiHeadAttention, self).__init__(**kwargs)
        if num_units % num_heads != 0:
            raise ValueError(
                "Multi head attention requires that num_units is a"
                " multiple of %s" % num_heads)
        self.num_heads = num_heads
        self.num_units_per_head = num_units // num_heads
        self.linear_queries = common.Dense(num_units)
        self.linear_keys = common.Dense(num_units)
        self.linear_values = common.Dense(num_units)
        self.linear_output = common.Dense(num_units)
        self.dropout = dropout
        self.return_attention = return_attention
        self.maximum_relative_position = maximum_relative_position
        self.attention_span = attention_span
        if num_attended_heads % 2 == 0:
            raise ValueError(
                "Number heads attended must be odd to guarantee symmetry.")
        self.num_attended_heads = num_attended_heads
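
A hedged construction sketch for this extended constructor; the concrete values are illustrative:

# attention_span limits how far each position may attend, and num_attended_heads
# must be odd (checked in __init__) so the attended heads stay symmetric around
# each head (https://arxiv.org/abs/1904.03107).
attention = MultiHeadAttention(
    num_heads=8, num_units=512, attention_span=16, num_attended_heads=3)
# num_attended_heads=2 would raise ValueError.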
Example No. 5
    def __init__(
        self, inner_dim, output_dim, dropout=0.1, activation=tf.nn.relu, **kwargs
    ):
        """Initializes this layer.

        Args:
          inner_dim: The number of units of the inner linear transformation.
          output_dim: The number of units of the output linear transformation.
          dropout: The probability to drop units from the activation output.
          activation: The activation function to apply between the two linear
            transformations.
          kwargs: Additional layer arguments.
        """
        super().__init__(**kwargs)
        self.inner = common.Dense(inner_dim, activation=activation)
        self.outer = common.Dense(output_dim)
        self.dropout = dropout
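
A hedged construction sketch; `FeedForwardNetwork` is an assumed name for the class this `__init__` belongs to, and 2048/512 are the usual Transformer base dimensions, used here only as an illustration:

# Position-wise feed-forward block: inner projection with ReLU, dropout applied
# to its output, then the output projection back to the model dimension.
ffn = FeedForwardNetwork(inner_dim=2048, output_dim=512, dropout=0.1)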
Example No. 6
 def _add_attention(cell):
     # Produce Luong-style attentional hidden states.
     attention_layer = common.Dense(
         cell.output_size, use_bias=False, activation=attention_layer_activation
     )
     wrapper = tfa.seq2seq.AttentionWrapper(
         cell, self.attention_mechanism, attention_layer=attention_layer
     )
     return wrapper
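
A hedged sketch of the surrounding setup this helper assumes (an attention mechanism and a cell to wrap); the encoder outputs, dimensions, and tanh activation are illustrative assumptions, not taken from the example:

import tensorflow as tf
import tensorflow_addons as tfa

encoder_outputs = tf.random.normal([8, 20, 512])  # placeholder encoder states
attention_mechanism = tfa.seq2seq.LuongAttention(512, memory=encoder_outputs)
cell = tf.keras.layers.LSTMCell(512)
attention_layer = common.Dense(512, use_bias=False, activation=tf.math.tanh)
attended_cell = tfa.seq2seq.AttentionWrapper(
    cell, attention_mechanism, attention_layer=attention_layer)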
Example No. 7
    def initialize(self, vocab_size=None, output_layer=None):
        """Initializes the decoder configuration.

        Args:
          vocab_size: The target vocabulary size.
          output_layer: The output layer to use.

        Raises:
          ValueError: if neither :obj:`vocab_size` nor :obj:`output_layer` is set.
        """
        if output_layer is not None:
            self.output_layer = output_layer
        else:
            if vocab_size is None:
                raise ValueError(
                    "One of vocab_size and output_layer must be set")
            self.output_layer = common.Dense(vocab_size)
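
A hedged usage sketch; `decoder` stands for an instance of the class this method belongs to, and 32000 is an illustrative vocabulary size:

decoder.initialize(vocab_size=32000)            # builds common.Dense(32000) internally
# or supply a ready-made projection instead:
decoder.initialize(output_layer=common.Dense(32000))
# calling it with neither argument raises ValueError.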