Example #1
def attention(*args, **kwargs):
    # number of input positions to remember in a cache when doing fast inference.
    kwargs['predict_mem_len'] = 120
    # number of input elements to drop once the fast inference input cache fills up.
    kwargs['predict_drop_len'] = 120
    # return the attention layer with the parameters defined above
    return tl.SelfAttention(*args, **kwargs)
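A minimal usage sketch (not part of the original example): a wrapper like this is typically passed as the attention constructor when building a Reformer language model for fast decoding. The model and hyperparameter values below are illustrative assumptions.

import trax

# Hypothetical usage: hand the wrapper above to a Reformer LM as its attention
# constructor; vocabulary size and depth are placeholder values.
model = trax.models.reformer.ReformerLM(
    vocab_size=33000,
    n_layers=6,
    mode='predict',              # fast, one-token-at-a-time inference
    attention_type=attention,    # the wrapper defined above
)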
Example #2
def EncoderDecoderBlock(d_model, d_ff, n_heads, dropout, ff_activation,
                        ff_dropout, mode, ff_use_sru=0, ff_chunk_size=0,
                        ff_sparsity=0):
  """Reversible transformer decoder layer.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    ff_dropout: float: (optional) separate dropout rate for feed-forward layer
    mode: str: 'train' or 'eval'
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    ff_sparsity: int, if > 0 use sparse feed-forward block with this sparsity

  Returns:
    A list of layers that implements the reversible decoder block.
  """
  enc_dec_attention = tl.EncDecAttention(
      n_heads=n_heads, d_qk=d_model//n_heads, d_v=d_model//n_heads,
      attention_dropout=dropout, output_dropout=dropout,
      mode=mode)
  enc_dec_attention_half_residual = tl.ReversibleHalfResidual(
      tl.LayerNorm(),
      attention_layer=enc_dec_attention,
  )

  causal_attention = tl.SelfAttention(
      n_heads=n_heads, d_qk=d_model//n_heads, d_v=d_model//n_heads,
      causal=True,
      attention_dropout=dropout, output_dropout=dropout,
      mode=mode)
  causal_attention_half_residual = tl.ReversibleHalfResidual(
      tl.LayerNorm(),
      attention_layer=causal_attention,
  )

  feed_forward = ct.FeedForwardWithOptions(
      d_model, d_ff, dropout, [-2], ff_activation, ff_dropout,
      ff_chunk_size, ff_use_sru, ff_sparsity, mode)

  return [                             # vec_d1 vec_d2 vec_e masks
      causal_attention_half_residual,
      tl.ReversibleSwap(),
      enc_dec_attention_half_residual,
      tl.ReversibleSwap(),
      tl.ReversibleHalfResidual(feed_forward),
      tl.ReversibleSwap(),
  ]
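A short sketch of how a layer list like this is typically consumed: several blocks are stacked and wrapped in a reversible combinator, with the duplicated decoder activations, encoder vectors and masks already on the stack. The hyperparameter values and the choice of tl.Relu below are placeholders, not part of the example.

from trax import layers as tl  # as assumed by the example above

# Hypothetical assembly: build a reversible stack out of two decoder blocks.
decoder_blocks = [
    EncoderDecoderBlock(d_model=512, d_ff=2048, n_heads=8, dropout=0.1,
                        ff_activation=tl.Relu, ff_dropout=0.1, mode='train')
    for _ in range(2)
]
reversible_decoder = tl.ReversibleSerial(decoder_blocks)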
Example #3
def EncoderDecoderBlock(d_model, d_ff, n_heads, dropout, ff_activation,
                        ff_dropout, mode):
    """Reversible transformer decoder layer.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    ff_dropout: float: (optional) separate dropout rate for feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that implements the reversible decoder block.
  """
    enc_dec_attention = tl.EncDecAttention(n_heads=n_heads,
                                           d_qk=d_model // n_heads,
                                           d_v=d_model // n_heads,
                                           attention_dropout=dropout,
                                           output_dropout=dropout,
                                           mode=mode)
    enc_dec_attention_half_residual = ReversibleHalfResidualV2(
        tl.LayerNorm(),
        attention_layer=enc_dec_attention,
    )

    causal_attention = tl.SelfAttention(n_heads=n_heads,
                                        d_qk=d_model // n_heads,
                                        d_v=d_model // n_heads,
                                        causal=True,
                                        attention_dropout=dropout,
                                        output_dropout=dropout,
                                        mode=mode)
    causal_attention_half_residual = ReversibleHalfResidualV2(
        tl.LayerNorm(),
        attention_layer=causal_attention,
    )

    feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation,
                               ff_dropout, mode)

    return [  # vec_d1 vec_d2 vec_e masks
        causal_attention_half_residual,
        tl.ReversibleSwap(),
        enc_dec_attention_half_residual,
        tl.ReversibleSwap(),
        ReversibleHalfResidualV2(feed_forward),
        tl.ReversibleSwap(),
    ]
Example #4
def RecommenderTransformer(n_classes_in, embedding_size, n_out_classes,
                           dropout_rate):
    transformer = tl.Serial(
        tl.Embedding(n_classes_in, d_feature=embedding_size),
        tl.Dropout(dropout_rate),
        tl.SelfAttention(2),          # 2 attention heads
        tl.Flatten(),
        tl.Dropout(dropout_rate),
        #tl.DotProductCausalAttention(4),
        tl.Dense(n_out_classes),
        tl.LogSoftmax())

    print(str(transformer))
    return transformer
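A hypothetical initialization-and-forward sketch for the model above, assuming the standard Trax init/call API; the batch shape, hyperparameter values, and explicit RNG are placeholders.

import numpy as np
from trax import fastmath, shapes

# Hypothetical usage: initialize with a dummy batch of item ids and run one
# forward pass (shapes and values are placeholders).
model = RecommenderTransformer(n_classes_in=1000, embedding_size=32,
                               n_out_classes=10, dropout_rate=0.1)
batch = np.zeros((2, 16), dtype=np.int32)        # (batch_size, sequence_length)
model.init(shapes.signature(batch))
log_probs = model(batch, rng=fastmath.random.get_prng(0))  # shape (2, n_out_classes)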
Example #5
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, ff_dropout,
                 mode):
    """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    ff_dropout: the dropout rate in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
    if mode == 'predict':
        # Mode 'predict' means that the decoder should be run one token at a time.
        # The encoder only ever runs over full sequences, which is why it's switched
        # to 'eval' mode instead.
        mode = 'eval'

    attention = tl.SelfAttention(n_heads=n_heads,
                                 d_qk=d_model // n_heads,
                                 d_v=d_model // n_heads,
                                 masked=True,
                                 attention_dropout=dropout,
                                 output_dropout=dropout,
                                 mode=mode)
    attention_half_residual = ReversibleHalfResidualV2(
        tl.LayerNorm(),
        attention_layer=attention,
    )

    feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation,
                               ff_dropout, mode)

    return [
        attention_half_residual,
        tl.ReversibleSwap(),
        ReversibleHalfResidualV2(feed_forward),
        tl.ReversibleSwap(),
    ]
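A sketch of one common way such encoder blocks are assembled: duplicate the activations, run the blocks inside a reversible combinator, then merge the two streams. Hyperparameter values and the use of tl.Relu are placeholders, not part of the example.

from trax import layers as tl  # as assumed by the example above

# Hypothetical assembly (placeholder hyperparameters).
encoder_blocks = [
    EncoderBlock(d_model=256, d_ff=512, n_heads=4, dropout=0.1,
                 ff_activation=tl.Relu, ff_dropout=0.1, mode='train')
    for _ in range(3)
]
encoder = tl.Serial(
    tl.Dup(),                                    # (act, mask) -> (act, act, mask)
    tl.ReversibleSerial(encoder_blocks),
    tl.Fn('XYAvg', lambda x, y: (x + y) / 2.0),  # merge the two reversible halves
    tl.LayerNorm(),
)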
Example #6
def EncoderBlock(d_model, d_ff, n_heads, dropout, ff_activation, mode):
  """Returns a list of layers that implements a Reformer encoder block.

  The input to the layer is a pair, (activations, mask), where the mask was
  created from the original source tokens to prevent attending to the padding
  part of the input.

  Args:
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    ff_activation: the non-linearity in feed-forward layer
    mode: str: 'train' or 'eval'

  Returns:
    A list of layers that maps (activations, mask) to (activations, mask).
  """
  attention = tl.SelfAttention(
      n_heads=n_heads, d_qk=d_model//n_heads, d_v=d_model//n_heads,
      masked=True,
      attention_dropout=0.0,  # TODO(kitaev): attention dropout
      mode=mode)
  attention_half_residual = ReversibleHalfResidualV2(
      tl.LayerNorm(),
      attention_layer=attention,
      # TODO(kitaev): add output dropout to attention layer. rate=dropout
  )

  # TODO(kitaev): Switch to FeedForward with BroadcastedDropout?
  feed_forward = transformer._FeedForwardBlock(  # pylint: disable=protected-access
      d_model, d_ff, dropout, -1, mode, ff_activation)
  # feed_forward = FeedForward(d_model, d_ff, dropout, ff_activation, mode)

  return [
      attention_half_residual,
      tl.ReversibleSwap(),
      ReversibleHalfResidualV2(feed_forward),
      tl.ReversibleSwap(),
  ]
Example #7
def BERT(d_model=768,
         vocab_size=30522,
         max_len=512,
         type_vocab_size=2,
         n_heads=12,
         d_ff=3072,
         n_layers=12,
         head=None,
         init_checkpoint=None,
         mode='eval',
        ):
  """BERT (default hparams are for bert-base-uncased)."""
  layer_norm_eps = 1e-12
  d_head = d_model // n_heads

  word_embeddings = tl.Embedding(d_model, vocab_size)
  type_embeddings = tl.Embedding(d_model, type_vocab_size)
  position_embeddings = tl.PositionalEncoding(max_len, mode=mode)
  embeddings = [
      tl.Select([0, 1, 0], n_in=3),  # Drops 'idx' input.
      tl.Parallel(
          word_embeddings,
          type_embeddings,
          [tl.PaddingMask(),
           tl.Fn('Squeeze', lambda x: np.squeeze(x, (1, 2)), n_out=1)]
      ),
      tl.Add(),
      position_embeddings,
      tl.LayerNorm(epsilon=layer_norm_eps),
  ]

  encoder = []
  for _ in range(n_layers):
    attn = tl.SelfAttention(n_heads=n_heads, d_qk=d_head, d_v=d_head,
                            bias=True, masked=True, mode=mode)
    feed_forward = [
        tl.Dense(d_ff),
        tl.Gelu(),
        tl.Dense(d_model)
    ]
    encoder += [
        tl.Select([0, 1, 1]),  # Save a copy of the mask
        tl.Residual(attn, AddBias()),  # pylint: disable=no-value-for-parameter
        tl.LayerNorm(epsilon=layer_norm_eps),
        tl.Residual(*feed_forward),
        tl.LayerNorm(epsilon=layer_norm_eps),
    ]

  encoder += [tl.Select([0], n_in=2)]  # Drop the mask

  pooler = [
      tl.Fn('', lambda x: (x[:, 0, :], x), n_out=2),
      tl.Dense(d_model),
      tl.Tanh(),
  ]

  init_checkpoint = init_checkpoint if mode == 'train' else None
  bert = PretrainedBERT(
      embeddings + encoder + pooler, init_checkpoint=init_checkpoint)

  if head is not None:
    bert = tl.Serial(bert, head())

  return bert
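A hypothetical usage sketch: attaching a simple classification head. The head below is an illustrative assumption (the model leaves the pooled [CLS] vector on top of the per-token activations), not part of the code above.

from trax import layers as tl  # as assumed by the example above

def ClassificationHead(n_classes=2):
  # Keep the pooled vector, drop the per-token activations, then classify.
  return tl.Serial(
      tl.Select([0], n_in=2),
      tl.Dense(n_classes),
      tl.LogSoftmax(),
  )

model = BERT(head=ClassificationHead, mode='train')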
Example #8
def attention(*args, **kwargs):
    kwargs['predict_mem_len'] = 120  # max length for predictions
    kwargs['predict_drop_len'] = 120  # never drop old stuff
    return tl.SelfAttention(*args, **kwargs)