Example #1
 def call(self,
          inputs,
          training=None,
          sample_shape=(),
          projection=True,
          prior=None):
     # the projection by the Dense layer can be skipped by setting projection=False
     # NOTE: 2D inputs are important here, but we don't want to flatten
     # them automatically
     if projection and not self._disable_projection:
         params = super().call(inputs)
     else:
         params = inputs
     # applying dropout
     if self._dropout > 0:
         params = bk.dropout(params,
                             p_drop=self._dropout,
                             training=training)
     # create the posterior distribution (this will create a new layer every time)
     posterior = self.posterior_layer(sample_shape=sample_shape)(
         params, training=training)
     self._last_distribution = posterior
     # NOTE: every distribution already has a kl_divergence method,
     # so we cannot use that name for the attribute
     prior = self.prior if prior is None else prior
     posterior.KL_divergence = KLdivergence(
         posterior, prior=prior,
         sample_shape=None)  # None means reuse the samples here
     assert not hasattr(posterior,
                        'prior'), "Cannot assign prior to the output"
     posterior.prior = prior
     return posterior
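
The pattern above attaches the KL term to the returned distribution under a new attribute (KL_divergence), because every distribution already exposes a kl_divergence method. Below is a minimal sketch of the same idea written directly with TensorFlow Probability, bypassing the odin wrappers; the Normal posterior/prior, the layer sizes, and the concrete (non-lazy) KL tensor are assumptions made for this sketch:

  import tensorflow as tf
  import tensorflow_probability as tfp

  tfd = tfp.distributions
  units = 8                                          # assumed latent dimensionality

  params_layer = tf.keras.layers.Dense(2 * units)    # the projection step
  prior = tfd.Normal(loc=tf.zeros(units), scale=1.)

  x = tf.random.normal((4, 16))                      # 2D inputs, as required above
  params = params_layer(x)
  posterior = tfd.Normal(loc=params[..., :units],
                         scale=tf.nn.softplus(params[..., units:]))
  # attach the KL term under a new name, mirroring call(); here it is a plain
  # tensor, whereas the KLdivergence object in the source is evaluated lazily
  posterior.KL_divergence = tfd.kl_divergence(posterior, prior)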
Example #2
 def call(self,
          inputs,
          training=None,
          mask=None,
          sample_shape=(),
          projection=None,
          prior=None):
   # the projection by the Dense layer can be skipped by setting projection=False
   # NOTE: 2D inputs are important here, but we don't want to flatten
   # them automatically
   params = inputs
   if projection is None:
     projection = self.projection
   else:
     projection = self.projection and projection
   # do not use tf.cond here, it infers the wrong shape when trying to build
   # the layer in Graph mode.
   if projection:
     params = super().call(params)
   # applying dropout
   if self._dropout > 0:
     params = bk.dropout(params, p_drop=self._dropout, training=training)
   # create posterior distribution
   self._posterior_sample_shape = sample_shape
   posterior = self.posterior_layer(params, training=training)
   self._most_recent_distribution = posterior
   # NOTE: every distribution already has a kl_divergence method,
   # so we cannot use that name for the attribute
   prior = self.prior if prior is None else prior
   posterior.KL_divergence = KLdivergence(
       posterior, prior=prior,
       sample_shape=None)  # None means reuse the sampled data here
   assert not hasattr(posterior, 'prior'), "Cannot assign prior to the output"
   posterior.prior = prior
   return posterior
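
In this variant the call-time projection flag is merged with the layer-level setting so that it can only disable the projection, never force it on, and the branch is taken with a plain Python if so that Keras can infer a static output shape while building the layer. Here is a standalone sketch of that resolution logic; resolve_projection is a name invented for the sketch:

  def resolve_projection(layer_projection, call_projection):
      # None at call time means "use the layer-level setting"; otherwise the
      # call-time flag can only switch the projection off, never force it on
      if call_projection is None:
          return layer_projection
      return layer_projection and call_projection

  assert resolve_projection(True, None) is True
  assert resolve_projection(True, False) is False
  assert resolve_projection(False, True) is False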
Example #3
 def call(self,
          inputs,
          training=None,
          sample_shape=(),
          projection=None,
          **kwargs):
   ## NOTE: 2D inputs are important here, but we don't want to flatten
   # them automatically
   if self.flatten_inputs:
     inputs = tf.reshape(inputs, (tf.shape(inputs)[0], -1))
   params = inputs
   ## do not use tf.cond here, it infers the wrong shape when
   # trying to build the layer in Graph mode.
   projection = projection if projection is not None else self.projection
   if projection:
     params = self._dense(params)
     if self.autoregressive:
       params = tf.concat(tf.unstack(params, axis=-1), axis=-1)
   ## applying dropout
   if self._dropout > 0:
     params = bk.dropout(params, p_drop=self._dropout, training=training)
   ## create posterior distribution
   self._posterior_sample_shape = sample_shape
   kw = dict()
   if 'training' in self._posterior_call_kw:
     kw['training'] = training
   if 'sample_shape' in self._posterior_call_kw:
     kw['sample_shape'] = sample_shape
   for k, v in kwargs.items():
     if k in self._posterior_call_kw:
       kw[k] = v
   posterior = self.posterior_layer(params, **kw)
   # tensorflow tries to serialize the distribution, which raises an exception
   # when saving the graph; to avoid this, store it without dependency tracking.
   with trackable.no_automatic_dependency_tracking_scope(self):
     # self._no_dependency
     self._most_recently_built_distribution = posterior
   ## NOTE: every distribution already has a kl_divergence method,
   # so we cannot use that name for the attribute
   posterior.KL_divergence = KLdivergence(
     posterior, prior=self.prior,
      sample_shape=None)  # None means reuse the sampled data here
   return posterior
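
Example #3 forwards only those keyword arguments that the posterior layer actually accepts (the role of _posterior_call_kw). Below is a hypothetical standalone version of that filter built on the standard library; filter_call_kwargs and the stand-in posterior_layer are names invented for this sketch, and the odin implementation may collect the accepted names differently:

  import inspect

  def filter_call_kwargs(fn, **kwargs):
      # keep only the keyword arguments that appear in fn's signature
      accepted = set(inspect.signature(fn).parameters)
      return {k: v for k, v in kwargs.items() if k in accepted}

  def posterior_layer(params, training=None, sample_shape=()):
      return params  # stand-in for the real distribution-producing layer

  kw = filter_call_kwargs(posterior_layer, training=True,
                          sample_shape=(3,), mask=None)
  assert set(kw) == {'training', 'sample_shape'}   # 'mask' is dropped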
Example #4
    def align(self,
              scores,
              value,
              query=None,
              q_mask=None,
              v_mask=None,
              causal=False,
              residual=False,
              dropout=0,
              temporal_dropout=False,
              sample_shape=1,
              temperature=0.5,
              training=None):
        r"""Applies attention scores to the given value tensor.

    Arguments:
      scores: Attention Scores float tensor of shape
        `[num_heads, batch_size, Tq, Tv]`.
      value: Value (or source sequence) tensor of shape
        `[num_heads, batch_size, Tv, dim]`.
      query: Query (or target sequence) tensor of shape
        `[num_heads, batch_size, Tq, dim]`.
      q_mask: A boolean query mask `Tensor` of shape `[batch_size, Tq]`.
        If given, the output will be zero at the positions where
        `mask==False`.
      v_mask: A boolean value mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
      dropout: Float. Dropout probability applied to the attention scores.
      temporal_dropout: Boolean. If `True`, use the same dropout mask along the
        temporal axis (i.e. the first dimension).
      sample_shape: Integer. Number of MCMC samples for estimating the gradient
        of hard attention.
      temperature: A 0-D `Tensor`, representing the temperature
        of a set of RelaxedOneHotCategorical distributions. The temperature
        should be positive.

    Returns:
      attended sequence: Tensor of shape
        * `[sample_shape, num_heads, batch_size, Tq, dim]` for (hard + multi-head)
        * `[sample_shape, batch_size, Tq, dim]` for (hard + no-head)
        * `[num_heads, batch_size, Tq, dim]` for (soft + multi-head)
        * `[batch_size, Tq, dim]` for (soft + no-head)
      attention distribution: for soft attention, a Tensor of shape
        * `[num_heads, batch_size, Tq]` for self-attention
        * `[num_heads, batch_size, Tq, Tv]` for inter-attention;
        for hard attention, a one-hot categorical distribution of shape
        * `[sample_shape, num_heads, batch_size, Tq]` for self-attention
        * `[sample_shape, num_heads, batch_size, Tq, Tv]` for inter-attention.
        If multi-head attention isn't used, the `[num_heads]` dimension is omitted.
    """
        num_heads = _get_num_heads(scores)
        if num_heads == 0:
            Tq = scores.shape[1]
            Tv = scores.shape[2]
        else:
            Tq = scores.shape[2]
            Tv = scores.shape[3]
        if value is None:
            if query is None:
                raise ValueError("both query and value are None, "
                                 "at least one of them must be given")
            value = query
        # ====== Causal mask ====== #
        if causal:
            # Creates a lower triangular mask, so position i cannot attend to
            # positions j>i. This prevents the flow of information from the future
            # into the past.
            scores_shape = scores.shape
            # causal_mask_shape = [1, Tq, Tv].
            causal_mask_shape = bk.concatenate(
                [bk.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0)
            causal_mask = bk.tril_mask(causal_mask_shape)
        else:
            causal_mask = None
        if v_mask is not None:
            # local attention (PosLocalM): keep only the last Tv positions
            if PosLocalM in self:
                v_mask = v_mask[:, -Tv:]
            # Mask of shape [batch_size, 1, Tv].
            v_mask = bk.expand_dims(v_mask, axis=-2)
            v_mask = bk.cast(v_mask, 'bool')
            if num_heads > 0:
                v_mask = bk.expand_dims(v_mask, axis=0)
        scores_mask = bk.logical_and(v_mask, causal_mask)
        ### applying the scores mask
        if scores_mask is not None:
            padding_mask = bk.logical_not(scores_mask)
            # Bias so padding positions do not contribute to attention distribution.
            scores -= 1.e9 * bk.cast(padding_mask, dtype=scores.dtype)
        # ====== convert attention score to distribution ====== #
        # if the last dimension is 1, there is no point applying softmax over it,
        # hence apply softmax to the second-to-last dimension
        ### soft attention
        if AlignSoft in self:
            attention_distribution = bk.softmax(
                scores, axis=-2 if scores.shape[-1] == 1 else -1)
        ### relaxed hard attention
        elif AlignRelax in self:
            attention_distribution = bay.distributions.RelaxedOneHotCategorical(
                temperature=temperature,
                logits=bk.squeeze(scores, axis=-1)
                if scores.shape[-1] == 1 else scores)
            fsample = partial(bay.Distribution.sample,
                              sample_shape=sample_shape)
            attention_distribution = bay.coercible_tensor(
                attention_distribution, convert_to_tensor_fn=fsample)
        ### hard attention
        elif AlignHard in self:
            attention_distribution = bay.distributions.OneHotCategorical(
                logits=bk.squeeze(scores, axis=-1)
                if scores.shape[-1] == 1 else scores,
                dtype=value.dtype)
            fsample = partial(bay.Distribution.sample,
                              sample_shape=sample_shape)
            attention_distribution = bay.coercible_tensor(
                attention_distribution, convert_to_tensor_fn=fsample)
        # ======  dropout the attention scores ====== #
        attention = bk.dropout(attention_distribution,
                               p_drop=dropout,
                               axis=1 if temporal_dropout else None,
                               training=training and dropout > 0)
        # ====== applying the attention ====== #
        if self.is_self_attention and ScoreLocation in self:
            result = bk.expand_dims(bk.array(attention), axis=-1) * value  \
                if attention.shape[-1] != 1 else \
                  attention * value
        else:
            if PosLocalM in self:
                value = value[:, -Tv:] if num_heads == 0 else value[:, :, -Tv:]
            result = bk.matmul(attention, value)
        # ====== applying the Query mask ====== #
        if q_mask is not None:
            assert q_mask.shape[1] == Tq,\
              "Query mask has time dimension %d, but query has time dimension %d" \
                % (q_mask.shape[1], Tq)
            # Mask of shape [batch_size, Tq, 1].
            q_mask = bk.expand_dims(q_mask, axis=-1)
            result *= bk.cast(q_mask, dtype=result.dtype)
        # ====== residual connection ====== #
        if residual:
            if query is None:
                raise ValueError("query must be given for residual connection")
            result += query
        # ====== return ====== #
        return result, attention_distribution
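
For reference, the causal mask and the relaxed hard attention branch above can be reproduced directly with TensorFlow and TensorFlow Probability, without the bk/bay wrappers; the toy shapes, the single-head self-attention setup, and the fixed temperature are assumptions for this sketch:

  import tensorflow as tf
  import tensorflow_probability as tfp

  tfd = tfp.distributions

  batch, T, dim = 2, 6, 4                        # toy sizes (Tq = Tv = T)
  scores = tf.random.normal((batch, T, T))       # no-head attention scores
  value = tf.random.normal((batch, T, dim))

  # causal mask: position i may only attend to positions j <= i
  causal_mask = tf.linalg.band_part(tf.ones((1, T, T)), -1, 0)
  scores -= 1.e9 * (1.0 - causal_mask)

  # relaxed hard attention: sample near-one-hot weights that keep gradients
  attn_dist = tfd.RelaxedOneHotCategorical(temperature=0.5, logits=scores)
  attention = attn_dist.sample()                 # [batch, Tq, Tv]
  result = tf.matmul(attention, value)           # [batch, Tq, dim]

In the source, bay.coercible_tensor wraps the distribution so that it behaves like the sampled tensor wherever a tensor is expected, while still exposing the distribution object to the caller.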