def call(self, inputs, training=None, sample_shape=(), projection=True,
         prior=None):
  # projection by the Dense layer can be skipped by setting projection=False
  # NOTE: 2D inputs are important here, but we don't want to flatten
  # automatically
  if projection and not self._disable_projection:
    params = super().call(inputs)
  else:
    params = inputs
  # apply dropout
  if self._dropout > 0:
    params = bk.dropout(params, p_drop=self._dropout, training=training)
  # create the posterior distribution (this creates a new layer every time)
  posterior = self.posterior_layer(sample_shape=sample_shape)(
      params, training=training)
  self._last_distribution = posterior
  # NOTE: every distribution already has a `kl_divergence` method, so we
  # cannot reuse that name
  prior = self.prior if prior is None else prior
  posterior.KL_divergence = KLdivergence(
      posterior, prior=prior,
      sample_shape=None)  # None means reuse samples here
  assert not hasattr(posterior, 'prior'), "Cannot assign prior to the output"
  posterior.prior = prior
  return posterior
def call(self, inputs, training=None, mask=None, sample_shape=(),
         projection=None, prior=None):
  # projection by the Dense layer can be skipped by setting projection=False
  # NOTE: 2D inputs are important here, but we don't want to flatten
  # automatically
  params = inputs
  if projection is None:
    projection = self.projection
  else:
    projection = self.projection and projection
  # do not use tf.cond here, it infers the wrong shape when trying to build
  # the layer in Graph mode.
  if projection:
    params = super().call(params)
  # apply dropout
  if self._dropout > 0:
    params = bk.dropout(params, p_drop=self._dropout, training=training)
  # create the posterior distribution
  self._posterior_sample_shape = sample_shape
  posterior = self.posterior_layer(params, training=training)
  self._most_recent_distribution = posterior
  # NOTE: every distribution already has a `kl_divergence` method, so we
  # cannot reuse that name
  prior = self.prior if prior is None else prior
  posterior.KL_divergence = KLdivergence(
      posterior, prior=prior,
      sample_shape=None)  # None means reuse sampled data here
  assert not hasattr(posterior, 'prior'), "Cannot assign prior to the output"
  posterior.prior = prior
  return posterior
def call(self, inputs, training=None, sample_shape=(), projection=None,
         **kwargs):
  ## NOTE: 2D inputs are important here, but we don't want to flatten
  # automatically
  if self.flatten_inputs:
    inputs = tf.reshape(inputs, (tf.shape(inputs)[0], -1))
  params = inputs
  ## do not use tf.cond here, it infers the wrong shape when trying to build
  # the layer in Graph mode.
  projection = projection if projection is not None else self.projection
  if projection:
    params = self._dense(params)
  if self.autoregressive:
    params = tf.concat(tf.unstack(params, axis=-1), axis=-1)
  ## apply dropout
  if self._dropout > 0:
    params = bk.dropout(params, p_drop=self._dropout, training=training)
  ## create the posterior distribution
  self._posterior_sample_shape = sample_shape
  kw = dict()
  if 'training' in self._posterior_call_kw:
    kw['training'] = training
  if 'sample_shape' in self._posterior_call_kw:
    kw['sample_shape'] = sample_shape
  for k, v in kwargs.items():
    if k in self._posterior_call_kw:
      kw[k] = v
  posterior = self.posterior_layer(params, **kw)
  # tensorflow tries to serialize the distribution, which raises an exception
  # when saving the graph; to avoid this, store it outside of automatic
  # dependency tracking.
  with trackable.no_automatic_dependency_tracking_scope(self):
    # self._no_dependency
    self._most_recently_built_distribution = posterior
  ## NOTE: every distribution already has a `kl_divergence` method, so we
  # cannot reuse that name
  posterior.KL_divergence = KLdivergence(
      posterior, prior=self.prior,
      sample_shape=None)  # None means reuse sampled data here
  return posterior
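# All three revisions of `call` above implement the same contract: optionally
# project the inputs with a Dense layer, build a posterior distribution from
# the resulting parameters, and attach a `KL_divergence` callable (and, in the
# first two revisions, the prior itself) to the returned distribution. The
# sketch below reproduces that pattern with plain TensorFlow Probability
# instead of the library's `posterior_layer`, `bk`, and `KLdivergence`
# helpers; the names `units`, `dense`, and `make_posterior` are illustrative
# assumptions, not the library's API.
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

units = 8
dense = tf.keras.layers.Dense(2 * units)  # the (skippable) projection step
prior = tfd.Independent(
    tfd.Normal(loc=tf.zeros(units), scale=tf.ones(units)), 1)


def make_posterior(inputs, training=None):
  # project inputs to distribution parameters, assuming a diagonal-Normal
  # posterior parameterized by [loc, scale]
  params = dense(inputs)
  loc, scale = tf.split(params, 2, axis=-1)
  posterior = tfd.Independent(
      tfd.Normal(loc=loc, scale=tf.nn.softplus(scale)), 1)
  # every tfd.Distribution already defines `kl_divergence`, hence the
  # non-clashing attribute name `KL_divergence` used in the code above
  posterior.KL_divergence = lambda: tfd.kl_divergence(posterior, prior)
  posterior.prior = prior
  return posterior


# usage: one posterior (and one KL value) per batch example
qz = make_posterior(tf.random.normal([4, 16]))
kl = qz.KL_divergence()  # shape [4]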
def align(self,
          scores,
          value,
          query=None,
          q_mask=None,
          v_mask=None,
          causal=False,
          residual=False,
          dropout=0,
          temporal_dropout=False,
          sample_shape=1,
          temperature=0.5,
          training=None):
  r"""Applies attention scores to the given value tensor.

  Arguments:
    scores: Attention scores float tensor of shape
      `[num_heads, batch_size, Tq, Tv]`.
    value: Value (or source sequence) tensor of shape
      `[num_heads, batch_size, Tv, dim]`.
    query: Query (or target sequence) tensor of shape
      `[num_heads, batch_size, Tq, dim]`.
    q_mask: A boolean query mask `Tensor` of shape `[batch_size, Tq]`.
      If given, the output will be zero at the positions where
      `mask==False`.
    v_mask: A boolean value mask `Tensor` of shape `[batch_size, Tv]`.
      If given, will apply the mask such that values at positions where
      `mask==False` do not contribute to the result.
    dropout : Float. Dropout probability of the attention scores.
    temporal_dropout : Boolean. If `True`, use the same dropout mask along
      the temporal axis (i.e. the 1-st dimension).
    sample_shape (`Integer`) : number of MCMC samples for estimating the
      gradient of hard attention.
    temperature: A 0-D `Tensor`, the temperature of a set of
      `RelaxedOneHotCategorical` distributions. The temperature should be
      positive.

  Returns:
    attended sequence: Tensor of shape
      * `[sample_shape, num_heads, batch_size, Tq, dim]` for
        (hard + multi-heads)
      * `[sample_shape, batch_size, Tq, dim]` for (hard + no-head)
      * `[num_heads, batch_size, Tq, dim]` for (soft + multi-heads)
      * `[batch_size, Tq, dim]` for (soft + no-head)
    attention distribution :
      for soft attention, return Tensor of shape
      * `[num_heads, batch_size, Tq]` for self-attention
      * `[num_heads, batch_size, Tq, Tv]` for inter-attention.
      for hard attention, return one-hot categorical distribution of shape
      * `[sample_shape, num_heads, batch_size, Tq]` for self-attention
      * `[sample_shape, num_heads, batch_size, Tq, Tv]` for inter-attention.
      If multi-heads attention wasn't used, omit the `[num_heads]` dimension.
  """
  num_heads = _get_num_heads(scores)
  if num_heads == 0:
    Tq = scores.shape[1]
    Tv = scores.shape[2]
  else:
    Tq = scores.shape[2]
    Tv = scores.shape[3]
  if value is None:
    if query is None:
      raise ValueError("both query and value are None, "
                       "at least one of them must be given")
    value = query
  # ====== Causal mask ====== #
  if causal:
    # Creates a lower triangular mask, so position i cannot attend to
    # positions j > i. This prevents the flow of information from the
    # future into the past.
    scores_shape = scores.shape
    # causal_mask_shape = [1, Tq, Tv].
    causal_mask_shape = bk.concatenate(
        [bk.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0)
    causal_mask = bk.tril_mask(causal_mask_shape)
  else:
    causal_mask = None
  if v_mask is not None:
    # local attention (LocalM) applied
    if PosLocalM in self:
      v_mask = v_mask[:, -Tv:]
    # Mask of shape [batch_size, 1, Tv].
    v_mask = bk.expand_dims(v_mask, axis=-2)
    v_mask = bk.cast(v_mask, 'bool')
    if num_heads > 0:
      v_mask = bk.expand_dims(v_mask, axis=0)
  scores_mask = bk.logical_and(v_mask, causal_mask)
  ### applying the scores mask
  if scores_mask is not None:
    padding_mask = bk.logical_not(scores_mask)
    # Bias so padding positions do not contribute to the attention
    # distribution.
    scores -= 1.e9 * bk.cast(padding_mask, dtype=scores.dtype)
  # ====== convert attention scores to a distribution ====== #
  # if the last dimension is 1, there is no point applying softmax there,
  # hence, softmax over the second-to-last dimension
  ### soft attention
  if AlignSoft in self:
    attention_distribution = bk.softmax(
        scores, axis=-2 if scores.shape[-1] == 1 else -1)
  ### relaxed hard attention
  elif AlignRelax in self:
    attention_distribution = bay.distributions.RelaxedOneHotCategorical(
        temperature=temperature,
        logits=bk.squeeze(scores, axis=-1)
        if scores.shape[-1] == 1 else scores)
    fsample = partial(bay.Distribution.sample, sample_shape=sample_shape)
    attention_distribution = bay.coercible_tensor(
        attention_distribution, convert_to_tensor_fn=fsample)
  ### hard attention
  elif AlignHard in self:
    attention_distribution = bay.distributions.OneHotCategorical(
        logits=bk.squeeze(scores, axis=-1)
        if scores.shape[-1] == 1 else scores,
        dtype=value.dtype)
    fsample = partial(bay.Distribution.sample, sample_shape=sample_shape)
    attention_distribution = bay.coercible_tensor(
        attention_distribution, convert_to_tensor_fn=fsample)
  # ====== dropout the attention scores ====== #
  attention = bk.dropout(attention_distribution,
                         p_drop=dropout,
                         axis=1 if temporal_dropout else None,
                         training=training and dropout > 0)
  # ====== applying the attention ====== #
  if self.is_self_attention and ScoreLocation in self:
    result = (bk.expand_dims(bk.array(attention), axis=-1) * value
              if attention.shape[-1] != 1 else attention * value)
  else:
    if PosLocalM in self:
      value = value[:, -Tv:] if num_heads == 0 else value[:, :, -Tv:]
    result = bk.matmul(attention, value)
  # ====== applying the Query mask ====== #
  if q_mask is not None:
    assert q_mask.shape[1] == Tq, \
      "Query mask has time dimension %d, but query has time dimension %d" \
      % (q_mask.shape[1], Tq)
    # Mask of shape [batch_size, Tq, 1].
    q_mask = bk.expand_dims(q_mask, axis=-1)
    result *= bk.cast(q_mask, dtype=result.dtype)
  # ====== residual connection ====== #
  if residual:
    if query is None:
      raise ValueError("query must be given for residual connection")
    result += query
  # ====== return ====== #
  return result, attention_distribution
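# For reference, the soft-attention branch of `align` boils down to three
# steps: bias the scores with a large negative number at masked positions,
# softmax over the value axis, and matrix-multiply the resulting weights with
# `value` (then zero out masked query positions). The standalone sketch below
# reproduces only that core path in plain TensorFlow, without multi-head
# handling, local attention, or the hard/relaxed branches; the helper name
# `soft_align` is an assumption for illustration, not part of the library.
import tensorflow as tf


def soft_align(scores, value, q_mask=None, v_mask=None, causal=False):
  """Soft-attention core: mask -> softmax -> weighted sum.

  scores: [batch_size, Tq, Tv], value: [batch_size, Tv, dim],
  q_mask: [batch_size, Tq], v_mask: [batch_size, Tv] (booleans).
  """
  Tq, Tv = scores.shape[1], scores.shape[2]
  mask = None
  if causal:
    # lower-triangular mask so position i cannot attend to positions j > i
    mask = tf.cast(tf.linalg.band_part(tf.ones([1, Tq, Tv]), -1, 0), tf.bool)
  if v_mask is not None:
    vm = tf.cast(v_mask, tf.bool)[:, tf.newaxis, :]  # [batch_size, 1, Tv]
    mask = vm if mask is None else tf.logical_and(mask, vm)
  if mask is not None:
    # large negative bias so masked positions get ~zero attention weight
    scores -= 1e9 * tf.cast(tf.logical_not(mask), scores.dtype)
  attention = tf.nn.softmax(scores, axis=-1)  # [batch_size, Tq, Tv]
  result = tf.matmul(attention, value)        # [batch_size, Tq, dim]
  if q_mask is not None:
    # zero the output at masked query positions
    result *= tf.cast(q_mask, result.dtype)[..., tf.newaxis]
  return result, attention


# usage: 4 sequences, 6 query steps attending over 6 value steps, causally
scores = tf.random.normal([4, 6, 6])
value = tf.random.normal([4, 6, 16])
attended, weights = soft_align(scores, value, causal=True)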