def prepare(self, query, key=None, value=None, mask=None):
  r""" Prepare the inputs for the attention model.

  Returns:
    query: Query (or target sequence) tensor of shape
      `[batch_size, Tq, dim]`.
    key: Key (or source sequence) tensor of shape
      `[batch_size, Tv, dim]`.
    value: Value (or source sequence) tensor of shape
      `[batch_size, Tv, dim]`.
    mask: list of the following
      * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
        If given, the output will be zero at the positions where
        `mask==False`.
      * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
  """
  # by default, if key is not provided, use value
  query = bk.array(query, ignore_none=True)
  key = bk.array(key, ignore_none=True)
  value = bk.array(value, ignore_none=True)
  # ====== check if intra-attention ====== #
  if self.is_self_attention:
    if (key is not None or value is not None):
      warnings.warn(
          "Self-attention (intra-attention) only needs the query, "
          "ignoring the provided key and value",
          category=UserWarning)
    if key is not None:
      key = query
    if value is not None:
      value = query
  ### inter-attention
  else:
    if key is None:
      key = value
    if value is None:  # value must always be provided
      raise RuntimeError("value must be given for inter-sequences attention.")
  # ====== masks ====== #
  if self.is_self_attention:  # only 1 mask is needed
    if isinstance(mask, (tuple, list)):
      q_mask = mask[0]
    else:
      q_mask = mask
    v_mask = None
  else:
    q_mask = mask[0] if mask else None
    v_mask = mask[1] if mask else None
    if v_mask is not None:
      if v_mask.shape[1] != value.shape[1]:
        raise RuntimeError(
            "Value mask has time dimension %d, but value has time dimension %d"
            % (v_mask.shape[1], value.shape[1]))
  # ====== return ====== #
  return query, key, value, \
    bk.array(q_mask, ignore_none=True), bk.array(v_mask, ignore_none=True)
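# --------------------------------------------------------------------------
# A minimal sketch (not part of the library) of the input convention that
# `prepare` enforces, using plain numpy arrays. The shapes
# `[batch_size, Tq, dim]` / `[batch_size, Tv, dim]` and the self- vs
# inter-attention behaviour follow the docstring; the variable names and
# data below are hypothetical.
import numpy as np

batch_size, Tq, Tv, dim = 2, 5, 7, 4
query = np.random.rand(batch_size, Tq, dim)
value = np.random.rand(batch_size, Tv, dim)

# self-attention: only the query is needed, key and value default to the query
q, k, v = query, query, query

# inter-attention: value is mandatory, key falls back to value when omitted
q, k, v = query, value, value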
def test_matmul(self):
  for shape1, shape2, outshape in [
      [(2, 3), (4, 3, 5), (4, 2, 5)],
      [(2, 3, 4), (4, 5), (2, 3, 5)],
      [(5, 3, 4), (5, 4, 6), (5, 3, 6)],
  ]:
    x = np.random.rand(*shape1)
    y = np.random.rand(*shape2)
    for fw in FRAMEWORKS:
      a = bk.array(x, fw)
      b = bk.array(y, fw)
      c = bk.matmul(a, b)
      self.assertEqual(c.shape, outshape, msg=fw)
def test_countnonzero(self):
  x = np.random.randint(0, 10, size=(25, 12, 8))
  for axis in (None, 0, 1, 2, (1, 2)):
    for keepdims in (True, False):
      for dtype in ('int32', 'float32'):
        y = [
            bk.count_nonzero(bk.array(x, fw),
                             axis=axis,
                             keepdims=keepdims,
                             dtype=dtype) for fw in FRAMEWORKS
        ]
        assert_equal(self, (axis, keepdims, dtype), *y)
def __init__(self, output_dim, max_len=10000, trainable=False, mask_zero=False):
  super().__init__()
  self.output_dim = output_dim
  self.mask_zero = bool(mask_zero)
  self.trainable = bool(trainable)
  self.supports_masking = mask_zero
  self.max_len = max_len
  # Apply sine to the even columns and cosine to the odd columns.
  # If zero-masked, don't use the 0 position.
  # (i - i % 2) creates the sequence (0, 0, 2, 2, 4, 4, ...) which is needed
  # for the two running sequences of sin and cos at even and odd positions.
  position_encoding = np.array([[
      pos / np.power(10000, (i - i % 2) / output_dim)
      for i in range(output_dim)
  ] if pos != 0 or not mask_zero else [0.] * output_dim
                                for pos in range(max_len)])
  # [max_len, output_dim]
  position_encoding[:, 0::2] = np.sin(position_encoding[:, 0::2])  # dim 2i
  position_encoding[:, 1::2] = np.cos(position_encoding[:, 1::2])  # dim 2i+1
  if not trainable:
    self.position_encoding = bk.array(position_encoding,
                                      dtype='float32',
                                      framework=self)
  else:
    self.position_encoding = bk.variable(initial_value=position_encoding,
                                         dtype='float32',
                                         trainable=True,
                                         framework=self)
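# --------------------------------------------------------------------------
# A standalone numpy sketch (an assumption, not part of the library) of the
# sinusoidal table built above, making the sin/cos interleaving explicit.
import numpy as np

max_len, output_dim = 6, 8
pos = np.arange(max_len)[:, None]   # [max_len, 1]
i = np.arange(output_dim)[None, :]  # [1, output_dim]
# (i - i % 2) gives the shared exponent 2k/output_dim for columns 2k and 2k+1
angle = pos / np.power(10000, (i - i % 2) / output_dim)
pe = np.where(i % 2 == 0, np.sin(angle), np.cos(angle))  # [max_len, output_dim]
# pe[:, 0::2] holds sin(pos / 10000^(2k/d)); pe[:, 1::2] holds the matching cos.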
def compute_mask(self, mask=None):
  if mask:
    q_mask = mask[0] if isinstance(mask, (tuple, list)) else mask
    return bk.array(q_mask)
def align(self,
          scores,
          value,
          query=None,
          q_mask=None,
          v_mask=None,
          causal=False,
          residual=False,
          dropout=0,
          temporal_dropout=False,
          sample_shape=1,
          temperature=0.5,
          training=None):
  r"""Applies attention scores to the given value tensor.

  Arguments:
    scores: Attention scores float tensor of shape
      `[num_heads, batch_size, Tq, Tv]`.
    value: Value (or source sequence) tensor of shape
      `[num_heads, batch_size, Tv, dim]`.
    query: Query (or target sequence) tensor of shape
      `[num_heads, batch_size, Tq, dim]`.
    q_mask: A boolean query mask `Tensor` of shape `[batch_size, Tq]`.
      If given, the output will be zero at the positions where
      `mask==False`.
    v_mask: A boolean value mask `Tensor` of shape `[batch_size, Tv]`.
      If given, will apply the mask such that values at positions where
      `mask==False` do not contribute to the result.
    dropout : Float. Dropout probability of the attention scores.
    temporal_dropout : Boolean. If `True`, use the same dropout mask along
      the temporal axis (i.e. the 1-st dimension).
    sample_shape (`Integer`) : number of MCMC samples for estimating the
      gradient of hard attention.
    temperature: A 0-D `Tensor`, representing the temperature of a set of
      RelaxedOneHotCategorical distributions. The temperature must be
      positive.

  Returns:
    attended sequence: Tensor of shape
      * `[sample_shape, num_heads, batch_size, Tq, dim]` for (hard + multi-heads)
      * `[sample_shape, batch_size, Tq, dim]` for (hard + no-head)
      * `[num_heads, batch_size, Tq, dim]` for (soft + multi-heads)
      * `[batch_size, Tq, dim]` for (soft + no-head)
    attention distribution : for soft attention, return a Tensor of shape
      * `[num_heads, batch_size, Tq]` for self-attention
      * `[num_heads, batch_size, Tq, Tv]` for inter-attention.
      for hard attention, return a one-hot categorical distribution of shape
      * `[sample_shape, num_heads, batch_size, Tq]` for self-attention
      * `[sample_shape, num_heads, batch_size, Tq, Tv]` for inter-attention.
      if multi-heads attention wasn't used, omit the `[num_heads]` dimension.
  """
  num_heads = _get_num_heads(scores)
  if num_heads == 0:
    Tq = scores.shape[1]
    Tv = scores.shape[2]
  else:
    Tq = scores.shape[2]
    Tv = scores.shape[3]
  if value is None:
    if query is None:
      raise ValueError("both query and value are None, "
                       "at least one of them must be given")
    value = query
  # ====== Causal mask ====== #
  if causal:
    # Creates a lower triangular mask, so position i cannot attend to
    # positions j > i. This prevents the flow of information from the
    # future into the past.
    scores_shape = scores.shape
    # causal_mask_shape = [1, Tq, Tv].
    causal_mask_shape = bk.concatenate(
        [bk.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0)
    causal_mask = bk.tril_mask(causal_mask_shape)
  else:
    causal_mask = None
  if v_mask is not None:
    # LocalM applied
    if PosLocalM in self:
      v_mask = v_mask[:, -Tv:]
    # Mask of shape [batch_size, 1, Tv].
    v_mask = bk.expand_dims(v_mask, axis=-2)
    v_mask = bk.cast(v_mask, 'bool')
    if num_heads > 0:
      v_mask = bk.expand_dims(v_mask, axis=0)
  scores_mask = bk.logical_and(v_mask, causal_mask)
  ### applying the scores mask
  if scores_mask is not None:
    padding_mask = bk.logical_not(scores_mask)
    # Bias so padding positions do not contribute to the attention
    # distribution.
    scores -= 1.e9 * bk.cast(padding_mask, dtype=scores.dtype)
  # ====== convert attention score to distribution ====== #
  # if the last dimension is 1, there is no point applying softmax there,
  # hence, apply softmax on the second-to-last dimension
  ### soft attention
  if AlignSoft in self:
    attention_distribution = bk.softmax(
        scores, axis=-2 if scores.shape[-1] == 1 else -1)
  ### relaxed hard attention
  elif AlignRelax in self:
    attention_distribution = bay.distributions.RelaxedOneHotCategorical(
        temperature=temperature,
        logits=bk.squeeze(scores, axis=-1)
        if scores.shape[-1] == 1 else scores)
    fsample = partial(bay.Distribution.sample, sample_shape=sample_shape)
    attention_distribution = bay.coercible_tensor(
        attention_distribution, convert_to_tensor_fn=fsample)
  ### hard attention
  elif AlignHard in self:
    attention_distribution = bay.distributions.OneHotCategorical(
        logits=bk.squeeze(scores, axis=-1)
        if scores.shape[-1] == 1 else scores,
        dtype=value.dtype)
    fsample = partial(bay.Distribution.sample, sample_shape=sample_shape)
    attention_distribution = bay.coercible_tensor(
        attention_distribution, convert_to_tensor_fn=fsample)
  # ====== dropout the attention scores ====== #
  attention = bk.dropout(attention_distribution,
                         p_drop=dropout,
                         axis=1 if temporal_dropout else None,
                         training=training and dropout > 0)
  # ====== applying the attention ====== #
  if self.is_self_attention and ScoreLocation in self:
    result = bk.expand_dims(bk.array(attention), axis=-1) * value \
      if attention.shape[-1] != 1 else \
      attention * value
  else:
    if PosLocalM in self:
      value = value[:, -Tv:] if num_heads == 0 else value[:, :, -Tv:]
    result = bk.matmul(attention, value)
  # ====== applying the Query mask ====== #
  if q_mask is not None:
    assert q_mask.shape[1] == Tq, \
      "Query mask has time dimension %d, but query has time dimension %d" \
        % (q_mask.shape[1], Tq)
    # Mask of shape [batch_size, Tq, 1].
    q_mask = bk.expand_dims(q_mask, axis=-1)
    result *= bk.cast(q_mask, dtype=result.dtype)
  # ====== residual connection ====== #
  if residual:
    if query is None:
      raise ValueError("query must be given for residual connection")
    result += query
  # ====== return ====== #
  return result, attention_distribution
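# --------------------------------------------------------------------------
# A minimal numpy sketch (not the library code) of the two masking tricks
# used in `align`: the lower-triangular causal mask and the -1e9 bias that
# removes padded positions from the softmax. Shapes and names are assumptions.
import numpy as np

def softmax(x, axis=-1):
  e = np.exp(x - x.max(axis=axis, keepdims=True))
  return e / e.sum(axis=axis, keepdims=True)

Tq = Tv = 4
scores = np.random.rand(1, Tq, Tv)                 # [batch_size, Tq, Tv]
causal_mask = np.tril(np.ones((1, Tq, Tv), bool))  # position i attends to j <= i
v_mask = np.array([[True, True, True, False]])     # last value position is padding
scores_mask = causal_mask & v_mask[:, None, :]
scores = scores - 1.e9 * (~scores_mask)            # bias out masked positions
attention = softmax(scores, axis=-1)               # masked entries get ~0 weight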
def score(self,
          query,
          key=None,
          scale=1,
          window_width=None,
          q_proj=None,
          target_proj=None):
  r"""
  Arguments:
    query: Query (or target sequence) tensor of shape
      `[batch_size, Tq, dim]`, or `[num_heads, batch_size, Tq, dim]` in
      case of multi-heads attention.
    key: Key (or source sequence) tensor of shape
      `[batch_size, Tv, dim]`, or `[num_heads, batch_size, Tv, dim]` in
      case of multi-heads attention.
    scale: single `Scalar` or `Tensor` of shape `[dim]` for scaling the
      attention scores, suggested `1/sqrt(dim)` in (Vaswani et al. 2017).
    window_width : `None`, `Integer` or `Float` ([0, 1]). The total number
      of frames for a single window in local attention
      (i.e. `left + 1 + right`). Can be given as a fixed number of frames
      (`int`), or as a percentage of the sequence length (`float`).
      If `None`, use `Tq`.
    q_proj : `Dense`, instance of a dense or fully connected layer
      - for `ScoreLocation`, the number of hidden units is `1`
      - for `ScoreGeneral`, the number of hidden units is `dim`
    target_proj : `Dense`, for predictive local attention, applying a
      fully connected network on the target sequence (i.e. the query) to
      predict the position on the source sequence (i.e. the key).
      The layer must have output dimension equal to 1 and return a logit
      value.

  Returns:
    Tensor of shape `[num_heads, batch_size, Tq, Tv]`, or
    `[num_heads, batch_size, Tq, 1]` if `ScoreLocation`
  """
  ### Check if multi-head attention is used
  num_heads = _get_num_heads(query)
  if num_heads > 0:
    query = bk.reshape(query, [-1] + [i for i in query.shape[2:]])
    if key is not None:
      key = bk.reshape(key, [-1] + [i for i in key.shape[2:]])
  Tq = query.shape[1]
  Tv = Tq if key is None else key.shape[1]
  # scale shape is `[]` or `[dim]`
  scale = bk.array(scale, dtype=query.dtype)
  ### Check the window width
  if window_width is None:
    window_width = Tq
  elif window_width < 1:
    window_width = window_width * Tv
  window_width = int(window_width)
  ### Locative attention
  if AttentionMechanism.ScoreLocation in self:
    if PosLocalM in self or PosLocalP in self:
      raise NotImplementedError(
          "ScoreLocation only supports Global attention, but given: %s" %
          str(self))
    # [batch_size * num_heads, Tq, dim]
    scores = bk.reduce_mean(scale) * q_proj(query)
    assert scores.shape[-1] == 1, \
      "q_proj must have only 1 hidden unit, but given %d" % scores.shape[-1]
  ### Other score modes need the key tensor
  else:
    if key is None:
      raise ValueError("key must be provided for attention type: %s" %
                       str(self))
    ### Attention position (local or global)
    if PosLocalM in self:
      key = key[:, -window_width:]
    elif PosLocalP in self:
      pt = bk.sigmoid(target_proj(bk.reshape(query, ([0], -1))))
      assert pt.shape[-1] == 1, \
        "target_proj must project the query [., Tq * dim] to [., 1], i.e. " + \
        "predicting the attention position on the source sequence using " + \
        "knowledge from the target sequence."
      pt = Tv * pt  # `[batch_size * num_heads, 1]`
      # `[batch_size * num_heads, Tv]`
      # Eq (10) (Luong et al. 2015)
      gauss_est = bk.exp(-bk.square(bk.arange(Tv, dtype=pt.dtype) - pt) /
                         (2 * bk.square(window_width / 2)))
      # `[batch_size * num_heads, 1, Tv]`
      gauss_est = bk.expand_dims(gauss_est, axis=1)
    ### Additive or concat method
    if AttentionMechanism.ScoreAdditive in self:
      # [batch_size * num_heads, Tq, 1, dim]
      q = bk.expand_dims(query, axis=2)
      # [batch_size * num_heads, 1, Tv, dim]
      k = bk.expand_dims(key, axis=1)
      # [batch_size * num_heads, Tq, Tv]
      scores = bk.reduce_sum(scale * bk.tanh(q + k), axis=-1)
    ### Dot product or multiplicative scoring
    elif AttentionMechanism.ScoreDotProd in self:
      # this is a trick to make attention_scale broadcastable when
      # scale_tied=False
      scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
    ### Cosine scoring
    elif AttentionMechanism.ScoreCosine in self:
      # [batch_size * num_heads, Tq, 1, dim]
      q = bk.expand_dims(query, axis=2)
      # [batch_size * num_heads, 1, Tv, dim]
      k = bk.expand_dims(key, axis=1)
      # [batch_size * num_heads, Tq, Tv, dim]
      scores = (q * k) / (bk.norm(q, p=2) * bk.norm(k, p=2))
      scores = bk.reduce_sum(scale * scores, axis=-1, keepdims=False)
    ### general method with only a projection on the query
    elif AttentionMechanism.ScoreGeneral in self:
      query = q_proj(query)
      assert query.shape[-1] == key.shape[-1], \
        "q_proj must have %d hidden units, but given %d units" % \
        (key.shape[-1], query.shape[-1])
      scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
    else:
      raise NotImplementedError("No support for attention_type='%s'" %
                                str(self))
    ### applying the local-predictive attention
    if PosLocalP in self:
      scores = scores * gauss_est
  ### get back the multi-heads shape
  if num_heads > 0:
    scores = bk.reshape(scores,
                        shape=[num_heads, -1] + [i for i in scores.shape[1:]])
  return scores
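# --------------------------------------------------------------------------
# A numpy sketch (an assumed example, not library code) of the
# `ScoreDotProd` branch above: scaled dot-product scores as in
# (Vaswani et al. 2017), scores = (scale * query) @ key^T with
# scale = 1/sqrt(dim).
import numpy as np

batch_size, Tq, Tv, dim = 2, 5, 7, 16
query = np.random.rand(batch_size, Tq, dim)
key = np.random.rand(batch_size, Tv, dim)
scale = 1.0 / np.sqrt(dim)
scores = np.matmul(scale * query, np.swapaxes(key, 1, 2))  # [batch_size, Tq, Tv]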