def transduce(self, expr_seq: ExpressionSequence) -> ExpressionSequence:
    """
    Apply multi-head scaled dot-product self-attention to a sequence.

    Queries, keys and values are all projected from the same input. The
    result is stored as this transducer's final state and returned.

    Args:
      expr_seq: expression sequence to attend over; its mask (if any) is
                added to the attention scores and carried over to the output
    Returns:
      expression sequence built from the attended and re-projected tensor
    """
    # Load projection weights first, then biases, into the computation graph.
    w_query, w_key, w_value, w_out = map(
        dy.parameter, (self.pWq, self.pWk, self.pWv, self.pWo))
    b_query, b_key, b_value, b_out = map(
        dy.parameter, (self.pbq, self.pbk, self.pbv, self.pbo))

    # Input is a [(length, model_size) x batch] tensor.
    inputs = expr_seq.as_transposed_tensor()
    dims = inputs.dim()
    seq_len = dims[0][0]
    batch_size = dims[1]

    # Query / key / value projections.
    # TODO: do we need bias broadcasting in DyNet? If not, these could be
    # written with dy.affine_transform([bias, inputs, weight]) instead.
    queries = b_query + inputs * w_query
    keys = b_key + inputs * w_key
    values = b_value + inputs * w_value

    # Fold the heads into the batch dimension:
    # [(length, head_dim) x batch * num_heads].
    head_shape = (seq_len, self.head_dim)
    head_batch = batch_size * self.num_heads
    queries = dy.reshape(queries, head_shape, batch_size=head_batch)
    keys = dy.reshape(keys, head_shape, batch_size=head_batch)
    values = dy.reshape(values, head_shape, batch_size=head_batch)

    # Scaled dot product, [(length, length) x batch * num_heads];
    # rows are queries, columns are keys.
    scores = (queries * dy.transpose(keys)) / sqrt(self.head_dim)

    seq_mask = expr_seq.mask
    if seq_mask is not None:
        # Repeat the mask once per head so it lines up with the folded batch,
        # then scale it to a large negative value before adding it.
        per_head = np.repeat(seq_mask.np_arr, self.num_heads, axis=0)
        penalty = dy.inputTensor(per_head.transpose(), batched=True) * -1e10
        scores = scores + penalty

    weights = dy.softmax(scores, d=1)

    # Weighted sum of values, reshaped back to [(length, model_size) x batch].
    attended = dy.reshape(
        weights * values, (seq_len, self.input_dim), batch_size=batch_size)

    # Final output projection.
    projected = b_out + attended * w_out

    result = ExpressionSequence(expr_transposed_tensor=projected, mask=seq_mask)
    self._final_states = [FinalTransducerState(result[-1], None)]
    return result
def transduce(self, x: ExpressionSequence) -> ExpressionSequence:
    """
    Pool a sequence into one expression using learned attention weights.

    Scores come from multiplying the transposed input by the parameter
    ``self.W``. They are optionally masked and optionally multiplied
    element-wise by a slice of ``self.pos_enc``, then passed through softmax.
    The input tensor is multiplied by the resulting weights.

    Args:
      x: expression sequence to pool
    Returns:
      expression sequence wrapping the attention-weighted product, with no mask
    """
    transposed = x.as_transposed_tensor()
    logits = transposed * dy.parameter(self.W)

    seq_mask = x.mask
    if seq_mask is not None:
        # Let the mask object fold itself into the scores.
        logits = seq_mask.add_to_tensor_expr(
            logits, multiplicator=-100.0, time_first=True)

    if self.pos_enc_max:
        # Keep only as many position-encoding rows as the first input dimension.
        n_steps = transposed.dim()[0][0]
        logits = dy.cmult(logits, dy.inputTensor(self.pos_enc[:n_steps, :]))

    weights = dy.softmax(logits)
    return expression_sequence.ExpressionSequence(
        expr_tensor=x.as_tensor() * weights, mask=None)