def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
  src = src.as_tensor()

  src_height = src.dim()[0][0]
  src_width = src.dim()[0][1]
  # src_channels = 1
  batch_size = src.dim()[1]

  # convolution and pooling layers
  # src dim is ((40, 1000), 128)
  src = padding(src, self.filter_width[0] + 3)
  l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filters1),
                            stride=[self.stride[0], self.stride[0]], is_valid=True))  # ((1, 1000, 64), 128)
  pool1 = dy.maxpooling2d(l1, (1, 4), (1, 2), is_valid=True)  # ((1, 499, 64), 128)

  pool1 = padding(pool1, self.filter_width[1] + 3)
  l2 = dy.rectify(dy.conv2d(pool1, dy.parameter(self.filters2),
                            stride=[self.stride[1], self.stride[1]], is_valid=True))  # ((1, 499, 512), 128)
  pool2 = dy.maxpooling2d(l2, (1, 4), (1, 2), is_valid=True)  # ((1, 248, 512), 128)

  pool2 = padding(pool2, self.filter_width[2])
  l3 = dy.rectify(dy.conv2d(pool2, dy.parameter(self.filters3),
                            stride=[self.stride[2], self.stride[2]], is_valid=True))  # ((1, 248, 1024), 128)
  pool3 = dy.max_dim(l3, d=1)

  my_norm = dy.l2_norm(pool3) + 1e-6
  output = dy.cdiv(pool3, my_norm)
  output = dy.reshape(output, (self.num_filters[2],), batch_size=batch_size)
  return ExpressionSequence(expr_tensor=output)
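# The transducer above calls a `padding` helper that is not defined in this
# section. Below is a minimal sketch of what it presumably does, assuming it
# zero-pads the time axis (d=1) so the following valid convolution keeps the
# intended width; the exact pad amount and placement in the real helper may
# differ.
import dynet as dy

def padding(src, pad_width):
  """Zero-pad `src` along dimension 1 (time), preserving the batch dimension."""
  dims, batch_size = src.dim()
  pad_dims = list(dims)
  pad_dims[1] = pad_width
  zeros = dy.zeros(tuple(pad_dims), batch_size=batch_size)
  return dy.concatenate([src, zeros], d=1)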
def transduce(self, sent: ExpressionSequence) -> ExpressionSequence:
  if self.pos_encoding_type == "trigonometric":
    if self.position_encoding_block is None or self.position_encoding_block.shape[2] < len(sent):
      self.initialize_position_encoding(int(len(sent) * 1.2),
                                        self.input_dim if self.pos_encoding_combine == "add" else self.pos_encoding_size)
    encoding = dy.inputTensor(self.position_encoding_block[0, :, :len(sent)])
  elif self.pos_encoding_type == "embedding":
    encoding = self.positional_embedder.embed_sent(len(sent)).as_tensor()
  elif self.pos_encoding_type:
    raise ValueError(f"unknown encoding type {self.pos_encoding_type}")
  if self.pos_encoding_type:
    if self.pos_encoding_combine == "add":
      sent = ExpressionSequence(expr_tensor=sent.as_tensor() + encoding, mask=sent.mask)
    else:  # concat
      sent = ExpressionSequence(expr_tensor=dy.concatenate([sent.as_tensor(), encoding]), mask=sent.mask)
  for module in self.modules:
    sent = module.transduce(sent)
  self._final_states = [transducers.FinalTransducerState(sent[-1])]
  return sent
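# A minimal numpy sketch of what `initialize_position_encoding` presumably
# computes for the "trigonometric" case: the standard sinusoidal position
# encodings of Vaswani et al. (2017), shaped (1, channels, length) to match
# the slicing `self.position_encoding_block[0, :, :len(sent)]` above. The
# function name and timescale defaults here are assumptions.
import numpy as np

def trigonometric_block(length, channels, min_timescale=1.0, max_timescale=1.0e4):
  position = np.arange(length, dtype=np.float32)
  num_timescales = channels // 2
  log_timescale_increment = np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1)
  inv_timescales = min_timescale * np.exp(np.arange(num_timescales, dtype=np.float32) * -log_timescale_increment)
  scaled_time = position[:, None] * inv_timescales[None, :]
  signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)  # (length, channels)
  return signal.T[None, :, :]  # (1, channels, length)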
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if expr_seq.dim()[1] > 1:
    raise ValueError(f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}")
  lattice = self.cur_src[0]
  Wx_iog = dy.parameter(self.p_Wx_iog)
  Wh_iog = dy.parameter(self.p_Wh_iog)
  b_iog = dy.parameter(self.p_b_iog)
  Wx_f = dy.parameter(self.p_Wx_f)
  Wh_f = dy.parameter(self.p_Wh_f)
  b_f = dy.parameter(self.p_b_f)
  h = []
  c = []

  batch_size = expr_seq.dim()[1]
  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  for node_i in range(lattice.sent_len()):
    cur_node = lattice.nodes[node_i]
    val = expr_seq[node_i]
    if self.dropout_rate > 0.0 and self.train:
      val = dy.cmult(val, self.dropout_mask_x)
    i_ft_list = []
    if len(cur_node.nodes_prev) == 0:
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
    else:
      h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
      for pred in cur_node.nodes_prev:
        i_ft_list.append(dy.logistic(dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
    i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
    i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
    i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)
    i_it = dy.logistic(i_ait)
    i_ot = dy.logistic(i_aot)
    i_gt = dy.tanh(i_agt)
    if len(cur_node.nodes_prev) == 0:
      c.append(dy.cmult(i_it, i_gt))
    else:
      fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
      for i in range(1, len(cur_node.nodes_prev)):
        fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
      c.append(fc + dy.cmult(i_it, i_gt))
    h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
    if self.dropout_rate > 0.0 and self.train:
      h_t = dy.cmult(h_t, self.dropout_mask_h)
    h.append(h_t)
  self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
  return expression_seqs.ExpressionSequence(expr_list=h)
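# The lattice LSTM above generalizes a chain LSTM to a DAG: a node with
# several predecessors sums their hidden states for the input/output/update
# gates (child-sum style) and applies one forget gate per incoming edge to
# that predecessor's cell. A toy sketch of just the cell update, with the
# sigmoid/tanh gate computations elided; this helper is hypothetical, not
# part of the transducer.
def lattice_cell_update(preds_c, i_t, g_t, f_list):
  # preds_c: cell states of predecessor nodes; f_list: one forget gate per predecessor
  if not preds_c:
    return i_t * g_t
  return sum(f * c for f, c in zip(f_list, preds_c)) + i_t * g_t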
def transduce(self, embed_sent: ExpressionSequence) -> ExpressionSequence:
  src = embed_sent.as_tensor()

  sent_len = src.dim()[0][1]
  batch_size = src.dim()[1]
  pad_size = (self.window_receptor - 1) // 2  # TODO: adapt this for even window sizes
  src = dy.concatenate([dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
                        src,
                        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)], d=1)
  padded_sent_len = sent_len + 2 * pad_size

  conv1 = dy.parameter(self.pConv1)
  bias1 = dy.parameter(self.pBias1)
  src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1), batch_size=batch_size)
  cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

  hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1), batch_size=batch_size)
  if self.non_linearity == 'tanh':
    hidden_layer = dy.tanh(hidden_layer)
  elif self.non_linearity == 'relu':
    hidden_layer = dy.rectify(hidden_layer)
  elif self.non_linearity == 'sigmoid':
    hidden_layer = dy.logistic(hidden_layer)
  # 'linear' leaves hidden_layer unchanged

  for conv_hid, bias_hid in self.builder_layers:
    hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid), dy.parameter(bias_hid), stride=[1, 1])
    hidden_layer = dy.reshape(hidden_layer, (self.internal_dim, sent_len, 1), batch_size=batch_size)
    if self.non_linearity == 'tanh':
      hidden_layer = dy.tanh(hidden_layer)
    elif self.non_linearity == 'relu':
      hidden_layer = dy.rectify(hidden_layer)
    elif self.non_linearity == 'sigmoid':
      hidden_layer = dy.logistic(hidden_layer)

  last_conv = dy.parameter(self.last_conv)
  last_bias = dy.parameter(self.last_bias)
  output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
  output = dy.reshape(output, (sent_len, self.output_dim), batch_size=batch_size)
  output_seq = ExpressionSequence(expr_tensor=output)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
  """
  returns the list of output Expressions obtained by adding the given inputs
  to the current state, one by one.

  Args:
    es: an ExpressionSequence
  """
  es_list = [es]

  for layer_i, fb in enumerate(self.builder_layers):
    reduce_factor = self._reduce_factor_for_layer(layer_i)

    if es_list[0].mask is None:
      mask_out = None
    else:
      mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

    if self.downsampling_method == "concat" and len(es_list[0]) % reduce_factor != 0:
      raise ValueError(f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                       f"but got sequence length={len(es_list[0])} for reduce_factor={reduce_factor}. "
                       f"Set Batcher's pad_src_to_multiple argument accordingly.")
    fs = fb.transduce(es_list)
    if layer_i < len(self.builder_layers) - 1:
      if self.downsampling_method == "skip":
        es_list = [ExpressionSequence(expr_list=fs[::reduce_factor], mask=mask_out)]
      elif self.downsampling_method == "concat":
        es_len = len(es_list[0])
        es_list_fwd = []
        for i in range(0, es_len, reduce_factor):
          for j in range(reduce_factor):
            if i == 0:
              es_list_fwd.append([])
            es_list_fwd[j].append(fs[i + j])
        es_list = [ExpressionSequence(expr_list=es_list_fwd[j], mask=mask_out) for j in range(reduce_factor)]
      else:
        raise RuntimeError(f"unknown downsampling_method {self.downsampling_method}")
    else:
      # concat final outputs
      ret_es = ExpressionSequence(expr_list=list(fs), mask=mask_out)

  self._final_states = [FinalTransducerState(fb.get_final_states()[0].main_expr(),
                                             fb.get_final_states()[0].cell_expr())
                        for fb in self.builder_layers]
  return ret_es
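# Toy illustration (plain Python lists) of the two downsampling modes used by
# the pyramidal transducer above. "skip" keeps every reduce_factor-th output;
# "concat" regroups the outputs into reduce_factor parallel streams that the
# next layer consumes together, which is equivalent to stacking consecutive
# steps feature-wise. These helpers are for illustration only.
def skip_downsample(fs, reduce_factor):
  return fs[::reduce_factor]

def concat_downsample(fs, reduce_factor):
  assert len(fs) % reduce_factor == 0
  # stream j collects fs[i + j] for block starts i = 0, r, 2r, ...
  return [fs[j::reduce_factor] for j in range(reduce_factor)]

assert skip_downsample([0, 1, 2, 3, 4, 5], 2) == [0, 2, 4]
assert concat_downsample([0, 1, 2, 3, 4, 5], 2) == [[0, 2, 4], [1, 3, 5]]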
def transduce(self, expr_seq: ExpressionSequence) -> ExpressionSequence:
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
  Returns:
    expression sequence
  """
  Wq, Wk, Wv, Wo = [dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)]
  bq, bk, bv, bo = [dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)]

  # Start with a [(length, model_size) x batch] tensor
  x = expr_seq.as_transposed_tensor()
  x_len = x.dim()[0][0]
  x_batch = x.dim()[1]

  # Get the query, key, and value vectors
  # TODO: do we need bias broadcasting in DyNet?
  # q = dy.affine_transform([bq, x, Wq])
  # k = dy.affine_transform([bk, x, Wk])
  # v = dy.affine_transform([bv, x, Wv])
  q = bq + x * Wq
  k = bk + x * Wk
  v = bv + x * Wv

  # Split to batches [(length, head_dim) x batch * num_heads] tensor
  q, k, v = [dy.reshape(x, (x_len, self.head_dim), batch_size=x_batch * self.num_heads) for x in (q, k, v)]

  # Do scaled dot product [(length, length) x batch * num_heads], rows are queries, columns are keys
  attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
  if expr_seq.mask is not None:
    mask = dy.inputTensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(), batched=True) * -1e10
    attn_score = attn_score + mask
  attn_prob = dy.softmax(attn_score, d=1)
  # Reduce using attention and resize to match [(length, model_size) x batch]
  o = dy.reshape(attn_prob * v, (x_len, self.input_dim), batch_size=x_batch)
  # Final transformation
  # o = dy.affine_transform([bo, attn_prob * v, Wo])
  o = bo + o * Wo

  expr_seq = ExpressionSequence(expr_transposed_tensor=o, mask=expr_seq.mask)

  self._final_states = [FinalTransducerState(expr_seq[-1], None)]

  return expr_seq
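# A numpy sketch of the per-head computation above: heads are folded into the
# batch dimension, so each (length, head_dim) slice independently computes
# softmax(q k^T / sqrt(d)) v. Rows are queries and columns are keys, and the
# softmax normalizes over keys, matching dy.softmax(attn_score, d=1). This
# helper is illustrative, not part of the transducer.
import numpy as np

def single_head_attention(q, k, v):
  # q, k, v: (length, head_dim)
  head_dim = q.shape[1]
  scores = q @ k.T / np.sqrt(head_dim)          # (length, length)
  scores -= scores.max(axis=1, keepdims=True)   # numerical stability
  probs = np.exp(scores)
  probs /= probs.sum(axis=1, keepdims=True)     # normalize over keys
  return probs @ v                              # (length, head_dim)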
def transduce(self, seq: ExpressionSequence) -> ExpressionSequence:
  seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    d = seq_tensor.dim()
    seq_tensor = dy.reshape(seq_tensor, (d[0][0],), batch_size=d[0][1] * d[1])
    seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
    seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
  return ExpressionSequence(expr_tensor=seq_tensor)
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
  src = src.as_tensor()

  # convolutional layer
  src = padding(src, src.dim()[0][0], src.dim()[0][1], self.filter_width, self.stride, src.dim()[1])
  l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filter_conv), stride=[self.stride, self.stride], is_valid=True))
  timestep = l1.dim()[0][1]
  features = l1.dim()[0][2]
  batch_size = l1.dim()[1]
  # transpose l1 to be (timestep, dim), but keep the batch_size.
  rhn_in = dy.reshape(l1, (timestep, features), batch_size=batch_size)
  rhn_in = [dy.pick(rhn_in, i) for i in range(timestep)]

  for l in range(self.rhn_num_hidden_layers):
    rhn_out = []
    # initialize a random vector for the first state vector, keeping the same batch size.
    prev_state = dy.parameter(self.init[l])
    # begin recurrent highway network
    for t in range(timestep):
      for m in range(0, self.rhn_microsteps):
        H = dy.affine_transform([dy.parameter(self.recur[l][m][1]), dy.parameter(self.recur[l][m][0]), prev_state])
        T = dy.affine_transform([dy.parameter(self.recur[l][m][3]), dy.parameter(self.recur[l][m][2]), prev_state])
        if m == 0:
          H += dy.parameter(self.linear[l][0]) * rhn_in[t]
          T += dy.parameter(self.linear[l][1]) * rhn_in[t]
        H = dy.tanh(H)
        T = dy.logistic(T)
        prev_state = dy.cmult(1 - T, prev_state) + dy.cmult(T, H)  # ((1024,), batch_size)
      rhn_out.append(prev_state)
    if self.residual and l > 0:
      rhn_out = [sum(x) for x in zip(rhn_out, rhn_in)]
    rhn_in = rhn_out

  # Compute the attention-weighted average of the activations
  rhn_in = dy.concatenate_cols(rhn_in)
  scores = dy.transpose(dy.parameter(self.attention[0][1])) * dy.tanh(dy.parameter(self.attention[0][0]) * rhn_in)  # ((1, 510), batch_size)
  scores = dy.reshape(scores, (scores.dim()[0][1],), batch_size=scores.dim()[1])
  attn_out = rhn_in * dy.softmax(scores)  # rhn_in is ((1024, 510), batch_size), softmax is ((510,), batch_size)
  return ExpressionSequence(expr_tensor=attn_out)
def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
  """
  returns the list of output Expressions obtained by adding the given inputs
  to the current state, one by one, to both the forward and backward RNNs,
  and concatenating.

  Args:
    es: an ExpressionSequence
  """
  es_list = [es]

  for layer_i, (fb, bb) in enumerate(self.builder_layers):
    reduce_factor = self._reduce_factor_for_layer(layer_i)

    if es_list[0].mask is None:
      mask_out = None
    else:
      mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

    if self.downsampling_method == "concat" and len(es_list[0]) % reduce_factor != 0:
      raise ValueError(f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                       f"but got sequence length={len(es_list[0])} for reduce_factor={reduce_factor}. "
                       f"Set Batcher's pad_src_to_multiple argument accordingly.")
    fs = fb.transduce(es_list)
    bs = bb.transduce([ReversedExpressionSequence(es_item) for es_item in es_list])
    if layer_i < len(self.builder_layers) - 1:
      if self.downsampling_method == "skip":
        es_list = [ExpressionSequence(expr_list=fs[::reduce_factor], mask=mask_out),
                   ExpressionSequence(expr_list=bs[::reduce_factor][::-1], mask=mask_out)]
      elif self.downsampling_method == "concat":
        es_len = len(es_list[0])
        es_list_fwd = []
        es_list_bwd = []
        for i in range(0, es_len, reduce_factor):
          for j in range(reduce_factor):
            if i == 0:
              es_list_fwd.append([])
              es_list_bwd.append([])
            es_list_fwd[j].append(fs[i + j])
            es_list_bwd[j].append(bs[len(es_list[0]) - reduce_factor + j - i])
        es_list = [ExpressionSequence(expr_list=es_list_fwd[j], mask=mask_out) for j in range(reduce_factor)] + \
                  [ExpressionSequence(expr_list=es_list_bwd[j], mask=mask_out) for j in range(reduce_factor)]
      else:
        raise RuntimeError(f"unknown downsampling_method {self.downsampling_method}")
    else:
      # concat final outputs
      ret_es = ExpressionSequence(expr_list=[dy.concatenate([f, b]) for f, b in zip(fs, ReversedExpressionSequence(bs))],
                                  mask=mask_out)

  self._final_states = [FinalTransducerState(dy.concatenate([fb.get_final_states()[0].main_expr(),
                                                             bb.get_final_states()[0].main_expr()]),
                                             dy.concatenate([fb.get_final_states()[0].cell_expr(),
                                                             bb.get_final_states()[0].cell_expr()]))
                        for (fb, bb) in self.builder_layers]
  return ret_es
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
  sent_len = len(src)
  embeddings = dy.strided_select(dy.parameter(self.embedder), [1, 1], [0, 0], [self.input_dim, sent_len])
  if self.op == 'sum':
    output = embeddings + src.as_tensor()
  elif self.op == 'concat':
    output = dy.concatenate([embeddings, src.as_tensor()])
  else:
    raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
  output_seq = ExpressionSequence(expr_tensor=output, mask=src.mask)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
def transduce(self, x: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  x_T = x.as_transposed_tensor()
  scores = x_T * dy.parameter(self.W)
  if x.mask is not None:
    scores = x.mask.add_to_tensor_expr(scores, multiplicator=-100.0, time_first=True)
  if self.pos_enc_max:
    seq_len = x_T.dim()[0][0]
    pos_enc = self.pos_enc[:seq_len, :]
    scores = dy.cmult(scores, dy.inputTensor(pos_enc))
  attention = dy.softmax(scores)
  output_expr = x.as_tensor() * attention
  return expression_seqs.ExpressionSequence(expr_tensor=output_expr, mask=None)
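# A numpy sketch of the attention pooling above, assuming W projects each
# timestep to a single score: softmax over time turns the scores into weights,
# and the output is the weighted sum of input columns. The helper name and
# shapes are assumptions for illustration.
import numpy as np

def attention_pool(X, w):
  # X: (hidden, seq_len) inputs; w: (hidden,) scoring vector
  scores = X.T @ w                        # (seq_len,)
  weights = np.exp(scores - scores.max())
  weights /= weights.sum()                # softmax over time
  return X @ weights                      # (hidden,)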
def exprseq_pooling(self, exprseq):
  # Reduce to vector
  exprseq = ExpressionSequence(expr_tensor=exprseq.mask.add_to_tensor_expr(exprseq.as_tensor(), -1e10),
                               mask=exprseq.mask)
  if exprseq.expr_tensor is not None:
    if len(exprseq.expr_tensor.dim()[0]) > 1:
      return dy.max_dim(exprseq.expr_tensor, d=1)
    else:
      return exprseq.expr_tensor
  else:
    return dy.emax(exprseq.expr_list)
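# The pooling above relies on a common trick: add a large negative constant to
# masked (padded) timesteps before taking the max over time, so padding can
# never win the max. A numpy sketch of the same idea, for illustration:
import numpy as np

def masked_max_pool(X, pad_mask):
  # X: (hidden, seq_len); pad_mask: (seq_len,) with 1.0 at padded positions
  return (X + pad_mask[None, :] * -1e10).max(axis=1)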
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
  src = src.as_tensor()

  src_height = src.dim()[0][0]
  src_width = 1
  batch_size = src.dim()[1]

  W = dy.parameter(self.pW)
  b = dy.parameter(self.pb)

  src = dy.reshape(src, (src_height, src_width), batch_size=batch_size)  # ((276, 80, 3), 1)
  # convolution and pooling layers
  l1 = (W * src) + b
  output = dy.cdiv(l1, dy.sqrt(dy.squared_norm(l1)))
  return ExpressionSequence(expr_tensor=output)
def __call__(self, x: ExpressionSequence) -> tt.Tensor:
  """
  Move the time dimension of an input expression into the batch dimension via a reshape.

  Args:
    x: expression of dimensions ((hidden, timesteps), batch_size)
  Returns:
    expression of dimensions ((hidden,), timesteps*batch_size)
  """
  batch_size = x[0].dim()[1]
  model_dim = x[0].dim()[0][0]
  seq_len = x.sent_len()
  total_words = seq_len * batch_size
  input_tensor = x.as_tensor()
  return dy.reshape(input_tensor, (model_dim,), batch_size=total_words)
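# A hypothetical round-trip illustrating the reshape above: folding the time
# axis into the batch lets a position-wise transform run on all timesteps at
# once, after which the original layout can be restored. DyNet's reshape
# reinterprets memory (column-major), so timesteps stay contiguous across the
# two reshapes. Dimensions here are arbitrary.
import dynet as dy

dy.renew_cg()
hidden, seq_len, batch = 4, 5, 2
x = dy.zeros((hidden, seq_len), batch_size=batch)
folded = dy.reshape(x, (hidden,), batch_size=seq_len * batch)   # as in __call__ above
restored = dy.reshape(folded, (hidden, seq_len), batch_size=batch)
assert restored.dim() == x.dim()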
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  src_tensor = src.as_tensor()
  out_mask = src.mask
  if self.downsample_by > 1:
    assert len(src_tensor.dim()[0]) == 2, \
      f"Downsampling only supported for tensors of order two. Found dims {src_tensor.dim()}"
    (hidden_dim, seq_len), batch_size = src_tensor.dim()
    if seq_len % self.downsample_by != 0:
      raise ValueError("For downsampling, sequence lengths must be multiples of the total reduce factor. "
                       "Configure batcher accordingly.")
    src_tensor = dy.reshape(src_tensor,
                            (hidden_dim * self.downsample_by, seq_len // self.downsample_by),
                            batch_size=batch_size)
    if out_mask:
      out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_by)
  output = self.transform.transform(src_tensor)
  if self.downsample_by == 1:
    if len(output.dim()) != len(src_tensor.dim()):  # can happen with seq length 1
      output = dy.reshape(output, src_tensor.dim()[0], batch_size=src_tensor.dim()[1])
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=out_mask)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
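# Toy numpy check of the downsampling reshape above. DyNet tensors are
# column-major, so reshaping (hidden, seq) into (hidden*r, seq//r) stacks r
# consecutive timesteps feature-wise; numpy reproduces this with order='F'.
import numpy as np

hidden, seq, r = 2, 6, 3
x = np.arange(hidden * seq).reshape((hidden, seq), order='F')
y = x.reshape((hidden * r, seq // r), order='F')
assert np.array_equal(y[:, 0], x[:, :r].flatten(order='F'))  # step 0 = first r steps stacked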
def transduce(self, seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if self.train and self.dropout > 0.0:
    seq_tensor = dy.dropout(self.child.transduce(seq).as_tensor(), self.dropout) + seq.as_tensor()
  else:
    seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    d = seq_tensor.dim()
    seq_tensor = dy.reshape(seq_tensor, (d[0][0],), batch_size=d[0][1] * d[1])
    seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
    seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
  return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
  Returns:
    expression sequence
  """
  # Start with a [(length, model_size) x batch] tensor
  # B x T x H -> B x H x T
  x = expr_seq.as_tensor()
  x_len = x.size()[1]
  x_batch = x.size()[0]
  # Get the query, key, and value vectors
  q = self.lin_q(x).transpose(1, 2).contiguous()  # q = bq + x * Wq
  k = self.lin_k(x).transpose(1, 2).contiguous()  # k = bk + x * Wk
  v = self.lin_v(x).transpose(1, 2).contiguous()  # v = bv + x * Wv

  # Split to batches [(length, head_dim) x batch * num_heads] tensor
  q, k, v = [temp.view((x_batch * self.num_heads, self.head_dim, x_len)) for temp in (q, k, v)]

  # Do scaled dot product [batch*num_heads, length, length], rows are keys, columns are queries
  attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
  if expr_seq.mask is not None:
    mask = torch.Tensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0) * -1e10).to(xnmt.device)
    attn_score = attn_score + mask.unsqueeze(2)
  attn_prob = torch.nn.Softmax(dim=1)(attn_score)  # attn_prob = dy.softmax(attn_score, d=1)
  if self.train and self.dropout > 0.0:
    attn_prob = tt.dropout(attn_prob, self.dropout)
  # Reduce using attention and resize to match [(length, model_size) x batch]
  o = torch.matmul(v, attn_prob).view(x_batch, self.input_dim, x_len).transpose(1, 2)
  # Final transformation
  o = self.lin_o(o)  # o = bo + o * Wo

  expr_seq = expression_seqs.ExpressionSequence(expr_tensor=o, mask=expr_seq.mask)

  self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]

  return expr_seq
def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  mask = es.mask
  sent_len = len(es)
  es_expr = es.as_transposed_tensor()
  batch_size = es_expr.dim()[1]

  es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size)

  h_out = {}
  for direction in ["fwd", "bwd"]:
    # input convolutions
    gates_xt_bias = dy.conv2d_bias(es_chn,
                                   dy.parameter(self.params["x2all_" + direction]),
                                   dy.parameter(self.params["b_" + direction]),
                                   stride=(1, 1), is_valid=False)
    gates_xt_bias_list = [dy.pick_range(gates_xt_bias, i, i + 1) for i in range(sent_len)]

    h = []
    c = []
    for input_pos in range(sent_len):
      directional_pos = input_pos if direction == "fwd" else sent_len - input_pos - 1
      gates_t = gates_xt_bias_list[directional_pos]
      if input_pos > 0:
        # recurrent convolutions
        gates_h_t = dy.conv2d(h[-1], dy.parameter(self.params["h2all_" + direction]), stride=(1, 1), is_valid=False)
        gates_t += gates_h_t

      # standard LSTM logic
      if len(c) == 0:
        c_tm1 = dy.zeros((self.freq_dim * self.num_filters,), batch_size=batch_size)
      else:
        c_tm1 = c[-1]
      gates_t_reshaped = dy.reshape(gates_t, (4 * self.freq_dim * self.num_filters,), batch_size=batch_size)
      c_t = dy.reshape(dy.vanilla_lstm_c(c_tm1, gates_t_reshaped),
                       (self.freq_dim * self.num_filters,), batch_size=batch_size)
      h_t = dy.vanilla_lstm_h(c_t, gates_t_reshaped)
      h_t = dy.reshape(h_t, (1, self.freq_dim, self.num_filters,), batch_size=batch_size)

      if mask is None or np.isclose(np.sum(mask.np_arr[:, input_pos:input_pos + 1]), 0.0):
        c.append(c_t)
        h.append(h_t)
      else:
        c.append(mask.cmult_by_timestep_expr(c_t, input_pos, True)
                 + mask.cmult_by_timestep_expr(c[-1], input_pos, False))
        h.append(mask.cmult_by_timestep_expr(h_t, input_pos, True)
                 + mask.cmult_by_timestep_expr(h[-1], input_pos, False))

    h_out[direction] = h

  ret_expr = []
  for state_i in range(len(h_out["fwd"])):
    state_fwd = h_out["fwd"][state_i]
    state_bwd = h_out["bwd"][-1 - state_i]
    output_dim = (state_fwd.dim()[0][1] * state_fwd.dim()[0][2],)
    fwd_reshape = dy.reshape(state_fwd, output_dim, batch_size=batch_size)
    bwd_reshape = dy.reshape(state_bwd, output_dim, batch_size=batch_size)
    ret_expr.append(dy.concatenate([fwd_reshape, bwd_reshape], d=0 if self.reshape_output else 2))
  return expression_seqs.ExpressionSequence(expr_list=ret_expr, mask=mask)
  # TODO: implement get_final_states()
def transduce(self, seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if self.train and self.dropout > 0.0:
    seq_tensor = tt.dropout(self.child.transduce(seq).as_tensor(), self.dropout) + seq.as_tensor()
  else:
    seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    batch_size = tt.batch_size(seq_tensor)
    merged_seq_tensor = tt.merge_time_batch_dims(seq_tensor)
    transformed_seq_tensor = self.layer_norm_component.transform(merged_seq_tensor)
    seq_tensor = tt.unmerge_time_batch_dims(transformed_seq_tensor, batch_size)
  return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
def transduce(self, x: ExpressionSequence) -> ExpressionSequence:
  seq_len = len(x)
  batch_size = x[0].dim()[1]

  att_mask = None
  if self.diagonal_mask_width is not None:
    att_mask = np.ones((seq_len, seq_len))
    for i in range(seq_len):
      from_i = max(0, i - self.diagonal_mask_width // 2)
      to_i = min(seq_len, i + self.diagonal_mask_width // 2 + 1)
      att_mask[from_i:to_i, from_i:to_i] = 0.0

  mid = self.self_attn(x=x, att_mask=att_mask, batch_mask=x.mask.np_arr if x.mask else None, p=self.dropout)
  if self.downsample_factor > 1:
    seq_len = int(math.ceil(seq_len / float(self.downsample_factor)))
  hidden_dim = mid.dim()[0][0]
  out_mask = x.mask
  if self.downsample_factor > 1 and out_mask is not None:
    out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_factor)
  if self.ff_lstm:
    mid_re = dy.reshape(mid, (hidden_dim, seq_len), batch_size=batch_size)
    out = self.feed_forward.transduce(ExpressionSequence(expr_tensor=mid_re, mask=out_mask))
    out = dy.reshape(out.as_tensor(), (hidden_dim,), batch_size=seq_len * batch_size)
  else:
    out = self.feed_forward.transduce(mid, p=self.dropout)

  self._recent_output = out
  return ExpressionSequence(expr_tensor=dy.reshape(out, (out.dim()[0][0], seq_len), batch_size=batch_size),
                            mask=out_mask)
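# What the diagonal attention mask above produces for seq_len=5 and
# diagonal_mask_width=3: zeros (attendable) around the diagonal, ones
# elsewhere (positions the attender later pushes toward -inf). Note the
# square-block assignment makes the band slightly wider than a strict
# |i-j| <= width//2 cutoff.
import numpy as np

seq_len, width = 5, 3
att_mask = np.ones((seq_len, seq_len))
for i in range(seq_len):
  from_i = max(0, i - width // 2)
  to_i = min(seq_len, i + width // 2 + 1)
  att_mask[from_i:to_i, from_i:to_i] = 0.0
# att_mask:
# [[0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 1.]
#  [0. 0. 0. 0. 0.]
#  [1. 0. 0. 0. 0.]
#  [1. 1. 0. 0. 0.]]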
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  sent_len = src.sent_len()
  batch_size = tt.batch_size(src[0])
  embeddings = self.embeddings(torch.tensor([list(range(sent_len))] * batch_size).to(xnmt.device))
  if self.op == 'sum':
    output = embeddings + src.as_tensor()
  elif self.op == 'concat':
    output = tt.concatenate([embeddings, src.as_tensor()])
  else:
    raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
  if self.train and self.dropout > 0.0:
    output = tt.dropout(output, self.dropout)
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=src.mask)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq
def transduce(self, x: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  expr = x.as_transposed_tensor()
  batch_size, hidden_dim, seq_len = expr.size()
  expr = expr.view((batch_size, self.in_channels, hidden_dim // self.in_channels, seq_len))
  expr = self.cnn_layer(expr)
  if self.use_pooling:
    expr = self.pooling_layer(expr)
  expr = self.activation_fct(expr)
  batch_size, out_chn, out_h, seq_len = expr.size()
  expr = expr.view((batch_size, out_chn * out_h, seq_len))
  output_seq = expression_seqs.ExpressionSequence(expr_transposed_tensor=expr,
                                                  mask=x.mask.lin_subsampled(trg_len=seq_len) if x.mask else None)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq
def transduce(self, embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
  batch_size = embed_sent[0].dim()[1]
  actions = self.sample_segmentation(embed_sent, batch_size)
  embeddings = dy.concatenate(embed_sent.expr_list, d=1)
  embeddings.value()  # force forward computation before picking batch elements

  composed_words = []
  for i in range(batch_size):
    sequence = dy.pick_batch_elem(embeddings, i)
    # For each sampled segmentation
    lower_bound = 0
    for j, upper_bound in enumerate(actions[i]):
      if self.no_char_embed:
        char_sequence = []
      else:
        char_sequence = dy.pick_range(sequence, lower_bound, upper_bound + 1, 1)
      composed_words.append((char_sequence, i, j, lower_bound, upper_bound + 1))
      lower_bound = upper_bound + 1
  outputs = self.segment_composer.compose(composed_words, batch_size)
  # Padding + return
  try:
    if self.length_prior:
      seg_size_unpadded = [len(outputs[i]) for i in range(batch_size)]
    sampled_sentence, segment_mask = self.pad(outputs)
    expr_seq = ExpressionSequence(expr_tensor=dy.concatenate_to_batch(sampled_sentence), mask=segment_mask)
    return self.final_transducer.transduce(expr_seq)
  finally:
    if self.length_prior:
      self.seg_size_unpadded = seg_size_unpadded
    self.compose_output = outputs
    self.segment_actions = actions
    if not self.train and self.is_reporting():
      if len(actions) == 1:  # Support only AccuracyEvalTask
        self.report_sent_info({"segment_actions": actions})
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  src_tensor = src.as_tensor()
  out_mask = src.mask
  if self.downsample_by > 1:
    assert src_tensor.dim() == 3, \
      f"Downsampling only supported for tensors of order two (+ batch). Found dims {src_tensor.size()}"
    batch_size, seq_len, hidden_dim = src_tensor.size()
    if seq_len % self.downsample_by != 0:
      raise ValueError("For downsampling, sequence lengths must be multiples of the total reduce factor. "
                       "Configure batcher accordingly.")
    src_tensor = src_tensor.view((batch_size, seq_len // self.downsample_by, hidden_dim * self.downsample_by))
    if out_mask:
      out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_by)
  output = self.transform.transform(src_tensor)
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=out_mask)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  output = self.transform(es.as_tensor(), es.mask)
  return expression_seqs.ExpressionSequence(expr_tensor=output, mask=es.mask)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if expr_seq.batch_size() > 1:
    raise ValueError(f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.batch_size()}")
  lattice = self.cur_src[0]
  Wx_iog = dy.parameter(self.p_Wx_iog)
  Wh_iog = dy.parameter(self.p_Wh_iog)
  b_iog = dy.parameter(self.p_b_iog)
  Wx_f = dy.parameter(self.p_Wx_f)
  Wh_f = dy.parameter(self.p_Wh_f)
  b_f = dy.parameter(self.p_b_f)
  h = {}
  c = {}
  h_list = []

  batch_size = expr_seq.batch_size()
  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  for i, cur_node_id in enumerate(lattice.nodes):
    prev_node = lattice.graph.predecessors(cur_node_id)
    val = expr_seq[i]
    if self.dropout_rate > 0.0 and self.train:
      val = dy.cmult(val, self.dropout_mask_x)
    i_ft_list = []
    if len(prev_node) == 0:
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
    else:
      h_tilde = sum(h[pred] for pred in prev_node)
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
      for pred in prev_node:
        i_ft_list.append(dy.logistic(dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
    i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
    i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
    i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)
    i_it = dy.logistic(i_ait)
    i_ot = dy.logistic(i_aot)
    i_gt = dy.tanh(i_agt)
    if len(prev_node) == 0:
      c[cur_node_id] = dy.cmult(i_it, i_gt)
    else:
      fc = dy.cmult(i_ft_list[0], c[prev_node[0]])
      for pred_i in range(1, len(prev_node)):  # renamed to avoid shadowing the outer loop variable
        fc += dy.cmult(i_ft_list[pred_i], c[prev_node[pred_i]])
      c[cur_node_id] = fc + dy.cmult(i_it, i_gt)
    h_t = dy.cmult(i_ot, dy.tanh(c[cur_node_id]))
    if self.dropout_rate > 0.0 and self.train:
      h_t = dy.cmult(h_t, self.dropout_mask_h)
    h[cur_node_id] = h_t
    h_list.append(h_t)
  self._final_states = [transducers.FinalTransducerState(h_list[-1], h_list[-1])]
  return expression_seqs.ExpressionSequence(expr_list=h_list)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

  Args:
    expr_seq: expression sequence (will be accessed via tensor_expr)
  Return:
    expression sequence
  """
  if isinstance(expr_seq, list):
    mask_out = expr_seq[0].mask
    seq_len = len(expr_seq[0])
    batch_size = expr_seq[0].dim()[1]
    tensors = [e.as_tensor() for e in expr_seq]
    input_tensor = dy.reshape(dy.concatenate(tensors), (seq_len, 1, self.input_dim), batch_size=batch_size)
  else:
    mask_out = expr_seq.mask
    seq_len = len(expr_seq)
    batch_size = expr_seq.dim()[1]
    input_tensor = dy.reshape(dy.transpose(expr_seq.as_tensor()), (seq_len, 1, self.input_dim), batch_size=batch_size)

  if self.dropout > 0.0 and self.train:
    input_tensor = dy.dropout(input_tensor, self.dropout)

  proj_inp = dy.conv2d_bias(input_tensor, dy.parameter(self.p_f), dy.parameter(self.p_b),
                            stride=(self.stride, 1), is_valid=False)
  reduced_seq_len = proj_inp.dim()[0][0]
  proj_inp = dy.transpose(dy.reshape(proj_inp, (reduced_seq_len, self.hidden_dim * 3), batch_size=batch_size))
  # proj_inp dims: (hidden, 1, seq_len), batch_size
  if self.stride > 1 and mask_out is not None:
    mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

  h = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
  c = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
  for t in range(reduced_seq_len):
    f_t = dy.logistic(dy.strided_select(proj_inp, [], [0, t], [self.hidden_dim, t + 1]))
    o_t = dy.logistic(dy.strided_select(proj_inp, [], [self.hidden_dim, t], [self.hidden_dim * 2, t + 1]))
    z_t = dy.tanh(dy.strided_select(proj_inp, [], [self.hidden_dim * 2, t], [self.hidden_dim * 3, t + 1]))

    if self.dropout > 0.0 and self.train:
      retention_rate = 1.0 - self.dropout
      dropout_mask = dy.random_bernoulli((self.hidden_dim, 1), retention_rate, batch_size=batch_size)
      f_t = 1.0 - dy.cmult(dropout_mask, 1.0 - f_t)  # TODO: would be easy to make a zoneout dynet operation to save memory

    i_t = 1.0 - f_t

    if t == 0:
      c_t = dy.cmult(i_t, z_t)
    else:
      c_t = dy.cmult(f_t, c[-1]) + dy.cmult(i_t, z_t)
    h_t = dy.cmult(o_t, c_t)  # note: LSTM would use dy.tanh(c_t) instead of c_t
    if mask_out is None or np.isclose(np.sum(mask_out.np_arr[:, t:t + 1]), 0.0):
      c.append(c_t)
      h.append(h_t)
    else:
      c.append(mask_out.cmult_by_timestep_expr(c_t, t, True) + mask_out.cmult_by_timestep_expr(c[-1], t, False))
      h.append(mask_out.cmult_by_timestep_expr(h_t, t, True) + mask_out.cmult_by_timestep_expr(h[-1], t, False))

  self._final_states = [transducers.FinalTransducerState(dy.reshape(h[-1], (self.hidden_dim,), batch_size=batch_size),
                                                         dy.reshape(c[-1], (self.hidden_dim,), batch_size=batch_size))]
  return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask_out)
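# A numpy sketch of the gated recurrence above (quasi-RNN-style fo-pooling):
# all gate pre-activations come from a convolution over the input, so the only
# sequential work is the elementwise update
#   c_t = f_t * c_{t-1} + (1 - f_t) * z_t,   h_t = o_t * c_t.
# This helper is illustrative; gate values are assumed already squashed.
import numpy as np

def fo_pooling(f, o, z):
  # f, o, z: (seq_len, hidden) gate/update values
  c = np.zeros(z.shape[1])
  h = []
  for f_t, o_t, z_t in zip(f, o, z):
    c = f_t * c + (1.0 - f_t) * z_t
    h.append(o_t * c)
  return np.stack(h)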
def __call__(self, x: ExpressionSequence, att_mask: np.ndarray, batch_mask: np.ndarray, p: numbers.Real):
  """
  x: expression sequence of dimensions (input_dim, time) x batch
  att_mask: numpy array of dimensions (time, time); pre-transposed
  batch_mask: numpy array of dimensions (batch, time)
  p: dropout prob
  """
  sent_len = x.dim()[0][1]
  batch_size = x[0].dim()[1]

  if self.downsample_factor > 1:
    if sent_len % self.downsample_factor != 0:
      raise ValueError("For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                       "Configure batcher accordingly.")
    if batch_mask is not None:
      batch_mask = batch_mask[:, ::self.downsample_factor]
    sent_len_out = sent_len // self.downsample_factor
    sent_len = sent_len_out
    out_mask = x.mask
    if out_mask is not None:
      out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_factor)
    x = ExpressionSequence(expr_tensor=dy.reshape(x.as_tensor(),
                                                  (x.dim()[0][0] * self.downsample_factor,
                                                   x.dim()[0][1] // self.downsample_factor),
                                                  batch_size=batch_size),
                           mask=out_mask)
    residual = SAAMTimeDistributed()(x)
  else:
    residual = SAAMTimeDistributed()(x)
    sent_len_out = sent_len
  if self.model_dim != self.input_dim * self.downsample_factor:
    residual = self.res_shortcut.transform(residual)

  # Concatenate all the words together for doing vectorized affine transform
  if self.kq_pos_encoding_type is None:
    kvq_lin = self.linear_kvq.transform(SAAMTimeDistributed()(x))
    key_up = self.shape_projection(dy.pick_range(kvq_lin, 0, self.head_count * self.dim_per_head), batch_size)
    value_up = self.shape_projection(dy.pick_range(kvq_lin, self.head_count * self.dim_per_head,
                                                   2 * self.head_count * self.dim_per_head), batch_size)
    query_up = self.shape_projection(dy.pick_range(kvq_lin, 2 * self.head_count * self.dim_per_head,
                                                   3 * self.head_count * self.dim_per_head), batch_size)
  else:
    assert self.kq_pos_encoding_type == "embedding"
    encoding = self.kq_positional_embedder.embed_sent(sent_len).as_tensor()
    kq_lin = self.linear_kq.transform(SAAMTimeDistributed()(
      ExpressionSequence(expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
    key_up = self.shape_projection(dy.pick_range(kq_lin, 0, self.head_count * self.dim_per_head), batch_size)
    query_up = self.shape_projection(dy.pick_range(kq_lin, self.head_count * self.dim_per_head,
                                                   2 * self.head_count * self.dim_per_head), batch_size)
    v_lin = self.linear_v.transform(SAAMTimeDistributed()(x))
    value_up = self.shape_projection(v_lin, batch_size)

  if self.cross_pos_encoding_type:
    assert self.cross_pos_encoding_type == "embedding"
    emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0, sent_len)
    emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0, sent_len)
    key_up = dy.reshape(key_up, (sent_len, self.dim_per_head, self.head_count), batch_size=batch_size)
    key_up = dy.concatenate_cols([dy.cmult(key_up, emb1), dy.cmult(key_up, emb2)])
    key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2), batch_size=self.head_count * batch_size)
    query_up = dy.reshape(query_up, (sent_len, self.dim_per_head, self.head_count), batch_size=batch_size)
    query_up = dy.concatenate_cols([dy.cmult(query_up, emb2), dy.cmult(query_up, -emb1)])
    query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2), batch_size=self.head_count * batch_size)

  scaled = query_up * dy.transpose(key_up / math.sqrt(self.dim_per_head))  # scale before the matrix multiplication to save memory

  # Apply masks here
  if not self.ignore_masks:
    if att_mask is not None:
      att_mask_inp = att_mask * -100.0
      if self.downsample_factor > 1:
        att_mask_inp = att_mask_inp[::self.downsample_factor, ::self.downsample_factor]
      scaled += dy.inputTensor(att_mask_inp)
    if batch_mask is not None:
      # reshape (batch, time) -> (time, head_count*batch), then * -100
      inp = np.resize(np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                      (sent_len, self.head_count, batch_size)),
                      (1, sent_len, self.head_count * batch_size)) * -100
      mask_expr = dy.inputTensor(inp, batched=True)
      scaled += mask_expr
    if self.diag_gauss_mask:
      diag_growing = np.zeros((sent_len, sent_len, self.head_count))
      for i in range(sent_len):
        for j in range(sent_len):
          diag_growing[i, j, :] = -(i - j) ** 2 / 2.0
      e_diag_gauss_mask = dy.inputTensor(diag_growing)
      e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
      if self.square_mask_std:
        e_sigma = dy.square(e_sigma)
      e_sigma_sq_inv = dy.cdiv(dy.ones(e_sigma.dim()[0], batch_size=batch_size), dy.square(e_sigma))
      e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask, e_sigma_sq_inv)
      scaled += dy.reshape(e_diag_gauss_mask_final, (sent_len, sent_len), batch_size=batch_size * self.head_count)

  # Compute the softmax
  attn = dy.softmax(scaled, d=1)
  if LOG_ATTENTION:
    yaml_logger.info({"key": "selfatt_mat_ax0", "value": np.average(attn.value(), axis=0).dumps(), "desc": self.desc})
    yaml_logger.info({"key": "selfatt_mat_ax1", "value": np.average(attn.value(), axis=1).dumps(), "desc": self.desc})
    yaml_logger.info({"key": "selfatt_mat_ax0_ent", "value": entropy(attn.value()).dumps(), "desc": self.desc})
    yaml_logger.info({"key": "selfatt_mat_ax1_ent", "value": entropy(attn.value().transpose()).dumps(), "desc": self.desc})

  if self.select_att_head is not None:
    # zero out all but the selected head
    attn = dy.reshape(attn, (sent_len, sent_len, self.head_count), batch_size=batch_size)
    sel_mask = np.zeros((1, 1, self.head_count))
    sel_mask[0, 0, self.select_att_head] = 1.0
    attn = dy.cmult(attn, dy.inputTensor(sel_mask))
    attn = dy.reshape(attn, (sent_len, sent_len), batch_size=self.head_count * batch_size)

  # Apply dropout to the attention
  if p > 0.0:
    drop_attn = dy.dropout(attn, p)
  else:
    drop_attn = attn

  # Compute the attention-weighted values
  attn_prod = drop_attn * value_up

  # Reshape attn_prod back to the input query dimensions
  out = dy.reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), batch_size=batch_size)
  out = dy.transpose(out)
  out = dy.reshape(out, (self.model_dim,), batch_size=batch_size * sent_len_out)
  # out = dy.reshape_transpose_reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), (self.model_dim,), pre_batch_size=batch_size, post_batch_size=batch_size*sent_len_out)

  if self.plot_attention:
    from sklearn.metrics.pairwise import cosine_similarity
    assert batch_size == 1
    mats = []
    for i in range(attn.dim()[1]):
      mats.append(dy.pick_batch_elem(attn, i).npvalue())
      self.plot_att_mat(mats[-1],
                        "{}.sent_{}.head_{}.png".format(self.plot_attention, self.plot_attention_counter, i),
                        300)
    avg_mat = np.average(mats, axis=0)
    self.plot_att_mat(avg_mat,
                      "{}.sent_{}.head_avg.png".format(self.plot_attention, self.plot_attention_counter),
                      300)
    cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
    self.plot_att_mat(cosim_before,
                      "{}.sent_{}.cosim_before.png".format(self.plot_attention, self.plot_attention_counter),
                      600)
    cosim_after = cosine_similarity(out.npvalue().T)
    self.plot_att_mat(cosim_after,
                      "{}.sent_{}.cosim_after.png".format(self.plot_attention, self.plot_attention_counter),
                      600)
    self.plot_attention_counter += 1

  # Apply dropout and layer normalization
  if p > 0.0:
    res = dy.dropout(out, p) + residual
  else:
    res = out + residual
  ret = self.layer_norm.transform(res)
  return ret
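# A numpy sketch of the learned Gaussian diagonal bias applied to the
# attention logits above: head h adds -(i-j)^2 / (2 * sigma_h^2) before the
# softmax, so small sigmas concentrate that head near the diagonal. Here the
# sigmas are fixed; in the transducer, sigma is a trained parameter
# (optionally squared via square_mask_std).
import numpy as np

def gaussian_diag_bias(sent_len, sigmas):
  # sigmas: one value per head; returns (sent_len, sent_len, head_count)
  i = np.arange(sent_len)[:, None, None]
  j = np.arange(sent_len)[None, :, None]
  sig = np.asarray(sigmas, dtype=np.float64)[None, None, :]
  return -(i - j) ** 2 / (2.0 * sig ** 2)

bias = gaussian_diag_bias(4, [0.5, 2.0])  # head 0 is sharply diagonal, head 1 diffuse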
def transduce(self, embed):
  self.seq_transducer.transduce(ExpressionSequence(expr_tensor=embed))
  return self.seq_transducer.get_final_states()[-1].main_expr()