def transduce(self, sent: ExpressionSequence) -> ExpressionSequence:
  if self.pos_encoding_type == "trigonometric":
    if self.position_encoding_block is None or self.position_encoding_block.shape[2] < len(sent):
      self.initialize_position_encoding(int(len(sent) * 1.2),
                                        self.input_dim if self.pos_encoding_combine == "add" else self.pos_encoding_size)
    encoding = dy.inputTensor(self.position_encoding_block[0, :, :len(sent)])
  elif self.pos_encoding_type == "embedding":
    encoding = self.positional_embedder.embed_sent(len(sent)).as_tensor()
  elif self.pos_encoding_type:
    raise ValueError(f"unknown encoding type {self.pos_encoding_type}")
  if self.pos_encoding_type:
    if self.pos_encoding_combine == "add":
      sent = ExpressionSequence(expr_tensor=sent.as_tensor() + encoding, mask=sent.mask)
    else:  # concat
      sent = ExpressionSequence(expr_tensor=dy.concatenate([sent.as_tensor(), encoding]),
                                mask=sent.mask)
  for module in self.modules:
    sent = module.transduce(sent)
  self._final_states = [transducers.FinalTransducerState(sent[-1])]
  return sent
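# A minimal sketch (an assumption, not necessarily the codebase's actual helper) of
# what initialize_position_encoding above is expected to compute: the sinusoidal
# position encodings of Vaswani et al. (2017), stored as a (1, dim, max_len) block
# so that block[0, :, :len(sent)] slices out one sentence's encoding. Assumes dim
# is even.
import numpy as np

def make_trigonometric_position_encoding_block(max_len: int, dim: int) -> np.ndarray:
  positions = np.arange(max_len)[:, np.newaxis]             # (max_len, 1)
  div_term = np.power(10000.0, np.arange(0, dim, 2) / dim)  # (dim/2,)
  block = np.zeros((max_len, dim))
  block[:, 0::2] = np.sin(positions / div_term)             # even dims: sine
  block[:, 1::2] = np.cos(positions / div_term)             # odd dims: cosine
  return block.T[np.newaxis, :, :]                          # (1, dim, max_len)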
def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence
  Returns:
    expression sequence
  """
  batch_size = expr_seq[0].dim()[1]
  seq_len = len(expr_seq)

  output_exps = []
  for pos_i in range(seq_len):
    input_i = expr_seq[pos_i]
    affine = self.linear_layer(input_i)
    # affine = dy.affine_transform([dy.parameter(self.p_b), dy.parameter(self.p_W), input_i])
    if self.train and self.dropout_rate:
      affine = dy.dropout(affine, self.dropout_rate)
    if self.gumbel:
      # perturb the logits with Gumbel noise so the softmax samples rather than averages
      affine = affine + dy.random_gumbel(dim=affine.dim()[0], batch_size=batch_size)
    softmax_out = dy.softmax(affine)
    # embedded = self.emb_layer(softmax_out)
    # soft embedding lookup: expected embedding under the softmax distribution
    embedded = dy.parameter(self.p_E) * softmax_out
    if self.residual:
      embedded = embedded + input_i
    output_exps.append(embedded)

  # final state is the (soft) embedding at the last position
  self._final_states = [transducers.FinalTransducerState(main_expr=embedded)]

  return expression_seqs.ExpressionSequence(expr_list=output_exps, mask=expr_seq.mask)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if expr_seq.dim()[1] > 1:
    raise ValueError(f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}")
  lattice = self.cur_src[0]
  Wx_iog = dy.parameter(self.p_Wx_iog)
  Wh_iog = dy.parameter(self.p_Wh_iog)
  b_iog = dy.parameter(self.p_b_iog)
  Wx_f = dy.parameter(self.p_Wx_f)
  Wh_f = dy.parameter(self.p_Wh_f)
  b_f = dy.parameter(self.p_b_f)
  h = []
  c = []

  batch_size = expr_seq.dim()[1]
  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  for node_i in range(lattice.sent_len()):
    cur_node = lattice.nodes[node_i]
    val = expr_seq[node_i]
    if self.dropout_rate > 0.0 and self.train:
      val = dy.cmult(val, self.dropout_mask_x)
    i_ft_list = []
    if len(cur_node.nodes_prev) == 0:
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
    else:
      h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
      # one forget gate per incoming lattice edge
      for pred in cur_node.nodes_prev:
        i_ft_list.append(dy.logistic(dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
    i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
    i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
    i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)
    i_it = dy.logistic(i_ait)
    i_ot = dy.logistic(i_aot)
    i_gt = dy.tanh(i_agt)
    if len(cur_node.nodes_prev) == 0:
      c.append(dy.cmult(i_it, i_gt))
    else:
      fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
      for i in range(1, len(cur_node.nodes_prev)):
        fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
      c.append(fc + dy.cmult(i_it, i_gt))
    h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
    if self.dropout_rate > 0.0 and self.train:
      h_t = dy.cmult(h_t, self.dropout_mask_h)
    h.append(h_t)
  self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
  return expression_seqs.ExpressionSequence(expr_list=h)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
  Returns:
    expression sequence
  """
  # Start with a [(length, model_size) x batch] tensor
  # B x T x H -> B x H x T
  x = expr_seq.as_tensor()
  x_len = x.size()[1]
  x_batch = x.size()[0]
  # Get the query key and value vectors
  q = self.lin_q(x).transpose(1, 2).contiguous()
  k = self.lin_k(x).transpose(1, 2).contiguous()
  v = self.lin_v(x).transpose(1, 2).contiguous()
  # q = bq + x * Wq
  # k = bk + x * Wk
  # v = bv + x * Wv

  # Split to batches [(length, head_dim) x batch * num_heads] tensor
  q, k, v = [temp.view((x_batch * self.num_heads, self.head_dim, x_len)) for temp in (q, k, v)]

  # Do scaled dot product [batch*num_heads, length, length], rows are keys, columns are queries
  attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
  if expr_seq.mask is not None:
    mask = torch.Tensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0) * -1e10).to(xnmt.device)
    attn_score = attn_score + mask.unsqueeze(2)
  attn_prob = torch.nn.Softmax(dim=1)(attn_score)
  # attn_prob = dy.softmax(attn_score, d=1)
  if self.train and self.dropout > 0.0:
    attn_prob = tt.dropout(attn_prob, self.dropout)
  # Reduce using attention and resize to match [(length, model_size) x batch]
  o = torch.matmul(v, attn_prob).view(x_batch, self.input_dim, x_len).transpose(1, 2)
  # Final transformation
  o = self.lin_o(o)
  # o = bo + o * Wo

  expr_seq = expression_seqs.ExpressionSequence(expr_tensor=o, mask=expr_seq.mask)
  self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]
  return expr_seq
def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  """
  transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
  Returns:
    expression sequence
  """
  if isinstance(expr_seq, expression_seqs.ExpressionSequence):
    expr_seq = [expr_seq]
  batch_size = expr_seq[0][0].dim()[1]
  seq_len = len(expr_seq[0])

  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  cur_input = expr_seq
  self._final_states = []
  for layer_i in range(self.num_layers):
    h = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
    c = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
    for pos_i in range(seq_len):
      x_t = [cur_input[j][pos_i] for j in range(len(cur_input))]
      if isinstance(x_t, dy.Expression):
        x_t = [x_t]
      elif type(x_t) != list:
        x_t = list(x_t)
      if sum([x_t_i.dim()[0][0] for x_t_i in x_t]) != self.total_input_dim:
        found_dim = sum([x_t_i.dim()[0][0] for x_t_i in x_t])
        raise ValueError(f"VanillaLSTMGates: x_t has inconsistent dimension {found_dim}, expecting {self.total_input_dim}")
      if self.dropout_rate > 0.0 and self.train:
        # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
        gates_t = dy.vanilla_lstm_gates_dropout_concat(x_t, h[-1],
                                                       self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                                                       self.dropout_mask_x[layer_i], self.dropout_mask_h[layer_i],
                                                       self.weightnoise_std if self.train else 0.0)
      else:
        gates_t = dy.vanilla_lstm_gates_concat(x_t, h[-1],
                                               self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                                               self.weightnoise_std if self.train else 0.0)
      c_t = dy.vanilla_lstm_c(c[-1], gates_t)
      h_t = dy.vanilla_lstm_h(c_t, gates_t)
      if expr_seq[0].mask is None or np.isclose(np.sum(expr_seq[0].mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
        c.append(c_t)
        h.append(h_t)
      else:
        c.append(expr_seq[0].mask.cmult_by_timestep_expr(c_t, pos_i, True)
                 + expr_seq[0].mask.cmult_by_timestep_expr(c[-1], pos_i, False))
        h.append(expr_seq[0].mask.cmult_by_timestep_expr(h_t, pos_i, True)
                 + expr_seq[0].mask.cmult_by_timestep_expr(h[-1], pos_i, False))
    self._final_states.append(transducers.FinalTransducerState(h[-1], c[-1]))
    cur_input = [h[1:]]

  return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=expr_seq[0].mask)
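# A sketch of the set_dropout_masks helper called by the LSTM transducers above,
# assuming the variational-dropout scheme of https://arxiv.org/abs/1512.05287: one
# Bernoulli mask per layer for the input and one for the recurrent state, sampled
# once and reused at every timestep. Attribute names (num_layers, hidden_dim,
# total_input_dim, dropout_rate) are taken from the surrounding code; the real
# helper may differ in detail.
import dynet as dy

def set_dropout_masks(self, batch_size: int = 1) -> None:
  keep_rate = 1.0 - self.dropout_rate
  scale = 1.0 / keep_rate  # inverted dropout: scale kept units at training time
  self.dropout_mask_x = []
  self.dropout_mask_h = []
  for layer_i in range(self.num_layers):
    # first layer consumes the (possibly concatenated) inputs, later layers the hidden state
    input_dim = self.total_input_dim if layer_i == 0 else self.hidden_dim
    self.dropout_mask_x.append(dy.random_bernoulli((input_dim,), keep_rate, scale, batch_size=batch_size))
    self.dropout_mask_h.append(dy.random_bernoulli((self.hidden_dim,), keep_rate, scale, batch_size=batch_size))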
def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  """
  transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
  Returns:
    expression sequence
  """
  if isinstance(expr_seq, expression_seqs.ExpressionSequence):
    expr_seq = [expr_seq]
  concat_inputs = len(expr_seq) >= 2
  batch_size = tt.batch_size(expr_seq[0][0])
  seq_len = expr_seq[0].sent_len()
  mask = expr_seq[0].mask

  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  cur_input = expr_seq
  self._final_states = []
  for layer_i in range(self.num_layers):
    h = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
    c = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
    for pos_i in range(seq_len):
      if concat_inputs and layer_i == 0:
        x_t = tt.concatenate([cur_input[i][pos_i] for i in range(len(cur_input))])
      else:
        x_t = cur_input[0][pos_i]
      h_tm1 = h[-1]
      if self.dropout_rate > 0.0 and self.train:
        # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
        x_t = torch.mul(x_t, self.dropout_mask_x[layer_i])
        h_tm1 = torch.mul(h_tm1, self.dropout_mask_h[layer_i])
      h_t, c_t = self.layers[layer_i](x_t, (h_tm1, c[-1]))
      if mask is None or np.isclose(np.sum(mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
        c.append(c_t)
        h.append(h_t)
      else:
        c.append(mask.cmult_by_timestep_expr(c_t, pos_i, True)
                 + mask.cmult_by_timestep_expr(c[-1], pos_i, False))
        h.append(mask.cmult_by_timestep_expr(h_t, pos_i, True)
                 + mask.cmult_by_timestep_expr(h[-1], pos_i, False))
    self._final_states.append(transducers.FinalTransducerState(h[-1], c[-1]))
    cur_input = [h[1:]]

  return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask)
def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  returns the list of output Expressions obtained by adding the given inputs to the current state,
  one by one, to both the forward and backward RNNs, and concatenating.

  Args:
    es: an ExpressionSequence
  """
  es_list = [es]

  for layer_i, (fb, bb) in enumerate(self.builder_layers):
    reduce_factor = self._reduce_factor_for_layer(layer_i)

    if es_list[0].mask is None:
      mask_out = None
    else:
      mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

    if self.downsampling_method == "concat" and es_list[0].sent_len() % reduce_factor != 0:
      raise ValueError(f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                       f"but got sequence length={es_list[0].sent_len()} for reduce_factor={reduce_factor}. "
                       f"Set Batcher's pad_src_to_multiple argument accordingly.")
    fs = fb.transduce(es_list)
    bs = bb.transduce([expression_seqs.ReversedExpressionSequence(es_item) for es_item in es_list])
    if layer_i < len(self.builder_layers) - 1:
      if self.downsampling_method == "skip":
        es_list = [expression_seqs.ExpressionSequence(expr_list=fs[::reduce_factor], mask=mask_out),
                   expression_seqs.ExpressionSequence(expr_list=bs[::reduce_factor][::-1], mask=mask_out)]
      elif self.downsampling_method == "concat":
        # group reduce_factor consecutive states into parallel sequences: e.g. with
        # reduce_factor=2 and forward states [f0, f1, f2, f3], the next layer receives
        # [f0, f2] and [f1, f3] (plus the analogous backward sequences) and concatenates
        # them at each timestep, halving the sequence length per layer
        es_len = es_list[0].sent_len()
        es_list_fwd = []
        es_list_bwd = []
        for i in range(0, es_len, reduce_factor):
          for j in range(reduce_factor):
            if i == 0:
              es_list_fwd.append([])
              es_list_bwd.append([])
            es_list_fwd[j].append(fs[i + j])
            es_list_bwd[j].append(bs[es_list[0].sent_len() - reduce_factor + j - i])
        es_list = [expression_seqs.ExpressionSequence(expr_list=es_list_fwd[j], mask=mask_out) for j in range(reduce_factor)] + \
                  [expression_seqs.ExpressionSequence(expr_list=es_list_bwd[j], mask=mask_out) for j in range(reduce_factor)]
      else:
        raise RuntimeError(f"unknown downsampling_method {self.downsampling_method}")
    else:
      # concat final outputs
      ret_es = expression_seqs.ExpressionSequence(
        expr_list=[tt.concatenate([f, b]) for f, b in zip(fs, expression_seqs.ReversedExpressionSequence(bs))],
        mask=mask_out)

  self._final_states = [transducers.FinalTransducerState(tt.concatenate([fb.get_final_states()[0].main_expr(),
                                                                         bb.get_final_states()[0].main_expr()]),
                                                         tt.concatenate([fb.get_final_states()[0].cell_expr(),
                                                                         bb.get_final_states()[0].cell_expr()]))
                        for (fb, bb) in self.builder_layers]

  return ret_es
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  sent_len = len(src)
  embeddings = dy.strided_select(dy.parameter(self.embedder), [1, 1], [0, 0], [self.input_dim, sent_len])
  if self.op == 'sum':
    output = embeddings + src.as_tensor()
  elif self.op == 'concat':
    output = dy.concatenate([embeddings, src.as_tensor()])
  else:
    raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
  if self.train and self.dropout > 0.0:
    output = dy.dropout(output, self.dropout)
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=src.mask)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
  Returns:
    expression sequence
  """
  Wq, Wk, Wv, Wo = [dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)]
  bq, bk, bv, bo = [dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)]

  # Start with a [(length, model_size) x batch] tensor
  x = expr_seq.as_transposed_tensor()
  x_len = x.dim()[0][0]
  x_batch = x.dim()[1]
  # Get the query key and value vectors
  # TODO: do we need bias broadcasting in DyNet?
  # q = dy.affine_transform([bq, x, Wq])
  # k = dy.affine_transform([bk, x, Wk])
  # v = dy.affine_transform([bv, x, Wv])
  q = bq + x * Wq
  k = bk + x * Wk
  v = bv + x * Wv

  # Split to batches [(length, head_dim) x batch * num_heads] tensor
  q, k, v = [dy.reshape(tmp, (x_len, self.head_dim), batch_size=x_batch * self.num_heads) for tmp in (q, k, v)]

  # Do scaled dot product [(length, length) x batch * num_heads], rows are queries, columns are keys
  attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
  if expr_seq.mask is not None:
    mask = dy.inputTensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(), batched=True) * -1e10
    attn_score = attn_score + mask
  attn_prob = dy.softmax(attn_score, d=1)
  if self.train and self.dropout > 0.0:
    attn_prob = dy.dropout(attn_prob, self.dropout)
  # Reduce using attention and resize to match [(length, model_size) x batch]
  o = dy.reshape(attn_prob * v, (x_len, self.input_dim), batch_size=x_batch)
  # Final transformation
  # o = dy.affine_transform([bo, attn_prob * v, Wo])
  o = bo + o * Wo

  expr_seq = expression_seqs.ExpressionSequence(expr_transposed_tensor=o, mask=expr_seq.mask)

  self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]

  return expr_seq
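# Reference implementation (numpy, single head, unbatched) of the scaled
# dot-product step above; a sketch to document the orientation used there: rows of
# the score matrix index queries, columns index keys, softmax normalizes over keys,
# and prob @ v mixes value rows per query.
import numpy as np

def scaled_dot_attention_ref(q: np.ndarray, k: np.ndarray, v: np.ndarray) -> np.ndarray:
  # q, k, v: (length, head_dim)
  score = q @ k.T / np.sqrt(k.shape[1])      # (length, length)
  score -= score.max(axis=1, keepdims=True)  # numerical stability
  prob = np.exp(score)
  prob /= prob.sum(axis=1, keepdims=True)    # softmax over keys (axis 1)
  return prob @ v                            # (length, head_dim)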
def transduce(self, es: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  batch_size = tt.batch_size(es.as_tensor())

  if es.mask:
    seq_lengths = es.mask.seq_lengths()
  else:
    seq_lengths = [es.sent_len()] * batch_size

  # Sort the input and lengths in descending order
  seq_lengths = torch.LongTensor(seq_lengths).to(xnmt.device)
  lengths, perm_index = seq_lengths.sort(0, descending=True)
  sorted_input = es.as_tensor()[perm_index]
  perm_index_rev = [-1] * len(lengths)
  for i in range(len(lengths)):
    perm_index_rev[perm_index[i]] = i
  perm_index_rev = torch.LongTensor(perm_index_rev).to(xnmt.device)

  packed_input = nn.utils.rnn.pack_padded_sequence(sorted_input, list(lengths.data), batch_first=True)
  state_size = self.num_dir * self.num_layers, batch_size, self.hidden_dim // self.num_dir
  h0 = sorted_input.new_zeros(*state_size)
  c0 = sorted_input.new_zeros(*state_size)
  output, (final_hiddens, final_cells) = self.lstm(packed_input, (h0, c0))
  output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True, total_length=es.sent_len())[0]

  # restore the sorting
  decoded = output[perm_index_rev]
  self._final_states = []
  for layer_i in range(self.num_layers):
    final_hidden = final_hiddens.view(self.num_layers, self.num_dir, batch_size, -1)[layer_i] \
                                .transpose(0, 1).contiguous().view(batch_size, -1)
    final_hidden = final_hidden[perm_index_rev]
    self._final_states.append(transducers.FinalTransducerState(final_hidden))
  ret = expression_seqs.ExpressionSequence(expr_tensor=decoded, mask=es.mask)
  return ret
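# Self-contained demonstration of the sort/unsort bookkeeping above: perm_index
# sorts the batch by descending length (as pack_padded_sequence requires) and
# perm_index_rev is its inverse permutation, used to restore the original order.
import torch

lengths = torch.LongTensor([3, 5, 4])
sorted_lengths, perm_index = lengths.sort(0, descending=True)  # [5, 4, 3], perm_index [1, 2, 0]
perm_index_rev = torch.empty_like(perm_index)
perm_index_rev[perm_index] = torch.arange(len(lengths))        # inverse permutation [2, 0, 1]
assert torch.equal(sorted_lengths[perm_index_rev], lengths)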
def transduce(self, x: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  expr = x.as_transposed_tensor()
  batch_size, hidden_dim, seq_len = expr.size()
  expr = expr.view((batch_size, self.in_channels, hidden_dim // self.in_channels, seq_len))
  expr = self.cnn_layer(expr)
  if self.use_pooling:
    expr = self.pooling_layer(expr)
  expr = self.activation_fct(expr)
  batch_size, out_chn, out_h, seq_len = expr.size()
  expr = expr.view((batch_size, out_chn * out_h, seq_len))
  output_seq = expression_seqs.ExpressionSequence(
    expr_transposed_tensor=expr,
    mask=x.mask.lin_subsampled(trg_len=seq_len) if x.mask else None)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq
def calc_context(self, src_encoding):
  # Generating h_t based on RNN(h_{t-1}, embed(e_{t-1}))
  if self.prev_written_word is None:
    final_transducer_state = [transducers_base.FinalTransducerState(h, c)
                              for h, c in zip(self.encoder_state.h(), self.encoder_state.c())]
    context_state = self.model.decoder.initial_state(final_transducer_state, vocabs.Vocab.SS)
  else:
    context_state = self.model.decoder.add_input(self.context_state, self.prev_written_word)
  # Reset attender if there is a read action
  reset_attender = self.reset_attender
  if reset_attender:
    self.model.attender.init_sent(expr_seq.ExpressionSequence(expr_list=src_encoding))
    reset_attender = False
  # Calc context for decoding
  context_state.context = self.model.attender.calc_context(context_state.rnn_state.output())
  return SimultaneousState(self.model, self.encoder_state, context_state,
                           self.output_embed, self.has_been_read, self.has_been_written,
                           self.prev_written_word, reset_attender)
def transduce(self, es):
  mask = es.mask
  # first layer
  forward_es = self.forward_layers[0].transduce(es)
  rev_backward_es = self.backward_layers[0].transduce(expression_seqs.ReversedExpressionSequence(es))

  # TODO: concat input of each layer to its output; or, maybe just add standard residual connections
  for layer_i in range(1, len(self.forward_layers)):
    new_forward_es = self.forward_layers[layer_i].transduce(
      [forward_es, expression_seqs.ReversedExpressionSequence(rev_backward_es)])
    mask_out = mask
    if mask_out is not None and new_forward_es.mask.np_arr.shape != mask_out.np_arr.shape:
      mask_out = mask_out.lin_subsampled(trg_len=len(new_forward_es))
    rev_backward_es = expression_seqs.ExpressionSequence(
      self.backward_layers[layer_i].transduce(
        [expression_seqs.ReversedExpressionSequence(forward_es), rev_backward_es]).as_list(),
      mask=mask_out)
    forward_es = new_forward_es

  self._final_states = [
    transducers.FinalTransducerState(
      dy.concatenate([self.forward_layers[layer_i].get_final_states()[0].main_expr(),
                      self.backward_layers[layer_i].get_final_states()[0].main_expr()]),
      dy.concatenate([self.forward_layers[layer_i].get_final_states()[0].cell_expr(),
                      self.backward_layers[layer_i].get_final_states()[0].cell_expr()]))
    for layer_i in range(len(self.forward_layers))]
  mask_out = mask
  if mask_out is not None and forward_es.mask.np_arr.shape != mask_out.np_arr.shape:
    mask_out = mask_out.lin_subsampled(trg_len=len(forward_es))
  return expression_seqs.ExpressionSequence(
    expr_list=[dy.concatenate([forward_es[i], rev_backward_es[-i - 1]]) for i in range(len(forward_es))],
    mask=mask_out)
def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  for layer_i, (fb, bb) in enumerate(self.lstm_layers):
    fs = fb.transduce(es)
    bs = bb.transduce(expression_seqs.ReversedExpressionSequence(es))
    interleaved = []

    if es.mask is None:
      mask = None
    else:
      mask = es.mask.lin_subsampled(0.5)  # upsample the mask to encompass interleaved fwd / bwd expressions

    for pos in range(len(fs)):
      interleaved.append(fs[pos])
      interleaved.append(bs[-pos - 1])
    projected = expression_seqs.ExpressionSequence(expr_list=interleaved, mask=mask)
    projected = self.nin_layers[layer_i].transduce(projected)
    assert math.ceil(len(es) / float(self.stride)) == len(projected), \
      f"mismatched len(es)=={len(es)}, stride=={self.stride}, len(projected)=={len(projected)}"
    es = projected

  self._final_states = [transducers.FinalTransducerState(projected[-1])]
  return projected
def write(self, src_encoding, word, policy_action):
  # Reset attender if there is a read action
  reset_attender = self.reset_attender
  if reset_attender:
    encodings = src_encoding[:self.has_been_read]
    self.model.attender.init_sent(expr_seq.ExpressionSequence(expr_list=encodings))
    reset_attender = False
  # Generating h_t based on RNN(h_{t-1}, embed(e_{t-1}))
  if self.decoder_state is None or word is None:
    dim = src_encoding[0].dim()
    fin_tran_state = [transducers_base.FinalTransducerState(dy.zeros(*dim), dy.zeros(*dim))]
    decoder_state = self.model.decoder.initial_state(fin_tran_state, vocabs.Vocab.SS)
  else:
    decoder_state = self.model.decoder.add_input(self.decoder_state, word)
  # Calc context for decoding
  decoder_state.attention = self.model.attender.calc_attention(decoder_state.as_vector())
  decoder_state.context = self.model.attender.calc_context(decoder_state.as_vector(), decoder_state.attention)
  return SimultaneousState(self.model, self.encoder_state, decoder_state,
                           has_been_read=self.has_been_read,
                           has_been_written=self.has_been_written + 1,
                           written_word=word,
                           policy_action=policy_action,
                           reset_attender=reset_attender,
                           parent=self)
def transduce(self, xs: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  batch_size = xs[0][0].dim()[1]

  h_bot = []
  h_mid = []
  h_top = []
  z_bot = []
  z_mid = []
  z_top = []

  self.top_layer.h = None
  self.top_layer.c = None
  self.top_layer.z = None
  self.mid_layer.h = None
  self.mid_layer.c = None
  self.mid_layer.z = None
  self.bottom_layer.h = None
  self.bottom_layer.c = None
  self.bottom_layer.z = None

  # ?? checkme: want to init z to ones? (cherry paper)
  z_one = dy.ones(1, batch_size=batch_size)
  # indices for timesteps are +1
  h_bot.append(dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size))
  h_mid.append(dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size))
  h_top.append(dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size))

  for i, x_t in enumerate(xs):
    # each layer uses h from the layer above at the previous time step, plus its own
    # h and z from the previous time step (saved in the HMLSTM cell)
    h_t_bot, z_t_bot = self.bottom_layer.transduce(h_below=x_t, h_above=h_mid[i], z_below=z_one)
    h_t_mid, z_t_mid = self.mid_layer.transduce(h_below=h_t_bot, h_above=h_top[i], z_below=z_t_bot)
    h_t_top, z_t_top = self.top_layer.transduce(h_below=h_t_mid, h_above=None, z_below=z_t_mid)
    h_bot.append(h_t_bot)
    z_bot.append(z_t_bot)
    h_mid.append(h_t_mid)
    z_mid.append(z_t_mid)
    h_top.append(h_t_top)
    z_top.append(z_t_top)

  # # gated output module
  # # sigmoid
  # W_layer = dy.parameters(dim=(len(self.modules), hidden_dim))  # needs to be moved to init? num layers by hidden_dim
  # h_cat = dy.transpose(dy.concatenate([h_bot, h_mid, h_top]))
  # dotted = dy.dot_product(e1, e2)
  # gates = dy.logistic(dotted)
  # # relu
  # om = dy.relu()

  # final state is last hidden state from top layer
  self._final_states = [transducers.FinalTransducerState(h_top[-1])]
  # drop the initial zero states so the output has the same length as the input
  fin_xs = expression_seqs.ExpressionSequence(expr_list=h_top[1:])
  return fin_xs
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if expr_seq.batch_size() > 1:
    raise ValueError(f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.batch_size()}")
  lattice = self.cur_src[0]
  Wx_iog = dy.parameter(self.p_Wx_iog)
  Wh_iog = dy.parameter(self.p_Wh_iog)
  b_iog = dy.parameter(self.p_b_iog)
  Wx_f = dy.parameter(self.p_Wx_f)
  Wh_f = dy.parameter(self.p_Wh_f)
  b_f = dy.parameter(self.p_b_f)
  h = {}
  c = {}
  h_list = []

  batch_size = expr_seq.batch_size()
  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  for i, cur_node_id in enumerate(lattice.nodes):
    prev_node = lattice.graph.predecessors(cur_node_id)
    val = expr_seq[i]
    if self.dropout_rate > 0.0 and self.train:
      val = dy.cmult(val, self.dropout_mask_x)
    i_ft_list = []
    if len(prev_node) == 0:
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
    else:
      h_tilde = sum(h[pred] for pred in prev_node)
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
      # one forget gate per incoming lattice edge
      for pred in prev_node:
        i_ft_list.append(dy.logistic(dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
    i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
    i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
    i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)
    i_it = dy.logistic(i_ait)
    i_ot = dy.logistic(i_aot)
    i_gt = dy.tanh(i_agt)
    if len(prev_node) == 0:
      c[cur_node_id] = dy.cmult(i_it, i_gt)
    else:
      fc = dy.cmult(i_ft_list[0], c[prev_node[0]])
      for j in range(1, len(prev_node)):
        fc += dy.cmult(i_ft_list[j], c[prev_node[j]])
      c[cur_node_id] = fc + dy.cmult(i_it, i_gt)
    h_t = dy.cmult(i_ot, dy.tanh(c[cur_node_id]))
    if self.dropout_rate > 0.0 and self.train:
      h_t = dy.cmult(h_t, self.dropout_mask_h)
    h[cur_node_id] = h_t
    h_list.append(h_t)
  self._final_states = [transducers.FinalTransducerState(h_list[-1], h_list[-1])]
  return expression_seqs.ExpressionSequence(expr_list=h_list)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

  Args:
    expr_seq: expression sequence (will be accessed via tensor_expr)
  Returns:
    expression sequence
  """
  if isinstance(expr_seq, list):
    mask_out = expr_seq[0].mask
    seq_len = len(expr_seq[0])
    batch_size = expr_seq[0].dim()[1]
    tensors = [e.as_tensor() for e in expr_seq]
    input_tensor = dy.reshape(dy.concatenate(tensors), (seq_len, 1, self.input_dim), batch_size=batch_size)
  else:
    mask_out = expr_seq.mask
    seq_len = len(expr_seq)
    batch_size = expr_seq.dim()[1]
    input_tensor = dy.reshape(dy.transpose(expr_seq.as_tensor()), (seq_len, 1, self.input_dim), batch_size=batch_size)

  if self.dropout > 0.0 and self.train:
    input_tensor = dy.dropout(input_tensor, self.dropout)

  # compute the f, o, z gate pre-activations for all timesteps with a single convolution
  proj_inp = dy.conv2d_bias(input_tensor, dy.parameter(self.p_f), dy.parameter(self.p_b),
                            stride=(self.stride, 1), is_valid=False)
  reduced_seq_len = proj_inp.dim()[0][0]
  proj_inp = dy.transpose(dy.reshape(proj_inp, (reduced_seq_len, self.hidden_dim * 3), batch_size=batch_size))
  # proj_inp dims: (hidden, 1, seq_len), batch_size
  if self.stride > 1 and mask_out is not None:
    mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

  h = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
  c = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
  for t in range(reduced_seq_len):
    f_t = dy.logistic(dy.strided_select(proj_inp, [], [0, t], [self.hidden_dim, t + 1]))
    o_t = dy.logistic(dy.strided_select(proj_inp, [], [self.hidden_dim, t], [self.hidden_dim * 2, t + 1]))
    z_t = dy.tanh(dy.strided_select(proj_inp, [], [self.hidden_dim * 2, t], [self.hidden_dim * 3, t + 1]))

    if self.dropout > 0.0 and self.train:
      retention_rate = 1.0 - self.dropout
      dropout_mask = dy.random_bernoulli((self.hidden_dim, 1), retention_rate, batch_size=batch_size)
      f_t = 1.0 - dy.cmult(dropout_mask, 1.0 - f_t)  # TODO: would be easy to make a zoneout dynet operation to save memory

    i_t = 1.0 - f_t

    if t == 0:
      c_t = dy.cmult(i_t, z_t)
    else:
      c_t = dy.cmult(f_t, c[-1]) + dy.cmult(i_t, z_t)
    h_t = dy.cmult(o_t, c_t)  # note: LSTM would use dy.tanh(c_t) instead of c_t
    if mask_out is None or np.isclose(np.sum(mask_out.np_arr[:, t:t + 1]), 0.0):
      c.append(c_t)
      h.append(h_t)
    else:
      c.append(mask_out.cmult_by_timestep_expr(c_t, t, True) + mask_out.cmult_by_timestep_expr(c[-1], t, False))
      h.append(mask_out.cmult_by_timestep_expr(h_t, t, True) + mask_out.cmult_by_timestep_expr(h[-1], t, False))

  self._final_states = [transducers.FinalTransducerState(
    dy.reshape(h[-1], (self.hidden_dim,), batch_size=batch_size),
    dy.reshape(c[-1], (self.hidden_dim,), batch_size=batch_size))]
  return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask_out)
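# Reference (numpy) sketch of one step of the gated recurrence above: quasi-RNN
# style fo-pooling with a coupled input gate i_t = 1 - f_t; note that, unlike an
# LSTM, no tanh is applied to the cell when computing h_t.
import numpy as np

def fo_pool_step(f_t: np.ndarray, o_t: np.ndarray, z_t: np.ndarray, c_prev: np.ndarray):
  # f_t, o_t in (0, 1) and z_t in (-1, 1) are the gate activations for one timestep
  c_t = f_t * c_prev + (1.0 - f_t) * z_t  # convex combination of previous cell and candidate
  h_t = o_t * c_t                         # output gate applied directly to the cell
  return h_t, c_t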
def transduce(self, x):
  # some preparations
  output_states = []
  current_state = self._encode_src(x, apply_emb=False)
  if self.mode_transduce == "split":
    first_state = SymmetricDecoderState(rnn_state=current_state.rnn_state, context=current_state.context)
  batch_size = x.dim()[1]
  done = [False] * batch_size
  out_mask = batchers.Mask(np_arr=np.zeros((batch_size, self.max_dec_len)))
  out_mask.np_arr.flags.writeable = True

  # teacher / split mode: unfold guided by reference targets
  # -> feed everything up to (except) the last token back into the LSTM
  # other modes: unfold until EOS is output or max len is reached
  max_dec_len = self.cur_src.batches[1].sent_len() if self.mode_transduce in ["teacher", "split"] else self.max_dec_len

  atts_list = []
  generated_word_ids = []
  for pos in range(max_dec_len):
    if self.train and self.mode_transduce in ["teacher", "split"]:
      # unroll RNN guided by reference
      prev_ref_action, ref_action = None, None
      if pos > 0:
        prev_ref_action = self._batch_ref_action(pos - 1)
      if self.transducer_loss:
        ref_action = self._batch_ref_action(pos)
      step_loss = self.calc_loss_one_step(dec_state=current_state,
                                          batch_size=batch_size,
                                          mode=self.mode_transduce,
                                          ref_action=ref_action,
                                          prev_ref_action=prev_ref_action)
      self.transducer_losses.append(step_loss)
    else:  # inference
      # unroll RNN guided by model predictions
      if self.mode_transduce in ["teacher", "split"]:
        prev_ref_action = self._batch_max_action(batch_size, current_state, pos)
      else:
        prev_ref_action = None
      out_scores = self.generate_one_step(dec_state=current_state,
                                          mask=out_mask,
                                          cur_step=pos,
                                          batch_size=batch_size,
                                          mode=self.mode_transduce,
                                          prev_ref_action=prev_ref_action)
      word_id = np.argmax(out_scores.npvalue(), axis=0)
      word_id = word_id.reshape((word_id.size,))
      generated_word_ids.append(word_id[0])
      for batch_i in range(batch_size):
        if self._terminate_rnn(batch_i=batch_i, pos=pos, batched_word_id=word_id):
          done[batch_i] = True
          out_mask.np_arr[batch_i, pos + 1:] = 1.0
      if pos > 0 and all(done):
        atts_list.append(self.attender.get_last_attention())
        output_states.append(current_state.rnn_state.h()[-1])
        break
    output_states.append(current_state.rnn_state.h()[-1])
    atts_list.append(self.attender.get_last_attention())

  if self.mode_transduce == "split":
    # split mode: use attentions to compute context, then run RNNs over these context inputs
    if self.split_regularizer:
      assert len(atts_list) == len(self._chosen_rnn_inputs), f"{len(atts_list)} != {len(self._chosen_rnn_inputs)}"
    split_output_states = []
    split_rnn_state = first_state.rnn_state
    for pos, att in enumerate(atts_list):
      lstm_input_context = self.attender.curr_sent.as_tensor() * att  # TODO: better reuse the already computed context vecs
      lstm_input_context = dy.reshape(lstm_input_context, (lstm_input_context.dim()[0][0],), batch_size=batch_size)
      if self.split_dual:
        lstm_input_label = self._chosen_rnn_inputs[pos]
        if self.split_dual[0] > 0.0 and self.train:
          lstm_input_context = dy.dropout_batch(lstm_input_context, self.split_dual[0])
        if self.split_dual[1] > 0.0 and self.train:
          lstm_input_label = dy.dropout_batch(lstm_input_label, self.split_dual[1])
        if self.split_context_transform:
          lstm_input_context = self.split_context_transform.transform(lstm_input_context)
        lstm_input_context = self.split_dual_proj.transform(dy.concatenate([lstm_input_context, lstm_input_label]))
      if self.split_regularizer and pos < len(self._chosen_rnn_inputs):
        # _chosen_rnn_inputs does not contain first (empty) input, so this is in fact like comparing to pos-1:
        penalty = dy.squared_norm(lstm_input_context - self._chosen_rnn_inputs[pos])
        if self.split_regularizer != 1:
          penalty = self.split_regularizer * penalty
        self.split_reg_penalty_expr = penalty
      split_rnn_state = split_rnn_state.add_input(lstm_input_context)
      split_output_states.append(split_rnn_state.h()[-1])
    assert len(output_states) == len(split_output_states)
    output_states = split_output_states

  out_mask.np_arr = out_mask.np_arr[:, :len(output_states)]

  self._final_states = []
  if self.compute_report:
    # for symmetric reporter (this can only be run at inference time)
    assert batch_size == 1
    atts_matrix = np.asarray([att.npvalue() for att in atts_list]).reshape(len(atts_list), atts_list[0].dim()[0][0]).T
    self.report_sent_info({
      "symm_att": atts_matrix,
      "symm_out": sent.SimpleSentence(words=generated_word_ids,
                                      idx=self.cur_src.batches[0][0].idx,
                                      vocab=self.cur_src.batches[1][0].vocab,
                                      output_procs=self.cur_src.batches[1][0].output_procs),
      "symm_ref": self.cur_src.batches[1][0] if isinstance(self.cur_src, batchers.CompoundBatch) else None
    })

  # prepare final outputs
  for layer_i in range(len(current_state.rnn_state.h())):
    self._final_states.append(transducers.FinalTransducerState(main_expr=current_state.rnn_state.h()[layer_i],
                                                               cell_expr=current_state.rnn_state._c[layer_i]))
  out_mask.np_arr.flags.writeable = False
  return expression_seqs.ExpressionSequence(expr_list=output_states, mask=out_mask)
def transduce(self, embed_sent: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  src = embed_sent.as_tensor()

  sent_len = src.dim()[0][1]
  batch_size = src.dim()[1]
  pad_size = (self.window_receptor - 1) // 2  # TODO: adapt it also for even window size
  # zero-pad both sides so that, with stride 1 and an odd window of width w,
  # the output length stays equal to sent_len: out_len = padded_len - w + 1
  src = dy.concatenate([dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
                        src,
                        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)], d=1)
  padded_sent_len = sent_len + 2 * pad_size

  conv1 = dy.parameter(self.pConv1)
  bias1 = dy.parameter(self.pBias1)
  src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1), batch_size=batch_size)
  cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

  hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1), batch_size=batch_size)
  if self.non_linearity == 'linear':
    hidden_layer = hidden_layer
  elif self.non_linearity == 'tanh':
    hidden_layer = dy.tanh(hidden_layer)
  elif self.non_linearity == 'relu':
    hidden_layer = dy.rectify(hidden_layer)
  elif self.non_linearity == 'sigmoid':
    hidden_layer = dy.logistic(hidden_layer)

  for conv_hid, bias_hid in self.builder_layers:
    hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid), dy.parameter(bias_hid), stride=[1, 1])
    hidden_layer = dy.reshape(hidden_layer, (self.internal_dim, sent_len, 1), batch_size=batch_size)
    if self.non_linearity == 'linear':
      hidden_layer = hidden_layer
    elif self.non_linearity == 'tanh':
      hidden_layer = dy.tanh(hidden_layer)
    elif self.non_linearity == 'relu':
      hidden_layer = dy.rectify(hidden_layer)
    elif self.non_linearity == 'sigmoid':
      hidden_layer = dy.logistic(hidden_layer)

  last_conv = dy.parameter(self.last_conv)
  last_bias = dy.parameter(self.last_bias)
  output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
  output = dy.reshape(output, (sent_len, self.output_dim), batch_size=batch_size)
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq