def transduce(self, es: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  mask = es.mask
  # first layer
  forward_es = self.forward_layers[0].transduce(es)
  rev_backward_es = self.backward_layers[0].transduce(expression_seqs.ReversedExpressionSequence(es))

  for layer_i in range(1, len(self.forward_layers)):
    new_forward_es = self.forward_layers[layer_i].transduce(
      [forward_es, expression_seqs.ReversedExpressionSequence(rev_backward_es)])
    rev_backward_es = expression_seqs.ExpressionSequence(
      self.backward_layers[layer_i].transduce(
        [expression_seqs.ReversedExpressionSequence(forward_es), rev_backward_es]).as_list(),
      mask=mask)
    forward_es = new_forward_es

  self._final_states = [
    transducers.FinalTransducerState(
      dy.concatenate([self.forward_layers[layer_i].get_final_states()[0].main_expr(),
                      self.backward_layers[layer_i].get_final_states()[0].main_expr()]),
      dy.concatenate([self.forward_layers[layer_i].get_final_states()[0].cell_expr(),
                      self.backward_layers[layer_i].get_final_states()[0].cell_expr()]))
    for layer_i in range(len(self.forward_layers))]
  return expression_seqs.ExpressionSequence(
    expr_list=[dy.concatenate([forward_es[i], rev_backward_es[-i - 1]]) for i in range(len(forward_es))],
    mask=mask)
def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  returns the list of output Expressions obtained by adding the given inputs to the current state,
  one by one, to both the forward and backward RNNs, and concatenating.

  Args:
    es: an ExpressionSequence
  """
  es_list = [es]

  for layer_i, (fb, bb) in enumerate(self.builder_layers):
    reduce_factor = self._reduce_factor_for_layer(layer_i)

    if es_list[0].mask is None:
      mask_out = None
    else:
      mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

    if self.downsampling_method == "concat" and es_list[0].sent_len() % reduce_factor != 0:
      raise ValueError(f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                       f"but got sequence length={es_list[0].sent_len()} for reduce_factor={reduce_factor}. "
                       f"Set Batcher's pad_src_to_multiple argument accordingly.")
    fs = fb.transduce(es_list)
    bs = bb.transduce([expression_seqs.ReversedExpressionSequence(es_item) for es_item in es_list])
    if layer_i < len(self.builder_layers) - 1:
      if self.downsampling_method == "skip":
        es_list = [expression_seqs.ExpressionSequence(expr_list=fs[::reduce_factor], mask=mask_out),
                   expression_seqs.ExpressionSequence(expr_list=bs[::reduce_factor][::-1], mask=mask_out)]
      elif self.downsampling_method == "concat":
        es_len = es_list[0].sent_len()
        es_list_fwd = []
        es_list_bwd = []
        for i in range(0, es_len, reduce_factor):
          for j in range(reduce_factor):
            if i == 0:
              es_list_fwd.append([])
              es_list_bwd.append([])
            es_list_fwd[j].append(fs[i + j])
            es_list_bwd[j].append(bs[es_len - reduce_factor + j - i])
        es_list = [expression_seqs.ExpressionSequence(expr_list=es_list_fwd[j], mask=mask_out) for j in range(reduce_factor)] + \
                  [expression_seqs.ExpressionSequence(expr_list=es_list_bwd[j], mask=mask_out) for j in range(reduce_factor)]
      else:
        raise RuntimeError(f"unknown downsampling_method {self.downsampling_method}")
    else:
      # concat final outputs
      ret_es = expression_seqs.ExpressionSequence(
        expr_list=[tt.concatenate([f, b]) for f, b in zip(fs, expression_seqs.ReversedExpressionSequence(bs))],
        mask=mask_out)

  self._final_states = [
    transducers.FinalTransducerState(tt.concatenate([fb.get_final_states()[0].main_expr(),
                                                     bb.get_final_states()[0].main_expr()]),
                                     tt.concatenate([fb.get_final_states()[0].cell_expr(),
                                                     bb.get_final_states()[0].cell_expr()]))
    for (fb, bb) in self.builder_layers]

  return ret_es
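# A minimal, self-contained sketch of the "concat" downsampling index
# arithmetic used above (pure Python; `concat_downsample` is a hypothetical
# name, not part of xnmt): reduce_factor consecutive time steps are regrouped
# into reduce_factor interleaved subsequences that the next layer consumes in
# parallel.
def concat_downsample(seq, reduce_factor):
  assert len(seq) % reduce_factor == 0, "length must be a multiple of reduce_factor"
  return [seq[j::reduce_factor] for j in range(reduce_factor)]

# e.g. a length-6 forward output with reduce_factor=2:
assert concat_downsample(list(range(6)), 2) == [[0, 2, 4], [1, 3, 5]]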
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  src_tensor = src.as_tensor()
  out_mask = src.mask
  if self.downsample_by > 1:
    assert len(src_tensor.dim()[0]) == 2, \
      f"Downsampling only supported for tensors of order two. Found dims {src_tensor.dim()}"
    (hidden_dim, seq_len), batch_size = src_tensor.dim()
    if seq_len % self.downsample_by != 0:
      raise ValueError("For downsampling, sequence lengths must be multiples of the total reduce factor. "
                       "Configure batcher accordingly.")
    # stack 'downsample_by' consecutive time steps into the feature dimension
    src_tensor = dy.reshape(src_tensor,
                            (hidden_dim * self.downsample_by, seq_len // self.downsample_by),
                            batch_size=batch_size)
    if out_mask:
      out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_by)
  output = self.transform.transform(src_tensor)
  if self.downsample_by == 1:
    if output.dim() != src_tensor.dim():  # can happen with seq length 1
      output = dy.reshape(output, src_tensor.dim()[0], batch_size=src_tensor.dim()[1])
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=out_mask)
  self._final_states = [FinalTransducerState(output_seq[-1])]
  return output_seq
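# A minimal numpy sketch (illustrative only, not xnmt API) of the reshape-based
# downsampling above: with column-major ordering (as in DyNet), reshaping
# (hidden_dim, seq_len) to (hidden_dim*downsample_by, seq_len//downsample_by)
# stacks each group of `downsample_by` consecutive time steps into one column.
import numpy as np
hidden_dim, seq_len, downsample_by = 2, 4, 2
x = np.arange(hidden_dim * seq_len).reshape((hidden_dim, seq_len), order='F')
y = x.reshape((hidden_dim * downsample_by, seq_len // downsample_by), order='F')
assert np.array_equal(y[:, 0], np.concatenate([x[:, 0], x[:, 1]]))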
def embed_sent(self, x) -> expression_seqs.ExpressionSequence:
  """Embed a full sentence worth of words. By default, just do a for loop.

  Args:
    x: This will generally be a list of word IDs, but could also be a list of strings or some other format.
       It could also be batched, in which case it will be a (possibly masked) :class:`xnmt.batcher.Batch` object

  Returns:
    An expression sequence representing vectors of each word in the input.
  """
  # single mode
  if not batchers.is_batched(x):
    embeddings = [self.embed(word) for word in x]
  # minibatch mode
  else:
    embeddings = []
    seq_len = x.sent_len()
    for single_sent in x: assert single_sent.sent_len() == seq_len
    for word_i in range(seq_len):
      batch = batchers.mark_as_batch([single_sent[word_i] for single_sent in x])
      embeddings.append(self.embed(batch))

  return expression_seqs.ExpressionSequence(expr_list=embeddings,
                                            mask=x.mask if batchers.is_batched(x) else None)
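# A small illustration (plain Python, hypothetical data) of the minibatch loop
# above: embedding proceeds "time-major", i.e. the word at position word_i of
# every sentence in the batch is embedded together in one batched lookup.
batch = [["a", "b", "c"], ["d", "e", "f"]]
time_major = [[s[word_i] for s in batch] for word_i in range(3)]
assert time_major[0] == ["a", "d"]  # one batched embed() call per position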
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if expr_seq.dim()[1] > 1:
    raise ValueError(f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}")
  lattice = self.cur_src[0]
  Wx_iog = dy.parameter(self.p_Wx_iog)
  Wh_iog = dy.parameter(self.p_Wh_iog)
  b_iog = dy.parameter(self.p_b_iog)
  Wx_f = dy.parameter(self.p_Wx_f)
  Wh_f = dy.parameter(self.p_Wh_f)
  b_f = dy.parameter(self.p_b_f)
  h = []
  c = []

  batch_size = expr_seq.dim()[1]
  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  for node_i in range(lattice.sent_len()):
    cur_node = lattice.nodes[node_i]
    val = expr_seq[node_i]
    if self.dropout_rate > 0.0 and self.train:
      val = dy.cmult(val, self.dropout_mask_x)

    i_ft_list = []
    if len(cur_node.nodes_prev) == 0:
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
    else:
      h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
      for pred in cur_node.nodes_prev:
        i_ft_list.append(dy.logistic(dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
    i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
    i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
    i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)

    i_it = dy.logistic(i_ait)
    i_ot = dy.logistic(i_aot)
    i_gt = dy.tanh(i_agt)
    if len(cur_node.nodes_prev) == 0:
      c.append(dy.cmult(i_it, i_gt))
    else:
      fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
      for i in range(1, len(cur_node.nodes_prev)):
        fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
      c.append(fc + dy.cmult(i_it, i_gt))
    h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
    if self.dropout_rate > 0.0 and self.train:
      h_t = dy.cmult(h_t, self.dropout_mask_h)
    h.append(h_t)
  self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
  return expression_seqs.ExpressionSequence(expr_list=h)
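# A minimal numpy sketch of the gate slicing above (illustrative names): the
# fused pre-activation vector of size 3*hidden_dim is cut into input-gate,
# output-gate and candidate blocks, mirroring the three pick_range calls.
import numpy as np
hidden_dim = 4
tmp_iog = np.arange(3 * hidden_dim)
a_i = tmp_iog[0:hidden_dim]                    # input gate pre-activation
a_o = tmp_iog[hidden_dim:2 * hidden_dim]       # output gate pre-activation
a_g = tmp_iog[2 * hidden_dim:3 * hidden_dim]   # candidate pre-activation
assert len(a_i) == len(a_o) == len(a_g) == hidden_dim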
def embed_sent(self, sent_len: numbers.Integral) -> expression_seqs.ExpressionSequence:
  embeddings = dy.strided_select(dy.parameter(self.embeddings), [1, 1], [0, 0], [self.emb_dim, sent_len])
  return expression_seqs.ExpressionSequence(expr_tensor=embeddings, mask=None)
def embed_sent(self, x):
  speech_x = x.batches[0]
  factor_x = x.batches[1]
  # Note: speech and factor sequences can differ in length (e.g. for concatenated
  # sentences lacking phonemes for one part); this should not occur with the kaldi
  # phones. We always embed the factors to the speech length.
  speech_xs = self.embed_speech_sent(speech_x)
  factor_xs = self.embed_factor_sent(factor_x, speech_x.sent_len())
  catted = dy.concatenate([speech_xs.as_tensor(), factor_xs.as_tensor()])
  output_seq = expression_seqs.ExpressionSequence(expr_tensor=catted, mask=speech_x.mask)
  return output_seq
def embed_sent(self, x: sent.Sentence) -> expression_seqs.ExpressionSequence:
  # TODO refactor: seems a bit too many special cases that need to be distinguished
  batched = batchers.is_batched(x)
  first_sent = x[0] if batched else x
  if hasattr(first_sent, "get_array"):
    if not batched:
      return expression_seqs.LazyNumpyExpressionSequence(lazy_data=x.get_array())
    else:
      return expression_seqs.LazyNumpyExpressionSequence(lazy_data=batchers.mark_as_batch([s for s in x]),
                                                         mask=x.mask)
  else:
    if not batched:
      embeddings = [self.embed(word) for word in x]
    else:
      embeddings = []
      for word_i in range(x.sent_len()):
        embeddings.append(self.embed(batchers.mark_as_batch([single_sent[word_i] for single_sent in x])))
    return expression_seqs.ExpressionSequence(expr_list=embeddings, mask=x.mask)
def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence'):
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence

  Returns:
    expression sequence
  """
  batch_size = expr_seq[0].dim()[1]
  seq_len = len(expr_seq)

  output_exps = []
  for pos_i in range(seq_len):
    input_i = expr_seq[pos_i]
    affine = self.linear_layer(input_i)
    # affine = dy.affine_transform([dy.parameter(self.p_b), dy.parameter(self.p_W), input_i])
    if self.train and self.dropout_rate:
      affine = dy.dropout(affine, self.dropout_rate)
    if self.gumbel:
      affine = affine + dy.random_gumbel(dim=affine.dim()[0], batch_size=batch_size)
    softmax_out = dy.softmax(affine)
    # embedded = self.emb_layer(softmax_out)
    embedded = dy.parameter(self.p_E) * softmax_out
    if self.residual:
      embedded = embedded + input_i
    output_exps.append(embedded)

  self._final_states = [transducers.FinalTransducerState(main_expr=embedded)]

  return expression_seqs.ExpressionSequence(expr_list=output_exps, mask=expr_seq.mask)
def embed_sent(self, x: Any):
  embeddings = [embedder.embed_sent(x) for embedder in self.embedders]
  ret = []
  for j in range(len(embeddings[0])):
    ret.append(dy.esum([embeddings[i][j] for i in range(len(embeddings))]))
  return expression_seqs.ExpressionSequence(expr_list=ret, mask=embeddings[0].mask)
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)

  Returns:
    expression sequence
  """
  # Start with a [(length, model_size) x batch] tensor
  # B x T x H -> B x H x T
  x = expr_seq.as_tensor()
  x_len = x.size()[1]
  x_batch = x.size()[0]
  # Get the query key and value vectors
  q = self.lin_q(x).transpose(1, 2).contiguous()
  k = self.lin_k(x).transpose(1, 2).contiguous()
  v = self.lin_v(x).transpose(1, 2).contiguous()
  # q = bq + x * Wq
  # k = bk + x * Wk
  # v = bv + x * Wv

  # Split to batches [(length, head_dim) x batch * num_heads] tensor
  q, k, v = [temp.view((x_batch * self.num_heads, self.head_dim, x_len)) for temp in (q, k, v)]

  # Do scaled dot product [batch*num_heads, length, length], rows are keys, columns are queries
  attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
  if expr_seq.mask is not None:
    mask = torch.Tensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0) * -1e10).to(xnmt.device)
    attn_score = attn_score + mask.unsqueeze(2)
  attn_prob = torch.nn.Softmax(dim=1)(attn_score)
  # attn_prob = dy.softmax(attn_score, d=1)
  if self.train and self.dropout > 0.0:
    attn_prob = tt.dropout(attn_prob, self.dropout)
  # Reduce using attention and resize to match [(length, model_size) x batch]
  o = torch.matmul(v, attn_prob).view(x_batch, self.input_dim, x_len).transpose(1, 2)
  # Final transformation
  o = self.lin_o(o)
  # o = bo + o * Wo

  expr_seq = expression_seqs.ExpressionSequence(expr_tensor=o, mask=expr_seq.mask)

  self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]

  return expr_seq
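# A minimal torch sketch (standalone, illustrative dims) of the head
# split/merge bookkeeping above: a (batch, model_size, length) tensor is
# viewed as (batch*num_heads, head_dim, length) for per-head attention and
# viewed back afterwards.
import torch
batch, length, model_size, num_heads = 2, 5, 8, 4
head_dim = model_size // num_heads
x = torch.zeros(batch, model_size, length)
per_head = x.view(batch * num_heads, head_dim, length)
restored = per_head.view(batch, model_size, length)
assert restored.shape == x.shape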
def calculate_baseline(self, input_states: expr_seq.ExpressionSequence) -> expr_seq.ExpressionSequence:
  transform_seq = []
  for input_state in input_states:
    transform_seq.append(self.transform.transform(dy.nobackprop(input_state)))
  return expr_seq.ExpressionSequence(expr_list=transform_seq, mask=input_states.mask)
def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  mask = es.mask
  sent_len = len(es)
  es_expr = es.as_transposed_tensor()
  batch_size = es_expr.dim()[1]

  es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size)

  h_out = {}
  for direction in ["fwd", "bwd"]:
    # input convolutions
    gates_xt_bias = dy.conv2d_bias(es_chn,
                                   dy.parameter(self.params["x2all_" + direction]),
                                   dy.parameter(self.params["b_" + direction]),
                                   stride=(1, 1),
                                   is_valid=False)
    gates_xt_bias_list = [dy.pick_range(gates_xt_bias, i, i + 1) for i in range(sent_len)]

    h = []
    c = []
    for input_pos in range(sent_len):
      directional_pos = input_pos if direction == "fwd" else sent_len - input_pos - 1
      gates_t = gates_xt_bias_list[directional_pos]
      if input_pos > 0:
        # recurrent convolutions
        gates_h_t = dy.conv2d(h[-1],
                              dy.parameter(self.params["h2all_" + direction]),
                              stride=(1, 1),
                              is_valid=False)
        gates_t += gates_h_t

      # standard LSTM logic
      if len(c) == 0:
        c_tm1 = dy.zeros((self.freq_dim * self.num_filters,), batch_size=batch_size)
      else:
        c_tm1 = c[-1]
      gates_t_reshaped = dy.reshape(gates_t, (4 * self.freq_dim * self.num_filters,), batch_size=batch_size)
      c_t = dy.reshape(dy.vanilla_lstm_c(c_tm1, gates_t_reshaped),
                       (self.freq_dim * self.num_filters,), batch_size=batch_size)
      h_t = dy.vanilla_lstm_h(c_t, gates_t_reshaped)
      h_t = dy.reshape(h_t, (1, self.freq_dim, self.num_filters,), batch_size=batch_size)

      if mask is None or np.isclose(np.sum(mask.np_arr[:, input_pos:input_pos + 1]), 0.0):
        c.append(c_t)
        h.append(h_t)
      else:
        c.append(mask.cmult_by_timestep_expr(c_t, input_pos, True)
                 + mask.cmult_by_timestep_expr(c[-1], input_pos, False))
        h.append(mask.cmult_by_timestep_expr(h_t, input_pos, True)
                 + mask.cmult_by_timestep_expr(h[-1], input_pos, False))

    h_out[direction] = h

  ret_expr = []
  for state_i in range(len(h_out["fwd"])):
    state_fwd = h_out["fwd"][state_i]
    state_bwd = h_out["bwd"][-1 - state_i]
    output_dim = (state_fwd.dim()[0][1] * state_fwd.dim()[0][2],)
    fwd_reshape = dy.reshape(state_fwd, output_dim, batch_size=batch_size)
    bwd_reshape = dy.reshape(state_bwd, output_dim, batch_size=batch_size)
    ret_expr.append(dy.concatenate([fwd_reshape, bwd_reshape], d=0 if self.reshape_output else 2))
  return expression_seqs.ExpressionSequence(expr_list=ret_expr, mask=mask)
  # TODO: implement get_final_states()
def compose(self, embeds: Union[dy.Expression, List[dy.Expression]]) -> dy.Expression:
  if type(embeds) != list:
    embeds = [dy.pick_batch_elem(embeds, i) for i in range(embeds.dim()[1])]
  self.seq_transducer.transduce(expr_seq.ExpressionSequence(expr_list=embeds))
  return self.seq_transducer.get_final_states()[-1].main_expr()
def embed_sent(self, x: Any) -> expression_seqs.ExpressionSequence:
  """Embed a full sentence worth of words. By default, just do a for loop.

  Args:
    x: This will generally be a list of word IDs, but could also be a list of strings or some other format.
       It could also be batched, in which case it will be a (possibly masked) :class:`xnmt.batcher.Batch` object

  Returns:
    An expression sequence representing vectors of each word in the input.
  """
  # single mode
  if not batchers.is_batched(x):
    expr = expression_seqs.ExpressionSequence(expr_list=[self.embed(word) for word in x])
  # minibatch mode
  elif type(self) == LookupEmbedder:
    embeddings = []
    for word_i in range(x.sent_len()):
      batch = batchers.mark_as_batch([single_sent[word_i] for single_sent in x])
      embeddings.append(self.embed(batch))
    expr = expression_seqs.ExpressionSequence(expr_list=embeddings, mask=x.mask)
  else:
    assert type(x[0]) == sent.SegmentedSentence, "Need to use CharFromWordTextReader for non standard embeddings."
    embeddings = []
    all_embeddings = []
    for sentence in x:
      embedding = []
      for i in range(sentence.len_unpadded()):
        embed_word = self.embed(sentence.words[i])
        embedding.append(embed_word)
        all_embeddings.append(embed_word)
      embeddings.append(embedding)
    # Useful when using dy.autobatch
    dy.forward(all_embeddings)
    all_embeddings.clear()
    # Pad the results
    expr = batchers.pad_embedding(embeddings)

  return expr
def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  """
  transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)

  Returns:
    expression sequence
  """
  if isinstance(expr_seq, expression_seqs.ExpressionSequence):
    expr_seq = [expr_seq]
  batch_size = expr_seq[0][0].dim()[1]
  seq_len = len(expr_seq[0])

  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  cur_input = expr_seq
  self._final_states = []
  for layer_i in range(self.num_layers):
    h = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
    c = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
    for pos_i in range(seq_len):
      x_t = [cur_input[j][pos_i] for j in range(len(cur_input))]
      if isinstance(x_t, dy.Expression):
        x_t = [x_t]
      elif type(x_t) != list:
        x_t = list(x_t)
      if sum([x_t_i.dim()[0][0] for x_t_i in x_t]) != self.total_input_dim:
        found_dim = sum([x_t_i.dim()[0][0] for x_t_i in x_t])
        raise ValueError(f"VanillaLSTMGates: x_t has inconsistent dimension {found_dim}, expecting {self.total_input_dim}")
      if self.dropout_rate > 0.0 and self.train:
        # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
        gates_t = dy.vanilla_lstm_gates_dropout_concat(x_t, h[-1],
                                                       self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                                                       self.dropout_mask_x[layer_i], self.dropout_mask_h[layer_i],
                                                       self.weightnoise_std if self.train else 0.0)
      else:
        gates_t = dy.vanilla_lstm_gates_concat(x_t, h[-1],
                                               self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                                               self.weightnoise_std if self.train else 0.0)
      c_t = dy.vanilla_lstm_c(c[-1], gates_t)
      h_t = dy.vanilla_lstm_h(c_t, gates_t)
      if expr_seq[0].mask is None or np.isclose(np.sum(expr_seq[0].mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
        c.append(c_t)
        h.append(h_t)
      else:
        c.append(expr_seq[0].mask.cmult_by_timestep_expr(c_t, pos_i, True)
                 + expr_seq[0].mask.cmult_by_timestep_expr(c[-1], pos_i, False))
        h.append(expr_seq[0].mask.cmult_by_timestep_expr(h_t, pos_i, True)
                 + expr_seq[0].mask.cmult_by_timestep_expr(h[-1], pos_i, False))
    self._final_states.append(transducers.FinalTransducerState(h[-1], c[-1]))
    cur_input = [h[1:]]

  return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=expr_seq[0].mask)
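# A minimal numpy sketch of the tied ("variational") dropout masks referenced
# above (https://arxiv.org/abs/1512.05287): one mask is sampled per sequence
# and reused at every time step, with inverted-dropout scaling by 1/(1-p).
# The function name is illustrative, not the xnmt set_dropout_masks API.
import numpy as np
def sample_tied_dropout_mask(dim, batch_size, p_drop, rng=np.random):
  keep = (rng.uniform(size=(dim, batch_size)) > p_drop).astype(np.float32)
  return keep / (1.0 - p_drop)  # applied by elementwise multiply at each step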
def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  """
  transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)

  Returns:
    expression sequence
  """
  if isinstance(expr_seq, expression_seqs.ExpressionSequence):
    expr_seq = [expr_seq]
  concat_inputs = len(expr_seq) >= 2
  batch_size = tt.batch_size(expr_seq[0][0])
  seq_len = expr_seq[0].sent_len()
  mask = expr_seq[0].mask

  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)

  cur_input = expr_seq
  self._final_states = []
  for layer_i in range(self.num_layers):
    h = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
    c = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
    for pos_i in range(seq_len):
      if concat_inputs and layer_i == 0:
        x_t = tt.concatenate([cur_input[i][pos_i] for i in range(len(cur_input))])
      else:
        x_t = cur_input[0][pos_i]
      h_tm1 = h[-1]
      if self.dropout_rate > 0.0 and self.train:
        # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
        x_t = torch.mul(x_t, self.dropout_mask_x[layer_i])
        h_tm1 = torch.mul(h_tm1, self.dropout_mask_h[layer_i])
      h_t, c_t = self.layers[layer_i](x_t, (h_tm1, c[-1]))
      if mask is None or np.isclose(np.sum(mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
        c.append(c_t)
        h.append(h_t)
      else:
        c.append(mask.cmult_by_timestep_expr(c_t, pos_i, True)
                 + mask.cmult_by_timestep_expr(c[-1], pos_i, False))
        h.append(mask.cmult_by_timestep_expr(h_t, pos_i, True)
                 + mask.cmult_by_timestep_expr(h[-1], pos_i, False))
    self._final_states.append(transducers.FinalTransducerState(h[-1], c[-1]))
    cur_input = [h[1:]]

  return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask)
def transduce(self, x: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  x_T = x.as_transposed_tensor()
  scores = x_T * dy.parameter(self.W)
  if x.mask is not None:
    scores = x.mask.add_to_tensor_expr(scores, multiplicator=-100.0, time_first=True)
  if self.pos_enc_max:
    seq_len = x_T.dim()[0][0]
    pos_enc = self.pos_enc[:seq_len, :]
    scores = dy.cmult(scores, dy.inputTensor(pos_enc))
  attention = dy.softmax(scores)
  output_expr = x.as_tensor() * attention
  return expression_seqs.ExpressionSequence(expr_tensor=output_expr, mask=None)
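# A minimal numpy sketch of the additive masking trick above: adding a large
# negative constant (-100 here) to masked positions drives their softmax
# weight toward zero without changing the tensor's shape.
import numpy as np
scores = np.array([2.0, 1.0, 0.5])
mask = np.array([0.0, 0.0, 1.0])   # 1 marks a padded position
masked = scores + mask * -100.0
w = np.exp(masked - masked.max())
w /= w.sum()                       # softmax; the padded weight is ~0
assert w[2] < 1e-6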
def transduce(self, es):
  mask = es.mask
  # first layer
  forward_es = self.forward_layers[0].transduce(es)
  rev_backward_es = self.backward_layers[0].transduce(expression_seqs.ReversedExpressionSequence(es))

  # TODO: concat input of each layer to its output; or, maybe just add standard residual connections
  for layer_i in range(1, len(self.forward_layers)):
    new_forward_es = self.forward_layers[layer_i].transduce(
      [forward_es, expression_seqs.ReversedExpressionSequence(rev_backward_es)])
    mask_out = mask
    if mask_out is not None and new_forward_es.mask.np_arr.shape != mask_out.np_arr.shape:
      mask_out = mask_out.lin_subsampled(trg_len=len(new_forward_es))
    rev_backward_es = expression_seqs.ExpressionSequence(
      self.backward_layers[layer_i].transduce(
        [expression_seqs.ReversedExpressionSequence(forward_es), rev_backward_es]).as_list(),
      mask=mask_out)
    forward_es = new_forward_es

  self._final_states = [
    transducers.FinalTransducerState(
      dy.concatenate([self.forward_layers[layer_i].get_final_states()[0].main_expr(),
                      self.backward_layers[layer_i].get_final_states()[0].main_expr()]),
      dy.concatenate([self.forward_layers[layer_i].get_final_states()[0].cell_expr(),
                      self.backward_layers[layer_i].get_final_states()[0].cell_expr()]))
    for layer_i in range(len(self.forward_layers))]
  mask_out = mask
  if mask_out is not None and forward_es.mask.np_arr.shape != mask_out.np_arr.shape:
    mask_out = mask_out.lin_subsampled(trg_len=len(forward_es))
  return expression_seqs.ExpressionSequence(
    expr_list=[dy.concatenate([forward_es[i], rev_backward_es[-i - 1]]) for i in range(len(forward_es))],
    mask=mask_out)
def exprseq_pooling(self, exprseq):
  # Reduce to vector; push masked positions to -1e10 first so max-pooling ignores them
  if exprseq.mask is not None:
    exprseq = expression_seqs.ExpressionSequence(
      expr_tensor=exprseq.mask.add_to_tensor_expr(exprseq.as_tensor(), -1e10),
      mask=exprseq.mask)
  if exprseq.expr_tensor is not None:
    if len(exprseq.expr_tensor.dim()[0]) > 1:
      return dy.max_dim(exprseq.expr_tensor, d=1)
    else:
      return exprseq.expr_tensor
  else:
    return dy.emax(exprseq.expr_list)
def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  sent_len = len(src)
  embeddings = dy.strided_select(dy.parameter(self.embedder), [1, 1], [0, 0], [self.input_dim, sent_len])

  if self.op == 'sum':
    output = embeddings + src.as_tensor()
  elif self.op == 'concat':
    output = dy.concatenate([embeddings, src.as_tensor()])
  else:
    raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')

  if self.train and self.dropout > 0.0:
    output = dy.dropout(output, self.dropout)

  output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=src.mask)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq
def transduce(self, expr_sequence: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  # first layer
  forward_es = self.forward_layers[0].transduce(expr_sequence)
  rev_backward_es = self.backward_layers[0].transduce(expression_seqs.ReversedExpressionSequence(expr_sequence))

  for layer_i in range(1, len(self.forward_layers)):
    concat_fwd = expression_seqs.ExpressionSequence(
      expr_list=[dy.concatenate([fwd_expr, bwd_expr])
                 for fwd_expr, bwd_expr in zip(forward_es.as_list(), reversed(rev_backward_es.as_list()))])
    concat_bwd = expression_seqs.ExpressionSequence(
      expr_list=[dy.concatenate([fwd_expr, bwd_expr])
                 for fwd_expr, bwd_expr in zip(reversed(forward_es.as_list()), rev_backward_es.as_list())])
    new_forward_es = self.forward_layers[layer_i].transduce(concat_fwd)
    rev_backward_es = self.backward_layers[layer_i].transduce(concat_bwd)
    forward_es = new_forward_es

  self._final_states = [
    transducers.FinalTransducerState(
      dy.concatenate([self.forward_layers[layer_i].get_final_states()[0].main_expr(),
                      self.backward_layers[layer_i].get_final_states()[0].main_expr()]),
      dy.concatenate([self.forward_layers[layer_i].get_final_states()[0].cell_expr(),
                      self.backward_layers[layer_i].get_final_states()[0].cell_expr()]))
    for layer_i in range(len(self.forward_layers))]
  return expression_seqs.ExpressionSequence(
    expr_list=[dy.concatenate([forward_es[i], rev_backward_es[-i - 1]]) for i in range(len(forward_es))])
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  """
  transduce the sequence

  Args:
    expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)

  Returns:
    expression sequence
  """
  Wq, Wk, Wv, Wo = [dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)]
  bq, bk, bv, bo = [dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)]

  # Start with a [(length, model_size) x batch] tensor
  x = expr_seq.as_transposed_tensor()
  x_len = x.dim()[0][0]
  x_batch = x.dim()[1]
  # Get the query key and value vectors
  # TODO: do we need bias broadcasting in DyNet?
  # q = dy.affine_transform([bq, x, Wq])
  # k = dy.affine_transform([bk, x, Wk])
  # v = dy.affine_transform([bv, x, Wv])
  q = bq + x * Wq
  k = bk + x * Wk
  v = bv + x * Wv

  # Split to batches [(length, head_dim) x batch * num_heads] tensor
  q, k, v = [dy.reshape(x, (x_len, self.head_dim), batch_size=x_batch * self.num_heads) for x in (q, k, v)]

  # Do scaled dot product [(length, length) x batch * num_heads], rows are queries, columns are keys
  attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
  if expr_seq.mask is not None:
    mask = dy.inputTensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(), batched=True) * -1e10
    attn_score = attn_score + mask
  attn_prob = dy.softmax(attn_score, d=1)
  if self.train and self.dropout > 0.0:
    attn_prob = dy.dropout(attn_prob, self.dropout)
  # Reduce using attention and resize to match [(length, model_size) x batch]
  o = dy.reshape(attn_prob * v, (x_len, self.input_dim), batch_size=x_batch)
  # Final transformation
  # o = dy.affine_transform([bo, attn_prob * v, Wo])
  o = bo + o * Wo

  expr_seq = expression_seqs.ExpressionSequence(expr_transposed_tensor=o, mask=expr_seq.mask)

  self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]

  return expr_seq
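# A minimal numpy sketch of the per-head scaled dot-product computed above
# (rows are queries, columns are keys; softmax normalizes over keys), with
# illustrative shapes and no DyNet dependency.
import numpy as np
def scaled_dot_attention(q, k, v):  # q, k, v: (length, head_dim)
  scores = q @ k.T / np.sqrt(k.shape[1])       # (length, length)
  scores -= scores.max(axis=1, keepdims=True)  # for numerical stability
  probs = np.exp(scores)
  probs /= probs.sum(axis=1, keepdims=True)    # softmax over keys
  return probs @ v                             # (length, head_dim)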
def pad_embedding(embeddings) -> expression_seqs.ExpressionSequence:
  max_col = max(len(xs) for xs in embeddings)
  p0 = dy.zeros(embeddings[0][0].dim()[0][0])
  masks = np.zeros((len(embeddings), max_col), dtype=int)
  modified = False
  ret = []
  for xs, mask in zip(embeddings, masks):
    deficit = max_col - len(xs)
    if deficit > 0:
      xs = xs + ([p0] * deficit)
      mask[-deficit:] = 1
      modified = True
    ret.append(dy.concatenate_cols(xs))
  mask = Mask(masks) if modified else None
  return expression_seqs.ExpressionSequence(expr_tensor=dy.concatenate_to_batch(ret), mask=mask)
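# A numpy analogue (illustrative, not the xnmt API) of pad_embedding above:
# ragged per-sentence vector lists are zero-padded to the longest length and a
# 0/1 mask marks the padded positions.
import numpy as np
def pad_ragged(embeddings):  # list (batch) of lists of 1-D vectors
  max_col = max(len(xs) for xs in embeddings)
  dim = embeddings[0][0].shape[0]
  out = np.zeros((len(embeddings), dim, max_col))
  masks = np.zeros((len(embeddings), max_col), dtype=int)
  for b, xs in enumerate(embeddings):
    out[b, :, :len(xs)] = np.stack(xs, axis=1)
    masks[b, len(xs):] = 1
  return out, masks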
def embed_speech_sent(self, x):
  # TODO refactor: seems a bit too many special cases that need to be distinguished
  batched = batchers.is_batched(x)
  first_sent = x[0] if batched else x
  if hasattr(first_sent, "get_array"):
    if not batched:
      return expression_seqs.LazyNumpyExpressionSequence(lazy_data=x.get_array())
    else:
      return expression_seqs.LazyNumpyExpressionSequence(lazy_data=batchers.mark_as_batch([s for s in x]),
                                                         mask=x.mask)
  else:
    raise ValueError("Expected speech input to provide get_array(); other input formats are not supported here.")
def transduce(self, es: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  batch_size = tt.batch_size(es.as_tensor())
  if es.mask:
    seq_lengths = es.mask.seq_lengths()
  else:
    seq_lengths = [es.sent_len()] * batch_size

  # sort the input and lengths in descending order of length
  seq_lengths = torch.LongTensor(seq_lengths).to(xnmt.device)
  lengths, perm_index = seq_lengths.sort(0, descending=True)
  sorted_input = es.as_tensor()[perm_index]

  perm_index_rev = [-1] * len(lengths)
  for i in range(len(lengths)):
    perm_index_rev[perm_index[i]] = i
  perm_index_rev = torch.LongTensor(perm_index_rev).to(xnmt.device)

  packed_input = nn.utils.rnn.pack_padded_sequence(sorted_input, list(lengths.data), batch_first=True)
  state_size = self.num_dir * self.num_layers, batch_size, self.hidden_dim // self.num_dir
  h0 = sorted_input.new_zeros(*state_size)
  c0 = sorted_input.new_zeros(*state_size)
  output, (final_hiddens, final_cells) = self.lstm(packed_input, (h0, c0))
  output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True, total_length=es.sent_len())[0]

  # restore the sorting
  decoded = output[perm_index_rev]
  self._final_states = []
  for layer_i in range(self.num_layers):
    final_hidden = final_hiddens.view(self.num_layers, self.num_dir, batch_size, -1)[layer_i] \
                                .transpose(0, 1).contiguous().view(batch_size, -1)
    final_hidden = final_hidden[perm_index_rev]
    self._final_states.append(transducers.FinalTransducerState(final_hidden))
  ret = expression_seqs.ExpressionSequence(expr_tensor=decoded, mask=es.mask)
  return ret
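# A minimal torch sketch of the sort/unsort bookkeeping above: packing
# requires descending lengths, so perm_index sorts the batch and
# perm_index_rev (the inverse permutation) restores the original order.
import torch
lengths = torch.tensor([3, 5, 4])
sorted_lengths, perm_index = lengths.sort(0, descending=True)
perm_index_rev = torch.empty_like(perm_index)
perm_index_rev[perm_index] = torch.arange(len(lengths))
assert torch.equal(sorted_lengths[perm_index_rev], lengths)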
def transduce(self, seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if self.train and self.dropout > 0.0:
    seq_tensor = dy.dropout(self.child.transduce(seq).as_tensor(), self.dropout) + seq.as_tensor()
  else:
    seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    d = seq_tensor.dim()
    seq_tensor = dy.reshape(seq_tensor, (d[0][0],), batch_size=d[0][1] * d[1])
    seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
    seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
  return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
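# A minimal numpy sketch of the post-norm residual pattern implemented above:
# out = LayerNorm(x + Sublayer(x)), where LayerNorm standardizes each vector
# and applies a learned gain g and bias b (names illustrative).
import numpy as np
def layer_norm(v, g, b, eps=1e-8):
  mu, sigma = v.mean(), v.std()
  return g * (v - mu) / (sigma + eps) + b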
def transduce(self, x: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  expr = x.as_transposed_tensor()
  batch_size, hidden_dim, seq_len = expr.size()
  expr = expr.view((batch_size, self.in_channels, hidden_dim // self.in_channels, seq_len))
  expr = self.cnn_layer(expr)
  if self.use_pooling:
    expr = self.pooling_layer(expr)
  expr = self.activation_fct(expr)
  batch_size, out_chn, out_h, seq_len = expr.size()
  expr = expr.view((batch_size, out_chn * out_h, seq_len))
  output_seq = expression_seqs.ExpressionSequence(
    expr_transposed_tensor=expr,
    mask=x.mask.lin_subsampled(trg_len=seq_len) if x.mask else None)
  self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
  return output_seq
def embed_factor_sent(self, x, speech_len):
  # single mode
  if not batchers.is_batched(x):
    embeddings = [self.embed_factor(word) for word in x]
  # minibatch mode
  else:
    embeddings = []
    seq_len = x.sent_len()
    for single_sent in x: assert single_sent.sent_len() == seq_len
    # embed up to the speech length rather than the factor sequence length,
    # so that factor embeddings line up with the speech embeddings
    for word_i in range(speech_len):
      batch = batchers.mark_as_batch([single_sent[word_i] for single_sent in x])
      embeddings.append(self.embed_factor(batch))

  return expression_seqs.ExpressionSequence(expr_list=embeddings,
                                            mask=x.mask if batchers.is_batched(x) else None)
def transduce(self, seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if self.train and self.dropout > 0.0:
    seq_tensor = tt.dropout(self.child.transduce(seq).as_tensor(), self.dropout) + seq.as_tensor()
  else:
    seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
  if self.layer_norm:
    batch_size = tt.batch_size(seq_tensor)
    merged_seq_tensor = tt.merge_time_batch_dims(seq_tensor)
    transformed_seq_tensor = self.layer_norm_component.transform(merged_seq_tensor)
    seq_tensor = tt.unmerge_time_batch_dims(transformed_seq_tensor, batch_size)
  return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)