def transduce(self, xs: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
  Wx = dy.parameter(self.p_Wx)
  Wh = dy.parameter(self.p_Wh)
  b = dy.parameter(self.p_b)
  h = []
  c = []
  for i, x_t in enumerate(xs):
    # One affine transform computes all four gate pre-activations; the first
    # step has no previous hidden state to feed back.
    if i == 0:
      tmp = dy.affine_transform([b, Wx, x_t])
    else:
      tmp = dy.affine_transform([b, Wx, x_t, Wh, h[-1]])
    # Slice out input / forget / output / cell-candidate pre-activations.
    i_ait = dy.pick_range(tmp, 0, self.hidden_dim)
    i_aft = dy.pick_range(tmp, self.hidden_dim, self.hidden_dim * 2)
    i_aot = dy.pick_range(tmp, self.hidden_dim * 2, self.hidden_dim * 3)
    i_agt = dy.pick_range(tmp, self.hidden_dim * 3, self.hidden_dim * 4)
    i_it = dy.logistic(i_ait)
    i_ft = dy.logistic(i_aft + 1.0)  # forget-gate bias of 1.0 eases gradient flow early in training
    i_ot = dy.logistic(i_aot)
    i_gt = dy.tanh(i_agt)
    if i == 0:
      c.append(dy.cmult(i_it, i_gt))
    else:
      c.append(dy.cmult(i_ft, c[-1]) + dy.cmult(i_it, i_gt))
    h.append(dy.cmult(i_ot, dy.tanh(c[-1])))
  # Wrap in an ExpressionSequence to match the declared return type.
  return expression_seqs.ExpressionSequence(expr_list=h)
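# Hedged sketch (not part of the original file; `hidden_dim`/`input_dim` and the
# helper name are illustrative assumptions): demonstrates the packed-gate
# convention the transducer above relies on, where a single affine transform
# yields [i; f; o; g] and dy.pick_range slices out each gate.
def _example_packed_gates():
  import dynet as dy
  hidden_dim, input_dim = 4, 3
  m = dy.ParameterCollection()
  p_Wx = m.add_parameters((hidden_dim * 4, input_dim))
  p_b = m.add_parameters((hidden_dim * 4,))
  dy.renew_cg()
  x_t = dy.inputVector([0.1] * input_dim)
  gates = dy.affine_transform([dy.parameter(p_b), dy.parameter(p_Wx), x_t])
  i_t = dy.logistic(dy.pick_range(gates, 0, hidden_dim))
  f_t = dy.logistic(dy.pick_range(gates, hidden_dim, 2 * hidden_dim) + 1.0)
  # o and g follow the same slicing pattern as in transduce() above.
  assert gates.dim()[0] == (hidden_dim * 4,)
  return i_t, f_t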
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if expr_seq.dim()[1] > 1:
    raise ValueError(f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}")
  lattice = self.cur_src[0]
  Wx_iog = dy.parameter(self.p_Wx_iog)
  Wh_iog = dy.parameter(self.p_Wh_iog)
  b_iog = dy.parameter(self.p_b_iog)
  Wx_f = dy.parameter(self.p_Wx_f)
  Wh_f = dy.parameter(self.p_Wh_f)
  b_f = dy.parameter(self.p_b_f)
  h = []
  c = []
  batch_size = expr_seq.dim()[1]
  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)
  for node_i in range(lattice.sent_len()):
    cur_node = lattice.nodes[node_i]
    val = expr_seq[node_i]
    if self.dropout_rate > 0.0 and self.train:
      val = dy.cmult(val, self.dropout_mask_x)
    i_ft_list = []
    if len(cur_node.nodes_prev) == 0:
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
    else:
      h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
      for pred in cur_node.nodes_prev:
        i_ft_list.append(dy.logistic(dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
    i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
    i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
    i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)
    i_it = dy.logistic(i_ait)
    i_ot = dy.logistic(i_aot)
    i_gt = dy.tanh(i_agt)
    if len(cur_node.nodes_prev) == 0:
      c.append(dy.cmult(i_it, i_gt))
    else:
      fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
      for i in range(1, len(cur_node.nodes_prev)):
        fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
      c.append(fc + dy.cmult(i_it, i_gt))
    h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
    if self.dropout_rate > 0.0 and self.train:
      h_t = dy.cmult(h_t, self.dropout_mask_h)
    h.append(h_t)
  self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
  return expression_seqs.ExpressionSequence(expr_list=h)
def transduce(self, embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
  batch_size = embed_sent[0].dim()[1]
  actions = self.sample_segmentation(embed_sent, batch_size)
  sample_size = len(actions)
  embeddings = dy.concatenate(embed_sent.expr_list, d=1)
  embeddings.value()  # force forward computation before picking batch elements
  composed_words = []
  for i in range(batch_size):
    sequence = dy.pick_batch_elem(embeddings, i)
    # For each sampled segmentation
    for j, sample in enumerate(actions):
      lower_bound = 0
      # Read every 'segment' decision
      for k, upper_bound in enumerate(sample[i]):
        char_sequence = dy.pick_range(sequence, lower_bound, upper_bound + 1, 1)
        composed_words.append((char_sequence, j, i, k, lower_bound, upper_bound + 1))
        #self.segment_composer.set_word_boundary(lower_bound, upper_bound, self.src_sent[i])
        #composed = self.segment_composer.transduce(char_sequence)
        #outputs[j][i].append(composed)
        lower_bound = upper_bound + 1
  outputs = self.segment_composer.compose(composed_words, sample_size, batch_size)
  # Padding + return
  try:
    if self.length_prior:
      seg_size_unpadded = [[len(outputs[i][j]) for j in range(batch_size)] for i in range(sample_size)]
    enc_outputs = []
    for batched_sampled_sentence in outputs:
      sampled_sentence, segment_mask = self.pad(batched_sampled_sentence)
      expr_seq = ExpressionSequence(expr_tensor=dy.concatenate_to_batch(sampled_sentence), mask=segment_mask)
      sent_context = self.final_transducer.transduce(expr_seq)
      self.final_states.append(self.final_transducer.get_final_states())
      enc_outputs.append(sent_context)
    return CompoundSeqExpression(enc_outputs)
  finally:
    if self.length_prior:
      self.seg_size_unpadded = seg_size_unpadded
    self.compose_output = outputs
    self.segment_actions = actions
    if not self.train and self.compute_report:
      self.add_sent_for_report({"segment_actions": actions})
def transduce(self, embed_sent: expr_seq.ExpressionSequence) -> List[expr_seq.ExpressionSequence]:
  self.create_trajectories(embed_sent, force_oracle=False)
  # Take the index array out of np.nonzero's 1-tuple.
  actions = [np.nonzero(a.content)[0] for a in self.actions]
  actions = [[a for a in actions[i] if a < self.src_sents[i].len_unpadded()]
             for i in range(len(actions))]
  # Create sentence embedding
  outputs = []
  embeddings = dy.concatenate(embed_sent.expr_list, d=1)
  for i in range(self.src_sents.batch_size()):
    sequence = dy.pick_batch_elem(embeddings, i)
    src = self.src_sents[i]
    lower_bound = 0
    output = []
    for j, upper_bound in enumerate(actions[i]):
      # Skip extracting character embeddings when no_char_embed is set
      # (mirrors the batched variant of this transducer).
      char_sequence = dy.pick_range(sequence, lower_bound, upper_bound + 1, 1) if not self.no_char_embed else None
      output.append(self.segment_composer.compose_single(char_sequence, src, lower_bound, upper_bound + 1))
      lower_bound = upper_bound + 1
    outputs.append(output)
  outputs = pad_output()  # NOTE: `pad_output` is not defined in this excerpt; presumably it pads the per-sentence outputs like `self.pad` in the batched variant
  return self.final_transducer.transduce(outputs)
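# Why the [0] above: for 1-D input, np.nonzero returns a 1-tuple wrapping the
# index array, so without [0] the list comprehension would iterate over the
# tuple rather than over the segment-end indices:
#   >>> np.nonzero(np.array([0, 1, 0, 1]))     # (array([1, 3]),)
#   >>> np.nonzero(np.array([0, 1, 0, 1]))[0]  # array([1, 3])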
def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  mask = es.mask
  sent_len = len(es)
  es_expr = es.as_transposed_tensor()
  batch_size = es_expr.dim()[1]
  es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size)
  h_out = {}
  for direction in ["fwd", "bwd"]:
    # input convolutions
    gates_xt_bias = dy.conv2d_bias(es_chn,
                                   dy.parameter(self.params["x2all_" + direction]),
                                   dy.parameter(self.params["b_" + direction]),
                                   stride=(1, 1),
                                   is_valid=False)
    gates_xt_bias_list = [dy.pick_range(gates_xt_bias, i, i + 1) for i in range(sent_len)]
    h = []
    c = []
    for input_pos in range(sent_len):
      directional_pos = input_pos if direction == "fwd" else sent_len - input_pos - 1
      gates_t = gates_xt_bias_list[directional_pos]
      if input_pos > 0:
        # recurrent convolutions
        gates_h_t = dy.conv2d(h[-1],
                              dy.parameter(self.params["h2all_" + direction]),
                              stride=(1, 1),
                              is_valid=False)
        gates_t += gates_h_t
      # standard LSTM logic
      if len(c) == 0:
        c_tm1 = dy.zeros((self.freq_dim * self.num_filters,), batch_size=batch_size)
      else:
        c_tm1 = c[-1]
      gates_t_reshaped = dy.reshape(gates_t, (4 * self.freq_dim * self.num_filters,), batch_size=batch_size)
      c_t = dy.reshape(dy.vanilla_lstm_c(c_tm1, gates_t_reshaped),
                       (self.freq_dim * self.num_filters,), batch_size=batch_size)
      h_t = dy.vanilla_lstm_h(c_t, gates_t_reshaped)
      h_t = dy.reshape(h_t, (1, self.freq_dim, self.num_filters,), batch_size=batch_size)
      if mask is None or np.isclose(np.sum(mask.np_arr[:, input_pos:input_pos + 1]), 0.0):
        c.append(c_t)
        h.append(h_t)
      else:
        c.append(mask.cmult_by_timestep_expr(c_t, input_pos, True)
                 + mask.cmult_by_timestep_expr(c[-1], input_pos, False))
        h.append(mask.cmult_by_timestep_expr(h_t, input_pos, True)
                 + mask.cmult_by_timestep_expr(h[-1], input_pos, False))
    h_out[direction] = h
  ret_expr = []
  for state_i in range(len(h_out["fwd"])):
    state_fwd = h_out["fwd"][state_i]
    state_bwd = h_out["bwd"][-1 - state_i]
    output_dim = (state_fwd.dim()[0][1] * state_fwd.dim()[0][2],)
    fwd_reshape = dy.reshape(state_fwd, output_dim, batch_size=batch_size)
    bwd_reshape = dy.reshape(state_bwd, output_dim, batch_size=batch_size)
    ret_expr.append(dy.concatenate([fwd_reshape, bwd_reshape], d=0 if self.reshape_output else 2))
  return expression_seqs.ExpressionSequence(expr_list=ret_expr, mask=mask)
  # TODO: implement get_final_states()
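# Minimal sketch of the fused LSTM primitives used in transduce() above
# (helper name and sizes are illustrative): dy.vanilla_lstm_c and
# dy.vanilla_lstm_h take the previous cell state plus packed gate values of
# size 4*hidden and produce the new cell and hidden states in one op each.
def _example_vanilla_lstm_ops():
  import dynet as dy
  dy.renew_cg()
  hidden = 3
  c_tm1 = dy.zeros((hidden,))
  gates = dy.inputVector([0.0] * (4 * hidden))  # packed [i; f; o; g] values
  c_t = dy.vanilla_lstm_c(c_tm1, gates)
  h_t = dy.vanilla_lstm_h(c_t, gates)
  assert h_t.dim()[0] == (hidden,)
  return h_t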
def transduce(self, embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
  batch_size = embed_sent[0].dim()[1]
  actions = self.sample_segmentation(embed_sent, batch_size)
  embeddings = dy.concatenate(embed_sent.expr_list, d=1)
  embeddings.value()  # force forward computation before picking batch elements
  composed_words = []
  for i in range(batch_size):
    sequence = dy.pick_batch_elem(embeddings, i)
    # For each segment decision in the sampled segmentation
    lower_bound = 0
    for j, upper_bound in enumerate(actions[i]):
      if self.no_char_embed:
        char_sequence = []
      else:
        char_sequence = dy.pick_range(sequence, lower_bound, upper_bound + 1, 1)
      composed_words.append((char_sequence, i, j, lower_bound, upper_bound + 1))
      lower_bound = upper_bound + 1
  outputs = self.segment_composer.compose(composed_words, batch_size)
  # Padding + return
  try:
    if self.length_prior:
      seg_size_unpadded = [len(outputs[i]) for i in range(batch_size)]
    sampled_sentence, segment_mask = self.pad(outputs)
    expr_seq = ExpressionSequence(expr_tensor=dy.concatenate_to_batch(sampled_sentence), mask=segment_mask)
    return self.final_transducer.transduce(expr_seq)
  finally:
    if self.length_prior:
      self.seg_size_unpadded = seg_size_unpadded
    self.compose_output = outputs
    self.segment_actions = actions
    if not self.train and self.is_reporting():
      if len(actions) == 1:  # Support only AccuracyEvalTask
        self.report_sent_info({"segment_actions": actions})
def transduce(self, h_below: 'expression_seqs.ExpressionSequence', h_above, z_below) -> 'Tuple[dy.Expression, dy.Expression]':
  if self.c is None:
    self.c = dy.zeros(dim=(self.hidden_dim,))  # ?? does (hidden,) take care of batch_size?
  if self.h is None:
    self.h = dy.zeros(dim=(self.hidden_dim,))
  if self.z is None:
    self.z = dy.ones(dim=(1,))
  W_1l_r = dy.parameter(self.p_W_1l_r)
  bias = dy.parameter(self.p_bias)
  h = self.h  # previous hidden state; dy.parameter() only applies to Parameters, not stored expressions
  s_recur = W_1l_r * h  # matrix multiply is *, element-wise is dy.cmult
  if not self.last_layer:
    W_2l_td = dy.parameter(self.p_W_2l_td)
    W_0l_bu = dy.parameter(self.p_W_0l_bu)
    s_bottomup = W_0l_bu * h_below  # ?? this is becoming (2049,). does it need to be (2049,1) to do scalar * matrix?
    s_topdown = W_2l_td * h_above
  else:
    s_topdown = dy.zeros((s_recur.dim()[0][0],))  # ?? this gets the shape e.g. ((5, 1), 1). do i actually want batch_size as well?
    s_bottomup = W_1l_r * h
  s_bottomup = dy.cmult(z_below, s_bottomup)  # to handle batched scalar * matrix -> e.g. (1x10, 2049x10)
  s_topdown = dy.cmult(self.z, s_topdown)  # will be zeros if last_layer. is this right, or should z=1 in this case??

  fslice = s_recur + s_topdown + s_bottomup + bias  # ?? check: does bias have the same shape as s_recur et al., [4*hidden+1, batch_size]?

  i_ft = dy.pick_range(fslice, 0, self.hidden_dim)
  i_it = dy.pick_range(fslice, self.hidden_dim, self.hidden_dim * 2)
  i_ot = dy.pick_range(fslice, self.hidden_dim * 2, self.hidden_dim * 3)
  i_gt = dy.pick_range(fslice, self.hidden_dim * 3, self.hidden_dim * 4)

  f_t = dy.logistic(i_ft + 1.0)  # +1.0 because initializing the forget-gate bias high was reported to work better (matthias)
  i_t = dy.logistic(i_it)
  o_t = dy.logistic(i_ot)
  g_t = dy.tanh(i_gt)

  # z * normal_update + (1-z) * copy: i.e. when z_below is 0, z_new = z (copied
  # from the previous timestep); when z_below is 1, z_new = dy.round(...) etc.

  # hier = True variant:
  # z_tmp = dy.pick_range(fslice, self.hidden_dim*4, self.hidden_dim*4+1)
  # z_tilde = dy.logistic(z_tmp)  # original: hard sigmoid + slope annealing (a)
  # z_new = dy.cmult(1-z_below, self.z) + dy.cmult(z_below, dy.round(z_tilde, gradient_mode="straight_through_gradient"))

  # hier = False
  z_tmp = dy.pick_range(fslice, self.hidden_dim * 4, self.hidden_dim * 4 + 1)
  z_tilde = dy.logistic(z_tmp)  # original: hard sigmoid + slope annealing (a)
  z_new = dy.round(z_tilde, gradient_mode="straight_through_gradient")  # straight-through estimator: step fn forward, sigmoid gradient backward

  # z = z_{l,t-1}; z_below = z_{l-1,t}
  # if self.z.value() == 1:  # FLUSH
  #   c_new = dy.cmult(i_t, g_t)
  #   h_new = dy.cmult(o_t, dy.tanh(c_new))
  # elif z_below.value() == 0:  # COPY

  # With the flush case removed, only copy or normal update remain: when
  # z_below is 0, c_new and h_new are self.c and self.h; when z_below is 1,
  # they follow the normal LSTM update.
  c_new = dy.cmult((1 - z_below), self.c) + dy.cmult(z_below, dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t))
  h_new = dy.cmult((1 - z_below), self.h) + dy.cmult(z_below, dy.cmult(o_t, dy.tanh(c_new)))

  # Equivalent unbatched control flow:
  # if z_below.value() == 0:  # COPY
  #   c_new = self.c
  #   h_new = self.h
  # else:  # UPDATE
  #   c_new = dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)
  #   h_new = dy.cmult(o_t, dy.tanh(c_new))

  self.c = c_new
  self.h = h_new
  self.z = z_new
  return h_new, z_new
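# Minimal sketch (illustrative, not from the original file) of the
# straight-through boundary decision used in transduce() above: the forward
# pass applies a hard step via rounding, while the backward pass lets the
# sigmoid's gradient flow through unchanged.
def _example_straight_through():
  import dynet as dy
  dy.renew_cg()
  z_tilde = dy.logistic(dy.inputVector([0.3]))  # soft boundary probability
  z = dy.round(z_tilde, gradient_mode="straight_through_gradient")
  # forward: z is exactly 0.0 or 1.0; backward: dz/dz_tilde behaves as 1
  return z.value()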
def __call__(self, x: 'ExpressionSequence', att_mask: np.ndarray, batch_mask: np.ndarray, p: numbers.Real):
  """
  Args:
    x: expression sequence of dimensions (input_dim, time) x batch
    att_mask: numpy array of dimensions (time, time); pre-transposed
    batch_mask: numpy array of dimensions (batch, time)
    p: dropout probability
  """
  sent_len = x.dim()[0][1]
  batch_size = x[0].dim()[1]

  if self.downsample_factor > 1:
    if sent_len % self.downsample_factor != 0:
      raise ValueError("For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                       "Configure batcher accordingly.")
    if batch_mask is not None:
      batch_mask = batch_mask[:, ::self.downsample_factor]
    sent_len_out = sent_len // self.downsample_factor
    sent_len = sent_len_out
    out_mask = x.mask
    if out_mask is not None:
      out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_factor)
    # Stack downsample_factor adjacent time steps into the feature dimension.
    x = ExpressionSequence(expr_tensor=dy.reshape(x.as_tensor(),
                                                  (x.dim()[0][0] * self.downsample_factor,
                                                   x.dim()[0][1] // self.downsample_factor),
                                                  batch_size=batch_size),
                           mask=out_mask)
    residual = SAAMTimeDistributed()(x)
  else:
    residual = SAAMTimeDistributed()(x)
    sent_len_out = sent_len
  if self.model_dim != self.input_dim * self.downsample_factor:
    residual = self.res_shortcut.transform(residual)

  # Concatenate all the words together for doing vectorized affine transform
  if self.kq_pos_encoding_type is None:
    kvq_lin = self.linear_kvq.transform(SAAMTimeDistributed()(x))
    key_up = self.shape_projection(dy.pick_range(kvq_lin, 0, self.head_count * self.dim_per_head), batch_size)
    value_up = self.shape_projection(dy.pick_range(kvq_lin, self.head_count * self.dim_per_head,
                                                   2 * self.head_count * self.dim_per_head), batch_size)
    query_up = self.shape_projection(dy.pick_range(kvq_lin, 2 * self.head_count * self.dim_per_head,
                                                   3 * self.head_count * self.dim_per_head), batch_size)
  else:
    assert self.kq_pos_encoding_type == "embedding"
    encoding = self.kq_positional_embedder.embed_sent(sent_len).as_tensor()
    kq_lin = self.linear_kq.transform(SAAMTimeDistributed()(
      ExpressionSequence(expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
    key_up = self.shape_projection(dy.pick_range(kq_lin, 0, self.head_count * self.dim_per_head), batch_size)
    query_up = self.shape_projection(dy.pick_range(kq_lin, self.head_count * self.dim_per_head,
                                                   2 * self.head_count * self.dim_per_head), batch_size)
    v_lin = self.linear_v.transform(SAAMTimeDistributed()(x))
    value_up = self.shape_projection(v_lin, batch_size)

  if self.cross_pos_encoding_type:
    assert self.cross_pos_encoding_type == "embedding"
    emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0, sent_len)
    emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0, sent_len)
    key_up = dy.reshape(key_up, (sent_len, self.dim_per_head, self.head_count), batch_size=batch_size)
    key_up = dy.concatenate_cols([dy.cmult(key_up, emb1), dy.cmult(key_up, emb2)])
    key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2), batch_size=self.head_count * batch_size)
    query_up = dy.reshape(query_up, (sent_len, self.dim_per_head, self.head_count), batch_size=batch_size)
    query_up = dy.concatenate_cols([dy.cmult(query_up, emb2), dy.cmult(query_up, -emb1)])
    query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2), batch_size=self.head_count * batch_size)

  scaled = query_up * dy.transpose(key_up / math.sqrt(self.dim_per_head))  # scale before the matrix multiplication to save memory

  # Apply masks
  if not self.ignore_masks:
    if att_mask is not None:
      att_mask_inp = att_mask * -100.0
      if self.downsample_factor > 1:
        att_mask_inp = att_mask_inp[::self.downsample_factor, ::self.downsample_factor]
      scaled += dy.inputTensor(att_mask_inp)
    if batch_mask is not None:
      # reshape (batch, time) -> (time, head_count*batch), then * -100
      inp = np.resize(np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                      (sent_len, self.head_count, batch_size)),
                      (1, sent_len, self.head_count * batch_size)) * -100
      mask_expr = dy.inputTensor(inp, batched=True)
      scaled += mask_expr
    if self.diag_gauss_mask:
      diag_growing = np.zeros((sent_len, sent_len, self.head_count))
      for i in range(sent_len):
        for j in range(sent_len):
          diag_growing[i, j, :] = -(i - j) ** 2 / 2.0
      e_diag_gauss_mask = dy.inputTensor(diag_growing)
      e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
      if self.square_mask_std:
        e_sigma = dy.square(e_sigma)
      e_sigma_sq_inv = dy.cdiv(dy.ones(e_sigma.dim()[0], batch_size=batch_size), dy.square(e_sigma))
      e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask, e_sigma_sq_inv)
      scaled += dy.reshape(e_diag_gauss_mask_final, (sent_len, sent_len), batch_size=batch_size * self.head_count)

  # Compute attention weights
  attn = dy.softmax(scaled, d=1)
  if LOG_ATTENTION:
    yaml_logger.info({"key": "selfatt_mat_ax0", "value": np.average(attn.value(), axis=0).dumps(), "desc": self.desc})
    yaml_logger.info({"key": "selfatt_mat_ax1", "value": np.average(attn.value(), axis=1).dumps(), "desc": self.desc})
    yaml_logger.info({"key": "selfatt_mat_ax0_ent", "value": entropy(attn.value()).dumps(), "desc": self.desc})
    yaml_logger.info({"key": "selfatt_mat_ax1_ent", "value": entropy(attn.value().transpose()).dumps(), "desc": self.desc})

  self.select_att_head = 0  # NOTE: hard-codes zeroing out all but the first attention head; looks like a debugging override
  if self.select_att_head is not None:
    attn = dy.reshape(attn, (sent_len, sent_len, self.head_count), batch_size=batch_size)
    sel_mask = np.zeros((1, 1, self.head_count))
    sel_mask[0, 0, self.select_att_head] = 1.0
    attn = dy.cmult(attn, dy.inputTensor(sel_mask))
    attn = dy.reshape(attn, (sent_len, sent_len), batch_size=self.head_count * batch_size)

  # Apply dropout to the attention weights
  if p > 0.0:
    drop_attn = dy.dropout(attn, p)
  else:
    drop_attn = attn

  # Compute weighted attention score
  attn_prod = drop_attn * value_up

  # Reshape attn_prod back to the input query dimensions
  out = dy.reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), batch_size=batch_size)
  out = dy.transpose(out)
  out = dy.reshape(out, (self.model_dim,), batch_size=batch_size * sent_len_out)
  # out = dy.reshape_transpose_reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), (self.model_dim,), pre_batch_size=batch_size, post_batch_size=batch_size*sent_len_out)

  if self.plot_attention:
    from sklearn.metrics.pairwise import cosine_similarity
    assert batch_size == 1
    mats = []
    for i in range(attn.dim()[1]):
      mats.append(dy.pick_batch_elem(attn, i).npvalue())
      self.plot_att_mat(mats[-1],
                        "{}.sent_{}.head_{}.png".format(self.plot_attention, self.plot_attention_counter, i),
                        300)
    avg_mat = np.average(mats, axis=0)
    self.plot_att_mat(avg_mat,
                      "{}.sent_{}.head_avg.png".format(self.plot_attention, self.plot_attention_counter),
                      300)
    cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
    self.plot_att_mat(cosim_before,
                      "{}.sent_{}.cosim_before.png".format(self.plot_attention, self.plot_attention_counter),
                      600)
    cosim_after = cosine_similarity(out.npvalue().T)
    self.plot_att_mat(cosim_after,
                      "{}.sent_{}.cosim_after.png".format(self.plot_attention, self.plot_attention_counter),
                      600)
    self.plot_attention_counter += 1

  # Apply dropout, add the residual, then layer normalization
  if p > 0.0:
    res = dy.dropout(out, p) + residual
  else:
    res = out + residual
  ret = self.layer_norm.transform(res)
  return ret
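# Hypothetical sketch of shape_projection(), which is referenced above but not
# shown in this excerpt: a typical DyNet multi-head reshape that turns the
# time-distributed projection into per-head (sent_len, dim_per_head) matrices
# with head_count * batch_size batch elements, so each head attends
# independently. All names and shapes here are assumptions, not the original
# implementation.
def _example_shape_projection():
  import dynet as dy
  sent_len, dim_per_head, head_count, batch_size = 5, 8, 2, 3
  dy.renew_cg()
  # time-distributed input: (head_count*dim_per_head,) x (batch_size*sent_len)
  x = dy.zeros((head_count * dim_per_head,), batch_size=batch_size * sent_len)
  tmp = dy.reshape(x, (head_count * dim_per_head, sent_len), batch_size=batch_size)
  out = dy.reshape(dy.transpose(tmp), (sent_len, dim_per_head),
                   batch_size=batch_size * head_count)
  assert out.dim() == ((sent_len, dim_per_head), batch_size * head_count)
  return out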
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
  if expr_seq.batch_size() > 1:
    raise ValueError(f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.batch_size()}")
  lattice = self.cur_src[0]
  Wx_iog = dy.parameter(self.p_Wx_iog)
  Wh_iog = dy.parameter(self.p_Wh_iog)
  b_iog = dy.parameter(self.p_b_iog)
  Wx_f = dy.parameter(self.p_Wx_f)
  Wh_f = dy.parameter(self.p_Wh_f)
  b_f = dy.parameter(self.p_b_f)
  h = {}
  c = {}
  h_list = []
  batch_size = expr_seq.batch_size()
  if self.dropout_rate > 0.0 and self.train:
    self.set_dropout_masks(batch_size=batch_size)
  for i, cur_node_id in enumerate(lattice.nodes):
    prev_node = list(lattice.graph.predecessors(cur_node_id))  # materialize, in case predecessors() returns an iterator
    val = expr_seq[i]
    if self.dropout_rate > 0.0 and self.train:
      val = dy.cmult(val, self.dropout_mask_x)
    i_ft_list = []
    if len(prev_node) == 0:
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
    else:
      # Sum the hidden states of all lattice predecessors, and compute one
      # forget gate per predecessor.
      h_tilde = sum(h[pred] for pred in prev_node)
      tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
      for pred in prev_node:
        i_ft_list.append(dy.logistic(dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
    i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
    i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
    i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)
    i_it = dy.logistic(i_ait)
    i_ot = dy.logistic(i_aot)
    i_gt = dy.tanh(i_agt)
    if len(prev_node) == 0:
      c[cur_node_id] = dy.cmult(i_it, i_gt)
    else:
      fc = dy.cmult(i_ft_list[0], c[prev_node[0]])
      for j in range(1, len(prev_node)):
        fc += dy.cmult(i_ft_list[j], c[prev_node[j]])
      c[cur_node_id] = fc + dy.cmult(i_it, i_gt)
    h_t = dy.cmult(i_ot, dy.tanh(c[cur_node_id]))
    if self.dropout_rate > 0.0 and self.train:
      h_t = dy.cmult(h_t, self.dropout_mask_h)
    h[cur_node_id] = h_t
    h_list.append(h_t)
  self._final_states = [transducers.FinalTransducerState(h_list[-1], h_list[-1])]
  return expression_seqs.ExpressionSequence(expr_list=h_list)
def __call__(self, es, transitions):
  mask = es.mask
  # Append a final shift + reduce so every sequence ends with a complete tree.
  transitions = [t + [0, 1] for t in transitions]
  transitions = np.array(transitions)
  Wl = dy.parameter(self.p_Wl)
  Wr = dy.parameter(self.p_Wr)
  b = dy.parameter(self.p_b)
  batch_size = len(transitions)
  ha = []
  self.hfinals = []
  self.cfinals = []
  for i in range(batch_size):
    hstack = []
    cstack = []
    htmp = []
    count = 0
    for j in range(len(transitions[i])):
      if transitions[i][j] == 0:  # shift the next input embedding onto the stack
        e1 = dy.reshape(es[count], (batch_size, self.hidden_dim))[i]
        count += 1
        hstack.append(e1)
        cstack.append(e1)
      elif transitions[i][j] == 1:  # reduce: compose the top two stack entries
        h1 = hstack.pop()
        h2 = hstack.pop()
        c1 = cstack.pop()
        c2 = cstack.pop()
        tmp = dy.affine_transform([b, Wl, h1, Wr, h2])
        i_gate = dy.pick_range(tmp, 0, self.hidden_dim)
        fl_gate = dy.pick_range(tmp, self.hidden_dim, self.hidden_dim * 2)
        fr_gate = dy.pick_range(tmp, self.hidden_dim * 2, self.hidden_dim * 3)
        o_gate = dy.pick_range(tmp, self.hidden_dim * 3, self.hidden_dim * 4)
        cell_inp = dy.pick_range(tmp, self.hidden_dim * 4, self.hidden_dim * 5)
        # Gates are sigmoids; the cell candidate is tanh.
        i_gate = dy.logistic(i_gate)
        fl_gate = dy.logistic(fl_gate)
        fr_gate = dy.logistic(fr_gate)
        o_gate = dy.logistic(o_gate)
        cell_inp = dy.tanh(cell_inp)
        c_t = dy.cmult(fl_gate, c1) + dy.cmult(fr_gate, c2) + dy.cmult(i_gate, cell_inp)
        h_t = dy.cmult(o_gate, dy.tanh(c_t))
        cstack.append(c_t)
        hstack.append(h_t)
        htmp.append(h_t)
      else:  # padding
        htmp.append(dy.zeros((self.hidden_dim,)))
    self.hfinals.append(h_t)
    self.cfinals.append(c_t)
    ha.append(htmp)
  self._final_states = [FinalTransducerState(dy.concatenate_to_batch(self.hfinals),
                                             dy.concatenate_to_batch(self.cfinals))]
  # Transpose from per-sentence lists to per-timestep lists, then re-batch.
  ha = list(zip_longest(*ha))
  hh = [list(x) for x in ha]
  k = [dy.reshape(dy.concatenate(xx), (xx[0].dim()[0][0], len(xx))) for xx in hh]
  return ExpressionSequence(expr_list=k)
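# Illustrative note on the transition encoding consumed above (semantics
# assumed from the shift/reduce handling in __call__): each sequence encodes a
# binary parse, with 0 = shift the next embedding onto the stack and
# 1 = reduce the top two stack entries. For example, the tree ((w0 w1) w2)
# would be written as:
#   transitions = [[0, 0, 1, 0, 1]]
# __call__ then appends [0, 1] to every sequence before processing.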