def calc_loss(self, src, db_idx, src_mask=None, trg_mask=None):
    src_embeddings = self.src_embedder.embed_sent(src, mask=src_mask)
    self.src_encoder.set_input(src)
    src_encodings = self.exprseq_pooling(self.src_encoder.transduce(src_embeddings))
    trg_batch, trg_mask = self.database[db_idx]
    # print("trg_mask=\n", trg_mask)
    trg_encodings = self.encode_trg_example(trg_batch, mask=trg_mask)
    dim = trg_encodings.dim()
    trg_reshaped = dy.reshape(trg_encodings, (dim[0][0], dim[1]))
    # ### DEBUG
    # trg_npv = trg_reshaped.npvalue()
    # for i in range(dim[1]):
    #     print("--- trg_reshaped {}: {}".format(i, list(trg_npv[:, i])))
    # ### DEBUG
    prod = dy.transpose(src_encodings) * trg_reshaped
    # ### DEBUG
    # prod_npv = prod.npvalue()
    # for i in range(dim[1]):
    #     print("--- prod {}: {}".format(i, list(prod_npv[0].transpose()[i])))
    # ### DEBUG
    id_range = list(range(len(db_idx)))
    # This is ugly:
    if self.loss_direction == "forward":
        prod = dy.transpose(prod)
        loss = dy.sum_batches(dy.hinge_batch(prod, id_range))
    elif self.loss_direction == "bidirectional":
        prod = dy.reshape(prod, (len(db_idx), len(db_idx)))
        loss = dy.sum_elems(
            dy.hinge_dim(prod, id_range, d=0) + dy.hinge_dim(prod, id_range, d=1))
    else:
        raise RuntimeError("Illegal loss direction {}".format(self.loss_direction))
    return loss
def hier_attend(self, context_pre, context_pos, state):
    w2 = dy.parameter(self.hier_w2)
    v = dy.parameter(self.hier_v)
    w2dt = w2 * dy.concatenate(list(state.s()))

    # context_pre
    w1_pre = dy.parameter(self.hier_w1_pre)
    w1dt_pre = w1_pre * context_pre
    energy_pre = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt_pre, w2dt)))
    w_pre = dy.parameter(self.hier_w_pre)
    wdt_pre = w_pre * context_pre

    # context_pos
    w1_pos = dy.parameter(self.hier_w1_pos)
    w1dt_pos = w1_pos * context_pos
    energy_pos = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt_pos, w2dt)))
    w_pos = dy.parameter(self.hier_w_pos)
    wdt_pos = w_pos * context_pos

    beta = dy.softmax(dy.concatenate([energy_pre, energy_pos]))
    wdt = dy.concatenate_cols([wdt_pre, wdt_pos])
    context = wdt * beta
    return context
def encode_batch_seq(self, src_seq, src_seq_rev, sentLengths):
    # Run the forward LSTM over each sentence and the backward LSTM over each
    # reversed sentence.
    fwd_vectors = [self.enc_fwd_lstm.initial_state().transduce(src)
                   for src in src_seq]
    bwd_vectors = [self.enc_bwd_lstm.initial_state().transduce(src_rev)
                   for src_rev in src_seq_rev]
    # The backward LSTM ran over reversed inputs: flip the first sentLen
    # outputs of every sentence back so they align with the forward outputs
    # (any padded positions are left untouched).
    for i, sentLen in enumerate(sentLengths):
        bwd_vectors[i][:sentLen] = bwd_vectors[i][:sentLen][::-1]
    # Concatenate forward and backward states position by position.
    vectors = [[dynet.concatenate([f, b]) for f, b in zip(fwd_sent, bwd_sent)]
               for fwd_sent, bwd_sent in zip(fwd_vectors, bwd_vectors)]
    return vectors
def cal_scores(self, src_encodings, predict=False):
    src_len = len(src_encodings)
    src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size
    batch_size = src_encodings.dim()[1]

    W_pos = dy.parameter(self.W_pos)
    b_pos = dy.parameter(self.b_pos)
    W_xpos = dy.parameter(self.W_xpos)
    b_xpos = dy.parameter(self.b_xpos)

    W_affine_pos = dy.parameter(self.W_affine_pos)
    b_affine_pos = dy.parameter(self.b_affine_pos)
    W_affine_xpos = dy.parameter(self.W_affine_xpos)
    b_affine_xpos = dy.parameter(self.b_affine_xpos)

    if predict:
        pos = self.leaky_ReLu(dy.affine_transform([b_pos, W_pos, src_encodings]))  # n_pos_mlp_units, src_len, bs
        xpos = self.leaky_ReLu(dy.affine_transform([b_xpos, W_xpos, src_encodings]))
    else:
        src_encodings = dy.dropout_dim(src_encodings, 1, self.dropout)
        pos = dy.dropout_dim(
            self.leaky_ReLu(dy.affine_transform([b_pos, W_pos, src_encodings])),
            1, self.dropout)  # n_pos_mlp_units, src_len, bs
        xpos = dy.dropout_dim(
            self.leaky_ReLu(dy.affine_transform([b_xpos, W_xpos, src_encodings])),
            1, self.dropout)

    pos_label = dy.affine_transform([b_affine_pos, dy.transpose(W_affine_pos), pos])
    xpos_label = dy.affine_transform([b_affine_xpos, dy.transpose(W_affine_xpos), xpos])
    return pos_label, xpos_label
def calc_attention(self, state):
    logger.warning("BilinearAttender does not currently do masking, which may harm training results.")
    Wa = dy.parameter(self.pWa)
    scores = (dy.transpose(state) * Wa) * self.I
    normalized = dy.softmax(scores)
    self.attention_vecs.append(normalized)
    return dy.transpose(normalized)
def attend(self, H_e, h_t):
    H_e = dy.concatenate_cols(H_e)
    S = dy.transpose(h_t) * self.attention_weight * H_e
    S = dy.transpose(S)
    A = dy.softmax(S)
    context_vector = H_e * A
    return context_vector
def calc_attention(self, state):
    V = dy.parameter(self.pV)
    U = dy.parameter(self.pU)

    WI = self.WI
    curr_sent_mask = self.curr_sent.mask
    if self.attention_vecs:
        conv_feats = dy.conv2d(self.attention_vecs[-1], self.pL, stride=[1, 1], is_valid=False)
        conv_feats = dy.transpose(
            dy.reshape(conv_feats, (conv_feats.dim()[0][0], self.hidden_dim),
                       batch_size=conv_feats.dim()[1]))
        h = dy.tanh(dy.colwise_add(WI + conv_feats, V * state))
    else:
        h = dy.tanh(dy.colwise_add(WI, V * state))
    scores = dy.transpose(U * h)
    if curr_sent_mask is not None:
        scores = curr_sent_mask.add_to_tensor_expr(scores, multiplicator=-100.0)
    normalized = dy.softmax(scores)
    self.attention_vecs.append(normalized)
    return normalized
def recurrence(self, xt, hmtm1, cmtm1, h_tilde_tm1, dropout_flag):
    """
    Recurrence function of an LSTM with truncated self-attention.
    :param xt: current input, shape: (n_in,)
    :param hmtm1: hidden memory [htm1, ..., h1], shape: (n_steps, n_out)
    :param cmtm1: cell memory, shape: (n_steps, n_out)
    :param h_tilde_tm1: previous hidden summary, shape: (n_out,)
    :param dropout_flag: whether to perform partial dropout
    :return: updated hidden memory, updated cell memory, current hidden summary
    """
    score = dy.concatenate([dy.dot_product(self.u, dy.tanh(
        self.W_h * hmtm1[i] + self.W_x * xt + self.W_htilde * h_tilde_tm1))
        for i in range(self.n_steps)])
    # normalize the attention score
    score = dy.softmax(score)  # shape: (n_steps,)
    h_tilde_t = dy.reshape(dy.transpose(score) * hmtm1, d=(self.n_out,))
    c_tilde_t = dy.transpose(score) * cmtm1
    Wx = self.W * xt
    if dropout_flag:
        # perform partial dropout over the lstm
        Wx = dy.dropout(Wx, self.dropout_rate)
    Uh = self.U * h_tilde_t  # shape: (4*n_out,)
    sum_item = Wx + Uh + self.b
    it = dy.logistic(sum_item[:self.n_out])
    ft = dy.logistic(sum_item[self.n_out:2 * self.n_out])
    ot = dy.logistic(sum_item[2 * self.n_out:3 * self.n_out])
    c_hat = dy.tanh(sum_item[3 * self.n_out:])
    ct = dy.cmult(ft, dy.reshape(c_tilde_t, d=(self.n_out,))) + dy.cmult(it, c_hat)
    ht = dy.cmult(ot, dy.tanh(ct))
    hmt = dy.concatenate([hmtm1[1:], dy.reshape(ht, (1, self.n_out))])
    cmt = dy.concatenate([cmtm1[1:], dy.reshape(ct, (1, self.n_out))])
    return hmt, cmt, h_tilde_t
def dycosine(query_vec, question_vec):
    num = dy.transpose(query_vec) * question_vec
    dem1 = dy.sqrt(dy.transpose(query_vec) * query_vec)
    dem2 = dy.sqrt(dy.transpose(question_vec) * question_vec)
    dem = dem1 * dem2
    return dy.cdiv(num, dem)
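# Usage sketch for dycosine: a quick sanity check against numpy on two made-up
# vectors (assumes only that dynet and numpy are installed; not part of the
# original code).
import numpy as np
import dynet as dy

dy.renew_cg()
q = dy.inputTensor(np.array([1.0, 2.0, 3.0]))
p = dy.inputTensor(np.array([0.5, 0.0, -1.0]))
cos = dycosine(q, p)
ref = np.dot([1.0, 2.0, 3.0], [0.5, 0.0, -1.0]) / (
    np.linalg.norm([1.0, 2.0, 3.0]) * np.linalg.norm([0.5, 0.0, -1.0]))
print(cos.npvalue()[0], ref)  # both approximately -0.5976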
def word_assoc_score(self, source_idx, target_idx, relation):
    """
    NOTE THAT DROPOUT IS BEING APPLIED HERE
    :param source_idx: embedding index of source atom
    :param target_idx: embedding index of target atom
    :param relation: relation type
    :return: score
    """
    # prepare
    s = self.embeddings[source_idx]
    if self.no_assoc:
        A = dy.const_parameter(self.word_assoc_weights[relation])
    else:
        A = dy.parameter(self.word_assoc_weights[relation])
    # dy.dropout returns a new expression, so re-assign it to actually apply dropout
    A = dy.dropout(A, self.dropout)
    t = self.embeddings[target_idx]

    # compute
    if self.mode == BILINEAR_MODE:
        return dy.transpose(s) * A * t
    elif self.mode == DIAG_RANK1_MODE:
        diag_A = dyagonalize(A[0])
        rank1_BC = A[1] * dy.transpose(A[2])
        ABC = diag_A + rank1_BC
        return dy.transpose(s) * ABC * t
    elif self.mode == TRANSLATIONAL_EMBED_MODE:
        return -dy.l2_norm(s - t + A)
    elif self.mode == DISTMULT:
        return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
def attend(self, H_e, h_t):
    H_e = dy.transpose(H_e)
    S = dy.transpose(h_t) * self.attention_weight.expr() * H_e
    S = dy.transpose(S)
    A = dy.softmax(S)
    context_vector = H_e * A
    return context_vector, A
def apply(self, sent1, sent2):
    eL = dy.parameter(self.linear)
    sent1 = dy.inputTensor(self.embedding.all_embeds_from_ix(sent1)) * eL
    sent2 = dy.inputTensor(self.embedding.all_embeds_from_ix(sent2)) * eL

    out1, out2 = self.feed_F(sent1, sent2)
    e_out = out1 * dy.transpose(out2)
    prob_f_1 = dy.softmax(e_out)
    score = dy.transpose(e_out)
    prob_f_2 = dy.softmax(score)

    sent1_allign = dy.concatenate_cols([sent1, prob_f_1 * sent2])
    sent2_allign = dy.concatenate_cols([sent2, prob_f_2 * sent1])

    out_g_1, out_g_2 = self.feed_G(sent1_allign, sent2_allign)
    sent1_out_g = dy.sum_dim(out_g_1, [0])
    sent2_out_g = dy.sum_dim(out_g_2, [0])
    concat = dy.transpose(dy.concatenate([sent1_out_g, sent2_out_g]))

    h_step_1 = dy.parameter(self.h_step_1)
    sent_h = dy.rectify(dy.dropout(concat, 0.2) * h_step_1)
    h_step_2 = dy.parameter(self.h_step_2)
    sent_h = dy.rectify(dy.dropout(sent_h, 0.2) * h_step_2)

    final = dy.parameter(self.linear2)
    final = dy.transpose(sent_h * final)
    return final
def __call__(self, sent1, sent2):
    """
    :param sent1: np matrix.
    :param sent2: np matrix.
    :return: np array of 3 elements.
    """
    sent1_linear, sent2_linear = self.apply_linear_embed(sent1, sent2)
    f1, f2 = self.apply_f(sent1_linear, sent2_linear)

    score1 = f1 * dy.transpose(f2)
    prob1 = dy.softmax(score1)
    score2 = dy.transpose(score1)
    prob2 = dy.softmax(score2)

    sent1_combine = dy.concatenate_cols([sent1_linear, prob1 * sent2_linear])
    sent2_combine = dy.concatenate_cols([sent2_linear, prob2 * sent1_linear])

    # sum
    g1, g2 = self.apply_g(sent1_combine, sent2_combine)
    sent1_output = dy.sum_dim(g1, [0])
    sent2_output = dy.sum_dim(g2, [0])

    input_combine = dy.transpose(dy.concatenate([sent1_output, sent2_output]))
    h = self.apply_h(input_combine)

    linear_final = dy.parameter(self.linear_final)
    h = h * linear_final

    output = dy.log_softmax(dy.transpose(h))
    return output
def scorer(self, q_d_hists, q_idf, bm25_score, overlap_features, p):
    """
    Makes all the calculations and returns a relevance score
    """
    idf_vec = dy.inputVector(q_idf)
    bm25_score = dy.scalarInput(bm25_score)
    overlap_features = dy.inputVector(overlap_features)

    # Pass each query term representation through the MLP
    term_scores = []
    for hist in q_d_hists:
        q_d_hist = dy.reshape(dy.inputVector(hist), (1, len(hist)))
        hidd_out = dy.rectify(q_d_hist * self.W_1 + self.b_1)
        for i in range(0, self.mlp_layers):
            hidd_out = dy.rectify(hidd_out * self.W_n[i] + self.b_n[i])
        term_scores.append(hidd_out * self.W_last + self.b_last)

    # Term Gating
    gating_weights = idf_vec * self.w_g

    bm25_feature = bm25_score * self.W_bm25 + self.b_bm25
    drop_out = dy.scalarInput(1)
    drop_num = (np.random.rand(1) < p) / p  # p = probability of keeping a unit active
    drop_out.set(drop_num)
    bm25_feature *= drop_out

    drmm_score = dy.transpose(dy.concatenate(term_scores)) * dy.reshape(gating_weights, (len(q_idf), 1))  # basic MLPs output
    doc_score = dy.transpose(dy.concatenate([drmm_score, overlap_features])) * self.W_scores + self.b_scores  # extra features layer
    return doc_score
def predict_sequence_batched(self, inputs, mask_array, wlen, predictFlag=False):
    batch_size = inputs[0].dim()[1]
    src_len = len(inputs)

    if not predictFlag:
        self.charlstm.set_dropouts(self.dropout, self.dropout)
        self.charlstm.set_dropout_masks(batch_size)

    char_fwd = self.charlstm.initial_state(batch_size)
    recur_states, cells = char_fwd.add_inputs(inputs, mask_array, predictFlag)

    hidden_states = []
    for idx in range(src_len):
        mask = dy.inputVector(mask_array[idx])
        mask_expr = dy.reshape(mask, (1,), batch_size)
        hidden_states.append(recur_states[idx] * mask_expr)

    H = dy.concatenate_cols(hidden_states)

    if predictFlag:
        a = dy.softmax(dy.transpose(self.W_atten.expr()) * H)
    else:
        # dropout attention connections (keep the same dim across the sequence)
        a = dy.softmax(dy.transpose(self.W_atten.expr()) * dy.dropout_dim(H, 1, self.dropout))

    cell_states = []
    for idx in range(batch_size):
        if wlen[idx] > 0:
            cell = dy.pick_batch_elem(cells[wlen[idx] - 1], idx)
        else:
            cell = dy.zeros(self.ldims)
        cell_states.append(cell)

    C = dy.concatenate_to_batch(cell_states)

    H_atten = H * dy.transpose(a)
    char_emb = dy.concatenate([H_atten, C])

    if predictFlag:
        proj_char_emb = dy.affine_transform([self.b_linear.expr(), self.W_linear.expr(), char_emb])
    else:
        proj_char_emb = dy.affine_transform([
            self.b_linear.expr(), self.W_linear.expr(),
            dy.dropout(char_emb, self.dropout)
        ])

    return proj_char_emb
def get_alpha_beta(E_matrix, F_sen1, F_sen2):
    alpha_softmax = dy.softmax(E_matrix)
    beta_softmax = dy.softmax(dy.transpose(E_matrix))

    beta = F_sen2 * dy.transpose(alpha_softmax)
    alpha = F_sen1 * dy.transpose(beta_softmax)
    return alpha, beta
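# Toy driver for get_alpha_beta: build E_matrix as plain dot-product
# similarities between two random sentence matrices. The sizes d, n1, n2 and
# the random inputs are illustrative assumptions, not values from the original.
import numpy as np
import dynet as dy

dy.renew_cg()
d, n1, n2 = 4, 3, 5
F_sen1 = dy.inputTensor(np.random.rand(d, n1))   # sentence 1, one column per token
F_sen2 = dy.inputTensor(np.random.rand(d, n2))   # sentence 2, one column per token
E_matrix = dy.transpose(F_sen1) * F_sen2         # (n1, n2) similarity scores
alpha, beta = get_alpha_beta(E_matrix, F_sen1, F_sen2)
print(alpha.dim(), beta.dim())                   # ((4, 5), 1) and ((4, 3), 1)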
def do_one_batch(X_batch, Z_batch):
    # Flatten the batch into 1-D vectors as a workaround
    batch_size = X_batch.shape[0]
    if DO_BATCH:
        X_batch_f = X_batch.flatten('F')
        Z_batch_f = Z_batch.flatten('F')
        x = dy.reshape(dy.inputVector(X_batch_f), (nmf, nframes), batch_size=batch_size)
        z = dy.reshape(dy.inputVector(Z_batch_f), (nvgg,), batch_size=batch_size)
        scnn.add_input([X_batch[i] for i in range(X_batch.shape[0])])
        vgg.add_input([Z_batch[i] for i in range(X_batch.shape[0])])
    else:
        x = dy.matInput(X_batch.shape[0], X_batch.shape[1])
        x.set(X_batch.flatten('F'))
        z = dy.vecInput(Z_batch.shape[0])
        z.set(Z_batch.flatten('F'))
        x = dy.reshape(dy.transpose(x, [1, 0]), (1, X_batch.shape[1], X_batch.shape[0]))
    print(x.npvalue().shape)

    a_h1 = dy.conv2d_bias(x, w_i, b_i, [1, 1], is_valid=False)
    h1 = dy.rectify(a_h1)
    h1_pool = dy.kmax_pooling(h1, D[1], d=1)

    a_h2 = dy.conv2d_bias(h1_pool, w_h1, b_h1, [1, 1], is_valid=False)
    h2 = dy.rectify(a_h2)
    h2_pool = dy.kmax_pooling(h2, D[2], d=1)

    a_h3 = dy.conv2d_bias(h2_pool, w_h2, b_h2, [1, 1], is_valid=False)
    h3 = dy.rectify(a_h3)
    h3_pool = dy.kmax_pooling(h3, D[3], d=1)

    h4 = dy.kmax_pooling(h3_pool, 1, d=1)
    h4_re = dy.reshape(h4, (J[3],))
    # print(h4_re.npvalue().shape)
    g = dy.scalarInput(1.)
    zem_sp = dy.weight_norm(h4_re, g)
    # print(zem_sp.npvalue().shape)

    zem_vgg = w_embed * z + b_embed
    # print(zem_vgg.npvalue().shape)

    sa = dy.transpose(zem_sp) * zem_vgg
    s = dy.rectify(sa)

    if PRINT_EMBED:
        print('Vgg embedding vector:', zem_vgg.npvalue().shape)
        print(zem_vgg.value())
        print('Speech embedding vector:', zem_sp.npvalue().shape)
        print(zem_sp.value())
    if PRINT_SIM:
        print('Raw Similarity:', sa.npvalue())
        print(sa.value())
        print('Similarity:', s.npvalue())
        print(s.value())

    return s
def _biaffine(self, x, W, y):
    x = dy.concatenate([x, dy.inputTensor(np.ones((1,), dtype=np.float32))])
    y = dy.concatenate([y, dy.inputTensor(np.ones((1,), dtype=np.float32))])
    nx, ny = self.input_dim + 1, self.input_dim + 1
    lin = dy.reshape(W * x, (ny, self.hidden_dim))
    blin = dy.transpose(dy.transpose(y) * lin)
    return blin
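# Standalone sketch of the bias-augmented biaffine product used by _biaffine
# above. The sizes and the freshly created parameter collection are
# illustrative assumptions, not the class's actual configuration.
import numpy as np
import dynet as dy

pc = dy.ParameterCollection()
input_dim, hidden_dim = 4, 3
W = pc.add_parameters(((input_dim + 1) * hidden_dim, input_dim + 1))

dy.renew_cg()
x = dy.inputTensor(np.random.rand(input_dim))
y = dy.inputTensor(np.random.rand(input_dim))
ones = dy.inputTensor(np.ones((1,), dtype=np.float32))
x1 = dy.concatenate([x, ones])                   # append the bias component
y1 = dy.concatenate([y, ones])
lin = dy.reshape(W.expr() * x1, (input_dim + 1, hidden_dim))
blin = dy.transpose(dy.transpose(y1) * lin)      # (hidden_dim, 1) biaffine scores
print(blin.dim())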
def __call__(self, embed_in, src_len, train=False, **kwargs):
    """Input shape: ((T, H), B) Output Shape: [((H,), B)] * T"""
    T = embed_in.dim()[0][0]
    embed_in = dy.transpose(embed_in)
    src_mask = sequence_mask(src_len, T)
    src_mask = [unsqueeze(m, 2) for m in src_mask]
    x = self.proj(embed_in)
    output = self.transformer(x, src_mask, train=train)
    output = [out for out in dy.transpose(output)]
    return TransformerEncoderOutput(output=output, src_mask=src_mask)
def entity_attend(self, H_e, h_e):
    H = dy.concatenate_cols(H_e)
    keys = self.key_weight.expr() * H
    query = self.query_weight.expr() * h_e
    values = self.value_weight.expr() * H
    S = dy.transpose(query) * keys
    A = dy.softmax(S)
    context_vectors = dy.cmult(values, A)
    return dy.transpose(context_vectors)
def forward(self, s1, s2, label=None):
    eL = dy.parameter(self.embeddingLinear)
    s1 = dy.inputTensor(s1) * eL
    s2 = dy.inputTensor(s2) * eL

    # F step
    Lf1 = dy.parameter(self.mlpF1)
    Fs1 = dy.rectify(dy.dropout(s1, 0.2) * Lf1)
    Fs2 = dy.rectify(dy.dropout(s2, 0.2) * Lf1)
    Lf2 = dy.parameter(self.mlpF2)
    Fs1 = dy.rectify(dy.dropout(Fs1, 0.2) * Lf2)
    Fs2 = dy.rectify(dy.dropout(Fs2, 0.2) * Lf2)

    # Attention scoring
    score1 = Fs1 * dy.transpose(Fs2)
    prob1 = dy.softmax(score1)
    score2 = dy.transpose(score1)
    prob2 = dy.softmax(score2)

    # Align pairs using attention
    s1Pairs = dy.concatenate_cols([s1, prob1 * s2])
    s2Pairs = dy.concatenate_cols([s2, prob2 * s1])

    # G step
    Lg1 = dy.parameter(self.mlpG1)
    Gs1 = dy.rectify(dy.dropout(s1Pairs, 0.2) * Lg1)
    Gs2 = dy.rectify(dy.dropout(s2Pairs, 0.2) * Lg1)
    Lg2 = dy.parameter(self.mlpG2)
    Gs1 = dy.rectify(dy.dropout(Gs1, 0.2) * Lg2)
    Gs2 = dy.rectify(dy.dropout(Gs2, 0.2) * Lg2)

    # Sum
    Ss1 = dy.sum_dim(Gs1, [0])
    Ss2 = dy.sum_dim(Gs2, [0])
    concatS12 = dy.transpose(dy.concatenate([Ss1, Ss2]))

    # H step
    Lh1 = dy.parameter(self.mlpH1)
    Hs = dy.rectify(dy.dropout(concatS12, 0.2) * Lh1)
    Lh2 = dy.parameter(self.mlpH2)
    Hs = dy.rectify(dy.dropout(Hs, 0.2) * Lh2)

    # Final layer
    final_layer = dy.parameter(self.final_layer)
    final = dy.transpose(Hs * final_layer)

    # Label can be 0...
    if label is not None:
        return dy.pickneglogsoftmax(final, label)
    else:
        out = dy.softmax(final)
        return np.argmax(out.npvalue())
def _multilayer_perceptron(self, x):
    g = self.non_lin
    layer_1 = g(dy.transpose(dy.transpose(x * self.weights['h1']) + self.biases['b1']))
    layer_2 = g(dy.transpose(dy.transpose(layer_1 * self.weights['h2']) + self.biases['b2']))
    out_layer = dy.softmax(dy.transpose(layer_2 * self.weights['out']) + self.biases['out'])
    return out_layer
def __call__(self, h, s):
    # hT -> ((L, h_dim), B), s -> ((s_dim, L), B)
    hT = dy.transpose(h)
    lin = self.U * s  # ((h_dim*n_label, L), B)
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = hT * lin
    if self.n_label == 1:
        return blin + (hT * self.B if self.bias else 0)
    else:
        return dy.transpose(blin) + (self.V * dy.concatenate([h, s]) + self.B if self.bias else 0)
def calc_loss(self, src, db_idx):
    src_embeddings = self.src_embedder.embed_sent(src)
    src_encodings = self.exprseq_pooling(self.src_encoder.transduce(src_embeddings))
    trg_encodings = self.encode_trg_example(self.database[db_idx])
    prod = dy.transpose(dy.transpose(src_encodings) * trg_encodings)
    loss = dy.sum_batches(dy.hinge_batch(prod, list(six.moves.range(len(db_idx)))))
    print(loss.npvalue())
    return loss
def cal_scores(self, src_encodings):
    src_len = len(src_encodings)
    src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size

    W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
    b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
    W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
    b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

    W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
    b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
    W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
    b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

    U_arc_1 = dy.parameter(self.U_arc_1)
    u_arc_2 = dy.parameter(self.u_arc_2)

    U_label_1 = [dy.parameter(x) for x in self.U_label_1]
    u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
    u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
    b_label = [dy.parameter(x) for x in self.b_label]

    h_arc_head = dy.rectify(dy.affine_transform(
        [b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings]))  # n_arc_mlp_units, src_len, bs
    h_arc_dep = dy.rectify(dy.affine_transform(
        [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
    h_label_head = dy.rectify(dy.affine_transform(
        [b_label_hidden_to_head, W_label_hidden_to_head, src_encodings]))
    h_label_dep = dy.rectify(dy.affine_transform(
        [b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings]))

    h_arc_head_transpose = dy.transpose(h_arc_head)
    h_label_head_transpose = dy.transpose(h_label_head)

    s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)

    s_label = []
    for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2, b_label):
        e1 = h_label_head_transpose * U_1 * h_label_dep
        e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
        e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
        s_label.append(e1 + e2 + e3 + b)
    return s_arc, s_label
def __call__(self, X):
    d_x = X.dim()[0][0]
    d_y = X.dim()[0][1]
    g = dy.ones((d_x, d_y))
    b = dy.zeros((d_x, d_y))
    Y = []
    for attention in self.attention:
        Y.append(attention(X))
    Y = dy.esum(Y)
    Y = dy.layer_norm(X + Y, g, b)
    Y = dy.layer_norm(Y + dy.transpose(self.feedforward(dy.transpose(Y))), g, b)
    return Y
def attend(sentence_a, sentence_b):
    similarity_scores = dy.transpose(sentence_a) * sentence_b
    logging.debug("Similarity Matrix size: " + str(similarity_scores.dim()))
    sentence_a_softmax = dy.softmax(similarity_scores)
    logging.debug("Sentence a softmax size: " + str(sentence_a_softmax.dim()))
    sentence_b_softmax = dy.softmax(dy.transpose(similarity_scores))
    logging.debug("Sentence b softmax size: " + str(sentence_b_softmax.dim()))
    sentence_b_attended = sentence_b * dy.transpose(sentence_a_softmax)
    sentence_a_attended = sentence_a * dy.transpose(sentence_b_softmax)
    return sentence_a_attended, sentence_b_attended
def self_encode_tags(self, tags):
    # Self attention for every tag:
    vectors = run_lstm(self.enc_tag_lstm.initial_state(), tags)
    tag_input_mat = dy.concatenate_cols(vectors)
    out_vectors = []
    for v1 in vectors:
        # tag_input_mat: [tag_emb x seqlen]
        # v1: [tag_emb]
        unnormalized = dy.transpose(dy.transpose(v1) * tag_input_mat)
        self_att_weights = dy.softmax(unnormalized)
        to_add = tag_input_mat * self_att_weights
        out_vectors.append(v1 + to_add)
    return out_vectors
def self_attend(self, H_e):
    H = dy.concatenate_cols(H_e)
    keys = self.key_weight.expr() * H
    queries = self.query_weight.expr() * H
    values = self.value_weight.expr() * H
    context_vectors = []
    for q in dy.transpose(queries):
        S = dy.transpose(dy.transpose(q) * keys)
        A = dy.softmax(S)
        context_vectors.append(values * A)
    # S = dy.transpose(h_e) * self.self_attention_weight.expr() * H
    # S = dy.transpose(S)
    # A = dy.softmax(S)
    # context_vectors.append(H * A)
    return context_vectors
def __call__(self, x, z=None, mask=None):
    h = self.h
    if z is None:
        Q = self.W_Q(x)
        K = self.W_K(x)
        V = self.W_V(x)
    else:
        Q = self.W_Q(x)
        K = self.W_K(z)
        V = self.W_V(z)

    (n_units, n_querys), batch = Q.dim()
    (_, n_keys), _ = K.dim()

    batch_Q = dy.concatenate_to_batch(self.split_rows(Q, h))
    batch_K = dy.concatenate_to_batch(self.split_rows(K, h))
    batch_V = dy.concatenate_to_batch(self.split_rows(V, h))

    assert batch_Q.dim() == ((n_units // h, n_querys), batch * h)
    assert batch_K.dim() == ((n_units // h, n_keys), batch * h)
    assert batch_V.dim() == ((n_units // h, n_keys), batch * h)

    mask = np.concatenate([mask] * h, axis=0)
    mask = np.moveaxis(mask, [1, 0, 2], [0, 2, 1])
    mask = dy.inputTensor(mask, batched=True)

    batch_A = (dy.transpose(batch_Q) * batch_K) * self.scale_score
    batch_A = dy.cmult(batch_A, mask) + (1 - mask) * MIN_VALUE

    sent_len = batch_A.dim()[0][0]
    if sent_len == 1:
        batch_A = dy.softmax(batch_A)
    else:
        batch_A = dy.softmax(batch_A, d=1)

    batch_A = dy.cmult(batch_A, mask)
    assert batch_A.dim() == ((n_querys, n_keys), batch * h)

    if self.attn_dropout:
        if self.dropout != 0.0:
            batch_A = dy.dropout(batch_A, self.dropout)

    batch_C = dy.transpose(batch_A * dy.transpose(batch_V))
    assert batch_C.dim() == ((n_units // h, n_querys), batch * h)

    C = dy.concatenate(self.split_batch(batch_C, h), d=0)
    assert C.dim() == ((n_units, n_querys), batch)

    C = self.finishing_linear_layer(C)
    return C
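# Single-head core of the multi-head block above, as a self-contained sketch:
# scaled dot-product attention without masking or batching. All sizes and
# inputs here are arbitrary assumptions for illustration.
import numpy as np
import dynet as dy

dy.renew_cg()
n_units, n_querys, n_keys = 4, 3, 5
Q = dy.inputTensor(np.random.rand(n_units, n_querys))
K = dy.inputTensor(np.random.rand(n_units, n_keys))
V = dy.inputTensor(np.random.rand(n_units, n_keys))
scale = 1.0 / (n_units ** 0.5)
A = dy.softmax((dy.transpose(Q) * K) * scale, d=1)   # (n_querys, n_keys), rows sum to 1
C = dy.transpose(A * dy.transpose(V))                # (n_units, n_querys), as batch_C above
print(C.dim())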
def word_repr(self, char_seq):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
        self.param_exprs['ugW%d' % wlen] = dy.parameter(self.params['update_gate_W'][wlen - 1])
        self.param_exprs['ugb%d' % wlen] = dy.parameter(self.params['update_gate_b'][wlen - 1])

    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars + self.param_exprs['rgb%d' % wlen])
    comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) + self.param_exprs['cb%d' % wlen]), chars])
    update_logits = self.param_exprs['ugW%d' % wlen] * comb + self.param_exprs['ugb%d' % wlen]

    update_gate = dy.transpose(dy.concatenate_cols(
        [dy.softmax(dy.pickrange(update_logits, i * (wlen + 1), (i + 1) * (wlen + 1)))
         for i in xrange(self.options['ndims'])]))

    # The following implementation of the softmax function is not safe, but faster...
    # exp_update_logits = dy.exp(dy.reshape(update_logits, (self.options['ndims'], wlen + 1)))
    # update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] * (wlen + 1)))
    # assert (not np.isnan(update_gate.npvalue()).any())

    word = dy.sum_cols(dy.cmult(update_gate, dy.reshape(comb, (self.options['ndims'], wlen + 1))))
    return word
def _forward(self, emissions):
    """Viterbi forward to calculate all path scores.

    :param emissions: List[dy.Expression]

    Returns:
        dy.Expression ((1,), B)
    """
    init_alphas = [-1e4] * self.n_tags
    init_alphas[self.start_idx] = 0

    alphas = dy.inputVector(init_alphas)
    transitions = self.transitions
    # len(emissions) == T
    for emission in emissions:
        add_emission = dy.colwise_add(transitions, emission)
        scores = dy.colwise_add(dy.transpose(add_emission), alphas)
        # dy.logsumexp takes a list of dy.Expression and computes logsumexp
        # elementwise across the lists, so for example the logsumexp is calculated
        # for [0] in each list. This means we want the transition scores for a
        # given tag to be in the columns.
        alphas = dy.logsumexp([x for x in scores])
    last_alpha = alphas + dy.pick(transitions, self.end_idx)
    alpha = dy.logsumexp([x for x in last_alpha])
    return alpha
def viterbi(emissions, transition, start_idx, end_idx, norm=False):
    n_tags = emissions[0].dim()[0][0]
    backpointers = []

    inits = [-1e4] * n_tags
    inits[start_idx] = 0
    alphas = dy.inputVector(inits)
    alphas = dy.log_softmax(alphas) if norm else alphas

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transition), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transition, end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    _ = best_path.pop()
    best_path.reverse()
    return best_path, path_score
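# Tiny decode with the viterbi function above: 3 real tags plus explicit
# <start>/<end> states and random scores. All sizes and values here are
# illustrative assumptions, not part of the original model.
import numpy as np
import dynet as dy

dy.renew_cg()
n_tags, T = 5, 4                      # tags 0-2 plus start=3, end=4
start_idx, end_idx = 3, 4
emissions = [dy.inputTensor(np.random.rand(n_tags)) for _ in range(T)]
# transition[i][j]: score of moving from tag j to tag i, matching the
# colwise_add orientation used above.
transition = dy.inputTensor(np.random.rand(n_tags, n_tags))
best_path, path_score = viterbi(emissions, transition, start_idx, end_idx)
print(best_path, path_score.value())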
def _attend(self, query, mask=None):
    query = unsqueeze(query, 0)  # ((1, H), B)
    # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    attn_scores = dy.transpose(query * self.context)
    if mask is not None:
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_src = dy.parameter(w1_att_src_p)
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component,
                                              w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
def transpose(x, dim1, dim2):
    """Swap dimensions `dim1` and `dim2`."""
    shape, _ = x.dim()
    dims = list(range(len(shape)))
    dims[dim1], dims[dim2] = dims[dim2], dims[dim1]
    return dy.transpose(x, dims=dims)
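# Example: swap the first and last axes of a (2, 3, 4) tensor with the helper
# above (the concrete shape is just an assumption for illustration).
import numpy as np
import dynet as dy

dy.renew_cg()
x = dy.inputTensor(np.random.rand(2, 3, 4))
y = transpose(x, 0, 2)
print(x.dim(), y.dim())   # ((2, 3, 4), 1) -> ((4, 3, 2), 1)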
def ergm_score(self):
    """
    :return: ERGM score (dynet Expression) computed based on ERGM weights and features only
    Does not populate any field
    """
    W = dy.parameter(self.ergm_weights)
    f = dy.transpose(dy.inputVector([self.feature_vals[k] for k in self.feature_set]))
    return f * W
def _attend(self, query, mask=None):
    # query ((H), B)
    # mask ((T, 1), B)
    projected_state = self.decoder * query  # ((H,), B)
    non_lin = dy.tanh(dy.colwise_add(self.context_proj, projected_state))  # ((H, T), B)
    attn_scores = dy.transpose(self.v * non_lin)  # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    if mask is not None:
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)  # ((T, 1), B)
def __call__(self, encoder_output, dst, train):
    embed_out_th_b = self.tgt_embedding.encode(dst)
    embed_out_ht_b = dy.transpose(embed_out_th_b)
    embed_out_ht_b = self.proj_to_hsz(embed_out_ht_b)
    context = dy.concatenate_cols(encoder_output.output)
    T = embed_out_ht_b.dim()[0][1]
    dst_mask = subsequent_mask(T)
    src_mask = encoder_output.src_mask
    output = self.transformer_decoder(embed_out_ht_b, context, src_mask, dst_mask, train)
    output = self.proj_to_dsz(output)
    return self.output(output)
def attend(input_mat, state, w1dt):
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)

    # input_mat: (encoder_state x seqlen) => input vecs concatenated as cols
    # w1dt: (attdim x seqlen)
    # w2dt: (attdim x attdim)
    w2dt = w2 * dy.concatenate(list(state.s()))

    # att_weights: (seqlen,) row vector
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)

    # context: (encoder_state)
    context = input_mat * att_weights
    return context
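# Shape-level sketch of the MLP attention above with the decoder state folded
# into a plain vector; every size and tensor here is a made-up stand-in, not
# the original model's parameters.
import numpy as np
import dynet as dy

dy.renew_cg()
enc_dim, att_dim, seq_len, dec_dim = 6, 4, 5, 6
input_mat = dy.inputTensor(np.random.rand(enc_dim, seq_len))  # encoder states as columns
w1dt = dy.inputTensor(np.random.rand(att_dim, seq_len))       # stands in for W1 * input_mat
w2 = dy.inputTensor(np.random.rand(att_dim, dec_dim))
v = dy.inputTensor(np.random.rand(1, att_dim))
state_vec = dy.inputTensor(np.random.rand(dec_dim))           # stands in for concatenate(state.s())

w2dt = w2 * state_vec
att_weights = dy.softmax(dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt))))
context = input_mat * att_weights                             # (enc_dim, 1) context vector
print(context.dim())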
def calc_loss(sents):
    dy.renew_cg()

    src_fwd = LSTM_SRC_FWD.initial_state()
    src_bwd = LSTM_SRC_BWD.initial_state()
    trg_fwd = LSTM_TRG_FWD.initial_state()
    trg_bwd = LSTM_TRG_BWD.initial_state()

    # Encoding
    src_reps = encode_sents(LOOKUP_SRC, src_fwd, src_bwd, [src for src, trg in sents])
    trg_reps = encode_sents(LOOKUP_TRG, trg_fwd, trg_bwd, [trg for src, trg in sents])

    # Concatenate the sentence representations to a single matrix
    mtx_src = dy.concatenate_cols(src_reps)
    mtx_trg = dy.concatenate_cols(trg_reps)

    # Do matrix multiplication to get a matrix of dot product similarity scores
    sim_mtx = dy.transpose(mtx_src) * mtx_trg

    # Calculate the hinge loss over all dimensions
    loss = dy.hinge_dim(sim_mtx, list(range(len(sents))), d=1)

    return dy.sum_elems(loss)
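# Toy version of the similarity-matrix hinge loss above: three made-up
# source/target vector pairs instead of LSTM sentence encodings (the dimensions
# and random values are assumptions).
import numpy as np
import dynet as dy

dy.renew_cg()
n_pairs, dim = 3, 4
src_reps = [dy.inputTensor(np.random.rand(dim)) for _ in range(n_pairs)]
trg_reps = [dy.inputTensor(np.random.rand(dim)) for _ in range(n_pairs)]
mtx_src = dy.concatenate_cols(src_reps)
mtx_trg = dy.concatenate_cols(trg_reps)
sim_mtx = dy.transpose(mtx_src) * mtx_trg            # (n_pairs, n_pairs) dot products
# Each row should score its own pair highest; hinge_dim penalizes violations.
loss = dy.sum_elems(dy.hinge_dim(sim_mtx, list(range(n_pairs)), d=1))
print(loss.value())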
def squeeze_and_transpose(x):
    return dy.transpose(squeeze(x))
def encode(self, embed_list):
    embed_list = dy.transpose(dy.concatenate_cols(embed_list))
    return [self.output(out) for out in self.encoder(embed_list, self.train)]
def output(self, x):
    return [self.preds(y) for y in dy.transpose(x)]