def build_tagging_graph(self, sentence):
    dy.renew_cg()

    embeddings = [self.word_rep(w) for w in sentence]
    lstm_out = self.bi_lstm.transduce(embeddings)

    H = dy.parameter(self.lstm_to_tags_params)
    Hb = dy.parameter(self.lstm_to_tags_bias)
    O = dy.parameter(self.mlp_out)
    Ob = dy.parameter(self.mlp_out_bias)

    scores = []
    if options.bigram:
        for rep, word in zip(lstm_out, sentence):
            bi1 = dy.lookup(self.bigram_lookup, word[0], update=self.we_update)
            bi2 = dy.lookup(self.bigram_lookup, word[1], update=self.we_update)
            if self.dropout is not None:
                bi1 = dy.dropout(bi1, self.dropout)
                bi2 = dy.dropout(bi2, self.dropout)
            score_t = O * dy.tanh(H * dy.concatenate([bi1, rep, bi2]) + Hb) + Ob
            scores.append(score_t)
    else:
        for rep in lstm_out:
            score_t = O * dy.tanh(H * rep + Hb) + Ob
            scores.append(score_t)

    return scores
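# Hedged usage sketch (not from the original source): one way the per-word
# score vectors from build_tagging_graph above could feed a training step.
# `tagger`, `trainer`, and `gold_tags` are assumed names.
import dynet as dy

def train_on_sentence(tagger, trainer, sentence, gold_tags):
    scores = tagger.build_tagging_graph(sentence)  # one score vector per word
    loss = dy.esum([dy.pickneglogsoftmax(s, t)
                    for s, t in zip(scores, gold_tags)])
    loss_value = loss.value()  # runs the forward pass
    loss.backward()            # backpropagate through the graph
    trainer.update()           # apply the gradient update
    return loss_value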
def encode(self, w, o, s):
    k = 5
    suffixes, prefixes = [], []
    for i in range(1, k + 1):
        pre, suf = w[:i], w[-i:]
        pre_idx = self.P2I[pre] if pre in self.P2I else self.P2I["<unk>"]
        # bug fix: look the suffix up in the suffix vocabulary
        # (the original indexed S2I with the prefix string)
        suf_idx = self.S2I[suf] if suf in self.S2I else self.S2I["<unk>"]
        suf_e = dy.lookup(self.E_suf, suf_idx)
        pre_e = dy.lookup(self.E_pre, pre_idx)
        suffixes.append(suf_e)
        prefixes.append(pre_e)

    word_encoded = self.W2I[w] if w in self.W2I else self.W2I["<unk>"]
    word_e = dy.lookup(self.E, word_encoded)

    exp_out = dy.vecInput(EMBEDDING_SIZE)
    if o == []:
        o = ["<unk>"]
    for out_token in o:
        out_token_encoded = self.OUTPUT2IND[out_token] \
            if out_token in self.OUTPUT2IND else self.OUTPUT2IND["<unk>"]
        out_embedding = dy.lookup(self.E_output, out_token_encoded)
        exp_out = exp_out + out_embedding

    W = dy.parameter(self.W)
    # bug fix: use the accumulated output-embedding sum (exp_out),
    # not only the embedding of the last output token
    return W * dy.concatenate(
        [word_e, dy.esum(suffixes), dy.esum(prefixes), exp_out])
def print_probs(self, sent):
    dy.renew_cg()
    # initialize the RNN
    init_state = self.builder.initial_state()
    # parameters -> expressions
    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)

    # get the character ids for each step
    cids = [vocab.w2i[w] for w in sent]

    # start the rnn with the first character (e.g. "<s>")
    s = init_state.add_input(dy.lookup(self.lookup, cids[0]))

    # feed char vectors into the RNN and print the loss of the next char
    for cid in cids[1:]:
        score = dy.affine_transform([bias, R, s.output()])
        loss = dy.pickneglogsoftmax(score, cid)
        print(f"{vocab.i2w[cid]} {loss.value()}")
        # update the state of the RNN
        cemb = dy.lookup(self.lookup, cid)
        s = s.add_input(cemb)
def evaluate_recurrent(self, fwd_bigrams, unigrams, test=False):
    fwd1 = self.fwd_lstm1.initial_state()
    back1 = self.back_lstm1.initial_state()
    fwd2 = self.fwd_lstm2.initial_state()
    back2 = self.back_lstm2.initial_state()

    fwd_input = []
    for i in range(len(unigrams)):
        bivec = dynet.lookup(self.bigram_embed, fwd_bigrams[i])
        univec = dynet.lookup(self.unigram_embed, unigrams[i])
        vec = dynet.concatenate([bivec, univec])
        # fwd_input.append(dynet.tanh(self.embed2lstm_W*vec))
        fwd_input.append(vec)

    back_input = []
    for i in range(len(unigrams)):
        bivec = dynet.lookup(self.bigram_embed, fwd_bigrams[i + 1])
        univec = dynet.lookup(self.unigram_embed, unigrams[i])
        vec = dynet.concatenate([bivec, univec])
        # back_input.append(dynet.tanh(self.embed2lstm_W*vec))
        back_input.append(vec)

    fwd1_out = []
    for vec in fwd_input:
        fwd1 = fwd1.add_input(vec)
        fwd1_out.append(fwd1.output())

    back1_out = []
    for vec in reversed(back_input):
        # bug fix: carry the LSTM state forward; the original discarded the
        # new state, feeding every input to the initial state
        back1 = back1.add_input(vec)
        back1_out.append(back1.output())

    lstm2_input = []
    for (f, b) in zip(fwd1_out, reversed(back1_out)):
        lstm2_input.append(dynet.concatenate([f, b]))

    fwd2_out = []
    for vec in lstm2_input:
        if self.droprate > 0 and not test:
            vec = dynet.dropout(vec, self.droprate)
        fwd2 = fwd2.add_input(vec)
        fwd2_out.append(fwd2.output())

    back2_out = []
    for vec in reversed(lstm2_input):
        if self.droprate > 0 and not test:
            vec = dynet.dropout(vec, self.droprate)
        back2 = back2.add_input(vec)
        back2_out.append(back2.output())

    # fwd_out = [dynet.concatenate([f1,f2]) for (f1,f2) in zip(fwd1_out,fwd2_out)]
    # back_out = [dynet.concatenate([b1,b2]) for (b1,b2) in zip(back1_out,back2_out)]

    return fwd2_out, back2_out[::-1]
def evaluate_recurrent(self, word_inds, tag_inds, test=False):
    fwd1 = self.fwd_lstm1.initial_state()
    back1 = self.back_lstm1.initial_state()
    fwd2 = self.fwd_lstm2.initial_state()
    back2 = self.back_lstm2.initial_state()

    sentence = []
    for (w, t) in zip(word_inds, tag_inds):
        wordvec = dynet.lookup(self.word_embed, w)
        tagvec = dynet.lookup(self.tag_embed, t)
        sentence.append(dynet.concatenate([wordvec, tagvec]))

    fwd1_out = []
    for vec in sentence:
        fwd1 = fwd1.add_input(vec)
        fwd1_out.append(fwd1.output())

    back1_out = []
    for vec in reversed(sentence):
        back1 = back1.add_input(vec)
        back1_out.append(back1.output())

    lstm2_input = []
    for (f, b) in zip(fwd1_out, reversed(back1_out)):
        lstm2_input.append(dynet.concatenate([f, b]))

    fwd2_out = []
    for vec in lstm2_input:
        if self.droprate > 0 and not test:
            vec = dynet.dropout(vec, self.droprate)
        fwd2 = fwd2.add_input(vec)
        fwd2_out.append(fwd2.output())

    back2_out = []
    for vec in reversed(lstm2_input):
        if self.droprate > 0 and not test:
            vec = dynet.dropout(vec, self.droprate)
        back2 = back2.add_input(vec)
        back2_out.append(back2.output())

    fwd_out = [
        dynet.concatenate([f1, f2]) for (f1, f2) in zip(fwd1_out, fwd2_out)
    ]
    back_out = [
        dynet.concatenate([b1, b2]) for (b1, b2) in zip(back1_out, back2_out)
    ]

    return fwd_out, back_out[::-1]
def __call__(self, query, options, gold, lengths, query_no):
    if len(options) == 1:
        return None, 0

    final = []
    if args.word_vectors:
        qvecs = [dy.lookup(self.pEmbedding, w) for w in query]
        qvec_max = dy.emax(qvecs)
        qvec_mean = dy.average(qvecs)
    for otext, features in options:
        if not args.no_features:
            inputs = dy.inputTensor(features)
        if args.word_vectors:
            ovecs = [dy.lookup(self.pEmbedding, w) for w in otext]
            ovec_max = dy.emax(ovecs)
            ovec_mean = dy.average(ovecs)
            if args.no_features:
                inputs = dy.concatenate(
                    [qvec_max, qvec_mean, ovec_max, ovec_mean])
            else:
                inputs = dy.concatenate(
                    [inputs, qvec_max, qvec_mean, ovec_max, ovec_mean])
        if args.drop > 0:
            inputs = dy.dropout(inputs, args.drop)
        h = inputs
        for pH, pB in zip(self.hidden, self.bias):
            h = dy.affine_transform([pB, pH, h])
            if args.nonlin == "linear":
                pass
            elif args.nonlin == "tanh":
                h = dy.tanh(h)
            elif args.nonlin == "cube":
                h = dy.cube(h)
            elif args.nonlin == "logistic":
                h = dy.logistic(h)
            elif args.nonlin == "relu":
                h = dy.rectify(h)
            elif args.nonlin == "elu":
                h = dy.elu(h)
            elif args.nonlin == "selu":
                h = dy.selu(h)
            elif args.nonlin == "softsign":
                h = dy.softsign(h)
            elif args.nonlin == "swish":
                h = dy.cmult(h, dy.logistic(h))
        final.append(dy.sum_dim(h, [0]))

    final = dy.concatenate(final)
    nll = -dy.log_softmax(final)
    dense_gold = []
    for i in range(len(options)):
        dense_gold.append(1.0 / len(gold) if i in gold else 0.0)
    answer = dy.inputTensor(dense_gold)
    loss = dy.transpose(answer) * nll
    predicted_link = np.argmax(final.npvalue())
    return loss, predicted_link
def load_src_lookup_params(self, src_vectors_file, model):
    self.src_lookup = model.add_lookup_parameters(
        (self.src_vocab_size, self.embed_size))
    pickle_fn = 'src_lookup_vectors.pkl'
    print('Loading source vectors as lookup parameters')
    count = 0
    frozen_params = defaultdict(lambda: False)
    if not os.path.exists(pickle_fn):
        init_array = np.zeros((self.src_vocab_size, self.embed_size))
        with open(src_vectors_file) as vector_file:
            first_line = True
            for l in vector_file:
                if first_line:
                    first_line = False
                else:
                    try:
                        space_delim = l.split()
                        word = space_delim[0]
                        w_id = int(self.src_token_to_id[word])
                        if w_id != 0:
                            init_array[w_id, :] = np.asarray(space_delim[1:])
                            count += 1
                            frozen_params[w_id] = True
                    except Exception as e:
                        print('Error:{0}, {1}'.format(e, l))
        with open(pickle_fn, 'wb') as pickle_file:
            pickle.dump(init_array, pickle_file)
        for i in range(self.src_vocab_size):
            if not np.any(init_array[i, :]):
                expr = dy.lookup(self.src_lookup, i)
                init_array[i, :] = expr.npvalue()
                frozen_params[i] = False
    else:
        with open(pickle_fn, 'rb') as pickle_file:
            init_array = pickle.load(pickle_file)
        for i in range(self.src_vocab_size):
            if not np.any(init_array[i, :]):
                expr = dy.lookup(self.src_lookup, i)
                init_array[i, :] = expr.npvalue()
                frozen_params[i] = False
            else:
                count += 1
                frozen_params[i] = True
    print('Set: {0} vectors out of vocab size: {1}'.format(
        count, self.src_vocab_size))
    self.src_lookup.init_from_array(init_array)
    return frozen_params
def get_tok_embedding(self, tok):
    tok_embedding = dy.concatenate([
        dy.lookup(self.word_lookup, self.w2i_raw[tok[0]]),
        dy.lookup(self.pretrained_lookup, self.w2i_pretrained[tok[1]],
                  update=False),
        dy.lookup(self.unked_lookup, self.w2i_unked[tok[2]]),
        self.pos_lookup[self.w2i_pos[tok[3]]]
    ])
    return tok_embedding
def represent(self, input):
    representations = []
    for word in input:
        w_r = dy.lookup(self.E, word)
        p_r = dy.lookup(
            self.Epre, self.Wp2I[word_to_prefix(self.index_to_word(word))])
        s_r = dy.lookup(
            self.Esuf, self.Ws2I[word_to_suffix(self.index_to_word(word))])
        representations.append(w_r + p_r + s_r)
    return representations
def train(self, inputs, target):
    dropout = self.Config.train.dropout
    uts = []
    for u in inputs:
        u = Utt(u)
        u.words_emb = []
        for word in u.words:
            u.words_emb.append(
                dy.dropout(
                    dy.lookup(
                        self.input_lookup,
                        word,
                        update=word < 4 + self.Config.data.oov_size),
                    dropout))
        self.encode_words(u)
        uts.append(u)

    # last_output_embeddings = self.lookup(self.Config.data.START_ID, emb)
    last_output_embeddings = self.input_lookup[self.Config.data.START_ID]
    s = self.sess_lstm.initial_state().add_input(
        dy.concatenate([
            dy.vecInput(self.Config.model.num_units), last_output_embeddings
        ]))

    loss = []
    for gt in target:
        spt = dy.concatenate(list(s.s()))
        l = self.utt_lstm.initial_state_from_raw_vectors([
            np.random.normal(0, 0.1, self.Config.model.num_units)
            for i in range(1 * self.Config.model.num_layers)
        ])
        lpt = dy.concatenate(list(l.s()))
        # encode utterances from last to first with word-level attention
        for i in range(len(uts) - 1, -1, -1):
            self.get_word_att(uts[i], lpt, spt)
            l = l.add_input(uts[i].context)
            uts[i].utt_enc = l.output()
            lpt = dy.concatenate(list(l.s()))
        # decode one step with utterance-level attention
        c = self.get_utt_att(uts, spt)
        s = s.add_input(dy.concatenate([c, last_output_embeddings]))
        probs = dy.softmax(self.decoder_w * s.output() + self.decoder_b)
        # last_output_embeddings = self.lookup(gt, emb)
        last_output_embeddings = dy.lookup(
            self.input_lookup,
            gt,
            update=gt < 4 + self.Config.data.oov_size)
        loss.append(-dy.log(dy.pick(probs, gt)))

    loss = dy.esum(loss)
    return loss
def greedy_search(self, char_seq, truth=None, mu=0.):
    init_state = self.params['lstm'].initial_state().add_input(
        self.param_exprs['<bos>'])
    init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() +
                     self.param_exprs['pb'])
    init_score = dy.scalarInput(0.)
    init_sentence = Sentence(score=init_score.scalar_value(),
                             score_expr=init_score,
                             LSTMState=init_state,
                             y=init_y,
                             prevState=None,
                             wlen=None,
                             golden=True)

    if truth is not None:
        cembs = [
            dy.dropout(dy.lookup(self.params['embed'], char),
                       self.options['dropout_rate']) for char in char_seq
        ]
    else:
        cembs = [dy.lookup(self.params['embed'], char) for char in char_seq]

    agenda = [init_sentence]

    # from left to right, character by character
    for idx, _ in enumerate(char_seq, 1):
        now = None
        for wlen in range(1, min(idx, self.options['max_word_len']) + 1):
            # generate a word candidate vector and join it to the
            # segmentation ending right before this word
            word = self.word_repr(char_seq[idx - wlen:idx],
                                  cembs[idx - wlen:idx])
            sent = agenda[idx - wlen]

            if truth is not None:
                word = dy.dropout(word, self.options['dropout_rate'])

            word_score = dy.dot_product(word, self.param_exprs['U'])

            if truth is not None:
                golden = sent.golden and truth[idx - 1] == wlen
                margin = dy.scalarInput(
                    mu * wlen if truth[idx - 1] != wlen else 0.)
                score = margin + sent.score_expr + \
                    dy.dot_product(sent.y, word) + word_score
            else:
                golden = False
                score = sent.score_expr + \
                    dy.dot_product(sent.y, word) + word_score

            good = (now is None or now.score < score.scalar_value())
            if golden or good:
                new_state = sent.LSTMState.add_input(word)
                new_y = dy.tanh(self.param_exprs['pW'] * new_state.output() +
                                self.param_exprs['pb'])
                new_sent = Sentence(score=score.scalar_value(),
                                    score_expr=score,
                                    LSTMState=new_state,
                                    y=new_y,
                                    prevState=sent,
                                    wlen=wlen,
                                    golden=golden)
                if good:
                    now = new_sent
                if golden:
                    golden_sent = new_sent

        agenda.append(now)
        # early update: stop as soon as the gold segmentation is no longer
        # the best hypothesis
        if truth is not None and truth[idx - 1] > 0 and not now.golden:
            return now.score_expr - golden_sent.score_expr

    if truth is not None:
        return now.score_expr - golden_sent.score_expr

    return agenda
def step(self, instance):
    dy.renew_cg()

    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    # W1_att = dy.parameter(self.W1_att)
    # w2_att = dy.parameter(self.w2_att)

    src_sent, tgt_sent = instance
    src_sent_rev = list(reversed(src_sent))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(src_sent, src_sent_rev):
        l2r_state = l2r_state.add_input(
            dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r]))
        r2l_state = r2l_state.add_input(
            dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l]))
        l2r_contexts.append(l2r_state.output())  # [<S>, x_1, x_2, ..., </S>]
        r2l_contexts.append(r2l_state.output())  # [</S>, x_n, x_{n-1}, ..., <S>]
    r2l_contexts.reverse()  # [<S>, x_1, x_2, ..., </S>]

    # Combine the left and right representations for every word
    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)

    losses = []
    num_words = 0

    # Decoder
    c_t = dy.vecInput(self.hidden_size * 2)
    start = dy.concatenate(
        [dy.lookup(self.tgt_lookup, self.tgt_token_to_id['<S>']), c_t])
    dec_state = self.dec_builder.initial_state().add_input(start)
    for (cw, nw) in zip(tgt_sent, tgt_sent[1:]):
        h_e = dec_state.output()
        c_t = self.__attention_mlp(h_fs_matrix, h_e)
        # Get the embedding for the current target word
        embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
        # Create input vector to the decoder
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        y_star = dy.softmax(b_y + W_y * dec_state.output())
        loss = -dy.log(dy.pick(y_star, self.tgt_token_to_id[nw]))
        losses.append(loss)
        num_words += 1

    return dy.esum(losses), num_words
def __call__(self, sequence):
    next_input = [
        dy.lookup(self._E, self._W2I[i]) if i in self._W2I
        else dy.lookup(self._E, self._W2I["UNK"]) for i in sequence
    ]
    for layer in self._stacks[0:-1]:
        output = layer(next_input)
        # dense connection: feed each layer the concatenation of its
        # input and its output
        next_input = [
            dy.concatenate([next_input[i], output[i]])
            for i in range(len(sequence))
        ]
    output = self._stacks[-1](next_input)
    exp_output = dy.concatenate_cols(output)
    v = dy.kmax_pooling(exp_output, 1, d=1)
    return v
def represent(self, seq):
    output_vec = []
    s0 = self.builder.initial_state()
    for word in seq:
        word_as_char_vec = [
            dy.lookup(self.embed, self.c2i[ci]) if ci in self.c2i
            else dy.lookup(self.embed, self.c2i[self.c_unk]) for ci in word
        ]
        # apply the character LSTM and take its last output
        word_output = s0.transduce(word_as_char_vec)[-1]
        output_vec.append(word_output)
    return output_vec
def evaluate(self, input_sentences, labels):
    dy.renew_cg()
    self.word_rnn.disable_dropout()
    self.sent_rnn.disable_dropout()

    embed_sents = []
    for input_sentence in input_sentences:
        input_sentence = self._preprocess_input(input_sentence,
                                                self.word_to_ix)
        #input_sentence = [self.word_to_ix['<start>']] + input_sentence + [self.word_to_ix['<end>']]
        embed_words = self._embed_sentence(input_sentence)
        word_rnn_outputs = self._run_rnn(self.word_rnn, embed_words)
        sent_embed = dy.average(word_rnn_outputs)
        embed_sents.append(sent_embed)

    rnn_outputs = self._run_rnn(self.sent_rnn, embed_sents)

    doc_output_w = dy.parameter(self.doc_output_w)
    doc_output_b = dy.parameter(self.doc_output_b)
    doc_output = dy.tanh(doc_output_w * dy.average(rnn_outputs) +
                         doc_output_b)

    sum_output = dy.zeros(self.args.sent_hidden_dim)
    pred_labels = []
    correct = 0
    total = 0
    loss = dy.zeros(1)
    for i, rnn_output in enumerate(rnn_outputs):
        abspos_embed = dy.lookup(self.abspos_embeddings, self.abspos_ix[i])
        relpos_embed = dy.lookup(self.relpos_embeddings, self.relpos_ix[i])
        prob = self._get_probs(rnn_output, doc_output, sum_output,
                               abspos_embed, relpos_embed)
        sum_output += dy.cmult(prob, rnn_output)
        pred_label = self._predict(prob)
        pred_labels.append(pred_label)
        if pred_label == labels[i]:
            correct += 1
        total += 1
        if labels[i] == 1:
            loss -= dy.log(prob)
        else:
            loss -= dy.log(dy.scalarInput(1) - prob)

    return loss.value(), pred_labels, correct, total
def expr_for_tree(self, root, tree, WS, US, UFS, BS):
    nodes, edges = tree['nodes'], tree['edges']
    if len(edges[root]) > 2:
        raise RuntimeError(
            'Tree structure error: only binary trees are supported.')
    node_type = nodes[root]['type']
    if node_type == 'terminal':
        raise RuntimeError('Tree structure error: meet with leaves')
    if node_type == 'preterminal':
        terminal_id = edges[root][0]
        terminal = nodes[terminal_id]['name']
        try:
            idx = self.voc2id[terminal]
        except KeyError:
            idx = self.voc2id['UNK']
        emb = dy.lookup(self.terminal_lp, idx)
        Wi, Wo, Wu = WS
        bi, bo, bu, _ = BS
        i = dy.logistic(dy.affine_transform([bi, Wi, emb]))
        o = dy.logistic(dy.affine_transform([bo, Wo, emb]))
        u = dy.tanh(dy.affine_transform([bu, Wu, emb]))
        c = dy.cmult(i, u)
        h = dy.cmult(o, dy.tanh(c))
    else:
        nonterminal = nodes[root]['name']
        try:
            idx = self.tree2id[nonterminal]
        except KeyError:
            idx = self.tree2id['UNK']
        emb = dy.lookup(self.nonterminal_lp, idx)
        e1, c1 = self.expr_for_tree(edges[root][0], tree, WS, US, UFS, BS)
        e2, c2 = self.expr_for_tree(edges[root][1], tree, WS, US, UFS, BS)
        Ui, Uo, Uu = US
        Uf1, Uf2 = UFS
        bi, bo, bu, bf = BS
        e = dy.concatenate([emb, e1, e2])
        i = dy.logistic(dy.affine_transform([bi, Ui, e]))
        o = dy.logistic(dy.affine_transform([bo, Uo, e]))
        f1 = dy.logistic(dy.affine_transform([bf, Uf1, e]))
        f2 = dy.logistic(dy.affine_transform([bf, Uf2, e]))
        u = dy.tanh(dy.affine_transform([bu, Uu, e]))
        c = dy.cmult(i, u) + dy.cmult(f1, c1) + dy.cmult(f2, c2)
        h = dy.cmult(o, dy.tanh(c))
    if self.DROPOUT > 0:
        # bug fix: dy.dropout returns a new expression; assign it back
        h = dy.dropout(h, self.DROPOUT)
    return h, c
def batch_loss(self, batch, train=True):
    # load the parameters
    W_hid = dy.parameter(self.W_hid)
    b_hid = dy.parameter(self.b_hid)
    W_out = dy.parameter(self.W_out)

    losses = []
    for _, sent in batch:
        for i in range(1, len(sent)):
            # task 6: 4-gram context, falling back to the sentence-final
            # token when a history word is out of range
            if i == 1:
                prev_word2_ix = sent[len(sent) - 1]
                prev_word3_ix = sent[len(sent) - 1]
            elif i == 2:
                # bug fix: this must be elif; in the original, the else
                # branch also ran for i == 1 and overwrote its history
                prev_word2_ix = sent[i - 2]
                prev_word3_ix = sent[len(sent) - 1]
            else:
                prev_word2_ix = sent[i - 2]
                prev_word3_ix = sent[i - 3]
            prev_word_ix = sent[i - 1]
            curr_word_ix = sent[i]

            ctx1 = dy.lookup(self.embed, prev_word_ix)
            ctx2 = dy.lookup(self.embed, prev_word2_ix)
            ctx3 = dy.lookup(self.embed, prev_word3_ix)
            ctx = dy.concatenate([ctx3, ctx2, ctx1])

            # hid is the hidden layer output, size=hidden_size
            # compute b_hid + W_hid * ctx, but faster
            hid = dy.affine_transform([b_hid, W_hid, ctx])
            hid = dy.tanh(hid)
            # out is the prediction of the next word, size=vocab_size
            out = W_out * hid

            # Interpretation: the model estimates that
            #   log P(curr_word=k | context) ~ out[k]
            # in other words,
            #   P(curr_word=k | context) = exp(out[k]) / sum_j exp(out[j])
            #                            = softmax(out)[k]
            # We want to maximize the probability of the correct word
            # (equivalently, minimize the negative log-probability).
            loss = dy.pickneglogsoftmax(out, curr_word_ix)
            losses.append(loss)

    # esum simply adds up the expressions in the list
    return dy.esum(losses)
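# Hedged evaluation sketch (assumed names: `lm` exposes batch_loss above,
# `dev_data` is a list of batches): reports average per-token negative
# log-likelihood and the corresponding perplexity.
import math
import dynet as dy

def dev_nll(lm, dev_data):
    total_loss, total_words = 0.0, 0
    for batch in dev_data:
        dy.renew_cg()
        loss = lm.batch_loss(batch, train=False)
        total_loss += loss.value()
        total_words += sum(len(sent) - 1 for _, sent in batch)
    nll = total_loss / total_words
    print('per-word NLL: %.4f, perplexity: %.2f' % (nll, math.exp(nll)))
    return nll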
def translate_sentence(self, sent):
    dy.renew_cg()

    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    # W1_att = dy.parameter(self.W1_att)
    # w2_att = dy.parameter(self.w2_att)

    sent_rev = list(reversed(sent))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(sent, sent_rev):
        l2r_state = l2r_state.add_input(
            dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r]))
        r2l_state = r2l_state.add_input(
            dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l]))
        l2r_contexts.append(l2r_state.output())
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)

    # Decoder
    trans_sentence = ['<S>']
    cw = trans_sentence[-1]
    c_t = dy.vecInput(self.hidden_size * 2)
    start = dy.concatenate(
        [dy.lookup(self.tgt_lookup, self.tgt_token_to_id['<S>']), c_t])
    dec_state = self.dec_builder.initial_state().add_input(start)
    while len(trans_sentence) < self.max_len:
        h_e = dec_state.output()
        c_t = self.__attention_mlp(h_fs_matrix, h_e)
        embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        y_star = b_y + W_y * dec_state.output()
        p = dy.softmax(y_star)
        cw = self.tgt_id_to_token[np.argmax(p.npvalue())]
        print(np.max(p.npvalue()))  # fixed: Python 3 print
        if cw == '</S>':
            break
        trans_sentence.append(cw)

    return ' '.join(trans_sentence[1:])
def encode(self, pend_encs, pend_ids, head, dep, irel=None):
    dep_enc = pend_encs[dep].output
    rel_enc = dy.lookup(self.REL_LOOKUP, irel) if self.rel_feat else None
    if self.dist_feat:
        dist = pend_ids[head] - pend_ids[dep] - self.dist_min
        index = self.neg_unk if dist < 0 \
            else (self.pos_unk if dist > self.dist_range else dist)
        dist_enc = dy.lookup(self.DIST_LOOKUP, index)
    else:
        dist_enc = None
    # drop the disabled (None) features before concatenating; a list
    # comprehension is used because Python 3's filter returns an iterator,
    # which dy.concatenate does not accept
    feat_emb = dy.tanh(self.transW * dy.concatenate(
        [e for e in [dep_enc, rel_enc, dist_enc] if e is not None]))
    return LSTMState(feat_emb, pend_encs[dep].memory_cell)
def represent(self, seq):
    word_r = super(SubWordRepresentation, self).represent(seq)
    pref_r = [
        dy.lookup(self.embed, self.w2i[self.pref_flag + w[:3]])
        if self.pref_flag + w[:3] in self.w2i
        else dy.lookup(self.embed, self.w2i[self.pref_unk]) for w in seq
    ]
    suff_r = [
        dy.lookup(self.embed, self.w2i[self.suff_flag + w[-3:]])
        if self.suff_flag + w[-3:] in self.w2i
        else dy.lookup(self.embed, self.w2i[self.suff_unk]) for w in seq
    ]
    return [pref_r[i] + word_r[i] + suff_r[i] for i in range(len(word_r))]
def translate(self, x, beam_size=1):
    """Translate a source sentence

    Translate a single source sentence by decoding using beam search

    Arguments:
        x (list): Source sentence (list of indices)

    Keyword Arguments:
        beam_size (int): Size of the beam for beam search.
            A value of 1 means greedy decoding (default: 1)

    Returns:
        list: generated translation (list of indices)
    """
    dy.renew_cg()
    input_len = len(x)
    encodings = self.encode([x], test=True)

    # Decode
    # Add parameters to the graph
    Wp, bp = self.Wp_p.expr(), self.bp_p.expr()
    Wo, bo = self.Wo_p.expr(), self.bo_p.expr()
    D, b = dy.transpose(dy.parameter(self.MT_p)), self.b_p.expr()

    # Initialize decoder with last encoding
    last_enc = dy.select_cols(encodings, [encodings.dim()[0][-1] - 1])
    init_state = dy.affine_transform([bp, Wp, last_enc])
    ds = self.dec.initial_state([init_state, dy.zeroes((self.dh, ))])
    # Initialize context
    context = dy.zeroes((self.enc_dim, ))
    # Initialize beam
    beam = [(ds, context, [self.trg_sos], 0.0)]

    # Loop
    for i in range(int(min(self.max_len, input_len * 1.5))):
        new_beam = []
        for ds, pc, pw, logprob in beam:
            embs = dy.lookup(self.MT_p, pw[-1])
            # Run LSTM
            ds = ds.add_input(dy.concatenate([embs, pc]))
            h = ds.output()
            # Compute next context
            context, _ = self.attend(encodings, h)
            # Compute output with residual connections
            output = dy.affine_transform(
                [bo, Wo, dy.concatenate([h, context, embs])])
            # Score
            s = dy.affine_transform([b, D, output])
            # Probabilities
            p = dy.softmax(s).npvalue().flatten()
            # Careful of float error
            p = p / p.sum()
            kbest = np.argsort(p)
            for nw in kbest[-beam_size:]:
                new_beam.append(
                    (ds, context, pw + [nw], logprob + np.log(p[nw])))

        beam = sorted(new_beam, key=lambda x: x[-1])[-beam_size:]

        if beam[-1][2][-1] == self.trg_eos:
            break

    return beam[-1][2]
def word_repr(self, char_seq, cembs):
    """
    Obtain the word representation given its character sequence.

    :param char_seq: character index sequence
    :param cembs: character embedding sequence
    :return: the composed word vector
    """
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(
            self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(
            self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(
            self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(
            self.params['com_b'][wlen - 1])

    chars = dy.concatenate(cembs)  # [c1;c2...]
    # reset_gate = sigmoid(W_r_l * chars + b_r_l), shape: (m, char_dim)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                             self.param_exprs['rgb%d' % wlen])
    # word = tanh(W_c_l * (reset_gate .* chars) + b_c_l)
    word = dy.tanh(self.param_exprs['cW%d' % wlen] *
                   dy.cmult(reset_gate, chars) +
                   self.param_exprs['cb%d' % wlen])
    if self.known_words is not None and tuple(char_seq) in self.known_words:
        # frequent word: average the composed vector with its word embedding
        return (word + dy.lookup(self.params['word_embed'],
                                 self.known_words[tuple(char_seq)])) / 2.
    return word
def word_repr(self, char_seq, cembs):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(
            self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(
            self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(
            self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(
            self.params['com_b'][wlen - 1])

    chars = dy.concatenate(cembs)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                             self.param_exprs['rgb%d' % wlen])
    word = dy.tanh(self.param_exprs['cW%d' % wlen] *
                   dy.cmult(reset_gate, chars) +
                   self.param_exprs['cb%d' % wlen])
    if self.known_words is not None and tuple(char_seq) in self.known_words:
        return (word + dy.lookup(self.params['word_embed'],
                                 self.known_words[tuple(char_seq)])) / 2.
    return word
def calc_scores(words):
    # Create a new computation graph
    dy.renew_cg()
    # Take the sum of the embedding vectors of all the words
    score = dy.esum([dy.lookup(W, x) for x in words])
    # Add the bias vector and return
    return score + b
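# Minimal setup sketch for the bag-of-words scorer above (an assumption, not
# the original code): W holds one score row per vocabulary word and b is a
# per-tag bias; nwords and ntags are placeholder sizes.
import dynet as dy

model = dy.ParameterCollection()
nwords, ntags = 10000, 5  # hypothetical vocabulary and tag-set sizes
W = model.add_lookup_parameters((nwords, ntags))
b = model.add_parameters((ntags,))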
def encode(self, sentence):
    freqs = [float(self.word_count.get(root.norm, 0)) for root in sentence]
    # word dropout: during training, keep a word with probability
    # c / (word_dropout_rate + c), otherwise replace it with UNK (index 0)
    wembs = [
        dy.lookup(
            self.WORD_LOOKUP,
            self.word_vocab.get(root.norm, 0)
            if not self._train_flag
            or (random.random() < (c / (self.word_dropout_rate + c))) else 0)
        for (root, c) in zip(sentence, freqs)
    ]
    # POS dropout: during training, occasionally replace the POS embedding
    # with the word embedding
    pembs = [
        dy.lookup(self.POS_LOOKUP, self.pos_vocab[root.pos])
        if (not self._train_flag
            or (random.random() > self.pos_dropout_rate))
        else wembs[root.w_id] for root in sentence
    ]
    encode_states = [
        dy.concatenate([wi, pi]) for wi, pi in zip(wembs, pembs)
    ]
    return dy.concatenate_cols(encode_states)
def get_word_features(self, word_indices):
    """
    Produce word and character features that can be used as input
    for the predictions.
    :param word_indices: a list of word indices
    :return: a list of word embeddings
    """
    # new graph; immediate_compute eases debugging
    # (check_validity / is_valid() is not implemented for CUDA yet)
    dynet.renew_cg(immediate_compute=True)
    features = []
    for w_idx in word_indices:
        # allow words that are not in the pre-loaded embeddings
        # to be updated during training
        update_flag = w_idx in self.oov_id
        embed_vec = dynet.lookup(self.wembeds, index=w_idx,
                                 update=update_flag)
        features.append(embed_vec)
    return features
def calc_predict_and_activations(wids, tag, words):
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        wids += [0] * (WIN_SIZE - len(wids))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1),
                             is_valid=False)
    filters = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue()
    activations = filters.argmax(axis=0)

    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)

    scores = (W_sm * pool_out + b_sm).npvalue()
    print('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    print(display_activations(words, activations))
    print('scores=%s, predict: %d' % (scores, predict))

    features = pool_out.npvalue()
    W = W_sm.npvalue()
    bias = b_sm.npvalue()
    print('  bias=%s' % bias)
    contributions = W * features
    print(' very bad (%.4f): %s' % (scores[0], contributions[0]))
    print('      bad (%.4f): %s' % (scores[1], contributions[1]))
    print('  neutral (%.4f): %s' % (scores[2], contributions[2]))
    print('     good (%.4f): %s' % (scores[3], contributions[3]))
    print('very good (%.4f): %s' % (scores[4], contributions[4]))
def train(self, inputs, target):
    words_emb = []
    dropout = self.Config.train.dropout
    for u in inputs:
        for word in u:
            words_emb.append(
                dy.dropout(
                    dy.lookup(
                        self.input_lookup,
                        word,
                        update=word < 4 + self.Config.data.oov_size),
                    dropout))
            # words_emb.append(self.input_lookup[word])

    fwd_vectors, state = self.run_lstm(self.enc_fwd_lstm.initial_state(),
                                       words_emb)
    # s = self.sess_lstm.initial_state(state.s()).add_input(dy.lookup(self.input_lookup, self.Config.data.EOS_ID))
    s = self.sess_lstm.initial_state(state.s()).add_input(
        self.input_lookup[self.Config.data.EOS_ID])

    loss = []
    for char in target:
        s = s.add_input(s.output())
        out_vector = self.decoder_w * s.output() + self.decoder_b
        probs = dy.softmax(out_vector)
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def _predict(self, batch, train=True):
    # load the network parameters
    W_hid = dy.parameter(self.W_hid)
    b_hid = dy.parameter(self.b_hid)
    w_clf = dy.parameter(self.w_clf)
    b_clf = dy.parameter(self.b_clf)

    probas = []
    # predict the probability of positive sentiment for each sentence
    for _, sent in batch:
        sent_embed = [dy.lookup(self.embed, w) for w in sent]
        sent_embed = dy.average(sent_embed)
        # hid = tanh(b + W * sent_embed)
        # but it's faster to use affine_transform in dynet
        hid = dy.affine_transform([b_hid, W_hid, sent_embed])
        hid = dy.tanh(hid)
        y_score = dy.affine_transform([b_clf, w_clf, hid])
        y_proba = dy.logistic(y_score)
        probas.append(y_proba)
    return probas
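# Hedged sketch (assumed batch layout: (label, sent) pairs with binary 0/1
# labels): turning the probabilities from _predict above into a log loss
# suitable for training.
import dynet as dy

def batch_loss(clf, batch):
    dy.renew_cg()
    probas = clf._predict(batch)
    losses = [dy.binary_log_loss(p, dy.scalarInput(float(y)))
              for p, (y, _) in zip(probas, batch)]
    return dy.esum(losses)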
def backward(self, char_seq, truth):
    self.renew_cg()

    cembs = [
        dy.dropout(dy.lookup(self.params['embed'], char),
                   self.options['dropout_rate']) for char in char_seq
    ]

    # group character embeddings into words according to the gold labels
    word_seq, word = [], []
    for char, label in zip(cembs, truth):
        word.append(char)
        if label > 0:
            word_seq.append(word)
            word = []

    score = self.truth_score(word_seq)
    score_plus_margin_loss = self.beam_search(
        cembs, truth, self.options['margin_loss_discount'])
    loss = score_plus_margin_loss - score

    res = loss.scalar_value()
    loss.backward()
    return res
def calc_scores(words):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    return W_sm * dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) + b_sm
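# Hedged training-step sketch for the BiLSTM scorer above (`tag` is the gold
# class index and `trainer` a dynet trainer; both are assumed names).
import dynet as dy

def train_step(words, tag, trainer):
    loss = dy.pickneglogsoftmax(calc_scores(words), tag)
    loss_value = loss.value()  # forward pass
    loss.backward()            # backward pass
    trainer.update()           # parameter update
    return loss_value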
def calc_loss(words, labels, heads):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    src_encodings = [
        dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1))
        for f, b in zip(fwd_embs, reversed(bwd_embs))
    ]
    return biaffineParser.decode_loss(src_encodings, ([heads], [labels]))
def calc_acc(words, labels, heads):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    src_encodings = [
        dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1))
        for f, b in zip(fwd_embs, reversed(bwd_embs))
    ]
    pred_heads, pred_labels = biaffineParser.decoding(src_encodings)
    return biaffineParser.cal_accuracy(pred_heads, pred_labels, heads, labels)
def calc_scores(wids):
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        wids += [0] * (WIN_SIZE - len(wids))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1),
                             is_valid=False)
    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)
    return W_sm * pool_out + b_sm
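# Hedged prediction sketch for the CNN scorer above: the predicted tag is the
# argmax over the score vector. A copy of wids is passed because calc_scores
# pads the list in place.
import numpy as np

def predict(wids):
    return int(np.argmax(calc_scores(list(wids)).npvalue()))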
def forward(self, char_seq):
    self.renew_cg()
    cembs = [dy.lookup(self.params['embed'], char) for char in char_seq]
    agenda = self.beam_search(cembs)
    now = agenda[-1].max()
    # follow the back-pointers to recover the word lengths, then reverse
    ans = []
    while now.prevState is not None:
        ans.append(now.wlen)
        now = now.prevState
    return reversed(ans)
def build_lm_graph(self, sent):
    dy.renew_cg()
    init_state = self.builder.initial_state()

    errs = []  # will hold expressions
    state = init_state
    for (cw, nw) in zip(sent, sent[1:]):
        # assume word is already a word-id
        x_t = dy.lookup(self.lookup, int(cw))
        state = state.add_input(x_t)
        y_t = state.output()
        r_t = self.bias + (self.R * y_t)
        err = dy.pickneglogsoftmax(r_t, int(nw))
        errs.append(err)
    nerr = dy.esum(errs)
    return nerr
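# Hedged usage sketch (assumed names `lm`, `trainer`, `train_sents`): one
# epoch of language-model training with build_lm_graph above, which renews
# the computation graph internally.
def train_epoch(lm, trainer, train_sents):
    total_loss = 0.0
    for sent in train_sents:
        errs = lm.build_lm_graph(sent)
        total_loss += errs.value()  # forward pass
        errs.backward()             # backward pass
        trainer.update()            # parameter update
    return total_loss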
def sample(self, first=1, nchars=0, stop=-1):
    res = [first]
    dy.renew_cg()
    state = self.builder.initial_state()

    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)
    cw = first
    while True:
        x_t = dy.lookup(self.lookup, cw)
        state = state.add_input(x_t)
        y_t = state.output()
        r_t = bias + (R * y_t)
        ydist = dy.softmax(r_t)
        dist = ydist.vec_value()
        # sample an index from the output distribution
        rnd = random.random()
        for i, p in enumerate(dist):
            rnd -= p
            if rnd <= 0:
                break
        res.append(i)
        cw = i
        if cw == stop:
            break
        if nchars and len(res) > nchars:
            break
    return res
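# Hedged usage sketch for sample above: generating a character string from
# the model. `lm` and the vocab mapping with its special symbols are assumed
# names.
sampled_ids = lm.sample(first=vocab.w2i['<s>'], nchars=200,
                        stop=vocab.w2i['</s>'])
print(''.join(vocab.i2w[i] for i in sampled_ids))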
def calc_scores(words):
    dy.renew_cg()
    b_sm_exp = dy.parameter(b_sm)
    score = dy.esum([dy.lookup(W_sm, x) for x in words])
    return score + b_sm_exp
def __call__(self, char, DIRECT_LOOKUP=False):
    char_i = char if DIRECT_LOOKUP else self.vocab[char]
    return dy.lookup(self.enc, char_i)
def calc_scores(words):
    dy.renew_cg()
    cbow = dy.esum([dy.lookup(W_emb, x) for x in words])
    return W_sm * cbow + b_sm
def calc_scores(words):
    dy.renew_cg()
    h = dy.esum([dy.lookup(W_emb, x) for x in words])
    for W_h_i, b_h_i in zip(W_h, b_h):
        h = dy.tanh(W_h_i * h + b_h_i)
    return W_sm * h + b_sm
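# Hedged setup sketch for the deep CBOW scorer above (an assumption, not the
# original code): W_h and b_h are lists of per-layer parameters; all sizes
# are placeholders.
import dynet as dy

model = dy.ParameterCollection()
nwords, ntags, EMB_SIZE, HID_SIZE, NLAYERS = 10000, 5, 64, 64, 2  # hypothetical
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))
W_h = [model.add_parameters((HID_SIZE, EMB_SIZE if i == 0 else HID_SIZE))
       for i in range(NLAYERS)]
b_h = [model.add_parameters((HID_SIZE,)) for i in range(NLAYERS)]
W_sm = model.add_parameters((ntags, HID_SIZE))
b_sm = model.add_parameters((ntags,))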