def encode(input_, lengths):
    # Encode a batch of sequences with a bidirectional LSTM and return, for
    # each batch element, the concatenation of its forward and backward final
    # states, selected per-element by its true (unpadded) length.
    #
    # input_: batched sequence input accepted by `lstm`
    # lengths: per-batch-element index of the last valid position
    forward, backward = lstm(input_)
    # Stack per-step forward states as columns, then pick column `lengths[b]`
    # for each batch element b.
    states = dy.concatenate_cols(forward)
    final_states_forward = dy.pick_batch(states, lengths, dim=1)
    # NOTE(review): the backward states are also indexed at `lengths`; for a
    # reversed-direction RNN the "final" state is often at index 0 — confirm
    # that `backward` is stored in source order so this pick is intended.
    states = dy.concatenate_cols(backward)
    final_states_backward = dy.pick_batch(states, lengths, dim=1)
    return dy.concatenate([final_states_forward, final_states_backward])
def generate_output(self, translator, initial_state, src_length=None, forced_trg_ids=None):
    """Greedy (or forced) decoding up to ``self.max_len`` steps.

    At each step the most probable word is chosen (or the forced reference
    word when ``forced_trg_ids`` is given); once a batch element emits
    Vocab.ES it is frozen to ES and masked out of the score.

    Returns a single-element list containing a SearchOutput with the decoded
    words, attentions, summed scores, per-step logsoftmaxes, states and masks.
    """
    # Output variables
    score = []
    word_ids = []
    attentions = []
    logsoftmaxes = []
    states = []
    masks = []
    # Search Variables
    done = None  # per-batch-element "has emitted ES" flags; None before step 1
    current_state = initial_state
    for length in range(self.max_len):
        prev_word = word_ids[length - 1] if length > 0 else None
        current_output = translator.generate_one_step(
            prev_word, current_state)
        current_state = current_output.state
        if forced_trg_ids is None:
            # Greedy: argmax over the vocabulary axis for every batch element.
            word_id = np.argmax(current_output.logsoftmax.npvalue(), axis=0)
            if len(word_id.shape) == 2:
                word_id = word_id[0]
        else:
            if batchers.is_batched(forced_trg_ids):
                word_id = [
                    forced_trg_ids[i][length]
                    for i in range(len(forced_trg_ids))
                ]
            else:
                word_id = [forced_trg_ids[length]]
        logsoft = dy.pick_batch(current_output.logsoftmax, word_id)
        if done is not None:
            # Finished elements keep emitting ES and contribute zero score.
            word_id = [
                word_id[i] if not done[i] else Vocab.ES
                for i in range(len(done))
            ]
            # masking for logsoftmax
            mask = [1 if not done[i] else 0 for i in range(len(done))]
            logsoft = dy.cmult(logsoft, dy.inputTensor(mask, batched=True))
            masks.append(mask)
        # Packing outputs
        score.append(logsoft.npvalue())
        word_ids.append(word_id)
        attentions.append(current_output.attention)
        # Re-pick with the (possibly ES-overwritten) word ids — this is why
        # the earlier `logsoft` is not reused here.
        logsoftmaxes.append(
            dy.pick_batch(current_output.logsoftmax, word_id))
        states.append(translator.get_nobp_state(current_state))
        # Check if we are done.
        done = [x == Vocab.ES for x in word_id]
        if all(done):
            break
    # The first step has no mask entry; prepend an all-ones mask.
    masks.insert(0, [1 for _ in range(len(done))])
    words = np.stack(word_ids, axis=1)
    score = np.sum(score, axis=0)
    return [
        SearchOutput(words, attentions, score, logsoftmaxes, states, masks)
    ]
def predict_chunks_by_tokens(self, w_t, chunk_batch):
    """Score token-by-token continuations of a batch of chunks with the
    lattice RNN, returning cumulative log-probabilities.

    w_t: main-RNN context vector for this position.
    chunk_batch: list of per-step token-id batches forming the chunks.

    Returns a list of log-prob expressions: for every prefix length > 0 the
    log-prob of ending the chunk there, plus a final entry for the full chunk.
    """
    ender = [self.lattice_vocab.chunk_end.i] * self.BATCH_SIZE
    lps = []
    state = self.lattice_rnn.initial_state(dropout=self.DROPOUT)
    # Prepend the chunk-start token so (cc, nc) pairs cover all transitions.
    cs = [[self.lattice_vocab.chunk_start.i] * self.BATCH_SIZE
          ] + chunk_batch
    cum_lp = dynet.scalarInput(0.0, device=self.args.param_device)
    for i, (cc, nc) in enumerate(zip(cs, cs[1:])):
        if self.args.concat_context_vector:
            x_t = dynet.pick_batch(self.vocab_R, cc)
            state.add_input(x_t)
        else:
            # Without concatenated context, the context enters only as the
            # projected initial input at step 0.
            if i == 0:
                state.add_input(self.project_main_to_lattice_init_R * w_t)
            else:
                x_t = dynet.pick_batch(self.vocab_R, cc)
                state.add_input(x_t)
        y_t = state.output()
        y_t = dynet.to_device(y_t, self.args.param_device)
        if self.DROPOUT:
            y_t = dynet.cmult(y_t, self.dropout_mask_lattice_y_t)
        if self.args.concat_context_vector:
            y_t = dynet.concatenate([y_t, w_t])
        # Two-layer output: tanh projection then vocabulary logits.
        r_t = dynet.affine_transform([
            self.vocab_bias, self.vocab_R,
            dynet.tanh(
                dynet.affine_transform(
                    [self.lattice_bias, self.lattice_R, y_t]))
        ])
        if i > 0:
            # Log-prob of terminating the chunk after the current prefix.
            lps.append(cum_lp + -dynet.pickneglogsoftmax_batch(r_t, ender))
        cum_lp = cum_lp + -dynet.pickneglogsoftmax_batch(r_t, nc)
    lps.append(cum_lp)
    return lps
def cross_entropy_loss(self, score, next_word, cur_word):
    """Batched cross-entropy loss with optional label smoothing.

    score: unnormalized logits (batched expression).
    next_word: batch of gold next-word ids.
    cur_word: batch of current-word ids (only used to query the LM prior).
    Returns the per-batch-element loss expression.
    """
    if self.__ls:
        log_prob = dy.log_softmax(score)
        if self.__lm is None:
            # Smooth towards the uniform distribution.
            loss = - dy.pick_batch(log_prob, next_word) * (1 - self.__ls_eps) - \
                dy.mean_elems(log_prob) * self.__ls_eps
        else:
            # Smooth towards the language-model distribution.
            loss = - dy.pick_batch(log_prob, next_word) * (1 - self.__ls_eps) - \
                dy.dot_product(self.__lm.next_expr(cur_word), log_prob) * self.__ls_eps
    else:
        # FIX: next_word is a batch of ids (the smoothing branch uses
        # pick_batch), so the batched op must be used here as well —
        # dy.pickneglogsoftmax only accepts a single index.
        loss = dy.pickneglogsoftmax_batch(score, next_word)
    return loss
def predict_logprobs(self, X, Y):
    """
    Returns the log probabilities of the predictions for this model
    (batched version). Returns a matrix of log probabilities.
    @param X: the input indexes from which to predict
    @param Y: a list of references indexes for which to extract the prob.
    @return the matrix of predicted logprobabilities for each of the
    provided ref y in Y as a numpy array
    """
    assert (len(X) == len(Y))
    assert (all([len(x) == len(y) for x, y in zip(X, Y)]))
    nlines = len(X)
    X = zip(*X)  # transposes the batch
    Y = zip(*Y)  # transposes the batch
    if self.tied:
        # Tied weights: the embedding matrix doubles as the output layer.
        dy.renew_cg()
        state = self.rnn.initial_state()
        E = dy.parameter(self.embedding_matrix)
        lookups = [dy.pick_batch(E, xcolumn) for xcolumn in X]
        outputs = state.transduce(lookups)
        ypred_batch = [
            dy.pickneglogsoftmax_batch(E * lstm_out, y)
            for lstm_out, y in zip(outputs, Y)
        ]
        dy.forward(ypred_batch)
        # npvalue() of a batched scalar is wrapped in an extra axis only
        # when the batch has more than one element.
        if nlines > 1:
            preds = [(-col.npvalue()).tolist()[0] for col in ypred_batch]
        else:
            preds = [(-col.npvalue()).tolist() for col in ypred_batch]
        return list(zip(*preds))  # final back transposition
    else:
        dy.renew_cg()
        state = self.rnn.initial_state()
        O = dy.parameter(self.output_weights)
        E = dy.parameter(self.embedding_matrix)
        lookups = [dy.pick_batch(E, xcolumn) for xcolumn in X]
        outputs = state.transduce(lookups)
        ypred_batch = [
            dy.pickneglogsoftmax_batch(O * lstm_out, y)
            for lstm_out, y in zip(outputs, Y)
        ]
        dy.forward(ypred_batch)
        # FIX: removed a dead `preds = [...]` assignment that was
        # unconditionally overwritten by the if/else below (and the unused
        # `preds = []` initializers in both branches).
        if nlines > 1:
            preds = [(-col.npvalue()).tolist()[0] for col in ypred_batch]
        else:
            preds = [(-col.npvalue()).tolist() for col in ypred_batch]
        return list(zip(*preds))  # final back transposition
def cross_entropy_loss(self, s, nw, cw):
    """Calculates the (batched) cross-entropy loss.

    s: logits, nw: gold next-word id batch, cw: current-word id batch
    (unused unless a language-model prior is configured).
    """
    if not self.ls:
        # No label smoothing: plain batched negative log-likelihood.
        return dy.pickneglogsoftmax_batch(s, nw)
    log_prob = dy.log_softmax(s)
    picked = dy.pick_batch(log_prob, nw)
    # Smoothing target: uniform distribution, or the LM distribution.
    if self.lm is None:
        smooth = dy.mean_elems(log_prob)
    else:
        smooth = dy.dot_product(self.lm_e, log_prob)
    return -picked * (1 - self.ls_eps) - smooth * self.ls_eps
def compress_chunk(self, chunks, masks=None): compression_batch_size = len(chunks[0]) # token_embeddings = [dynet.reshape(dynet.select_cols(self.vocab_lookup, tokens), (self.args.dim,), compression_batch_size) # token_embeddings = [dynet.reshape(dynet.transpose(dynet.select_rows(self.vocab_R, tokens)), (self.args.dim,), compression_batch_size) token_embeddings = [ dynet.pick_batch(self.vocab_R, tokens) for tokens in chunks ] fwd_state = self.lattice_fwd_comp_rnn.initial_state( mb_size=compression_batch_size, dropout=self.DROPOUT) bwd_state = self.lattice_bwd_comp_rnn.initial_state( mb_size=compression_batch_size, dropout=self.DROPOUT) if masks is None: fwd_emb = fwd_state.transduce(token_embeddings)[-1] bwd_emb = bwd_state.transduce(list(reversed(token_embeddings)))[-1] else: masks = [ dynet.inputTensor( mask, batched=True, device=self.args.param_device) if min(mask) == 0 else None for mask in masks ] fwd_emb = fwd_state.transduce(token_embeddings, masks)[-1] bwd_emb = bwd_state.transduce(reversed(token_embeddings), reversed(masks))[-1] emb = dynet.concatenate([fwd_emb, bwd_emb]) emb = dynet.to_device(emb, self.args.param_device) return emb
def get_chunk_embedding(self, chunks, masks=None):
    """Return the embedding of a batch of chunks: a fixed chunk-vocabulary
    embedding, optionally concatenated with a dynamic RNN-composed one.

    chunks: list of per-step token-id batches; masks: optional validity masks.
    """
    # Render each batch element's token sequence to a chunk string so it can
    # be looked up in the chunk vocabulary.
    if masks is None:
        merged_chunks = [
            self.lattice_vocab.pp(chunk)
            for chunk in map(list, zip(*chunks))
        ]
    else:
        merged_chunks = [
            self.lattice_vocab.masked_pp(chunk, mask)
            for chunk, mask in zip(map(list, zip(
                *chunks)), map(list, zip(*masks)))
        ]
    # Unknown chunks fall back to the <chunk_unk> embedding.
    chunk_emb_is = [
        self.chunk_vocab[chunk].i
        if chunk in self.chunk_vocab.strings else
        self.chunk_vocab['<chunk_unk>'].i for chunk in merged_chunks
    ]
    # fixed_embs = dynet.reshape(dynet.transpose(dynet.select_rows(self.chunk_vocab_R, chunk_emb_is)), (self.args.dim,), len(chunk_emb_is))
    # fixed_embs = dynet.reshape(dynet.select_cols(self.chunk_vocab_lookup, chunk_emb_is), (self.args.dim,), len(chunk_emb_is))
    fixed_embs = dynet.pick_batch(self.chunk_vocab_R, chunk_emb_is)
    if self.args.no_dynamic_embs:
        return fixed_embs
    else:
        dynamic_embs = self.compress_chunk(chunks, masks)
        full_embs = dynet.concatenate([fixed_embs, dynamic_embs])
        return full_embs
def on_calc_additional_loss(self, reward):
    """REINFORCE + baseline losses for the learned segmentation policy.

    Returns a LossBuilder, or None when segmentation is not being learned.
    """
    if not self.learn_segmentation:
        return None
    ret = LossBuilder()
    if self.length_prior_alpha > 0:
        reward += self.segment_length_prior * self.length_prior_alpha
    # Z-normalize the reward over the batch to reduce gradient variance.
    reward = dy.cdiv(reward - dy.mean_batches(reward),
                     dy.std_batches(reward))
    # Baseline Loss
    if self.use_baseline:
        baseline_loss = []
        for i, baseline in enumerate(self.bs):
            baseline_loss.append(dy.squared_distance(reward, baseline))
        ret.add_loss("Baseline", dy.esum(baseline_loss))
    # Reinforce Loss
    lmbd = self.lmbd.get_value(self.warmup_counter)
    if lmbd > 0.0:
        reinforce_loss = []
        # Calculating the loss of the baseline and reinforce
        for i in range(len(self.segment_decisions)):
            ll = dy.pick_batch(self.segment_logsoftmaxes[i],
                               self.segment_decisions[i])
            # Advantage: reward minus the per-step baseline when in use.
            if self.use_baseline:
                r_i = reward - self.bs[i]
            else:
                r_i = reward
            # NOTE(review): the advantage is squashed through a logistic
            # here rather than used raw — presumably intentional; confirm.
            reinforce_loss.append(dy.logistic(r_i) * ll)
        ret.add_loss("Reinforce", -dy.esum(reinforce_loss) * lmbd)
    # Total Loss
    return ret
def sample_one(
        self,
        translator: 'xnmt.models.translators.AutoRegressiveTranslator',
        initial_state: decoders.AutoRegressiveDecoderState,
        forced_trg_ids: Optional[Sequence[numbers.Integral]] = None
) -> SearchOutput:
    """Draw one sample per batch element from the translator, step by step.

    Each step samples from the output distribution (or follows
    ``forced_trg_ids`` when given); elements that emitted Vocab.ES are
    frozen to ES and masked out of the score.
    """
    # Search variables
    current_words = None
    current_state = initial_state
    done = None  # per-element "already emitted ES" flags
    # Outputs
    logsofts = []
    samples = []
    states = []
    attentions = []
    masks = []
    # Sample to the max length
    for length in range(self.max_len):
        translator_output = translator.generate_one_step(
            current_words, current_state)
        if forced_trg_ids is None:
            # Sample an id per batch element directly on the tensor.
            sample = translator_output.logsoftmax.tensor_value(
            ).categorical_sample_log_prob().as_numpy()
            if len(sample.shape) == 2:
                sample = sample[0]
        else:
            # Forced decoding; pad shorter references with ES.
            sample = [
                forced_trg[length]
                if forced_trg.sent_len() > length else Vocab.ES
                for forced_trg in forced_trg_ids
            ]
        logsoft = dy.pick_batch(translator_output.logsoftmax, sample)
        if done is not None:
            sample = [
                sample[i] if not done[i] else Vocab.ES
                for i in range(len(done))
            ]
            # masking for logsoftmax
            mask = [1 if not done[i] else 0 for i in range(len(done))]
            logsoft = dy.cmult(logsoft, dy.inputTensor(mask, batched=True))
            masks.append(mask)
        # Appending output
        logsofts.append(logsoft)
        samples.append(sample)
        states.append(translator.get_nobp_state(translator_output.state))
        attentions.append(translator_output.attention)
        # Next time step
        current_words = sample
        current_state = translator_output.state
        # Check done
        done = [x == Vocab.ES for x in sample]
        # Check if we are done.
        if all(done):
            break
    # Packing output
    scores = dy.esum(logsofts).npvalue()
    # First step has no mask entry; prepend an all-ones mask.
    masks.insert(0, [1 for _ in range(len(done))])
    samples = np.stack(samples, axis=1)
    return SearchOutput(samples, attentions, scores, logsofts, states,
                        masks)
def calc_loss(
        self, x: dy.Expression,
        y: Union[numbers.Integral, List[numbers.Integral]]) -> dy.Expression:
    """Negative log-likelihood of ``y`` given input ``x``, handling both
    single and minibatch references and optional label smoothing."""
    if self.can_loss_be_derived_from_scores():
        scores = self.calc_scores(x)
        # single mode
        if not batchers.is_batched(y):
            loss = dy.pickneglogsoftmax(scores, y)
        # minibatch mode
        else:
            loss = dy.pickneglogsoftmax_batch(scores, y)
    else:
        log_prob = self.calc_log_probs(x)
        if not batchers.is_batched(y):
            loss = -dy.pick(log_prob, y)
        else:
            loss = -dy.pick_batch(log_prob, y)
    if self.label_smoothing > 0:
        # NOTE(review): log_prob is only bound in the else-branch above, so
        # this relies on can_loss_be_derived_from_scores() returning False
        # whenever label_smoothing > 0 — confirm, otherwise NameError.
        ls_loss = -dy.mean_elems(log_prob)
        loss = ((1 - self.label_smoothing) * loss) + (self.label_smoothing * ls_loss)
    return loss
def decode_loss(self, src_encodings, tgt_seqs): """ :param tgt_seqs: (tgt_heads, tgt_labels): list (length=batch_size) of (src_len) """ # todo(NOTE): Sentences should start with empty token (as root of dependency tree)! tgt_heads, tgt_labels = tgt_seqs src_len = len(tgt_heads[0]) batch_size = len(tgt_heads) np_tgt_heads = np.array(tgt_heads).flatten() # (src_len * batch_size) np_tgt_labels = np.array(tgt_labels).flatten() s_arc, s_label = self.cal_scores(src_encodings) # (src_len, src_len, bs), ([(src_len, src_len, bs)]) s_arc_value = s_arc.npvalue() s_arc_choice = np.argmax(s_arc_value, axis=0).transpose().flatten() # (src_len * batch_size) s_pick_labels = [dy.pick_batch(dy.reshape(score, (src_len,), batch_size=src_len * batch_size), s_arc_choice) for score in s_label] s_argmax_labels = dy.concatenate(s_pick_labels, d=0) # n_labels, src_len * batch_size reshape_s_arc = dy.reshape(s_arc, (src_len,), batch_size=src_len * batch_size) arc_loss = dy.pickneglogsoftmax_batch(reshape_s_arc, np_tgt_heads) label_loss = dy.pickneglogsoftmax_batch(s_argmax_labels, np_tgt_labels) loss = dy.sum_batches(arc_loss + label_loss) / batch_size return loss
def calc_loss(self, policy_reward, only_final_reward=True): loss = losses.FactoredLossExpr() ## Calculate baseline pred_reward, baseline_loss = self.calc_baseline_loss(policy_reward, only_final_reward) if only_final_reward: rewards = [policy_reward - pw_i for pw_i in pred_reward] else: rewards = [pr_i - pw_i for pr_i, pw_i in zip(policy_reward, pred_reward)] loss.add_loss("rl_baseline", baseline_loss) ## Z-Normalization rewards = dy.concatenate(rewards, d=0) if self.z_normalization: rewards_value = rewards.value() rewards_mean = np.mean(rewards_value) rewards_std = np.std(rewards_value) + 1e-10 rewards = (rewards - rewards_mean) / rewards_std ## Calculate Confidence Penalty if self.confidence_penalty: cp_loss = self.confidence_penalty.calc_loss(self.policy_lls) loss.add_loss("rl_confpen", cp_loss) ## Calculate Reinforce Loss reinf_loss = [] # Loop through all action in one sequence for i, (policy, action) in enumerate(zip(self.policy_lls, self.actions)): # Main Reinforce calculation reward = dy.pick(rewards, i) ll = dy.pick_batch(policy, action) if self.valid_pos is not None: ll = dy.pick_batch_elems(ll, self.valid_pos[i]) reward = dy.pick_batch_elems(reward, self.valid_pos[i]) reinf_loss.append(dy.sum_batches(ll * reward)) loss.add_loss("rl_reinf", -self.weight * dy.esum(reinf_loss)) ## the composed losses return loss
def cross_entropy_loss(self, scores, next_words):
    """Batched cross-entropy of ``next_words`` under ``scores``, with
    optional label smoothing towards the uniform distribution."""
    if not self.label_smoothing:
        return dy.pickneglogsoftmax_batch(scores, next_words)
    log_softmax = dy.log_softmax(scores)
    nll = -dy.pick_batch(log_softmax, next_words)
    uniform_nll = -dy.mean_elems(log_softmax)
    return nll * (1 - self.label_smoothing) + uniform_nll * self.label_smoothing
def calc_loss(self, mlp_dec_state, ref_action):
    """
    Label Smoothing is implemented with reference to Section 7 of the paper
    "Rethinking the Inception Architecture for Computer Vision"
    (https://arxiv.org/pdf/1512.00567.pdf)
    """
    scores = self.get_scores(mlp_dec_state)
    batched = xnmt.batcher.is_batched(ref_action)
    if self.label_smoothing == 0.0:
        # Plain NLL, in single or minibatch mode.
        if batched:
            return dy.pickneglogsoftmax_batch(scores, ref_action)
        return dy.pickneglogsoftmax(scores, ref_action)
    log_prob = dy.log_softmax(scores)
    if batched:
        pre_loss = -dy.pick_batch(log_prob, ref_action)
    else:
        pre_loss = -dy.pick(log_prob, ref_action)
    # Mix the reference NLL with the uniform-distribution NLL.
    ls_loss = -dy.mean_elems(log_prob)
    return ((1 - self.label_smoothing) * pre_loss) + (self.label_smoothing * ls_loss)
def calc_loss(
        self, x: dy.Expression,
        y: Union[numbers.Integral, List[numbers.Integral]]) -> dy.Expression:
    """Cross-entropy of reference ``y`` given input ``x``, supporting single
    and minibatch references and optional label smoothing."""
    scores = self.calc_scores(x)
    batched = batchers.is_batched(y)
    if self.label_smoothing == 0.0:
        # Plain NLL, in single or minibatch mode.
        if batched:
            return dy.pickneglogsoftmax_batch(scores, y)
        return dy.pickneglogsoftmax(scores, y)
    log_prob = dy.log_softmax(scores)
    if batched:
        pre_loss = -dy.pick_batch(log_prob, y)
    else:
        pre_loss = -dy.pick(log_prob, y)
    # Mix the reference NLL with the uniform-distribution NLL.
    ls_loss = -dy.mean_elems(log_prob)
    return ((1 - self.label_smoothing) * pre_loss) + (self.label_smoothing * ls_loss)
def embed(self, x):
    """Embed a word id (single mode) or a batch of word ids, with optional
    word dropout, norm fixing and weight noise during training."""
    # Lazily draw one dropped-word set per batch element, reused for the
    # whole sentence (word dropout is per-type, not per-token).
    if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = x.batch_size() if xnmt.batcher.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size,
                                                  int(self.vocab_size * self.word_dropout),
                                                  replace=False))
                             for _ in range(batch_size)]
    emb_e = dy.parameter(self.embeddings)
    # single mode
    if not xnmt.batcher.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            # Dropped word: a zero vector instead of its embedding.
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = dy.pick(emb_e, index=x)
            if self.fix_norm is not None:
                # Rescale the embedding to a fixed L2 norm.
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = dy.pick_batch(emb_e, x)
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        # Zero out the embeddings of dropped words per batch element.
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
            dropout_mask = dy.inputTensor(np.transpose([[0.0]*self.emb_dim if x[i] in self.word_id_mask[i] else [1.0]*self.emb_dim for i in range(x.batch_size())]), batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def __call__(self, translator, dec_state, src, trg):
    """REINFORCE training loss: sample translations from the model, score
    them against the references with the evaluation metric, and weight the
    sample log-likelihoods by the (optionally baselined) metric score."""
    # TODO: apply trg.mask ?
    samples = []
    logsofts = []
    self.bs = []
    done = [False for _ in range(len(trg))]
    for _ in range(self.sample_length):
        dec_state.context = translator.attender.calc_context(dec_state.rnn_state.output())
        if self.use_baseline:
            # Baseline predicts the reward from a no-backprop state summary.
            h_t = dy.tanh(translator.decoder.context_projector(dy.concatenate([dec_state.rnn_state.output(), dec_state.context])))
            self.bs.append(self.baseline(dy.nobackprop(h_t)))
        logsoft = dy.log_softmax(translator.decoder.get_scores(dec_state))
        sample = logsoft.tensor_value().categorical_sample_log_prob().as_numpy()[0]
        # Keep track of previously sampled EOS
        sample = [sample_i if not done_i else Vocab.ES for sample_i, done_i in zip(sample, done)]
        # Appending and feeding in the decoder
        logsoft = dy.pick_batch(logsoft, sample)
        logsofts.append(logsoft)
        samples.append(sample)
        dec_state = translator.decoder.add_input(dec_state, translator.trg_embedder.embed(xnmt.batcher.mark_as_batch(sample)))
        # Check if we are done.
        done = list(six.moves.map(lambda x: x == Vocab.ES, sample))
        if all(done):
            break
    samples = np.stack(samples, axis=1).tolist()
    self.eval_score = []
    for trg_i, sample_i in zip(trg, samples):
        # Removing EOS
        try:
            idx = sample_i.index(Vocab.ES)
            sample_i = sample_i[:idx]
        except ValueError:
            pass
        try:
            idx = trg_i.words.index(Vocab.ES)
            trg_i.words = trg_i.words[:idx]
        except ValueError:
            pass
        # Calculate the evaluation score
        score = 0 if not len(sample_i) else self.evaluation_metric.evaluate_fast(trg_i.words, sample_i)
        self.eval_score.append(score)
    self.true_score = dy.inputTensor(self.eval_score, batched=True)
    loss = LossBuilder()
    if self.use_baseline:
        # Weight each step's log-likelihood by (baseline - true score).
        for i, (score, _) in enumerate(zip(self.bs, logsofts)):
            logsofts[i] = dy.cmult(logsofts[i], score - self.true_score)
        loss.add_loss("Reinforce", dy.sum_elems(dy.esum(logsofts)))
    else:
        loss.add_loss("Reinforce", dy.sum_elems(dy.cmult(-self.true_score, dy.esum(logsofts))))
    if self.use_baseline:
        # Regress each per-step baseline towards the achieved score.
        baseline_loss = []
        for bs in self.bs:
            baseline_loss.append(dy.squared_distance(self.true_score, bs))
        loss.add_loss("Baseline", dy.sum_elems(dy.esum(baseline_loss)))
    return loss
def score_one_sequence(self, tag_scores, tags, batch_size):
    '''
    tags: list of tag ids at each time step

    CRF path score for a batch of tag sequences: start->first transition,
    per-step transition + emission scores, and the final transition to END.
    '''
    padded = [[self.start_id] * batch_size] + tags  # len(tag_scores) = len(padded) - 1
    total = dy.inputTensor(np.zeros(batch_size), batched=True)
    for step, (prev, cur) in enumerate(zip(padded, padded[1:])):
        transition = dy.pick_batch(
            dy.lookup_batch(self.transition_matrix, cur), prev)
        emission = dy.pick_batch(tag_scores[step], cur)
        total += transition + emission
    # Transition from the last tag into the implicit END state.
    end_transition = dy.pick_batch(
        dy.lookup_batch(self.transition_matrix, [self.end_id] * batch_size),
        padded[-1])
    return total + end_transition
def predict_logprobs(self,X,Y,structural=True,hidden_out=False):
    """
    Returns the log probabilities of the predictions for this model (batched version).

    @param X: the input indexes from which to predict (each xdatum is expected to be an iterable of integers)
    @param Y: a list of references indexes for which to extract the prob
    @param structural: switches between structural and lexical logprob evaluation
    @param hidden_out: outputs an additional list of hidden dimension vectors
    @return the list of predicted logprobabilities for each of the provided ref y in Y
    """
    assert(len(X) == len(Y))
    assert(all(len(x) == self.input_length for x in X))
    if structural:
        # Structural branch: score actions with the action weight matrix A.
        dy.renew_cg()
        W = dy.parameter(self.hidden_weights)
        E = dy.parameter(self.input_embeddings)
        A = dy.parameter(self.action_weights)
        batched_X = zip(*X)  #transposes the X matrix
        embeddings = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
        xdense = dy.concatenate(embeddings)
        preds = dy.pickneglogsoftmax_batch(A * dy.tanh( W * xdense ),Y).value()
        # Negate NLL to return log-probabilities.
        return [-ypred for ypred in preds]
    else:  #lexical
        if self.tied:
            # Tied weights: input embeddings double as the output layer.
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            batched_X = zip(*X)  #transposes the X matrix
            embeddings = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
            xdense = dy.concatenate(embeddings)
            preds = dy.pickneglogsoftmax_batch(E * dy.tanh( W * xdense ),Y).value()
            return [-ypred for ypred in preds]
        else:
            # Untied: separate output embedding matrix O.
            dy.renew_cg()
            O = dy.parameter(self.output_embeddings)
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            batched_X = zip(*X)  #transposes the X matrix
            embeddings = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
            xdense = dy.concatenate(embeddings)
            preds = dy.pickneglogsoftmax_batch(O * dy.tanh( W * xdense ),Y).value()
            return [-ypred for ypred in preds]
def _embed_word(self, word, is_batched):
    """Look up the embedding of ``word`` (an id, or a batch of ids when
    ``is_batched``), handling both dense-parameter and lookup-table storage."""
    if is_batched:
        if self.is_dense:
            return dy.pick_batch(self.embeddings, word)
        return self.embeddings.batch(word)
    if self.is_dense:
        return dy.pick(self.embeddings, index=word)
    return self.embeddings[word]
def on_calc_additional_loss(self, translator_loss):
    """REINFORCE + baseline losses for the learned segmentation policy,
    using the (negated) translator MLE loss as reward.

    Returns a LossBuilder, or None when there is nothing to learn.
    """
    if not self.learn_segmentation or self.segment_decisions is None:
        return None
    reward = -translator_loss["mle"]
    if not self.log_reward:
        reward = dy.exp(reward)
    # The reward must not backprop into the translator.
    reward = dy.nobackprop(reward)
    # Make sure that reward is not scalar, but rather based on the each batch item
    assert reward.dim()[1] == len(self.src_sent)
    # Mask
    enc_mask = self.enc_mask.get_active_one_mask().transpose() if self.enc_mask is not None else None
    # Compose the lose
    ret = LossBuilder()
    ## Length prior
    alpha = self.length_prior_alpha.value() if self.length_prior_alpha is not None else 0
    if alpha > 0:
        reward += self.segment_length_prior * alpha
    # reward z-score normalization
    if self.z_normalization:
        reward = dy.cdiv(reward-dy.mean_batches(reward), dy.std_batches(reward) + EPS)
    ## Baseline Loss
    if self.use_baseline:
        baseline_loss = []
        for i, baseline in enumerate(self.bs):
            loss = dy.squared_distance(reward, baseline)
            if enc_mask is not None:
                # Only count positions that are active in the encoder mask.
                loss = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), loss)
            baseline_loss.append(loss)
        ret.add_loss("Baseline", dy.esum(baseline_loss))
    if self.print_sample:
        # NOTE(review): `i` here is the last loop index from the baseline
        # block above — confirm printing only the final step is intended.
        print(dy.exp(self.segment_logsoftmaxes[i]).npvalue().transpose()[0])
    ## Reinforce Loss
    lmbd = self.lmbd.value()
    if lmbd > 0.0:
        reinforce_loss = []
        # Calculating the loss of the baseline and reinforce
        for i in range(len(self.segment_decisions)):
            ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
            if self.use_baseline:
                r_i = reward - dy.nobackprop(self.bs[i])
            else:
                r_i = reward
            if enc_mask is not None:
                ll = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), ll)
            reinforce_loss.append(r_i * -ll)
        loss = dy.esum(reinforce_loss) * lmbd
        ret.add_loss("Reinforce", loss)
    if self.confidence_penalty:
        ls_loss = self.confidence_penalty(self.segment_logsoftmaxes, enc_mask)
        ret.add_loss("Confidence Penalty", ls_loss)
    # Total Loss
    return ret
def next(self, w, c, test=True, state=None):
    """Advance the decoder RNN one step with the embedding of word batch
    ``w``. Returns (hidden output, embedding, new RNN state).

    NOTE: the context argument ``c`` is not consumed here — presumably this
    is the no-input-feeding variant; kept for interface compatibility.
    """
    embedding = dy.pick_batch(self.E, w)
    if not test:
        # Word-level dropout along the embedding dimension while training.
        embedding = dy.dropout_dim(embedding, 0, self.wdr)
    # Run LSTM: track internal state when no explicit state is given.
    if state is not None:
        new_state = state.add_input(embedding)
    else:
        self.ds = self.ds.add_input(embedding)
        new_state = self.ds
    return new_state.output(), embedding, new_state
def next(self, word_idx, context, train, cur_state=None):
    """One decoder step: embed ``word_idx``, concatenate with ``context``,
    and feed the LSTM. Returns (hidden output, embedding, new state)."""
    emb = dy.pick_batch(self.E, word_idx)
    if train:
        # Dropout along the embedding dimension during training.
        emb = dy.dropout_dim(emb, 0, self.word_dropout)
    rnn_input = dy.concatenate([emb, context])
    if cur_state is not None:
        new_state = cur_state.add_input(rnn_input)
    else:
        # No explicit state: advance and remember the internal decoder state.
        self.dec_state = self.dec_state.add_input(rnn_input)
        new_state = self.dec_state
    return new_state.output(), emb, new_state
def learn(self, batch_size):
    """One (double/dueling/prioritized) DQN update step from replay memory.

    Returns the scalar loss value, or -inf when prioritized replay memory
    is not yet full.
    """
    if self.prioritized:
        if not self.memory.is_full():
            return -np.inf
        indices, exps, weights = self.memory.sample(batch_size, self.beta)
    else:
        exps = self.memory.sample(batch_size)
    obss, actions, rewards, obs_nexts, dones = self._process(exps)
    # First graph: compute bootstrap targets (no gradient kept — values are
    # pulled out as numpy before the second graph is built).
    dy.renew_cg()
    target_network = self.target_network if self.use_double_dqn else self.network
    if self.dueling:
        target_values, v = target_network(obs_nexts, batched=True)
        target_values = target_values.npvalue() + v.npvalue()
    else:
        target_values = target_network(obs_nexts, batched=True)
        target_values = target_values.npvalue()
    target_values = np.max(target_values, axis=0)
    # Terminal transitions bootstrap with zero future value.
    target_values = rewards + self.reward_decay * (target_values * (1 - dones))
    # Second graph: the differentiable TD error for the online network.
    dy.renew_cg()
    if self.dueling:
        all_values_expr, v = self.network(obss, batched=True)
    else:
        all_values_expr = self.network(obss, batched=True)
    picked_values = dy.pick_batch(all_values_expr, actions)
    diff = (picked_values + v if self.dueling else picked_values) - dy.inputTensor(target_values, batched=True)
    if self.prioritized:
        # Update priorities with the absolute TD errors.
        self.memory.update(indices, np.transpose(np.abs(diff.npvalue())))
    losses = dy.pow(diff, dy.constant(1, 2))
    if self.prioritized:
        # Importance-sampling correction weights.
        losses = dy.cmult(losses, dy.inputTensor(weights, batched=True))
    loss = dy.sum_batches(losses)
    loss_value = loss.npvalue()
    loss.backward()
    self.trainer.update()
    # Anneal exploration and (for prioritized replay) the IS exponent.
    self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epsilon_lower)
    if self.prioritized:
        self.beta = min(self.beta + self.beta_increase, 1.)
    self.learn_step += 1
    if self.use_double_dqn and self.learn_step % self.n_replace_target == 0:
        self.target_network.update(self.network)
    return loss_value
def next(self, w, c, test=True, state=None):
    """One decoder step with input feeding: embed ``w`` (or use it directly
    when it is already an expression), concatenate with context ``c``, and
    feed the LSTM. Returns (hidden output, embedding, new state)."""
    emb = w if isinstance(w, dy.Expression) else dy.pick_batch(self.E, w)
    if not test:
        # Dropout along the embedding dimension during training.
        emb = dy.dropout_dim(emb, 0, self.wdr)
    x = dy.concatenate([emb, c])
    # Run LSTM: track internal state when no explicit state is given.
    if state is not None:
        new_state = state.add_input(x)
    else:
        self.ds = self.ds.add_input(x)
        new_state = self.ds
    return new_state.output(), emb, new_state
def embed_sentence(self, ws, pwords, ts, chars, is_train):
    """Build per-position input vectors: (word + pretrained-word +
    char-BiLSTM) embedding concatenated with the POS embedding, with
    embedding dropout masks during training.

    ws/pwords/ts: (sent_len, batch) id matrices; chars: per-step char batches.
    """
    cembed = [dy.lookup_batch(self.clookup, c) for c in chars]
    # Final forward / backward char-LSTM states over the whole char stream.
    char_fwd, char_bckd = self.char_lstm.builder_layers[0][0].initial_state().transduce(cembed)[-1], \
        self.char_lstm.builder_layers[0][1].initial_state().transduce(reversed(cembed))[-1]
    # Fold the char representations into one batch of size sent_len * batch.
    crnn = dy.reshape(dy.concatenate_cols([char_fwd, char_bckd]),
                      (self.options.we, ws.shape[0] * ws.shape[1]))
    cnn_reps = [list() for _ in range(len(ws))]
    for i in range(ws.shape[0]):
        # Select this word position's char vectors for every batch element.
        cnn_reps[i] = dy.pick_batch(crnn, [i * ws.shape[1] + j for j in range(ws.shape[1])], 1)
    wembed = [dy.lookup_batch(self.wlookup, ws[i]) + dy.lookup_batch(self.elookup, pwords[i]) + cnn_reps[i]
              for i in range(len(ws))]
    posembed = [dy.lookup_batch(self.tlookup, ts[i]) for i in range(len(ts))]
    if (not is_train) or self.options.dropout == 0:
        return [dy.concatenate([wembed[i], posembed[i]]) for i in range(len(ts))]
    else:
        # Independent dropout masks for the word and POS channels.
        emb_masks = self.generate_emb_mask(ws.shape[0], ws.shape[1])
        return [dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(wembed, posembed, emb_masks)]
def process_batch(self, batch, training=False):
    """Run the language model over one batch of sentences and return the
    masked NLL plus character/word counts for perplexity reporting."""
    self.TRAINING_ITER = training
    # Dropout only applies during training iterations.
    self.DROPOUT = self.args.dropout if (
        self.TRAINING_ITER and self.args.dropout > 0) else None
    self.BATCH_SIZE = len(batch)
    sents, masks = self.vocab.batchify(batch)
    self.instantiate_parameters()
    init_state = self.rnn.initial_state(mb_size=self.BATCH_SIZE,
                                        dropout=self.DROPOUT)
    # embeddings = [dynet.reshape(dynet.select_cols(self.vocab_lookup, toks), (self.args.dim,), self.BATCH_SIZE)
    # embeddings = [dynet.reshape(dynet.transpose(dynet.select_rows(self.vocab_R, toks)), (self.args.dim*2,), self.BATCH_SIZE)
    embeddings = [dynet.pick_batch(self.vocab_R, toks) for toks in sents]
    outputs = init_state.transduce(embeddings)
    outputs = [
        dynet.to_device(out, self.args.param_device) for out in outputs
    ]
    if self.DROPOUT:
        y_ts = [dynet.cmult(y_t, self.dropout_mask_y_t) for y_t in outputs]
    else:
        y_ts = outputs
    # Two-layer output: tanh projection then vocabulary logits.
    r_ts = [
        dynet.affine_transform([
            self.vocab_bias, self.vocab_R,
            dynet.tanh(dynet.affine_transform([self.bias, self.R, y_t]))
        ]) for y_t in y_ts
    ]
    # Predict each next token (sents[1:]) from the current step's output.
    errs = [
        dynet.pickneglogsoftmax_batch(r_t, toks)
        for r_t, toks in zip(r_ts, sents[1:])
    ]
    # Zero out the loss on padded positions; skip all-valid steps.
    for tok_i, (err, mask) in enumerate(zip(errs, masks[1:])):
        if min(mask) == 0:
            errs[tok_i] = err * dynet.inputTensor(
                mask, batched=True, device=self.args.param_device)
    err = dynet.esum(errs)
    char_count = [1 + len(self.vocab.pp(sent[1:-1])) for sent in batch]
    word_count = [len(sent[1:]) for sent in batch]
    # word_count = [2+self.vocab.pp(sent[1:-1]).count(' ') for sent in batch]
    return {"loss": err, "charcount": char_count, "wordcount": word_count}
def compute_loss(self, words, extwords, tags, true_arcs, true_labels):
    """Biaffine parser loss: head-attachment cross-entropy plus relation
    cross-entropy at the gold heads, summed over all positions."""
    arc_logits, rel_logits = self.forward(words, extwords, tags, True)
    seq_len = len(true_arcs)
    targets_1D = dynet_flatten_numpy(true_arcs)
    # Fold the (seq_len x seq_len) arc scores into a batch of columns so
    # every dependent position is its own batch element.
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ), seq_len)
    losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
    arc_loss = dy.sum_batches(losses)
    flat_rel_logits = dy.reshape(rel_logits, (seq_len, self.rel_size),
                                 seq_len)
    # Select, per position, the relation score row of its gold head.
    partial_rel_logits = dy.pick_batch(flat_rel_logits, targets_1D)
    targets_rel1D = dynet_flatten_numpy(true_labels)
    losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_rel1D)
    rel_loss = dy.sum_batches(losses)
    loss = arc_loss + rel_loss
    return loss
def decode_loss(self, src_encodings, masks, tgt_seqs, sents_len):
    """
    :param tgt_seqs: (tgt_heads, tgt_labels): list (length=batch_size) of (src_len)

    Masked biaffine-parser loss: head-attachment cross-entropy plus label
    cross-entropy at the predicted (argmax) heads, with padded positions
    zeroed out by ``masks``.
    """
    tgt_heads, tgt_labels = tgt_seqs
    src_len = len(tgt_heads[0])
    batch_size = len(tgt_heads)
    np_tgt_heads = np.array(tgt_heads).flatten()  # (src_len * batch_size)
    np_tgt_labels = np.array(tgt_labels).flatten()
    np_masks = np.array(masks).transpose().flatten()
    masks_expr = dy.inputVector(np_masks)
    # One scalar mask per (position, batch-element) pair.
    masks_expr = dy.reshape(masks_expr, (1, ),
                            batch_size=src_len * batch_size)
    s_arc, s_label = self.cal_scores(
        src_encodings
    )  # (src_len, src_len, bs), ([(src_len, src_len, bs)])
    # Drop the root column: only real words are dependents.
    s_arc = dy.select_cols(s_arc, range(1, src_len + 1))
    s_label = [
        dy.select_cols(label, range(1, src_len + 1)) for label in s_label
    ]
    # Pick labels at the model's own argmax heads (not the gold heads).
    s_arc_value = s_arc.npvalue()
    s_arc_choice = np.argmax(
        s_arc_value, axis=0).transpose().flatten()  # (src_len * batch_size)
    s_pick_labels = [
        dy.pick_batch(
            dy.reshape(score, (src_len + 1, ),
                       batch_size=src_len * batch_size), s_arc_choice)
        for score in s_label
    ]
    s_argmax_labels = dy.concatenate(s_pick_labels,
                                     d=0)  # n_labels, src_len * batch_size
    reshape_s_arc = dy.reshape(s_arc, (src_len + 1, ),
                               batch_size=src_len * batch_size)
    arc_loss = dy.pickneglogsoftmax_batch(reshape_s_arc, np_tgt_heads)
    arc_loss = arc_loss * masks_expr
    label_loss = dy.pickneglogsoftmax_batch(s_argmax_labels, np_tgt_labels)
    label_loss = label_loss * masks_expr
    loss = dy.sum_batches(arc_loss + label_loss)
    return loss
# regular lookup a = lp[1].npvalue() b = lp[2].npvalue() c = lp[3].npvalue() # batch lookup instead of single elements. # two ways of doing this. abc1 = dy.lookup_batch(lp, [1,2,3]) print(abc1.npvalue()) abc2 = lp.batch([1,2,3]) print(abc2.npvalue()) print(np.hstack([a,b,c])) # use pick and pickneglogsoftmax in batch mode # (must be used in conjunction with lookup_batch): print("\nPick") W = dy.parameter( m.add_parameters((5, 10)) ) h = W * lp.batch([1,2,3]) print(h.npvalue()) print(dy.pick_batch(h,[1,2,3]).npvalue()) print(dy.pick(W*lp[1],1).value(), dy.pick(W*lp[2],2).value(), dy.pick(W*lp[3],3).value()) # using pickneglogsoftmax_batch print("\nPick neg log softmax") print((-dy.log(dy.softmax(h))).npvalue()) print(dy.pickneglogsoftmax_batch(h,[1,2,3]).npvalue())