def calc_loss(self, translator, src, trg):
    """REINFORCE loss over sampled search outputs.

    Args:
      translator: model exposing generate_search_output().
      src: batched source sentences.
      trg: batched reference sentences.

    Returns:
      FactoredLossExpr accumulating the loss of every search output.
    """
    search_outputs = translator.generate_search_output(src, self.search_strategy)
    # Flip reward sign when the metric is "lower is better".
    sign = -1 if self.inv_eval else 1
    total_loss = FactoredLossExpr()
    for search_output in search_outputs:
        self.eval_score = []
        for trg_i, sample_i in zip(trg, search_output.word_ids):
            # Removing EOS
            sample_i = self.remove_eos(sample_i.tolist())
            ref_i = trg_i.words[:trg_i.len_unpadded()]
            score = self.evaluation_metric.evaluate_one_sent(ref_i, sample_i)
            self.eval_score.append(sign * score)
        self.reward = dy.inputTensor(self.eval_score, batched=True)
        # Composing losses
        loss = FactoredLossExpr()
        if self.baseline is not None:
            baseline_loss = []
            losses = []
            for state, logsoft, mask in zip(search_output.state,
                                            search_output.logsoftmaxes,
                                            search_output.mask):
                bs_score = self.baseline.transform(state)
                baseline_loss.append(dy.squared_distance(self.reward, bs_score))
                loss_i = dy.cmult(logsoft, self.reward - bs_score)
                # Zero out padded time steps via the mask.
                # (Removed an unused `valid = np.nonzero(mask)` local.)
                losses.append(dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
            loss.add_loss("reinforce", dy.sum_elems(dy.esum(losses)))
            loss.add_loss("reinf_baseline", dy.sum_elems(dy.esum(baseline_loss)))
        else:
            # BUG FIX: original referenced undefined names `self.true_score` and
            # `logsofts`; use this sample's reward and the search output's
            # recorded log-softmaxes instead.
            loss.add_loss("reinforce",
                          dy.sum_elems(dy.cmult(self.reward,
                                                dy.esum(search_output.logsoftmaxes))))
        total_loss.add_factored_loss_expr(loss)
    # BUG FIX: return the accumulated total, not only the last sample's loss.
    return total_loss
def calc_loss(self, model: 'model_base.ConditionedModel',
              src: Union[sent.Sentence, 'batchers.Batch'],
              trg: Union[sent.Sentence, 'batchers.Batch']) -> losses.FactoredLossExpr:
    """REINFORCE loss with a learned baseline over sampled search outputs.

    Args:
      model: conditioned model exposing generate_search_output() and a decoder.
      src: batched source sentences.
      trg: batched reference sentences.

    Returns:
      FactoredLossExpr with "reinforce" and "reinf_baseline" components,
      accumulated over all search outputs.
    """
    search_outputs = model.generate_search_output(src, self.search_strategy)
    # Flip reward sign when the metric is "lower is better".
    sign = -1 if self.inv_eval else 1
    total_loss = losses.FactoredLossExpr()
    for search_output in search_outputs:
        # Calculate rewards
        eval_score = []
        for trg_i, sample_i in zip(trg, search_output.word_ids):
            # Removing EOS
            sample_i = self.remove_eos(sample_i.tolist())
            ref_i = trg_i.words[:trg_i.len_unpadded()]
            score = self.evaluation_metric.evaluate_one_sent(ref_i, sample_i)
            eval_score.append(sign * score)
        reward = dy.inputTensor(eval_score, batched=True)
        # Composing losses
        loss = losses.FactoredLossExpr()
        baseline_loss = []
        cur_losses = []
        for state, mask in zip(search_output.state, search_output.mask):
            # The baseline sees a detached state so its regression loss does
            # not backpropagate into the model.
            bs_score = self.baseline.transform(dy.nobackprop(state.as_vector()))
            baseline_loss.append(dy.squared_distance(reward, bs_score))
            logsoft = model.decoder.scorer.calc_log_probs(state.as_vector())
            loss_i = dy.cmult(logsoft, reward - bs_score)
            cur_losses.append(dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
        loss.add_loss("reinforce", dy.sum_elems(dy.esum(cur_losses)))
        loss.add_loss("reinf_baseline", dy.sum_elems(dy.esum(baseline_loss)))
        # Total losses
        total_loss.add_factored_loss_expr(loss)
    # BUG FIX: return the accumulated total rather than the last iteration's loss.
    return total_loss
def __call__(self, translator, dec_state, src, trg):
    """Sample translations step-by-step and build a REINFORCE loss.

    Rolls the decoder forward up to self.sample_length steps, sampling one
    token per step per batch element, then scores the samples against the
    references and composes a REINFORCE loss (with an optional learned
    baseline). Stores eval scores on self.eval_score / self.true_score as
    side effects.

    Returns:
      LossBuilder with a "Reinforce" component (and "Baseline" when
      self.use_baseline is set).
    """
    # TODO: apply trg.mask ?
    samples = []
    logsofts = []
    self.bs = []  # per-step baseline predictions
    done = [False for _ in range(len(trg))]
    for _ in range(self.sample_length):
        dec_state.context = translator.attender.calc_context(dec_state.rnn_state.output())
        if self.use_baseline:
            # Baseline input is the projected [decoder output; context],
            # detached so baseline training does not affect the model.
            h_t = dy.tanh(translator.decoder.context_projector(dy.concatenate([dec_state.rnn_state.output(), dec_state.context])))
            self.bs.append(self.baseline(dy.nobackprop(h_t)))
        logsoft = dy.log_softmax(translator.decoder.get_scores(dec_state))
        sample = logsoft.tensor_value().categorical_sample_log_prob().as_numpy()[0]
        # Keep track of previously sampled EOS: finished elements keep
        # emitting EOS instead of new tokens.
        sample = [sample_i if not done_i else Vocab.ES for sample_i, done_i in zip(sample, done)]
        # Appending and feeding in the decoder
        logsoft = dy.pick_batch(logsoft, sample)
        logsofts.append(logsoft)
        samples.append(sample)
        dec_state = translator.decoder.add_input(dec_state, translator.trg_embedder.embed(xnmt.batcher.mark_as_batch(sample)))
        # Check if we are done.
        done = list(six.moves.map(lambda x: x == Vocab.ES, sample))
        if all(done):
            break
    # Transpose (steps, batch) -> (batch, steps) token lists.
    samples = np.stack(samples, axis=1).tolist()
    self.eval_score = []
    for trg_i, sample_i in zip(trg, samples):
        # Removing EOS (truncate sample and reference at the first EOS).
        try:
            idx = sample_i.index(Vocab.ES)
            sample_i = sample_i[:idx]
        except ValueError:
            pass
        try:
            idx = trg_i.words.index(Vocab.ES)
            trg_i.words = trg_i.words[:idx]
        except ValueError:
            pass
        # Calculate the evaluation score (0 for an empty sample).
        score = 0 if not len(sample_i) else self.evaluation_metric.evaluate_fast(trg_i.words, sample_i)
        self.eval_score.append(score)
    self.true_score = dy.inputTensor(self.eval_score, batched=True)
    loss = LossBuilder()
    if self.use_baseline:
        # Advantage-weighted log-probabilities: logp * (baseline - reward).
        for i, (score, _) in enumerate(zip(self.bs, logsofts)):
            logsofts[i] = dy.cmult(logsofts[i], score - self.true_score)
        loss.add_loss("Reinforce", dy.sum_elems(dy.esum(logsofts)))
    else:
        loss.add_loss("Reinforce", dy.sum_elems(dy.cmult(-self.true_score, dy.esum(logsofts))))
    if self.use_baseline:
        # Regress the baseline toward the observed rewards.
        baseline_loss = []
        for bs in self.bs:
            baseline_loss.append(dy.squared_distance(self.true_score, bs))
        loss.add_loss("Baseline", dy.sum_elems(dy.esum(baseline_loss)))
    return loss
def test_save_load_with_gradient(self):
    """Gradients on parameters must survive a save/populate round-trip."""
    # Give W1 (and W2) gradients by backpropagating through a simple sum.
    dy.renew_cg()
    dy.sum_elems(self.W1).backward()
    # Snapshot the gradients before serialization.
    grad_w1_before = self.W1.grad_as_array()
    grad_w2_before = self.W2.grad_as_array()
    # Round-trip the ParameterCollection through disk.
    self.m.save(self.file)
    self.m.populate(self.file)
    # The reloaded parameters must carry the same gradients.
    self.assertTrue(np.allclose(self.W1.grad_as_array(), grad_w1_before))
    self.assertTrue(np.allclose(self.W2.grad_as_array(), grad_w2_before))
def test_save_load_with_gradient(self):
    """Saving and repopulating a ParameterCollection preserves gradients."""
    dy.renew_cg()
    # Backward pass through a trivial expression populates the gradients.
    dy.sum_elems(self.W1).backward()
    expected = {
        "W1": self.W1.grad_as_array(),
        "W2": self.W2.grad_as_array(),
    }
    # Serialize, then reload in place.
    self.m.save(self.file)
    self.m.populate(self.file)
    # Gradients after reload must match the recorded ones.
    self.assertTrue(np.allclose(self.W1.grad_as_array(), expected["W1"]))
    self.assertTrue(np.allclose(self.W2.grad_as_array(), expected["W2"]))
def aggregate_masked_loss(x: Tensor, mask: 'xnmt.batchers.Mask' = None) -> Tensor:
    """
    Aggregate loss values for unmasked entries.

    Args:
      x: Batched sequence of losses.
      mask: An optional mask for the case of outputs of unequal lengths.

    Returns:
      Batched sequence of losses, with masked ones zeroed out.
    """
    if xnmt.backend_dynet:
        if mask:
            # Multiply by (1 - mask) so padded positions contribute zero.
            keep = dy.inputTensor(1.0 - mask.np_arr.T, batched=True)
            x = dy.cmult(x, keep)
        return dy.sum_elems(x)
    else:
        if mask:
            keep = torch.as_tensor(1.0 - mask.np_arr, dtype=x.dtype, device=xnmt.device)
            x = torch.mul(x, keep)
        # Sum over every dimension except the leading batch dimension.
        return torch.sum(x, dim=tuple(range(1, len(x.size()))))
def attention_entropy(self, a):
    """Entropy of the attention rows in `a`, smoothed by EPSILON to avoid log(0)."""
    plogp_terms = []
    for row in a:
        smoothed = row + EPSILON
        plogp_terms.append(dy.cmult(smoothed, dy.log(smoothed)))
    # Entropy = -sum p*log(p).
    return -dy.sum_elems(dy.esum(plogp_terms))
def test_item(model, sentence):
    """Score one sentence with the attention model.

    Side effects: stores the scalar prediction on
    sentence.prediction_result and resets the DyNet computation graph.
    Returns the prediction, or 0 when the sentence produces no embeddings.
    """
    # Look up embeddings; unknown words fall back to index 0.
    seq = [
        model.wlookup[int(model.w2i.get(entry, 0))]
        for entry in sentence.preprocessed_sentence
    ]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(encoded_sequence)
        # NOTE(review): named "min" but computed with average_pooling —
        # confirm whether min pooling was intended.
        global_min = average_pooling(encoded_sequence)
        if len(encoded_sequence) > 0:
            # Project each encoder state for attention scoring.
            att_mlp_outputs = []
            for e in encoded_sequence:
                mlp_out = (model.attention_w * e) + model.attention_b
                att_mlp_outputs.append(mlp_out)
            # Unnormalized attention weights: exp(<proj, context>).
            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.att_context))))
            sum_all = dy.esum(lst)
            # Softmax-normalize, then take the weighted sum of encoder states.
            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum(
                [dy.cmult(p, h) for p, h in zip(probs, encoded_sequence)])
            context = dy.concatenate([att_context, global_max, global_min])
            y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
            sentence.prediction_result = y_pred.scalar_value()
            # Free the computation graph before returning.
            dy.renew_cg()
            return sentence.prediction_result
    return 0
def word_assoc_score(self, source_idx, target_idx, relation):
    """Score the association of two atoms under a relation type.

    NOTE THAT DROPOUT IS BEING APPLIED HERE

    :param source_idx: embedding index of source atom
    :param target_idx: embedding index of target atom
    :param relation: relation type
    :return: score expression (scalar)
    """
    # prepare
    s = self.embeddings[source_idx]
    if self.no_assoc:
        # Freeze relation weights when association training is disabled.
        A = dy.const_parameter(self.word_assoc_weights[relation])
    else:
        A = dy.parameter(self.word_assoc_weights[relation])
    # BUG FIX: dy.dropout returns a new expression; the original discarded the
    # result, so dropout was never actually applied despite the docstring.
    A = dy.dropout(A, self.dropout)
    t = self.embeddings[target_idx]
    # compute
    if self.mode == BILINEAR_MODE:
        return dy.transpose(s) * A * t
    elif self.mode == DIAG_RANK1_MODE:
        # A packs [diag, B, C]: diagonal matrix plus rank-1 update B*C^T.
        diag_A = dyagonalize(A[0])
        rank1_BC = A[1] * dy.transpose(A[2])
        ABC = diag_A + rank1_BC
        return dy.transpose(s) * ABC * t
    elif self.mode == TRANSLATIONAL_EMBED_MODE:
        # TransE-style: negative L2 distance of (s + A) from t.
        return -dy.l2_norm(s - t + A)
    elif self.mode == DISTMULT:
        return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
def calc_loss(self, src, trg, loss_calculator):
    """Language-model MLE loss: predict src[1:] from src[:-1].

    `trg` and `loss_calculator` belong to the shared interface but are not
    used here; the targets are derived by shifting the source by one token.

    Returns:
      FactoredLossExpr with a single "mle" component.
    """
    if not batcher.is_batched(src):
        src = batcher.ListBatch([src])
    # Inputs drop the last token; targets drop the first (one-step shift).
    src_inputs = batcher.ListBatch(
        [s[:-1] for s in src],
        mask=batcher.Mask(src.mask.np_arr[:, :-1]) if src.mask else None)
    src_targets = batcher.ListBatch(
        [s[1:] for s in src],
        mask=batcher.Mask(src.mask.np_arr[:, 1:]) if src.mask else None)
    self.start_sent(src)
    embeddings = self.src_embedder.embed_sent(src_inputs)
    encodings = self.rnn.transduce(embeddings)
    encodings_tensor = encodings.as_tensor()
    ((hidden_dim, seq_len), batch_size) = encodings.dim()
    # Fold time steps into the batch dimension so scoring runs as one batch op.
    encoding_reshaped = dy.reshape(encodings_tensor, (hidden_dim,),
                                   batch_size=batch_size * seq_len)
    outputs = self.transform(encoding_reshaped)
    ref_action = np.asarray([sent.words for sent in src_targets]).reshape(
        (seq_len * batch_size,))
    loss_expr_perstep = self.scorer.calc_loss(outputs, batcher.mark_as_batch(ref_action))
    # Unfold back to (seq_len,) x batch_size.
    loss_expr_perstep = dy.reshape(loss_expr_perstep, (seq_len,), batch_size=batch_size)
    if src_targets.mask:
        # Zero out the loss at padded positions.
        loss_expr_perstep = dy.cmult(
            loss_expr_perstep,
            dy.inputTensor(1.0 - src_targets.mask.np_arr.T, batched=True))
    loss_expr = dy.sum_elems(loss_expr_perstep)
    model_loss = loss.FactoredLossExpr()
    model_loss.add_loss("mle", loss_expr)
    return model_loss
def compute_entropy(self, distribution):
    """
    Gets the entropy of a probability distribution that may contain zeroes.

    Invalid action indices are treated as probability 1 so their p*log(p)
    contribution is exactly zero.

    Inputs:
        distribution (dy.Expression): The probability distribution.

    Returns:
        dy.Expression representing the entropy.
    """
    total_size = ((len(self.output_action_vocabulary) - 1)
                  * (len(self.output_location_vocabulary) - 1)
                  * (len(self.output_argument_vocabulary) - 1))
    mask_np = numpy.zeros(total_size)
    for valid_index in self._valid_action_indices:
        mask_np[valid_index] = 1.
    # One for every valid index, zero for all others.
    valid_mask = dy.inputTensor(mask_np)
    # Valid entries keep their original probability; invalid ones become 0...
    valid_probs = dy.cmult(valid_mask, distribution)
    # ...and are then bumped to exactly 1, so log(1) = 0 removes their term.
    probs = valid_probs + (1. - valid_mask)
    # The tiny constant guards against log(0) for zero-probability valid entries.
    plogp = dy.cmult(probs, dy.log(probs + 0.00000000001))
    return -dy.sum_elems(plogp)
def calc_nll(self, src: Union[batchers.Batch, sent.Sentence],
             trg: Union[batchers.Batch, sent.Sentence]) \
        -> dy.Expression:
    """Per-sequence negative log-likelihood of `trg` given encoded `src`.

    Requires batched inputs. If target length disagrees with the encoder's
    sequence length, targets are cut/padded when self.auto_cut_pad is set,
    otherwise a ValueError is raised.

    Returns:
      Batched scalar loss expression (one value per batch element).
    """
    assert batchers.is_batched(src) and batchers.is_batched(trg)
    batch_size, encodings, outputs, seq_len = self._encode_src(src)
    if trg.sent_len() != seq_len:
        if self.auto_cut_pad:
            trg = self._cut_or_pad_targets(seq_len, trg)
        else:
            raise ValueError(
                f"src/trg length do not match: {seq_len} != {len(trg[0])}")
    # Flatten references to match the (seq_len*batch)-sized scorer input.
    ref_action = np.asarray([trg_sent.words for trg_sent in trg]).reshape(
        (seq_len * batch_size, ))
    loss_expr_perstep = self.scorer.calc_loss(
        outputs, batchers.mark_as_batch(ref_action))
    # Unfold back to (seq_len,) per batch element.
    loss_expr_perstep = dy.reshape(loss_expr_perstep, (seq_len, ),
                                   batch_size=batch_size)
    if trg.mask:
        # Zero out the loss contribution of padded positions.
        loss_expr_perstep = dy.cmult(
            loss_expr_perstep,
            dy.inputTensor(1.0 - trg.mask.np_arr.T, batched=True))
    loss_expr = dy.sum_elems(loss_expr_perstep)
    return loss_expr
def calc_loss(sents):
    """Batched hinge loss over dot-product similarities of sentence pairs.

    The diagonal of the src-by-trg similarity matrix holds the correct
    pairings; hinge loss pushes it above the off-diagonal entries.
    """
    dy.renew_cg()
    # Fresh RNN states for both directions of both languages.
    src_fwd = LSTM_SRC_FWD.initial_state()
    src_bwd = LSTM_SRC_BWD.initial_state()
    trg_fwd = LSTM_TRG_FWD.initial_state()
    trg_bwd = LSTM_TRG_BWD.initial_state()
    # Encode every source and target sentence of the minibatch.
    src_reps = encode_sents(LOOKUP_SRC, src_fwd, src_bwd, [s for s, _ in sents])
    trg_reps = encode_sents(LOOKUP_TRG, trg_fwd, trg_bwd, [t for _, t in sents])
    # Stack representations column-wise and score all pairs in one matmul.
    mtx_src = dy.concatenate_cols(src_reps)
    mtx_trg = dy.concatenate_cols(trg_reps)
    sim_mtx = dy.transpose(mtx_src) * mtx_trg
    # Hinge along dimension 1 with the diagonal as gold labels.
    hinge = dy.hinge_dim(sim_mtx, list(range(len(sents))), d=1)
    return dy.sum_elems(hinge)
def cross_entropy_loss(y, yhat): """ Compute the cross entropy loss in tensorflow. The loss should be summed over the current minibatch. y is a one-hot tensor of shape (n_samples, n_classes) and yhat is a tensor of shape (n_samples, n_classes). y should be of dtype tf.int32, and yhat should be of dtype tf.float32. The functions tf.to_float, tf.reduce_sum, and tf.log might prove useful. (Many solutions are possible, so you may not need to use all of these functions). Note: You are NOT allowed to use the tensorflow built-in cross-entropy functions. Args: y: tf.Tensor with shape (n_samples, n_classes). One-hot encoded. yhat: tf.Tensorwith shape (n_sample, n_classes). Each row encodes a probability distribution and should sum to 1. Returns: out: tf.Tensor with shape (1,) (Scalar output). You need to construct this tensor in the problem. """ ### YOUR CODE HERE l_yhat = dy.log(yhat) product = dy.cmult(y, l_yhat) out = (-dy.sum_elems(product)) ### END YOUR CODE return out
def calc_loss(self, src, trg, loss_calculator):
    """Sequence-labeling MLE loss for a batched src/trg pair.

    Cuts or pads targets to the encoder length when self.auto_cut_pad is
    set; otherwise mismatched lengths raise ValueError.
    """
    assert batcher.is_batched(src) and batcher.is_batched(trg)
    batch_size, encodings, outputs, seq_len = self._encode_src(src)
    # Reconcile target length with the encoder's sequence length.
    if trg.sent_len() != seq_len:
        if not self.auto_cut_pad:
            raise ValueError(
                f"src/trg length do not match: {seq_len} != {len(trg[0])}")
        trg = self._cut_or_pad_targets(seq_len, trg)
    # Flatten the references to line up with the (seq_len*batch) outputs.
    flat_refs = np.asarray([sent.words for sent in trg]).reshape(
        (seq_len * batch_size, ))
    per_step = self.scorer.calc_loss(outputs, batcher.mark_as_batch(flat_refs))
    per_step = dy.reshape(per_step, (seq_len, ), batch_size=batch_size)
    if trg.mask:
        # Padded positions contribute zero loss.
        per_step = dy.cmult(
            per_step, dy.inputTensor(1.0 - trg.mask.np_arr.T, batched=True))
    total = dy.sum_elems(per_step)
    model_loss = loss.FactoredLossExpr()
    model_loss.add_loss("mle", total)
    return model_loss
def l2_normalize(vector):
    """Divide `vector` by its L2 norm, clamping the squared sum below by eps."""
    eps = np.finfo(float).eps * dy.ones((1))[0]
    norm = dy.sqrt(dy.bmax(dy.sum_elems(dy.square(vector)), eps))
    return dy.cdiv(vector, norm)
def word_assoc_score(self, source_idx, target_idx, relation):
    """Score the association of two atoms under a relation type.

    NOTE THAT DROPOUT IS BEING APPLIED HERE

    :param source_idx: embedding index of source atom
    :param target_idx: embedding index of target atom
    :param relation: relation type
    :return: score expression (scalar)
    """
    # prepare
    s = self.embeddings[source_idx]
    if self.no_assoc:
        # Freeze relation weights when association training is disabled.
        A = dy.const_parameter(self.word_assoc_weights[relation])
    else:
        A = dy.parameter(self.word_assoc_weights[relation])
    # BUG FIX: dy.dropout returns a new expression; the original discarded the
    # result, so dropout was never actually applied despite the docstring.
    A = dy.dropout(A, self.dropout)
    t = self.embeddings[target_idx]
    # compute
    if self.mode == BILINEAR_MODE:
        return dy.transpose(s) * A * t
    elif self.mode == DIAG_RANK1_MODE:
        # A packs [diag, B, C]: diagonal matrix plus rank-1 update B*C^T.
        diag_A = dyagonalize(A[0])
        rank1_BC = A[1] * dy.transpose(A[2])
        ABC = diag_A + rank1_BC
        return dy.transpose(s) * ABC * t
    elif self.mode == TRANSLATIONAL_EMBED_MODE:
        # TransE-style: negative L2 distance of (s + A) from t.
        return -dy.l2_norm(s - t + A)
    elif self.mode == DISTMULT:
        return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
def calc_loss(self, src, db_idx, src_mask=None, trg_mask=None):
    """Retrieval hinge loss between a source encoding and database targets.

    Args:
      src: source sentence batch.
      db_idx: indices of the gold database entries for this batch.
      src_mask: optional mask for src.
      trg_mask: shadowed below — the actual target mask is taken from the
        database lookup.

    Returns:
      Summed hinge loss expression over the src-vs-target score matrix.
    """
    src_embeddings = self.src_embedder.embed_sent(src, mask=src_mask)
    self.src_encoder.set_input(src)
    src_encodings = self.exprseq_pooling(self.src_encoder.transduce(src_embeddings))
    trg_batch, trg_mask = self.database[db_idx]
    trg_encodings = self.encode_trg_example(trg_batch, mask=trg_mask)
    dim = trg_encodings.dim()
    # Flatten the batched target encodings into a (hidden, batch) matrix.
    trg_reshaped = dy.reshape(trg_encodings, (dim[0][0], dim[1]))
    # Dot-product scores between the source encoding and every target.
    prod = dy.transpose(src_encodings) * trg_reshaped
    id_range = list(range(len(db_idx)))
    # This is ugly:
    if self.loss_direction == "forward":
        prod = dy.transpose(prod)
        loss = dy.sum_batches(dy.hinge_batch(prod, id_range))
    elif self.loss_direction == "bidirectional":
        prod = dy.reshape(prod, (len(db_idx), len(db_idx)))
        # Hinge along both rows and columns of the square score matrix.
        loss = dy.sum_elems(
            dy.hinge_dim(prod, id_range, d=0) + dy.hinge_dim(prod, id_range, d=1))
    else:
        raise RuntimeError("Illegal loss direction {}".format(self.loss_direction))
    return loss
def copy_src_probs_pick(token_type, token_literal):
    """Total copy probability mass assigned to `token_literal` of `token_type`.

    Returns a zero scalar when the type has no copy attention or the literal
    never occurred in the copy history.
    """
    if token_type not in copy_atts:
        return dy.scalarInput(0.0)
    history = copy_history[token_type][token_literal]
    if not history:
        return dy.scalarInput(0.0)
    # Sum the probability rows for every position where the literal occurred.
    return dy.sum_elems(dy.select_rows(copy_src_probs(token_type), history))
def calc_loss(sent): dy.renew_cg() # Transduce all batch elements with an LSTM src = sent[0] trg = sent[1] # initialize the LSTM init_state_src = LSTM_SRC_BUILDER.initial_state() # get the output of the first LSTM src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output() # Now compute mean and standard deviation of source hidden state. W_mean = dy.parameter(W_mean_p) V_mean = dy.parameter(V_mean_p) b_mean = dy.parameter(b_mean_p) W_var = dy.parameter(W_var_p) V_var = dy.parameter(V_var_p) b_var = dy.parameter(b_var_p) # The mean vector from the encoder. mu = mlp(src_output, W_mean, V_mean, b_mean) # This is the diagonal vector of the log co-variance matrix from the encoder # (regard this as log variance is easier for furture implementation) log_var = mlp(src_output, W_var, V_var, b_var) # Compute KL[N(u(x), sigma(x)) || N(0, I)] # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var)) z = reparameterize(mu, log_var) # now step through the output sentence all_losses = [] current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)]) prev_word = trg[0] W_sm = dy.parameter(W_sm_p) b_sm = dy.parameter(b_sm_p) for next_word in trg[1:]: # feed the current state into the current_state = current_state.add_input(LOOKUP_TRG[prev_word]) output_embedding = current_state.output() s = dy.affine_transform([b_sm, W_sm, output_embedding]) all_losses.append(dy.pickneglogsoftmax(s, next_word)) prev_word = next_word softmax_loss = dy.esum(all_losses) return kl_loss, softmax_loss
def copy_src_probs_map(token_type, lazy=False):
    """Map each seen literal of `token_type` to its summed copy probability.

    With lazy=True the values stay DyNet expressions; otherwise they are
    evaluated to floats. Returns {} when no copy attention exists for the
    type or no literal has any history.
    """
    if token_type not in copy_atts:
        return {}
    literal_history = copy_history[token_type]
    if all(len(hist) == 0 for hist in literal_history.values()):
        return {}
    probs = copy_src_probs(token_type)
    if lazy:
        # Defer evaluation: keep expressions in the result map.
        return {
            lit: dy.sum_elems(dy.select_rows(probs, hist))
            for lit, hist in literal_history.items() if hist
        }
    return {
        lit: dy.sum_elems(dy.select_rows(probs, hist)).value()
        for lit, hist in literal_history.items() if hist
    }
def cosine_proximity(self, pred, gold):
    """Negative cosine similarity between prediction and gold vectors.

    (The original bound normalize(pred) to `y_true` and normalize(gold) to
    `y_pred` — harmless since cosine is symmetric, but renamed for clarity.)
    """
    def normalize(x):
        eps = np.finfo(float).eps * dynet.ones((1))[0]
        norm = dynet.sqrt(dynet.bmax(dynet.sum_elems(dynet.square(x)), eps))
        return dynet.cdiv(x, norm)
    pred_unit = normalize(pred)
    gold_unit = normalize(gold)
    return -dynet.sum_elems(dynet.cmult(pred_unit, gold_unit))
def log_sum_exp(scores):
    """Numerically stable log-sum-exp of a score vector expression.

    Subtracts the maximum score before exponentiating to avoid overflow.
    NOTE(review): reads `self.dim_output` without a `self` parameter — this
    must be a nested function/closure inside a method; confirm when moving.
    """
    npval = scores.npvalue()
    argmax_score = np.argmax(npval)
    max_score_expr = dy.pick(scores, argmax_score)
    # Broadcast the max so it can be subtracted elementwise.
    max_score_expr_broadcast = dy.concatenate([max_score_expr] * self.dim_output)
    return max_score_expr + dy.log(
        dy.sum_elems(
            dy.transpose(dy.exp(scores - max_score_expr_broadcast))))
def loss_function(recon_x, x, mu, logvar):
    """VAE objective: reconstruction BCE plus KL divergence to N(0, I)."""
    # Equivalent to torch binary_cross_entropy(..., size_average=False).
    reconstruction = dy.binary_log_loss(recon_x, x)
    # Appendix B of Kingma & Welling (2014), https://arxiv.org/abs/1312.6114:
    # KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kld = -0.5 * dy.sum_elems(
        1 + logvar - dy.pow(mu, dy.scalarInput(2)) - dy.exp(logvar))
    return reconstruction + kld
def calc_loss(self, policy):
    """Weighted negative-entropy regularizer over per-step log-probabilities.

    Args:
      policy: list of batched log-probability expressions, one per step.

    Returns:
      Scaled negative-entropy expression, or None when the weight is
      effectively zero (callers treat None as "no loss").
    """
    if self.weight < 1e-8:
        return None
    neg_entropy = []
    for i, ll in enumerate(policy):
        if self.valid_pos is not None:
            # Restrict to batch elements that are valid at this step.
            ll = dy.pick_batch_elems(ll, self.valid_pos[i])
        # sum(p * log p), reduced over elements and the batch.
        # BUG FIX: the original wrapped this already-batch-reduced value in a
        # second, redundant dy.sum_batches call.
        neg_entropy.append(dy.sum_batches(dy.sum_elems(dy.cmult(dy.exp(ll), ll))))
    return self.weight * dy.esum(neg_entropy)
def calculate_confidence(vec, proportions=0.5):
    """
    calculate the value of alpha, the employed metric is GINI index

    :param vec: probability vector expression
    :param proportions: scale factor applied to the GINI impurity
    :return: (1 - sum(vec^2)) * proportions
    """
    squared_mass = dy.sum_elems(dy.cmult(vec, vec)).value()
    # A valid probability vector must have its squared sum in [0, 1].
    if not 0 <= squared_mass <= 1:
        raise Exception("Invalid square sum %.3lf" % squared_mass)
    return (1 - squared_mass) * proportions
def cal_context(self, s, selected=None):
    """Attention-weighted context over encoder states.

    When `selected` is given, the weights are renormalized over just those
    rows and only the corresponding encoder states are combined.
    Returns (context, full_weight_vector).
    """
    ws = self.cal_scores(s)
    if selected is None:
        return self.es_matrix * ws, ws
    # Restrict to the selected rows and renormalize to sum to one.
    picked = dy.select_rows(ws, selected)
    picked = dy.cdiv(picked, dy.sum_elems(picked))
    chosen_states = dy.concatenate_cols([es[index] for index in selected])
    return chosen_states * picked, ws
def norm_vec(vec):
    """
    normalize a dynet vector expression so its elements sum to 1

    :param vec: dynet vector expression
    :return: normalized expression
    """
    total = dy.sum_elems(vec)
    # NOTE(review): .value() detaches the denominator from the computation
    # graph, so gradients will not flow through the normalization constant —
    # confirm this is intended.
    normalized = vec / total.value()
    # BUG FIX: removed a leftover debug print of the result, and renamed the
    # local that shadowed this function's own name.
    return normalized
def calc_loss(sent): dy.renew_cg() # Transduce all batch elements with an LSTM src = sent[0] trg = sent[1] # initialize the LSTM init_state_src = LSTM_SRC_BUILDER.initial_state() # get the output of the first LSTM src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output() # Now compute mean and standard deviation of source hidden state. W_mean = dy.parameter(W_mean_p) V_mean = dy.parameter(V_mean_p) b_mean = dy.parameter(b_mean_p) W_var = dy.parameter(W_var_p) V_var = dy.parameter(V_var_p) b_var = dy.parameter(b_var_p) # The mean vector from the encoder. mu = mlp(src_output, W_mean, V_mean, b_mean) # This is the diagonal vector of the log co-variance matrix from the encoder # (regard this as log variance is easier for furture implementation) log_var = mlp(src_output, W_var, V_var, b_var) # Compute KL[N(u(x), sigma(x)) || N(0, I)] # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var)) z = reparameterize(mu, log_var) # now step through the output sentence all_losses = [] current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)]) prev_word = trg[0] W_sm = dy.parameter(W_sm_p) b_sm = dy.parameter(b_sm_p) for next_word in trg[1:]: # feed the current state into the current_state = current_state.add_input(LOOKUP_TRG[prev_word]) output_embedding = current_state.output() s = dy.affine_transform([b_sm, W_sm, output_embedding]) all_losses.append(dy.pickneglogsoftmax(s, next_word)) prev_word = next_word softmax_loss = dy.esum(all_losses) return kl_loss, softmax_loss
def calc_loss(
        self, model: 'model_base.ConditionedModel',
        src: Union[sent.Sentence, 'batchers.Batch'],
        trg: Union[sent.Sentence, 'batchers.Batch']) -> losses.FactoredLossExpr:
    """Minimum-risk training loss: expected evaluation cost over samples.

    Draws samples via the search strategy, renormalizes their (alpha-scaled)
    log-probabilities with a softmax, and takes the probability-weighted sum
    of per-sample evaluation scores ("deltas").

    Returns:
      FactoredLossExpr with a single "risk" component.
    """
    batch_size = trg.batch_size()
    # Per-batch-element hash sets used to de-duplicate samples.
    uniques = [set() for _ in range(batch_size)]
    deltas = []
    probs = []
    sign = -1 if self.inv_eval else 1
    search_outputs = model.generate_search_output(src, self.search_strategy)
    for search_output in search_outputs:
        # Only single-hypothesis search outputs are supported here.
        assert len(search_output.word_ids) == 1
        assert search_output.word_ids[0].shape == (len(
            search_output.state), )
        logprob = []
        for word, state in zip(search_output.word_ids[0], search_output.state):
            lpdist = model.decoder.scorer.calc_log_probs(state.as_vector())
            lp = dy.pick(lpdist, word)
            logprob.append(lp)
        sample = search_output.word_ids
        # alpha sharpens or flattens the distribution over samples.
        logprob = dy.esum(logprob) * self.alpha
        # Calculate the evaluation score
        eval_score = np.zeros(batch_size, dtype=float)
        mask = np.zeros(batch_size, dtype=float)
        for j in range(batch_size):
            ref_j = self.remove_eos(trg[j].words)
            hyp_j = self.remove_eos(sample[j].tolist())
            if self.unique_sample:
                hash_val = hash(tuple(hyp_j))
                if len(hyp_j) == 0 or hash_val in uniques[j]:
                    # Effectively -inf: empty/duplicate samples get ~zero
                    # probability mass after the softmax.
                    mask[j] = -1e20
                    continue
                else:
                    uniques[j].add(hash_val)
            # Calc evaluation score
            eval_score[j] = self.evaluation_metric.evaluate_one_sent(
                ref_j, hyp_j) * sign
        # Appending the delta and logprob of this sample
        prob = logprob + dy.inputTensor(mask, batched=True)
        deltas.append(dy.inputTensor(eval_score, batched=True))
        probs.append(prob)
    # Renormalize over the drawn samples and take the expected cost.
    sample_prob = dy.softmax(dy.concatenate(probs))
    deltas = dy.concatenate(deltas)
    risk = dy.sum_elems(dy.cmult(sample_prob, deltas))
    return losses.FactoredLossExpr({"risk": risk})
def _perform_calc_loss(
        self, model: 'model_base.ConditionedModel',
        src: Union[sent.Sentence, 'batchers.Batch'],
        trg: Union[sent.Sentence, 'batchers.Batch']) -> losses.FactoredLossExpr:
    """REINFORCE loss with a learned baseline, summed over search outputs.

    Accumulates the policy ("polc_loss") and baseline ("base_loss")
    components across all sampled search outputs.
    """
    search_outputs = model.generate_search_output(src, self.search_strategy)
    # Flip reward sign when the metric is "lower is better".
    sign = -1 if self.inv_eval else 1
    # TODO: Fix units
    total_loss = collections.defaultdict(int)
    for search_output in search_outputs:
        # Calculate rewards
        eval_score = []
        for trg_i, sample_i in zip(trg, search_output.word_ids):
            # Removing EOS
            sample_i = utils.remove_eos(sample_i.tolist(), vocabs.Vocab.ES)
            ref_i = trg_i.words[:trg_i.len_unpadded()]
            score = self.evaluation_metric.evaluate_one_sent(
                ref_i, sample_i)
            eval_score.append(sign * score)
        reward = dy.inputTensor(eval_score, batched=True)
        # Composing losses
        baseline_loss = []
        cur_losses = []
        for state, mask in zip(search_output.state, search_output.mask):
            # Baseline sees a detached state so its regression does not
            # backpropagate into the model.
            bs_score = self.baseline.transform(
                dy.nobackprop(state.as_vector()))
            baseline_loss.append(dy.squared_distance(reward, bs_score))
            logsoft = model.decoder.scorer.calc_log_probs(
                state.as_vector())
            loss_i = dy.cmult(logsoft, reward - bs_score)
            cur_losses.append(
                dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
        total_loss["polc_loss"] += dy.sum_elems(dy.esum(cur_losses))
        total_loss["base_loss"] += dy.sum_elems(dy.esum(baseline_loss))
    units = [t.len_unpadded() for t in trg]
    total_loss = losses.FactoredLossExpr(
        {k: losses.LossExpr(v, units) for k, v in total_loss.items()})
    # NOTE(review): `total_loss` is already a FactoredLossExpr here; wrapping
    # it again under the key "risk" nests loss expressions — confirm callers
    # expect this rather than `return total_loss` directly.
    return losses.FactoredLossExpr({"risk": total_loss})
def __call__(self, translator, initial_state, src, trg):
    """REINFORCE loss computed from the first/best search hypothesis.

    TODO(philip30): currently only using the best hypothesis / first sample
    for reinforce loss. A small further implementation is needed if we want
    to do reinforce with multiple samples.

    Returns:
      LossBuilder with "reinforce" (and, with a baseline, "reinf_baseline").
    """
    search_output = translator.search_strategy.generate_output(
        translator, initial_state)[0]
    # Calculate evaluation scores
    self.eval_score = []
    for trg_i, sample_i in zip(trg, search_output.word_ids):
        # Removing EOS
        sample_i = self.remove_eos(sample_i.tolist())
        ref_i = self.remove_eos(trg_i.words)
        # Evaluating (empty samples score 0; sign flips for inverted metrics)
        if len(sample_i) == 0:
            score = 0
        else:
            score = self.evaluation_metric.evaluate(ref_i, sample_i) * \
                    (-1 if self.inv_eval else 1)
        self.eval_score.append(score)
    self.true_score = dy.inputTensor(self.eval_score, batched=True)
    # Composing losses
    loss = LossBuilder()
    if self.use_baseline:
        baseline_loss = []
        losses = []
        for state, logsoft, mask in zip(search_output.state,
                                        search_output.logsoftmaxes,
                                        search_output.mask):
            bs_score = self.baseline(state)
            baseline_loss.append(
                dy.squared_distance(self.true_score, bs_score))
            loss_i = dy.cmult(logsoft, self.true_score - bs_score)
            losses.append(
                dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
        loss.add_loss("reinforce", dy.sum_elems(dy.esum(losses)))
        loss.add_loss("reinf_baseline",
                      dy.sum_elems(dy.esum(baseline_loss)))
    else:
        # BUG FIX: the original referenced an undefined name `logsofts`
        # (NameError whenever use_baseline is False); use the per-step
        # log-softmaxes recorded in the search output instead.
        loss.add_loss(
            "reinforce",
            dy.sum_elems(dy.cmult(self.true_score,
                                  dy.esum(search_output.logsoftmaxes))))
    return loss
def __call__(self, logsoftmaxes, mask):
    """Entropy regularization: strength * sum over steps of p*log(p).

    Returns 0 (an int, not an expression) when the strength is zero, so
    callers can skip adding the term.
    """
    strength = self.strength.value()
    if strength == 0:
        return 0
    terms = []
    for step, logsoftmax in enumerate(logsoftmaxes):
        term = dy.cmult(dy.exp(logsoftmax), logsoftmax)
        if mask is not None:
            # Zero out masked batch elements at this step.
            term = dy.cmult(dy.inputTensor(mask[step], batched=True), term)
        terms.append(term)
    return strength * dy.sum_elems(dy.esum(terms))
def loss_function(recon_x, x, mu, logvar):
    """VAE loss: binary reconstruction loss plus KL term against N(0, I)."""
    # Matches torch binary_cross_entropy(..., size_average=False).
    bce = dy.binary_log_loss(recon_x, x)
    # See Appendix B of Kingma & Welling, "Auto-Encoding Variational Bayes",
    # ICLR 2014 (https://arxiv.org/abs/1312.6114):
    #   KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    mu_sq = dy.pow(mu, dy.scalarInput(2))
    kld = -0.5 * dy.sum_elems(1 + logvar - mu_sq - dy.exp(logvar))
    return bce + kld
def test_layer_norm(self):
    """dy.layer_norm(x, g, b) must equal g/std(x) * (x - mean(x)) + b."""
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    gain = dy.inputTensor(self.v2)
    bias = dy.inputTensor(self.v3)
    y = dy.layer_norm(x, gain, bias)
    l = dy.sum_elems(y)
    l_value = l.scalar_value()  # forces the forward pass (value itself unused)
    l.backward()
    # NumPy reference computation of layer normalization.
    expected = self.v2 / self.v1.std() * (self.v1 - self.v1.mean()) + self.v3
    self.assertTrue(np.allclose(y.npvalue(), expected))
def test_layer_norm(self):
    """Compare dy.layer_norm against a NumPy reference computation."""
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    g = dy.inputTensor(self.v2)
    b = dy.inputTensor(self.v3)
    out = dy.layer_norm(x, g, b)
    # Backward through a scalar sum exercises the gradient path.
    dy.sum_elems(out).backward()
    centered = self.v1 - self.v1.mean()
    expected = self.v2 / self.v1.std() * centered + self.v3
    self.assertTrue(np.allclose(out.npvalue(), expected))
def calc_loss(sents):
    """Hinge loss over all-pairs dot-product similarity for a minibatch.

    Correct (src, trg) pairs lie on the diagonal of the similarity matrix.
    """
    dy.renew_cg()
    # New initial states for the bidirectional encoders of both sides.
    fwd_src_state = LSTM_SRC_FWD.initial_state()
    bwd_src_state = LSTM_SRC_BWD.initial_state()
    fwd_trg_state = LSTM_TRG_FWD.initial_state()
    bwd_trg_state = LSTM_TRG_BWD.initial_state()
    # Encoding
    src_vecs = encode_sents(LOOKUP_SRC, fwd_src_state, bwd_src_state,
                            [pair[0] for pair in sents])
    trg_vecs = encode_sents(LOOKUP_TRG, fwd_trg_state, bwd_trg_state,
                            [pair[1] for pair in sents])
    # All-pairs similarity in a single matrix product.
    scores = dy.transpose(dy.concatenate_cols(src_vecs)) * dy.concatenate_cols(trg_vecs)
    # Hinge over dimension 1 with the diagonal entries as gold labels.
    per_row = dy.hinge_dim(scores, list(range(len(sents))), d=1)
    return dy.sum_elems(per_row)