def eval(self):
  if self.src_data is None:
    self.src_data, self.ref_data, self.src_batches, self.ref_batches = \
      xnmt.input_reader.read_parallel_corpus(self.model.src_reader, self.model.trg_reader,
                                             self.src_file, self.ref_file, batcher=self.batcher,
                                             max_src_len=self.max_src_len, max_trg_len=self.max_trg_len)
  loss_val = LossScalarBuilder()
  ref_words_cnt = 0
  for src, trg in zip(self.src_batches, self.ref_batches):
    dy.renew_cg(immediate_compute=settings.IMMEDIATE_COMPUTE, check_validity=settings.CHECK_VALIDITY)
    loss_builder = LossBuilder()
    standard_loss = self.model.calc_loss(src, trg, self.loss_calculator)
    additional_loss = self.model.calc_additional_loss(standard_loss)
    loss_builder.add_loss("standard_loss", standard_loss)
    loss_builder.add_loss("additional_loss", additional_loss)
    ref_words_cnt += self.model.trg_reader.count_words(trg)
    loss_val += loss_builder.get_loss_stats()
  loss_stats = {k: v / ref_words_cnt for k, v in loss_val.items()}
  try:
    return LossScore(loss_stats[self.model.get_primary_loss()],
                     loss_stats=loss_stats, desc=self.desc), ref_words_cnt
  except KeyError:
    raise RuntimeError("Did you wrap your loss calculation with LossBuilder({'primary_loss': loss_value})?")
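# A minimal, self-contained sketch (hypothetical helper, not an xnmt API) of the
# normalization done at the end of eval() above: every summed loss component is
# divided by the total reference word count, so scores stay comparable across
# evaluation sets of different sizes.
def per_word_loss_stats(loss_sums, ref_words_cnt):
  """Turn summed loss values into per-word averages."""
  return {name: total / ref_words_cnt for name, total in loss_sums.items()}

# Example: a summed NLL of 2500 over 1000 reference words gives 2.5 per word.
assert per_word_loss_stats({"standard_loss": 2500.0}, 1000)["standard_loss"] == 2.5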
def on_calc_additional_loss(self, reward):
  if not self.learn_segmentation:
    return None
  ret = LossBuilder()
  if self.length_prior_alpha > 0:
    reward += self.segment_length_prior * self.length_prior_alpha
  reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward))
  # Baseline Loss
  if self.use_baseline:
    baseline_loss = []
    for i, baseline in enumerate(self.bs):
      baseline_loss.append(dy.squared_distance(reward, baseline))
    ret.add_loss("Baseline", dy.esum(baseline_loss))
  # Reinforce Loss
  lmbd = self.lmbd.get_value(self.warmup_counter)
  if lmbd > 0.0:
    reinforce_loss = []
    # Calculate the REINFORCE term for every segmentation decision
    for i in range(len(self.segment_decisions)):
      ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
      if self.use_baseline:
        r_i = reward - self.bs[i]
      else:
        r_i = reward
      reinforce_loss.append(dy.logistic(r_i) * ll)
    ret.add_loss("Reinforce", -dy.esum(reinforce_loss) * lmbd)
  # Total Loss
  return ret
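# A runnable sketch (plain DyNet, fabricated reward values) of the z-score
# normalization applied to the batched reward above: centering by the batch
# mean and dividing by the batch standard deviation keeps the scale of the
# REINFORCE term stable from minibatch to minibatch.
import dynet as dy

dy.renew_cg()
reward = dy.inputTensor([1.0, 2.0, 3.0, 4.0], batched=True)  # one reward per batch item
normalized = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward))
print(normalized.npvalue())  # approximately [-1.34, -0.45, 0.45, 1.34]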
def training_step(self, src, trg):
  """
  Performs the forward pass and collects losses for the given minibatch;
  the caller is responsible for the backward pass and parameter update.
  """
  loss_builder = LossBuilder()
  standard_loss = self.model.calc_loss(src, trg, self.loss_calculator)
  additional_loss = self.model.calc_additional_loss(standard_loss)
  loss_builder.add_loss("standard_loss", standard_loss)
  loss_builder.add_loss("additional_loss", additional_loss)
  return loss_builder
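# A minimal sketch of the surrounding training cycle a caller of training_step()
# is expected to drive, using plain DyNet and a hypothetical toy model rather
# than xnmt's Translator: one fresh graph per minibatch, a scalar loss from the
# forward pass, then backward() and a trainer update.
import dynet as dy

pc = dy.ParameterCollection()
W = pc.add_parameters((1, 2))
trainer = dy.SimpleSGDTrainer(pc)

dy.renew_cg()                                # fresh computation graph per minibatch
x = dy.inputVector([1.0, -2.0])
loss = dy.squared_norm(dy.parameter(W) * x)  # toy forward pass producing a scalar
loss.backward()                              # backward pass
trainer.update()                             # parameter update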
def compute_dev_loss(self):
  loss_builder = LossBuilder()
  trg_words_cnt = 0
  for src, trg in zip(self.dev_src, self.dev_trg):
    dy.renew_cg()
    standard_loss = self.model.calc_loss(src, trg)
    loss_builder.add_loss("loss", standard_loss)
    trg_words_cnt += self.logger.count_trg_words(trg)
    loss_builder.compute()
  return trg_words_cnt, LossScore(loss_builder.sum() / trg_words_cnt)
def calc_loss(self, src, trg, loss_calculator):
  sub_losses = collections.defaultdict(list)
  for model in self.models:
    for loss_name, loss in model.calc_loss(src, trg, loss_calculator).loss_values.items():
      sub_losses[loss_name].append(loss)
  model_loss = LossBuilder()
  for loss_name, losslist in sub_losses.items():
    # TODO: dy.average(losslist) _or_ dy.esum(losslist) / len(self.models)?
    # -- might not be the same if not all models return all losses
    model_loss.add_loss(loss_name, dy.average(losslist))
  return model_loss
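# The TODO above asks whether dy.average(losslist) and
# dy.esum(losslist) / len(self.models) agree. A quick check with fabricated
# scalar losses: they coincide exactly when every ensemble member contributes
# the loss component, and diverge only when some models omit it, because the
# divisor then differs.
import dynet as dy

dy.renew_cg()
losses = [dy.scalarInput(v) for v in (1.0, 2.0, 4.0)]
avg = dy.average(losses)
manual = dy.esum(losses) / len(losses)
assert abs(avg.value() - manual.value()) < 1e-6  # both equal 7/3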
def training_step(self, src, trg):
  """
  Performs the forward pass for the given minibatch and returns the computed
  loss; the caller drives the backward pass and parameter update.
  """
  loss_builder = LossBuilder()
  standard_loss = self.model.calc_loss(src, trg, self.loss_calculator)
  additional_loss = self.model.calc_additional_loss(standard_loss)
  loss_builder.add_loss("standard_loss", standard_loss)
  loss_builder.add_loss("additional_loss", additional_loss)
  loss_value = loss_builder.compute()
  self.logger.update_epoch_loss(src, trg, loss_builder.get_loss_stats())
  self.logger.report_train_process()
  return loss_value
def calc_loss(self, src, trg, loss_calculator):
  self.start_sent(src)
  embeddings = self.src_embedder.embed_sent(src)
  encodings = self.encoder(embeddings)
  self.attender.init_sent(encodings)
  # Initialize the hidden state from the encoder
  ss = mark_as_batch([Vocab.SS] * len(src)) if is_batched(src) else Vocab.SS
  dec_state = self.decoder.initial_state(self.encoder.get_final_states(), self.trg_embedder.embed(ss))
  # Compose losses
  model_loss = LossBuilder()
  model_loss.add_loss("mle", loss_calculator(self, dec_state, src, trg))
  if self.calc_global_fertility or self.calc_attention_entropy:
    # philip30: I assume that attention_vecs is already masked on the source side.
    # Now apply the mask to the target.
    masked_attn = self.attender.attention_vecs
    if trg.mask is not None:
      trg_mask = trg.mask.get_active_one_mask().transpose()
      masked_attn = [dy.cmult(attn, dy.inputTensor(mask, batched=True))
                     for attn, mask in zip(masked_attn, trg_mask)]
    if self.calc_global_fertility:
      model_loss.add_loss("fertility", self.global_fertility(masked_attn))
    if self.calc_attention_entropy:
      model_loss.add_loss("H(attn)", self.attention_entropy(masked_attn))
  return model_loss
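# A sketch of the target-side attention masking above, with made-up shapes:
# each attention vector holds one column per batch item, and multiplying by a
# 0/1 mask zeroes the columns of sentences that are already past their last
# real target token, so the fertility and entropy terms ignore padding.
import dynet as dy
import numpy as np

dy.renew_cg()
src_len, batch_size = 3, 2
attn = dy.inputTensor(np.ones((src_len, batch_size)), batched=True)
step_mask = np.array([[1.0] * src_len, [0.0] * src_len]).T  # sentence 2 is finished at this step
masked = dy.cmult(attn, dy.inputTensor(step_mask, batched=True))
print(masked.npvalue())  # the finished sentence's column is all zeros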
def training_step(self, src, trg):
  """
  Performs the forward pass for the given minibatch and returns the computed
  loss; the caller drives the backward pass and parameter update.
  """
  loss_builder = LossBuilder()
  standard_loss = self.model.calc_loss(src, trg, self.loss_calculator)
  if isinstance(standard_loss, LossBuilder):
    loss = None
    for loss_name, loss_expr in standard_loss.loss_nodes:
      loss_builder.add_loss(loss_name, loss_expr)
      loss = loss_expr if loss is None else loss + loss_expr
    standard_loss = loss
  else:
    loss_builder.add_loss("loss", standard_loss)
  additional_loss = self.model.calc_additional_loss(dy.nobackprop(-standard_loss))
  if additional_loss is not None:
    loss_builder.add_loss("additional_loss", additional_loss)
  loss_value = loss_builder.compute()
  self.logger.update_epoch_loss(src, trg, loss_builder)
  self.logger.report_train_process()
  return loss_value
def __call__(self, translator, dec_state, src, trg):
  # TODO: apply trg.mask ?
  samples = []
  logsofts = []
  self.bs = []
  done = [False for _ in range(len(trg))]
  for _ in range(self.sample_length):
    dec_state.context = translator.attender.calc_context(dec_state.rnn_state.output())
    if self.use_baseline:
      h_t = dy.tanh(translator.decoder.context_projector(
          dy.concatenate([dec_state.rnn_state.output(), dec_state.context])))
      self.bs.append(self.baseline(dy.nobackprop(h_t)))
    logsoft = dy.log_softmax(translator.decoder.get_scores(dec_state))
    sample = logsoft.tensor_value().categorical_sample_log_prob().as_numpy()[0]
    # Keep track of previously sampled EOS
    sample = [sample_i if not done_i else Vocab.ES for sample_i, done_i in zip(sample, done)]
    # Append and feed back into the decoder
    logsoft = dy.pick_batch(logsoft, sample)
    logsofts.append(logsoft)
    samples.append(sample)
    dec_state = translator.decoder.add_input(
        dec_state, translator.trg_embedder.embed(xnmt.batcher.mark_as_batch(sample)))
    # Check if we are done.
    done = list(six.moves.map(lambda x: x == Vocab.ES, sample))
    if all(done):
      break
  samples = np.stack(samples, axis=1).tolist()
  self.eval_score = []
  for trg_i, sample_i in zip(trg, samples):
    # Remove EOS
    try:
      idx = sample_i.index(Vocab.ES)
      sample_i = sample_i[:idx]
    except ValueError:
      pass
    try:
      idx = trg_i.words.index(Vocab.ES)
      trg_i.words = trg_i.words[:idx]
    except ValueError:
      pass
    # Calculate the evaluation score
    score = 0 if not len(sample_i) else self.evaluation_metric.evaluate_fast(trg_i.words, sample_i)
    self.eval_score.append(score)
  self.true_score = dy.inputTensor(self.eval_score, batched=True)
  loss = LossBuilder()
  if self.use_baseline:
    for i, (score, _) in enumerate(zip(self.bs, logsofts)):
      logsofts[i] = dy.cmult(logsofts[i], score - self.true_score)
    loss.add_loss("Reinforce", dy.sum_elems(dy.esum(logsofts)))
  else:
    loss.add_loss("Reinforce", dy.sum_elems(dy.cmult(-self.true_score, dy.esum(logsofts))))
  if self.use_baseline:
    baseline_loss = []
    for bs in self.bs:
      baseline_loss.append(dy.squared_distance(self.true_score, bs))
    loss.add_loss("Baseline", dy.sum_elems(dy.esum(baseline_loss)))
  return loss
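# A compact, self-contained sketch of the REINFORCE-with-baseline estimator
# assembled above, with fabricated scores and log-probabilities: the policy
# term weights each sampled step's log-probability by (baseline - true_score),
# and the baseline itself is regressed toward the true score with a
# squared-distance loss, the two components added to the LossBuilder.
import dynet as dy

dy.renew_cg()
true_score = dy.inputTensor([0.7, 0.2], batched=True)      # e.g. per-sentence BLEU
bs_score = dy.inputTensor([0.5, 0.5], batched=True)        # baseline prediction
step_logprob = dy.inputTensor([-1.2, -0.8], batched=True)  # log p of the sampled word

reinforce = dy.cmult(step_logprob, bs_score - true_score)
baseline_mse = dy.squared_distance(true_score, bs_score)
total = dy.sum_batches(reinforce + baseline_mse)           # scalar training signal
print(total.value())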
def on_calc_additional_loss(self, translator_loss):
  if not self.learn_segmentation or self.segment_decisions is None:
    return None
  reward = -translator_loss["mle"]
  if not self.log_reward:
    reward = dy.exp(reward)
  reward = dy.nobackprop(reward)
  # Make sure that the reward is batched (one value per batch item), not a scalar
  assert reward.dim()[1] == len(self.src_sent)
  # Mask
  enc_mask = self.enc_mask.get_active_one_mask().transpose() if self.enc_mask is not None else None
  # Compose the loss
  ret = LossBuilder()
  ## Length prior
  alpha = self.length_prior_alpha.value() if self.length_prior_alpha is not None else 0
  if alpha > 0:
    reward += self.segment_length_prior * alpha
  ## Reward z-score normalization
  if self.z_normalization:
    reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward) + EPS)
  ## Baseline Loss
  if self.use_baseline:
    baseline_loss = []
    for i, baseline in enumerate(self.bs):
      loss = dy.squared_distance(reward, baseline)
      if enc_mask is not None:
        loss = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), loss)
      baseline_loss.append(loss)
    ret.add_loss("Baseline", dy.esum(baseline_loss))
    if self.print_sample:
      print(dy.exp(self.segment_logsoftmaxes[i]).npvalue().transpose()[0])
  ## Reinforce Loss
  lmbd = self.lmbd.value()
  if lmbd > 0.0:
    reinforce_loss = []
    # Calculate the REINFORCE term for every segmentation decision
    for i in range(len(self.segment_decisions)):
      ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
      if self.use_baseline:
        r_i = reward - dy.nobackprop(self.bs[i])
      else:
        r_i = reward
      if enc_mask is not None:
        ll = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), ll)
      reinforce_loss.append(r_i * -ll)
    ret.add_loss("Reinforce", dy.esum(reinforce_loss) * lmbd)
  if self.confidence_penalty:
    ls_loss = self.confidence_penalty(self.segment_logsoftmaxes, enc_mask)
    ret.add_loss("Confidence Penalty", ls_loss)
  # Total Loss
  return ret
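# The confidence_penalty hook above receives the segmentation log-softmaxes;
# a common formulation (assumed here, not verified against xnmt's exact
# implementation) adds the negative entropy sum_i p_i * log p_i to the loss,
# penalizing overly peaked segmentation distributions.
import dynet as dy

dy.renew_cg()
logits = dy.inputVector([2.0, 0.5, -1.0])
log_p = dy.log_softmax(logits)
neg_entropy = dy.sum_elems(dy.cmult(dy.exp(log_p), log_p))  # = -H(p), larger when p is peaked
print(neg_entropy.value())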
def one_epoch(self, update_weights=True):
  """
  :param update_weights: Whether to perform backward pass & update weights (useful for debugging)
  """
  self.logger.new_epoch()
  if self.args["reload_command"] is not None:
    self._augment_data_next_epoch()
  self.model.set_train(update_weights)
  order = list(range(0, len(self.train_src)))
  np.random.shuffle(order)
  for batch_num in order:
    src = self.train_src[batch_num]
    trg = self.train_trg[batch_num]
    # Loss calculation
    dy.renew_cg()
    loss_builder = LossBuilder()
    standard_loss = self.model.calc_loss(src, trg)
    if isinstance(standard_loss, LossBuilder):
      loss = None
      for loss_name, loss_expr in standard_loss.loss_nodes:
        loss_builder.add_loss(loss_name, loss_expr)
        loss = loss_expr if loss is None else loss + loss_expr
      standard_loss = loss
    else:
      loss_builder.add_loss("loss", standard_loss)
    additional_loss = self.model.calc_additional_loss(dy.nobackprop(-standard_loss))
    if additional_loss is not None:
      loss_builder.add_loss("additional_loss", additional_loss)
    # Log the loss sum
    loss_value = loss_builder.compute()
    self.logger.update_epoch_loss(src, trg, loss_builder)
    if update_weights:
      loss_value.backward()
      self.trainer.update()
    # Devel reporting
    self.logger.report_train_process()
    if self.logger.should_report_dev():
      self.dev_evaluation()
  self.model.new_epoch()
def __call__(self, translator, initial_state, src, trg):
  # TODO(philip30): currently only using the best hypothesis / first sample for the reinforce loss.
  # Some further implementation is needed to do reinforce with multiple samples.
  search_output = translator.search_strategy.generate_output(translator, initial_state)[0]
  # Calculate evaluation scores
  self.eval_score = []
  for trg_i, sample_i in zip(trg, search_output.word_ids):
    # Remove EOS
    sample_i = self.remove_eos(sample_i.tolist())
    ref_i = self.remove_eos(trg_i.words)
    # Evaluate
    if len(sample_i) == 0:
      score = 0
    else:
      score = self.evaluation_metric.evaluate(ref_i, sample_i) * (-1 if self.inv_eval else 1)
    self.eval_score.append(score)
  self.true_score = dy.inputTensor(self.eval_score, batched=True)
  # Compose losses
  loss = LossBuilder()
  if self.use_baseline:
    baseline_loss = []
    losses = []
    for state, logsoft, mask in zip(search_output.state, search_output.logsoftmaxes, search_output.mask):
      bs_score = self.baseline(state)
      baseline_loss.append(dy.squared_distance(self.true_score, bs_score))
      loss_i = dy.cmult(logsoft, self.true_score - bs_score)
      losses.append(dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
    loss.add_loss("reinforce", dy.sum_elems(dy.esum(losses)))
    loss.add_loss("reinf_baseline", dy.sum_elems(dy.esum(baseline_loss)))
  else:
    loss.add_loss("reinforce",
                  dy.sum_elems(dy.cmult(self.true_score, dy.esum(search_output.logsoftmaxes))))
  return loss
def calc_loss(self, src, trg, loss_calculator):
  self.start_sent(src)
  tokens = [x[0] for x in src]
  transitions = [x[1] for x in src]
  print("Current Batch: " + str(len(tokens)) + " pairs.\n")
  tokens = xnmt.batcher.mark_as_batch(tokens)
  embeddings = self.src_embedder.embed_sent(tokens)
  encodings = self.encoder(embeddings, transitions)
  self.attender.init_sent(encodings)
  # Initialize the hidden state from the encoder
  ss = mark_as_batch([Vocab.SS] * len(tokens)) if xnmt.batcher.is_batched(src) else Vocab.SS
  dec_state = self.decoder.initial_state(self.encoder._final_states, self.trg_embedder.embed(ss))
  # Compose losses
  model_loss = LossBuilder()
  loss, wer = loss_calculator(self, dec_state, src, trg)
  model_loss.add_loss("mle", loss)
  print("wer_b: " + str(wer))
  if self.calc_global_fertility or self.calc_attention_entropy:
    # philip30: I assume that attention_vecs is already masked on the source side.
    # Now apply the mask to the target.
    masked_attn = self.attender.attention_vecs
    if trg.mask is not None:
      trg_mask = trg.mask.get_active_one_mask().transpose()
      masked_attn = [dy.cmult(attn, dy.inputTensor(mask, batched=True))
                     for attn, mask in zip(masked_attn, trg_mask)]
    if self.calc_global_fertility:
      model_loss.add_loss("fertility", self.global_fertility(masked_attn))
    if self.calc_attention_entropy:
      model_loss.add_loss("H(attn)", self.attention_entropy(masked_attn))
  return model_loss