Example #1
    def calculate_metric(self):
        total_exact_match = 0
        total_f1 = 0.0
        num_samples = len(self.all_targets)

        trg_vocab = self.tensorizers["trg_seq_tokens"].vocab
        bleu_scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=trg_vocab.get_pad_index(),
                eos=trg_vocab.get_eos_index(),
                unk=trg_vocab.get_unk_index(),
            ))

        for (beam_preds, target) in zip(self.all_preds, self.all_targets):
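            # beam_preds holds the n-best hypotheses for one sample; only the
            # top beam hypothesis is scored.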
            pred = beam_preds[0]
            if self._compare_target_prediction_tokens(pred, target):
                total_exact_match += 1
            total_f1 += compute_f1(pred, target)
            # BLEU metric calculation must be done with tensors on CPU, or the
            # type checks in fairseq/bleu.py:add() will fail
            bleu_scorer.add(
                torch.IntTensor(target).cpu(),
                torch.IntTensor(pred).cpu())

        loss = self.calculate_loss()
        exact_match = round(
            safe_division(total_exact_match, num_samples) * 100.0, 2)
        f1 = round(safe_division(total_f1, num_samples) * 100.0, 2)
        bleu_score = round(
            0.0 if len(self.all_preds) == 0 else bleu_scorer.score(), 2)

        return Seq2SeqMetrics(loss, exact_match, f1, bleu_score)
Example #2
def main():
    parser = argparse.ArgumentParser(
        description=("Rescore generated hypotheses with extra models"))
    add_args(parser)
    add_args_rescore(parser)
    args = parser.parse_args()

    assert (args.translation_info_export_path is not None
            ), "--translation_info_export_path is required for rescoring"

    assert args.l2r_model_path is not None, "Rescoring needs forward model"

    _, _, forward_task = utils.load_diverse_ensemble_for_inference(
        [args.l2r_model_path])
    rescorer = Rescorer(args, forward_task)
    dst_dict = forward_task.tgt_dict
    base_bleu_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                                   dst_dict.unk())
    rescoring_bleu_scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dst_dict.pad(),
            eos=dst_dict.eos(),
            unk=dst_dict.unk(),
        ))

    with open(args.translation_info_export_path, "rb") as file:
        translation_info_list = pickle.load(file)

    scores_to_export_list = []
    trans_batch_info = []
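    # Rescore the pickled translation info in batches: hypothesis tokens are
    # moved to GPU for the rescoring models and back to CPU for BLEU scoring.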
    for k in tqdm(range(0, len(translation_info_list), args.batch_size)):
        trans_batch_info = translation_info_list[k:k + args.batch_size]
        for j in range(len(trans_batch_info)):
            trans_batch_info[j]["hypos"] = [{
                "score": hypo["score"],
                "tokens": hypo["tokens"].cuda()
            } for hypo in trans_batch_info[j]["hypos"]]
        top_tokens, scores_to_export = find_top_tokens(args, trans_batch_info,
                                                       rescorer,
                                                       dst_dict.pad())
        if args.scores_info_export_path is not None:
            scores_to_export_list += scores_to_export

        for i, trans_info in enumerate(trans_batch_info):
            base_bleu_scorer.add(
                trans_info["target_tokens"].int().cpu(),
                trans_info["hypos"][0]["tokens"].int().cpu(),
            )
            rescoring_bleu_scorer.add(trans_info["target_tokens"].int().cpu(),
                                      top_tokens[i].int().cpu())
        trans_batch_info = []

    print("| Base ", base_bleu_scorer.result_string())
    print("| Rescoring ", rescoring_bleu_scorer.result_string())

    if args.scores_info_export_path is not None:
        with open(args.scores_info_export_path, "wb") as file:
            pickle.dump(scores_to_export_list, file)
Example #3
def build_scorer(choice, tgt_dict):
    _choice = choice._name if isinstance(choice, DictConfig) else choice

    if _choice == "bleu":
        from fairseq.scoring import bleu

        return bleu.Scorer(
            bleu.BleuConfig(pad=tgt_dict.pad(), eos=tgt_dict.eos(), unk=tgt_dict.unk())
        )
    return _build_scorer(choice)
Example #4
    def __init__(self, args, src_dict, dst_dict):
        super().__init__(args, src_dict, dst_dict)
        self.translator = None
        self.scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=dst_dict.pad(),
                eos=dst_dict.eos(),
                unk=dst_dict.unk(),
            )
        )
Example #5
def score(fdsys):
    with open(args.ref) as fdref:
        scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=dict.pad(),
                eos=dict.eos(),
                unk=dict.unk(),
            )
        )
        for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
            sys_tok = dict.encode_line(sys_tok)
            ref_tok = dict.encode_line(ref_tok)
            scorer.add(ref_tok, sys_tok)
        print(scorer.result_string(args.order))
Example #6
def score(fdsys):
    with open(args.ref) as fdref:
        scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=dict.pad(),
                eos=dict.eos(),
                unk=dict.unk(),
            ))
        for i, (sys_tok, ref_tok) in enumerate(
                zip(readlines(fdsys), readlines(fdref))):
            scorer.reset(one_init=True)
            sys_tok = dict.encode_line(sys_tok)
            ref_tok = dict.encode_line(ref_tok)
            scorer.add(ref_tok, sys_tok)
            print(i, scorer.result_string(args.order))
Example #7
def evaluate_weights(scores_info, feature_weights, length_penalty):
    scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=vocab_constants.PAD_ID,
            eos=vocab_constants.EOS_ID,
            unk=vocab_constants.UNK_ID,
        ))

    for example in scores_info:
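        # Linearly combine the per-feature scores with the candidate weights,
        # then normalize by the target length raised to the length penalty.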
        weighted_scores = (example["scores"] * feature_weights).sum(axis=1)
        weighted_scores /= (example["tgt_len"]**length_penalty) + 1e-12
        top_hypo_ind = np.argmax(weighted_scores)
        top_hypo = example["hypos"][top_hypo_ind]
        ref = example["target_tokens"]
        scorer.add(torch.IntTensor(ref), torch.IntTensor(top_hypo))

    return scorer.score()
Example #8
def smoothed_sentence_bleu(task, target_tokens, hypo_tokens):
    """
    Implements "Smoothing 3" method from Chen and Cherry. "A Systematic
    Comparison of Smoothing Techniques for Sentence-Level BLEU".
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
    """
    dst_dict = task.target_dictionary
    scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dst_dict.pad(),
            eos=dst_dict.eos(),
            unk=dst_dict.unk(),
        )
    )
    scorer.add(target_tokens, hypo_tokens)

    invcnt = 1
    ratios = []
    for (match, count) in [
        (scorer.stat.match1, scorer.stat.count1),
        (scorer.stat.match2, scorer.stat.count2),
        (scorer.stat.match3, scorer.stat.count3),
        (scorer.stat.match4, scorer.stat.count4),
    ]:
        if count == 0:
            # disregard n-grams for values of n larger than hypothesis length
            continue
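        # Smoothing 3: the k-th n-gram order with zero matches receives a
        # pseudo-count of 1 / 2**k instead of zero.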
        if match == 0:
            invcnt *= 2
            match = 1.0 / invcnt
        ratios.append(match / count)

    brevity_penalty = np.min(
        [1, np.exp(1 - (scorer.stat.reflen / scorer.stat.predlen))]
    )
    geometric_mean = np.exp(np.log(ratios).mean())
    smoothed_bleu = brevity_penalty * geometric_mean
    return smoothed_bleu
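For reference, the smoothed score computed above can be written as

    \mathrm{BLEU}_{\mathrm{smooth}}
        = \min\bigl(1,\, e^{1 - r/p}\bigr)
          \cdot \exp\Bigl(\tfrac{1}{|N|} \sum_{n \in N} \log \tfrac{m_n}{c_n}\Bigr),

where $r$ and $p$ are the reference and prediction lengths, $N$ is the set of
n-gram orders (up to 4) with a nonzero candidate count $c_n$, and the $k$-th
order with zero matches contributes $m_n = 1/2^k$.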
Example #9
    def calculate_metric(self):
        num_correct = 0
        total_count = len(self.all_targets)
        trg_vocab = self.tensorizers["trg_seq_tokens"].vocab
        bleu_scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=trg_vocab.get_pad_index(),
                eos=trg_vocab.get_eos_index(),
                unk=trg_vocab.get_unk_index(),
            )
        )
        for beam_pred, target in zip(self.all_preds, self.all_targets):
            pred = beam_pred[0]
            if self._compare_target_prediction_tokens(pred, target):
                num_correct = num_correct + 1
            # BLEU metric calculation must be done with tensors on CPU, or the
            # type checks in fairseq/bleu.py:add() will fail
            bleu_scorer.add(torch.IntTensor(target).cpu(), torch.IntTensor(pred).cpu())

        bleu_score = 0.0 if len(self.all_preds) == 0 else bleu_scorer.score()
        accuracy = safe_division(num_correct, total_count)
        cross_entropy_loss = self.calculate_loss()
        return Seq2SeqMetrics(accuracy, cross_entropy_loss, bleu_score)
Example #10
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      write_hypos, normalize):

    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c)
    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(
        args)
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dict.pad(),
            eos=dict.eos(),
            unk=dict.unk(),
        ))

    ordered_hypos = {}
    ordered_targets = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(
                a,
                b,
                c,
                target_len,
                bitext1.rescore_score[i],
                bitext2_score,
                lm_score=lm_score,
                lenpen=lenpen,
                src_len=bitext1.source_lengths[i],
                tgt_len=bitext1.target_lengths[i],
                bitext1_backwards=bitext1.backwards,
                bitext2_backwards=bitext2_backwards,
                normalize=normalize,
            )

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]

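            # Hypotheses arrive grouped per source sentence; once the whole
            # n-best block (or args.num_rescore hypotheses) has been scored,
            # keep the best-scoring hypothesis and reset for the next sentence.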
            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])

                best_score = -math.inf
                best_hypo = ""
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[
                    gen_keys[key]], ("pred and rescore hypo mismatch: i: " +
                                     str(key) + ", " + str(hypo_lst[key]) +
                                     str(gen_keys[key]) +
                                     str(gen_output.no_bpe_hypo[key]))
                sys_tok = dict.encode_line(hypo_lst[key])
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = dict.encode_line(full_hypo)
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

        # if only one set of hyperparameters is provided, write the predictions to a file
        if write_hypos:
            # recover the original ids from the n-best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[
                        gen_keys[key]], ("pred and rescore hypo mismatch:" +
                                         "i:" + str(key) + str(hypo_lst[key]) +
                                         str(gen_output.no_bpe_hypo[key]))
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

    # write the hypos in the original order from nbest list generation
    if args.num_shards == (len(bitext1_lst)):
        with open(target_outfile, "w") as t:
            with open(hypo_outfile, "w") as h:
                for key in range(len(ordered_hypos)):
                    t.write(ordered_targets[key])
                    h.write(ordered_hypos[key])

    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    score = rerank_utils.parse_bleu_scoring(res)
    return score
Example #11
def _generate_score(models, args, task, dataset, modify_target_dict):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(
            args.path.split(CHECKPOINT_PATHS_DELIMITER))))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=True,
        )

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print("seed number is" + str(args.max_examples_to_evaluate_seed))
    if args.max_examples_to_evaluate > 0:
        pytorch_translate_data.subsample_pair_dataset(
            dataset, args.max_examples_to_evaluate,
            args.max_examples_to_evaluate_seed)

    # Keep track of translations: initialize with empty translations and
    # zero probability scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)
    hypos_list = []

    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer(bleu.SacrebleuConfig())
    else:
        scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=dst_dict.pad(),
                eos=dst_dict.eos(),
                unk=dst_dict.unk(),
            ))

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.batch_size,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=dst_dict.pad(),
                eos=dst_dict.eos(),
                unk=dst_dict.unk(),
            ))

    rescorer = None
    num_sentences = 0
    translation_samples = []
    translation_info_list = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual_many_to_one(args) else 0,
        )

        for trans_info in _iter_translations(args, task, dataset, translations,
                                             align_dict, rescorer,
                                             modify_target_dict):
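            # SacrebleuScorer consumes detokenized strings via add_string();
            # the token-level bleu.Scorer consumes integer token tensors.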
            if hasattr(scorer, "add_string"):
                scorer.add_string(trans_info.target_str, trans_info.hypo_str)
            else:
                scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens,
                                  trans_info.best_hypo_tokens)

            if getattr(args, "translation_output_file", False):
                translated_sentences[
                    trans_info.sample_id] = trans_info.hypo_str
            if getattr(args, "translation_probs_file", False):
                translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if getattr(args, "hypotheses_export_path", False):
                hypos_list.append(trans_info.hypos)
            if collect_output_hypos:
                output_hypos_token_arrays[
                    trans_info.sample_id] = trans_info.best_hypo_tokens
            if args.translation_info_export_path is not None:
                # Strip expensive data from hypotheses before saving
                hypos = [{
                    k: v
                    for k, v in hypo.items() if k in ["tokens", "score"]
                } for hypo in trans_info.hypos]
                # Make sure everything is on cpu before exporting
                hypos = [{
                    "score": hypo["score"],
                    "tokens": hypo["tokens"].cpu()
                } for hypo in hypos]
                translation_info_list.append({
                    "src_tokens":
                    trans_info.src_tokens.cpu(),
                    "target_tokens":
                    trans_info.target_tokens,
                    "hypos":
                    hypos,
                })
            translation_samples.append(
                collections.OrderedDict({
                    "sample_id":
                    trans_info.sample_id.item(),
                    "src_str":
                    trans_info.src_str,
                    "target_str":
                    trans_info.target_str,
                    "hypo_str":
                    trans_info.hypo_str,
                }))
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryIndexedDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)
    if args.output_source_binary_path:
        dataset.src.save(args.output_source_binary_path)
    if args.translation_info_export_path is not None:
        f = open(args.translation_info_export_path, "wb")
        pickle.dump(translation_info_list, f)
        f.close()

    # If applicable, save the translations and scores to the output files
    # These two outputs are used in dual learning for weighted backtranslation
    if getattr(args, "translation_output_file", False) and getattr(
            args, "translation_probs_file", False):
        with open(args.translation_output_file,
                  "w") as translation_file, open(args.translation_probs_file,
                                                 "w") as score_file:
            for hypo_str, hypo_score in zip(translated_sentences,
                                            translated_scores):
                if len(hypo_str.strip()) > 0:
                    print(hypo_str, file=translation_file)
                    print(np.exp(hypo_score), file=score_file)

    # e.g. for external evaluation
    if getattr(args, "hypotheses_export_path", False):
        with open(args.hypotheses_export_path, "w") as out_file:
            for hypos in hypos_list:
                for hypo in hypos:
                    print(
                        task.tgt_dict.string(hypo["tokens"],
                                             bpe_symbol=args.post_process),
                        file=out_file,
                    )

    if oracle_scorer is not None:
        print(
            f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}"
        )

    return scorer, num_sentences, gen_timer, translation_samples
Example #12
    def forward(
        self,
        sample,
        forward_model,
        forward_optimizer,
        tgt_dict,
        backward_model,
        backward_optimizer,
        src_dict,
        lm_scorer=None,
        reduce=True,
        **generate_kwargs,
    ):
        """Compute the reconstruction and LM loss from forward and backward
        models.

        Args:
            sample: original input.
            hypos: pseudo labels generated by the forward model. They are used
                as an approximation of the target space for importance sampling.
            forward_model: the model used to generate pseudo labels.
            backward_model: the model used to reconstruct the original input
                from pseudo labels.
            lm_scorer: an LM model in eval mode used to score pseudo labels in
                the target space.
        """
        # Generate translations
        nbest_translations = self._generate_translation(
            forward_model, tgt_dict, sample, self.args.beam, **generate_kwargs)

        forward_samples = []
        backward_samples = {}
        # TODO (T36875783): load pretrained lm to score
        lm_score = 0.0
        for sample_id, src_processed, tgt_hypos in nbest_translations:
            # compute each model's reward
            forward_reward = lm_score
            # construct the sample; compute the ce loss
            # backward_samples need to handle EOS
            src = self._maybe_reverse_source(src_processed)
            src = self._maybe_add_eos(src, src_dict.eos())
            assert len(tgt_hypos) == self.args.beam
            for tgt_hypo_i, tgt_hypo_struct in enumerate(tgt_hypos):
                dual_sample_id = sample_id.item() * self.args.beam + tgt_hypo_i
                tgt_hypo = tgt_hypo_struct["tokens"]
                # add EOS to the target (i.e. the original source), since it
                # will be used as the target; removing EOS from the source
                # side is optional
                if self.remove_eos_at_src:
                    tgt_hypo = tgt_hypo[:-1]
                tgt_hypo_processed = self._maybe_reverse_source(tgt_hypo)

                backward_sample = {
                    "id": dual_sample_id,
                    "source": tgt_hypo_processed.cpu(),
                    "target": src.cpu(),
                    "weight": 1.0 - self.alpha,
                }
                assert dual_sample_id not in backward_samples
                backward_samples[dual_sample_id] = backward_sample

        bwd_model_input = utils.move_to_cuda(
            WeightedLanguagePairDataset.collate(
                samples=list(backward_samples.values()),
                pad_idx=src_dict.pad(),
                eos_idx=src_dict.eos(),
            ))
        reconstructed_source = self._generate_translation(
            backward_model, src_dict, bwd_model_input, 1, **generate_kwargs)
        for dual_sample_id, tgt_hypo_processed, src_hypos in reconstructed_source:
            backward_sample = backward_samples[dual_sample_id.item()]
            src = backward_sample["target"]
            tgt_hypo = self._maybe_reverse_source(tgt_hypo_processed)

            # use bleu score as reward
            scorer = bleu.Scorer(
                bleu.BleuConfig(
                    pad=src_dict.pad(),
                    eos=src_dict.eos(),
                    unk=src_dict.unk(),
                ))
            assert len(src_hypos) == 1
            src_hypo = src_hypos[0]["tokens"][:-1]
            scorer.add(src.int().cpu(), src_hypo.int().cpu())
            backward_reward = (
                scorer.score(order=self.args.reconstruction_bleu_order) /
                100.0)

            original_stc = " ".join(src_dict[tid] for tid in src.tolist())
            translated_stc = " ".join(tgt_dict[tid] for tid in tgt_hypo)
            recon_stc = " ".join(src_dict[tid] for tid in src_hypo.tolist())

            if int(dual_sample_id / self.args.beam) % 100 == 0:
                print("--------")
                print(
                    "original sentence:",
                    original_stc.replace(self.args.source_bpe_end_marker, ""),
                )
                print(
                    "translated sentence:",
                    translated_stc.replace(self.args.source_bpe_end_marker,
                                           ""),
                )
                print(
                    "reconstructed sentence:",
                    recon_stc.replace(self.args.source_bpe_end_marker, ""),
                )
                print("reward:", backward_reward)
                print("--------")

            total_reward = (self.alpha * forward_reward +
                            (1.0 - self.alpha) * backward_reward)
            src_processed = self._maybe_reverse_source(src)
            tgt_hypo = self._maybe_add_eos(tgt_hypo, tgt_dict.eos())
            forward_samples.append({
                "id": dual_sample_id,
                "source": src_processed.cpu(),
                "target": tgt_hypo.cpu(),  # first hypo is best hypo
                "weight": total_reward,
            })

        # Now combine pseudo labelled examples to corresponding batch with
        # rewards factored to weighting of each task's loss
        agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {}
        forward_model.train()
        forward_loss, sample_size, logging_output = self.task.criterion(
            forward_model,
            utils.move_to_cuda(
                WeightedLanguagePairDataset.collate(
                    samples=forward_samples,
                    pad_idx=tgt_dict.pad(),
                    eos_idx=tgt_dict.eos(),
                )),
        )
        agg_loss += forward_loss.detach().item()
        agg_sample_size += sample_size
        agg_logging_output["primal"] = logging_output
        # grad would be further scaled when passed back to trainer,
        # which will do the update
        forward_optimizer.backward(forward_loss)

        backward_model.train()
        backward_loss, sample_size, logging_output = self.task.criterion(
            backward_model, bwd_model_input)

        agg_loss += backward_loss.data.item()
        agg_sample_size += sample_size
        agg_logging_output["dual"] = logging_output
        backward_optimizer.backward(backward_loss)
        return agg_loss, agg_sample_size, agg_logging_output
Example #13
def random_search(scores_info_export_path,
                  num_trials,
                  report_oracle_bleu=False):
    with open(scores_info_export_path, "rb") as f:
        scores_info = pickle.load(f)

    dummy_task = DummyTask()

    if report_oracle_bleu:
        oracle_scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=vocab_constants.PAD_ID,
                eos=vocab_constants.EOS_ID,
                unk=vocab_constants.UNK_ID,
            ))

        for example in scores_info:
            smoothed_bleu = []
            for hypo in example["hypos"]:
                eval_score = smoothed_sentence_bleu(
                    dummy_task,
                    torch.IntTensor(example["target_tokens"]),
                    torch.IntTensor(hypo),
                )
                smoothed_bleu.append(eval_score)
            best_hypo_ind = np.argmax(smoothed_bleu)
            example["best_hypo_ind"] = best_hypo_ind

            oracle_scorer.add(
                torch.IntTensor(example["target_tokens"]),
                torch.IntTensor(example["hypos"][best_hypo_ind]),
            )

        print("oracle BLEU: ", oracle_scorer.score())

    num_features = scores_info[0]["scores"].shape[1]
    assert all(
        example["scores"].shape[1] == num_features for example in
        scores_info), "All examples must have the same number of scores!"
    feature_weights = np.zeros(num_features)
    feature_weights[0] = 1
    score = evaluate_weights(scores_info, feature_weights, length_penalty=1)
    print("base BLEU: ", score)
    best_score = score
    best_weights = feature_weights
    best_length_penalty = 0

    nonzero_features = identify_nonzero_features(scores_info)

    for i in range(num_trials):
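        # Each trial draws feature weights uniformly from the simplex over the
        # nonzero features (a Dirichlet with unit concentration) and a length
        # penalty uniformly from [0, 1.5).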
        feature_weights = np.zeros(num_features)
        random_weights = np.random.dirichlet(np.ones(nonzero_features.size))
        feature_weights[nonzero_features] = random_weights
        length_penalty = 1.5 * np.random.random()

        score = evaluate_weights(scores_info, feature_weights, length_penalty)
        if score > best_score:
            best_score = score
            best_weights = feature_weights
            best_length_penalty = length_penalty

        print(f"\r[{i}]  best: {best_score}", end="", flush=True)

    print()
    print("best weights: ", best_weights)
    print("best length penalty: ", length_penalty)

    return best_weights, best_length_penalty, best_score
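All of the examples above share the same pattern: wrap a dictionary's special
token indices in a bleu.BleuConfig, feed integer token tensors to Scorer.add(),
then read off the corpus-level score. Below is a minimal self-contained sketch
of that pattern. It assumes a fairseq build with the compiled libbleu extension
and the fairseq.scoring.bleu module (older releases expose the same class as
fairseq.bleu); the toy sentences are made up for illustration.

from fairseq.data.dictionary import Dictionary
from fairseq.scoring import bleu

# Toy vocabulary; encode_line() with add_if_not_exist=True grows it on the fly.
vocab = Dictionary()
refs = ["the cat sat on the mat", "a dog barked"]
hyps = ["the cat sat on a mat", "a dog barked loudly"]

scorer = bleu.Scorer(
    bleu.BleuConfig(
        pad=vocab.pad(),
        eos=vocab.eos(),
        unk=vocab.unk(),
    )
)

for ref, hyp in zip(refs, hyps):
    # encode_line() returns an int32 tensor (with EOS appended by default),
    # which is the format the token-level Scorer.add() expects.
    ref_tok = vocab.encode_line(ref, add_if_not_exist=True)
    hyp_tok = vocab.encode_line(hyp, add_if_not_exist=True)
    scorer.add(ref_tok, hyp_tok)

print(scorer.result_string(order=4))  # full BLEU report string
print(scorer.score())                 # corpus BLEU as a float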