Exemplo n.º 1
0
 def evaluate_file(self,
                   detok_hyp: Path,
                   ref: Union[Path, List[str]],
                   lowercase=True) -> float:
     """
     Score a detokenized hypothesis file against a single reference.

     The formatted score is logged and also written to a sibling file named
     ``<hyp name>.lc.sacrebleu`` (lowercase) or ``<hyp name>.oc.sacrebleu``.

     :param detok_hyp: path to the detokenized hypothesis file
     :param ref: reference, either a path to read lines from or the lines themselves
     :param lowercase: passed to ``corpus_bleu``; also selects the ``.lc``/``.oc`` tag
     :return: the ``score`` attribute of the object returned by ``corpus_bleu``
     """
     hyp_lines = list(IO.get_lines(detok_hyp))
     # corpus_bleu accepts several reference streams; only one is supplied here
     if isinstance(ref, Path):
         references = [IO.get_lines(ref)]
     else:
         references = [ref]
     result: BLEUScore = corpus_bleu(sys_stream=hyp_lines,
                                     ref_streams=references,
                                     lowercase=lowercase)
     summary = result.format()
     case_tag = '.lc' if lowercase else '.oc'
     # append to the full file name so the hypothesis' own extension is kept
     score_path = detok_hyp.with_name(detok_hyp.name + case_tag + '.sacrebleu')
     log.info(f'BLEU {detok_hyp} : {summary}')
     IO.write_lines(score_path, summary)
     return result.score
Exemplo n.º 2
0
 def evaluate_file(self,
                   detok_hyp: Path,
                   ref: Union[Path, List[str]],
                   lowercase=True) -> float:
     """
     Score a detokenized hypothesis file against a single reference.

     A one-line summary (score, n-gram precisions, brevity penalty, length
     ratio and lengths) is logged and written to ``detok_hyp`` with its
     suffix replaced by ``.lc.sacrebleu`` or ``.oc.sacrebleu``.

     :param detok_hyp: path to the detokenized hypothesis file
     :param ref: reference, either a path to read lines from or the lines themselves
     :param lowercase: passed to ``corpus_bleu``; also selects the ``.lc``/``.oc`` tag
     :return: the ``score`` attribute of the object returned by ``corpus_bleu``
     """
     hyp_lines = IO.get_lines(detok_hyp)
     # corpus_bleu accepts several reference streams; only one is supplied here
     if isinstance(ref, Path):
         references = [IO.get_lines(ref)]
     else:
         references = [ref]
     result: BLEU = corpus_bleu(sys_stream=hyp_lines,
                                ref_streams=references,
                                lowercase=lowercase)
     # hand-built summary line (the original author notes this should come
     # from a newer sacrebleu release)
     precisions = "/".join(f"{p:.1f}" for p in result.precisions)
     length_ratio = result.sys_len / result.ref_len
     summary = (f'BLEU = {result.score:.2f} {precisions}'
                f' (BP = {result.bp:.3f} ratio = {length_ratio:.3f}'
                f' hyp_len = {result.sys_len:d} ref_len={result.ref_len:d})')
     case_tag = '.lc' if lowercase else '.oc'
     # NOTE: with_suffix replaces the hypothesis file's last suffix
     score_path = detok_hyp.with_suffix(case_tag + '.sacrebleu')
     log.info(f'BLEU {detok_hyp} : {summary}')
     IO.write_lines(score_path, summary)
     return result.score
Exemplo n.º 3
0
 def moses_detokenize(self,
                      inp: Path,
                      out: Path,
                      col=0,
                      lang='en',
                      post_op=None):
     """
     Detokenize the lines of ``inp`` with ``MosesDetokenizer`` and write them to ``out``.

     :param inp: file whose lines are split on whitespace into tokens
     :param out: destination file for the detokenized lines
     :param col: forwarded to ``IO.get_lines`` (presumably selects a column — confirm)
     :param lang: language passed to ``MosesDetokenizer``
     :param post_op: optional callable applied to every detokenized line
     """
     log.info(f"detok : {inp} --> {out}")
     token_seqs = IO.get_lines(inp, col=col, line_mapper=lambda x: x.split())
     with MosesDetokenizer(lang=lang) as detok:
         # map() is lazy, so lines are detokenized one at a time while
         # write_lines consumes them inside this context
         restored = map(detok, token_seqs)
         if post_op:
             restored = map(post_op, restored)
         IO.write_lines(out, restored)
Exemplo n.º 4
0
Arquivo: codec.py Projeto: isi-nlp/rtg
    def train(cls,
              model_type: str,
              vocab_size: int,
              model_path: str,
              files: List[str],
              no_split_toks: Optional[List[str]] = None,
              char_coverage: float = 0,
              dedup=True,
              spark=None):
        """
        Learn a vocabulary model with nlcodec and return ``cls(model_path)``.

        :param model_type: word, char, bpe
        :param vocab_size: vocabulary size
        :param model_path: where to store vocabulary model
        :param files: text for creating vocabulary
        :param no_split_toks: must be empty or None; not supported in nlcodec yet
        :param char_coverage: character coverage (0, 1]. value <= 0 => default coverage
        :param dedup: forwarded to ``term_freq.word_counts``; only used when ``spark`` is given
        :param spark: when truthy, term frequencies are extracted with
            ``term_freq.word_counts`` and cached in ``<model_path>.termfreqs``;
            an existing cache file is reused instead of being recomputed
        :return: a new instance created from ``model_path``
        """
        assert not no_split_toks, 'not supported in nlcodec yet'
        from nlcodec import learn_vocab, term_freq
        kwargs = dict(char_coverage=char_coverage) if char_coverage > 0 else {}
        if not spark:
            inp = IO.get_liness(*files)
        else:
            # extract and store frequencies to this file
            stats_file = model_path + '.termfreqs'
            if not Path(stats_file).exists():
                log.info("Extracting term frequencies... ")
                paths = [f if isinstance(f, Path) else Path(f) for f in files]
                wfs, chfs, n_lines = term_freq.word_counts(paths=paths,
                                                           dedup=dedup,
                                                           spark=spark)
                log.info(
                    f"Lines = {n_lines:,}, Word Types: {len(wfs):,} Char Types:{len(chfs):,}"
                )
                stats = chfs if model_type == 'char' else wfs
                log.info(f"Writing frequencies to {stats_file}")
                with IO.writer(stats_file) as out:
                    term_freq.write_stats(stats=stats,
                                          out=out,
                                          line_count=n_lines)
            # The input below is always the term-frequency file, whether it was
            # just written or reused from an earlier run, so the flag must be
            # set in both cases (it used to be set only on a fresh extraction).
            kwargs['term_freqs'] = True
            inp = IO.get_lines(stats_file, delim='\n')

        learn_vocab(inp=inp,
                    level=model_type,
                    model=model_path,
                    vocab_size=vocab_size,
                    **kwargs)
        return cls(model_path)
Exemplo n.º 5
0
 def decode_eval_file(self,
                      decoder,
                      src: Union[Path, List[str]],
                      out_file: Path,
                      ref: Optional[Union[Path, List[str]]],
                      lowercase: bool = True,
                      **dec_args) -> float:
     """
     Decode ``src`` into ``out_file`` (unless already done), detokenize, and score.

     Decoding is skipped when ``out_file`` exists, is non-empty and has as
     many lines as ``src``.

     :param decoder: object whose ``decode_file(src, out, **dec_args)`` is called
     :param src: source sentences, as a path or as a list of lines
     :param out_file: file receiving the decoder output
     :param ref: reference, as a path or a list of lines; falsy skips scoring
     :param lowercase: forwarded to ``evaluate_file``
     :param dec_args: extra keyword arguments forwarded to ``decoder.decode_file``
     :return: result of ``evaluate_file``, or None when ``ref`` is falsy
     """
     reusable = False
     if out_file.exists() and out_file.stat().st_size > 0:
         n_decoded = line_count(out_file)
         n_expected = len(src) if isinstance(src, list) else line_count(src)
         reusable = n_decoded == n_expected

     if reusable:
         log.warning(
             f"{out_file} exists and has desired number of lines. Skipped..."
         )
     else:
         # load file-backed inputs into memory before decoding
         if isinstance(src, Path):
             log.info(f"decoding {src.name}")
             src = list(IO.get_lines(src))
         if isinstance(ref, Path):
             ref = list(IO.get_lines(ref))
         with IO.writer(out_file) as out:
             decoder.decode_file(src, out, **dec_args)

     detok_hyp = self.detokenize(out_file)
     if not ref:
         return None
     return self.evaluate_file(detok_hyp, ref, lowercase=lowercase)
Exemplo n.º 6
0
    def tune_decoder_params(self,
                            exp: Experiment,
                            tune_src: str,
                            tune_ref: str,
                            batch_size: int,
                            trials: int = 10,
                            lowercase=True,
                            beam_size=(1, 4, 8),
                            ensemble=(1, 5, 10),
                            lp_alpha=(0.0, 0.4, 0.6),
                            suggested: List[Tuple[int, int, float]] = None,
                            **fixed_args):
        """
        Search decoder settings (beam size, ensemble size, length-penalty alpha)
        by decoding the tuning set and keeping the best-scoring combination.

        Scores are kept in ``<work_dir>/tune_step<step>/scores.json`` so a
        later call resumes instead of re-running finished trials. The file is
        rewritten in a ``finally`` clause, so scores collected before a
        failure are kept.

        :param exp: experiment providing ``get_last_saved_model()`` and ``work_dir``
        :param tune_src: path to the tuning source file (must exist)
        :param tune_ref: path to the tuning reference file (must exist, same line count)
        :param batch_size: divided by the beam size to get the per-trial batch size
        :param trials: total number of random trials wanted, counting those
            already recorded in ``scores.json``
        :param lowercase: forwarded to ``decode_eval_file``
        :param beam_size: candidate beam sizes to sample from
        :param ensemble: candidate ensemble sizes to sample from
        :param lp_alpha: candidate length-penalty alphas to sample from
        :param suggested: extra ``(beam_size, ensemble, lp_alpha)`` combinations
            to try; items may also be strings, which are parsed with ``eval``
        :param fixed_args: extra keyword arguments forwarded to ``decode_eval_file``
        :return: tuple ``(best_params, tune_args)``; ``best_params`` maps
            'beam_size', 'ensemble' and 'lp_alpha' to the best combination,
            ``tune_args`` holds the call arguments minus the excluded ones
        """
        # getargvalues exposes the frame's own locals mapping; work on a private
        # copy so the edits below never touch the frame itself.
        tune_args = dict(inspect.getargvalues(inspect.currentframe()).locals)
        tune_args.update(fixed_args)
        ex_args = ['exp', 'self', 'fixed_args', 'batch_size', 'max_len']
        if trials == 0:
            ex_args += ['beam_size', 'ensemble', 'lp_alpha']
        for x in ex_args:
            # 'max_len' is only present when the caller passed it in fixed_args,
            # so drop it tolerantly instead of raising KeyError
            tune_args.pop(x, None)  # exclude some args

        _, step = exp.get_last_saved_model()
        tune_dir = exp.work_dir / f'tune_step{step}'
        log.info(f"Tune dir = {tune_dir}")
        tune_dir.mkdir(parents=True, exist_ok=True)
        tune_src, tune_ref = Path(tune_src), Path(tune_ref)
        assert tune_src.exists()
        assert tune_ref.exists()
        tune_src, tune_ref = list(IO.get_lines(tune_src)), list(
            IO.get_lines(tune_ref))
        assert len(tune_src) == len(tune_ref)

        tune_log = tune_dir / 'scores.json'  # resume the tuning
        memory: Dict[Tuple, float] = {}
        if tune_log.exists():
            # close the handle deterministically instead of leaking it
            with tune_log.open() as reader:
                data = json.load(reader)
            # JSON keys cant be tuples, so they were stringified
            # NOTE(review): eval() executes whatever is in scores.json; this is
            # only safe while the file is written by this method alone.
            # Consider ast.literal_eval.
            memory = {eval(k): v for k, v in data.items()}

        beam_sizes, ensembles, lp_alphas = [], [], []
        if suggested:
            if isinstance(suggested[0], str):
                # NOTE(review): eval() on caller-supplied strings; consider
                # ast.literal_eval if these can come from untrusted input.
                suggested = [eval(x) for x in suggested]
            suggested = [(x[0], x[1], round(x[2], 2)) for x in suggested]
            # skip suggestions that already have a recorded score
            suggested_new = [x for x in suggested if x not in memory]
            beam_sizes += [x[0] for x in suggested_new]
            ensembles += [x[1] for x in suggested_new]
            lp_alphas += [x[2] for x in suggested_new]

        new_trials = trials - len(memory)
        if new_trials > 0:
            beam_sizes += [random.choice(beam_size) for _ in range(new_trials)]
            ensembles += [random.choice(ensemble) for _ in range(new_trials)]
            lp_alphas += [
                round(random.choice(lp_alpha), 2) for _ in range(new_trials)
            ]

        # ensembling is somewhat costlier, so try minimize the model ensembling, by grouping them together
        grouped_ens = defaultdict(list)
        for b, ens, l in zip(beam_sizes, ensembles, lp_alphas):
            grouped_ens[ens].append((b, l))
        try:
            for ens, args in grouped_ens.items():
                # one decoder per ensemble size, reused for all its trials
                decoder = Decoder.new(exp, ensemble=ens)
                for b_s, lp_a in args:
                    eff_batch_size = batch_size // b_s  # effective batch size
                    name = f'tune_step{step}_beam{b_s}_ens{ens}_lp{lp_a:.2f}'
                    log.info(name)
                    out_file = tune_dir / f'{name}.out.tsv'
                    score = self.decode_eval_file(decoder,
                                                  tune_src,
                                                  out_file,
                                                  tune_ref,
                                                  batch_size=eff_batch_size,
                                                  beam_size=b_s,
                                                  lp_alpha=lp_a,
                                                  lowercase=lowercase,
                                                  **fixed_args)
                    memory[(b_s, ens, lp_a)] = score
            # highest score first; raises IndexError when no score is recorded
            best_params = sorted(memory.items(),
                                 key=lambda x: x[1],
                                 reverse=True)[0][0]
            return dict(zip(['beam_size', 'ensemble', 'lp_alpha'],
                            best_params)), tune_args
        finally:
            # JSON keys cant be tuples, so we stringify them
            data = {str(k): v for k, v in memory.items()}
            IO.write_lines(tune_log, json.dumps(data))