Example #1
0
 def __init__(self, score_type, bucket_cutoffs=None, case_insensitive=False):
   self.score_type = score_type
   self.scorer = scorers.create_scorer_from_profile(score_type)
   if bucket_cutoffs is None:
     bucket_cutoffs = [x * self.scorer.scale / 10.0 for x in range(1,10)]
   self.set_bucket_cutoffs(bucket_cutoffs, num_type='float')
   self.case_insensitive = case_insensitive
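
A minimal sketch of what the default cutoffs above evaluate to, assuming a scorer whose `scale` attribute is 1.0 (the actual scale depends on the scorer profile):

# Hypothetical illustration only: assumes self.scorer.scale == 1.0.
scale = 1.0
default_cutoffs = [x * scale / 10.0 for x in range(1, 10)]
# Nine cutoffs (0.1 ... 0.9) that split the score range into ten equal buckets.
print(default_cutoffs)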
Example #2
0
 @classmethod
 def setUpClass(cls):
   cls.ref, cls.out1, cls.out2 = _get_example_data()
   cls.ids = list(range(len(cls.ref)))
   cls.scorer = scorers.create_scorer_from_profile("bleu", case_insensitive=False)
   cls.cache_stats1 = cls.scorer.cache_stats(cls.ref, cls.out1)
   cls.cache_stats2 = cls.scorer.cache_stats(cls.ref, cls.out2)
   cls.n_random_retries = 10
Example #3
0
 @classmethod
 def setUpClass(cls) -> None:
     example_path = os.path.join(compare_mt_root, "example")
     filenames = ["ted.ref.eng", "ted.sys1.eng", "ted.orig.slk"]
     cls.ref, cls.out, cls.src = [
         load_tokens(os.path.join(example_path, name)) for name in filenames
     ]
     cls.scorer = scorers.create_scorer_from_profile("gleu",
                                                     case_insensitive=False)
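
A hedged sketch of a test method that could sit alongside this fixture (not part of the original suite); it assumes `score_corpus` returns a `(score, string)` pair, as in the report functions below:

 def test_reference_scores_at_least_as_high(self):
     # Hypothetical test: the reference scored against itself should not
     # score below the system output under GLEU.
     sys_score, _ = self.scorer.score_corpus(self.ref, self.out)
     ref_score, _ = self.scorer.score_corpus(self.ref, self.ref)
     self.assertGreaterEqual(ref_score, sys_score)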
Example #4
0
def generate_score_report(ref,
                          outs,
                          score_type='bleu',
                          bootstrap=0,
                          prob_thresh=0.05,
                          meteor_directory=None,
                          options=None,
                          title=None,
                          case_insensitive=False):
    """
  Generate a report comparing overall scores of system(s) in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    score_type: A string specifying the scoring type (bleu/length)
    bootstrap: Number of samples for significance test (0 to disable)
    prob_thresh: P-value threshold for significance test
    meteor_directory: Path to the directory of the METEOR code
    options: Options when using external program
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
  """
    bootstrap = int(bootstrap)
    prob_thresh = float(prob_thresh)
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False

    scorer = scorers.create_scorer_from_profile(
        score_type,
        case_insensitive=case_insensitive,
        meteor_directory=meteor_directory,
        options=options)

    scores, strs = zip(*[scorer.score_corpus(ref, out) for out in outs])

    if bootstrap != 0:
        direcs = []
        for i in range(len(scores)):
            for j in range(i + 1, len(scores)):
                direcs.append((i, j))
        wins, sys_stats = sign_utils.eval_with_paired_bootstrap(
            ref, outs, scorer, direcs, num_samples=bootstrap)
        wins = list(zip(direcs, wins))
    else:
        wins = sys_stats = direcs = None

    reporter = reporters.ScoreReport(scorer=scorer,
                                     scores=scores,
                                     strs=strs,
                                     wins=wins,
                                     sys_stats=sys_stats,
                                     prob_thresh=prob_thresh,
                                     title=title)
    reporter.generate_report(output_fig_file=f'score-{score_type}-{bootstrap}',
                             output_fig_format='pdf',
                             output_directory='outputs')
    return reporter
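
A hedged usage sketch for the function above, assuming it is importable from compare_mt.main, that tokens are loaded with compare_mt.corpus_utils.load_tokens as in Example #3, and that example/ted.sys2.eng exists as a second system output (only ted.sys1.eng appears in the snippets above):

from compare_mt import corpus_utils                 # assumed module path
from compare_mt.main import generate_score_report   # assumed module path

ref = corpus_utils.load_tokens('example/ted.ref.eng')
outs = [corpus_utils.load_tokens('example/ted.sys1.eng'),
        corpus_utils.load_tokens('example/ted.sys2.eng')]

# Corpus BLEU for both systems plus a 1000-sample paired bootstrap
# significance test; a figure is written under outputs/.
reporter = generate_score_report(ref, outs, score_type='bleu', bootstrap=1000)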
Example #5
0
def generate_sentence_examples(ref,
                               outs,
                               src=None,
                               score_type='sentbleu',
                               report_length=10,
                               compare_directions='0-1',
                               title=None,
                               case_insensitive=False):
    """
  Generate examples of sentences that satisfy some criterion, usually that one system scores better than the other

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens from the source (optional)
    score_type: The type of scorer to use
    report_length: Number of sentences to print for each system being better or worse
    compare_directions: A string specifying which systems to compare
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
  """
    report_length = int(report_length)
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False

    scorer = scorers.create_scorer_from_profile(
        score_type, case_insensitive=case_insensitive)

    direcs = arg_utils.parse_compare_directions(compare_directions)

    scorediff_lists = []
    for (left, right) in direcs:
        scorediff_list = []
        deduplicate_set = set()
        for i, (o1, o2, r) in enumerate(zip(outs[left], outs[right], ref)):
            if (tuple(o1), tuple(o2), tuple(r)) in deduplicate_set:
                continue
            deduplicate_set.add((tuple(o1), tuple(o2), tuple(r)))
            s1, str1 = scorer.score_sentence(r, o1)
            s2, str2 = scorer.score_sentence(r, o2)
            scorediff_list.append((s2 - s1, s1, s2, str1, str2, i))
        scorediff_list.sort()
        scorediff_lists.append(scorediff_list)

    reporter = reporters.SentenceExampleReport(report_length=report_length,
                                               scorediff_lists=scorediff_lists,
                                               scorer=scorer,
                                               ref=ref,
                                               outs=outs,
                                               src=src,
                                               compare_directions=direcs,
                                               title=title)
    reporter.generate_report()
    return reporter
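
A hedged usage sketch, reusing `ref` and `outs` as loaded in the sketch after Example #4 (generate_sentence_examples is likewise assumed to live in compare_mt.main):

# Print the report_length sentences where system 0 most outscores system 1
# by sentence-level BLEU, and those where it loses by the widest margin.
generate_sentence_examples(ref, outs,
                           score_type='sentbleu',
                           report_length=10,
                           compare_directions='0-1')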
Example #6
0
def generate_sentence_bucketed_report(ref,
                                      outs,
                                      bucket_type='score',
                                      bucket_cutoffs=None,
                                      statistic_type='count',
                                      score_measure='bleu',
                                      case_insensitive=False):
    """
  Generate a report of sentences by bucket in both plain text and graphs

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    bucket_type: The type of bucketing method to use
    score_measure: If using 'score' as either bucket_type or statistic_type, which scorer to use
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
  """
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False

    bucketer = bucketers.create_sentence_bucketer_from_profile(
        bucket_type,
        bucket_cutoffs=bucket_cutoffs,
        score_type=score_measure,
        case_insensitive=case_insensitive)
    bcs = [bucketer.create_bucketed_corpus(out, ref=ref) for out in outs]

    if statistic_type == 'count':
        scorer = None
        aggregator = lambda out, ref: len(out)
    elif statistic_type == 'score':
        scorer = scorers.create_scorer_from_profile(
            score_measure, case_insensitive=case_insensitive)
        aggregator = lambda out, ref: scorer.score_corpus(ref, out)[0]
    else:
        raise ValueError(f'Illegal statistic_type {statistic_type}')

    stats = [[aggregator(out, ref) for (out, ref) in bc] for bc in bcs]

    reporter = reporters.SentenceReport(bucketer=bucketer,
                                        sys_stats=stats,
                                        statistic_type=statistic_type,
                                        scorer=scorer)

    reporter.generate_report(
        output_fig_file=f'sentence-{statistic_type}-{score_measure}',
        output_fig_format='pdf',
        output_directory='outputs')
    return reporter
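
A hedged usage sketch, again reusing `ref` and `outs` from the sketch after Example #4: count how many sentences of each system fall into each sentence-score bucket.

generate_sentence_bucketed_report(ref, outs,
                                  bucket_type='score',
                                  statistic_type='count',
                                  score_measure='bleu')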
Example #7
0
 @classmethod
 def setUpClass(cls):
     cls.ref, cls.out, _ = _get_example_data()
     cls.scorer = scorers.create_scorer_from_profile("length")
Example #8
0
 @classmethod
 def setUpClass(cls):
     cls.ref, cls.out, _ = _get_example_data_detokenized()
     cls.scorer = scorers.create_scorer_from_profile("sacrebleu")
Example #9
0
def generate_sentence_bucketed_report(ref, outs,
                                   bucket_type='score', bucket_cutoffs=None,
                                   statistic_type='count',
                                   score_measure='bleu',
                                   label_set=None,
                                   ref_labels=None, out_labels=None,
                                   title=None,
                                   case_insensitive=False):
  """
  Generate a report of sentences by bucket in both plain text and graphs

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    bucket_type: The type of bucketing method to use
    score_measure: If using 'score' as either bucket_type or statistic_type, which scorer to use
    ref_labels: Either a filename of a file containing reference labels, or a list of strings corresponding to `ref`. Overrides out_labels if specified.
    out_labels: Output labels, either filenames or lists of strings, one per output file.
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
  """
  if type(case_insensitive) == str:
    case_insensitive = True if case_insensitive == 'True' else False

  if ref_labels is not None:
    ref_labels = corpus_utils.load_tokens(ref_labels) if type(ref_labels) == str else ref_labels
    if len(ref_labels) != len(ref):
      raise ValueError(f'The number of labels should be equal to the number of sentences.')

  elif out_labels is not None:
    out_labels = arg_utils.parse_files(out_labels)
    if len(out_labels) != len(outs):
      raise ValueError(f'The number of output files should be equal to the number of output labels.')

    out_labels = [corpus_utils.load_tokens(out_label) if type(out_label) == str else out_label for out_label in out_labels]
    for out, out_label in zip(outs, out_labels):
      if len(out_label) != len(out):
        raise ValueError(f'The number of labels should be equal to the number of sentences.')
    

  bucketer = bucketers.create_sentence_bucketer_from_profile(bucket_type, bucket_cutoffs=bucket_cutoffs,
                                                             score_type=score_measure, label_set=label_set, case_insensitive=case_insensitive)
  bcs = [bucketer.create_bucketed_corpus(out, ref=ref, ref_labels=ref_labels if ref_labels else None, out_labels=out_labels[i] if out_labels else None) for i, out in enumerate(outs)]

  if statistic_type == 'count':
    scorer = None
    aggregator = lambda out,ref: len(out)
  elif statistic_type == 'score':
    scorer = scorers.create_scorer_from_profile(score_measure, case_insensitive=case_insensitive)
    aggregator = lambda out,ref: scorer.score_corpus(ref,out)[0]
  else:
    raise ValueError(f'Illegal statistic_type {statistic_type}')

  stats = [[aggregator(out,ref) for (out,ref) in bc] for bc in bcs]

  reporter = reporters.SentenceReport(bucketer=bucketer,
                                      sys_stats=stats,
                                      statistic_type=statistic_type, scorer=scorer, 
                                      title=title)

  reporter.generate_report(output_fig_file=f'sentence-{statistic_type}-{score_measure}',
                           output_fig_format='pdf', 
                           output_directory='outputs')
  return reporter 
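
A hedged usage sketch of this extended signature, showing the custom cutoffs and table caption (ref/outs as in the sketch after Example #4):

generate_sentence_bucketed_report(ref, outs,
                                  bucket_type='score',
                                  bucket_cutoffs=[0.25, 0.5, 0.75],
                                  statistic_type='count',
                                  score_measure='bleu',
                                  title='Sentences per BLEU bucket')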
Example #10
0
def generate_sentence_examples(ref,
                               outs,
                               src=None,
                               score_type='sentbleu',
                               report_length=10,
                               compare_directions='0-1',
                               title=None,
                               case_insensitive=False,
                               to_cache=False,
                               cache_dicts=None):
    """
  Generate examples of sentences that satisfy some criterion, usually that one system scores better than the other

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens from the source (optional)
    score_type: The type of scorer to use
    report_length: Number of sentences to print for each system being better or worse
    compare_directions: A string specifying which systems to compare
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
    to_cache: Return a list of computed statistics if True
    cache_dicts: A list of dictionaries that store cached statistics for each output
  """
    # check and set parameters
    report_length = int(report_length)
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False

    # compute statistics
    scorer = scorers.create_scorer_from_profile(
        score_type, case_insensitive=case_insensitive)

    cache_key_list = ['scores', 'strs']
    scores, strs = cache_utils.extract_cache_dicts(cache_dicts, cache_key_list,
                                                   len(outs))
    src = [None for _ in ref] if src is None else src
    if cache_dicts is None:
        scores, strs = [], []
        for out in outs:
            scores_i, strs_i = [], []
            for (r, o, s) in zip(ref, out, src):
                score, string = scorer.score_sentence(r, o, s)
                scores_i.append(score)
                strs_i.append(string)
            scores.append(scores_i)
            strs.append(strs_i)

    if to_cache:
        cache_dict = cache_utils.return_cache_dict(cache_key_list,
                                                   [scores, strs])
        return cache_dict

    direcs = arg_utils.parse_compare_directions(compare_directions)

    scorediff_lists = []
    for (left, right) in direcs:
        scorediff_list = []
        deduplicate_set = set()
        for i, (o1, o2, r) in enumerate(zip(outs[left], outs[right], ref)):
            if (tuple(o1), tuple(o2), tuple(r)) in deduplicate_set:
                continue
            deduplicate_set.add((tuple(o1), tuple(o2), tuple(r)))
            s1, str1 = scores[left][i], strs[left][i]
            s2, str2 = scores[right][i], strs[right][i]
            scorediff_list.append((s2 - s1, s1, s2, str1, str2, i))
        scorediff_list.sort()
        scorediff_lists.append(scorediff_list)

    # generate reports
    reporter = reporters.SentenceExampleReport(report_length=report_length,
                                               scorediff_lists=scorediff_lists,
                                               scorer=scorer,
                                               ref=ref,
                                               outs=outs,
                                               src=src,
                                               compare_directions=direcs,
                                               title=title)
    reporter.generate_report()
    return reporter
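
A hedged usage sketch of the src-aware variant, reusing `ref` and `outs` from the sketch after Example #4; the source file name matches Example #3:

src = corpus_utils.load_tokens('example/ted.orig.slk')

# Same pairwise sentence examples as before, but source sentences are
# available to the scorer and included in the printed examples.
generate_sentence_examples(ref, outs, src=src,
                           score_type='sentbleu',
                           report_length=5,
                           compare_directions='0-1')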
Example #11
0
def generate_sentence_bucketed_report(ref,
                                      outs,
                                      src=None,
                                      bucket_type='score',
                                      bucket_cutoffs=None,
                                      statistic_type='count',
                                      score_measure='sentbleu',
                                      label_set=None,
                                      ref_labels=None,
                                      out_labels=None,
                                      title=None,
                                      case_insensitive=False,
                                      output_bucket_details=False,
                                      to_cache=False,
                                      cache_dicts=None):
    """
  Generate a report of sentences by bucket in both plain text and graphs

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    bucket_type: The type of bucketing method to use
    score_measure: If using 'score' as either bucket_type or statistic_type, which scorer to use
    ref_labels: Either a filename of a file containing reference labels, or a list of strings corresponding to `ref`. Overrides out_labels if specified.
    out_labels: Output labels, either filenames or lists of strings, one per output file.
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
    output_bucket_details: A boolean specifying whether to output the number of sentences and the bootstrap confidence interval of each bucket
    to_cache: Return a list of computed statistics if True
    cache_dicts: A list of dictionaries that store cached statistics for each output
  """
    # check and set parameters
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False
    if type(output_bucket_details) == str:
        output_bucket_details = True if output_bucket_details == 'True' else False

    if ref_labels is not None:
        ref_labels = corpus_utils.load_tokens(ref_labels) if type(
            ref_labels) == str else ref_labels
        if len(ref_labels) != len(ref):
            raise ValueError(
                f'The number of labels should be equal to the number of sentences.'
            )

    elif out_labels is not None:
        out_labels = arg_utils.parse_files(out_labels)
        if len(out_labels) != len(outs):
            raise ValueError(
                f'The number of output files should be equal to the number of output labels.'
            )

        out_labels = [
            corpus_utils.load_tokens(out_label)
            if type(out_label) == str else out_label
            for out_label in out_labels
        ]
        for out, out_label in zip(outs, out_labels):
            if len(out_label) != len(out):
                raise ValueError(
                    f'The number of labels should be equal to the number of sentences.'
                )

    # compute statistics
    bucketer = bucketers.create_sentence_bucketer_from_profile(
        bucket_type,
        bucket_cutoffs=bucket_cutoffs,
        score_type=score_measure,
        label_set=label_set,
        case_insensitive=case_insensitive)

    src = [None for _ in ref] if src is None else src

    if statistic_type == 'count':
        scorer = None
        if bucket_type != 'score' and bucket_type != 'lengthdiff':
            ref = ref_labels = None
        aggregator = lambda out, ref, src: len(out)
    elif statistic_type == 'score':
        scorer = scorers.create_scorer_from_profile(
            score_measure, case_insensitive=case_insensitive)
        aggregator = lambda out, ref, src: scorer.score_corpus(ref, out, src)[0]
    else:
        raise ValueError(f'Illegal statistic_type {statistic_type}')

    cache_key_list = ['stats']
    stats = cache_utils.extract_cache_dicts(cache_dicts, cache_key_list,
                                            len(outs))

    if cache_dicts is None:
        bcs = [
            bucketer.create_bucketed_corpus(
                out,
                ref=ref,
                src=src,
                ref_labels=ref_labels if ref_labels else None,
                out_labels=out_labels[i] if out_labels else None)
            for i, out in enumerate(outs)
        ]
        stats = [[aggregator(out, ref, src) for (out, ref, src) in bc]
                 for bc in bcs]

    if output_bucket_details and statistic_type == 'score':
        bucket_cnt_calculator = lambda out, ref, src: len(out)
        bucket_interval_calculator = lambda out, ref, src: sign_utils.eval_with_paired_bootstrap(
            ref, [out], src, scorer, None)[1][0]
        if cache_dicts is not None:  # we don't cache bcs
            bcs = [
                bucketer.create_bucketed_corpus(
                    out,
                    ref=ref,
                    src=src,
                    ref_labels=ref_labels if ref_labels else None,
                    out_labels=out_labels[i] if out_labels else None)
                for i, out in enumerate(outs)
            ]
        bucket_cnts = [
            bucket_cnt_calculator(out, ref, src) for (out, ref, src) in bcs[0]
        ]
        bucket_intervals = [[
            bucket_interval_calculator(out, ref, src) for (out, ref, src) in bc
        ] for bc in bcs]
    else:
        bucket_cnts = bucket_intervals = None

    if to_cache:
        cache_dict = cache_utils.return_cache_dict(cache_key_list, [stats])
        return cache_dict

    # generate reports
    reporter = reporters.SentenceReport(bucketer=bucketer,
                                        sys_stats=stats,
                                        statistic_type=statistic_type,
                                        scorer=scorer,
                                        bucket_cnts=bucket_cnts,
                                        bucket_intervals=bucket_intervals,
                                        title=title)

    reporter.generate_report(
        output_fig_file=f'sentence-{statistic_type}-{score_measure}',
        output_fig_format='pdf',
        output_directory='outputs')
    return reporter
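
A hedged usage sketch (bucket_type='length' is an assumption about the available bucketer profiles; ref/outs/src as in the earlier sketches): per-bucket corpus BLEU with bucket sizes and bootstrap intervals enabled.

generate_sentence_bucketed_report(ref, outs, src=src,
                                  bucket_type='length',
                                  statistic_type='score',
                                  score_measure='bleu',
                                  output_bucket_details=True)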
Example #12
0
def generate_score_report(ref,
                          outs,
                          src=None,
                          score_type='bleu',
                          bootstrap=0,
                          prob_thresh=0.05,
                          meteor_directory=None,
                          options=None,
                          title=None,
                          case_insensitive=False,
                          to_cache=False,
                          cache_dicts=None):
    """
  Generate a report comparing overall scores of system(s) in both plain text and graphs.

  Args:
    ref: Tokens from the reference
    outs: Tokens from the output file(s)
    src: Tokens from the source (optional)
    score_type: A string specifying the scoring type (bleu/length)
    bootstrap: Number of samples for significance test (0 to disable)
    prob_thresh: P-value threshold for significance test
    meteor_directory: Path to the directory of the METEOR code
    options: Options when using external program
    title: A string specifying the caption of the printed table
    case_insensitive: A boolean specifying whether to turn on the case insensitive option
    to_cache: Return a list of computed statistics if True
    cache_dicts: A list of dictionaries that store cached statistics for each output
  """
    # check and set parameters
    bootstrap = int(bootstrap)
    prob_thresh = float(prob_thresh)
    if type(case_insensitive) == str:
        case_insensitive = True if case_insensitive == 'True' else False

    # compute statistics
    scorer = scorers.create_scorer_from_profile(
        score_type,
        case_insensitive=case_insensitive,
        meteor_directory=meteor_directory,
        options=options)

    cache_key_list = ['scores', 'strs', 'sign_stats']
    scores, strs, sign_stats = cache_utils.extract_cache_dicts(
        cache_dicts, cache_key_list, len(outs))
    if cache_dicts is None:
        scores, strs = zip(
            *[scorer.score_corpus(ref, out, src=src) for out in outs])

    if to_cache:
        cache_dict = cache_utils.return_cache_dict(
            cache_key_list,
            [scores, strs, [scorer.cache_stats(ref, outs[0], src=src)]])
        return cache_dict

    if bootstrap != 0:
        direcs = []
        for i in range(len(scores)):
            for j in range(i + 1, len(scores)):
                direcs.append((i, j))
        wins, sys_stats = sign_utils.eval_with_paired_bootstrap(
            ref,
            outs,
            src,
            scorer,
            direcs,
            num_samples=bootstrap,
            cache_stats=sign_stats)
        wins = list(zip(direcs, wins))
    else:
        wins = sys_stats = None

    # generate reports
    reporter = reporters.ScoreReport(scorer=scorer,
                                     scores=scores,
                                     strs=strs,
                                     wins=wins,
                                     sys_stats=sys_stats,
                                     prob_thresh=prob_thresh,
                                     title=title)
    reporter.generate_report(output_fig_file=f'score-{score_type}-{bootstrap}',
                             output_fig_format='pdf',
                             output_directory='outputs')
    return reporter
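
A hedged usage sketch of the src-aware score report (ref/outs/src as in the earlier sketches); prob_thresh and title are passed explicitly to show the knobs added in this version.

reporter = generate_score_report(ref, outs, src=src,
                                 score_type='bleu',
                                 bootstrap=1000,
                                 prob_thresh=0.05,
                                 title='Corpus BLEU with paired bootstrap')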