Example #1
import os
import sys
from collections import Counter

import sacrebleu


def main(args):

    if args.human_scores:
        systems = []
        scores = {}
        for line in open(args.human_scores):
            system, score = line.rstrip().split()
            scores[system] = float(score)
        for system in args.systems:
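            # The file name presumably looks like "testset.SYSTEM-NAME.lang":
            # the middle dot-separated fields form the system name (an
            # assumption about the naming convention, not stated here).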
            system_name = '.'.join(os.path.basename(system).split('.')[1:-1])
            if system_name not in scores:
                print(f"COULDN'T FIND SYSTEM {system_name}", file=sys.stderr)
            elif args.human_min <= scores[system_name] <= args.human_max:
                systems.append(system)
    else:
        systems = args.systems

    sys_fhs = [open(system) for system in systems]
    ref_fhs = [open(ref) for ref in args.refs]

    tokenize = sacrebleu.TOKENIZERS["13a"]

    stats = Counter()
    totals = Counter()
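    # zip(*sys_fhs) yields one tuple of parallel lines per step, so each
    # iteration pairs line `lineno` of every system output with the same
    # line of every reference.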
    for lineno, (syss, refs) in enumerate(zip(zip(*sys_fhs), zip(*ref_fhs)),
                                          1):
        syss = [tokenize(system) for system in syss]
        refs = [tokenize(ref) for ref in refs]

        # All system n-grams.
        sys_ngrams = Counter()
        for system in syss:
            sys_ngrams += sacrebleu.extract_ngrams(system, max_order=args.n)

        for ngram in sys_ngrams.keys():
            totals[len(ngram.split())] += 1

        # reset counts for all n-grams found in references
        ref_ngrams = Counter()
        for ref in refs:
            ref_ngrams += sacrebleu.extract_ngrams(ref, max_order=args.n)

        ngrams = list(sys_ngrams.keys())
        for ngram in ngrams:
            if ngram not in ref_ngrams:
                del sys_ngrams[ngram]

        for ngram in sys_ngrams.keys():
            stats[len(ngram.split())] += 1

    def pretty(counter: Counter):
        return '\t'.join(f'{counter[x]}' for x in range(1, args.n + 1))

    ratios = [stats[x] / totals[x] for x in range(1, args.n + 1)]
    print('UNSEEN:', *[f"{x*100:.2f}" for x in ratios], sep='\t')
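
A quick sanity check of the helper these snippets revolve around. In the
single-file sacrebleu 1.x releases they appear to target (the exact version
is an assumption), sacrebleu.extract_ngrams returns a Counter that maps
space-joined n-grams to their counts, which is all the Counter arithmetic
above relies on:

import sacrebleu

# "the" occurs twice; every key is a space-joined n-gram of order <= 2.
counts = sacrebleu.extract_ngrams("the cat sat on the mat", max_order=2)
assert counts["the"] == 2
assert counts["the cat"] == 1
assert all(len(key.split()) <= 2 for key in counts)
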
Example #2
from collections import Counter

import sacrebleu


def main(args):

    sys_fhs = [open(system) for system in args.systems]
    ref_fhs = [open(ref) for ref in args.refs]

    stats = Counter()
    for lineno, (syss, refs) in enumerate(zip(zip(*sys_fhs), zip(*ref_fhs)),
                                          1):
        syss = [sacrebleu.tokenize_13a(line) for line in syss]
        refs = [sacrebleu.tokenize_13a(ref) for ref in refs]

        # Find all ngrams in refs#2+ that are not in ref#1
        ref_ngrams = Counter()
        for ref in refs[1:]:
            ref_ngrams += sacrebleu.extract_ngrams(ref, max_order=args.n)
        for ngram in sacrebleu.extract_ngrams(refs[0], max_order=args.n):
            if ngram in ref_ngrams:
                del ref_ngrams[ngram]

        # The system ngrams that are only in refs 2+
        sys_ngrams = [
            sacrebleu.extract_ngrams(sys, max_order=args.n) for sys in syss
        ]
        if len(syss) == 1:
            for ngram in sys_ngrams[0]:
                if ngram not in ref_ngrams:
                    sys_ngrams[0][ngram] = 0
        sys_ngrams[0] += Counter()

        for ngram in sys_ngrams[0]:
            stats[len(ngram.split())] += sys_ngrams[0][ngram]

    def pretty(counter: Counter):
        return '\t'.join(f'{counter[x]}' for x in range(1, args.n + 1))

    print(pretty(stats))
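
The line sys_ngrams[0] += Counter() above relies on a standard Counter
idiom: adding an empty Counter discards entries whose count is zero or
negative, which is what actually removes the n-grams zeroed out in the
preceding loop. A minimal illustration:

from collections import Counter

counts = Counter({'a': 2, 'b': 0, 'c': -1})
counts += Counter()  # Counter addition keeps only positive counts
assert counts == Counter({'a': 2})
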
Example #3
import json
import os
import sys
from collections import Counter

import sacrebleu
import sentencepiece as spm  # assumed source of `spm`
from sacremoses import MosesPunctNormalizer  # assumed source of the normalizer


def main(args):

    print(args, file=sys.stderr)

    if args.human_scores:
        systems = []
        scores = {}
        for line in open(args.human_scores):
            system, score = line.rstrip().split()
            scores[system] = float(score)
        for system in args.systems:
            system_name = '.'.join(os.path.basename(system).split('.')[1:-1])
            if system_name not in scores:
                print(f"COULDN'T FIND SYSTEM {system_name}", file=sys.stderr)
            elif scores[system_name] > args.scope:
                systems.append(system)
    else:
        systems = args.systems

    if args.normalize:
        normalizer = MosesPunctNormalizer(lang='en', penn=False)

    if args.spm:
        sp = spm.SentencePieceProcessor()
        sp.Load(args.spm)

    # leave one out
    fds = [open(file) for file in systems]

    num_constraints = 0
    num_skipped = 0
    for lineno, (ref, *sys_lines) in enumerate(
            zip(open(args.reference), *fds), 1):

        def preprocess(text):
            if args.normalize:
                text = normalizer.normalize(text)
            if args.spm:
                text = ' '.join(sp.EncodeAsPieces(text))
            return ' '.join(text.split()[:args.maxlen])

        if len(ref.split()) > args.maxlen:
            continue

        ref_ngrams = sacrebleu.extract_ngrams(ref,
                                              min_order=args.ngram_min,
                                              max_order=args.ngram_max)

        ngrams = Counter()
        for system in sys_lines:
            ngrams += sacrebleu.extract_ngrams(system,
                                               min_order=args.ngram_min,
                                               max_order=args.ngram_max)

        for ngram in ref_ngrams.keys():
            ngrams[ngram] = 0
        ngrams -= ref_ngrams
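        # A threshold of at most 1 is read as a fraction of the systems that
        # must attest an n-gram; anything larger is an absolute count.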
        if args.threshold <= 1:
            attested_ngrams = [
                ngram for ngram in ngrams.keys()
                if (ngrams[ngram] / len(sys_lines)) >= args.threshold
            ]
        else:
            attested_ngrams = [
                ngram for ngram in ngrams.keys()
                if ngrams[ngram] >= args.threshold
            ]

        used_ngrams = []
        for ngram in sorted(attested_ngrams, key=len, reverse=True):
            for used in used_ngrams:
                if ngram in used:
                    # print(f"** {lineno} already saw '{ngram}' in '{used}', skipping", file=sys.stderr)
                    num_skipped += 1
                    break
            else:
                num_constraints += 1
                used_ngrams.append(ngram)
                j = {
                    'sentno': lineno,
                    'text': preprocess(ref),
                    'constraints': [preprocess(ngram)]
                }
                print(json.dumps(j, ensure_ascii=False), flush=True)
        #print(*attested_ngrams, sep='\t', flush=True)

    print(
        f"Created {num_constraints} constrained sentences, skipping {num_skipped} smaller ones",
        file=sys.stderr)
Example #4
from itertools import zip_longest

import sacrebleu


def corpus_bleu(sys_stream,
                ref_streams,
                smooth='exp',
                smooth_floor=0.0,
                force=False,
                lowercase=False,
                tokenize=sacrebleu.DEFAULT_TOKENIZER,
                use_effective_order=False) -> sacrebleu.BLEU:
    """Produces BLEU scores along with its sufficient statistics from a source
    against one or more references.

    :param sys_stream: The system stream (a sequence of segments)
    :param ref_streams: A list of one or more reference streams (each a
                        sequence of segments)
    :param smooth: The smoothing method to use
    :param smooth_floor: For 'floor' smoothing, the floor to use
    :param force: Ignore data that looks already tokenized
    :param lowercase: Lowercase the data
    :param tokenize: The tokenizer to use
    :return: a BLEU object containing everything you'd want
    """

    # Add some robustness to the input arguments
    if isinstance(sys_stream, str):
        sys_stream = [sys_stream]
    if isinstance(ref_streams, str):
        ref_streams = [[ref_streams]]

    sys_len = 0
    ref_len = 0

    correct = [0 for n in range(sacrebleu.NGRAM_ORDER)]
    total = [0 for n in range(sacrebleu.NGRAM_ORDER)]

    fhs = [sys_stream] + ref_streams
    for lines in zip_longest(*fhs):
        if None in lines:
            raise EOFError("Source and reference streams have different "
                           "lengths!")

        if lowercase:
            lines = [x.lower() for x in lines]

        output, *refs = [
            sacrebleu.TOKENIZERS[tokenize](x.rstrip()) for x in lines
        ]

        ref_ngrams, closest_diff, closest_len = sacrebleu.ref_stats(
            output, refs)

        sys_len += len(output.split())
        ref_len += closest_len

        sys_ngrams = sacrebleu.extract_ngrams(output)
        for ngram in sys_ngrams.keys():
            n = len(ngram.split())
            correct[n - 1] += min(sys_ngrams[ngram], ref_ngrams.get(ngram, 0))
            total[n - 1] += sys_ngrams[ngram]

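    # all_reduce is not defined in this snippet; presumably the enclosing
    # module supplies it (e.g. a sum across distributed workers, or the
    # identity in a single process).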
    correct = all_reduce(correct)
    total = all_reduce(total)
    sys_len = all_reduce(sys_len)
    ref_len = all_reduce(ref_len)

    return sacrebleu.compute_bleu(correct, total, sys_len, ref_len, smooth,
                                  smooth_floor, use_effective_order)
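
A minimal single-process usage sketch, with all_reduce stubbed out as the
identity (the stub and the test sentence are illustrative, not from the
original file):

def all_reduce(x):
    # Stand-in for whatever reduction the original module provides.
    return x

bleu = corpus_bleu(["the cat sat on the mat ."],
                   [["the cat sat on the mat ."]])
print(f"{bleu.score:.1f}")  # 100.0 for an exact match
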
Example #5
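# NOTE: fragment of a larger function; `pairs` (token, NER-tag tuples),
# `constraint`, `constraints`, `termsdict`, `add_constraint`, `bpe`, and
# `extract_ngrams` are defined in code elided above.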
            for word, tag in pairs:
                if tag == 'PERSON':
                    constraint.append(word)
                else:
                    if len(constraint) > 0:
                        phrase = ' '.join(constraint)
                        if phrase in termsdict:
                            add_constraint(termsdict[phrase])
                        constraint = []
            source = ' '.join([x[0] for x in pairs])

    for phrase in args.constraints:
        add_constraint(phrase)

    if not args.ner and args.dictionary is not None:
        ngrams = sorted(extract_ngrams(source, 4),
                        key=lambda x: len(x.split()),
                        reverse=True)
        for ngram in ngrams:
            if ngram in termsdict:
                add_constraint(termsdict[ngram])
                break

    if len(constraints) > 0:
        print(
            json.dumps(
                {
                    'text': bpe.segment(source),
                    'constraints': constraints
                },
                ensure_ascii=False))
Example #6
import os
import sys
from collections import Counter, defaultdict

import sacrebleu


def main(args):

    if args.human_scores:
        systems = []
        scores = {}
        for line in open(args.human_scores):
            system, score = line.rstrip().split()
            scores[system] = float(score)
        for system_path in args.systems:
            system_name = '.'.join(
                os.path.basename(system_path).split('.')[1:-1])
            if system_name not in scores:
                print(f"COULDN'T FIND SYSTEM {system_name} ({system_path})",
                      file=sys.stderr)
            elif args.human_min <= scores[system_name] <= args.human_max:
                systems.append(system_path)
    else:
        systems = args.systems

    print(f"SYSTEMS[{args.human_min} to {args.human_max}]: {systems}",
          file=sys.stderr)

    langpair = os.path.basename(args.refs[0]).split(".")[1]

    sys_fhs = [open(system) for system in systems]
    ref_fhs = [open(ref) for ref in args.refs]

    tokenize = sacrebleu.TOKENIZERS["13a"]

    new_ngrams = Counter()
    new_ngram_data = defaultdict(list)
    totals = Counter()
    for lineno, (syss, refs) in enumerate(zip(zip(*sys_fhs), zip(*ref_fhs)),
                                          1):
        syss = [tokenize(system) for system in syss]
        refs = [tokenize(ref) for ref in refs]

        # All system n-grams.
        sys_ngrams = Counter()
        for system in syss:
            sys_ngrams += sacrebleu.extract_ngrams(system,
                                                   min_order=args.m,
                                                   max_order=args.n)

        # reset counts for all n-grams found in references
        ref_ngrams = Counter()
        first_ref_ngrams = None
        for ref in refs:
            all_ngrams = sacrebleu.extract_ngrams(ref,
                                                  min_order=args.m,
                                                  max_order=args.n)

            # Only keep ngrams that don't appear in longer ngrams
            used_ngrams = []
            keep_ngrams = Counter()
            if args.uniq:
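                # Longest n-grams first; "ngram in used" is a substring test
                # on the space-joined strings, so an n-gram contained in an
                # already-kept longer one is dropped.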
                for ngram in sorted(all_ngrams, key=len, reverse=True):
                    for used in used_ngrams:
                        if ngram in used:
                            break
                    else:
                        keep_ngrams[ngram] = all_ngrams[ngram]
                        used_ngrams.append(ngram)
            else:
                keep_ngrams = all_ngrams

            if first_ref_ngrams is None:
                first_ref_ngrams = keep_ngrams
            else:
                ref_ngrams += keep_ngrams

        for ngram in sys_ngrams.keys():
            totals[ngram] += 1

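            # "New" n-grams: produced by some system and matched by an
            # additional reference, but absent from the first reference.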
            if ngram in ref_ngrams and ngram not in first_ref_ngrams:
                new_ngrams[ngram] += 1
                new_ngram_data[ngram].append(lineno)

    print("pair", "N", "count", "ngram", "lines", sep="\t")
    for ngram, count in sorted(new_ngrams.items(),
                               key=lambda x: (len(x[0].split()), x[1]),
                               reverse=True):
        lines = " ".join(map(str, new_ngram_data[ngram]))
        print(langpair, len(ngram.split()), count, ngram, lines, sep="\t")