Python Matcher.get_unmatched_annotated示例

def main():
    # maybe make it read from multiple places?
    # samples = ["CRAFT", "EAST", "USC", "GCP_lang_hints", "AWS", "GCP"]
    samples = ["GCP", "GCP_crops", "CRAFT_attn"]
    # samples = ["AWS"]
    filenames = ["{}_indo.txt".format(name) for name in samples]

    guessed_words = []
    annotations = {}
    data = {i: {} for i in range(len(filenames))}
    name_dict = {i: name for i, name in enumerate(samples)}

    for i, filename in enumerate(filenames):
        with open(filename, "r") as f:
            lines = f.readlines()
            for line in lines:
                file_path, *words = line.strip().split(",")
                words = [x for x in words if len(x) != 0]
                # Get the correct words
                n = get_filename(file_path)
                if n not in annotations:
                    img_annotations = get_annotations(n)
                    annotations[n] = img_annotations

                # i represents which of the sources it came from - CRAFT, EAST,
                # USC
                guessed_words.append((i, n, words))

    possible_true_words_dict = None
    for i, n, words in guessed_words:
        # i represents the source
        # get the real annotations
        annotation = annotations[n]
        # Skip the annotation
        if skip_annotation(annotation):
            continue

        matcher = Matcher(annotation,
                          words,
                          possible_true_words_dict=possible_true_words_dict)

        possible_true_words_dict = matcher.possible_true_words_dict

        word_n = len(annotation)
        if word_n == 0:
            # TODO remove this, if we end up testing with no text files
            continue

        if annotations:
            perfect_matches = matcher.get_perfect_matches()
            ign_symbols = matcher.get_perfect_matches_ignoring_symbols()
            perf = len(perfect_matches) / word_n * 100
            ign = len(ign_symbols) / word_n * 100
            # vocab_matched = matcher.get_vocab_matches()
            # vcb = len(vocab_matched) / word_n * 100
            # mismatched = matcher.get_imperfect_matches(1)
            # msm = len(mismatched)/word_n * 100
            # percent_matched = perf + ign  + vcb# maybe lets ignore the mismatched
            percent_matched = perf + ign
            # ones?
            # percent_matched = perf
            data[i][n] = percent_matched
            unmatched = matcher.get_number_unmatched()

            if unmatched > 4:  # ignore all but craft
                # if i == 3: # ignore all but craft
                print(name_dict[i], n)
                print("ANNOTATED", *matcher.get_unmatched_annotated())
                print("DETECTED", *matcher.get_unmatched_detected())
                print(">", matcher.char_level_accuracy)
                print("=================================")
            # print(n, perf, ign, unmatched)

    for k in data:
        avg = sum(data[k].values()) / (len(list(data[k].values())))
        print(name_dict[k], '\t', round(avg, 2))