예제 #1
0
def main():
    """Run a small end-to-end demo of the kanji/kana alignment helpers.

    Steps, all of which print to stdout:

    1. Align a kanji sentence with its kana transcription via
       ``kanamatcher.align`` and show the first alignment produced.
    2. Extract the matched segments (``find_matches`` followed by
       ``clear_fill``) and print each one.
    3. For every segment containing at least one character in the
       U+4E00-U+9FFF range, print its reading; segments made up solely of
       such characters are first broken down with ``split_reading``.
    4. Split the reading of a single word, look up the readings of a single
       kanji, and run ``kanamatcher.match_kana`` on a full sentence.
    """
    # The original assigned a few other sample pairs first ("ACACACTA" /
    # "AGCACACA", "GCATGCU" / "GATTACA"); they were overwritten before use,
    # so only the pair that is actually aligned is kept here.
    seq1 = "このエナメル質と、象牙質、セメント質、歯髄で歯は構成される。通常目に見える部分がこのエナメル質であり、象牙質に支えられている。"
    seq2 = "このエナメルしつと、ぞうげしつ、セメントしつ、しずいでははこうせいされる。つうじょうめにみえるぶぶんがこのエナメルしつであり、ぞうげしつにささえられている。"

    print("Sequence A:  {}".format(seq1))
    print("Sequence B:  {}".format(seq2))

    # Only the first alignment yielded by the aligner is used.
    aligned_kanji, aligned_kana = next(
        kanamatcher.align(seq1, seq2, d=-1, fill=" "))
    print(aligned_kanji)
    print(aligned_kana)
    print()

    segments = kanamatcher.find_matches(aligned_kanji, aligned_kana)
    segments = kanamatcher.clear_fill(segments, fill=" ")
    for segment in segments:
        print(segment)
    print()

    non_kanji = "[^\u4e00-\u9fff]"
    for surface, reading in segments:
        kanji_only = re.sub(non_kanji, "", surface)
        if not kanji_only:
            # Nothing in the U+4E00-U+9FFF range: no reading to report.
            continue
        if surface != kanji_only:
            # Mixed segment: report the reading of the whole segment.
            print("{} = {}".format(surface, reading))
            continue
        # Pure-kanji segment: split the reading and list the parts.
        pieces = split_reading(surface, reading, True)
        _, piece_readings = zip(*pieces)
        print("{} = {}".format(surface, ', '.join(piece_readings)))

    print()

    kanji = "学校"
    kana = "がっこう"

    print(kanji)
    print(kana)
    print()

    result = split_reading(kanji, kana, True)

    print(result)
    for part, part_reading in result:
        print("{} = {}".format(part, part_reading))

    print(get_readings("匹"))

    sentence = "日本語は、主に日本国内や日本人同士で使われている言語である。"
    transcription = "にほんごは、おもににほんこくないやににほんじんどうしでつかわれているげんごである。"
    print(kanamatcher.match_kana(sentence, transcription))
예제 #2
0
def main(*argv):
    """Run the kana matcher over a parallel kanji/kana corpus and report scores.

    The two corpus files are read in lockstep, one sentence per line. Each
    pair is passed to ``kanamatcher.match_kana``; the result and its score
    are printed, and every pair with a score above zero is remembered as a
    "bad line". After the corpus is exhausted (or the run is interrupted with
    Ctrl-C) a summary is printed, optionally saved to a file, and the bad
    lines are shown in a pager, highest score first.

    Unless ``--all`` is given the run pauses after every sentence; at the
    prompt the user may type a number to process that many further sentences
    without pausing.

    Args:
        *argv: Accepted but not used. Options are parsed from ``sys.argv``
            because ``parse_args()`` is called without arguments.
            NOTE(review): the caller's convention (with or without the
            program name) is not visible here, so this is left unchanged.

    Raises:
        OSError: If either corpus file cannot be opened.
    """
    parser = argparse.ArgumentParser(description="Test aligner on corpus.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('kanji_corpus', help="Corpus including kanji")
    parser.add_argument('kana_corpus', help="Corpus only including kana")
    parser.add_argument('--all', action="store_true", default=False, help="Run on whole corpus at once")
    parser.add_argument('--skip', type=int, default=0, help="Number of lines at the start of the corpus to skip")
    parser.add_argument('--missing-ruby-penalty', type=int,
                        default=kanamatcher.NO_RUBY_PENALTY, help="Penalty for a kanji missing ruby")
    parser.add_argument('--kana-mismatch-penalty', type=int, default=kanamatcher.KANA_MISMATCH_PENALTY,
                        help="Penalty for a mismatch between the given kana and the generated ruby")
    parser.add_argument('--alignments-to-test', type=int,
                        default=kanamatcher.MAX_NUM_ALIGNMENTS, help="Max number of alignments to test")
    parser.add_argument('--save-output', default=None, help="Save statistics and scores to file")
    args = parser.parse_args()

    # The matcher reads its tuning knobs from module-level globals.
    kanamatcher.NO_RUBY_PENALTY = args.missing_ruby_penalty
    kanamatcher.KANA_MISMATCH_PENALTY = args.kana_mismatch_penalty
    kanamatcher.MAX_NUM_ALIGNMENTS = args.alignments_to_test

    print_all = args.all

    strip_whitespace = re.compile(r"\s")

    kanji_file = args.kanji_corpus
    kana_file = args.kana_corpus

    lines = 0          # non-blank corpus lines seen, including skipped ones
    errors = 0         # lines on which match_kana raised
    total_score = 0
    bad_lines = []     # (score, line number, kanji, kana, rendered output)

    try:
        with open(kanji_file, encoding="utf-8-sig") as kjf, \
                open(kana_file, encoding="utf-8-sig") as knf:

            initial_skip = args.skip  # lines still to skip at the corpus start
            skip_lines = 0            # lines still to run without prompting
            for kanji, kana in zip(kjf, knf):
                if kanji.strip() == '':
                    continue

                lines += 1

                if initial_skip > 0:
                    initial_skip -= 1
                    continue

                print("{}.".format(lines))

                kanji, kana = strip_whitespace.sub("", kanji), strip_whitespace.sub("", kana)
                print(kanji)
                print(kana)

                try:
                    result, score = kanamatcher.match_kana(kanji, kana, return_score=True)
                except Exception:
                    # A failure on one sentence must not abort the whole run:
                    # count it, show the traceback and move on.
                    errors += 1
                    traceback.print_exc()
                    print()
                    continue

                total_score += score

                output = "\n".join(kanamatcher.pretty_print(result))
                print(output)
                print("Score:", score)

                if score > 0:
                    bad_lines.append((score, lines, kanji, kana, output))

                if print_all or skip_lines > 0:
                    if skip_lines > 0:
                        skip_lines -= 1
                    print()
                else:
                    answer = input("Press enter or input the number of lines to skip: ")
                    try:
                        skip_lines = max(0, int(answer))
                    except ValueError:
                        # Plain Enter (or any non-number): keep prompting.
                        pass

    except KeyboardInterrupt:
        # Ctrl-C stops the corpus loop but still produces the report below.
        pass

    bad_lines.sort(reverse=True)

    # With an empty corpus, or an interrupt before the first line, there is
    # nothing to average; report that instead of dividing by zero.
    average_score = total_score / lines if lines else "n/a"
    std_output = ("================\n" +
                  "Errors during parsing: {}/{}\n".format(errors, lines) +
                  "Total score: {}\n".format(total_score) +
                  "Average score: {}\n".format(average_score) +
                  "================\n")

    pager_output = "".join(
        "{}.\n{}\n{}\n{}\nScore: {}\n\n".format(line, kanji, kana, output, score)
        for score, line, kanji, kana, output in bad_lines)

    if args.save_output is not None:
        try:
            # The report contains Japanese text, so write it as UTF-8 rather
            # than relying on the platform's default encoding.
            with open(args.save_output, 'w', encoding="utf-8") as f:
                f.write(std_output)
                f.write(pager_output)
        except Exception:
            # Saving is best-effort: report the problem and still show the
            # results on screen.
            traceback.print_exc()

    print(std_output)
    input("Press enter to view bad lines: ")
    pydoc.pager(pager_output)