def main():
    """Run an ad-hoc demo of the kana matcher on hard-coded sample text.

    Everything is printed to stdout; nothing is returned.  The demo has
    four parts:

    1. Align a kanji sentence with its kana transcription via
       ``kanamatcher.align`` and print the first alignment.
    2. Print the matches found in that alignment and, for every match
       whose left side contains kanji, its reading (split per character
       with ``split_reading`` when the left side is kanji only).
    3. Split the reading of a single two-kanji word.
    4. Print the result of ``get_readings`` for one kanji and of
       ``kanamatcher.match_kana`` for a full sentence pair.
    """
    seq1 = "このエナメル質と、象牙質、セメント質、歯髄で歯は構成される。通常目に見える部分がこのエナメル質であり、象牙質に支えられている。"
    seq2 = "このエナメルしつと、ぞうげしつ、セメントしつ、しずいでははこうせいされる。つうじょうめにみえるぶぶんがこのエナメルしつであり、ぞうげしつにささえられている。"
    print("Sequence A: %s" % seq1)
    print("Sequence B: %s" % seq2)

    # Only the first alignment produced by the aligner is inspected.
    kanji, kana = next(kanamatcher.align(seq1, seq2, d=-1, fill=" "))
    print(kanji)
    print(kana)
    print()

    match = kanamatcher.find_matches(kanji, kana)
    # The matches are walked twice below, so materialise them in case
    # clear_fill hands back a one-shot iterator.
    match = list(kanamatcher.clear_fill(match, fill=" "))
    for m in match:
        print(m)
    print()

    # Deletes every character outside U+4E00..U+9FFF, leaving only the
    # characters from that (CJK ideograph) range.
    non_kanji = re.compile("[^\u4e00-\u9fff]")
    for a, b in match:
        filtered = non_kanji.sub("", a)
        if not filtered:
            # No kanji on the left side: nothing to report.
            continue
        if a == filtered:
            # Kanji only: show the reading split into per-piece readings.
            readings = [reading for _, reading in split_reading(a, b, True)]
            print("%s = %s" % (a, ', '.join(readings)))
        else:
            # Kanji mixed with other characters: show the pair unsplit.
            print("%s = %s" % (a, b))
    print()

    kanji = "学校"
    kana = "がっこう"
    print(kanji)
    print(kana)
    print()
    result = split_reading(kanji, kana, True)
    print(result)
    for a, b in result:
        print("%s = %s" % (a, b))

    print(get_readings("匹"))
    print(kanamatcher.match_kana(
        "日本語は、主に日本国内や日本人同士で使われている言語である。",
        "にほんごは、おもににほんこくないやににほんじんどうしでつかわれているげんごである。"))
def main(*argv):
    """Run the kana matcher over a parallel corpus and report the scores.

    Command-line interface:

    * ``kanji_corpus`` / ``kana_corpus`` -- two text files read in
      lockstep, one sentence per line.
    * ``--all`` -- process the whole corpus without prompting after each
      line.
    * ``--skip N`` -- do not process the first N non-blank lines.
    * ``--missing-ruby-penalty``, ``--kana-mismatch-penalty``,
      ``--alignments-to-test`` -- override the corresponding
      ``kanamatcher`` module-level settings for this run.
    * ``--save-output FILE`` -- also write the summary and the list of
      bad lines to FILE.

    For every processed line the sentence pair, the matcher output and
    its score are printed.  Lines with a score above zero are collected
    as "bad lines" and shown in a pager at the end, highest score first.
    Ctrl-C during processing stops the loop and jumps to the summary.

    NOTE(review): ``argv`` is accepted but never consulted --
    ``parse_args()`` is called without arguments, so options are taken
    from ``sys.argv``.  Confirm how callers invoke this before wiring
    ``argv`` through.
    """
    parser = argparse.ArgumentParser(
        description="Test aligner on corpus.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('kanji_corpus', help="Corpus including kanji")
    parser.add_argument('kana_corpus', help="Corpus only including kana")
    parser.add_argument('--all', action="store_true", default=False,
                        help="Run on whole corpus at once")
    parser.add_argument('--skip', type=int, default=0,
                        help="Number of lines at the start of the corpus to skip")
    parser.add_argument('--missing-ruby-penalty', type=int,
                        default=kanamatcher.NO_RUBY_PENALTY,
                        help="Penalty for a kanji missing ruby")
    parser.add_argument('--kana-mismatch-penalty', type=int,
                        default=kanamatcher.KANA_MISMATCH_PENALTY,
                        help="Penalty for a mismatch between the given kana and the generated ruby")
    parser.add_argument('--alignments-to-test', type=int,
                        default=kanamatcher.MAX_NUM_ALIGNMENTS,
                        help="Max number of alignments to test")
    parser.add_argument('--save-output', default=None,
                        help="Save statistics and scores to file")
    args = parser.parse_args()

    # The matcher reads its tuning knobs from module-level attributes.
    kanamatcher.NO_RUBY_PENALTY = args.missing_ruby_penalty
    kanamatcher.KANA_MISMATCH_PENALTY = args.kana_mismatch_penalty
    kanamatcher.MAX_NUM_ALIGNMENTS = args.alignments_to_test

    print_all = args.all
    strip_whitespace = re.compile(r"\s")
    kanji_file = args.kanji_corpus
    kana_file = args.kana_corpus

    lines = 0          # non-blank corpus lines seen (including skipped ones)
    errors = 0         # lines on which the matcher raised
    total_score = 0
    bad_lines = []     # (score, line number, kanji, kana, output)
    to_skip = args.skip

    try:
        # utf-8-sig so that a leading byte-order mark is dropped.
        with open(kanji_file, encoding="utf-8-sig") as kjf, \
                open(kana_file, encoding="utf-8-sig") as knf:
            skip_lines = 0  # lines still to run through without prompting
            for kanji, kana in zip(kjf, knf):
                if kanji.strip() == '':
                    continue
                lines += 1
                if to_skip > 0:
                    to_skip -= 1
                    continue

                print("{}.".format(lines))
                kanji = strip_whitespace.sub("", kanji)
                kana = strip_whitespace.sub("", kana)
                print(kanji)
                print(kana)

                try:
                    result, score = kanamatcher.match_kana(
                        kanji, kana, return_score=True)
                except Exception:
                    # One failing sentence must not abort the whole run.
                    errors += 1
                    traceback.print_exc()
                    print()
                    continue

                total_score += score
                output = "\n".join(kanamatcher.pretty_print(result))
                print(output)
                print("Score:", score)
                if score > 0:
                    bad_lines.append((score, lines, kanji, kana, output))

                if print_all or skip_lines > 0:
                    if skip_lines > 0:
                        skip_lines -= 1
                    print()
                else:
                    reply = input(
                        "Press enter or input the number of lines to skip: ")
                    try:
                        skip_lines = max(0, int(reply))
                    except ValueError:
                        # Anything that is not a number means "next line".
                        pass
    except KeyboardInterrupt:
        # Ctrl-C ends processing early; the summary is still produced.
        pass

    bad_lines.sort(reverse=True)

    # Guard against an empty corpus (or one made only of blank lines).
    average = total_score / lines if lines else "n/a"
    std_output = ("================\n" +
                  "Errors during parsing: {}/{}\n".format(errors, lines) +
                  "Total score: {}\n".format(total_score) +
                  "Average score: {}\n".format(average) +
                  "================\n")
    pager_output = "".join(
        "{}.\n{}\n{}\n{}\nScore: {}\n\n".format(
            line, kanji, kana, output, score)
        for score, line, kanji, kana, output in bad_lines)

    if args.save_output is not None:
        try:
            # Explicit UTF-8: the report contains Japanese text and the
            # locale's default encoding may not be able to represent it.
            with open(args.save_output, 'w', encoding="utf-8") as f:
                f.write(std_output)
                f.write(pager_output)
        except Exception:
            # Saving is best-effort; report the problem and carry on.
            traceback.print_exc()

    print(std_output)
    input("Press enter to view bad lines: ")
    pydoc.pager(pager_output)