def grammarize_headline(headline, sent):
    """Derive a compressed form of ``sent`` guided by ``headline``.

    The sentence is written to ``juman_prc`` and the open-class words of the
    resulting analysis are collected.  The headline is split on spaces and on
    'ーー' into segments; progressively shorter prefixes of those segments
    (the last segment is dropped each round) are analysed through
    ``juman_prc`` until one yields at least four open-class words that all
    occur among the sentence's open-class words.  For that prefix the
    sentence's analyser output is forwarded to ``knp_prc`` and
    ``compress_sentence`` is called.

    Args:
        headline: Headline text.
        sent: Sentence text to be compressed.

    Returns:
        A ``(compressed, alignment)`` tuple as produced by
        ``compress_sentence``, or ``None`` when a candidate prefix has six or
        fewer morphemes, when ``compress_sentence`` raises
        ``BadPairException``, or when no prefix qualifies.
    """
    # Analyse the sentence once; its raw output is reused for knp_prc below.
    juman_prc.stdin.write(preprocess_sentence(sent) + '\n')
    sent_juman_output = read_until_EOS(juman_prc.stdout)
    sent_words = extract_open_classes(decode_juman_info(sent_juman_output))

    headline = preprocess_sentence(headline)
    segments = [
        piece
        for chunk in headline.split(' ')
        for piece in chunk.split('ーー')
    ]

    # Try the full headline first, then ever shorter prefixes of it.
    for prefix_len in range(len(segments), 0, -1):
        candidate = ' '.join(segments[:prefix_len]) + '\n'
        juman_prc.stdin.write(preprocess_sentence(candidate))
        title_morphemes = decode_juman_info(read_until_EOS(juman_prc.stdout))

        # Shorter prefixes only get shorter, so stop searching altogether.
        if len(title_morphemes) <= 6:
            return None

        open_classes = extract_open_classes(title_morphemes)
        # TODO: also take word order into account
        if (len(open_classes) < 4
                or not set(open_classes).issubset(set(sent_words))):
            continue

        knp_prc.stdin.write(sent_juman_output)
        knp_info = analyze_knp(read_until_EOS(knp_prc.stdout))
        oc_pairs = mark_words_in_sent(
            knp_info['morphemes'], title_morphemes, open_classes)
        try:
            compressed, alignment = compress_sentence(
                knp_info, title_morphemes, oc_pairs)
        except BadPairException:
            return None
        return compressed, alignment

    return None
#!/usr/bin/python3
"""Filter: pass every stdin line through ``preprocess_sentence``.

Each input line has one leading space removed (if present), is handed to
``preprocess_sentence``, and the result is written to stdout without an
extra newline, flushing after every line.
"""
from knp.knpinfo import preprocess_sentence
import sys

if __name__ == '__main__':
    for line in sys.stdin:
        # Strip at most one leading space before preprocessing.
        cleaned = line[1:] if line[0] == ' ' else line
        # Flush per line so a consumer reading our stdout sees output promptly.
        print(preprocess_sentence(cleaned), end='', flush=True)
knp_info = analyze_knp(sent_knp_output) oc_pairs = mark_words_in_sent(knp_info['morphemes'], title_morphemes, open_classes) try: compressed, alignment = compress_sentence(knp_info, title_morphemes, oc_pairs) except BadPairException: return return compressed, alignment else: titles = titles[:-1] if __name__ == '__main__': if len(sys.argv) < 2: print('usage: ./print_pairs.py xml-file-like-毎日新聞コーパス > file-to-store-pairs.txt', file=sys.stderr) sys.exit(1) for hline, sent in yield_headline_and_1st_sent(sys.argv[1]): sent = sent.lstrip().rstrip() compressed_alignment = grammarize_headline(hline, sent) if compressed_alignment: compressed, alignment = compressed_alignment print(hline) print(preprocess_sentence(sent)) print(compressed) for i, j in alignment: print(str(i) + '-' + str(j), end=' ') print('\n') # sys.stdin.readline() knp_prc.terminate() juman_prc.terminate() sys.exit(0)