예제 #1
0
#     'en_genia_line_iih', en_genia(), target_dir / 'en.genia.hiih.txt'
# )  # horizontal and IIH

# en_patterns = gentask.patterns('en_patterns', en_genia_line_iih(),
#                                target_dir / 'en.patterns.json.d')

# en_patterns_allline = gentask.pattern_allline(
#     'en_patterns_allline', en_genia_line_iih(), target_dir / 'en.patterns.d')

# en_patterns_pretty = gentask.patterns_pretty(
# 'en_patterns_pretty', en_patterns(), target_dir / 'en.patterns.json')

# patterns_allline_task = gentask_pattern.pipeline_allline_task(
# 'moviesub_en_patterns_allline', en_truecase())

# Each call below follows the same shape: a string label (matching the
# variable it is bound to, except for the first call), the result of calling
# one or more earlier steps, and -- for the `gentask` calls -- an output
# location under `target_dir`.
filtered_patterns = gentask_pattern.filtered_patterns_from_sentences(
    'moviesub_en_filtered_patterns',
    en(),
)

# NOTE(review): the only visible definition of `ch` is the disabled statement
# below, yet `ch()` is called right after it -- presumably `ch` is defined
# earlier in this file; confirm before re-enabling or removing this.
# ch = gentask.slice_lines_grouped_by_n('ch', ench(), target_dir / 'ch.txt',
# n=3,
# s=1)
ch_untok = gentask.untok(
    'ch_untok',
    ch(),
    target_dir / 'ch.untok.txt',
)

# `zhtoktag` consumes the `untok` result plus two extra inputs passed by
# keyword (`tm` and `lm`).
ch_toktag = gentask.zhtoktag(
    'ch_toktag',
    ch_untok(),
    target_dir / 'ch.toktag.txt',
    tm=sbc4_tok_tag_tm(),
    lm=sbc4_tag_lm(),
)

ch_tok = gentask.remove_slashtag(
    'ch_tok',
    ch_toktag(),
    target_dir / 'ch.tok.txt',
)

# Merge takes two upstream results: `en()` first, then `ch_tok()`.
en_chtok = gentask.parallel_lines_merge(
    'en_chtok',
    en(),
    ch_tok(),
    target_dir / 'en_chtok.txt',
)
예제 #2
0
#     'en_genia_line_iih', en_genia(), target_dir / 'en.genia.hiih.txt'
# )  # horizontal and IIH

# en_patterns = gentask.patterns('en_patterns', en_genia_line_iih(),
#                                target_dir / 'en.patterns.json.d')

# en_patterns_allline = gentask.pattern_allline(
#     'en_patterns_allline', en_genia_line_iih(), target_dir / 'en.patterns.d')

# en_patterns_pretty = gentask.patterns_pretty(
# 'en_patterns_pretty', en_patterns(), target_dir / 'en.patterns.json')

# patterns_allline_task = gentask_pattern.pipeline_allline_task(
# 'medal_en_patterns_allline', en_truecase())

# Each call below follows the same shape: a string label (matching the
# variable it is bound to, except for the first call), the result of calling
# one or more earlier steps, and -- for the `gentask` calls -- an output
# location under `target_dir`.
filtered_patterns = gentask_pattern.filtered_patterns_from_sentences(
    'medal_en_filtered_patterns',
    en_truecase(),
)

# `ch` is derived from `ench()` with n=3 and s=1.
# NOTE(review): presumably this selects one line out of every group of three;
# confirm against `slice_lines_grouped_by_n`.
ch = gentask.slice_lines_grouped_by_n(
    'ch',
    ench(),
    target_dir / 'ch.txt',
    n=3,
    s=1,
)

# `zhtoktag` consumes `ch()` directly here, plus two extra inputs passed by
# keyword (`tm` and `lm`).
ch_toktag = gentask.zhtoktag(
    'ch_toktag',
    ch(),
    target_dir / 'ch.toktag.txt',
    tm=sbc4_tok_tag_tm(),
    lm=sbc4_tag_lm(),
)

ch_tok = gentask.remove_slashtag(
    'ch_tok',
    ch_toktag(),
    target_dir / 'ch.tok.txt',
)

# Merge takes two upstream results: `en_truecase()` first, then `ch_tok()`.
en_chtok = gentask.parallel_lines_merge(
    'en_chtok',
    en_truecase(),
    ch_tok(),
    target_dir / 'en_chtok.txt',
)

giza_task = gentask_giza.giza(inputf=str(target_dir / 'en_chtok.txt'),