#!/usr/bin/env python
# -*- coding: utf-8 -*-
import luigi
import tools
import gentask
from sbc4_tm_lm_tasks import sbc4_zh_to_tok_tag_phrasetable, sbc4_tag_lm


class test_zh_data(luigi.ExternalTask):
    """External fixture: the raw Chinese test text at data/testzh.txt."""

    def output(self):
        return luigi.LocalTarget('data/testzh.txt')


# Generate the tokenize+tag task over the test data using the SBC4
# translation model and tag language model.
# NOTE(review): the generated task class is not bound to a name here —
# presumably gentask.zhtoktag registers it as a side effect; confirm.
gentask.zhtoktag('test_zh_tok',
                 test_zh_data(),
                 'tt',
                 tm=sbc4_zh_to_tok_tag_phrasetable(),
                 lm=sbc4_tag_lm())


class oxford_np_ench(luigi.ExternalTask):
    """External fixture: the Oxford NP English/Chinese mixed file."""

    def output(self):
        return luigi.LocalTarget('data/oxford.np.ench.txt')


class oxford_np_ch(luigi.Task):
    """Task producing the Chinese-only file derived from oxford_np_ench."""

    def requires(self):
        return oxford_np_ench()

    def output(self):
        return luigi.LocalTarget('data/oxford.np.ch.txt')
def output(self): return { 'en': luigi.LocalTarget('data/fbis/fbis.en.pruned'), 'ch': luigi.LocalTarget('data/fbis/fbis.ch.pruned') } def run(self): with self.input()['en'].open( 'r') as en_infile, self.input()['ch'].open('r') as ch_infile: with self.output()['en'].open( 'w') as en_outfile, self.output()['ch'].open( 'w') as ch_outfile: for enline, chline in zip(en_infile, ch_infile): if len(chline) > 120: continue en_outfile.write(enline) ch_outfile.write(chline) fbis_ch_untok = gentask.untok( 'fbis_ch_untok', fbis_en_ch_prune_long(), 'data/fbis/fbis.ch.untok', input_target_key='ch') fbis_ch_untok_toktag = gentask.zhtoktag( 'fbis_ch_untok_toktag', fbis_ch_untok(), 'data/fbis/fbis.ch.untok.tok.txt', tm=sbc4_zh_to_tok_tag_phrasetable(), lm=sbc4_tag_lm()) if __name__ == '__main__': luigi.run(local_scheduler=True)
# en_patterns_pretty = gentask.patterns_pretty(
#     'en_patterns_pretty', en_patterns(), target_dir / 'en.patterns.json')
# patterns_allline_task = gentask_pattern.pipeline_allline_task(
#     'moviesub_en_patterns_allline', en_truecase())

# Pattern extraction over the English sentences.
filtered_patterns = gentask_pattern.filtered_patterns_from_sentences(
    'moviesub_en_filtered_patterns', en())

# ch = gentask.slice_lines_grouped_by_n('ch', ench(), target_dir / 'ch.txt',
#                                       n=3, s=1)

# Chinese side: detokenize, re-tokenize/tag with the SBC4 models,
# then strip the slash tags to obtain plain tokenized text.
ch_untok = gentask.untok('ch_untok', ch(), target_dir / 'ch.untok.txt')

ch_toktag = gentask.zhtoktag('ch_toktag',
                             ch_untok(),
                             target_dir / 'ch.toktag.txt',
                             tm=sbc4_tok_tag_tm(),
                             lm=sbc4_tag_lm())

ch_tok = gentask.remove_slashtag('ch_tok',
                                 ch_toktag(),
                                 target_dir / 'ch.tok.txt')

# Interleave English and tokenized Chinese lines into one file
# (alignment-tool input format).
en_chtok = gentask.parallel_lines_merge('en_chtok',
                                        en(),
                                        ch_tok(),
                                        target_dir / 'en_chtok.txt')

# giza_task = gentask_giza.giza(inputf=str(target_dir / 'en_chtok.txt'),
#                               outputd=str(target_dir / 'giza/'))

# Pre-existing artifacts exposed as targets.
unpack_singleline_patterns = gentask.localtarget_task(
    target_dir / 'en.gt.hiih.patterns.pretty.unpack_singleline.json')

phrasetable = gentask.localtarget_task(target_dir / 'phrase-table.10000.gz')
# Detokenize the Chinese side of the SBC4 test split.
sbc4_test_zh_untok = gentask.untok('sbc4_test_zh_untok',
                                   sbc4_test_zh(),
                                   base_dir / 'sbc4_test.zh.untok.txt')

# Train-side models: every 3rd line starting at offset 1 of the training
# file is the tag line; build a language model and a phrase table from it.
sbc4_train_tag = gentask.slice_lines_grouped_by_n(
    'sbc4_train_tag',
    sbc4_train(),
    base_dir / 'sbc4_train.tag.txt',
    n=3,
    s=1)

sbc4_train_tag_lm = gentask.lm('sbc4_train_tag_lm',
                               sbc4_train_tag(),
                               base_dir / 'sbc4_train.tag.lm',
                               base_dir / 'sbc4_train.tag.blm')

sbc4_train_zh_to_tok_tag_phrasetable = gentask.phrasetable(
    'sbc4_train_zh_to_tok_tag_phrasetable',
    sbc4_train(),
    base_dir / 'sbc4_train.zh2toktag.phrasetable.h5')

# Tokenize+tag the detokenized test text with the train-derived models.
sbc4_train_toktag_sbc4_test = gentask.zhtoktag(
    'sbc4_train_toktag_sbc4_test',
    sbc4_test_zh_untok(),
    base_dir / 'sbc4_test.zh.untok.tok.txt',
    tm=sbc4_train_zh_to_tok_tag_phrasetable(),
    lm=sbc4_train_tag_lm())

# Evaluation: diff the system output against the gold test data and
# extract the error words.
sbc4_test_slash = gentask.transformat_line2slash(
    'sbc4_test_slash', sbc4_test(), base_dir / 'sbc4_test.slash.txt')

wdiff_sbc4_test = gentask.word_diff('wdiff_sbc4_test',
                                    sbc4_test_slash(),
                                    sbc4_train_toktag_sbc4_test(),
                                    base_dir / 'sbc4-test.wdiff')

wdiff_errors_sbc4_test = gentask.word_diff_errors(
    'wdiff_errors_sbc4_test',
    wdiff_sbc4_test(),
    base_dir / 'sbc4-test.wdiff.errors')

wdiff_src_error_words_sbc4_test = gentask.word_diff_src_error_words(
    'wdiff_src_error_words_sbc4_test',
    wdiff_errors_sbc4_test(),
    base_dir / 'sbc4-test.wdiff.src_error_words')