# -*- coding: utf-8 -*-
import luigi
import tools
import gentask
from sbc4_tm_lm_tasks import sbc4_zh_to_tok_tag_phrasetable, sbc4_tag_lm


class test_zh_data(luigi.ExternalTask):
    """External task wrapping the Chinese test file ``data/testzh.txt``."""

    def output(self):
        """Return the local target pointing at the test text file."""
        test_file = luigi.LocalTarget('data/testzh.txt')
        return test_file


# The return value is discarded here.
# NOTE(review): presumably gentask.zhtoktag defines/registers the task named
# 'test_zh_tok' as a side effect -- confirm against gentask.
gentask.zhtoktag(
    'test_zh_tok',
    test_zh_data(),
    'tt',
    tm=sbc4_zh_to_tok_tag_phrasetable(),
    lm=sbc4_tag_lm())


class oxford_np_ench(luigi.ExternalTask):
    """External task wrapping ``data/oxford.np.ench.txt``."""

    def output(self):
        """Return the local target for the English/Chinese Oxford NP file."""
        ench_file = luigi.LocalTarget('data/oxford.np.ench.txt')
        return ench_file


class oxford_np_ch(luigi.Task):
    """Task that depends on ``oxford_np_ench`` and targets
    ``data/oxford.np.ch.txt``.

    NOTE(review): no ``run`` method is visible for this task in this part of
    the file -- confirm whether one exists or was lost.
    """

    def requires(self):
        """Return the single upstream dependency, ``oxford_np_ench``."""
        upstream = oxford_np_ench()
        return upstream

    def output(self):
        """Return the local target for the Chinese-only Oxford NP file."""
        ch_file = luigi.LocalTarget('data/oxford.np.ch.txt')
        return ch_file
# NOTE(review): the two methods below belong to a task class whose header is
# not visible in this part of the file (presumably ``fbis_en_ch_prune_long``,
# which is instantiated further down) -- re-indent them under that header.
def output(self):
    """Return the pruned English and Chinese targets, keyed 'en' and 'ch'."""
    en_pruned = luigi.LocalTarget('data/fbis/fbis.en.pruned')
    ch_pruned = luigi.LocalTarget('data/fbis/fbis.ch.pruned')
    return {'en': en_pruned, 'ch': ch_pruned}


def run(self):
    """Copy the parallel en/ch inputs, dropping over-long sentence pairs.

    The two inputs are read in lockstep; a pair is written to both outputs
    only when the Chinese line has ``len`` of at most 120. ``len`` is taken
    on the raw line, so the line terminator counts toward that limit.
    Iteration stops as soon as either input is exhausted (``zip`` semantics).
    """
    with self.input()['en'].open('r') as en_src, \
            self.input()['ch'].open('r') as ch_src, \
            self.output()['en'].open('w') as en_dst, \
            self.output()['ch'].open('w') as ch_dst:
        for en_sentence, ch_sentence in zip(en_src, ch_src):
            if len(ch_sentence) <= 120:
                # Keep both sides together so the corpus stays aligned.
                en_dst.write(en_sentence)
                ch_dst.write(ch_sentence)


# Derived from the 'ch' output of the pruning task. The object returned by
# gentask.untok is called with no arguments below.
# NOTE(review): presumably it is a generated task class -- confirm in gentask.
fbis_ch_untok = gentask.untok(
    'fbis_ch_untok',
    fbis_en_ch_prune_long(),
    'data/fbis/fbis.ch.untok',
    input_target_key='ch')

# Tok/tag step over the untokenized Chinese text, using the sbc4 phrase table
# (tm) and tag language model (lm).
fbis_ch_untok_toktag = gentask.zhtoktag(
    'fbis_ch_untok_toktag',
    fbis_ch_untok(),
    'data/fbis/fbis.ch.untok.tok.txt',
    tm=sbc4_zh_to_tok_tag_phrasetable(),
    lm=sbc4_tag_lm())


if __name__ == '__main__':
    luigi.run(local_scheduler=True)