# Standard library
import sys
from collections import Counter, defaultdict
from functools import reduce
from itertools import chain
from operator import itemgetter
from pathlib import Path

# Third-party
import luigi

# Project-local modules (relative import order kept as in the original)
import gentask
import gentask_giza
from sbc4_tm_lm_tasks import sbc4_tok_tag_tm, sbc4_tag_lm
import gentask_pattern
import gentask_spg

# Directory that holds the movie-subtitle target files.
target_dir = Path('tgt_data/moviesub')

# Objects wrapping the already-existing ch/en text files, built with
# gentask.localtarget_task (its exact return type is not visible here).
ch = gentask.localtarget_task('tgt_data/moviesub/ch.txt')
en = gentask.localtarget_task('tgt_data/moviesub/en.txt')

# The steps below are currently disabled (left commented out).
# ench = gentask.transformat_tab2lines('line_sep_ench', orig_ench(),
#                                      target_dir / 'ench.txt')
# en = gentask.slice_lines_grouped_by_n('en', ench(), target_dir / 'en.txt',
#                                       n=3,
#                                       s=0)
# en_unidecode = gentask.unidecode('en_unidecode', en(),
#                                  target_dir / 'en.unidecode.txt')
# en_retok = gentask.word_tokenize('en_retok', en_unidecode(),
#                                  target_dir / 'en.retok.txt')
# en_truecase = gentask.truecase('moviesub_en_truecase', en_retok(), en_retok(),
#                                target_dir / 'en.truecase.txt')
def requires(self):
    """Return the dependency built from ``self.inputf``.

    ``localtarget_task(self.inputf)`` yields a callable, which is invoked
    with no arguments and whose result is returned.

    NOTE(review): presumably this is the luigi ``Task.requires`` hook and
    the callable is a task class -- the enclosing class is not visible
    here, confirm.
    """
    upstream_factory = localtarget_task(self.inputf)
    return upstream_factory()
# -*- coding: utf-8 -*-
# Standard library
import sys
from collections import Counter, defaultdict
from functools import reduce
from itertools import chain
from operator import itemgetter
from pathlib import Path
import operator

# Third-party
import luigi

# Project-local modules (relative import order kept as in the original)
import gentask
import gentask_giza
from sbc4_tm_lm_tasks import sbc4_tok_tag_tm, sbc4_tag_lm
import gentask_pattern

# Source file for this pipeline and the directory receiving every output.
orig_ench = gentask.localtarget_task('src_data/medal.ench.txt')
target_dir = Path('tgt_data/medal')

# Each step below is produced by a gentask factory and is handed an
# instance of the preceding step plus the path of its own output file.
ench = gentask.transformat_tab2lines(
    'line_sep_ench',
    orig_ench(),
    target_dir / 'ench.txt',
)
en = gentask.slice_lines_grouped_by_n(
    'en',
    ench(),
    target_dir / 'en.txt',
    n=3,
    s=0,
)
en_unidecode = gentask.unidecode(
    'en_unidecode',
    en(),
    target_dir / 'en.unidecode.txt',
)
en_retok = gentask.word_tokenize(
    'en_retok',
    en_unidecode(),
    target_dir / 'en.retok.txt',
)
# en_retok() is passed twice; the role of each argument is defined by
# gentask.truecase, which is not visible here.
en_truecase = gentask.truecase(
    'medal_en_truecase',
    en_retok(),
    en_retok(),
    target_dir / 'en.truecase.txt',
)
# en_genia = gentask.geniatagger('medal_en_genia', en_truecase(),