def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False): cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) # Temporary work dir self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.') logging.info('Using temp dir {}'.format(self.tmp)) # Normalization self.norm = norm if self.norm: self.tokenizer = util.popen_io([ os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u' ]) self.detokenizer = util.popen_io( [os.path.join(cdec_root, 'corpus', 'untok.pl')]) # Word aligner fwd_params = os.path.join(configdir, 'a.fwd_params') fwd_err = os.path.join(configdir, 'a.fwd_err') rev_params = os.path.join(configdir, 'a.rev_params') rev_err = os.path.join(configdir, 'a.rev_err') self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params, rev_err) # Grammar extractor sa_config = ConfigObj(os.path.join(configdir, 'sa.ini'), unrepr=True) sa_config.filename = os.path.join(self.tmp, 'sa.ini') util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir)) sa_config.write() self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True) self.grammar_files = collections.deque() self.grammar_dict = {} self.cache_size = cache_size # HPYPLM reference stream ref_fifo_file = os.path.join(self.tmp, 'ref.fifo') os.mkfifo(ref_fifo_file) self.ref_fifo = open(ref_fifo_file, 'w+') # Start with empty line (do not learn prior to first input) self.ref_fifo.write('\n') self.ref_fifo.flush() # Decoder decoder_config = [[f.strip() for f in line.split('=')] for line in open(os.path.join(configdir, 'cdec.ini')) ] util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file) decoder_config_file = os.path.join(self.tmp, 'cdec.ini') with open(decoder_config_file, 'w') as output: for (k, v) in decoder_config: output.write('{}={}\n'.format(k, v)) decoder_weights = os.path.join(configdir, 'weights.final') self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights)
def main():
    """Package a realtime translation config directory (HPYPLM required).

    Takes exactly 12 positional arguments (alignment model files, suffix
    array dir + ini, monolingual KLM, HPYPLM shared object + corpus,
    decoder ini, final weights, output dir) and copies/rewrites them into
    a fresh output directory usable by the realtime translator.

    Exits with status 2 on bad usage, 1 if the output dir already exists.
    """
    if len(sys.argv[1:]) != 12:
        sys.stderr.write(
            'usage: {} a.fwd_params a.fwd_err a.rev_params a.rev_err sa sa.ini mono.klm libcdec_ff_hpyplm.so corpus.hpyplm cdec.ini weights.final output.d\n'
            .format(sys.argv[0]))
        sys.exit(2)
    (a_fwd_params, a_fwd_err, a_rev_params, a_rev_err, sa, sa_ini, mono_klm,
     libcdec_ff_hpyplm_so, corpus_hpyplm, cdec_ini, weights_final,
     output_d) = sys.argv[1:]
    if os.path.exists(output_d):
        sys.stderr.write('Directory {} exists, exiting.\n'.format(output_d))
        sys.exit(1)
    # output.d
    os.mkdir(output_d)
    # alignment model
    shutil.copy(a_fwd_params, os.path.join(output_d, 'a.fwd_params'))
    shutil.copy(a_fwd_err, os.path.join(output_d, 'a.fwd_err'))
    shutil.copy(a_rev_params, os.path.join(output_d, 'a.rev_params'))
    shutil.copy(a_rev_err, os.path.join(output_d, 'a.rev_err'))
    # grammar extractor
    shutil.copytree(sa, os.path.join(output_d, 'sa'))
    config = ConfigObj(sa_ini, unrepr=True)
    config.filename = os.path.join(output_d, 'sa.ini')
    rt.util.sa_ini_for_config(config)
    config.write()
    # language models
    shutil.copy(mono_klm, os.path.join(output_d, 'mono.klm'))
    shutil.copy(libcdec_ff_hpyplm_so,
                os.path.join(output_d, 'libcdec_ff_hpyplm.so'))
    shutil.copy(corpus_hpyplm, os.path.join(output_d, 'corpus.hpyplm'))
    # decoder config: parse key=value lines; use a context manager so the
    # input file handle is closed deterministically (was previously leaked
    # by open() inside the comprehension).
    with open(cdec_ini) as ini:
        config = [[f.strip() for f in line.split('=')] for line in ini]
    rt.util.cdec_ini_for_config(config)
    with open(os.path.join(output_d, 'cdec.ini'), 'w') as output:
        for (k, v) in config:
            output.write('{}={}\n'.format(k, v))
    # weights
    shutil.copy(weights_final, os.path.join(output_d, 'weights.final'))
def main():
    """Package a realtime translation config directory (HPYPLM optional).

    Takes 10 required positional arguments (alignment model files, suffix
    array dir + ini, monolingual KLM, decoder ini, final weights, output
    dir) plus an optional pair (libcdec_ff_hpyplm.so, corpus.hpyplm).
    Copies/rewrites everything into a fresh output directory and writes an
    rt.ini recording whether HPYPLM is enabled.

    Exits with status 2 on bad usage, 1 if the output dir already exists.
    """
    if len(sys.argv[1:]) not in (10, 12):
        sys.stderr.write(
            'usage: {} a.fwd_params a.fwd_err a.rev_params a.rev_err sa sa.ini mono.klm cdec.ini weights.final output.d [libcdec_ff_hpyplm.so corpus.hpyplm]\n'
            .format(sys.argv[0]))
        sys.exit(2)
    (a_fwd_params, a_fwd_err, a_rev_params, a_rev_err, sa, sa_ini, mono_klm,
     cdec_ini, weights_final, output_d) = sys.argv[1:11]
    # Optional HPYPLM pair: both present (12 args) or both None (10 args).
    (libcdec_ff_hpyplm_so,
     corpus_hpyplm) = sys.argv[11:13] if len(sys.argv[1:]) == 12 else (None,
                                                                      None)
    if os.path.exists(output_d):
        sys.stderr.write('Directory {} exists, exiting.\n'.format(output_d))
        sys.exit(1)
    # output.d
    os.mkdir(output_d)
    # alignment model
    shutil.copy(a_fwd_params, os.path.join(output_d, 'a.fwd_params'))
    shutil.copy(a_fwd_err, os.path.join(output_d, 'a.fwd_err'))
    shutil.copy(a_rev_params, os.path.join(output_d, 'a.rev_params'))
    shutil.copy(a_rev_err, os.path.join(output_d, 'a.rev_err'))
    # grammar extractor
    shutil.copytree(sa, os.path.join(output_d, 'sa'))
    config = ConfigObj(sa_ini, unrepr=True)
    config.filename = os.path.join(output_d, 'sa.ini')
    rt.util.sa_ini_for_config(config)
    config.write()
    # language models
    shutil.copy(mono_klm, os.path.join(output_d, 'mono.klm'))
    if libcdec_ff_hpyplm_so:
        shutil.copy(libcdec_ff_hpyplm_so,
                    os.path.join(output_d, 'libcdec_ff_hpyplm.so'))
    if corpus_hpyplm:
        shutil.copy(corpus_hpyplm, os.path.join(output_d, 'corpus.hpyplm'))
    # decoder config: parse key=value lines; use a context manager so the
    # input file handle is closed deterministically (was previously leaked
    # by open() inside the comprehension).
    with open(cdec_ini) as ini:
        config = [[f.strip() for f in line.split('=')] for line in ini]
    rt.util.cdec_ini_for_config(config)
    with open(os.path.join(output_d, 'cdec.ini'), 'w') as output:
        for (k, v) in config:
            output.write('{}={}\n'.format(k, v))
    # weights
    shutil.copy(weights_final, os.path.join(output_d, 'weights.final'))
    # other options
    rt_ini = os.path.join(output_d, 'rt.ini')
    with open(rt_ini, 'w') as out:
        if libcdec_ff_hpyplm_so and corpus_hpyplm:
            out.write('hpyplm=true\n')
        else:
            out.write('hpyplm=false\n')
        out.write('metric=ibm_bleu\n')
    # Rejoined string literal: it was split across physical lines in the
    # damaged source, which is a syntax error in Python.
    sys.stderr.write(
        'IMPORTANT: add any additional options such as metric=meteor to {}\n'
        .format(rt_ini))