def __init__(self, fwd_params, fwd_err, rev_params, rev_err):
    """Start forward and reverse fast_align subprocesses plus an atools
    subprocess that symmetrizes their output with grow-diag-final-and.

    fwd_params/fwd_err: parameter and stderr files of the forward model
    rev_params/rev_err: parameter and stderr files of the reverse model
    """
    # abspath guards against a relative __file__: without it the triple
    # dirname collapses to '' and the tool paths below are wrong
    cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
    atools = os.path.join(cdec_root, 'utils', 'atools')
    # Diagonal tension (T) and mean (m) recovered from training stderr
    (fwd_T, fwd_m) = self.read_err(fwd_err)
    (rev_T, rev_m) = self.read_err(rev_err)
    fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
    rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
    tools_cmd = [atools, '-i', '-', '-j', '-', '-c', 'grow-diag-final-and']
    logging.info('Executing: {}'.format(' '.join(fwd_cmd)))
    self.fwd_align = util.popen_io(fwd_cmd)
    logging.info('Executing: {}'.format(' '.join(rev_cmd)))
    self.rev_align = util.popen_io(rev_cmd)
    logging.info('Executing: {}'.format(' '.join(tools_cmd)))
    self.tools = util.popen_io(tools_cmd)
def __init__(self, configdir, tmpdir="/tmp", cache_size=5, norm=False):
    """Build the shared pipeline pieces (tokenizers, forced aligner,
    online grammar extractor) and the per-context bookkeeping tables."""
    # Dispatch table: command name -> (handler, allowed argument counts)
    self.COMMANDS = {
        "TR": (self.translate, {1}),
        "LEARN": (self.learn, {2}),
        "SAVE": (self.save_state, {0, 1}),
        "LOAD": (self.load_state, {0, 1}),
        "DROP": (self.drop_ctx, {0}),
        "LIST": (self.list_ctx, {0}),
    }
    # Walk three levels up from this file to reach the cdec root
    cdec_root = os.path.abspath(__file__)
    for _ in range(3):
        cdec_root = os.path.dirname(cdec_root)

    ### Single instance for all contexts
    self.config = configdir
    # Scratch space for generated configs and grammar files
    self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix="realtime.")
    logger.info("Using temp dir {}".format(self.tmp))

    # Optional (de)tokenization subprocesses, each guarded by its own lock
    self.norm = norm
    if self.norm:
        corpus_dir = os.path.join(cdec_root, "corpus")
        self.tokenizer = util.popen_io([os.path.join(corpus_dir, "tokenize-anything.sh"), "-u"])
        self.tokenizer_lock = util.FIFOLock()
        self.detokenizer = util.popen_io([os.path.join(corpus_dir, "untok.pl")])
        self.detokenizer_lock = util.FIFOLock()

    # Forced word aligner over (source, reference) pairs
    self.aligner = aligner.ForceAligner(
        os.path.join(configdir, "a.fwd_params"),
        os.path.join(configdir, "a.fwd_err"),
        os.path.join(configdir, "a.rev_params"),
        os.path.join(configdir, "a.rev_err"))

    # Suffix-array grammar extractor, re-pointed at absolute paths
    sa_config = cdec.configobj.ConfigObj(os.path.join(configdir, "sa.ini"), unrepr=True)
    sa_config.filename = os.path.join(self.tmp, "sa.ini")
    util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
    sa_config.write()
    self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True)
    self.cache_size = cache_size

    ### One instance per context
    self.ctx_names = set()
    # All context-dependent operations are atomic
    self.ctx_locks = collections.defaultdict(util.FIFOLock)
    # ctx -> list of (source, target, alignment)
    self.ctx_data = {}
    # Grammar extractor is not threadsafe
    self.extractor_lock = util.FIFOLock()
    # ctx -> deque of grammar files
    self.grammar_files = {}
    # ctx -> dict of {sentence: file}
    self.grammar_dict = {}
    self.decoders = {}
def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
    """Set up the realtime pipeline: optional tokenizers, forced word
    aligner, online grammar extractor, HPYPLM reference fifo, and the
    MIRA decoder.

    configdir: directory holding a.* aligner files, sa.ini, cdec.ini,
        and weights.final
    tmpdir: parent directory for the per-instance scratch directory
    cache_size: number of sentence grammar files to keep
    norm: if True, start tokenizer/detokenizer subprocesses
    """
    # abspath guards against a relative __file__: without it the triple
    # dirname collapses to '' and the corpus-script paths below are wrong
    cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    # Temporary work dir
    self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
    logging.info('Using temp dir {}'.format(self.tmp))
    # Normalization
    self.norm = norm
    if self.norm:
        self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'])
        self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')])
    # Word aligner
    fwd_params = os.path.join(configdir, 'a.fwd_params')
    fwd_err = os.path.join(configdir, 'a.fwd_err')
    rev_params = os.path.join(configdir, 'a.rev_params')
    rev_err = os.path.join(configdir, 'a.rev_err')
    self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params, rev_err)
    # Grammar extractor
    sa_config = ConfigObj(os.path.join(configdir, 'sa.ini'), unrepr=True)
    sa_config.filename = os.path.join(self.tmp, 'sa.ini')
    util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
    sa_config.write()
    self.extractor = cdec.sa.GrammarExtractor(sa_config.filename, online=True)
    self.grammar_files = collections.deque()
    self.grammar_dict = {}
    self.cache_size = cache_size
    # HPYPLM reference stream
    ref_fifo_file = os.path.join(self.tmp, 'ref.fifo')
    os.mkfifo(ref_fifo_file)
    self.ref_fifo = open(ref_fifo_file, 'w+')
    # Start with empty line (do not learn prior to first input)
    self.ref_fifo.write('\n')
    self.ref_fifo.flush()
    # Decoder config: read key=value lines, closing the file promptly and
    # splitting on the first '=' only so values may themselves contain '='
    # (the original unbounded split broke the (k, v) unpacking below)
    with open(os.path.join(configdir, 'cdec.ini')) as config_in:
        decoder_config = [[f.strip() for f in line.split('=', 1)] for line in config_in]
    util.cdec_ini_for_realtime(decoder_config, os.path.abspath(configdir), ref_fifo_file)
    decoder_config_file = os.path.join(self.tmp, 'cdec.ini')
    with open(decoder_config_file, 'w') as output:
        for (k, v) in decoder_config:
            output.write('{}={}\n'.format(k, v))
    decoder_weights = os.path.join(configdir, 'weights.final')
    self.decoder = decoder.MIRADecoder(decoder_config_file, decoder_weights)
def __init__(self, config, weights):
    """Launch a cdec decoder subprocess using the given config and weights."""
    # Walk three levels up from this file to reach the cdec root
    cdec_root = os.path.abspath(__file__)
    for _ in range(3):
        cdec_root = os.path.dirname(cdec_root)
    cmd = [os.path.join(cdec_root, 'decoder', 'cdec'), '-c', config, '-w', weights]
    logger.info('Executing: {}'.format(' '.join(cmd)))
    self.decoder = util.popen_io(cmd)
    # Serialize access so concurrent callers do not interleave pipe I/O
    self.lock = util.FIFOLock()
def __init__(self, config, weights, metric="ibm_bleu"):
    """Launch a streaming kbest_cut_mira subprocess for online learning."""
    # Walk three levels up from this file to reach the cdec root
    cdec_root = os.path.abspath(__file__)
    for _ in range(3):
        cdec_root = os.path.dirname(cdec_root)
    mira_bin = os.path.join(cdec_root, "training", "mira", "kbest_cut_mira")
    # optimizer=2 step=0.001 best=500, k=500, uniq, stream, metric
    cmd = [mira_bin, "-c", config, "-w", weights,
           "-o", "2", "-C", "0.001", "-b", "500", "-k", "500",
           "-u", "-t", "-m", metric]
    logger.info("Executing: {}".format(" ".join(cmd)))
    self.decoder = util.popen_io(cmd)
    # Serialize access so concurrent callers do not interleave pipe I/O
    self.lock = util.FIFOLock()
def __init__(self, config, weights):
    """Start a cdec decoder subprocess for the given config/weights pair."""
    here = os.path.abspath(__file__)
    # Three levels up: .../realtime/rt/<file> -> cdec root
    cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(here)))
    decoder_bin = os.path.join(cdec_root, "decoder", "cdec")
    decoder_cmd = [decoder_bin, "-c", config, "-w", weights]
    logger.info("Executing: {}".format(" ".join(decoder_cmd)))
    self.decoder = util.popen_io(decoder_cmd)
    # One caller at a time may talk to the subprocess
    self.lock = util.FIFOLock()
def __init__(self, config, weights, metric='ibm_bleu'):
    """Start kbest_cut_mira in streaming mode for online weight updates."""
    here = os.path.abspath(__file__)
    # Three levels up: .../realtime/rt/<file> -> cdec root
    cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(here)))
    mira_bin = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
    # optimizer=2 step=0.001 best=500, k=500, uniq, stream, metric
    mira_cmd = [mira_bin, '-c', config, '-w', weights]
    mira_cmd += ['-o', '2', '-C', '0.001', '-b', '500', '-k', '500']
    mira_cmd += ['-u', '-t', '-m', metric]
    logger.info('Executing: {}'.format(' '.join(mira_cmd)))
    self.decoder = util.popen_io(mira_cmd)
    # One caller at a time may talk to the subprocess
    self.lock = util.FIFOLock()
def __init__(self, config, weights):
    """Start kbest_cut_mira in streaming mode (default metric)."""
    # Walk three levels up from this file to reach the cdec root
    cdec_root = os.path.abspath(__file__)
    for _ in range(3):
        cdec_root = os.path.dirname(cdec_root)
    mira_bin = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
    # optimizer=2 step=0.001 best=500, k=500, uniq, stream
    cmd = [mira_bin, '-c', config, '-w', weights,
           '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t']
    logger.info('Executing: {}'.format(' '.join(cmd)))
    self.decoder = util.popen_io(cmd)
    # Serialize access so concurrent callers do not interleave pipe I/O
    self.lock = util.FIFOLock()
def __init__(self, config, weights):
    """Start kbest_cut_mira in streaming mode with the given config and
    weights files."""
    # abspath guards against a relative __file__: without it the triple
    # dirname collapses to '' and the mira path below is wrong
    cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    mira = os.path.join(cdec_root, 'training', 'mira', 'kbest_cut_mira')
    # optimizer=2 step=0.001 best=500, k=500, uniq, stream
    mira_cmd = [mira, '-c', config, '-w', weights,
                '-o', '2', '-C', '0.001', '-b', '500', '-k', '500', '-u', '-t']
    logging.info('Executing: {}'.format(' '.join(mira_cmd)))
    self.decoder = util.popen_io(mira_cmd)
def __init__(self, fwd_params, fwd_err, rev_params, rev_err, heuristic='grow-diag-final-and'):
    """Spawn forward/reverse fast_align processes and an atools process
    that symmetrizes their alignments with the given heuristic."""
    # Walk three levels up from this file to reach the cdec root
    cdec_root = os.path.abspath(__file__)
    for _ in range(3):
        cdec_root = os.path.dirname(cdec_root)
    fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
    atools = os.path.join(cdec_root, 'utils', 'atools')
    # Diagonal tension (T) and mean (m) recovered from training stderr
    (fwd_T, fwd_m) = self.read_err(fwd_err)
    (rev_T, rev_m) = self.read_err(rev_err)
    fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
    rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
    tools_cmd = [atools, '-i', '-', '-j', '-', '-c', heuristic]
    # Launch the three subprocesses in order: forward, reverse, symmetrizer
    for attr, cmd in (('fwd_align', fwd_cmd), ('rev_align', rev_cmd), ('tools', tools_cmd)):
        logger.info('Executing: {}'.format(' '.join(cmd)))
        setattr(self, attr, util.popen_io(cmd))
    # Used to guarantee thread safety
    self.lock = util.FIFOLock()
def __init__(self, fwd_params, fwd_err, rev_params, rev_err, heuristic='grow-diag-final-and'):
    """Spawn forward/reverse fast_align processes and an atools process
    that symmetrizes their alignments.

    fwd_params/fwd_err: parameter and stderr files of the forward model
    rev_params/rev_err: parameter and stderr files of the reverse model
    heuristic: atools symmetrization heuristic (previously hard-coded;
        the default preserves the original behavior)
    """
    cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    fast_align = os.path.join(cdec_root, 'word-aligner', 'fast_align')
    atools = os.path.join(cdec_root, 'utils', 'atools')
    # Diagonal tension (T) and mean (m) recovered from training stderr
    (fwd_T, fwd_m) = self.read_err(fwd_err)
    (rev_T, rev_m) = self.read_err(rev_err)
    fwd_cmd = [fast_align, '-i', '-', '-d', '-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
    rev_cmd = [fast_align, '-i', '-', '-d', '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
    tools_cmd = [atools, '-i', '-', '-j', '-', '-c', heuristic]
    logger.info('Executing: {}'.format(' '.join(fwd_cmd)))
    self.fwd_align = util.popen_io(fwd_cmd)
    logger.info('Executing: {}'.format(' '.join(rev_cmd)))
    self.rev_align = util.popen_io(rev_cmd)
    logger.info('Executing: {}'.format(' '.join(tools_cmd)))
    self.tools = util.popen_io(tools_cmd)
    # Used to guarantee thread safety
    self.lock = util.FIFOLock()
def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
    """Build the shared pipeline (tokenizers, aligner, grammar extractor)
    and the per-context state tables for the realtime translator.

    configdir: directory with rt.ini, a.* aligner files, and sa.ini
    tmpdir: parent directory for the scratch directory
    cache_size: number of sentence grammar files to keep per context
    norm: if True, start tokenizer/detokenizer subprocesses
    """
    # name -> (method, set of possible nargs)
    self.COMMANDS = {
        'TR': (self.translate, set((1,))),
        'LEARN': (self.learn, set((2,))),
        'SAVE': (self.save_state, set((0, 1))),
        'LOAD': (self.load_state, set((0, 1))),
        'DROP': (self.drop_ctx, set((0,))),
        'LIST': (self.list_ctx, set((0,))),
    }
    cdec_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    # rt.ini options: close the file promptly and split each key=value line
    # on the first '=' only, so values containing '=' no longer break dict()
    with open(os.path.join(configdir, 'rt.ini')) as ini_in:
        ini = dict(line.strip().split('=', 1) for line in ini_in)
    self.hpyplm = (ini.get('hpyplm', 'false') in TRUE)
    self.metric = ini.get('metric', 'ibm_bleu')
    ### Single instance for all contexts
    self.config = configdir
    # Temporary work dir
    self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
    logger.info('Using temp dir {}'.format(self.tmp))
    # Normalization
    self.norm = norm
    if self.norm:
        self.tokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'])
        self.tokenizer_lock = util.FIFOLock()
        self.detokenizer = util.popen_io([os.path.join(cdec_root, 'corpus', 'untok.pl')])
        self.detokenizer_lock = util.FIFOLock()
    # Word aligner
    fwd_params = os.path.join(configdir, 'a.fwd_params')
    fwd_err = os.path.join(configdir, 'a.fwd_err')
    rev_params = os.path.join(configdir, 'a.rev_params')
    rev_err = os.path.join(configdir, 'a.rev_err')
    self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params, rev_err)
    # Grammar extractor
    sa_config = cdec.configobj.ConfigObj(os.path.join(configdir, 'sa.ini'), unrepr=True)
    sa_config.filename = os.path.join(self.tmp, 'sa.ini')
    util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
    sa_config.write()
    self.extractor = ExtractorWrapper(sa_config.filename)
    self.cache_size = cache_size
    ### One instance per context
    self.ctx_names = set()
    # All context-dependent operations are atomic
    self.ctx_locks = collections.defaultdict(util.FIFOLock)
    # ctx -> list of (source, target, alignment)
    self.ctx_data = {}
    # Grammar extractor is not threadsafe
    self.extractor_lock = util.FIFOLock()
    # ctx -> deque of file
    self.grammar_files = {}
    # ctx -> dict of {sentence: file}
    self.grammar_dict = {}
    self.decoders = {}
def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
    """Build the shared pipeline (tokenizers, aligner, grammar extractor)
    and the per-context state tables for the realtime translator.

    configdir: directory with rt.ini, a.* aligner files, and sa.ini
    tmpdir: parent directory for the scratch directory
    cache_size: number of sentence grammar files to keep per context
    norm: if True, start tokenizer/detokenizer subprocesses
    """
    # name -> (method, set of possible nargs)
    self.COMMANDS = {
        'TR': (self.translate, set((1, ))),
        'LEARN': (self.learn, set((2, ))),
        'SAVE': (self.save_state, set((0, 1))),
        'LOAD': (self.load_state, set((0, 1))),
        'DROP': (self.drop_ctx, set((0, ))),
        'LIST': (self.list_ctx, set((0, ))),
    }
    cdec_root = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    # rt.ini options: close the file promptly and split each key=value line
    # on the first '=' only, so values containing '=' no longer break dict()
    with open(os.path.join(configdir, 'rt.ini')) as ini_in:
        ini = dict(line.strip().split('=', 1) for line in ini_in)
    self.hpyplm = (ini.get('hpyplm', 'false') in TRUE)
    self.metric = ini.get('metric', 'ibm_bleu')
    ### Single instance for all contexts
    self.config = configdir
    # Temporary work dir
    self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
    logger.info('Using temp dir {}'.format(self.tmp))
    # Normalization
    self.norm = norm
    if self.norm:
        self.tokenizer = util.popen_io([
            os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'), '-u'
        ])
        self.tokenizer_lock = util.FIFOLock()
        self.detokenizer = util.popen_io(
            [os.path.join(cdec_root, 'corpus', 'untok.pl')])
        self.detokenizer_lock = util.FIFOLock()
    # Word aligner
    fwd_params = os.path.join(configdir, 'a.fwd_params')
    fwd_err = os.path.join(configdir, 'a.fwd_err')
    rev_params = os.path.join(configdir, 'a.rev_params')
    rev_err = os.path.join(configdir, 'a.rev_err')
    self.aligner = aligner.ForceAligner(fwd_params, fwd_err, rev_params,
                                        rev_err)
    # Grammar extractor
    sa_config = cdec.configobj.ConfigObj(os.path.join(configdir, 'sa.ini'),
                                         unrepr=True)
    sa_config.filename = os.path.join(self.tmp, 'sa.ini')
    util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
    sa_config.write()
    self.extractor = ExtractorWrapper(sa_config.filename)
    self.cache_size = cache_size
    ### One instance per context
    self.ctx_names = set()
    # All context-dependent operations are atomic
    self.ctx_locks = collections.defaultdict(util.FIFOLock)
    # ctx -> list of (source, target, alignment)
    self.ctx_data = {}
    # Grammar extractor is not threadsafe
    self.extractor_lock = util.FIFOLock()
    # ctx -> deque of file
    self.grammar_files = {}
    # ctx -> dict of {sentence: file}
    self.grammar_dict = {}
    self.decoders = {}