def __init__(self, config, weights):
    """Launch the cdec decoder binary and keep a handle to it.

    The executable is looked up at ``<cdec_root>/decoder/cdec``, where
    ``<cdec_root>`` is the directory two levels above the directory
    that contains this file.

    config -- value passed to the decoder's ``-c`` option
    weights -- value passed to the decoder's ``-w`` option
    """
    # Walk up three path components: file -> its dir -> parent -> cdec root
    root = os.path.abspath(__file__)
    for _ in range(3):
        root = os.path.dirname(root)

    argv = [os.path.join(root, 'decoder', 'cdec')]
    argv.extend(['-c', config])
    argv.extend(['-w', weights])

    logger.info('Executing: {}'.format(' '.join(argv)))
    self.decoder = util.popen_io(argv)
    self.lock = util.FIFOLock()
def __init__(self, config, weights, metric='ibm_bleu'):
    """Launch the ``kbest_cut_mira`` binary and keep a handle to it.

    The executable is looked up at
    ``<cdec_root>/training/mira/kbest_cut_mira``, where ``<cdec_root>``
    is the directory two levels above the directory that contains this
    file.

    config -- value passed to the ``-c`` option
    weights -- value passed to the ``-w`` option
    metric -- value passed to the ``-m`` option (default 'ibm_bleu')
    """
    # Walk up three path components: file -> its dir -> parent -> cdec root
    root = os.path.abspath(__file__)
    for _ in range(3):
        root = os.path.dirname(root)
    binary = os.path.join(root, 'training', 'mira', 'kbest_cut_mira')

    # Fixed settings, in command-line order:
    # optimizer=2 step=0.001 best=500, k=500, uniq, stream, metric
    fixed_opts = ['-o', '2', '-C', '0.001', '-b', '500', '-k', '500',
                  '-u', '-t']
    argv = [binary, '-c', config, '-w', weights]
    argv += fixed_opts
    argv += ['-m', metric]

    logger.info('Executing: {}'.format(' '.join(argv)))
    self.decoder = util.popen_io(argv)
    self.lock = util.FIFOLock()
def __init__(self, config):
    """Start ``python -m cdec.sa.extract`` as a piped child process.

    The child gets a copy of the current environment. Unless
    PYTHONPATH already contains the substring 'cdec/python', the
    ``<cdec_root>/python`` directory is appended to it (or becomes its
    whole value when PYTHONPATH is empty or unset).

    config -- value passed to the extractor's ``-c`` option
    """
    # Walk up three path components: file -> its dir -> parent -> cdec root
    root = os.path.abspath(__file__)
    for _ in range(3):
        root = os.path.dirname(root)
    pycdec_dir = os.path.join(root, 'python')

    # Make sure pycdec is importable in the child process
    child_env = os.environ.copy()
    search_path = child_env.get('PYTHONPATH', '')
    if 'cdec/python' not in search_path:
        if len(search_path) > 0:
            search_path = '{}:{}'.format(search_path, pycdec_dir)
        else:
            search_path = pycdec_dir
        child_env['PYTHONPATH'] = search_path

    # The grammar extractor runs as a separate process driven over stdio
    argv = ['python', '-m', 'cdec.sa.extract']
    argv += ['-o', '-z', '-c', config, '-t']
    logger.info('Executing: {}'.format(' '.join(argv)))
    self.p = subprocess.Popen(
        argv,
        env=child_env,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Hand the child's stderr pipe to util.consume_stream
    util.consume_stream(self.p.stderr)
    self.lock = util.FIFOLock()
def __init__(self, fwd_params, fwd_err, rev_params, rev_err,
             heuristic='grow-diag-final-and'):
    """Start forward/reverse ``fast_align`` and ``atools`` subprocesses.

    Binaries are looked up under ``<cdec_root>`` (the directory two
    levels above the directory that contains this file) at
    ``word-aligner/fast_align`` and ``utils/atools``.

    fwd_params -- value passed to the forward aligner's ``-f`` option
    fwd_err -- given to ``self.read_err``; the returned pair supplies
        the forward aligner's ``-T`` and ``-m`` values
    rev_params -- value passed to the reverse aligner's ``-f`` option
    rev_err -- given to ``self.read_err``; the returned pair supplies
        the reverse aligner's ``-T`` and ``-m`` values
    heuristic -- value passed to the ``-c`` option of ``atools``
    """
    # Walk up three path components: file -> its dir -> parent -> cdec root
    root = os.path.abspath(__file__)
    for _ in range(3):
        root = os.path.dirname(root)
    fast_align_bin = os.path.join(root, 'word-aligner', 'fast_align')
    atools_bin = os.path.join(root, 'utils', 'atools')

    fwd_T, fwd_m = self.read_err(fwd_err)
    rev_T, rev_m = self.read_err(rev_err)

    # Both directions share the same leading options; the reverse
    # aligner additionally gets '-r'.
    stdin_opts = ['-i', '-', '-d']
    fwd_argv = [fast_align_bin] + stdin_opts + [
        '-T', fwd_T, '-m', fwd_m, '-f', fwd_params]
    rev_argv = [fast_align_bin] + stdin_opts + [
        '-T', rev_T, '-m', rev_m, '-f', rev_params, '-r']
    tools_argv = [atools_bin, '-i', '-', '-j', '-', '-c', heuristic]

    # Log and launch each process in turn: forward, reverse, then atools
    launches = (
        ('fwd_align', fwd_argv),
        ('rev_align', rev_argv),
        ('tools', tools_argv),
    )
    for attr_name, argv in launches:
        logger.info('Executing: {}'.format(' '.join(argv)))
        setattr(self, attr_name, util.popen_io(argv))

    # Used to guarantee thread safety
    self.lock = util.FIFOLock()
def __init__(self, configdir, tmpdir='/tmp', cache_size=5, norm=False):
    """Set up the shared resources and empty per-context bookkeeping.

    Reads ``rt.ini`` from *configdir*, creates a temporary work
    directory, optionally starts tokenizer/detokenizer subprocesses,
    builds the force aligner, and writes a realtime copy of ``sa.ini``
    into the work directory for the grammar extractor.

    configdir -- directory holding ``rt.ini``, ``sa.ini`` and the
        aligner files ``a.fwd_params``, ``a.fwd_err``,
        ``a.rev_params`` and ``a.rev_err``
    tmpdir -- parent directory in which the 'realtime.*' work directory
        is created
    cache_size -- stored as ``self.cache_size``
    norm -- when true, start the tokenizer and detokenizer subprocesses

    Raises ValueError if a non-blank line of ``rt.ini`` contains no
    '=' character.
    """
    # name -> (method, set of possible nargs)
    self.COMMANDS = {
        'TR': (self.translate, set((1, ))),
        'LEARN': (self.learn, set((2, ))),
        'SAVE': (self.save_state, set((0, 1))),
        'LOAD': (self.load_state, set((0, 1))),
        'DROP': (self.drop_ctx, set((0, ))),
        'LIST': (self.list_ctx, set((0, ))),
    }

    cdec_root = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    # rt.ini options: one key=value pair per line.  The file is closed
    # as soon as it has been read, blank lines are skipped, and only
    # the first '=' separates key from value so values may contain '='.
    ini = {}
    with open(os.path.join(configdir, 'rt.ini')) as ini_file:
        for line in ini_file:
            line = line.strip()
            if not line:
                continue
            key, value = line.split('=', 1)
            ini[key] = value
    self.hpyplm = (ini.get('hpyplm', 'false') in TRUE)
    self.metric = ini.get('metric', 'ibm_bleu')

    ### Single instance for all contexts

    self.config = configdir
    # Temporary work dir
    self.tmp = tempfile.mkdtemp(dir=tmpdir, prefix='realtime.')
    logger.info('Using temp dir {}'.format(self.tmp))

    # Normalization
    self.norm = norm
    if self.norm:
        self.tokenizer = util.popen_io([
            os.path.join(cdec_root, 'corpus', 'tokenize-anything.sh'),
            '-u'
        ])
        self.tokenizer_lock = util.FIFOLock()
        self.detokenizer = util.popen_io(
            [os.path.join(cdec_root, 'corpus', 'untok.pl')])
        self.detokenizer_lock = util.FIFOLock()

    # Word aligner
    fwd_params = os.path.join(configdir, 'a.fwd_params')
    fwd_err = os.path.join(configdir, 'a.fwd_err')
    rev_params = os.path.join(configdir, 'a.rev_params')
    rev_err = os.path.join(configdir, 'a.rev_err')
    self.aligner = aligner.ForceAligner(fwd_params, fwd_err,
                                        rev_params, rev_err)

    # Grammar extractor: load sa.ini from configdir, retarget it to a
    # copy inside the work dir, adjust it for realtime use, write it
    # out, and start the extractor on that copy.
    sa_config = cdec.configobj.ConfigObj(
        os.path.join(configdir, 'sa.ini'), unrepr=True)
    sa_config.filename = os.path.join(self.tmp, 'sa.ini')
    util.sa_ini_for_realtime(sa_config, os.path.abspath(configdir))
    sa_config.write()
    self.extractor = ExtractorWrapper(sa_config.filename)
    self.cache_size = cache_size

    ### One instance per context

    self.ctx_names = set()
    # All context-dependent operations are atomic
    self.ctx_locks = collections.defaultdict(util.FIFOLock)
    # ctx -> list of (source, target, alignment)
    self.ctx_data = {}
    # Grammar extractor is not threadsafe
    self.extractor_lock = util.FIFOLock()
    # ctx -> deque of file
    self.grammar_files = {}
    # ctx -> dict of {sentence: file}
    self.grammar_dict = {}
    self.decoders = {}