def _PREPARE_DEVSET_(workspace, path, config, alias='dev', input_format='cdec', grammar_dir=None):
    """Load a dev(test) set and dump its inputs and references to the workspace.

    :param workspace: directory where '<alias>.input' and '<alias>.refs' are written
    :param path: path to the set (read with smart_ropen, so it may be gzipped)
    :param config: configuration object (currently unused here)
    :param alias: name of the set, e.g. 'dev' or 'devtest' (used in log messages and file names)
    :param input_format: segment format understood by SegmentMetaData.parse
    :param grammar_dir: optional directory containing per-segment grammars
    :returns: list of SegmentMetaData instances (one per input line)
    """
    logging.info('Reading %s set: %s', alias, path)
    with smart_ropen(path) as f:
        # one segment per line (the original enumerated lines but never used the index,
        # and carried a block of commented-out grammar_dir fallback logic — both removed)
        devset = [SegmentMetaData.parse(line.strip(), input_format, grammar_dir=grammar_dir)
                  for line in f]
    logging.info('%d %s instances', len(devset), alias)
    # dump source and references side by side
    with smart_wopen('{0}/{1}.input'.format(workspace, alias)) as fi:
        with smart_wopen('{0}/{1}.refs'.format(workspace, alias)) as fr:
            for seg in devset:
                print >> fi, seg.to_sgm(dump_refs=False)
                print >> fr, ' ||| '.join(str(ref) for ref in seg.refs)
    return devset
def sample(self, run, alias, config, samples=1000, grammar=None, extra_parameters=''):
    """
    Sample derivations for a certain set of segments.

    :param run: current run/iteration (determines the run folder)
        (the original docstring documented this as 'workspace', which is not a parameter)
    :param alias: alias of the set (determines the workspace)
    :param config: the number of the configuration file to be used (if not given, we assume the same as iteration)
        For example, sample(1, 'dev', 0) will sample at the beginning of iteration 1 using config0.ini.
        Alternatively, sample(1, 'devtest', 1) will sample at the end of iteration 1 using config1.ini.
    :param samples: how many samples to draw
    :param grammar: path to a grammar (typically necessary when sampling for a devtest set)
    :param extra_parameters: additional parameters to chisel.sampler
    :returns: path to samples
    """
    options = {'config': '{0}/{1}'.format(self.workspace, config),
               'workspace': '{0}/{1}/{2}'.format(self.workspace, run, alias),
               'samples': samples}
    mkdir(options['workspace'])
    # command line
    cmd_str = 'python -m chisel.sampler %(config)s %(workspace)s --samples %(samples)d' % options
    # additional parameters including --grammar
    if grammar is not None:
        cmd_str = '{0} --grammar {1}'.format(cmd_str, grammar)
    if extra_parameters:
        cmd_str = '{0} {1}'.format(cmd_str, extra_parameters)
    logging.debug('[%s] Run: %s', run, cmd_str)
    # prepare args
    cmd_args = shlex.split(cmd_str)
    # sample
    t0 = time()
    logging.info('[%s] Sampling %d solutions (%s)...', run, samples, alias)
    with smart_ropen('{0}/{1}.input'.format(self.workspace, alias)) as fi:
        with smart_wopen(self.path_to_log('sampling-{0}'.format(alias), run)) as fo:
            with smart_wopen(self.path_to_log('sampling-{0}'.format(alias), run, err=True)) as fe:
                # record the exact command at the top of the stderr log for reproducibility
                fe.write('{0}\n'.format(cmd_str))
                proc = sp.Popen(cmd_args, stdin=fi, stdout=fo, stderr=fe)
                proc.wait()
    dt = time() - t0
    logging.info('[%s] sampling took %f seconds', run, dt)
    return '{0}/samples'.format(options['workspace'])
def decide(self, run, alias, config, extra_parameters=''):
    """
    Apply a decision rule.

    :param run: current iteration (determines the run folder)
        (the original docstring documented this as 'iteration', which is not a parameter)
    :param alias: alias of the set (determines the workspace)
    :param config: the number of the configuration file to be used (if not given, we assume the same as iteration)
        For example, decide(1, 'dev', 0) will decide from samples drawn at the beginning of iteration 1 using config0.ini.
        Alternatively, decide(1, 'devtest', 1) will decide from samples drawn at the end of iteration 1 using config1.ini.
    :param extra_parameters: additional parameters to chisel.fast_consensus
    :returns: (path to ranked decisions, path to 1-best outputs)
    """
    # required options
    options = {'config': '{0}/{1}'.format(self.workspace, config),
               'workspace': '{0}/{1}/{2}'.format(self.workspace, run, alias)}
    # command line
    cmd_str = 'python -m chisel.fast_consensus %(config)s %(workspace)s ' % options
    # additional parameters
    if extra_parameters:
        cmd_str = '{0} {1}'.format(cmd_str, extra_parameters)
    logging.debug('[%s] Run: %s', run, cmd_str)
    # prepare args (typo fix: comment previously read 'perpare')
    cmd_args = shlex.split(cmd_str)
    # decide
    t0 = time()
    logging.info('[%s] Deciding (%s)...', run, alias)
    with smart_wopen(self.path_to_log('decision-{0}'.format(alias), run)) as fo:
        with smart_wopen(self.path_to_log('decision-{0}'.format(alias), run, err=True)) as fe:
            proc = sp.Popen(cmd_args, stdin=None, stdout=fo, stderr=fe)
            proc.wait()
    dt = time() - t0
    logging.info('[%s] deciding took %f seconds', run, dt)
    return '{0}/decisions'.format(options['workspace']), '{0}/output'.format(options['workspace'])
def _BASE_CONFIG_(config, workspace, proxy_wmap, target_wmap):
    """Reset the [proxy] and [target] sections of `config` from the given weight maps
    and write the result to '<workspace>/config0.ini'.

    :param config: ConfigParser-like object (must support remove_section/add_section/set/write)
    :param workspace: directory where config0.ini is written
    :param proxy_wmap: feature -> weight map for the proxy model
    :param target_wmap: feature -> weight map for the target model
    :returns: path to the written config file
    """
    # plain for-loops instead of the original side-effect-only list comprehensions
    config.remove_section('proxy')
    config.add_section('proxy')
    for f, v in proxy_wmap.iteritems():
        config.set('proxy', f, v)
    config.remove_section('target')
    config.add_section('target')
    for f, v in target_wmap.iteritems():
        config.set('target', f, v)
    # build the path once instead of formatting it twice
    path = '{0}/config0.ini'.format(workspace)
    with smart_wopen(path) as fo:
        config.write(fo)
    return path
def save(self, raw_samples, odir, suffix=''):
    """Dump this segment's proxy/target weights and its raw samples to a gzipped file.

    The file is INI-like: [proxy] and [target] sections with one 'feature=value'
    line per feature (sorted by feature name), then a [samples] section with
    one tab-separated 'count projection vector' row per sample, most frequent first.
    """
    def section_body(weights):
        # dict keys are unique, so sorting the items sorts by feature name
        return '\n'.join('{0}={1}'.format(feat, weight)
                         for feat, weight in sorted(weights.iteritems()))
    with smart_wopen('{0}/{1}{2}.gz'.format(odir, self.segment_.id, suffix)) as fo:
        print >> fo, '[proxy]'
        print >> fo, section_body(self.proxy_weights_)
        print >> fo
        print >> fo, '[target]'
        print >> fo, section_body(self.target_weights_)
        print >> fo
        print >> fo, '[samples]'
        print >> fo, '# count projection vector'
        by_count = sorted(raw_samples, key=lambda r: r.count, reverse=True)
        for smp in by_count:
            print >> fo, '{0}\t{1}\t{2}'.format(smp.count, smp.projection, smp.vector)
def assess(self, run, alias):
    """Score the consensus-bleu output of a run by piping it into the external scoring tool.

    :param run: current run/iteration (determines the run folder)
    :param alias: alias of the set being assessed
    :returns: the score as a float, or None if the scorer's stdout could not be parsed
    """
    # where samples, decisions and outputs can be found
    workspace = '{0}/{1}/{2}'.format(self.workspace, run, alias)
    # command line: scorer reads hypotheses on stdin, references via -r
    cmd_str = '{0} -r {1}'.format(self.args.scoring_tool,
                                  '{0}/{1}.refs'.format(self.workspace, alias))
    logging.debug('[%s] Run: %s', run, cmd_str)
    # prepare args
    cmd_args = shlex.split(cmd_str)
    # assess
    t0 = time()
    trans_path = '{0}/output/consensus-bleu'.format(workspace)
    logging.info('[%s] Assessing (%s)...', run, alias)
    score = None
    with smart_ropen(trans_path) as fin:
        bleu_out = '{0}.bleu.stdout'.format(splitext(trans_path)[0])
        bleu_err = '{0}.bleu.stderr'.format(splitext(trans_path)[0])
        with smart_wopen(bleu_out) as fout:
            with smart_wopen(bleu_err) as ferr:
                proc = sp.Popen(cmd_args, stdin=fin, stdout=fout, stderr=ferr)
                proc.wait()
        try:
            # the scorer is expected to print a single float on its first stdout line
            with smart_ropen(bleu_out) as fi:
                line = next(fi)
                score = float(line.strip())
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt/
            # SystemExit; keep the best-effort semantics (score stays None) but narrow it
            logging.error('[%s] Problem reading %s for %s', run, bleu_out, alias)
    dt = time() - t0
    logging.info('[%s] assessing took %f seconds', run, dt)
    return score
def main():
    """Entry point: apply the consensus-bleu decision rule to all sampled jobs in parallel."""
    options, config = argparse_and_config()
    # samples must already exist under the workspace
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception('If a workspace is set, samples are expected to be found under $workspace/samples')
    logging.info('Reading samples from %s', samples_dir)
    # create output folders
    output_root = '{0}/output'.format(options.workspace)
    if not os.path.isdir(output_root):
        os.makedirs(output_root)
    output_dir = create_decision_rule_dir(options.workspace, 'consensus', 'bleu')
    one_best_file = '{0}/output/{1}-{2}'.format(options.workspace, 'consensus', 'bleu')
    logging.info("Writing '%s' solutions to %s", 'consensus', output_dir)
    logging.info("Writing 1-best '%s' yields to %s", 'consensus', one_best_file)
    # TODO: generalise this
    headers = {'derivation': 'd',
               'vector': 'v',
               'count': 'n',
               'log_ur': 'log_ur',
               'importance': 'importance'}
    # one job per numbered sample file: (fid, input_file) pairs
    jobs = list(list_numbered_files(samples_dir))
    logging.info('%d jobs', len(jobs))
    # run the decision rule over all jobs in parallel, saving each ranking to disk
    pool = Pool(options.jobs)
    results = pool.map(partial(decide_and_save,
                               headers=headers,
                               options=options,
                               output_dir=output_dir),
                       jobs)
    # write the 1-best yield of each job to a single file
    with smart_wopen(one_best_file) as fout:
        for y, l, p, q in results:
            fout.write('{0}\n'.format(y))
def decide_and_save(job_desc, headers, options, output_dirs):
    """Run all decision rules for one job, save each ranking, and return the 1-best per rule.

    :param job_desc: (job id, path to the job's sample file)
    :param headers: column-name mapping passed through to make_decisions
    :param options: parsed command-line options
    :param output_dirs: rule name -> output directory for the gzipped rankings
    :returns: dict mapping each rule name to the top-ranked solution of its ranking
    """
    # this code runs in a Pool, thus we wrap in try/except in order to have more informative exceptions
    jid, path = job_desc
    try:
        # make decisions
        decisions = make_decisions(job_desc, headers, options)  #, q_wmap, p_wmap)
        # write to file if necessary: one gzipped ranking file per rule, named after the job id
        for rule, ranking in decisions.iteritems():
            with smart_wopen('{0}/{1}.gz'.format(output_dirs[rule], jid)) as out:
                print >> out, '\t'.join(['#target', '#p', '#q', '#yield'])
                for solution in ranking:
                    print >> out, solution.format_str(keys=['p', 'q', 'yield'])
                print >> out
        # head of each ranking is the 1-best solution
        return {rule: solutions[0] for rule, solutions in decisions.iteritems()}
    except:
        # deliberately catches everything: re-raised with job id and full traceback attached
        raise Exception('job={0} exception={1}'.format(jid, ''.join(traceback.format_exception(*sys.exc_info()))))
def decide_and_save(job_desc, headers, options, output_dirs):
    """Run the decision rules for one job and persist each ranking; return the 1-best per rule.

    Runs inside a multiprocessing Pool, so any failure is re-raised as a new
    exception carrying the job id and the full formatted traceback.
    """
    jid, path = job_desc
    try:
        decisions = make_decisions(job_desc, headers, options)
        # one gzipped ranking file per decision rule, named after the job id
        for rule, ranking in decisions.iteritems():
            fname = '{0}/{1}.gz'.format(output_dirs[rule], jid)
            with smart_wopen(fname) as out:
                print >> out, '\t'.join(['#target', '#p', '#q', '#yield'])
                for solution in ranking:
                    print >> out, solution.format_str(keys=['p', 'q', 'yield'])
                print >> out
        # the head of each ranking is its 1-best solution
        return dict((rule, solutions[0]) for rule, solutions in decisions.iteritems())
    except:
        details = ''.join(traceback.format_exception(*sys.exc_info()))
        raise Exception('job={0} exception={1}'.format(jid, details))
def update_config_file(self, before, after, proxy_scaling=None, target_scaling=None, proxy=None, target=None):
    """Load config `before`, update scalings and/or weight sections, and write config `after`.

    :param before: file name (relative to the workspace) of the config to read
    :param after: file name (relative to the workspace) of the config to write
    :param proxy_scaling: if given, sets [chisel:model] proxy_scaling
    :param target_scaling: if given, sets [chisel:model] target_scaling
    :param proxy: feature -> weight map for [proxy]; defaults to self.wmap.proxy
    :param target: feature -> weight map for [target]; defaults to self.wmap.target
    :returns: path of the written config file
    :raises IOError: if the `before` config does not exist
    """
    config_path = '{0}/{1}'.format(self.workspace, before)
    if not os.path.exists(config_path):
        # BUG FIX: the original formatted the message with the undefined name `path`,
        # which raised NameError instead of the intended IOError
        raise IOError('Perhaps iteration %s did not complete successfully?' % config_path)
    config = Config(config_path)
    config.add_section('chisel:model')
    if proxy_scaling is not None:
        config.set('chisel:model', 'proxy_scaling', proxy_scaling)
    if target_scaling is not None:
        config.set('chisel:model', 'target_scaling', target_scaling)
    # plain for-loops instead of the original side-effect-only list comprehensions
    config.add_section('proxy')
    if proxy is None:
        proxy = self.wmap.proxy
    for f, v in proxy.iteritems():
        config.set('proxy', f, v)
    config.add_section('target')
    if target is None:
        target = self.wmap.target
    for f, v in target.iteritems():
        config.set('target', f, v)
    config_path = '{0}/{1}'.format(self.workspace, after)
    with smart_wopen(config_path) as fo:
        config.write(fo)
    return config_path
def decide_and_save(job_desc, headers, options, output_dir):
    """Apply the decision rule to one job, dump its (possibly truncated) ranking, return the 1-best.

    Runs inside a multiprocessing Pool, so any failure is re-raised with the job id
    and the full traceback attached for easier debugging.

    :param job_desc: (job id, path to the job's sample file)
    :param headers: column-name mapping passed through to make_decisions
    :param options: parsed command-line options (options.nbest limits how much is saved)
    :param output_dir: directory for the gzipped ranking file
    :returns: the top-ranked (yield, loss, p, q) tuple
    """
    jid, path = job_desc
    try:
        # make decisions
        ranking = make_decisions(job_desc, headers, options)
        # write ranking to file
        with smart_wopen('{0}/{1}.gz'.format(output_dir, jid)) as out:
            # TODO: save nbest
            out.write('{0}\n'.format('\t'.join(['#target', '#p', '#q', '#yield'])))
            # FIX: the original duplicated an identical loop in both branches of
            # `if options.nbest > 0`; truncating first removes the duplication
            selected = ranking[0:options.nbest] if options.nbest > 0 else ranking
            for y, l, p, q in selected:
                out.write('{0}\n'.format('\t'.join(str(x) for x in [l, p, q, y])))
        return ranking[0]
    except:
        raise Exception('job={0} exception={1}'.format(
            jid, ''.join(traceback.format_exception(*sys.exc_info()))))
def training_loss(self, run, alias, segments, samples):
    """Compute a per-segment training loss (fast BLEU) over sampled derivations.

    :param run: current run/iteration (determines the run folder)
    :param alias: alias of the set
    :param segments: segments (each providing .refs and .id), aligned with `samples`
    :param samples: one list of derivations per segment
    :returns: list of {projection string: loss} dicts, one per segment
    """
    L = []
    if self.args.save_loss:
        # NOTE(review): `alias` is passed but the format string only uses {0}/{1},
        # so all aliases share '<workspace>/<run>/loss' — possibly '{0}/{1}/{2}/loss'
        # was intended; confirm before changing (it would move existing output)
        loss_dir = '{0}/{1}/loss'.format(self.workspace, run, alias)
        mkdir(loss_dir)
    logging.info('[%s] Computing loss (%s)...', run, alias)
    t0 = time()
    # run fast bleu implementation
    # TODO: generalise to other metrics
    for seg, derivations in zip(segments, samples):
        # score each distinct yield only once
        projections = frozenset(d.tree.projection for d in derivations)
        scorer = TrainingBLEU(seg.refs)
        lmap = {y: scorer.loss(y.split()) for y in projections}
        L.append(lmap)
        if self.args.save_loss:
            with smart_wopen('{0}/{1}.gz'.format(loss_dir, seg.id)) as fo:
                for d in derivations:
                    fo.write('{0}\n'.format(lmap[d.tree.projection]))
    dt = time() - t0
    # typo fix: message previously read 'computing loos took'
    logging.info('[%s] computing loss took %s seconds', run, dt)
    return L
def main():
    """Entry point: load/configure mteval metrics, then run the selected decision rules
    (MAP / MBR / consensus) over all sampled jobs in parallel and save 1-best outputs."""
    options, config = argparse_and_config()
    # loads mteval modules (default to BLEU when no chisel:metrics section is given)
    if config.has_section('chisel:metrics'):
        metrics_map = dict(config.items('chisel:metrics'))
    else:
        metrics_map = {'bleu': 'chisel.mteval.bleu'}
    mteval.load(metrics_map, frozenset([options.metric]))
    if not mteval.sanity_check(options.metric):
        raise Exception("Perhaps you forgot to include the metric '%s' in the configuration file?" % options.metric)
    # configure mteval metrics
    if config.has_section('chisel:metrics:config'):
        metrics_config = dict(config.items('chisel:metrics:config'))
    else:
        metrics_config = {}
    logging.debug('chisel:metrics:config: %s', metrics_config)
    # configure metrics
    mteval.configure(metrics_config)
    # gather decision rules to be run
    decision_rules = []
    if options.map:
        decision_rules.append('MAP')
    if options.mbr:
        decision_rules.append('MBR')
    if options.consensus:
        decision_rules.append('consensus')
    # check for input folder
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception('If a workspace is set, samples are expected to be found under $workspace/samples')
    logging.info('Reading samples from %s', samples_dir)
    # create output folders
    if not os.path.isdir('{0}/output'.format(options.workspace)):
        os.makedirs('{0}/output'.format(options.workspace))
    output_dirs = {}
    one_best_files = {}
    # TODO: check whether decisions already exist (and warn the user)
    for rule in decision_rules:
        # MAP output is metric-independent; MBR/consensus output is per metric
        if rule == 'MAP':
            output_dirs[rule] = create_decision_rule_dir(options.workspace, rule)
            one_best_files[rule] = '{0}/output/{1}'.format(options.workspace, rule)
        else:
            output_dirs[rule] = create_decision_rule_dir(options.workspace, rule, options.metric)
            one_best_files[rule] = '{0}/output/{1}-{2}'.format(options.workspace, rule, options.metric)
        logging.info("Writing '%s' solutions to %s", rule, output_dirs[rule])
        logging.info("Writing 1-best '%s' yields to %s", rule, one_best_files[rule])
    # TODO: generalise this
    headers = {'derivation': 'd', 'vector': 'v', 'count': 'n', 'log_ur': 'log_ur', 'importance': 'importance'}
    # read jobs from workspace: (fid, input_file) pairs
    input_files = list_numbered_files(samples_dir)
    jobs = [(fid, input_file) for fid, input_file in input_files]
    logging.info('%d jobs', len(jobs))
    """
    # sometimes I use this for profiling (gotta write a better switch)
    for job in jobs:
        decide_and_save(job, headers=headers, options=options, fnames=target_features, gnames=proxy_features, output_dirs=output_dirs)
    sys.exit(0)
    """
    # run jobs in parallel
    pool = Pool(options.jobs)
    # run decision rules and save them to files
    results = pool.map(partial(decide_and_save,
                               headers=headers,
                               options=options,
                               #q_wmap=proxy_wmap,
                               #p_wmap=target_wmap,
                               output_dirs=output_dirs),
                       jobs)
    # save the 1-best solution for each decision rule in a separate file
    for rule in decision_rules:
        with smart_wopen(one_best_files[rule]) as fout:
            for decisions in results:
                best = decisions[rule]  # instance of KBestSolution
                print >> fout, best.solution.Dy.projection
def main():
    """Entry point: configure evaluation metrics, then run the selected decision rules
    (MAP / MBR / consensus) over all sampled jobs in parallel."""
    options, config = argparse_and_config()
    # load mteval modules, defaulting to BLEU when no section is configured
    metrics_map = (dict(config.items('chisel:metrics'))
                   if config.has_section('chisel:metrics')
                   else {'bleu': 'chisel.mteval.bleu'})
    mteval.load(metrics_map, frozenset([options.metric]))
    if not mteval.sanity_check(options.metric):
        raise Exception("Perhaps you forgot to include the metric '%s' in the configuration file?" % options.metric)
    # configure mteval metrics
    metrics_config = (dict(config.items('chisel:metrics:config'))
                      if config.has_section('chisel:metrics:config')
                      else {})
    logging.debug('chisel:metrics:config: %s', metrics_config)
    mteval.configure(metrics_config)
    # gather decision rules to be run
    decision_rules = []
    for enabled, rule in ((options.map, 'MAP'), (options.mbr, 'MBR'), (options.consensus, 'consensus')):
        if enabled:
            decision_rules.append(rule)
    # samples must already exist under the workspace
    samples_dir = '{0}/samples'.format(options.workspace)
    if not os.path.isdir(samples_dir):
        raise Exception('If a workspace is set, samples are expected to be found under $workspace/samples')
    logging.info('Reading samples from %s', samples_dir)
    # create output folders
    output_root = '{0}/output'.format(options.workspace)
    if not os.path.isdir(output_root):
        os.makedirs(output_root)
    output_dirs = {}
    one_best_files = {}
    # TODO: check whether decisions already exist (and warn the user)
    for rule in decision_rules:
        # MAP output is metric-independent; MBR/consensus output is per metric
        if rule == 'MAP':
            output_dirs[rule] = create_decision_rule_dir(options.workspace, rule)
            one_best_files[rule] = '{0}/output/{1}'.format(options.workspace, rule)
        else:
            output_dirs[rule] = create_decision_rule_dir(options.workspace, rule, options.metric)
            one_best_files[rule] = '{0}/output/{1}-{2}'.format(options.workspace, rule, options.metric)
        logging.info("Writing '%s' solutions to %s", rule, output_dirs[rule])
        logging.info("Writing 1-best '%s' yields to %s", rule, one_best_files[rule])
    # TODO: generalise this
    headers = {'derivation': 'd', 'vector': 'v', 'count': 'n', 'log_ur': 'log_ur', 'importance': 'importance'}
    # one job per numbered sample file: (fid, input_file) pairs
    jobs = [(fid, input_file) for fid, input_file in list_numbered_files(samples_dir)]
    logging.info('%d jobs', len(jobs))
    # run decision rules over all jobs in parallel and save the rankings to files
    pool = Pool(options.jobs)
    results = pool.map(partial(decide_and_save,
                               headers=headers,
                               options=options,
                               output_dirs=output_dirs),
                       jobs)
    # save the 1-best solution for each decision rule in a separate file
    for rule in decision_rules:
        with smart_wopen(one_best_files[rule]) as fout:
            for decisions in results:
                best = decisions[rule]  # instance of KBestSolution
                print >> fout, best.solution.Dy.projection