def main():
    """Command-line entry point: read hypotheses and references, print BLEU.

    Hypotheses come from ``args.hypotheses`` if given, otherwise from stdin;
    references always come from ``args.references``.  The document BLEU is
    printed to stdout; brevity penalty, n-gram precisions, and BLEU-1..order
    are logged to stderr.
    """
    args = argparser().parse_args()

    # hypothesis stream: file if provided, otherwise stdin
    if args.hypotheses:
        if not os.path.exists(args.hypotheses):
            raise FileNotFoundError('Hypotheses file not found: %s' % args.hypotheses)
        hstream = smart_ropen(args.hypotheses).readlines()
    else:
        hstream = sys.stdin.readlines()

    if not os.path.exists(args.references):
        raise FileNotFoundError('Reference file not found: %s' % args.references)
    rstream = smart_ropen(args.references).readlines()

    # compute bleu
    bleu, pn, bp = stream_doc_bleu(hstream, rstream, args.order, args.smoothing)
    print(bleu)

    # log brevity penalty, n-gram precisions, and BLEU-1 to BLEU-order
    print('grasp.mt.bleu loaded %d segments' % len(hstream), file=sys.stderr)
    bleus = [bp * np.exp(1.0 / k * np.sum(np.log(pn[0:k])))
             for k in range(1, args.order + 1)]
    print('bp=%.4f ||| %s ||| %s' % (bp,
                                     ' '.join('p%d=%.4f' % (i, x) for i, x in enumerate(pn, 1)),
                                     ' '.join('bleu-%d=%.4f' % (i, x) for i, x in enumerate(bleus, 1))),
          file=sys.stderr)
def mteval(args, staticdir, model, segments, hyp_path, ref_path, eval_path, ranking_path):
    """
    Decode and evaluate with an external tool.

    :param args: options (jobs, samples, redo, scoring_tool, ...)
    :param staticdir: directory with static decoding resources
    :param model: model used for decoding
    :param segments: input segments to decode
    :param hyp_path: where the best hypotheses are written (one per line)
    :param ref_path: reference file handed to the scoring tool via ``-r``
    :param eval_path: prefix for the scoring tool's stdout/stderr dumps
    :param ranking_path: (optional) directory where per-segment decisions go
    :return: BLEU score (float), or None if the tool's output could not be read.
    """
    if ranking_path:
        os.makedirs(ranking_path, exist_ok=True)
    # decode all segments in parallel
    with Pool(args.jobs) as workers:
        results = workers.map(
            partial(training_decode,
                    args=args,
                    n_samples=args.samples[1],
                    staticdir=staticdir,
                    decisiondir=ranking_path,
                    model=model,
                    redo=args.redo,
                    log=logging.info),
            segments)
    # write best decisions to file
    with smart_wopen(hyp_path) as fo:
        for y in results:
            print(y, file=fo)
    # call scoring tool
    cmd_str = '{0} -r {1}'.format(args.scoring_tool, ref_path)
    logging.info('Scoring: %s', cmd_str)
    # prepare args
    cmd_args = shlex.split(cmd_str)
    # assess
    score = None
    with smart_ropen(hyp_path) as fin:
        with smart_wopen('{0}.stdout'.format(eval_path)) as fout:
            with smart_wopen('{0}.stderr'.format(eval_path)) as ferr:
                with sp.Popen(cmd_args, stdin=fin, stdout=fout, stderr=ferr) as proc:
                    proc.wait()
    try:
        with smart_ropen('{0}.stdout'.format(eval_path)) as fi:
            line = next(fi)
        score = float(line.strip())
    # narrowed from a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit; these are the failures that can
    # actually occur when reading/parsing the tool's stdout
    except (StopIteration, ValueError, OSError):
        logging.error('Problem reading %s.stdout', eval_path)
    return score
def load_grammar(path):
    """
    Load a grammar from a (possibly compressed) text file.

    :param path: path to the grammar file
    :return: an SCFG built from the rules read at `path`
    """
    rule_stream = smart_ropen(path)
    return SCFG(iterrules(rule_stream))
def construct_extractors(path):
    """
    Read a configuration file and construct the extractors specified in each line.

    :param path: path to configuration file
    :return: list of extractors (in the order they were listed in the configuration file)
    :raises ValueError: on a malformed line or a duplicate extractor name
    """
    extractors = []
    names = set()
    with smart_ropen(path) as fi:
        for i, line in enumerate(fi, 1):
            if line.startswith('#'):  # skip comment lines
                continue
            line = line.strip()
            if not line:
                continue
            try:
                # first token is the extractor's class name; the rest is its config
                cfg, [cls] = re_sub('^([^ ]+)', '', line)
            except Exception as err:
                # narrowed from a bare `except:`; chain the original cause
                raise ValueError('In line %d, expected class name: %s' % (i, line)) from err
            cfg, name = re_key_value('name', cfg)
            if not name:
                name = cls  # default the instance name to the class name
            if name in names:
                raise ValueError(
                    'In line %d, duplicate name (%s), rename your extractor with name=<CustomName>' % (i, name))
            names.add(name)
            cfg, pkg = re_key_value('pkg', cfg)
            impl = get_extractor_implementation(cls, pkg)
            extractor = impl.construct(len(extractors), name, cfg)
            extractors.append(extractor)
    return extractors
def read_weights(path, default=None, random=False, temperature=1.0, u=0, std=0.01):
    """
    Read a sequence of key-value pairs.

    :param path: file where to read sequence from (two whitespace-separated
        columns per line; other lines are silently skipped)
    :param default: if set, overwrites the values read from file
    :param random: if set, sample values from N(u, std)
    :param temperature: scales the final weight: weight/T
    :param u: mean of normal
    :param std: standard deviation
    :return: dict mapping each key to its (possibly overridden, scaled) weight
    """
    wmap = {}
    with smart_ropen(path) as fi:
        # iterate the stream lazily instead of materializing readlines()
        for line in fi:
            fields = line.split()
            if len(fields) != 2:
                continue
            # always parse the file value (validates the format) before
            # possibly overriding it
            w = float(fields[1])
            if default is not None:
                w = default
            elif random:
                w = np.random.normal(u, std)
            w /= temperature
            wmap[fields[0]] = w
    return wmap
def read_factorisation(path):
    """
    Return a joint and a conditional factorisation of the model.

    :param path: path to a file with the complete factorisation of a model
        (if falsy, both factorisations come back empty)
    :return: (joint, conditional) defaultdicts mapping 'local'/'nonlocal'
        to sets of feature names
    :raises ValueError: if a directive appears before a [joint]/[conditional] header
    """
    joint_cfg = defaultdict(set)
    conditional_cfg = defaultdict(set)
    if not path:
        return joint_cfg, conditional_cfg
    with smart_ropen(path) as fi:
        section = None  # the factorisation currently being filled
        for raw in fi:
            entry = raw.strip()
            # ignore comments and empty lines
            if not entry or entry.startswith('#'):
                continue
            if entry == '[joint]':
                section = joint_cfg
            elif entry == '[conditional]':
                section = conditional_cfg
            elif section is None:
                raise ValueError('Syntax error in factorisation file')
            elif entry.startswith('local='):
                section['local'].update(entry.replace('local=', '', 1).split())
            elif entry.startswith('nonlocal='):
                section['nonlocal'].update(entry.replace('nonlocal=', '', 1).split())
    return joint_cfg, conditional_cfg
def read_empirical_distribution(path):
    """
    Return the empirical distribution (a numpy array) and the support (tuples).

    Each non-comment, non-empty line is tab-separated with the estimate in
    the first column and the projection (a whitespace-separated solution)
    in the last column.

    :param path: path of distribution over projections
    :return: (P, Y) where P is a numpy array of estimates and Y is a tuple
        of projections (each itself a tuple of tokens)
    :raises ValueError: if a line has fewer than two tab-separated columns
    """
    Y = []
    P = []
    with smart_ropen(path) as fi:
        # stream the file instead of calling readlines(): same order,
        # no throwaway list
        for line in fi:
            if line.startswith('#'):
                continue
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    'Bad format: I expected the first column to be an estimate and the last to be the solution.'
                )
            P.append(float(fields[0]))
            Y.append(tuple(fields[-1].split()))
    return np.array(P), tuple(Y)
def mteval(args, workspace, iteration, proxy, target, segments, alias):
    """
    Sample-and-decode a dataset and score it with document-level BLEU.

    Decisions are written to `<workspace>/iterations/<iteration>/<alias>/hyps`
    and scored against `<workspace>/<alias>/refs`.

    :return: BLEU score of the best decisions
    """
    # hoist the shared format string: this is both the decision dir passed to
    # the decoder and the directory where hypotheses are written
    evaldir = '{0}/iterations/{1}/{2}'.format(workspace, iteration, alias)
    decisions = sample_and_decode(args,
                                  '{0}/{1}'.format(workspace, alias),
                                  evaldir,
                                  proxy, target, segments)
    os.makedirs(evaldir, exist_ok=True)
    # one best translation per line
    with smart_wopen('{0}/hyps'.format(evaldir)) as fo:
        for y, p, l in decisions:
            print(y, file=fo)
    bleu, pn, bp = stream_doc_bleu(smart_ropen('{0}/hyps'.format(evaldir)),
                                   smart_ropen('{0}/{1}/refs'.format(workspace, alias)),
                                   max_order=args.bleu_order,
                                   smoothing=args.bleu_smoothing)
    logging.info('BLEU %s: %.4f', alias, bleu)
    return bleu
def read_segments_from_file(path, grammar_dir=None, shuffle=False) -> 'tuple':
    """
    Read cdec-formatted input segments (possibly along with their reference
    translations) from a file.

    :param path: path to file (possibly gzipped)
    :param grammar_dir: overwrites grammar directory
    :param shuffle: shuffle segments inplace
    :return: tuple of SegmentMetaData objects
    """
    stream = smart_ropen(path)
    return read_segments_from_stream(stream, grammar_dir=grammar_dir, shuffle=shuffle)
def decode(seg, args, n_samples, model, saving, redo, log=dummyfunc):
    """
    Sample translations for a segment and return the consensus (best) one.

    If a 'decisions' file from a previous run is complete (and redo is off),
    the best solution is read back from it instead of decoding again.

    :param seg: input segment (must expose an `id` attribute)
    :param args: decoding options (grammars, sampler/chain settings, ...)
    :param n_samples: number of samples to draw
    :param model: scoring model
    :param saving: dict mapping step names to file paths
    :param redo: if True, previously saved steps are ignored
    :param log: logging callable (printf-style)
    :return: the best translation (third ` ||| `-separated field of the top decision)
    """
    # first we check whether the decisions have been completed before
    if is_step_complete('decisions', saving, redo):
        log('[%d] Reusing decisions', seg.id)
        with smart_ropen(saving['decisions']) as fi:
            for line in fi.readlines():
                if line.startswith('#'):  # header/comment lines
                    continue
                line = line.strip()
                if not line:
                    continue
                fields = line.split(' ||| ')  # that should be (loss, posterior, solution)
                if len(fields) == 3:
                    return fields[2]  # that's the solution
        # NOTE(review): if no well-formed line is found we fall through and decode anew

    forest, lfunc, tsort, sampler = make_slice_sampler(seg,
                                                       model,
                                                       extra_grammar_paths=args.extra_grammar,
                                                       glue_grammar_paths=args.glue_grammar,
                                                       pass_through=args.pass_through,
                                                       default_symbol=args.default_symbol,
                                                       goal_str=args.goal,
                                                       start_str=args.start,
                                                       saving=saving,
                                                       redo=args.redo,
                                                       log=log)

    d0, markov_chain = sampler.sample(n_samples=n_samples,
                                      batch_size=args.batch,
                                      within=args.within,
                                      initial=args.initial,
                                      prior=args.prior,
                                      burn=args.burn,
                                      lag=args.lag,
                                      temperature0=args.temperature0)
    # TODO: save stuff

    # discard burn-in and apply lag before taking the consensus decision
    samples = apply_filters(markov_chain, burn=args.burn, lag=args.lag)
    decisions = consensus(seg, forest, samples)

    if 'decisions' in saving:
        # write all decisions to file
        with smart_wopen(saving['decisions']) as fo:
            print('# co-loss ||| posterior ||| solution', file=fo)
            for l, p, y in decisions:
                print('{0} ||| {1} ||| {2}'.format(l, p, y), file=fo)

    return decisions[0][2]  # return best translation
def iterrules(path, transform, fname='Prob'):
    """
    Lazily parse grammar rules from a text file.

    Each non-empty line holds a LHS symbol, a RHS sequence, a yield function
    (ignored), and a `num/den` fraction in the last column.

    :param path: path to the rule file (possibly gzipped)
    :param transform: transformation applied to each rule's probability
    :param fname: name of the feature holding the transformed probability
    :return: generator over CFGProduction objects
    """
    stream = smart_ropen(path)
    for raw in stream:
        raw = raw.strip()
        if not raw:
            continue
        parts = raw.split()
        lhs = parts[0]
        numerator, denominator = parts[-1].split('/')
        prob = float(numerator) / float(denominator)
        rhs = parts[1:-2]  # parts[-2] is the yield function, which we are ignoring
        yield CFGProduction(Nonterminal(lhs),
                            [Nonterminal(s) for s in rhs],
                            {fname: transform(prob)})
def iterlexicon(path, transform, fname='Prob'):
    """
    Lazily parse lexical rules from a text file.

    Each non-empty line is `word<TAB>tag num/den<TAB>tag num/den ...`,
    yielding one unary production per (tag, word) pair.

    :param path: path to the lexicon file (possibly gzipped)
    :param transform: transformation applied to each probability
    :param fname: name of the feature holding the transformed probability
    :return: generator over CFGProduction objects
    """
    stream = smart_ropen(path)
    for raw in stream:
        raw = raw.strip()
        if not raw:
            continue
        columns = raw.split('\t')
        word = columns[0]
        for pair in columns[1:]:
            tag, fraction = pair.split(' ')
            numerator, denominator = fraction.split('/')
            prob = float(numerator) / float(denominator)
            yield CFGProduction(Nonterminal(tag),
                                (Terminal(word),),
                                {fname: transform(prob)})
def read_grammar(istream, transform=float, cdec_adapt=False, fprefix='UnnamedFeature', ply_based=True):
    """
    Read a grammar from an input stream.

    :param istream: an input stream or a path to grammar file.
    :param transform: a transformation (e.g. log).
    :param cdec_adapt: whether or not the input grammar is in cdec format
    :param fprefix: prefix used in naming unnamed features
    :param ply_based: whether or not to use a lex-yacc parser
    :return: a CFG
    """
    # isinstance instead of `type(...) is str`: idiomatic and accepts str subclasses
    if isinstance(istream, str):
        istream = smart_ropen(istream)
    if cdec_adapt:
        istream = cdec_adaptor(istream)
    if ply_based:
        parser = CFGYacc(transform=transform, fprefix=fprefix)
        parser.build(debug=False, optimize=True, write_tables=True, tabmodule='cfg_yacctab')
        return CFG(parser.parse(istream))
    return CFG(read_basic(istream, transform))
def get_factorised_models(model: Model, path='') -> (ModelView, ModelView):
    """
    Return a joint and a conditional factorisation of the model.

    :param model: a Model
    :param path: (optional) path to a file changing the default way of factorising a model
    :return: joint view and conditional view
    :raises ValueError: if a directive appears before a [joint]/[conditional] header
    """
    joint_changes = defaultdict(set)
    conditional_changes = defaultdict(set)
    if path:
        with smart_ropen(path) as fi:
            section = None  # the factorisation currently being filled
            for raw in fi:
                entry = raw.strip()
                # ignore comments and empty lines
                if not entry or entry.startswith('#'):
                    continue
                if entry == '[joint]':
                    section = joint_changes
                elif entry == '[conditional]':
                    section = conditional_changes
                elif section is None:
                    raise ValueError('Syntax error in factorisation file')
                elif entry.startswith('local='):
                    section['local'].update(entry.replace('local=', '', 1).split())
                elif entry.startswith('nonlocal='):
                    section['nonlocal'].update(entry.replace('nonlocal=', '', 1).split())
    joint_model = ModelView(model.wmap, model.extractors(),
                            local_names=joint_changes['local'],
                            nonlocal_names=joint_changes['nonlocal'])
    conditional_model = ModelView(model.wmap, model.extractors(),
                                  local_names=conditional_changes['local'],
                                  nonlocal_names=conditional_changes['nonlocal'])
    return joint_model, conditional_model