def pass0_to_pass1(seg, options, lookup, stateless, saving={}, redo=True, log=dummyfunc) -> 'tuple':
    """
    Pass1 consists in obtaining a target forest and locally scoring it.

    Steps
        1. Project target side of the forest
        2. Lookup scoring
        3. Stateless scoring

    :return: target forest and components (one FComponents object per edge)
    """
    if is_step_complete('pass1.forest', saving, redo):
        # try to reuse previous results
        tgt_forest = unpickle_it(saving['pass1.forest'])
    else:
        # execute pass0
        src_forest = pass0(seg,
                           extra_grammar_paths=options.extra_grammars,
                           glue_grammar_paths=options.glue_grammars,
                           pass_through=options.pass_through,
                           default_symbol=options.default_symbol,
                           goal_str=options.goal,
                           start_str=options.start,
                           max_span=options.max_span,
                           n_goal=0, saving={}, redo=redo, log=dummyfunc)
        # target projection
        log('[%d] Project target rules', seg.id)
        tgt_forest = make_target_forest(src_forest)
        if 'pass1.forest' in saving:
            pickle_it(saving['pass1.forest'], tgt_forest)

    if is_step_complete('pass1.components', saving, redo):
        components = unpickle_it(saving['pass1.components'])
    else:
        log('[%d] Lookup scoring', seg.id)
        lookup_comps = get_lookup_components(tgt_forest, lookup.extractors())
        log('[%d] Stateless scoring', seg.id)
        stateless_comps = get_stateless_components(tgt_forest, stateless.extractors())
        components = [FComponents([comps1, comps2])
                      for comps1, comps2 in zip(lookup_comps, stateless_comps)]
        if 'pass1.components' in saving:
            pickle_it(saving['pass1.components'], components)

    return tgt_forest, components

def pass2(seg, forest,
          lookup_scorer, stateless_scorer, stateful_scorer,
          goal_rule, omega=None,
          saving={}, redo=True, log=dummyfunc) -> 'tuple':
    """
    Pass2 consists in exactly rescoring a forest.

    :return: rescored forest (a Hypergraph), and components (one FComponents object per edge)
    """
    if is_step_complete('forest', saving, redo) and is_step_complete('components', saving, redo):
        rescored_forest = unpickle_it(saving['forest'])
        components = unpickle_it(saving['components'])
        return rescored_forest, components

    log('[%d] Forest rescoring', seg.id)
    rescored_forest, components = rescore_forest(forest, 0,
                                                 lookup_scorer,
                                                 stateless_scorer,
                                                 stateful_scorer,
                                                 goal_rule=goal_rule,
                                                 omega=omega,
                                                 keep_components=True)
    if 'forest' in saving:
        pickle_it(saving['forest'], rescored_forest)
    if 'components' in saving:
        pickle_it(saving['components'], components)

    return rescored_forest, components

def ref_expectations(seg, args, staticdir, model):
    r"""
    Return Z(x, y \in ref) and the expected feature vector.
    """
    # 1. Load pickled objects if necessary
    logging.debug('[%d] Loading pickled reference forest and components', seg.id)
    forest = unpickle_it('{0}/{1}.ref.forest'.format(staticdir, seg.id))
    components = unpickle_it('{0}/{1}.ref.ffs.all'.format(staticdir, seg.id))
    tsort = AcyclicTopSortTable(forest)
    # 2. Compute f(d|x, y)
    logging.debug('[%d] Computing f(d|x,y)', seg.id)
    weights = np.array([model.score(components[e]) for e in range(forest.n_edges())],
                       dtype=ptypes.weight)
    fe = TableLookupFunction(weights)
    # 3. Compute expectations
    logging.debug('[%d] Computing expectations', seg.id)
    Z, mean = expected_components(forest, fe, tsort, semiring.inside, model, components)
    return Z, mean

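
# How the two expectation routines fit together: ref_expectations (above) and
# hyp_expectations (further below) are the natural ingredients of a
# likelihood-style gradient. A minimal sketch, assuming the usual identity for
# log-linear models (the driver loop and variable names here are illustrative,
# not part of this module):
#
#     Z_ref, mean_ref = ref_expectations(seg, args, staticdir, model)
#     Z_hyp, mean_hyp = hyp_expectations(seg, args, workspace, model)
#     # d/dw log p(y|x) = <phi(d)>_{p(d|x,y)} - <phi(d)>_{p(d|x)}
#     # and log p(y|x) itself is Z_ref - Z_hyp when Z values live in log space
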
def make_slice_sampler(seg, model,
                       extra_grammar_paths=[], glue_grammar_paths=[], pass_through=True,
                       default_symbol='X', goal_str='GOAL', start_str='S',
                       saving={}, redo=True, log=dummyfunc) -> 'tuple':
    """
    Construct a slice sampler over a segment's target forest,
    where f(d) = n(d) * l(d).

    :return: target forest, local weight function l(d), top sort table, and slice sampler
    """
    # check for pass1
    if all(is_step_complete(step, saving, redo) for step in ['forest', 'lookup', 'stateless']):
        tgt_forest = unpickle_it(saving['forest'])
        lookup_comps = unpickle_it(saving['lookup'])
        stateless_comps = unpickle_it(saving['stateless'])
    else:
        src_forest = pass0(seg,
                           extra_grammar_paths=extra_grammar_paths,
                           glue_grammar_paths=glue_grammar_paths,
                           pass_through=pass_through,
                           default_symbol=default_symbol,
                           goal_str=goal_str, start_str=start_str,
                           n_goal=0, saving={}, redo=redo, log=log)
        # pass1: local scoring
        tgt_forest, lookup_comps, stateless_comps = pass1(seg, src_forest, model,
                                                          saving=saving, redo=redo, log=log)

    # l(d)
    lfunc = TableLookupFunction(np.array([semiring.inside.times(model.lookup.score(ff1),
                                                                model.stateless.score(ff2))
                                          for ff1, ff2 in zip(lookup_comps, stateless_comps)],
                                         dtype=ptypes.weight))
    # top sort table
    tsort = AcyclicTopSortTable(tgt_forest)
    goal_maker = GoalRuleMaker(goal_str=goal_str, start_str=start_str, n=1)
    # slice sampler
    sampler = SlicedRescoring(tgt_forest, lfunc, tsort,
                              TableLookupScorer(model.dummy),
                              StatelessScorer(model.dummy),
                              StatefulScorer(model.stateful),
                              semiring.inside,
                              goal_rule=goal_maker.get_oview(),
                              dead_rule=make_dead_oview())
    return tgt_forest, lfunc, tsort, sampler

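
# Usage sketch for the sampler constructed above, mirroring the call made in
# slice_sample further below (all argument values come from the caller's
# command-line options; nothing here is executed at import time):
#
#     tgt_forest, lfunc, tsort, sampler = make_slice_sampler(seg, model, saving=saving, redo=redo)
#     d0, markov_chain = sampler.sample(n_samples=args.samples[0], batch_size=args.batch,
#                                       within=args.within, initial=args.initial,
#                                       prior=args.prior, burn=args.burn, lag=args.lag,
#                                       temperature0=args.temperature0)
#     samples = apply_filters(markov_chain, burn=args.burn, lag=args.lag)
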
def hyp_expectations(seg, args, workspace, model):
    """
    Return Z(x) and the expected feature vector.
    """
    # 1. Load ffs
    components = unpickle_it('{0}/{1}.hyp.ffs.all'.format(workspace, seg.id))

    # 2. Re-estimate probabilities
    logging.debug('[%d] Computing f(d|x)', seg.id)
    # estimate p(d) by renormalising f(d) [which already incorporates sample frequency]
    fd = np.array([model.score(comp) for comp in components], dtype=ptypes.weight)
    pd = semiring.inside.normalise(fd)
    # estimate Z(x) = \sum_d f(d)
    Z = semiring.inside.plus.reduce(fd)
    # estimate <phi(d)> wrt f(d)
    mean = model.constant(semiring.prob.zero)
    # here we use the renormalised distribution to compute expected features
    for p, comp in zip(pd, components):
        mean = mean.hadamard(comp.elementwise(FixedLHS(semiring.inside.as_real(p),
                                                       semiring.prob.times)),
                             semiring.prob.plus)
    return Z, mean

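
# Self-contained illustration of the estimator used in hyp_expectations, with
# plain numpy arrays standing in for the semiring/FComponents machinery, and
# assuming (as `as_real` above suggests) that inside values live in log space.
# `fd` holds log f(d) per sampled derivation and `phi` one real-valued feature
# vector per derivation; both are hypothetical inputs:
def _expectations_sketch(fd, phi):
    # log Z(x) = logsumexp(fd), i.e. semiring.inside.plus.reduce in log space
    m = fd.max()
    logZ = m + np.log(np.exp(fd - m).sum())
    # p(d) by renormalisation (semiring.inside.normalise),
    # then <phi> = \sum_d p(d) * phi(d)
    pd = np.exp(fd - logZ)
    return logZ, pd @ phi
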
def importance_sample(seg, options, proxy, target, saving={}, redo=True, log=dummyfunc):
    """
    Sample derivations from a proxy distribution q(d) and weigh them against
    the target distribution p(d).

    :param seg: a segment
    :param options: sampling options (e.g. number of samples)
    :param proxy: a factorised view of the proxy model q
    :param target: a factorised view of the target model p
    :param saving: path map for intermediate results
    :param redo: whether or not previously saved computation should be discarded
    :param log: a logging function
    :return: a list of ISYield objects (one per distinct yield)
    """
    if is_step_complete('is.samples', saving, redo):
        return unpickle_it(saving['is.samples'])

    q_forest, q_components = pass0_to_pass2(seg, options,
                                            TableLookupScorer(proxy.lookup),
                                            StatelessScorer(proxy.stateless),
                                            StatefulScorer(proxy.stateful),
                                            saving=saving, redo=redo, log=log)

    # Make unnormalised q(d)
    q_func = TableLookupFunction(np.array([proxy.score(comps) for comps in q_components],
                                          dtype=ptypes.weight))
    log('[%d] Q-forest: nodes=%d edges=%d', seg.id, q_forest.n_nodes(), q_forest.n_edges())

    tsort = AcyclicTopSortTable(q_forest)
    sampler = AncestralSampler(q_forest, tsort, omega=q_func)
    samples = sampler.sample(options.samples)

    d_groups = group_by_identity(samples)
    y_groups = group_by_projection(d_groups, lambda group: yield_string(q_forest, group.key))

    is_yields = []
    for y_group in y_groups:
        y = y_group.key
        is_derivations = []
        for d_group in y_group.values:
            edges = d_group.key
            # reduce q weights through inside.times
            q_score = derivation_weight(q_forest, edges, semiring.inside, omega=q_func)
            # reduce q components through inside.times
            q_comps = proxy.constant(semiring.inside.one)
            for e in edges:
                q_comps = q_comps.hadamard(q_components[e], semiring.inside.times)
            # compute p components and p score
            p_comps, p_score = score_derivation(q_forest, edges, semiring.inside,
                                                TableLookupScorer(target.lookup),
                                                StatelessScorer(target.stateless),
                                                StatefulScorer(target.stateful))
            # TODO: save {y => {edges: (q_comps, p_comps, count)}}
            is_derivations.append(ISDerivation(edges, q_comps, p_comps, d_group.count))
        is_yields.append(ISYield(y, is_derivations, y_group.count))

    if 'is.samples' in saving:
        pickle_it(saving['is.samples'], is_yields)

    return is_yields

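
# Hedged post-processing sketch: turning the ISYield/ISDerivation records built
# above into self-normalised importance weights, log w(d) = log p(d) - log q(d).
# We assume both records are namedtuples whose field names match the constructor
# calls above (in particular `derivations` for the list inside an ISYield);
# the helper itself is ours, not part of the codebase:
def _importance_weights_sketch(is_yields, proxy, target):
    logw = np.array([target.score(d.p_comps) - proxy.score(d.q_comps) + np.log(d.count)
                     for y in is_yields for d in y.derivations],
                    dtype=ptypes.weight)
    # normalise in log space: w(d) / \sum_d' w(d')
    return np.exp(logw - semiring.inside.plus.reduce(logw))
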
def pass1(seg, src_forest, model, saving={}, redo=True, log=dummyfunc) -> 'tuple':
    """
    Pass1 consists in obtaining a target forest and locally scoring it.

    Steps
        1. Project target side of the forest
        2. Lookup scoring
        3. Stateless scoring

    :return: target forest, lookup components and stateless components
    """
    if is_step_complete('forest', saving, redo):
        tgt_forest = unpickle_it(saving['forest'])
    else:
        # target projection
        log('[%d] Project target rules', seg.id)
        tgt_forest = make_target_forest(src_forest)
        if 'forest' in saving:
            pickle_it(saving['forest'], tgt_forest)

    # local scoring
    if is_step_complete('lookup', saving, redo):
        lookup_comps = unpickle_it(saving['lookup'])
    else:
        log('[%d] Lookup scoring', seg.id)
        lookup_comps = get_lookup_components(tgt_forest, model.lookup.extractors())
        if 'lookup' in saving:
            pickle_it(saving['lookup'], lookup_comps)

    if is_step_complete('stateless', saving, redo):
        stateless_comps = unpickle_it(saving['stateless'])
    else:
        log('[%d] Stateless scoring', seg.id)
        stateless_comps = get_stateless_components(tgt_forest, model.stateless.extractors())
        if 'stateless' in saving:
            pickle_it(saving['stateless'], stateless_comps)

    return tgt_forest, lookup_comps, stateless_comps

def estimate_partition_function(seg, model, merging):
    r"""
    Return Z(x, y \in refset) and Z(x, y \not\in refset).
    """
    # 1. Load unique derivations, separating them depending on whether or not
    # they belong to the reference set
    components_R = []  # references
    components_C = []  # complement (not references)
    seen = set()
    for supportdir in merging:
        for is_ref, edges, components in unpickle_it('{0}/{1}.D.ffs.all'.format(supportdir, seg.id)):
            if edges in seen:  # no duplicates
                continue
            seen.add(edges)
            if is_ref:
                components_R.append(components)
            else:
                components_C.append(components)
    logging.info('[%d] D(x) |R|=%d |C|=%d', seg.id, len(components_R), len(components_C))

    # 2. Re-estimate probabilities
    logging.debug('[%d] Computing f(d|x) to estimate partition function', seg.id)
    if len(components_R):
        Z_R = semiring.inside.plus.reduce(np.array([model.score(comp) for comp in components_R],
                                                   dtype=ptypes.weight))
    else:
        Z_R = semiring.inside.zero
    if len(components_C):
        Z_C = semiring.inside.plus.reduce(np.array([model.score(comp) for comp in components_C],
                                                   dtype=ptypes.weight))
    else:
        Z_C = semiring.inside.zero
    return Z_R, Z_C

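
# Note (illustrative): with both halves of the support at hand, the full
# partition function follows from one more semiring plus; assuming log-space
# weights (as `as_real` elsewhere in this module suggests) this is a
# numerically stable logaddexp:
#
#     Z_R, Z_C = estimate_partition_function(seg, model, merging)
#     Z = np.logaddexp(Z_R, Z_C)  # Z(x) = Z(x, y in refset) + Z(x, y not in refset) in real space
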
def pass0(seg,
          extra_grammar_paths=[], glue_grammar_paths=[], pass_through=True,
          default_symbol='X', goal_str='GOAL', start_str='S', max_span=-1,
          n_goal=0, saving={}, redo=True, log=dummyfunc) -> 'Hypergraph':
    """
    Pass0 consists in parsing with the source side of the grammar.
    For now, pass0 does not do any scoring (not even local), but it could (TODO).

    Steps
        1. Make a hypergraph view of the grammar
        2. Make an input DFA
        3. Parse the input DFA

    :return: source forest
    """
    if is_step_complete('forest', saving, redo):
        return unpickle_it(saving['forest'])

    # here we need to decode for sure
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    # make a hypergraph view of all available grammars
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=extra_grammar_paths,
                                      glue_grammar_paths=glue_grammar_paths,
                                      pass_through=pass_through,
                                      default_symbol=default_symbol)

    # parse source lattice
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=goal_str, start_str=start_str, n=n_goal)
    dfa = make_input_dfa(seg)
    forest = parse_dfa(grammar,
                       grammar.fetch(Nonterminal(start_str)),
                       dfa,
                       goal_maker.get_iview(),
                       bottomup=True,
                       constraint=HieroConstraints(grammar, dfa, max_span))
    if 'forest' in saving:
        pickle_it(saving['forest'], forest)
    return forest

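
# How the passes chain together end to end (hypothetical driver; `seg`, `model`
# and `options` come from the caller, and pass1/pass2 are defined above):
#
#     src_forest = pass0(seg, goal_str=options.goal, start_str=options.start,
#                        max_span=options.max_span, n_goal=0)
#     tgt_forest, lookup_comps, stateless_comps = pass1(seg, src_forest, model)
#     goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=1)
#     forest, components = pass2(seg, tgt_forest,
#                                TableLookupScorer(model.lookup),
#                                StatelessScorer(model.stateless),
#                                StatefulScorer(model.stateful),
#                                goal_rule=goal_maker.get_oview())
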
def biparse(seg: SegmentMetaData, options: SimpleNamespace,
            joint_model: ModelView, conditional_model: ModelView,
            workingdir=None, redo=True, log=dummyfunc) -> SimpleNamespace:
    """
    Biparse a segment using a local model.

    1. we parse the source with a joint model
    2. we bi-parse source and target with a conditional model

    This separation allows us to factorise these models differently wrt local/nonlocal components.
    For example, an LM may be seen as a local (read tractable) component of a conditional model,
    and as a nonlocal (read intractable) component of a joint model.

    An implementation detail: bi-parsing is implemented as a cascade of intersections
    (with projections in between).

    :param seg: a segment
    :param options: parsing options
    :param joint_model: a factorised view of the joint model, here we use only the local components
    :param conditional_model: a factorised view of the conditional model, here we use only the local components
    :param workingdir: where to save files
    :param redo: whether or not previously saved computation should be discarded
    :param log: a logging function
    :return: result.{joint,conditional}.{forest,components} for the respective local model
    """
    if workingdir:
        saving = preprocessed_training_files('{0}/{1}'.format(workingdir, seg.id))
    else:
        saving = {}

    result = SimpleNamespace()
    result.joint = SimpleNamespace()
    result.conditional = SimpleNamespace()

    if conditional_model is None:
        steps = ['joint.forest', 'joint.components']
        if all(is_step_complete(step, saving, redo) for step in steps):
            log('[%d] Reusing joint distribution from files', seg.id)
            result.joint.forest = unpickle_it(saving['joint.forest'])
            result.joint.components = unpickle_it(saving['joint.components'])
            result.conditional.forest = None
            result.conditional.components = []
            return result

    steps = ['joint.forest', 'joint.components', 'conditional.forest', 'conditional.components']
    if all(is_step_complete(step, saving, redo) for step in steps):
        log('[%d] Reusing joint and conditional distributions from files', seg.id)
        result.joint.forest = unpickle_it(saving['joint.forest'])
        result.joint.components = unpickle_it(saving['joint.components'])
        result.conditional.forest = unpickle_it(saving['conditional.forest'])
        result.conditional.components = unpickle_it(saving['conditional.components'])
        return result

    # 1. Make a grammar
    # here we need to decode for sure
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    # make a hypergraph view of all available grammars
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=options.extra_grammars,
                                      glue_grammar_paths=options.glue_grammars,
                                      pass_through=options.pass_through,
                                      default_symbol=options.default_symbol)

    # 2. Joint distribution - Step 1: parse source lattice
    n_goal = 0
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=n_goal)
    src_dfa = make_input_dfa(seg)
    src_forest = parse_dfa(grammar,
                           grammar.fetch(Nonterminal(options.start)),
                           src_dfa,
                           goal_maker.get_iview(),
                           bottomup=True,
                           constraint=HieroConstraints(grammar, src_dfa, options.max_span))
    if not src_forest:
        raise ValueError('I cannot parse the input lattice: '
                         'i) make sure your grammar has glue rules; '
                         'ii) make sure it handles OOVs')

    # 3. Target projection of the forest
    log('[%d] Project target rules', seg.id)
    tgt_forest = make_target_forest(src_forest)

    # 4. Joint distribution - Step 2: scoring
    log('[%d] Joint model: exact (local) scoring', seg.id)
    result.joint = exact_rescoring(joint_model.local_model(), tgt_forest, goal_maker, log)

    # save joint distribution
    if 'joint.forest' in saving:
        pickle_it(saving['joint.forest'], result.joint.forest)
    if 'joint.components' in saving:
        pickle_it(saving['joint.components'], result.joint.components)

    if conditional_model is None:
        result.conditional.forest = None
        result.conditional.components = []
        return result

    # 5. Conditional distribution - Step 1: parse the reference lattice
    log('[%d] Parse reference DFA', seg.id)
    ref_dfa = make_reference_dfa(seg)
    goal_maker.update()
    ref_forest = parse_dfa(result.joint.forest, 0, ref_dfa, goal_maker.get_oview(), bottomup=False)

    if not ref_forest:  # the references cannot be parsed
        log('[%d] References cannot be parsed', seg.id)
        result.conditional.forest = ref_forest
        result.conditional.components = []
    else:
        # 6. Conditional distribution - Step 2: scoring
        log('[%d] Conditional model: exact (local) scoring', seg.id)
        result.conditional = exact_rescoring(conditional_model.local_model(), ref_forest, goal_maker, log)

    # save conditional distribution
    if 'conditional.forest' in saving:
        pickle_it(saving['conditional.forest'], result.conditional.forest)
    if 'conditional.components' in saving:
        pickle_it(saving['conditional.components'], result.conditional.components)

    return result

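
# The "cascade of intersections" mentioned in biparse's docstring, spelled out
# (all names as in the function body above):
#
#     grammar ∩ src_dfa   ->  src_forest           (parse_dfa, bottom-up)
#     project target      ->  tgt_forest           (make_target_forest)
#     local rescoring     ->  result.joint         (exact_rescoring)
#     joint ∩ ref_dfa     ->  ref_forest           (parse_dfa, top-down)
#     local rescoring     ->  result.conditional   (exact_rescoring)
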
def pass0_to_pass2(seg, options, lookup, stateless, stateful,
                   saving={}, redo=True, log=dummyfunc) -> 'tuple':
    """
    Pass2 consists in exactly rescoring a forest.

    :return: rescored forest (a Hypergraph), and components (one FComponents object per edge)
    """
    # We try to reuse previous results
    if is_step_complete('pass2.forest', saving, redo) and is_step_complete('pass2.components', saving, redo):
        forest = unpickle_it(saving['pass2.forest'])
        components = unpickle_it(saving['pass2.components'])
        return forest, components

    # We check whether we need pass2
    if not stateful:
        # execute passes 0 to 1 only
        forest, components = pass0_to_pass1(seg, options, lookup, stateless, saving, redo=redo, log=log)
        # TODO: complete components with empty stateful model
        # save (or link) forest
        if 'pass2.forest' in saving:
            if 'pass1.forest' in saving:
                symlink(saving['pass1.forest'], saving['pass2.forest'])
            else:
                pickle_it(saving['pass2.forest'], forest)
        # save (or link) components
        if 'pass2.components' in saving:
            if 'pass1.components' in saving:
                symlink(saving['pass1.components'], saving['pass2.components'])
            else:
                pickle_it(saving['pass2.components'], components)
        return forest, components

    # From here we are sure we have stateful scorers:
    # first execute passes 0 to 1 (and discard the dummy components)
    forest, _ = pass0_to_pass1(seg, options,
                               TableLookupScorer(DummyModel()),
                               StatelessScorer(DummyModel()),
                               saving, redo=redo, log=log)
    # then fully re-score the forest (keeping all components)
    log('[%d] Forest rescoring', seg.id)
    goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=1)
    forest, components = rescore_forest(forest, 0,
                                        TableLookupScorer(lookup),
                                        StatelessScorer(stateless),
                                        StatefulScorer(stateful),
                                        goal_rule=goal_maker.get_oview(),
                                        keep_components=True)
    # save the forest
    if 'pass2.forest' in saving:
        pickle_it(saving['pass2.forest'], forest)
    # save the components
    if 'pass2.components' in saving:
        pickle_it(saving['pass2.components'], components)
    return forest, components

def slice_sample(seg, args, staticdir, supportdir, workspace, model):
    files = ['{0}/{1}.D.ffs.all'.format(supportdir, seg.id),
             '{0}/{1}.hyp.ffs.all'.format(workspace, seg.id)]
    if all(os.path.exists(path) for path in files) and not args.redo:
        logging.info('Reusing samples for segment %d', seg.id)
        return

    # 1. Load pickled objects
    logging.debug('[%d] Loading target forest', seg.id)
    forest = unpickle_it('{0}/{1}.hyp.forest'.format(staticdir, seg.id))
    # TODO: store top sort table
    logging.debug('[%d] Loading local components', seg.id)
    lookupffs = unpickle_it('{0}/{1}.hyp.ffs.rule'.format(staticdir, seg.id))
    statelessffs = unpickle_it('{0}/{1}.hyp.ffs.stateless'.format(staticdir, seg.id))

    # 2. Compute l(d)
    # there is a guarantee that lookup components and stateless components were
    # computed over the same forest, that is, with the same nodes/edges structure;
    # this is crucial to compute l(d) as below
    logging.debug('[%d] Computing l(d)', seg.id)
    lfunc = TableLookupFunction(np.array([semiring.inside.times(model.lookup.score(ff1),
                                                                model.stateless.score(ff2))
                                          for ff1, ff2 in zip(lookupffs, statelessffs)],
                                         dtype=ptypes.weight))

    # 3. Sample from f(d) = n(d) * l(d)
    logging.debug('[%d] Sampling from f(d) = n(d) * l(d)', seg.id)
    tsort = AcyclicTopSortTable(forest)
    goal_maker = GoalRuleMaker(args.goal, args.start, n=2)
    sampler = SlicedRescoring(forest, lfunc, tsort,
                              TableLookupScorer(model.dummy),
                              StatelessScorer(model.dummy),
                              StatefulScorer(model.stateful),
                              semiring.inside,
                              goal_maker.get_oview(),
                              OutputView(make_dead_srule()))
    # here samples are represented as sequences of edge ids
    d0, markov_chain = sampler.sample(n_samples=args.samples[0],
                                      batch_size=args.batch,
                                      within=args.within,
                                      initial=args.initial,
                                      prior=args.prior,
                                      burn=args.burn,
                                      lag=args.lag,
                                      temperature0=args.temperature0)

    # save empirical support
    pickle_it('{0}/{1}.D.ffs.all'.format(supportdir, seg.id),
              get_empirical_support(model, frozenset(seg.refs), forest,
                                    lookupffs, statelessffs, markov_chain))

    # apply the usual MCMC filters to the Markov chain
    samples = apply_filters(markov_chain, burn=args.burn, lag=args.lag)
    n_samples = len(samples)

    # 4. Complete feature vectors and compute expectation
    hypcomps = []
    hypexp = model.constant(semiring.prob.zero)
    d_groups = group_by_identity(samples)
    for d_group in d_groups:
        derivation = d_group.key
        # reconstruct components
        lookup_comps = model.lookup.constant(semiring.inside.one)
        stateless_comps = model.stateless.constant(semiring.inside.one)
        for e in derivation.edges:
            lookup_comps = lookup_comps.hadamard(lookupffs[e], semiring.inside.times)
            stateless_comps = stateless_comps.hadamard(statelessffs[e], semiring.inside.times)
        # complete components (lookup, stateless, stateful)
        # note that here we are updating derivation.components!
        derivation.components = FComponents([lookup_comps, stateless_comps, derivation.components])
        # incorporate sample frequency
        hypcomps.append(derivation.components.power(float(d_group.count) / n_samples, semiring.inside))
        hypexp = hypexp.hadamard(hypcomps[-1], semiring.prob.plus)
    # save feature vectors
    pickle_it('{0}/{1}.hyp.ffs.all'.format(workspace, seg.id), hypcomps)

    # 5. Log stuff
    if args.save_d:
        save_mcmc_derivations('{0}/{1}.hyp.d.gz'.format(workspace, seg.id),
                              d_groups,
                              valuefunc=lambda d: d.score,
                              compfunc=lambda d: d.components,
                              derivation2str=lambda d: bracketed_string(forest, d.edges))
    if args.save_y:
        projections = group_by_projection(samples, lambda d: yield_string(forest, d.edges))
        save_mcmc_yields('{0}/{1}.hyp.y.gz'.format(workspace, seg.id), projections)
    if args.save_chain:
        markov_chain.appendleft(d0)
        save_markov_chain('{0}/{1}.hyp.chain.gz'.format(workspace, seg.id),
                          markov_chain,
                          flat=True,
                          valuefunc=lambda d: d.score,
                          # compfunc=lambda d: d.components,  # TODO: complete feature vectors of all derivations
                          derivation2str=lambda d: bracketed_string(forest, d.edges))