Exemplo n.º 1
0
def exact_rescoring(model: ModelContainer,
                    forest: Hypergraph, goal_maker: GoalRuleMaker, log=dummyfunc) -> SimpleNamespace:
    """
    Rescore a forest exactly under the given model.

    Two paths exist, selected by the truthiness of `model.stateful`:

    * stateful: the goal maker is updated and the forest is handed to
      `rescore_forest` together with lookup, stateless and stateful scorers;
      the rescored forest and its components are returned.
    * not stateful: lookup and stateless components are extracted directly
      from the forest and the input forest object itself is returned (the
      original author notes that Earley is not needed in this case).

    :param model: an instance of ModelContainer (its `lookup`, `stateless` and `stateful` parts are used)
    :param forest: the Hypergraph to be rescored
    :param goal_maker: an object that delivers (the output view of) goal rules
    :param log: a logging function
    :return: a SimpleNamespace with attributes `forest` and `components`
    """
    if model.stateful:
        # stateful components are present: a full rescoring pass cannot be avoided
        log('Forest rescoring')
        goal_maker.update()
        lookup_scorer = TableLookupScorer(model.lookup)
        stateless_scorer = StatelessScorer(model.stateless)
        stateful_scorer = StatefulScorer(model.stateful)
        rescored, components = rescore_forest(forest,
                                              0,
                                              lookup_scorer,
                                              stateless_scorer,
                                              stateful_scorer,
                                              goal_rule=goal_maker.get_oview(),
                                              keep_components=True)
        return SimpleNamespace(forest=rescored, components=components)

    # only local components: score the forest in place, edge structure is untouched
    log('Lookup scoring')
    from_lookup = get_lookup_components(forest, model.lookup.extractors())
    log('Stateless scoring')
    from_stateless = get_stateless_components(forest, model.stateless.extractors())

    # pair up the lookup and stateless components position by position
    paired = [FComponents([lookup_part, stateless_part])
              for lookup_part, stateless_part in zip(from_lookup, from_stateless)]
    return SimpleNamespace(forest=forest, components=paired)
Exemplo n.º 2
0
def training_biparse(seg, args, workingdir, model, log=dummyfunc) -> 'bool':
    """
    Bi-parse a training segment, saving intermediate results under `workingdir`.

    Steps:
        I. Pass0 and pass1: parse the source, project, apply local scoring
        II. Pass2
            - make a reference DFA
            - parse the reference DFA against the target forest
            - fully score the reference forest (lookup, stateless, stateful)
                - save the rescored forest and its components

    If all pass1 files already exist and `args.redo` is falsy, no work is
    done: the answer is True when the reference files exist as well and
    False otherwise.

    :param seg: the segment (its `id` names the output files)
    :param args: options (extra_grammar, glue_grammar, pass_through, default_symbol, goal, start, redo)
    :param workingdir: directory where files are looked up and saved
    :param model: the model; passed to pass1 and used to build the pass2 scorers
    :param log: a logging function
    :return: whether or not the input is bi-parsable
    """

    # where pass1 (hypothesis side) and pass2 (reference side) store their output
    saving1 = {
        'forest': '{0}/{1}.hyp.forest'.format(workingdir, seg.id),
        'lookup': '{0}/{1}.hyp.ffs.rule'.format(workingdir, seg.id),
        'stateless': '{0}/{1}.hyp.ffs.stateless'.format(workingdir, seg.id),
    }
    saving2 = {
        'forest': '{0}/{1}.ref.forest'.format(workingdir, seg.id),
        'components': '{0}/{1}.ref.ffs.all'.format(workingdir, seg.id),
    }
    pass1_files = [saving1['forest'], saving1['lookup'], saving1['stateless']]
    ref_files = [saving2['components'], saving2['forest']]

    # avoid redundant work: pass1 output on disk means this segment was processed before
    pass1_done = all(os.path.exists(path) for path in pass1_files)
    if pass1_done and not args.redo:
        refs_done = all(os.path.exists(path) for path in ref_files)
        if refs_done:
            log('[%d] Reusing forests for segment', seg.id)
        # reference files present -> parsable; absent -> reported as not parsable
        return refs_done

    # pass0: parse the source
    src_forest = pass0(seg,
                       extra_grammar_paths=args.extra_grammar,
                       glue_grammar_paths=args.glue_grammar,
                       pass_through=args.pass_through,
                       default_symbol=args.default_symbol,
                       goal_str=args.goal,
                       start_str=args.start,
                       n_goal=0,
                       saving={},
                       redo=args.redo,
                       log=log)

    # pass1: local scoring (only the projected forest is needed further down)
    tgt_forest, _lookup_comps, _stateless_comps = pass1(seg,
                                                        src_forest,
                                                        model,
                                                        saving=saving1,
                                                        redo=args.redo,
                                                        log=log)

    # intersect the target forest with the reference
    log('[%d] Parse reference DFA', seg.id)
    ref_dfa = make_reference_dfa(seg)
    goal_maker = GoalRuleMaker(goal_str=args.goal, start_str=args.start, n=1)
    ref_forest = parse_dfa(tgt_forest,
                           0,
                           ref_dfa,
                           goal_maker.get_oview(),
                           bottomup=False)

    if not ref_forest:
        return False  # not parsable

    # pass2: rescore the reference forest with the complete model
    goal_maker.update()
    pass2(seg, ref_forest,
          TableLookupScorer(model.lookup),
          StatelessScorer(model.stateless),
          StatefulScorer(model.stateful),
          goal_maker.get_oview(),
          saving=saving2, redo=args.redo,
          log=log)

    return True  # parsable
Exemplo n.º 3
0
def biparse(seg: SegmentMetaData, options: SimpleNamespace,
            joint_model: ModelView, conditional_model: ModelView,
            workingdir=None, redo=True, log=dummyfunc) -> SimpleNamespace:
    """
    Biparse a segment using local models.

    The work is split in two:
        1. the source is parsed and scored with the joint model;
        2. source and target are bi-parsed with the conditional model.
    Keeping the two apart lets each model be factorised differently with
    respect to local/nonlocal components: an LM, for instance, may be a local
    (read tractable) component of a conditional model and a nonlocal
    (read intractable) component of a joint model.
    Implementation detail: bi-parsing is a cascade of intersections, with
    projections in between.

    When `conditional_model` is None only the joint part is computed and
    the conditional part is returned as forest=None, components=[].

    :param seg: a segment
    :param options: parsing options
    :param joint_model: a factorised view of the joint model, only its local components are used here
    :param conditional_model: a factorised view of the conditional model (or None), only its local components are used here
    :param workingdir: where to save files (nothing is pickled when this is falsy)
    :param redo: whether or not previously saved computation should be discarded
    :param log: a logging function
    :return: result.{joint,conditional}.{forest,components} for the respective local model
    :raises ValueError: if the source lattice cannot be parsed
    """

    # an empty mapping disables all the pickling further down
    saving = preprocessed_training_files('{0}/{1}'.format(workingdir, seg.id)) if workingdir else {}

    result = SimpleNamespace(joint=SimpleNamespace(), conditional=SimpleNamespace())
    joint_only = conditional_model is None

    # Shortcut A: only the joint distribution is wanted and it is already available
    if joint_only:
        joint_steps = ['joint.forest', 'joint.components']
        if all(is_step_complete(step, saving, redo) for step in joint_steps):
            log('[%d] Reusing joint and conditional distributions from files', seg.id)
            result.joint.forest = unpickle_it(saving['joint.forest'])
            result.joint.components = unpickle_it(saving['joint.components'])
            result.conditional.forest = None
            result.conditional.components = []
            return result

    # Shortcut B: both distributions are already available
    all_steps = ['joint.forest', 'joint.components', 'conditional.forest', 'conditional.components']
    if all(is_step_complete(step, saving, redo) for step in all_steps):
        log('[%d] Reusing joint and conditional distributions from files', seg.id)
        result.joint.forest = unpickle_it(saving['joint.forest'])
        result.joint.components = unpickle_it(saving['joint.components'])
        result.conditional.forest = unpickle_it(saving['conditional.forest'])
        result.conditional.components = unpickle_it(saving['conditional.components'])
        return result

    # From here on we have to decode.

    # 1. Grammar: a hypergraph view of all available grammars
    log('[%d] Make hypergraph view of all available grammars', seg.id)
    grammar = make_grammar_hypergraph(seg,
                                      extra_grammar_paths=options.extra_grammars,
                                      glue_grammar_paths=options.glue_grammars,
                                      pass_through=options.pass_through,
                                      default_symbol=options.default_symbol)

    # 2. Joint distribution, step 1: intersect the grammar with the source lattice
    log('[%d] Parse source DFA', seg.id)
    goal_maker = GoalRuleMaker(goal_str=options.goal, start_str=options.start, n=0)
    src_dfa = make_input_dfa(seg)
    root = grammar.fetch(Nonterminal(options.start))
    src_forest = parse_dfa(grammar,
                           root,
                           src_dfa,
                           goal_maker.get_iview(),
                           bottomup=True,
                           constraint=HieroConstraints(grammar, src_dfa, options.max_span))

    if not src_forest:
        raise ValueError('I cannot parse the input lattice: i) make sure your grammar has glue rules; ii) make sure it handles OOVs')

    # 3. Project the source forest onto the target side
    log('[%d] Project target rules', seg.id)
    tgt_forest = make_target_forest(src_forest)

    # 4. Joint distribution, step 2: exact scoring with the local joint model
    log('[%d] Joint model: (exact) local scoring', seg.id)
    result.joint = exact_rescoring(joint_model.local_model(), tgt_forest, goal_maker, log)

    # persist the joint distribution where a destination is configured
    if 'joint.forest' in saving:
        pickle_it(saving['joint.forest'], result.joint.forest)
    if 'joint.components' in saving:
        pickle_it(saving['joint.components'], result.joint.components)

    if joint_only:
        # no conditional model: report an empty conditional part
        result.conditional.forest = None
        result.conditional.components = []
        return result

    # 5. Conditional distribution, step 1: intersect the joint forest with the reference lattice
    log('[%d] Parse reference DFA', seg.id)
    ref_dfa = make_reference_dfa(seg)
    goal_maker.update()
    ref_forest = parse_dfa(result.joint.forest,
                           0,
                           ref_dfa,
                           goal_maker.get_oview(),
                           bottomup=False)

    if ref_forest:
        # 6. Conditional distribution, step 2: exact scoring with the local conditional model
        log('[%d] Conditional model: exact (local) scoring', seg.id)
        result.conditional = exact_rescoring(conditional_model.local_model(), ref_forest, goal_maker, log)
    else:
        # the reference is not reachable: keep the (falsy) forest and no components
        log('[%d] References cannot be parsed', seg.id)
        result.conditional.forest = ref_forest
        result.conditional.components = []

    # persist the conditional distribution where a destination is configured
    if 'conditional.forest' in saving:
        pickle_it(saving['conditional.forest'], result.conditional.forest)
    if 'conditional.components' in saving:
        pickle_it(saving['conditional.components'], result.conditional.components)

    return result