예제 #1
0
def check_matches(tgt_doc, matches, strict=True):
    """
    Check that the target document text is indeed a subsequence of
    the source document text (the source document is expected to be
    an "augmented" version of the target with new text interspersed
    throughout).

    Nothing is checked when the target text is empty.

    Parameters
    ----------
    tgt_doc :
        Target document; this function uses its `text()`, `origin`
        and `units`.
    matches : list of (int, int, int)
        List of triples (i, j, n) representing matching subsequences:
        a[i:i+n] == b[j:j+n], where `j` indexes the target text.
        See `difflib.SequenceMatcher.get_matching_blocks`.
        Plain tuples and named tuples are both accepted.
    strict : boolean
        If True, raise an exception if there are match gaps in the
        target document, otherwise just print the gaps to stderr.

    Raises
    ------
    WeaveException
        If the non-empty target has no matches, if the first match
        does not start at offset 0 of the target, if there are match
        gaps (strict mode only), or if the last match does not end at
        the end of the target text.
    """
    tgt_text = tgt_doc.text()

    if not tgt_text:
        return
    if not matches:
        raise WeaveException('no matches in non-empty target doc')
    # index positionally (j is the second field) so that plain triples,
    # as documented, work just like difflib's named tuples
    if matches[0][1] != 0:
        oops = ('matches ({}) do not start at beginning of target '
                'document <{}>').format(matches[0], tgt_doc.origin)
        raise WeaveException(oops)

    gaps = tgt_gaps(matches)
    if gaps:
        # we might want to give some slack because gaps can result from
        # manual rewrites that happened here and there in the soclogs
        # e.g. a pair of logical not (&not;) around _ => ^_^
        # in these cases, just print them on stderr for quick checks
        for gap in gaps:
            gap_txt = tgt_text[gap[0]:gap[0] + gap[1]]
            print(u"Match gap in tgt doc ({})\t{}\t{}".format(
                tgt_doc.origin, gap, gap_txt),
                  file=sys.stderr)
        print('Matches: ', matches)
        tgt_turns = set(
            turn_id(x) for x in tgt_doc.units if turn_id(x) is not None)
        print('Turns: ', sorted(tgt_turns))
        if strict:
            oops = 'there are match gaps in the target document {}: {}'
            raise WeaveException(oops.format(tgt_doc.origin, gaps))

    # the last match has to end exactly where the target text ends
    _, tgt, size = matches[-1]
    if tgt + size != len(tgt_text):
        raise WeaveException('matches do not cover the full target '
                             'document')
예제 #2
0
파일: weave.py 프로젝트: irit-melodi/educe
def check_matches(tgt_doc, matches, strict=True):
    """
    Check that the target document text is indeed a subsequence of
    the source document text (the source document is expected to be
    an "augmented" version of the target with new text interspersed
    throughout).

    An empty target text passes trivially.

    Parameters
    ----------
    tgt_doc :
        Target document; its `text()`, `origin` and `units` are used
        here.
    matches : list of (int, int, int)
        List of triples (i, j, n) representing matching subsequences:
        a[i:i+n] == b[j:j+n], with `j` an offset into the target text.
        See `difflib.SequenceMatcher.get_matching_blocks`.
        Either plain tuples or named tuples may be given.
    strict : boolean
        If True, raise an exception if there are match gaps in the
        target document, otherwise just print the gaps to stderr.

    Raises
    ------
    WeaveException
        When the non-empty target has no matches, when the first match
        does not start at the beginning of the target, when there are
        match gaps and `strict` is True, or when the matches stop
        short of the end of the target text.
    """
    tgt_text = tgt_doc.text()

    if not tgt_text:
        return
    if not matches:
        raise WeaveException('no matches in non-empty target doc')
    # use the positional field (index 1 is the target offset) rather
    # than the `.b` attribute, so the documented plain triples work too
    first_tgt_start = matches[0][1]
    if first_tgt_start != 0:
        oops = ('matches ({}) do not start at beginning of target '
                'document <{}>').format(matches[0], tgt_doc.origin)
        raise WeaveException(oops)

    gaps = tgt_gaps(matches)
    if gaps:
        # we might want to give some slack because gaps can result from
        # manual rewrites that happened here and there in the soclogs
        # e.g. a pair of logical not (&not;) around _ => ^_^
        # in these cases, just print them on stderr for quick checks
        for gap in gaps:
            gap_txt = tgt_text[gap[0]:gap[0] + gap[1]]
            print(u"Match gap in tgt doc ({})\t{}\t{}".format(
                tgt_doc.origin, gap, gap_txt), file=sys.stderr)
        print('Matches: ', matches)
        tgt_turns = set(turn_id(x) for x in tgt_doc.units
                        if turn_id(x) is not None)
        print('Turns: ', sorted(tgt_turns))
        if strict:
            oops = 'there are match gaps in the target document {}: {}'
            raise WeaveException(oops.format(tgt_doc.origin, gaps))

    # full coverage: the final match must reach the end of the target
    _, tgt, size = matches[-1]
    if tgt + size != len(tgt_text):
        raise WeaveException('matches do not cover the full target '
                             'document')
예제 #3
0
def get_eduinfo(config, doc, context, rstuff, edu):
    """
    Extract the interesting parts of an EDU.

    :: ... -> EduInfo

    The dialogue act is "?" when the EDU type is "Segment" or when
    `config.emit_dialogue_acts` is false; otherwise it is the EDU type.

    Addressees are read from the ";"-separated "Addressee" feature.
    They collapse to the single placeholder "?" when the feature is
    missing, contains the unset marker, or names nobody (blank).
    """
    turn = context[edu].turn
    surface_act = edu.features.get("Surface_Act", "?")

    if edu.type == "Segment" or not config.emit_dialogue_acts:
        dialogue_act = "?"
    else:
        dialogue_act = edu.type

    addressee_feat = edu.features.get("Addressee", STAC_UNSET)
    addressees = set(x.strip() for x in addressee_feat.split(";"))
    # "".split(";") gives [""], so blank entries must be dropped
    # explicitly or the emptiness check below could never fire
    addressees.discard("")
    if not addressees or STAC_UNSET in addressees:
        addressees = {"?"}

    return EduInfo(edu=edu,
                   turn_id=turn_id(turn),
                   dialogue_act=dialogue_act,
                   text=doc.text(edu.text_span()),
                   surface_act=surface_act,
                   speaker=turn.features["Emitter"],
                   addressees=addressees,
                   rstuff=rstuff)
예제 #4
0
파일: mkseg.py 프로젝트: popescuv/irit-stac
def get_eduinfo(config, doc, context, rstuff, edu):
    """
    Extract the interesting parts of an EDU.

    :: ... -> EduInfo

    The reported dialogue act is the EDU type, unless that type is
    "Segment" or `config.emit_dialogue_acts` is false, in which case
    it is "?".

    The "Addressee" feature is split on ";" into a set of names. The
    set is replaced by the placeholder {"?"} if the feature is absent,
    includes the unset marker, or contains no actual name.
    """
    turn = context[edu].turn
    surface_act = edu.features.get("Surface_Act", "?")

    if edu.type == "Segment" or not config.emit_dialogue_acts:
        dialogue_act = "?"
    else:
        dialogue_act = edu.type

    addressee_feat = edu.features.get("Addressee", STAC_UNSET)
    # splitting never yields an empty list ("" -> [""]), so filter out
    # blank names here; otherwise `not addressees` is dead code and a
    # blank feature would be reported as the addressee ""
    addressees = set(name for name in
                     (x.strip() for x in addressee_feat.split(";"))
                     if name)
    if not addressees or STAC_UNSET in addressees:
        addressees = {"?"}

    return EduInfo(edu=edu,
                   turn_id=turn_id(turn),
                   dialogue_act=dialogue_act,
                   text=doc.text(edu.text_span()),
                   surface_act=surface_act,
                   speaker=turn.features["Emitter"],
                   addressees=addressees,
                   rstuff=rstuff)
예제 #5
0
파일: graph.py 프로젝트: tjane/educe
 def _get_turn_info(self, anno):
     """Look up the speaker and turn id for a unit-level annotation.

     The first turn in `self.turns` whose span encloses the span of
     `anno` is used. Returns `(None, None)` when no turn encloses it.
     """
     candidates = [turn for turn in self.turns
                   if turn.span.encloses(anno.span)]
     if not candidates:
         return None, None
     first = candidates[0]
     return stac_anno.speaker(first), stac_anno.turn_id(first)
예제 #6
0
파일: graph.py 프로젝트: irit-melodi/educe
 def _get_turn_info(self, anno):
     """Return `(speaker, turn id)` for a unit-level annotation.

     Only the first of the turns in `self.turns` whose span encloses
     the annotation's span is considered; if there is no such turn,
     the result is `(None, None)`.
     """
     anno_turns = [t for t in self.turns if t.span.encloses(anno.span)]
     if len(anno_turns) == 0:
         return None, None
     enclosing = anno_turns[0]
     who = stac_anno.speaker(enclosing)
     tid = stac_anno.turn_id(enclosing)
     return who, tid
예제 #7
0
    def subgrouping(self):
        """Abstract subgrouping the EDU belongs to (here: turn stars)

        See also
        --------
        educe.stac.context.merge_turn_stars

        Return
        ------
        subgrouping: string
            Result of `self._doc.global_id` applied to 't' followed
            by the turn id of `self.tstar`.
        """
        tstar_id = turn_id(self.tstar)
        local_name = 't' + str(tstar_id)
        return self._doc.global_id(local_name)
예제 #8
0
파일: fusion.py 프로젝트: irit-melodi/educe
    def subgrouping(self):
        """Name the abstract subgrouping of this EDU (here: turn stars)

        See also
        --------
        educe.stac.context.merge_turn_stars

        Return
        ------
        subgrouping: string
            Global id, obtained from `self._doc.global_id`, for the
            local name 't' + the turn id of `self.tstar`.
        """
        local_id = 't' + str(turn_id(self.tstar))
        global_name = self._doc.global_id(local_id)
        return global_name