def check_matches(tgt_doc, matches, strict=True):
    """Check that the matches cover the whole target document text.

    The target document text is expected to be a subsequence of the
    source document text (the source document is expected to be an
    "augmented" version of the target with new text interspersed
    throughout).

    Parameters
    ----------
    tgt_doc :
        Target document. This function uses its `text()` method and its
        `origin` and `units` attributes.
    matches : list of (int, int, int)
        List of triples (i, j, n) representing matching subsequences:
        a[i:i+n] == b[j:j+n], where j indexes the target text.
        See `difflib.SequenceMatcher.get_matching_blocks`.
        Triples are read positionally, so both `difflib` named tuples
        and plain tuples are accepted.
    strict : boolean
        If True, raise an exception if there are match gaps in the
        target document, otherwise just print the gaps to stderr.

    Raises
    ------
    WeaveException
        If the target text is non-empty and there are no matches, if
        the first match does not start at offset 0 of the target, if
        there are gaps and `strict` is True, or if the last match does
        not end at the end of the target text.
    """
    tgt_text = tgt_doc.text()
    if not tgt_text:
        # an empty target document is trivially covered
        return
    if not matches:
        raise WeaveException('no matches in non-empty target doc')

    # positional access (rather than `.b`) also works for plain triples,
    # consistent with the unpacking of the last match below
    first_tgt_start = matches[0][1]
    if first_tgt_start != 0:
        oops = ('matches ({}) do not start at beginning of target '
                'document <{}>').format(matches[0], tgt_doc.origin)
        raise WeaveException(oops)

    gaps = tgt_gaps(matches)
    if gaps:
        # we might want to give some slack because gaps can result from
        # manual rewrites that happened here and there in the soclogs
        # e.g. a pair of logical not (¬) around _ => ^_^
        # in these cases, just print them on stderr for quick checks
        for gap in gaps:
            # a gap is read as (start offset, length) in the target text
            gap_txt = tgt_text[gap[0]:gap[0] + gap[1]]
            print(u"Match gap in tgt doc ({})\t{}\t{}".format(
                tgt_doc.origin, gap, gap_txt), file=sys.stderr)
        # context shared by all the gaps above: it does not depend on the
        # individual gap, so it is reported once, on the same stream
        print('Matches: ', matches, file=sys.stderr)
        unit_turn_ids = (turn_id(x) for x in tgt_doc.units)
        tgt_turns = set(tid for tid in unit_turn_ids if tid is not None)
        print('Turns: ', sorted(tgt_turns), file=sys.stderr)
        if strict:
            oops = 'there are match gaps in the target document {}: {}'
            raise WeaveException(oops.format(tgt_doc.origin, gaps))

    # the last match must run up to the very end of the target text
    _, tgt, size = matches[-1]
    if tgt + size != len(tgt_text):
        raise WeaveException('matches do not cover the full target '
                             'document')
def check_matches(tgt_doc, matches, strict=True):
    """Check that the matches cover the whole target document text.

    The target document text is expected to be a subsequence of the
    source document text (the source document is expected to be an
    "augmented" version of the target with new text interspersed
    throughout).

    Parameters
    ----------
    tgt_doc :
        Target document. This function uses its `text()` method and its
        `origin` and `units` attributes.
    matches : list of (int, int, int)
        List of triples (i, j, n) representing matching subsequences:
        a[i:i+n] == b[j:j+n], where j indexes the target text.
        See `difflib.SequenceMatcher.get_matching_blocks`.
        Triples are read positionally, so both `difflib` named tuples
        and plain tuples are accepted.
    strict : boolean
        If True, raise an exception if there are match gaps in the
        target document, otherwise just print the gaps to stderr.

    Raises
    ------
    WeaveException
        If the target text is non-empty and there are no matches, if
        the first match does not start at offset 0 of the target, if
        there are gaps and `strict` is True, or if the last match does
        not end at the end of the target text.
    """
    tgt_text = tgt_doc.text()
    if not tgt_text:
        # an empty target document is trivially covered
        return
    if not matches:
        raise WeaveException('no matches in non-empty target doc')

    # positional access (rather than `.b`) also works for plain triples,
    # consistent with the unpacking of the last match below
    first_tgt_start = matches[0][1]
    if first_tgt_start != 0:
        oops = ('matches ({}) do not start at beginning of target '
                'document <{}>').format(matches[0], tgt_doc.origin)
        raise WeaveException(oops)

    gaps = tgt_gaps(matches)
    if gaps:
        # we might want to give some slack because gaps can result from
        # manual rewrites that happened here and there in the soclogs
        # e.g. a pair of logical not (¬) around _ => ^_^
        # in these cases, just print them on stderr for quick checks
        for gap in gaps:
            # a gap is read as (start offset, length) in the target text
            gap_txt = tgt_text[gap[0]:gap[0] + gap[1]]
            print(u"Match gap in tgt doc ({})\t{}\t{}".format(
                tgt_doc.origin, gap, gap_txt), file=sys.stderr)
        # context shared by all the gaps above: it does not depend on the
        # individual gap, so it is reported once, on the same stream
        print('Matches: ', matches, file=sys.stderr)
        unit_turn_ids = (turn_id(x) for x in tgt_doc.units)
        tgt_turns = set(tid for tid in unit_turn_ids if tid is not None)
        print('Turns: ', sorted(tgt_turns), file=sys.stderr)
        if strict:
            oops = 'there are match gaps in the target document {}: {}'
            raise WeaveException(oops.format(tgt_doc.origin, gaps))

    # the last match must run up to the very end of the target text
    _, tgt, size = matches[-1]
    if tgt + size != len(tgt_text):
        raise WeaveException('matches do not cover the full target '
                             'document')
def get_eduinfo(config, doc, context, rstuff, edu):
    """Extract the interesting parts of an EDU.

    :: ... -> EduInfo

    The dialogue act is reported as "?" when the EDU type is "Segment"
    or when `config.emit_dialogue_acts` is false; otherwise it is the
    EDU type. Addressees are read from the ";"-separated "Addressee"
    feature and collapse to the single placeholder "?" when the feature
    is missing or contains the unset marker.
    """
    turn = context[edu].turn
    surface_act = edu.features.get("Surface_Act", "?")

    # only a typed (non-"Segment") EDU can carry a dialogue act, and
    # only when the configuration asks for dialogue acts at all
    has_act = edu.type != "Segment" and config.emit_dialogue_acts
    dialogue_act = edu.type if has_act else "?"

    raw_addressees = edu.features.get("Addressee", STAC_UNSET)
    addressees = {name.strip() for name in raw_addressees.split(";")}
    if STAC_UNSET in addressees or not addressees:
        addressees = {"?"}

    edu_turn_id = turn_id(turn)
    edu_text = doc.text(edu.text_span())
    speaker = turn.features["Emitter"]
    return EduInfo(edu=edu,
                   turn_id=edu_turn_id,
                   dialogue_act=dialogue_act,
                   text=edu_text,
                   surface_act=surface_act,
                   speaker=speaker,
                   addressees=addressees,
                   rstuff=rstuff)
def _get_turn_info(self, anno):
    """Return the speaker and turn id for a unit-level annotation.

    The first turn of `self.turns` whose span encloses the span of
    `anno` is used. When no turn encloses the annotation, the result
    is `(None, None)`.
    """
    candidates = [turn for turn in self.turns
                  if turn.span.encloses(anno.span)]
    if not candidates:
        return None, None
    first_turn = candidates[0]
    return stac_anno.speaker(first_turn), stac_anno.turn_id(first_turn)
def subgrouping(self):
    """Abstract subgrouping the EDU is in (here: turn stars).

    See also
    --------
    educe.stac.context.merge_turn_stars

    Returns
    -------
    subgrouping: string
        Result of `self._doc.global_id` applied to 't' followed by the
        turn id of `self.tstar`.
    """
    to_global_id = self._doc.global_id
    local_name = 't' + str(turn_id(self.tstar))
    return to_global_id(local_name)