def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of
    problem)

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates: Updates
    """
    updates = Updates()

    # start by assuming nothing matches (cases 2 and 5); matched
    # annotations are pruned out of these lists below
    updates.expected_src_only.extend(src_doc.units)
    updates.abnormal_tgt_only.extend(tgt_doc.units)

    # cases 1, 2 and 4
    for src, tgt, size in matches:
        offset = src - tgt
        # cases 1 and 2: record the span shift implied by this match
        updates.shift_if_ge[tgt] = offset
        src_annos = enclosed(Span(src, src + size), src_doc.units)
        tgt_annos = enclosed(Span(tgt, tgt + size), tgt_doc.units)
        for src_anno in src_annos:
            # matched on the source side: prune from case 5
            updates.expected_src_only.remove(src_anno)
            src_span = src_anno.text_span()
            equivalents = [tgt_anno for tgt_anno in tgt_annos
                           if tgt_anno.text_span().shift(offset) == src_span]
            if not equivalents:
                # case 4: nothing in target sits on the same shifted span
                updates.abnormal_src_only.append(src_anno)
            for tgt_anno in equivalents:
                # matched on the target side: prune from case 2
                if tgt_anno in updates.abnormal_tgt_only:
                    updates.abnormal_tgt_only.remove(tgt_anno)
    return updates
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of
    problem)

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates: Updates
    """
    result = Updates()
    # cases 2 and 5: begin with every unit flagged as unmatched, then
    # prune away everything a match accounts for
    result.expected_src_only.extend(src_doc.units)
    result.abnormal_tgt_only.extend(tgt_doc.units)
    # cases 1, 2 and 4
    for src_pos, tgt_pos, length in matches:
        shift = src_pos - tgt_pos
        result.shift_if_ge[tgt_pos] = shift  # cases 1 and 2
        annos_in_src = enclosed(Span(src_pos, src_pos + length),
                                src_doc.units)
        annos_in_tgt = enclosed(Span(tgt_pos, tgt_pos + length),
                                tgt_doc.units)
        for anno in annos_in_src:
            result.expected_src_only.remove(anno)  # prune from case 5
            anno_span = anno.text_span()
            twins = [candidate for candidate in annos_in_tgt
                     if candidate.text_span().shift(shift) == anno_span]
            if twins:
                for twin in twins:
                    # prune from case 2
                    if twin in result.abnormal_tgt_only:
                        result.abnormal_tgt_only.remove(twin)
            else:
                # case 4: source annotation with no target counterpart
                result.abnormal_src_only.append(anno)
    return result
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus(args)
    for key in corpus:
        doc = corpus[key]
        dialogues = [unit for unit in doc.units
                     if educe.stac.is_dialogue(unit)]
        edus = [unit for unit in doc.units if educe.stac.is_edu(unit)]
        for dialogue in dialogues:
            dia_span = dialogue.text_span()
            # one tab-separated row per dialogue:
            # friendly id, local id, number of EDUs it encloses
            row = (friendly_dialogue_id(key, dia_span),
                   dialogue.local_id(),
                   len(enclosed(dia_span, edus)))
            print("\t".join(str(cell) for cell in row))
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus(args)
    for key in corpus:
        doc = corpus[key]
        units = doc.units
        dialogues = [x for x in units if educe.stac.is_dialogue(x)]
        edus = [x for x in units if educe.stac.is_edu(x)]
        for anno in dialogues:
            dspan = anno.text_span()
            # count the EDUs that fall within this dialogue
            edu_count = len(enclosed(dspan, edus))
            cells = [str(friendly_dialogue_id(key, dspan)),
                     str(anno.local_id()),
                     str(edu_count)]
            print('\t'.join(cells))
def enclosed_lemmas(span, parses):
    """
    Given a span and a list of parses, return any lemmas that are
    within that span
    """
    lemmas = []
    # keep only tokens whose span falls inside `span`
    for token in enclosed(span, parses.tokens):
        lemmas.append(token.features["lemma"])
    return lemmas
def stretch_match_many(updates, src_doc, tgt_doc, doc_span_src, doc_span_tgt,
                       annos_src, annos_tgt, verbose=0):
    """Compute n-m stretch matches between `annos_src` and `annos_tgt`.

    Sequences of contiguous unmatched annotations on each side are paired
    up in order; a source/target pair of sequences is considered a match
    when their (hollowed-out, whitespace-normalized) texts coincide.

    Parameters
    ----------
    updates : Update
    src_doc : Document
    tgt_doc : Document
    doc_span_src : Span
    doc_span_tgt : Span
    annos_src : list of educe.annotation
        Unmatched annotations in `span_src`.
    annos_tgt : list of educe.annotation
        Unmatched annotations in `span_tgt`.
    verbose : int
        Verbosity level

    Returns
    -------
    res : Update
        Possibly trimmed version of `updates`.
    """
    # unmatched structs in src, in document order
    cands_src = enclosed(Span(doc_span_src[0], doc_span_src[1]), annos_src)
    cands_src = sorted(cands_src, key=lambda x: x.span)
    spans_src = [anno.text_span() for anno in cands_src]
    # unmatched structs in tgt, in document order
    cands_tgt = enclosed(Span(doc_span_tgt[0], doc_span_tgt[1]), annos_tgt)
    cands_tgt = sorted(cands_tgt, key=lambda x: x.span)
    spans_tgt = [anno.text_span() for anno in cands_tgt]
    if not (spans_src and spans_tgt):
        # nothing to match on at least one side
        return updates
    # many to many match between source and target
    seqs_src = find_continuous_seqs(src_doc, spans_src, cands_src)
    seqs_tgt = find_continuous_seqs(tgt_doc, spans_tgt, cands_tgt)
    # TODO if both sequences span the same text (for common turns), use
    # stretched target annotations
    for seq_src, seq_tgt in zip(seqs_src, seqs_tgt):
        seq_spans_src = [spans_src[i] for i in seq_src]
        seq_annos_src = [cands_src[i] for i in seq_src]
        span_seq_src = Span(seq_spans_src[0].char_start,
                            seq_spans_src[-1].char_end)
        seq_spans_tgt = [spans_tgt[i] for i in seq_tgt]
        seq_annos_tgt = [cands_tgt[i] for i in seq_tgt]
        span_seq_tgt = Span(seq_spans_tgt[0].char_start,
                            seq_spans_tgt[-1].char_end)
        # compare (hollowed) text; the original also fetched
        # src_doc.text(span=span_seq_src) here, but that value was dead
        # (immediately overwritten), so the call has been dropped
        txt_src = hollow_out_missing_turn_text(
            src_doc, tgt_doc,
            doc_span_src=span_seq_src,
            doc_span_tgt=span_seq_tgt).replace('\t ', '').replace('\t', '')
        txt_tgt = tgt_doc.text(span=span_seq_tgt)
        if txt_tgt.strip() == txt_src.strip():
            if verbose:
                print('Many-to-many stretch match:\n',
                      'source:\n',
                      '\n'.join(str(x) for x in seq_annos_src),
                      '\ntarget:\n',
                      '\n'.join(str(x) for x in seq_annos_tgt))
            updates = update_updates(updates, seq_annos_src, seq_annos_tgt,
                                     verbose=verbose)
    return updates
def stretch_match(updates, src_doc, tgt_doc, doc_span_src, doc_span_tgt,
                  annos_src, annos_tgt, verbose=0):
    """Compute stretch matches between `annos_src` and `annos_tgt`.

    Tries, in order: exact 1-1 span matches, 1-1 "stretch" matches
    (same text modulo missing turns and whitespace), many-to-1 matches
    (merges), then 1-to-many matches (splits), folding each finding
    into `updates` via `update_updates`.

    Parameters
    ----------
    updates : Update
    src_doc : Document
    tgt_doc : Document
    doc_span_src : Span
    doc_span_tgt : Span
    annos_src : list of educe.annotation
        Unmatched annotations in `span_src`.
    annos_tgt : list of educe.annotation
        Unmatched annotations in `span_tgt`.
    verbose : int
        Verbosity level

    Returns
    -------
    res : Update
        Possibly trimmed version of `updates`.
    """
    # unmatched structs in src
    cands_src = enclosed(Span(doc_span_src[0], doc_span_src[1]), annos_src)
    spans_src = [anno.text_span() for anno in cands_src]
    # unmatched structs in tgt
    cands_tgt = enclosed(Span(doc_span_tgt[0], doc_span_tgt[1]), annos_tgt)
    spans_tgt = [anno.text_span() for anno in cands_tgt]
    # {one,many} to one match between source and target
    # FIXME separate matching procedures for:
    # * Turn
    # * paragraph
    # * segment (+ segments with different names in units/ ?)
    for span_tgt, cand_tgt in zip(spans_tgt, cands_tgt):
        # span_tgt cast onto src_doc
        shifted_span_tgt = shift_span(span_tgt, updates)
        # 1-1 match on the exact (translated) same span
        src_equiv = [cand_src
                     for span_src, cand_src in zip(spans_src, cands_src)
                     if span_src == shifted_span_tgt]
        # 1-1 stretch match, based on comparing the text of the turns
        # that are common to source and target
        txt_tgt = tgt_doc.text(span=span_tgt).strip()
        # compute the maximal extension of span_src: include whitespaces
        # immediately before ...
        # NOTE(review): a 10-character context window is assumed wide
        # enough to capture all padding whitespace — TODO confirm
        lctx_src = [src_doc.text(span=Span(span_src.char_start - 10,
                                           span_src.char_start))
                    for span_src in spans_src]
        lpad_src = [len(x) - len(x.rstrip()) for x in lctx_src]
        # ... and after
        rctx_src = [src_doc.text(span=Span(span_src.char_end,
                                           span_src.char_end + 10))
                    for span_src in spans_src]
        rpad_src = [len(x) - len(x.lstrip()) for x in rctx_src]
        # create the corresponding extended spans
        ext_spans_src = [Span(span_src.char_start - lpad,
                              span_src.char_end + rpad)
                         for span_src, lpad, rpad
                         in zip(spans_src, lpad_src, rpad_src)]
        # keep source candidates whose hollowed text equals the target
        # text and whose extended span covers the shifted target span
        src_equiv_stretch = [
            cand_src for span_src, ext_span_src, cand_src
            in zip(spans_src, ext_spans_src, cands_src)
            if ((txt_tgt == hollow_out_missing_turn_text(
                src_doc, tgt_doc, doc_span_src=span_src,
                doc_span_tgt=span_tgt
            ).replace('\t ', '').replace('\t', '').strip())
                and ext_span_src.encloses(shifted_span_tgt))]
        # extend list of 1-1 exact matches with 1-1 stretch matches
        if src_equiv_stretch:
            src_equiv.extend(src_equiv_stretch)
            if verbose:
                print('1-to-1 stretch match: ',
                      [str(x) for x in src_equiv_stretch])
                print('for target annotation: ', cand_tgt)
        if src_equiv:
            updates = update_updates(updates, src_equiv, [cand_tgt],
                                     verbose=verbose)
        else:
            # many to 1 match between source and target
            #
            # search for a sequence of contiguous annotations in source
            # that covers the same span as a single annotation of the
            # same type in target ; this is supposed to capture the
            # result of `stac-edit merge-{dialogue,edu}`
            src_equiv_cands = enclosed(shifted_span_tgt, cands_src)
            src_equiv_seq = sorted(src_equiv_cands, key=lambda x: x.span)
            # if the sequence covers the targeted span
            if ((src_equiv_seq
                 and src_equiv_seq[0].span.char_start ==
                 shifted_span_tgt.char_start
                 and src_equiv_seq[-1].span.char_end ==
                 shifted_span_tgt.char_end)):
                # and has no gap or just whitespaces
                gap_str = ''.join(
                    src_doc.text(span=Span(elt_cur.span.char_end,
                                           elt_nex.span.char_start))
                    for elt_cur, elt_nex in zip(src_equiv_seq[:-1],
                                                src_equiv_seq[1:])
                )
                gap_str = gap_str.strip()
                if not gap_str:
                    updates = update_updates(
                        updates, src_equiv_seq, [cand_tgt],
                        verbose=verbose)
                    if verbose:
                        print('Guess: {} results from a merge on {}'.format(
                            str(cand_tgt),
                            [str(x) for x in src_equiv_seq]),
                              file=sys.stderr)
    # re-shift target spans now that 1-1 / n-1 matches above may have
    # altered `updates`
    shifted_spans_tgt = [shift_span(span_tgt, updates)
                         for span_tgt in spans_tgt]  # WIP
    # one to many match between source and target
    for span_src, cand_src in zip(spans_src, cands_src):
        # search for a sequence of contiguous annotations in target
        # that covers the same span as a single annotation of the
        # same type in source ; this is supposed to capture the
        # result of `stac-edit split-{dialogue,edu}`
        tgt_equiv_cands = [(shifted_span_tgt, cand_tgt)
                           for shifted_span_tgt, cand_tgt
                           in zip(shifted_spans_tgt, cands_tgt)
                           if span_src.encloses(shifted_span_tgt)]
        # NOTE(review): sorting (Span, annotation) pairs relies on
        # both elements being orderable — TODO confirm
        tgt_equiv_seq = sorted(tgt_equiv_cands)
        # if the sequence covers the source span
        if ((tgt_equiv_seq
             and tgt_equiv_seq[0][0].char_start == span_src.char_start
             and tgt_equiv_seq[-1][0].char_end == span_src.char_end)):
            # and has no gap or just whitespaces
            gap_str = ''.join(
                tgt_doc.text(span=Span(elt_cur[1].span.char_end,
                                       elt_nex[1].span.char_start))
                for elt_cur, elt_nex in zip(tgt_equiv_seq[:-1],
                                            tgt_equiv_seq[1:])
            )
            gap_str = gap_str.strip()
            if not gap_str:
                updates = update_updates(
                    updates, [cand_src],
                    [x[1] for x in tgt_equiv_seq],
                    verbose=verbose
                )
                if verbose:
                    print('Guess: {} results from a split on {}'.format(
                        [str(x[1]) for x in tgt_equiv_seq],
                        str(cand_src)), file=sys.stderr)
    return updates
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of
    problem)

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates: Updates
    """
    res = Updates()
    # case 2 and 5 (to be pruned below)
    res.expected_src_only.extend(src_doc.units)
    res.abnormal_tgt_only.extend(tgt_doc.units)

    # WIP separate matching procedures for EDUs, turns, paragraphs,
    # dialogues and the rest (seemingly only resources).
    def is_various(annotation):
        """None of {edu, turn, paragraph, dialogue}.

        It seems to capture only Resources (to be confirmed).
        """
        return not(is_edu(annotation) or is_turn(annotation)
                   or is_paragraph(annotation) or is_dialogue(annotation))

    # case 1, 2 and 4
    for src, tgt, size in matches:
        tgt_to_src = src - tgt
        res.shift_if_ge[tgt] = tgt_to_src  # case 1 and 2
        src_annos = enclosed(Span(src, src + size), src_doc.units)
        tgt_annos = enclosed(Span(tgt, tgt + size), tgt_doc.units)
        # WIP separate matching procedures for the different types of
        # annotations; restricting the candidate pools per type avoids
        # spurious cross-type span matches
        for anno_type in [is_edu, is_turn, is_paragraph, is_dialogue,
                          is_various]:
            cands_src = [x for x in src_annos if anno_type(x)]
            cands_tgt = [x for x in tgt_annos if anno_type(x)]
            # compute (shifted) spans once
            spans_src = [x.text_span() for x in cands_src]
            spans_tgt = [x.text_span().shift(tgt_to_src)
                         for x in cands_tgt]
            # loop over source annotations
            for src_span, src_anno in zip(spans_src, cands_src):
                # NOTE(review): list.remove raises ValueError if the same
                # annotation is enclosed by two matches — presumably
                # matches never overlap; TODO confirm
                res.expected_src_only.remove(src_anno)  # prune from case 5
                tgt_equiv = [tgt_anno
                             for tgt_span, tgt_anno
                             in zip(spans_tgt, cands_tgt)
                             if tgt_span == src_span]
                if not tgt_equiv:  # case 4
                    res.abnormal_src_only.append(src_anno)
                for tgt_anno in tgt_equiv:  # prune from case 2
                    if tgt_anno in res.abnormal_tgt_only:
                        res.abnormal_tgt_only.remove(tgt_anno)
    return res
def stretch_match(updates, src_doc, tgt_doc, doc_span_src, doc_span_tgt,
                  annos_src, annos_tgt, verbose=0):
    """Compute stretch matches between `annos_src` and `annos_tgt`.

    Attempts, in order: exact 1-1 span matches, 1-1 "stretch" matches
    (same text modulo missing turns and whitespace), many-to-1 matches
    (merges), then 1-to-many matches (splits); each finding is folded
    into `updates` via `update_updates`.

    Parameters
    ----------
    updates : Update
    src_doc : Document
    tgt_doc : Document
    doc_span_src : Span
    doc_span_tgt : Span
    annos_src : list of educe.annotation
        Unmatched annotations in `span_src`.
    annos_tgt : list of educe.annotation
        Unmatched annotations in `span_tgt`.
    verbose : int
        Verbosity level

    Returns
    -------
    res : Update
        Possibly trimmed version of `updates`.
    """
    # unmatched structs in src
    cands_src = enclosed(Span(doc_span_src[0], doc_span_src[1]), annos_src)
    spans_src = [anno.text_span() for anno in cands_src]
    # unmatched structs in tgt
    cands_tgt = enclosed(Span(doc_span_tgt[0], doc_span_tgt[1]), annos_tgt)
    spans_tgt = [anno.text_span() for anno in cands_tgt]
    # {one,many} to one match between source and target
    # FIXME separate matching procedures for:
    # * Turn
    # * paragraph
    # * segment (+ segments with different names in units/ ?)
    for span_tgt, cand_tgt in zip(spans_tgt, cands_tgt):
        # span_tgt cast onto src_doc
        shifted_span_tgt = shift_span(span_tgt, updates)
        # 1-1 match on the exact (translated) same span
        src_equiv = [
            cand_src for span_src, cand_src in zip(spans_src, cands_src)
            if span_src == shifted_span_tgt
        ]
        # 1-1 stretch match, based on comparing the text of the turns
        # that are common to source and target
        txt_tgt = tgt_doc.text(span=span_tgt).strip()
        # compute the maximal extension of span_src: include whitespaces
        # immediately before ...
        # NOTE(review): a 10-character context window is assumed wide
        # enough to capture all padding whitespace — TODO confirm
        lctx_src = [
            src_doc.text(span=Span(span_src.char_start - 10,
                                   span_src.char_start))
            for span_src in spans_src
        ]
        lpad_src = [len(x) - len(x.rstrip()) for x in lctx_src]
        # ... and after
        rctx_src = [
            src_doc.text(span=Span(span_src.char_end,
                                   span_src.char_end + 10))
            for span_src in spans_src
        ]
        rpad_src = [len(x) - len(x.lstrip()) for x in rctx_src]
        # create the corresponding extended spans
        ext_spans_src = [
            Span(span_src.char_start - lpad, span_src.char_end + rpad)
            for span_src, lpad, rpad in zip(spans_src, lpad_src, rpad_src)
        ]
        # keep source candidates whose hollowed text equals the target
        # text and whose extended span covers the shifted target span
        src_equiv_stretch = [
            cand_src for span_src, ext_span_src, cand_src in zip(
                spans_src, ext_spans_src, cands_src)
            if ((txt_tgt == hollow_out_missing_turn_text(
                src_doc, tgt_doc, doc_span_src=span_src,
                doc_span_tgt=span_tgt
            ).replace('\t ', '').replace('\t', '').strip())
                and ext_span_src.encloses(shifted_span_tgt))
        ]
        # extend list of 1-1 exact matches with 1-1 stretch matches
        if src_equiv_stretch:
            src_equiv.extend(src_equiv_stretch)
            if verbose:
                print('1-to-1 stretch match: ',
                      [str(x) for x in src_equiv_stretch])
                print('for target annotation: ', cand_tgt)
        if src_equiv:
            updates = update_updates(updates, src_equiv, [cand_tgt],
                                     verbose=verbose)
        else:
            # many to 1 match between source and target
            #
            # search for a sequence of contiguous annotations in source
            # that covers the same span as a single annotation of the
            # same type in target ; this is supposed to capture the
            # result of `stac-edit merge-{dialogue,edu}`
            src_equiv_cands = enclosed(shifted_span_tgt, cands_src)
            src_equiv_seq = sorted(src_equiv_cands, key=lambda x: x.span)
            # if the sequence covers the targeted span
            if ((src_equiv_seq
                 and (src_equiv_seq[0].span.char_start ==
                      shifted_span_tgt.char_start)
                 and (src_equiv_seq[-1].span.char_end ==
                      shifted_span_tgt.char_end))):
                # and has no gap or just whitespaces
                gap_str = ''.join(
                    src_doc.text(span=Span(elt_cur.span.char_end,
                                           elt_nex.span.char_start))
                    for elt_cur, elt_nex in zip(src_equiv_seq[:-1],
                                                src_equiv_seq[1:]))
                gap_str = gap_str.strip()
                if not gap_str:
                    updates = update_updates(updates, src_equiv_seq,
                                             [cand_tgt], verbose=verbose)
                    if verbose:
                        print('Guess: {} results from a merge on {}'.format(
                            str(cand_tgt),
                            [str(x) for x in src_equiv_seq]),
                              file=sys.stderr)
    # re-shift target spans: the matches found above may have altered
    # `updates`
    shifted_spans_tgt = [
        shift_span(span_tgt, updates) for span_tgt in spans_tgt
    ]  # WIP
    # one to many match between source and target
    for span_src, cand_src in zip(spans_src, cands_src):
        # search for a sequence of contiguous annotations in target
        # that covers the same span as a single annotation of the
        # same type in source ; this is supposed to capture the
        # result of `stac-edit split-{dialogue,edu}`
        tgt_equiv_cands = [
            (shifted_span_tgt, cand_tgt)
            for shifted_span_tgt, cand_tgt
            in zip(shifted_spans_tgt, cands_tgt)
            if span_src.encloses(shifted_span_tgt)
        ]
        # NOTE(review): sorting (Span, annotation) pairs relies on
        # both elements being orderable — TODO confirm
        tgt_equiv_seq = sorted(tgt_equiv_cands)
        # if the sequence covers the source span
        if ((tgt_equiv_seq
             and tgt_equiv_seq[0][0].char_start == span_src.char_start
             and tgt_equiv_seq[-1][0].char_end == span_src.char_end)):
            # and has no gap or just whitespaces
            gap_str = ''.join(
                tgt_doc.text(span=Span(elt_cur[1].span.char_end,
                                       elt_nex[1].span.char_start))
                for elt_cur, elt_nex in zip(tgt_equiv_seq[:-1],
                                            tgt_equiv_seq[1:]))
            gap_str = gap_str.strip()
            if not gap_str:
                updates = update_updates(updates, [cand_src],
                                         [x[1] for x in tgt_equiv_seq],
                                         verbose=verbose)
                if verbose:
                    print('Guess: {} results from a split on {}'.format(
                        [str(x[1]) for x in tgt_equiv_seq],
                        str(cand_src)), file=sys.stderr)
    return updates
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of
    problem)

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates: Updates
    """
    result = Updates()
    # cases 2 and 5: start from "everything unmatched"; matched
    # annotations are pruned from these lists below
    result.expected_src_only.extend(src_doc.units)
    result.abnormal_tgt_only.extend(tgt_doc.units)

    # WIP separate matching procedures for EDUs, turns, paragraphs,
    # dialogues and everything else (seemingly only resources).
    def is_other(annotation):
        """None of {edu, turn, paragraph, dialogue}.

        It seems to capture only Resources (to be confirmed).
        """
        return not (is_edu(annotation)
                    or is_turn(annotation)
                    or is_paragraph(annotation)
                    or is_dialogue(annotation))

    type_preds = (is_edu, is_turn, is_paragraph, is_dialogue, is_other)

    # cases 1, 2 and 4
    for src_pos, tgt_pos, length in matches:
        shift = src_pos - tgt_pos
        result.shift_if_ge[tgt_pos] = shift  # cases 1 and 2
        annos_in_src = enclosed(Span(src_pos, src_pos + length),
                                src_doc.units)
        annos_in_tgt = enclosed(Span(tgt_pos, tgt_pos + length),
                                tgt_doc.units)
        # WIP match separately within each annotation type
        for pred in type_preds:
            typed_src = [x for x in annos_in_src if pred(x)]
            typed_tgt = [x for x in annos_in_tgt if pred(x)]
            # compute (shifted) spans once
            src_spans = [x.text_span() for x in typed_src]
            tgt_spans = [x.text_span().shift(shift) for x in typed_tgt]
            # loop over source annotations
            for span, anno in zip(src_spans, typed_src):
                result.expected_src_only.remove(anno)  # prune from case 5
                twins = [tgt_anno
                         for tgt_span, tgt_anno in zip(tgt_spans, typed_tgt)
                         if tgt_span == span]
                if not twins:  # case 4
                    result.abnormal_src_only.append(anno)
                for twin in twins:  # prune from case 2
                    if twin in result.abnormal_tgt_only:
                        result.abnormal_tgt_only.remove(twin)
    return result