def mk_relation(tstamp, local_id_parent, local_id_child, label):
    """Build a `Relation` linking a parent unit to a child unit.

    Parameters
    ----------
    tstamp : object
        Timestamp source; its `next()` method is called exactly once and
        the result is used both for the relation id and for the
        `creation-date` metadata entry.
    local_id_parent : string
        Local id of the source (parent) of the relation.
    local_id_child : string
        Local id of the target (child) of the relation.
    label : string
        Relation type.

    Returns
    -------
    rel : Relation
        Relation authored by `stacparser`, with empty features.
    """
    author = 'stacparser'
    endpoints = RelSpan(local_id_parent, local_id_child)
    created = tstamp.next()
    # the annotation id is derived from the (author, date) pair
    anno_id = stac_glozz.anno_id_from_tuple((author, created))
    meta = {
        'author': author,
        'creation-date': str(created),
    }
    return Relation(rel_id=anno_id,
                    span=endpoints,
                    rtype=label,
                    features={},
                    metadata=meta)
def strip_cdus(self, sloppy=False, mode='head'):
    """Remove every CDU from this graph, rewiring the edges that used them.

    Each relation that has a CDU as source or target is deleted and
    replaced by one or more relations on the elements of that CDU.
    Non-head modes may add new edges to the graph.

    Parameters
    ----------
    sloppy: boolean, default=False
        See `cdu_head`.

    mode: string, default='head'
        Strategy for replacing edges involving CDUs.
        `head` will relocate the edge on the recursive head of the CDU
        (see `recursive_cdu_heads`).
        `broadcast` will distribute the edge over all EDUs belonging to
        the CDU. A copy of the edge will be created for each of them.
        If the edge's source and target are both distributed, a new
        copy will be created for each combination of EDUs.
        `custom` (or any other string) will distribute or relocate on
        the head depending on the relation label.
    """
    # Labels for which the source node is distributed in custom mode
    distribute_source = frozenset(
        ('Acknowledgement', 'Explanation', 'Comment', 'Continuation',
         'Narration', 'Contrast', 'Parallel', 'Background'))
    # Labels for which the target node is distributed in custom mode
    distribute_target = frozenset(
        ('Result', 'Continuation', 'Narration', 'Comment', 'Contrast',
         'Parallel', 'Background', 'Elaboration'))

    # Warning: the keys of this mapping are hyperedges
    cdu_heads = self.recursive_cdu_heads(sloppy)

    def member_edus(node):
        """Return the list of all EDUs contained by a node."""
        if self.is_cdu(node):
            return [member
                    for member in self.cdu_members(node, deep=True)
                    if self.is_edu(member)]
        return [node]

    def attachment_points(node, label, distributive):
        """Return the nodes that replace `node` as an edge endpoint."""
        if not self.is_cdu(node):
            return [node]
        if mode == 'head':
            # head mode never distributes
            spread = False
        else:
            # broadcast mode always distributes; otherwise the relation
            # label decides
            spread = mode == 'broadcast' or label in distributive
        if spread:
            return member_edus(node)
        return [cdu_heads[self.mirror(node)]]

    # Convert all edges in order
    for stale_edge in self.relations():
        endpoints = self.links(stale_edge)
        # Verify the edge is well-formed
        assert (len(endpoints) == 2)
        if not any(self.is_cdu(point) for point in endpoints):
            # No CDU to strip: skip
            continue

        stale_attrs = self.edge_attributes(stale_edge)
        stale_anno = self.annotation(stale_edge)
        source, target = endpoints
        sources = attachment_points(source, stale_anno.type,
                                    distribute_source)
        targets = attachment_points(target, stale_anno.type,
                                    distribute_target)

        # Remove the old edge from both the graph and the annotations
        self.del_edge(stale_edge)
        self.doc.relations.remove(stale_anno)

        # Build a new edge for each (source, target) combination; the
        # index keeps counting over skipped self-loops
        pairs = itertools.product(sources, targets)
        for idx, (new_src, new_tgt) in enumerate(pairs):
            if new_src == new_tgt:
                print("WARNING: something is pointing to its own CDU : " +
                      str(new_src))
                continue

            # First, build a new Relation for the annotation layer
            src_anno = self.annotation(new_src)
            tgt_anno = self.annotation(new_tgt)
            fresh_anno = Relation(
                '{0}_{1}'.format(stale_anno._anno_id, idx),
                RelSpan(src_anno._anno_id, tgt_anno._anno_id),
                stale_anno.type,
                dict())
            fresh_anno.source = src_anno
            fresh_anno.target = tgt_anno
            self.doc.relations.append(fresh_anno)

            # Second, build a new graph edge carrying the old attributes
            fresh_edge = '{0}_{1}'.format(stale_edge, idx)
            fresh_attrs = dict(stale_attrs)
            fresh_attrs['annotation'] = fresh_anno
            self.add_edge(fresh_edge)
            self.add_edge_attributes(fresh_edge, fresh_attrs.items())
            self.link(new_src, fresh_edge)
            self.link(new_tgt, fresh_edge)

    # Now all the CDUs are edge-orphaned, remove them from the graph
    for cdu_edge in self.cdus():
        self.del_node(self.mirror(cdu_edge))
        self.del_edge(cdu_edge)
    # Same for annotation-level CDUs
    self.doc.schemas = [schema for schema in self.doc.schemas
                        if not stac.is_cdu(schema)]
def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    The EDUs of `anno_doc` are compared, turn by turn, with those of
    `unanno_doc`. Duplicated, merged and split segments are mapped to a
    replacement, segments guessed to be CDUs are turned into schemas, and
    the support of relations and schemas in `anno_doc` is rewritten
    accordingly. `anno_doc` (and some of its annotations) is modified in
    place.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Document that was the starting point of the annotation process;
        its turns and EDUs serve as the reference segmentation.
        (type presumed identical to `anno_doc` -- TODO confirm)
    anno_doc : GlozzDocument
        Document to filter
    verbose : int
        Verbosity level ; any non-zero value prints remaining conflicts,
        values above 1 also print guessed CDUs and EDU merges.

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten.

    Raises
    ------
    ValueError
        If a sequence of segments covers an annotated segment but is
        neither a CDU (no relation support) nor made only of original
        EDUs.
    """
    # anno_map: annotation -> its replacement (accumulated over all turns)
    anno_map = dict()
    # cautious_map: mappings whose relations should be marked for review
    cautious_map = dict()
    # schemas created from segments that are guessed to be CDUs
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [
            x for x in unanno_doc.units
            if is_edu(x) and turn.span.encloses(x.span)
        ]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [
            x for x in anno_doc.units
            if is_edu(x) and turn.span.encloses(x.span)
        ]

        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        # (the sort puts segments whose id is not in u_ids first, so that
        # pairs are of the form (new segment, original segment))
        dup_items = [(elt_a, elt_b) for elt_a, elt_b in itertools.combinations(
            sorted(a_edus,
                   key=lambda x: (x.local_id() in u_ids, x.local_id())), 2)
                     if (span_eq(elt_a.text_span(), elt_b.text_span(), eps=1)
                         and elt_b.local_id() in u_ids)]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]
        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        # NOTE(review): the sort key actually used below is
        # (type in DIALOGUE_ACTS, local_id), in ascending order -- confirm
        # it matches the ordering described above
        pw_conflicts = [(elt_a, elt_b)
                        for elt_a, elt_b in itertools.combinations(
                            sorted(a_edus,
                                   key=lambda x: (x.type in DIALOGUE_ACTS,
                                                  x.local_id())), 2)
                        if elt_a.overlaps(elt_b)]

        # * Two cases are very close: EDU merges, and CDUs
        # annotations (after mapping) that are an endpoint of a relation
        rels_support = set(
            anno_map.get(x, x) for rel in anno_doc.relations
            for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        # NOTE(review): itertools.groupby only groups consecutive items and
        # pw_conflicts is not explicitly sorted by its second element here
        # -- confirm pairs sharing the same elt_b are always adjacent
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)
            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if ((approximate_cover(sorted_a, elt_b)
                 or elt_b.text_span().encloses(span_seq_a))):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU ; the schema reuses the
                    # local id and the metadata of elt_b
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid, sch_units, sch_relas,
                                     sch_schms, sch_stype, sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n {}'.format(
                            new_cdu, elt_b,
                            '\n '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    # all covered segments are original EDUs: EDU merge
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n {}'.format(
                            elt_b, '\n '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(
                        err_msg.format(', '.join(str(y) for y in sorted_a),
                                       elt_b))

        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        cdu_map = dict()
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)
        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)
        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b
                            and cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = dict()  # elt_a -> list of elt_b
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if ((elt_a.local_id() in u_ids
                 and approximate_cover(sorted_b, elt_a))):
                edu_splits[elt_a] = sorted_b
        # conflicts explained by a split are no longer conflicts
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in set(edu_splits.keys())]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        # whatever remains at this point is an unexplained conflict
        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join(' {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                                for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    # (same mappings as above, expressed on local ids)
    anno_map_id = {x.local_id(): y.local_id() for x, y in anno_map.items()}
    cautious_map_id = {
        x.local_id(): y.local_id()
        for x, y in cautious_map.items()
    }
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [
        x for x in anno_doc.units
        if (not is_edu(x) or x.local_id() not in anno_map_id)
    ]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)

    # rewrite the support of relations and schemas
    # local id -> annotation, for all annotations kept in anno_doc
    objects = {
        x.local_id(): x
        for x in itertools.chain(anno_doc.units, anno_doc.relations,
                                 anno_doc.schemas)
    }
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        src = anno_map_id.get(rel.span.t1, rel.span.t1)
        tgt = anno_map_id.get(rel.span.t2, rel.span.t2)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review
        # NOTE(review): the lookup uses the rewritten ids (src, tgt) while
        # cautious_map_id is keyed by the ids of the elements before
        # mapping -- confirm this is the intended test
        if src in cautious_map_id or tgt in cautious_map_id:
            rel.type = _SPLIT_PREFIX + rel.type
    # * rewrite the support of schemas
    for sch in anno_doc.schemas:
        # sch.id = sch.id
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        sch.type = sch.type
        # sch.features = sch.features
        # sch.metadata = sch.metadata
        sch.span = sch.units | sch.relations | sch.schemas
        sch.fleshout(objects)

    return anno_doc
def read_node(node, context=None):
    """Convert a Glozz XML element into the corresponding Python value.

    The element is dispatched on its tag (and, for `metadata` and
    `positioning`, on `context`); children are read recursively.

    Parameters
    ----------
    node : xml element
        Element to read; only `tag`, `attrib`, `text`, `findall` and
        iteration over children are used.
    context : string or None
        Tag of the enclosing element when it changes how `node` is read:
        `'annotations'` for `metadata`; `'unit'`, `'relation'` or
        `'schema'` for `positioning`.

    Returns
    -------
    value : object
        * `annotations`: tuple (hashcode or None, units, relations,
          schemas)
        * `characterisation`: tuple (type, feature dict)
        * `feature`: tuple (name, stripped text or None)
        * `featureSet`: dict built from the `feature` children
        * `metadata`: the `corpusHashcode` attribute in the
          `'annotations'` context, else a dict from child tag to
          stripped text
        * `positioning`: `Span`, `RelSpan` or a triple of frozensets
          (units, relations, schemas), depending on `context`
        * `relation`, `schema`, `unit`: `Relation`, `Schema`, `Unit`
        * `singlePosition`: the `index` attribute as an int
        * `start`, `end`: value of the `singlePosition` child
        * `term`, `embedded-unit`, `embedded-relation`,
          `embedded-schema`: the `id` attribute
        * `type`: stripped text
        * any other tag (or a `positioning` in another context): None

    Raises
    ------
    GlozzException
        If the positioning of a relation does not have exactly two terms.
    """
    def get_one(name, default, ctx=None):
        """Read the single child called `name` in context `ctx`."""
        def read_child(child):
            return read_node(child, ctx)
        # presumably returns `default` when there is no such child;
        # see `on_single_element`
        return on_single_element(node, default, read_child, name)

    def get_all(name):
        """Read every direct child called `name`, without context."""
        return [read_node(child) for child in node.findall(name)]

    tag = node.tag
    if tag == 'annotations':
        hashcode = get_one('metadata', '', 'annotations')
        # compare by value: identity of string literals is an
        # implementation detail
        if hashcode == '':
            hashcode = None
        units = get_all('unit')
        rels = get_all('relation')
        schemas = get_all('schema')
        return (hashcode, units, rels, schemas)

    elif tag == 'characterisation':
        fs = get_one('featureSet', {})
        unit_type = get_one('type', None)
        return (unit_type, fs)

    elif tag == 'feature':
        attr = node.attrib['name']
        val = node.text.strip() if node.text else None
        return (attr, val)

    # TODO throw exception if we see more than one instance of a key
    elif tag == 'featureSet':
        return dict(get_all('feature'))

    elif tag == 'metadata' and context == 'annotations':
        return node.attrib['corpusHashcode']

    elif tag == 'metadata':
        return {child.tag: child.text.strip() for child in node}

    elif tag == 'positioning' and context == 'unit':
        start = get_one('start', None)
        end = get_one('end', None)
        return Span(start, end)

    elif tag == 'positioning' and context == 'relation':
        terms = get_all('term')
        if len(terms) != 2:
            raise GlozzException(
                "Was expecting exactly 2 terms, but got %d" % len(terms))
        return RelSpan(terms[0], terms[1])

    elif tag == 'positioning' and context == 'schema':
        units = frozenset(get_all('embedded-unit'))
        relations = frozenset(get_all('embedded-relation'))
        schemas = frozenset(get_all('embedded-schema'))
        return units, relations, schemas

    elif tag == 'relation':
        rel_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'relation')
        metadata = get_one('metadata', {})
        return Relation(rel_id, span, unit_type, fs, metadata=metadata)

    elif tag == 'schema':
        anno_id = node.attrib['id']
        (anno_type, fs) = get_one('characterisation', None)
        units, rels, schemas = get_one('positioning', None, 'schema')
        metadata = get_one('metadata', {})
        return Schema(anno_id, units, rels, schemas, anno_type, fs,
                      metadata=metadata)

    elif tag == 'singlePosition':
        return int(node.attrib['index'])

    elif tag == 'start' or tag == 'end':
        return get_one('singlePosition', None)

    elif tag in ['term', 'embedded-unit', 'embedded-relation',
                 'embedded-schema']:
        return node.attrib['id']

    elif tag == 'type':
        return node.text.strip()

    elif tag == 'unit':
        unit_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'unit')
        metadata = get_one('metadata', {})
        return Unit(unit_id, span, unit_type, fs, metadata=metadata)
def __init__(self, id, start, end):
    """Initialise as a `Relation` from `start` to `end`.

    The relation gets `id` as identifier, an empty string as type and
    an empty feature dictionary.
    """
    endpoints = RelSpan(start, end)
    Relation.__init__(self, id, endpoints, '', dict())
def _mk_doc(self):
    """Create an educe.annotation.Document from this graph.

    Discourse unit names are single letters: the offset of a unit is the
    distance of its name from 'a', and every EDU spans one character of
    `string.ascii_lowercase`, which serves as the document text.
    Each created EDU, CDU and relation is also recorded in
    `self.anno_map`.

    Returns
    -------
    doc : Document
        Document with one segment and one turn per EDU, one schema per
        CDU, one relation per entry of `self.down`, and a dialogue unit
        covering all of them.

    Raises
    ------
    ValueError
        If a relation tag is neither 'S' nor 'C'.
    """
    def offset(name):
        return ord(name) - ord('a')

    def to_glozz_id(name):
        return 'du_' + str(offset(name))

    def is_edu(name):
        return name not in self.cdus

    def relation_name(tag):
        # expand the one-letter tag into a relation label
        if tag == 'S':
            return 'Q-Elab'
        if tag == 'C':
            return 'Contrast'
        raise ValueError('Unknown tag {0}'.format(tag))

    units = []
    schemas = []
    relations = []

    # one segment and one turn per EDU
    for du_name, speaker_set in self.speakers.items():
        if not is_edu(du_name):
            continue
        begin = offset(du_name)
        edu_id = to_glozz_id(du_name)
        segment = Unit(edu_id, Span(begin, begin + 1), 'Segment', dict())
        speaker = list(speaker_set)[0]
        turn_feats = {'Identifier': begin, 'Emitter': speaker}
        turn = Unit('t' + edu_id, Span(begin, begin + 1), 'Turn',
                    turn_feats)
        self.anno_map[du_name] = segment
        units.append(segment)
        units.append(turn)

    # one schema per CDU, members split into EDUs and embedded CDUs
    for du_name, sub_names in self.cdus.items():
        edu_members = set(to_glozz_id(x) for x in sub_names if is_edu(x))
        cdu_members = set(to_glozz_id(x) for x in sub_names
                          if not is_edu(x))
        schema = Schema(to_glozz_id(du_name), edu_members, set(),
                        cdu_members, 'Complex_discourse_unit', dict())
        self.anno_map[du_name] = schema
        schemas.append(schema)

    # one relation per outgoing link; ids are numbered in creation order
    for src_name in self.down:
        for tgt_name, rel_tag in self.down[src_name]:
            rel_id = 'rel_' + str(len(relations))
            endpoints = RelSpan(to_glozz_id(src_name),
                                to_glozz_id(tgt_name))
            relation = Relation(rel_id, endpoints,
                                relation_name(rel_tag), dict())
            self.anno_map[(src_name, tgt_name)] = relation
            relations.append(relation)

    # the dialogue spans from 0 to the furthest end of any unit
    last_char = max(u.text_span().char_end for u in units)
    units.append(Unit('dialogue_0', Span(0, last_char), 'Dialogue', {}))

    return Document(units, relations, schemas, string.ascii_lowercase)