def turns_with_final_emoticons(doc, tags):
    """
    Return a tuple of two lists of turns.

    Both lists contain the turns in a document that end with the
    pattern EDU emoticon-only-EDU, in the order given by
    `sorted_turns(doc)`.

    The first (main) list contains those whose final EDU is not pointed
    to by any relations or schema (as reported by `has_links`). The
    second (warnings only) list contains those whose final EDU does
    have relations or schema pointing to it.

    The reason we distinguish between the two lists is that we don't
    want to touch those in the latter (out of conservatism, the idea of
    removing these from their relations, CDUs seems scary), but we want
    to know about them.

    Turns enclosing fewer than two units cannot match the pattern and
    are skipped; this includes turns that enclose nothing at all.
    """
    egraph = EnclosureGraph(doc, tags)
    affected_free_turns = []
    affected_linked_turns = []

    for turn in sorted_turns(doc):
        edus = sorted_first_widest(egraph.inside(turn))
        # Check the length *before* indexing: a turn that encloses
        # nothing would otherwise raise IndexError on edus[-1].
        if len(edus) < 2:
            continue

        last_edu = edus[-1]
        if not is_just_emoticon(egraph.inside(last_edu)):
            continue

        if has_links(doc, last_edu):
            affected_linked_turns.append(turn)
        else:
            affected_free_turns.append(turn)

    return affected_free_turns, affected_linked_turns
def merge_final_emoticons(tcache, turn_spans, doc, tags):
    """
    Fold the trailing emoticon EDU of selected turns into the EDU
    that precedes it.

    Inputs are a timestamp cache (`tcache`), a collection of text spans
    identifying the turns with final emoticons (`turn_spans`), the
    document to work on, and the tags used to build the enclosure graph.

    For every turn of the document whose text span is in `turn_spans`:

    1. look up the timestamp associated with the turn's id
    2. absorb the last enclosed EDU into the second-to-last one
    3. remove the last EDU from the document's units

    Each selected turn is expected to enclose at least two EDUs; this
    is checked with an assertion.

    This modifies the document in place and does not return anything.
    """
    enclosure = EnclosureGraph(doc, tags)
    # Lazily filter, so each turn's span is tested right before that
    # turn is processed.
    selected_turns = (candidate for candidate in sorted_turns(doc)
                      if candidate.text_span() in turn_spans)

    for turn in selected_turns:
        inner = sorted_first_widest(enclosure.inside(turn))
        assert len(inner) > 1

        timestamp = tcache.get(educe.stac.turn_id(turn))
        emoticon_edu = inner[-1]
        host_edu = inner[-2]
        absorb_emoticon(doc, timestamp, host_edu, emoticon_edu)
        doc.units.remove(emoticon_edu)