def read_tags(corpus, root_dir):
    """
    Read stored POS tagger output from a directory, and convert them
    to educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to lists of tokens.
    """
    pos_tags = {}
    for k in corpus:
        doc = corpus[k]
        turns = sorted_by_span(x for x in doc.units if stac.is_turn(x))
        tagged_file = tagger_file_name(k, root_dir)
        raw_toks = ext.read_token_file(tagged_file)
        pos_tags[k] = []
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text(turn.text_span()))
            start = turn.span.char_start + len(prefix)
            toks = ext.token_spans(body, seg, start)
            for t in toks:
                t.origin = doc
                dtxt = doc.text(t.text_span())
                assert dtxt == t.word
            pos_tags[k].extend(toks)
    return pos_tags

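# Usage sketch (not from the original module): how read_tags might be
# invoked.  Assumes 'corpus' was slurped with an educe STAC reader and
# that 'data/pos-tagged' is a hypothetical directory of tagger output.
tags = read_tags(corpus, 'data/pos-tagged')
for file_id, toks in tags.items():
    print('%s: %d tokens' % (file_id, len(toks)))
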
def tgt_html(grandparent, anno, naughty=False):
    """
    Describe the given annotation in HTML and append that description
    to the given HTML grandparent node.
    """
    parent = h.span(grandparent)
    h.span(parent, anno_code(anno))
    type_span = h.span(parent, '[%s] ' % anno.type)
    if naughty:
        type_span.attrib['class'] = 'naughty'
    if anno in contexts:
        turn = contexts[anno].turn
        turn_info = stac.split_turn_text(doc.text(turn.span))[0]
        turn_splits = turn_info.split(":")
        if len(turn_splits) > 1:
            tid = ET.SubElement(parent, 'b')
            tid.text = turn_splits[0] + ":"
            h.span(parent, ":".join(turn_splits[1:]))
        else:
            h.span(parent, turn_info)
    if not stac.is_relation_instance(anno):
        t_text = text(anno)
        if stac.is_cdu(anno):
            trange = turn_range(anno)
            if trange:
                h.elem(parent, 'b', trange)
        h.span(parent, text=snippet(t_text, 100),
               attrib={'class': 'snippet'})
        h.span(parent, ' %s' % anno.text_span())
    return parent

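# Usage sketch (hypothetical names): append a description of one
# annotation to a container node; 'overview' and 'anno' stand in for
# whatever element and annotation the caller has at hand.
overview = ET.Element('div')
tgt_html(overview, anno, naughty=False)
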
def read_tags(corpus, dir):
    """
    Read stored POS tagger output from a directory, and convert them
    to educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to lists of tokens.
    """
    pos_tags = {}
    for k in corpus:
        doc = corpus[k]
        turns = sorted_by_span(filter(stac.is_turn, doc.units))
        tagged_file = tagger_file_name(k, dir)
        raw_toks = ext.read_token_file(tagged_file)
        pos_tags[k] = []
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text_for(turn))
            start = turn.span.char_start + len(prefix)
            toks = ext.token_spans(body, seg, start)
            for t in toks:
                t.origin = doc
                dtxt = doc.text_for(t)
                assert dtxt == t.word
            pos_tags[k].extend(toks)
    return pos_tags

def is_disconnected(gra, contexts, node):
    """
    An EDU is considered disconnected unless:

    * it has an incoming link or
    * it has an outgoing Conditional link
    * it's at the beginning of a dialogue

    In principle we don't need to look at EDUs that are disconnected
    on the outgoing end because (1) it can be legitimate for
    non-dialogue-ending EDUs to not have outgoing links and (2) such
    information would be redundant with the incoming anyway
    """
    def rel_type(rel):
        "relation type for a given link (string)"
        return gra.annotation(gra.mirror(rel)).type

    edu = gra.annotation(node)
    if edu not in contexts:
        return True
    else:
        ctx = contexts[edu]
        first_turn_span = ctx.dialogue_turns[0].text_span()
        first_turn_text = gra.doc.text(first_turn_span)
        first_turn_pref = stac.split_turn_text(first_turn_text)[0]
        first_turn_start = first_turn_span.char_start + len(first_turn_pref)
        rel_links = [x for x in gra.links(node) if gra.is_relation(x)]
        has_incoming = any(node == gra.rel_links(x)[1] for x in rel_links)
        has_outgoing_whitelist = any(node == gra.rel_links(r)[0]
                                     and rel_type(r) in BACKWARDS_WHITELIST
                                     for r in rel_links)
        is_at_start = edu.text_span().char_start == first_turn_start
        return not (has_incoming or has_outgoing_whitelist or is_at_start)

def html(self):
    doc = self.doc
    contexts = self.contexts
    t = self.unit
    parent = ET.Element('span')
    html_anno_id(parent, self.unit)
    html_span(parent, " " + anno_code(t))
    type_span = html_span(parent, '[%s] ' % t.type)
    if t in contexts:
        turn = contexts[t].turn
        turn_info = stac.split_turn_text(doc.text(turn.span))[0]
        turn_splits = turn_info.split(":")
        if len(turn_splits) > 1:
            tid = ET.SubElement(parent, 'b')
            tid.text = turn_splits[0] + ":"
            trest = html_span(parent, ":".join(turn_splits[1:]))
        else:
            html_span(parent, turn_info)
    t_span = t.text_span()
    t_text = doc.text(t_span)
    if t_span.char_start > 0:
        before_idx = t_span.char_start - 1
        before_sp = html_span(parent, doc.text()[before_idx])
        before_sp.attrib['class'] = 'spillover'
    text_sp = html_span(parent, t_text)
    text_sp.attrib['class'] = 'snippet'
    if t_span.char_end < len(doc.text()):
        after_idx = t_span.char_end
        after_sp = html_span(parent, doc.text()[after_idx])
        after_sp.attrib['class'] = 'spillover'
    html_span(parent, ' %s' % t_span)
    return parent

def is_disconnected(gra, contexts, node):
    """Return True if an EDU is disconnected from a discourse structure.

    An EDU is considered disconnected unless:

    * it has an incoming link or
    * it has an outgoing Conditional link or
    * it's at the beginning of a dialogue

    In principle we don't need to look at EDUs that are disconnected
    on the outgoing end because (1) it can be legitimate for
    non-dialogue-ending EDUs to not have outgoing links and (2) such
    information would be redundant with the incoming anyway.
    """
    def rel_type(rel):
        "relation type for a given link (string)"
        return gra.annotation(gra.mirror(rel)).type

    edu = gra.annotation(node)
    if edu not in contexts:
        return True
    else:
        ctx = contexts[edu]
        first_turn_span = ctx.dialogue_turns[0].text_span()
        first_turn_text = gra.doc.text(first_turn_span)
        first_turn_pref = stac.split_turn_text(first_turn_text)[0]
        first_turn_start = first_turn_span.char_start + len(first_turn_pref)
        rel_links = [x for x in gra.links(node) if gra.is_relation(x)]
        has_incoming = any(node == gra.rel_links(x)[1] for x in rel_links)
        has_outgoing_whitelist = any(
            node == gra.rel_links(r)[0]
            and rel_type(r) in BACKWARDS_WHITELIST
            for r in rel_links)
        is_at_start = edu.text_span().char_start == first_turn_start
        return not (has_incoming or has_outgoing_whitelist or is_at_start)

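# Hedged usage sketch: collect the disconnected EDU nodes in a graph.
# Assumes 'gra' is an educe STAC hypergraph exposing an edus() iterator
# over EDU node names (as in educe.graph) and 'contexts' is the usual
# annotation-to-Context map.
unattached = [n for n in gra.edus() if is_disconnected(gra, contexts, n)]
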
def turn_id_text(doc):
    """
    Return a list of (turn ids, text) tuples in span order (no speaker)
    """
    turns = sorted((x for x in doc.units if stac.is_turn(x)),
                   key=lambda k: k.text_span())
    return [(stac.turn_id(turn),
             stac.split_turn_text(doc.text(turn.text_span()))[1])
            for turn in turns]

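# Usage sketch: print each turn id alongside its speaker-stripped text,
# 'doc' being one GlozzDocument from the corpus.
for tid, ttext in turn_id_text(doc):
    print('%s\t%s' % (tid, ttext))
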
def _get_turn_info(self, u):
    enclosing_turns = [t for t in self.turns if t.span.encloses(u.span)]
    if len(enclosing_turns) > 0:
        turn = enclosing_turns[0]
        speaker = turn.features['Emitter']
        turn_text = stac.split_turn_text(self.doc.text(turn.span))[0]
        turn_id = turn_text.split(':')[0].strip()
        return speaker, turn_id
    else:
        return None, None

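# Usage sketch (hypothetical receiver): recover the speaker and turn id
# for a unit-level annotation 'u' enclosed by some turn; 'layer' stands
# in for whatever object this method is bound to.
speaker, turn_id = layer._get_turn_info(u)
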
def html_turn_info(self, parent, turn):
    """
    Given a turn annotation, append a prettified HTML representation
    of the turn text (highlighting parts of it, such as the turn
    number)
    """
    turn_text = self.doc.text(turn.text_span())
    turn_info = stac.split_turn_text(turn_text)[0]
    turn_splits = turn_info.split(":")
    if len(turn_splits) > 1:
        tid = turn_splits[0]
        trest = turn_splits[1:]
        h.elem(parent, 'b', text=tid + ":")
        h.span(parent, text=":".join(trest))
    else:
        h.span(parent, turn_info)

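# Usage sketch (hypothetical names): render a turn header into a fresh
# ElementTree node; 'report' is whatever object this method lives on.
parent = ET.Element('span')
report.html_turn_info(parent, turn)
print(ET.tostring(parent))
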
def tgt_html(grandparent, t, naughty=False):
    def tid(x):
        "turn id (int) for an annotation, if we have one"
        if x in contexts:
            tid_str = contexts[x].turn.features['Identifier']
            return int(tid_str) if tid_str else None
        else:
            return None

    parent = html_span(grandparent)
    html_span(parent, anno_code(t))
    type_span = html_span(parent, '[%s] ' % t.type)
    if naughty:
        type_span.attrib['class'] = 'naughty'
    if t in contexts:
        turn = contexts[t].turn
        turn_info = stac.split_turn_text(doc.text(turn.span))[0]
        turn_splits = turn_info.split(":")
        if len(turn_splits) > 1:
            # distinct name so we don't shadow the tid() helper,
            # which is still needed below
            tid_el = ET.SubElement(parent, 'b')
            tid_el.text = turn_splits[0] + ":"
            trest = html_span(parent, ":".join(turn_splits[1:]))
        else:
            html_span(parent, turn_info)
    if not stac.is_relation_instance(t):
        t_span = t.text_span()
        t_text = doc.text(t_span)
        if stac.is_cdu(t):
            tids = [x for x in map(tid, t.terminals()) if x]
            if tids:
                tspan = ET.SubElement(parent, 'b')
                min_tid = min(tids)
                max_tid = max(tids)
                if min_tid == max_tid:
                    tspan.text = "%d: " % min_tid
                else:
                    tspan.text = "%d-%d: " % (min_tid, max_tid)
        text_sp = html_span(parent, snippet(t_text, 100))
        text_sp.attrib['class'] = 'snippet'
        html_span(parent, ' %s' % t_span)
    return parent

def read_tags(corpus, root_dir):
    """Read stored POS tagger output from a directory, and convert them
    to educe.annotation.Standoff objects.

    Return a dictionary mapping 'FileId's to lists of tokens.

    Parameters
    ----------
    corpus : dict(FileId, GlozzDocument)
        Dictionary of documents keyed by their FileId.
    root_dir : str
        Path to the directory containing the output of the POS tagger,
        one file per document.

    Returns
    -------
    pos_tags : dict(FileId, list(Token))
        Map from each document id to the list of tokens predicted by a
        POS tagger.
    """
    pos_tags = {}
    for k in corpus:
        doc = corpus[k]
        turns = sorted_by_span(x for x in doc.units if stac.is_turn(x))
        tagged_file = tagger_file_name(k, root_dir)
        raw_toks = ext.read_token_file(tagged_file)
        pos_tags[k] = []
        for turn, seg in zip(turns, raw_toks):
            prefix, body = stac.split_turn_text(doc.text(turn.text_span()))
            start = turn.span.char_start + len(prefix)
            toks = ext.token_spans(body, seg, start)
            for t in toks:
                t.origin = doc
                dtxt = doc.text(t.text_span())
                assert dtxt == t.word
            pos_tags[k].extend(toks)
    return pos_tags

def read_corenlp_result(doc, corenlp_doc, tid=None):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc : educe Document (?)
        The original document (?)
    corenlp_doc : educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document
    tid : turn id
        Turn id (?)

    Returns
    -------
    corenlp_doc : CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    def is_matching_turn(x):
        """Check whether x corresponds to the current turn"""
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = stac.turn_id(x)
            return stac.is_turn(x) and tid == x_tid

    turns = sorted((x for x in doc.units if is_matching_turn(x)),
                   key=lambda k: k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = ('Uh-oh, mismatch between number of turns in the corpus '
               '(%d) and parsed sentences (%d) %s'
               % (len(turns), len(sentences), doc.origin))
        raise Exception(msg)

    sentence_toks = defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        # the token offsets are global, ie. for all sentences/turns
        # in the file; so we have to shift them to the left to zero
        # them and then shift them back to the right
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])
        ttext = doc.text(turn.text_span())
        offset = (turn.span.char_start
                  + len(stac.split_turn_text(ttext)[0])
                  - sentence_begin)
        for t in sentence_toks[sid]:
            tid = t['id']
            educe_tokens[sid][tid] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # FIXME tokens are probably not properly ordered because token
        # ids are global ids, i.e. strings like "1-18" (sentence 1,
        # token 18) which means basic sorting ranks "1-10" before "1-2"
        # cf. educe.rst_dt.corenlp
        sorted_tokens = [tokens_dict[x] for x in sorted(tokens_dict.keys())]
        # end FIXME
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_tree = ConstituencyTree.build(tree, sorted_tokens)
        deps = defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        all_tokens.extend(sorted_tokens)
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for ctr, chain in enumerate(corenlp_doc.get_coref_chains()):
        mentions = []
        for m in chain:
            sid = m['sentence']
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            start = local_id(m['start'])
            end = local_id(m['end'])
            token_range = [global_id(x) for x in range(start, end)]
            tokens = [educe_tokens[sid][t] for t in token_range]
            head = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)

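# Usage sketch: read one parsed document back in.  Assumes the
# PreprocessingSource reader from educe.external.stanford_xml_reader and
# the parsed_file_name helper used elsewhere in this module; 'k' is a
# FileId and 'root_dir' the pipeline output directory.
reader = PreprocessingSource()
reader.read(parsed_file_name(k, root_dir), suffix='')
corenlp_anno = read_corenlp_result(corpus[k], reader)
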
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated)
    documents in the corpus and save the results in the specified
    directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh
    this out later.  We also intend to tweak the notion of splitting
    by aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.
    """
    # for each document, how many digits do we need to represent the
    # turns in that document; for essentially cosmetic purposes (padding)
    digits = {}
    for d in frozenset([k.doc for k in corpus]):
        turns = []
        for k in corpus:
            if k.doc == d:
                turns.extend(filter(stac.is_turn, corpus[k].units))
        turn_ids = [int(t.features['Identifier']) for t in turns]
        digits[d] = max(2, int(math.ceil(math.log10(max(turn_ids)))))

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]
        turns = sorted(filter(stac.is_turn, doc.units),
                       key=lambda k: k.span)
        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None
        if split:
            for turn in turns:
                ttext = stac.split_turn_text(doc.text_for(turn))[1]
                tid = turn.features['Identifier']
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(digits[k.doc])
                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)
                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print >> f, ttext
                txt_files.append(txt_file)
        else:
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for turn in turns:
                    ttext = stac.split_turn_text(doc.text_for(turn))[1]
                    print >> f, ttext
            txt_files.append(txt_file)

    # manifest tells corenlp what files to read as input
    manifest_dir = os.path.join(outdir, 'tmp')
    manifest_file = os.path.join(manifest_dir, 'manifest')
    with codecs.open(manifest_file, 'w', 'utf-8') as f:
        print >> f, '\n'.join(txt_files)

    # java properties to control behaviour of corenlp
    properties = [] if split else ['ssplit.eolonly=true']
    props_file = os.path.join(manifest_dir, 'corenlp.properties')
    with codecs.open(props_file, 'w', 'utf-8') as f:
        print >> f, '\n'.join(properties)

    # run corenlp (will take a while for it to load its various models)
    jars = [x for x in os.listdir(corenlp_dir)
            if os.path.splitext(x)[1] == '.jar']
    cp_sep = ':' if os.name != 'nt' else ';'

    corenlp_outdir = os.path.join(outdir, 'corenlp')
    if not os.path.exists(corenlp_outdir):
        os.makedirs(corenlp_outdir)

    cmd = ['java',
           '-cp', cp_sep.join(jars),
           '-Xmx3g',
           'edu.stanford.nlp.pipeline.StanfordCoreNLP',
           '-filelist', manifest_file,
           '-props', props_file,
           '-outputDirectory', corenlp_outdir]
    subprocess.call(cmd, cwd=corenlp_dir)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        k, tid = from_corenlp_output_filename(sfile)
        from_path = os.path.join(corenlp_outdir, sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)

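# Usage sketch: run the pipeline over a corpus.  CORENLP_DIR is a
# hypothetical path to an unpacked Stanford CoreNLP distribution (the
# directory containing its jars); results would land under
# 'data/pipeline'.
CORENLP_DIR = '/opt/stanford-corenlp'
run_pipeline(corpus, 'data/pipeline', CORENLP_DIR, split=False)
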
def read_corenlp_result(doc, corenlp_doc, tid=None):
    def is_matching_turn(x):
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = x.features['Identifier']
            return stac.is_turn(x) and tid == x_tid

    turns = sorted(filter(is_matching_turn, doc.units),
                   key=lambda k: k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = ('Uh-oh, mismatch between number of turns in the corpus '
               '(%d) and parsed sentences (%d) %s'
               % (len(turns), len(sentences), doc.origin))
        raise Exception(msg)

    sentence_toks = collections.defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = collections.defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        # the token offsets are global, ie. for all sentences/turns
        # in the file; so we have to shift them to the left to zero
        # them and then shift them back to the right
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])
        ttext = doc.text_for(turn)
        offset = (turn.span.char_start
                  + len(stac.split_turn_text(ttext)[0])
                  - sentence_begin)
        for t in sentence_toks[sid]:
            tid = t['id']
            educe_tokens[sid][tid] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        tokens = educe_tokens[sid]
        tree = nltk.tree.Tree(sent['parse'])
        educe_tree = ConstituencyTree.build(tree, tokens.values())
        deps = collections.defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens, sid + '-0')
        all_tokens.extend(tokens.values())
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for ctr, chain in enumerate(corenlp_doc.get_coref_chains()):
        mentions = []
        for m in chain:
            sid = m['sentence']
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            start = local_id(m['start'])
            end = local_id(m['end'])
            token_range = map(global_id, range(start, end))
            tokens = [educe_tokens[sid][t] for t in token_range]
            head = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)

def ttext(turn):
    return stac.split_turn_text(doc.text(turn.text_span()))[1]

def ttext(turn):
    return stac.split_turn_text(doc.text_for(turn))[1]

def ttext(turn):
    """Get the turn text"""
    return stac.split_turn_text(doc.text(turn.text_span()))[1]

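# Illustration of the contract every snippet above relies on: STAC turn
# text begins with a '<turn id> : <speaker> : ' prefix, and
# stac.split_turn_text splits that prefix off from the body.  The
# example values here are made up.
prefix, body = stac.split_turn_text('26 : gotwood4sheep : anyone got wood?')
# prefix == '26 : gotwood4sheep : ', body == 'anyone got wood?'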