def parse_doc_ptb(doc_id, doc_tkd_toks):
    """Dirty PTB parser"""
    # get PTB trees
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # use tweaked tokens
    doc_tokens = doc_tkd_toks
    tokens_iter = iter(doc_tokens)

    trees = []
    lex_heads = []
    for tree in PTB_READER.parsed_sents(ptb_name):
        # apply standard cleaning to tree:
        # strip function tags, remove empty nodes
        tree_no_empty = prune_tree(tree, is_non_empty)
        tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                             strip_subcategory)
        # align the leaves of the cleaned tree with the document tokens
        leaves = tree_no_empty_no_gf.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
        trees.append(clean_tree)
        # lexicalize the PTB tree: find the head word of each constituent
        # constituents and their heads are designated by their Gorn address
        # ("tree position" in NLTK) in the tree
        lheads = find_lexical_heads(clean_tree)
        lex_heads.append(lheads)

    return trees  # , lex_heads


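# Illustrative aside (not part of the original module): the "Gorn address"
# mentioned in the comments above is what NLTK calls a "tree position",
# i.e. the tuple of child indices leading from the root to a node. A
# minimal, self-contained sketch of what these addresses look like:
from nltk.tree import Tree

_example_tree = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))")
print(_example_tree.treepositions())  # () is the root, (0, 1) the NN node, ...
print(_example_tree[(0, 1)])          # subtree at Gorn address (0, 1): (NN cat)
# find_lexical_heads is assumed to map each constituent's address to the
# address of its head leaf; lex_heads above stores one such mapping per tree.

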
def parse(self, doc): """Parse a document, using the gold PTB annotation. Given a document, return a list of educified PTB parse trees (one per sentence). These are almost the same as the trees that would be returned by the `parsed_sents` method, except that each leaf/node is associated with a span within the RST DT text. Note: does nothing if there is no associated PTB corpus entry. Parameters ---------- doc: DocumentPlus Rich representation of the document. Returns ------- doc: DocumentPlus Rich representation of the document, with syntactic constituency trees. """ # get PTB trees ptb_name = _guess_ptb_name(doc.key) if ptb_name is None: return doc # get tokens from tokenized document # FIXME alignment/reconstruction should never have to deal # with the left padding token in the first place doc_tokens = doc.tkd_tokens[1:] # skip left padding token tokens_iter = iter(doc_tokens) trees = [] lex_heads = [] for tree in self.reader.parsed_sents(ptb_name): # apply standard cleaning to tree # strip function tags, remove empty nodes tree_no_empty = prune_tree(tree, is_non_empty) tree_no_empty_no_gf = transform_tree(tree_no_empty, strip_subcategory) # leaves = tree_no_empty_no_gf.leaves() tslice = itertools.islice(tokens_iter, len(leaves)) clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice) trees.append(clean_tree) # lexicalize the PTB tree: find the head word of each constituent # constituents and their heads are designated by their Gorn address # ("tree position" in NLTK) in the tree lheads = find_lexical_heads(clean_tree) lex_heads.append(lheads) # store trees in doc doc.set_syn_ctrees(trees, lex_heads=lex_heads) return doc
def parse(self, doc): """ Given a document, return a list of educified PTB parse trees (one per sentence). These are almost the same as the trees that would be returned by the `parsed_sents` method, except that each leaf/node is associated with a span within the RST DT text. Note: does nothing if there is no associated PTB corpus entry. """ # get PTB trees ptb_name = _guess_ptb_name(doc.key) if ptb_name is None: return doc # get tokens from tokenized document # FIXME alignment/reconstruction should never have to deal # with the left padding token in the first place doc_tokens = doc.tkd_tokens[1:] # skip left padding token tokens_iter = iter(doc_tokens) trees = [] lex_heads = [] for tree in self.reader.parsed_sents(ptb_name): # apply standard cleaning to tree # strip function tags, remove empty nodes tree_no_empty = prune_tree(tree, is_non_empty) tree_no_empty_no_gf = transform_tree(tree_no_empty, strip_subcategory) # leaves = tree_no_empty_no_gf.leaves() tslice = itertools.islice(tokens_iter, len(leaves)) clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice) trees.append(clean_tree) # lexicalize the PTB tree: find the head word of each constituent # constituents and their heads are designated by their Gorn address # ("tree position" in NLTK) in the tree lheads = find_lexical_heads(clean_tree) lex_heads.append(lheads) # store trees in doc doc.tkd_trees.extend(trees) # store lexical heads in doc # TODO move to DocumentPlus doc.lex_heads = [] doc.lex_heads.append(None) # end TODO doc.lex_heads.extend(lex_heads) return doc
def parse_trees(corpus, k, ptb):
    """
    Given an RST DT tree and an NLTK PTB reader, return a list of
    educified PTB parse trees (one per sentence).

    These are almost the same as the trees that would be returned by
    the `parsed_sents` method, except that each leaf/node is associated
    with a span within the RST DT text.

    Note: returns None if there is no associated PTB corpus entry.
    """
    ptb_name = _guess_ptb_name(k)
    if ptb_name is None:
        return None

    tokens_iter = align(corpus, k, ptb)

    results = []
    for tree in ptb.parsed_sents(ptb_name):
        leaves = tree.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        results.append(ConstituencyTree.build(tree, tslice))
    return results


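# Illustrative aside (not part of the original module): all of the parse
# functions above align one shared, document-level token iterator with the
# trees sentence by sentence. Because itertools.islice consumes the shared
# iterator, each tree takes exactly len(leaves) tokens and the next tree
# picks up where the previous one stopped. A minimal sketch with dummy data:
import itertools

doc_tokens = ["The", "cat", "sleeps", ".", "It", "purrs", "."]
sentence_leaf_counts = [4, 3]  # number of leaves per (cleaned) tree

tokens_iter = iter(doc_tokens)
aligned = [list(itertools.islice(tokens_iter, n)) for n in sentence_leaf_counts]
print(aligned)  # [['The', 'cat', 'sleeps', '.'], ['It', 'purrs', '.']]

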
def read_corenlp_result(doc, corenlp_doc, tid=None):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe Document
        The original document; its turn annotations and text are used
        to align CoreNLP sentences and compute token offsets.
    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.
    tid: turn id, optional
        If given, only the turn with this id is considered; otherwise
        all turns in the document are used.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    def is_matching_turn(x):
        """Check whether x corresponds to the current turn"""
        if tid is None:
            return stac.is_turn(x)
        else:
            x_tid = stac.turn_id(x)
            return stac.is_turn(x) and tid == x_tid

    turns = sorted((x for x in doc.units if is_matching_turn(x)),
                   key=lambda k: k.span)
    sentences = corenlp_doc.get_ordered_sentence_list()

    if len(turns) != len(sentences):
        msg = ('Uh-oh, mismatch between number of turns in the corpus (%d) '
               'and parsed sentences (%d) %s'
               % (len(turns), len(sentences), doc.origin))
        raise Exception(msg)

    sentence_toks = defaultdict(list)
    for t in corenlp_doc.get_ordered_token_list():
        sid = t['s_id']
        sentence_toks[sid].append(t)

    # build dict from sid to (dict from tid to fancy token)
    educe_tokens = defaultdict(dict)
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        # the token offsets are global, ie. for all sentences/turns
        # in the file; so we have to shift them to the left to zero them
        # and then shift them back to the right
        sentence_begin = min(t['extent'][0] for t in sentence_toks[sid])
        ttext = doc.text(turn.text_span())
        offset = (turn.span.char_start
                  + len(stac.split_turn_text(ttext)[0])
                  - sentence_begin)
        for t in sentence_toks[sid]:
            tid = t['id']
            educe_tokens[sid][tid] = CoreNlpToken(t, offset)

    all_tokens = []
    all_trees = []
    all_dtrees = []
    for turn, sent in zip(turns, sentences):
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # FIXME tokens are probably not properly ordered because token ids
        # are global ids, i.e. strings like "1-18" (sentence 1, token 18)
        # which means basic sorting ranks "1-10" before "1-2"
        # cf. educe.rst_dt.corenlp
        sorted_tokens = [tokens_dict[x] for x in sorted(tokens_dict.keys())]
        # end FIXME
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_tree = ConstituencyTree.build(tree, sorted_tokens)

        deps = defaultdict(list)
        for ty, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((ty, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')

        all_tokens.extend(sorted_tokens)
        all_trees.append(educe_tree)
        all_dtrees.append(educe_dtree)

    all_chains = []
    for ctr, chain in enumerate(corenlp_doc.get_coref_chains()):
        mentions = []
        for m in chain:
            sid = m['sentence']
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            start = local_id(m['start'])
            end = local_id(m['end'])
            token_range = [global_id(x) for x in range(start, end)]
            tokens = [educe_tokens[sid][t] for t in token_range]
            head = educe_tokens[sid][m['head']]
            mentions.append(Mention(tokens, head, m['most_representative']))
        all_chains.append(Chain(mentions))

    return CoreNlpDocument(all_tokens, all_trees, all_dtrees, all_chains)


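# Illustrative aside (not part of the original module): the FIXME above is
# about lexicographic sorting of CoreNLP's string token ids ("1-10" sorts
# before "1-2"). The later versions of read_corenlp_result fix this by
# sorting on the numeric local id instead. A minimal sketch of the pitfall
# and of the fix:
sid = '1'
token_ids = ['1-1', '1-2', '1-10', '1-3']
print(sorted(token_ids))
# ['1-1', '1-10', '1-2', '1-3']  <- wrong order
print(sorted(token_ids, key=lambda x: int(x[len(sid) + 1:])))
# ['1-1', '1-2', '1-3', '1-10']  <- numeric order on the local id

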
def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats; if
        it never gets used, this function should probably fall back to
        the generic default and be moved to `educe.external.corenlp`).
    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [tokens_dict[x]
                         for x in sorted(tokens_dict, key=tok_local_id)]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [educe_tokens[sid][global_id(tok_idx)]
                      for tok_idx in range(start, end)]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc


def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats; if
        it never gets used, this function should probably fall back to
        the generic default and be moved to `educe.external.corenlp`).
    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # NEW extract local id to properly sort tokens
        tok_local_id = lambda x: int(x[len(sid) + 1:])
        sorted_tokens = [
            tokens_dict[x] for x in sorted(tokens_dict, key=tok_local_id)
        ]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to extract local ids and generate global ids
            local_id = lambda x: int(x[len(sid) + 1:])
            global_id = lambda x: sid + '-' + str(x)
            # retrieve tokens for this mention
            start = local_id(mntn['start'])
            end = local_id(mntn['end'])
            tokens = [
                educe_tokens[sid][global_id(tok_idx)]
                for tok_idx in range(start, end)
            ]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc


def read_corenlp_result(doc, corenlp_doc):
    """Read CoreNLP's output for a document.

    Parameters
    ----------
    doc: educe.rst_dt.document_plus.DocumentPlus
        The original document (currently unused; it could be necessary
        to determine e.g. token offsets for specific file formats; if
        it never gets used, this function should probably fall back to
        the generic default and be moved to `educe.external.corenlp`).
    corenlp_doc: educe.external.stanford_xml_reader.PreprocessingSource
        Object that contains all annotations for the document.

    Returns
    -------
    corenlp_doc: CoreNlpDocument
        A CoreNlpDocument containing all information.
    """
    # sentences
    sentences = corenlp_doc.get_ordered_sentence_list()

    # tokens
    sentence_toks = defaultdict(list)
    for tok in corenlp_doc.get_ordered_token_list():
        sid = tok['s_id']
        sentence_toks[sid].append(tok)

    # educe tokens
    educe_tokens = defaultdict(dict)
    for sent in sentences:
        sid = sent['id']
        sent_toks = sentence_toks[sid]
        offset = 0  # was: sent_begin
        for tok in sent_toks:
            tid = tok['id']
            educe_tokens[sid][tid] = CoreNlpToken(tok, offset)

    # educe tokens, ctree and dtree
    all_tokens = []
    all_ctrees = []
    all_dtrees = []
    for sent in sentences:
        sid = sent['id']
        tokens_dict = educe_tokens[sid]
        # sort tokens by their (integer) local id
        tok_local_id = tok_lid(sid)
        sorted_tokens = [
            tokens_dict[x] for x in sorted(tokens_dict, key=tok_local_id)
        ]
        # ctree
        tree = nltk.tree.Tree.fromstring(sent['parse'])
        # FIXME 2016-06-13 skip the ROOT node, as in PTB
        # maybe we'd better add ROOT to the empty parentheses in the
        # PTB version, but just getting rid of ROOT here seems simpler:
        # the type of the root node of a tree is informative: usually
        # S, but more interestingly SINV, NP...
        if tree.label() != 'ROOT' or len(tree) > 1:
            print(tree)
            raise ValueError('Atypical root of CoreNLP tree')
        tree = tree[0]  # go down from ROOT to the real root
        educe_ctree = ConstituencyTree.build(tree, sorted_tokens)
        # dtree
        deps = defaultdict(list)
        for lbl, gov_id, dep_id in sent['dependencies']:
            deps[gov_id].append((lbl, dep_id))
        educe_dtree = DependencyTree.build(deps, tokens_dict, sid + '-0')
        # store educe tokens, ctrees and dtrees
        all_tokens.extend(sorted_tokens)
        all_ctrees.append(educe_ctree)
        all_dtrees.append(educe_dtree)

    # coreference chains
    all_chains = []
    for chain in corenlp_doc.get_coref_chains():
        mentions = []
        for mntn in chain:
            sid = mntn['sentence']
            # helper functions to map from/to local and global ids
            tok_local_id = tok_lid(sid)
            tok_global_id = tok_gid(sid)
            # retrieve tokens for this mention
            start = tok_local_id(mntn['start'])
            end = tok_local_id(mntn['end'])
            tokens = [
                educe_tokens[sid][tok_global_id(tok_idx)]
                for tok_idx in range(start, end)
            ]
            head = educe_tokens[sid][mntn['head']]
            mentions.append(Mention(tokens, head, mntn['most_representative']))
        all_chains.append(Chain(mentions))

    corenlp_doc = CoreNlpDocument(all_tokens, all_ctrees, all_dtrees,
                                  all_chains)
    return corenlp_doc


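# Note (not part of the original module): the last version above relies on
# module-level helpers tok_lid and tok_gid that are not shown in this
# section. Judging from the inline lambdas in the earlier versions, they
# are presumably factories closing over the sentence id; a hedged sketch
# of what they might look like:
def tok_lid(sid):
    """Return a function mapping a global token id like '1-18' to its
    local integer id within sentence `sid`, e.g. 18 (assumed contract)."""
    return lambda gid_str: int(gid_str[len(sid) + 1:])


def tok_gid(sid):
    """Return a function mapping a local integer token id to its global
    string id within sentence `sid`, e.g. 18 -> '1-18' (assumed contract)."""
    return lambda tok_idx: sid + '-' + str(tok_idx)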