def parse(self, doc): """Parse """ corenlp_out_name = _guess_corenlp_name(doc.key) if corenlp_out_name is None: return doc fname = os.path.join(self.corenlp_out_dir, corenlp_out_name) if not os.path.exists(fname): raise ValueError('CoreNLP XML: no file {}'.format(fname)) # CoreNLP XML output reader # FIXME the same reading is done in tokenize(), should find # a way to cache or share call reader = PreprocessingSource() reader.read(fname, suffix='') corenlp_out = read_corenlp_result(doc, reader) # ctrees and lexical heads on their nodes ctrees = corenlp_out.trees # strip function tags # TODO maybe this should be an internal preprocessing step in # find_lexical_heads(), so as to keep the function tags # that are kept by default by CoreNLP parser because they were found # to be useful e.g. `-retainTMPSubcategories` ctrees_no_gf = [transform_tree(ctree, strip_subcategory) for ctree in ctrees] lex_heads = [find_lexical_heads(ctree_no_gf) for ctree_no_gf in ctrees_no_gf] # store trees in doc doc.set_syn_ctrees(ctrees_no_gf, lex_heads=lex_heads) return doc
def parse_doc_ptb(doc_id, doc_tkd_toks):
    """Dirty PTB parser"""
    # get PTB trees
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # use tweaked tokens
    doc_tokens = doc_tkd_toks
    tokens_iter = iter(doc_tokens)

    trees = []
    lex_heads = []
    for tree in PTB_READER.parsed_sents(ptb_name):
        # apply standard cleaning to tree
        # strip function tags, remove empty nodes
        tree_no_empty = prune_tree(tree, is_non_empty)
        tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                             strip_subcategory)
        leaves = tree_no_empty_no_gf.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
        trees.append(clean_tree)
        # lexicalize the PTB tree: find the head word of each constituent
        # constituents and their heads are designated by their Gorn address
        # ("tree position" in NLTK) in the tree
        lheads = find_lexical_heads(clean_tree)
        lex_heads.append(lheads)

    return trees  # , lex_heads
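# Standalone sketch (illustrative only) of the token-alignment idiom used
# in parse_doc_ptb above: a single iterator over the document's tokens is
# consumed sentence by sentence with itertools.islice, so each cleaned
# tree receives exactly as many tokens as it has leaves.
import itertools

_doc_tokens = ['Hello', ',', 'world', '.', 'Bye', '.']
_tokens_iter = iter(_doc_tokens)
for _n_leaves in (4, 2):  # leaves per sentence tree, in document order
    print(list(itertools.islice(_tokens_iter, _n_leaves)))
# ['Hello', ',', 'world', '.']
# ['Bye', '.']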
def parse(self, doc): """Parse """ corenlp_out_name = _guess_corenlp_name(doc.key) if corenlp_out_name is None: return doc fname = os.path.join(self.corenlp_out_dir, corenlp_out_name) if not os.path.exists(fname): raise ValueError('CoreNLP XML: no file {}'.format(fname)) # CoreNLP XML output reader # FIXME the same reading is done in tokenize(), should find # a way to cache or share call reader = PreprocessingSource() reader.read(fname, suffix='') corenlp_out = read_corenlp_result(doc, reader) # ctrees and lexical heads on their nodes ctrees = corenlp_out.trees # strip function tags # TODO maybe this should be an internal preprocessing step in # find_lexical_heads(), so as to keep the function tags # that are kept by default by CoreNLP parser because they were found # to be useful e.g. `-retainTMPSubcategories` ctrees_no_gf = [ transform_tree(ctree, strip_subcategory) for ctree in ctrees ] lex_heads = [ find_lexical_heads(ctree_no_gf) for ctree_no_gf in ctrees_no_gf ] # store trees in doc doc.set_syn_ctrees(ctrees_no_gf, lex_heads=lex_heads) return doc
def parse(self, doc): """Parse a document, using the gold PTB annotation. Given a document, return a list of educified PTB parse trees (one per sentence). These are almost the same as the trees that would be returned by the `parsed_sents` method, except that each leaf/node is associated with a span within the RST DT text. Note: does nothing if there is no associated PTB corpus entry. Parameters ---------- doc: DocumentPlus Rich representation of the document. Returns ------- doc: DocumentPlus Rich representation of the document, with syntactic constituency trees. """ # get PTB trees ptb_name = _guess_ptb_name(doc.key) if ptb_name is None: return doc # get tokens from tokenized document # FIXME alignment/reconstruction should never have to deal # with the left padding token in the first place doc_tokens = doc.tkd_tokens[1:] # skip left padding token tokens_iter = iter(doc_tokens) trees = [] lex_heads = [] for tree in self.reader.parsed_sents(ptb_name): # apply standard cleaning to tree # strip function tags, remove empty nodes tree_no_empty = prune_tree(tree, is_non_empty) tree_no_empty_no_gf = transform_tree(tree_no_empty, strip_subcategory) # leaves = tree_no_empty_no_gf.leaves() tslice = itertools.islice(tokens_iter, len(leaves)) clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice) trees.append(clean_tree) # lexicalize the PTB tree: find the head word of each constituent # constituents and their heads are designated by their Gorn address # ("tree position" in NLTK) in the tree lheads = find_lexical_heads(clean_tree) lex_heads.append(lheads) # store trees in doc doc.set_syn_ctrees(trees, lex_heads=lex_heads) return doc
def parse(self, doc): """ Given a document, return a list of educified PTB parse trees (one per sentence). These are almost the same as the trees that would be returned by the `parsed_sents` method, except that each leaf/node is associated with a span within the RST DT text. Note: does nothing if there is no associated PTB corpus entry. """ # get PTB trees ptb_name = _guess_ptb_name(doc.key) if ptb_name is None: return doc # get tokens from tokenized document # FIXME alignment/reconstruction should never have to deal # with the left padding token in the first place doc_tokens = doc.tkd_tokens[1:] # skip left padding token tokens_iter = iter(doc_tokens) trees = [] lex_heads = [] for tree in self.reader.parsed_sents(ptb_name): # apply standard cleaning to tree # strip function tags, remove empty nodes tree_no_empty = prune_tree(tree, is_non_empty) tree_no_empty_no_gf = transform_tree(tree_no_empty, strip_subcategory) # leaves = tree_no_empty_no_gf.leaves() tslice = itertools.islice(tokens_iter, len(leaves)) clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice) trees.append(clean_tree) # lexicalize the PTB tree: find the head word of each constituent # constituents and their heads are designated by their Gorn address # ("tree position" in NLTK) in the tree lheads = find_lexical_heads(clean_tree) lex_heads.append(lheads) # store trees in doc doc.tkd_trees.extend(trees) # store lexical heads in doc # TODO move to DocumentPlus doc.lex_heads = [] doc.lex_heads.append(None) # end TODO doc.lex_heads.extend(lex_heads) return doc