def parse(self, doc):
    """Parse the document and attach constituency trees with lexical heads.

    Parameters
    ----------
    doc: educe.rst_dt.DocumentPlus
        Document to enrich.

    Returns
    -------
    doc: educe.rst_dt.DocumentPlus
        Document with syntactic constituency trees (function tags
        stripped) and their lexical heads attached; returned unchanged
        when no CoreNLP output file maps to it.
    """
    out_name = _guess_corenlp_name(doc.key)
    if out_name is None:
        return doc

    out_path = os.path.join(self.corenlp_out_dir, out_name)
    if not os.path.exists(out_path):
        raise ValueError('CoreNLP XML: no file {}'.format(out_path))

    # CoreNLP XML output reader
    # FIXME the same reading is done in tokenize(), should find
    # a way to cache or share call
    reader = PreprocessingSource()
    reader.read(out_path, suffix='')
    corenlp_out = read_corenlp_result(doc, reader)

    # strip function tags from the constituency trees
    # TODO maybe this should be an internal preprocessing step in
    # find_lexical_heads(), so as to keep the function tags
    # that are kept by default by CoreNLP parser because they were found
    # to be useful e.g. `-retainTMPSubcategories`
    stripped_trees = [transform_tree(tree, strip_subcategory)
                      for tree in corenlp_out.trees]
    heads = [find_lexical_heads(tree) for tree in stripped_trees]

    # store trees in doc
    doc.set_syn_ctrees(stripped_trees, lex_heads=heads)
    return doc
def tokenize(self, doc):
    """Tokenize the document text.

    Parameters
    ----------
    doc: educe.rst_dt.DocumentPlus
        Document

    Returns
    -------
    doc: educe.rst_dt.DocumentPlus
        Tokenized document; returned unchanged when no CoreNLP output
        file maps to it.
    """
    out_name = _guess_corenlp_name(doc.key)
    if out_name is None:
        return doc

    out_path = os.path.join(self.corenlp_out_dir, out_name)
    if not os.path.exists(out_path):
        raise ValueError('CoreNLP XML: no file {}'.format(out_path))

    # read the CoreNLP XML output
    reader = PreprocessingSource()
    reader.read(out_path, suffix='')
    corenlp_out = read_corenlp_result(doc, reader)

    # attach the CoreNLP tokens to the DocumentPlus
    doc.set_tokens(corenlp_out.tokens)
    return doc
def tokenize(self, doc):
    """Tokenize the document text.

    Parameters
    ----------
    doc: educe.rst_dt.DocumentPlus
        Document

    Returns
    -------
    doc: educe.rst_dt.DocumentPlus
        Tokenized document; returned unchanged when no CoreNLP output
        file maps to it.
    """
    xml_name = _guess_corenlp_name(doc.key)
    if xml_name is None:
        return doc

    xml_path = os.path.join(self.corenlp_out_dir, xml_name)
    if not os.path.exists(xml_path):
        raise ValueError('CoreNLP XML: no file {}'.format(xml_path))

    # read the CoreNLP XML output
    src = PreprocessingSource()
    src.read(xml_path, suffix='')
    corenlp_out = read_corenlp_result(doc, src)

    # append the CoreNLP tokens to the document's existing token list
    # (keeps whatever tokens doc.tkd_tokens already holds,
    # e.g. a left-padding token)
    doc.tkd_tokens.extend(corenlp_out.tokens)
    return doc
def parse(self, doc):
    """Parse the document and attach constituency trees with lexical heads.

    Parameters
    ----------
    doc: educe.rst_dt.DocumentPlus
        Document to enrich.

    Returns
    -------
    doc: educe.rst_dt.DocumentPlus
        Document with syntactic constituency trees (function tags
        stripped) and their lexical heads attached; returned unchanged
        when no CoreNLP output file maps to it.
    """
    xml_name = _guess_corenlp_name(doc.key)
    if xml_name is None:
        return doc

    xml_path = os.path.join(self.corenlp_out_dir, xml_name)
    if not os.path.exists(xml_path):
        raise ValueError('CoreNLP XML: no file {}'.format(xml_path))

    # CoreNLP XML output reader
    # FIXME the same reading is done in tokenize(), should find
    # a way to cache or share call
    src = PreprocessingSource()
    src.read(xml_path, suffix='')
    corenlp_out = read_corenlp_result(doc, src)

    # strip function tags from the constituency trees
    # TODO maybe this should be an internal preprocessing step in
    # find_lexical_heads(), so as to keep the function tags
    # that are kept by default by CoreNLP parser because they were found
    # to be useful e.g. `-retainTMPSubcategories`
    bare_trees = [
        transform_tree(ctree, strip_subcategory)
        for ctree in corenlp_out.trees
    ]
    head_maps = [
        find_lexical_heads(bare_tree)
        for bare_tree in bare_trees
    ]

    # store trees in doc
    doc.set_syn_ctrees(bare_trees, lex_heads=head_maps)
    return doc
def read_results(corpus, dir_name):
    """Read stored parser output and convert it to standoff annotations.

    Parameters
    ----------
    corpus: dict from FileId to document
        Corpus whose keys locate the stored CoreNLP output files.
    dir_name: str
        Directory containing the stored parser output.

    Returns
    -------
    results: dict from FileId to CoreNLP result
        One `read_corenlp_result` object per corpus key.
    """
    def _read_one(key):
        # one fresh reader per file: PreprocessingSource accumulates
        # state from read(), so readers are not reused across files
        reader = PreprocessingSource()
        reader.read(parsed_file_name(key, dir_name), suffix='')
        return read_corenlp_result(corpus[key], reader)

    # dict comprehension replaces the manual build-by-assignment loop
    return {k: _read_one(k) for k in corpus}
encoding='ascii') # read the RST corpus rst_reader = Reader(corpus_dir) rst_corpus = rst_reader.slurp() # for each file, compare tokenizations between PTB and CoreNLP for key, rst_tree in sorted(rst_corpus.items()): doc_name = key.doc.split('.', 1)[0] if doc_name.startswith('wsj_'): print(doc_name) doc_wsj_num = doc_name.split('_')[1] section = doc_wsj_num[:2] # corenlp stuff core_fname = os.path.join(CORENLP_OUT_DIR, corpus, doc_name + '.out.xml') core_reader = PreprocessingSource() core_reader.read(core_fname, suffix='') corenlp_doc = read_corenlp_result(None, core_reader) core_toks = corenlp_doc.tokens core_toks_beg = [x.span.char_start for x in core_toks] core_toks_end = [x.span.char_end for x in core_toks] # PTB stuff # * create DocumentPlus (adapted from educe.rst_dt.corpus) rst_context = rst_tree.label().context ptb_docp = DocumentPlus(key, doc_name, rst_context) # * attach EDUs (yerk) # FIXME we currently get them via an RstDepTree created from # the original RSTTree, so as to get the left padding EDU rst_dtree = RstDepTree.from_rst_tree(rst_tree) ptb_docp.edus = rst_dtree.edus