def dump_dep_rstdt(corpus_dir, out_dir, nary_enc):
    """Dump both sections of the RST-DT corpus as dependency trees.

    The train section is handled first, then the test section. For each
    section, the RST trees are read from the section folder located under
    `corpus_dir`, each tree is converted with `RstDepTree.from_rst_tree`,
    and the resulting d-trees are passed to `dump_disdep_files` along with
    an output folder of the same base name located under `out_dir`.

    Parameters
    ----------
    corpus_dir : str
        Folder that contains the `TRAIN_FOLDER` and `TEST_FOLDER` sections.
    out_dir : str
        Folder under which one output folder per section is addressed.
    nary_enc : str
        Forwarded as the `nary_enc` keyword of `RstDepTree.from_rst_tree`.

    Raises
    ------
    ValueError
        If the folder of a section does not exist ; the check happens
        right before that section is processed.
    """
    for section_folder in (TRAIN_FOLDER, TEST_FOLDER):
        section_dir = os.path.join(corpus_dir, section_folder)
        if not os.path.isdir(section_dir):
            raise ValueError('No such folder: {}'.format(section_dir))
        # read the c-trees of this section
        rst_trees = Reader(section_dir).slurp()
        # convert each c-tree to a d-tree, keeping the document name as key
        dep_trees = {}
        for doc_name, rst_tree in rst_trees.items():
            dep_trees[doc_name] = RstDepTree.from_rst_tree(
                rst_tree, nary_enc=nary_enc)
        # dump under out_dir, in a folder named like the section folder
        section_out = os.path.join(out_dir, os.path.basename(section_dir))
        dump_disdep_files(dep_trees.values(), section_out)
def test_rst_to_dt(self):
    """Check the RST -> dependency tree -> RST round trip.

    Hand-written lightweight trees must come back equal to the tree
    they were parsed into ; for the trees returned by `_test_trees`,
    only the `span` and `edu_span` of the root nodes are compared.
    """
    lightweight_sources = (
        "(R:rel (S x) (N y))",
        """ (R:rel (S x) (N:rel (N h) (S t))) """,
        """ (R:r (S x) (N:r (N:r (S t1) (N h)) (S t2))) """,
    )
    for source in lightweight_sources:
        original = parse_lightweight_tree(source)
        as_deptree = RstDepTree.from_simple_rst_tree(original)
        rebuilt = deptree_to_simple_rst_tree(as_deptree)
        self.assertEqual(original, rebuilt, "round-trip on " + source)

    # attributes of the root node that must survive the round trip,
    # paired with the prefix of the failure message
    root_checks = (
        ("span", "span equality on "),
        ("edu_span", "edu span equality on "),
    )
    for name, tree in self._test_trees().items():
        original = SimpleRSTTree.from_rst_tree(tree)
        as_deptree = RstDepTree.from_simple_rst_tree(original)
        rebuilt = deptree_to_simple_rst_tree(as_deptree)
        for attr, msg_prefix in root_checks:
            self.assertEqual(getattr(treenode(original), attr),
                             getattr(treenode(rebuilt), attr),
                             msg_prefix + name)
def test_rst_to_dt(self):
    """Round-trip RST trees through their dependency representation.

    Lightweight trees are compared for full equality after the round
    trip ; trees from `_test_trees` are compared on the `span` and
    `edu_span` of their root node only.
    """
    def round_trip(simple_tree):
        # c-tree -> d-tree -> c-tree
        return deptree_to_simple_rst_tree(
            RstDepTree.from_simple_rst_tree(simple_tree))

    lw_sources = [
        "(R:rel (S x) (N y))",
        """ (R:rel (S x) (N:rel (N h) (S t))) """,
        """ (R:r (S x) (N:r (N:r (S t1) (N h)) (S t2))) """,
    ]
    for lw_source in lw_sources:
        before = parse_lightweight_tree(lw_source)
        after = round_trip(before)
        self.assertEqual(before, after, "round-trip on " + lw_source)

    for doc_name, rst_tree in self._test_trees().items():
        before = SimpleRSTTree.from_rst_tree(rst_tree)
        after = round_trip(before)
        self.assertEqual(
            treenode(before).span,
            treenode(after).span,
            "span equality on " + doc_name)
        self.assertEqual(
            treenode(before).edu_span,
            treenode(after).edu_span,
            "edu span equality on " + doc_name)
def convert(corpus, multinuclear, odir):
    """
    Convert every RST tree of the corpus to a dependency tree, and back.

    For each key of `corpus`, three files named after the document
    (`key.doc` without its extension) are written, one per sub-folder
    of `odir`:

    * `rst-binarised`: the `SimpleRSTTree` built from the RST tree,
    * `rst-to-dt`: the `RstDepTree` built from that simple tree,
    * `dt-to-rst`: the simple tree rebuilt from the dependency tree ;
      `multinuclear` is handed to `deptree_to_simple_rst_tree` for
      this step (a set of relation types that will be systematically
      treated as multinuclear).

    Missing sub-folders are created first ; each tree is written as
    its `str()` form.
    """
    bin_dir, dt_dir, rst2_dir = [
        os.path.join(odir, folder_name)
        for folder_name in ("rst-binarised", "rst-to-dt", "dt-to-rst")
    ]
    for folder in (bin_dir, dt_dir, rst2_dir):
        if os.path.exists(folder):
            continue
        os.makedirs(folder)

    def _dump(folder, basename, tree):
        # write the string form of `tree` to folder/basename
        with open(os.path.join(folder, basename), 'w') as stream:
            stream.write(str(tree))

    for key in corpus:
        basename = os.path.splitext(key.doc)[0]

        simple_tree = educe.rst_dt.SimpleRSTTree.from_rst_tree(corpus[key])
        _dump(bin_dir, basename, simple_tree)

        dep_tree = RstDepTree.from_simple_rst_tree(simple_tree)
        _dump(dt_dir, basename, dep_tree)

        simple_tree_back = deptree_to_simple_rst_tree(dep_tree, multinuclear)
        _dump(rst2_dir, basename, simple_tree_back)
def convert(corpus, multinuclear, odir):
    """
    Convert every RST tree of the corpus to a dependency tree, and back.

    Three sub-folders of `odir` receive one file per document, named
    `key.doc` minus its extension: `rst-binarised` (the `SimpleRSTTree`),
    `rst-to-dt` (the `RstDepTree`) and `dt-to-rst` (the simple tree
    rebuilt from the dependency tree). Sub-folders that do not exist
    yet are created. Each tree is written as its `str()` form.

    NB: `multinuclear` is accepted but not referenced by this body.
    """
    folder_names = ("rst-binarised", "rst-to-dt", "dt-to-rst")
    folders = [os.path.join(odir, folder_name)
               for folder_name in folder_names]
    for folder in folders:
        if os.path.exists(folder):
            continue
        os.makedirs(folder)
    bin_dir, dt_dir, rst2_dir = folders

    for key in corpus:
        basename = os.path.splitext(key.doc)[0]

        # binarised c-tree
        simple_tree = SimpleRSTTree.from_rst_tree(corpus[key])
        bin_path = os.path.join(bin_dir, basename)
        with open(bin_path, 'w') as stream:
            stream.write(str(simple_tree))

        # c-tree to d-tree
        dep_tree = RstDepTree.from_simple_rst_tree(simple_tree)
        dt_path = os.path.join(dt_dir, basename)
        with open(dt_path, 'w') as stream:
            stream.write(str(dep_tree))

        # d-tree back to c-tree
        simple_tree_back = deptree_to_simple_rst_tree(dep_tree)
        rst2_path = os.path.join(rst2_dir, basename)
        with open(rst2_path, 'w') as stream:
            stream.write(str(simple_tree_back))
def test_rst_to_dt_nuclearity_loss(self):
    """
    Test that we still get sane tree structure with nuclearity loss

    The all-nucleus tree must survive the round trip through the
    dependency representation unchanged ; the satellite-nucleus tree
    is then run through the same two conversions.
    """
    tricky = """ (R:r (S t) (N h)) """

    nuked = """ (R:r (N t) (N h)) """

    # larger variant, currently disabled:
    # tricky = """
    #          (R:r
    #            (S x)
    #            (N:r (N:r (S t1) (N h))
    #                 (S t2)))
    #          """
    #
    # nuked = """
    #          (R:r
    #            (N x)
    #            (N:r (N:r (N t1) (N h))
    #                 (N t2)))
    #          """

    nuked_tree = parse_lightweight_tree(nuked)
    tricky_tree = parse_lightweight_tree(tricky)

    # a little sanity check first: round trip on the all-nucleus tree
    nuked_back = deptree_to_simple_rst_tree(  # was:, ['r'])
        RstDepTree.from_simple_rst_tree(nuked_tree))
    self.assertEqual(nuked_tree, nuked_back,
                     "same structure " + nuked)  # sanity

    # now the real test
    # NOTE(review): in the code visible here nothing is asserted on
    # this result ; the conversions are only required not to raise
    tricky_dep = RstDepTree.from_simple_rst_tree(tricky_tree)
    rev1 = deptree_to_simple_rst_tree(tricky_dep)  # was:, ['r'])
def test_dt_to_rst_order(self):
    """Check the d-tree to c-tree conversion under reordered root deps.

    Each lightweight tree is converted to a dependency tree, then back:
    with the dependency tree untouched, the result must equal the
    starting tree ; with the list returned by `deps(0)` reversed or
    shuffled on a deep copy, the conversion is only executed.
    """
    lw_sources = (
        "(R:r (N:r (N h) (S r1)) (S r2))",
        "(R:r (S:r (S l2) (N l1)) (N h))",
        "(R:r (N:r (S l1) (N h)) (S r1))",
        # ((l2 <- l1 <- h) -> r1 -> r2)
        """ (R:r (N:r (N:r (S l2) (N:r (S l1) (N h))) (S r1)) (S r2)) """,
        # (l2 <- ((l1 <- h) -> r1)) -> r2
        """ (R:r (N:r (S l2) (N:r (N:r (S l1) (N h)) (S r1))) (S r2)) """,
    )

    for source in lw_sources:
        expected = parse_lightweight_tree(source)
        dep_tree = RstDepTree.from_simple_rst_tree(expected)

        # (a) untouched dependency tree: exact round trip
        unchanged = deptree_to_simple_rst_tree(dep_tree)
        self.assertEqual(expected, unchanged, "round-trip on " + source)

        # (b) reversed list from deps(0), on a copy
        reversed_tree = copy.deepcopy(dep_tree)
        reversed_tree.deps(0).reverse()
        rst2b = deptree_to_simple_rst_tree(reversed_tree)
        # TODO assertion on rst2b?

        # (c) shuffled list from deps(0), on another copy
        shuffled_tree = copy.deepcopy(dep_tree)
        random.shuffle(shuffled_tree.deps(0))
        rst2c = deptree_to_simple_rst_tree(shuffled_tree)
def read_deps(corpus, section='all', nary_enc='chain',
              rew_pseudo_rels=False, mrg_same_units=False):
    """Collect dependencies from the corpus.

    Parameters
    ----------
    corpus : dict from str to dict from FileId to RSTTree
        Corpus of RST c-trees indexed by {'train', 'test'} then FileId.
        NB: if `rew_pseudo_rels` or `mrg_same_units` is set, the entries
        of this dict are replaced in place with the rewritten c-trees.
    section : str, one of {'train', 'test', 'all'}
        Section of interest in the RST-DT.
        NOTE(review): this parameter is not referenced in the body ;
        every section present in `corpus` is read. Confirm whether
        filtering is expected here or done by the caller.
    nary_enc : str, one of {'tree', 'chain'}
        Encoding of n-ary relations used in the c-to-d conversion.
    rew_pseudo_rels : boolean, defaults to False
        If True, rewrite pseudo relations ; see
        `educe.rst_dt.pseudo_relations`.
    mrg_same_units : boolean, defaults to False
        If True, merge fragmented EDUs ; see
        `educe.rst_dt.pseudo_relations`.

    Returns
    -------
    edu_df : pandas.DataFrame
        Table of EDUs read from the corpus, with columns 'section',
        'doc_name', 'dep_idx', 'char_beg', 'char_end', 'sent_idx',
        'para_idx'.
    dep_df : pandas.DataFrame
        Table of dependencies read from the corpus, with columns
        'doc_name', 'dep_idx', 'hd_idx', 'rel', 'nuc', 'hd_order',
        plus the derived columns 'len_edu', 'len_edu_abs', 'rel_class'
        and 'pseudo_rel'.
    """
    # experimental: rewrite pseudo-relations
    if rew_pseudo_rels:
        for sec_name, sec_corpus in corpus.items():
            corpus[sec_name] = {
                doc_id: rewrite_pseudo_rels(doc_id, rst_ctree)
                for doc_id, rst_ctree in sec_corpus.items()
            }
    if mrg_same_units:
        for sec_name, sec_corpus in corpus.items():
            corpus[sec_name] = {
                doc_id: merge_same_units(doc_id, rst_ctree)
                for doc_id, rst_ctree in sec_corpus.items()
            }
    # convert to d-trees, collect dependencies
    edus = []
    deps = []
    for sec_name, sec_corpus in corpus.items():
        for doc_id, rst_ctree in sorted(sec_corpus.items()):
            doc_name = doc_id.doc
            doc_text = rst_ctree.text()
            # DIRTY infer (approximate) sentence and paragraph indices
            # from newlines in the text (\n and \n\n)
            sent_idx = 0
            para_idx = 0
            # end DIRTY
            # honour the requested encoding of n-ary relations (it used
            # to be hard-coded to 'chain', which made `nary_enc` a no-op)
            rst_dtree = RstDepTree.from_rst_tree(rst_ctree,
                                                 nary_enc=nary_enc)
            # position 0 of each array is skipped, hence the [1:] slices
            # and the numbering of dependents from 1
            for dep_idx, (edu, hd_idx, lbl, nuc, hd_order) in enumerate(
                    zip(rst_dtree.edus[1:], rst_dtree.heads[1:],
                        rst_dtree.labels[1:], rst_dtree.nucs[1:],
                        rst_dtree.ranks[1:]),
                    start=1):
                char_beg = edu.span.char_start
                char_end = edu.span.char_end
                edus.append((sec_name, doc_name, dep_idx,
                             char_beg, char_end,
                             sent_idx, para_idx))
                deps.append((doc_name, dep_idx,
                             hd_idx, lbl, nuc, hd_order))
                # DIRTY search for paragraph or sentence breaks in the
                # text of the EDU *plus the next three characters* (yerk)
                edu_txt_plus = doc_text[char_beg:char_end + 3]
                if '\n\n' in edu_txt_plus:
                    para_idx += 1
                    sent_idx += 1  # sometimes wrong ; to be fixed
                elif '\n' in edu_txt_plus:
                    sent_idx += 1
                # end DIRTY
    # turn into DataFrame
    edu_df = pd.DataFrame(edus, columns=[
        'section', 'doc_name', 'dep_idx', 'char_beg', 'char_end',
        'sent_idx', 'para_idx'
    ])
    dep_df = pd.DataFrame(deps, columns=[
        'doc_name', 'dep_idx', 'hd_idx', 'rel', 'nuc', 'hd_order'
    ])
    # additional columns
    # * attachment length in EDUs
    dep_df['len_edu'] = dep_df['dep_idx'] - dep_df['hd_idx']
    dep_df['len_edu_abs'] = abs(dep_df['len_edu'])
    # * attachment length, in sentences and paragraphs
    if False:
        # TODO rewrite in a pandas-ic manner ; my previous attempts have
        # failed but I think I got pretty close
        # NB: the current implementation is *extremely* slow: 155 seconds
        # on my laptop for the RST-DT, just for this (minor) computation
        len_sent = []
        len_para = []
        for _, row in dep_df[['doc_name', 'dep_idx', 'hd_idx']].iterrows():
            edu_dep = edu_df[(edu_df['doc_name'] == row['doc_name']) &
                             (edu_df['dep_idx'] == row['dep_idx'])]
            if row['hd_idx'] == 0:
                # {sent,para}_idx + 1 for dependents of the fake root
                lsent = edu_dep['sent_idx'].values[0] + 1
                lpara = edu_dep['para_idx'].values[0] + 1
            else:
                edu_hd = edu_df[(edu_df['doc_name'] == row['doc_name']) &
                                (edu_df['dep_idx'] == row['hd_idx'])]
                lsent = (edu_dep['sent_idx'].values[0] -
                         edu_hd['sent_idx'].values[0])
                lpara = (edu_dep['para_idx'].values[0] -
                         edu_hd['para_idx'].values[0])
            len_sent.append(lsent)
            len_para.append(lpara)
        dep_df['len_sent'] = pd.Series(len_sent)
        dep_df['len_sent_abs'] = abs(dep_df['len_sent'])
        dep_df['len_para'] = pd.Series(len_para)
        dep_df['len_para_abs'] = abs(dep_df['len_para'])
    # * class of relation (FIXME we need to handle interaction with
    #   rewrite_pseudo_rels)
    rel_conv = RstRelationConverter(RELMAP_112_18_FILE).convert_label
    dep_df['rel_class'] = dep_df['rel'].apply(rel_conv)
    # * boolean indicator for pseudo-relations ; NB: the 'Style-' prefix
    #   can only apply if rew_pseudo_rels (otherwise no occurrence)
    dep_df['pseudo_rel'] = (
        (dep_df['rel'].str.startswith('Style')) |
        (dep_df['rel'].str.endswith('Same-Unit')) |
        (dep_df['rel'].str.endswith('TextualOrganization')))
    return edu_df, dep_df
doc_name + '.out.xml') core_reader = PreprocessingSource() core_reader.read(core_fname, suffix='') corenlp_doc = read_corenlp_result(None, core_reader) core_toks = corenlp_doc.tokens core_toks_beg = [x.span.char_start for x in core_toks] core_toks_end = [x.span.char_end for x in core_toks] # PTB stuff # * create DocumentPlus (adapted from educe.rst_dt.corpus) rst_context = rst_tree.label().context ptb_docp = DocumentPlus(key, doc_name, rst_context) # * attach EDUs (yerk) # FIXME we currently get them via an RstDepTree created from # the original RSTTree, so as to get the left padding EDU rst_dtree = RstDepTree.from_rst_tree(rst_tree) ptb_docp.edus = rst_dtree.edus # * setup a PtbParser (re-yerk) ptb_parser = PtbParser(PTB_DIR) ptb_parser.tokenize(ptb_docp) # get PTB toks ; skip left padding token ptb_toks = ptb_docp.tkd_tokens[1:] ptb_toks_beg = ptb_docp.toks_beg[1:] ptb_toks_end = ptb_docp.toks_end[1:] # compare ! core2ptb_beg = np.searchsorted(ptb_toks_beg, core_toks_beg, side='left') core2ptb_end = np.searchsorted(ptb_toks_end, core_toks_end, side='right') - 1 # TODO maybe use np.diff?
def get_oracle_ctrees(dep_edges, att_edus,
                      nuc_strategy="unamb_else_most_frequent",
                      rank_strategy="closest-intra-rl-inter-rl",
                      prioritize_same_unit=True,
                      strict=False):
    """Build the oracle constituency tree(s) for a dependency tree.

    Parameters
    ----------
    dep_edges : iterable of (string, string, string)
        Edges of one document, as (source id, target id, label)
        triples ; the source id 'ROOT' marks the root attachment.
        Cf. type of return value from
        irit-rst-dt/ctree.py:load_attelo_output_file()
    att_edus : cf return type of attelo.io.load_edus
        EDUs as they are known to attelo ; all of them must belong to
        the same document (same `grouping`).
    nuc_strategy : string
        Strategy given to the `DummyNuclearityClassifier`.
    rank_strategy : string
        Strategy given to the `InsideOutAttachmentRanker`.
    prioritize_same_unit : boolean, True by default
        Forwarded to the `InsideOutAttachmentRanker`.
    strict : boolean, False by default
        If True, any link from ROOT to an EDU whose label is neither
        'ROOT' nor `UNKNOWN` raises an exception, otherwise a warning
        is issued.

    Returns
    -------
    ctrees : list of RstTree
        There can be several e.g. for leaky sentences.

    Raises
    ------
    AssertionError
        If the EDUs do not cover exactly one document.
    ValueError
        If `strict` and a root link has an unexpected label.
    RstDtException
        Printed then re-raised if the d-tree cannot be converted.
    """
    # rebuild educe EDUs from their attelo description
    # and group them by doc_name
    educe_edus = defaultdict(list)
    edu2sent_idx = defaultdict(dict)
    gid2num = dict()
    for att_edu in att_edus:
        # doc name
        doc_name = att_edu.grouping
        # EDU info
        # skip ROOT (automatically added by RstDepTree.__init__)
        if att_edu.id == 'ROOT':
            continue
        # the EDU number is the part of the id after the last '_'
        edu_num = int(att_edu.id.rsplit('_', 1)[1])
        edu_span = EduceSpan(att_edu.start, att_edu.end)
        edu_text = att_edu.text
        educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text))
        # map global id of EDU to num of EDU inside doc
        gid2num[att_edu.id] = edu_num
        # map EDU to sentence
        try:
            sent_idx = int(att_edu.subgrouping.split('_sent')[1])
        except IndexError:
            # this EDU could not be attached to any sentence (ex: missing
            # text in the PTB), so a default subgrouping identifier was
            # used ; we aim for consistency with educe and map these to
            # "None"
            sent_idx = None
        edu2sent_idx[doc_name][edu_num] = sent_idx
    # check that our info covers only one document
    assert len(educe_edus) == 1, 'expected EDUs from exactly one document'
    # then restrict to this document ; next(iter(...)) fetches the only
    # key on both python 2 and 3 (dict views cannot be indexed on py3)
    doc_name = next(iter(educe_edus))
    educe_edus = educe_edus[doc_name]
    edu2sent_idx = edu2sent_idx[doc_name]
    # sort EDUs by num
    educe_edus = sorted(educe_edus, key=lambda e: e.num)
    # rebuild educe-style edu2sent ; prepend 0 for the fake root
    edu2sent = [0] + [edu2sent_idx[e.num] for e in educe_edus]

    # classifiers for nuclearity and ranking
    # FIXME declare, fit and predict upstream...
    # nuclearity
    nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy)
    nuc_classifier.fit([], [])  # empty X and y for dummy fit
    # ranking classifier
    rank_classifier = InsideOutAttachmentRanker(
        strategy=rank_strategy,
        prioritize_same_unit=prioritize_same_unit)

    # rebuild RstDepTrees
    dtree = RstDepTree(educe_edus)
    for src_id, tgt_id, lbl in dep_edges:
        if src_id == 'ROOT':
            if lbl not in ['ROOT', UNKNOWN]:
                err_msg = 'weird root label: {} {} {}'.format(
                    src_id, tgt_id, lbl)
                if strict:
                    raise ValueError(err_msg)
                else:
                    print('W: {}, using ROOT instead'.format(err_msg))
            dtree.set_root(gid2num[tgt_id])
        else:
            dtree.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl)
    # add nuclearity: heuristic baseline
    dtree.nucs = nuc_classifier.predict([dtree])[0]
    # add rank: some strategies require a mapping from EDU to sentence
    # EXPERIMENTAL attach array of sentence index for each EDU in tree
    dtree.sent_idx = edu2sent
    # end EXPERIMENTAL
    dtree.ranks = rank_classifier.predict([dtree])[0]
    # end NEW

    # create pred ctree
    try:
        bin_srtrees = deptree_to_simple_rst_tree(dtree, allow_forest=True)
        if False:  # EXPERIMENTAL
            # currently False to run on output that already has
            # labels embedding nuclearity
            bin_srtrees = [
                SimpleRSTTree.incorporate_nuclearity_into_label(bin_srtree)
                for bin_srtree in bin_srtrees
            ]
        bin_rtrees = [
            SimpleRSTTree.to_binary_rst_tree(bin_srtree)
            for bin_srtree in bin_srtrees
        ]
    except RstDtException as rst_e:
        print(rst_e)
        if False:
            # debug helper ; educe_edus is a plain list at this point
            print('\n'.join('{}: {}'.format(edu.text_span(), edu)
                            for edu in educe_edus))
        raise
    return bin_rtrees
def get_oracle_ctrees(dep_edges, att_edus,
                      nuc_strategy="unamb_else_most_frequent",
                      rank_strategy="closest-intra-rl-inter-rl",
                      prioritize_same_unit=True,
                      strict=False):
    """Build the oracle constituency tree(s) for a dependency tree.

    Parameters
    ----------
    dep_edges : iterable of (string, string, string)
        Edges of one document, given as (source id, target id, label)
        triples ; edges whose source id is 'ROOT' set the root.
        Cf. type of return value from
        irit-rst-dt/ctree.py:load_attelo_output_file()
    att_edus : cf return type of attelo.io.load_edus
        EDUs as they are known to attelo ; they must all share the
        same `grouping` (document name).
    nuc_strategy : string
        Strategy of the `DummyNuclearityClassifier` used to set the
        nuclearity of the d-tree.
    rank_strategy : string
        Strategy of the `InsideOutAttachmentRanker` used to set the
        ranks of the d-tree.
    prioritize_same_unit : boolean, True by default
        Passed to the `InsideOutAttachmentRanker`.
    strict : boolean, False by default
        If True, any link from ROOT to an EDU whose label is neither
        'ROOT' nor `UNKNOWN` raises an exception, otherwise a warning
        is issued.

    Returns
    -------
    ctrees : list of RstTree
        There can be several e.g. for leaky sentences.

    Raises
    ------
    AssertionError
        If `att_edus` does not describe exactly one document.
    ValueError
        If `strict` is set and a root link carries an unexpected label.
    RstDtException
        Printed, then propagated, when the conversion to c-trees fails.
    """
    # rebuild educe EDUs from their attelo description
    # and group them by doc_name
    educe_edus = defaultdict(list)
    edu2sent_idx = defaultdict(dict)
    gid2num = dict()
    for att_edu in att_edus:
        # doc name
        doc_name = att_edu.grouping
        # EDU info
        # skip ROOT (automatically added by RstDepTree.__init__)
        if att_edu.id == 'ROOT':
            continue
        # number of the EDU: what follows the last '_' of its global id
        edu_num = int(att_edu.id.rsplit('_', 1)[1])
        edu_span = EduceSpan(att_edu.start, att_edu.end)
        edu_text = att_edu.text
        educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text))
        # map global id of EDU to num of EDU inside doc
        gid2num[att_edu.id] = edu_num
        # map EDU to sentence
        try:
            sent_idx = int(att_edu.subgrouping.split('_sent')[1])
        except IndexError:
            # this EDU could not be attached to any sentence (ex: missing
            # text in the PTB), so a default subgrouping identifier was
            # used ; we aim for consistency with educe and map these to
            # "None"
            sent_idx = None
        edu2sent_idx[doc_name][edu_num] = sent_idx

    # check that our info covers only one document
    assert len(educe_edus) == 1, 'expected EDUs from exactly one document'
    # then restrict to this document ; `keys()[0]` only works on
    # python 2, where keys() is a list, so take the first key by
    # iteration instead
    doc_name = next(iter(educe_edus))
    educe_edus = educe_edus[doc_name]
    edu2sent_idx = edu2sent_idx[doc_name]
    # sort EDUs by num
    educe_edus = sorted(educe_edus, key=lambda e: e.num)
    # rebuild educe-style edu2sent ; prepend 0 for the fake root
    edu2sent = [0] + [edu2sent_idx[e.num] for e in educe_edus]

    # classifiers for nuclearity and ranking
    # FIXME declare, fit and predict upstream...
    # nuclearity
    nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy)
    nuc_classifier.fit([], [])  # empty X and y for dummy fit
    # ranking classifier
    rank_classifier = InsideOutAttachmentRanker(
        strategy=rank_strategy,
        prioritize_same_unit=prioritize_same_unit)

    # rebuild RstDepTrees
    dtree = RstDepTree(educe_edus)
    for src_id, tgt_id, lbl in dep_edges:
        if src_id != 'ROOT':
            dtree.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl)
            continue
        # link from the fake root: check its label, then set the root
        if lbl not in ['ROOT', UNKNOWN]:
            err_msg = 'weird root label: {} {} {}'.format(
                src_id, tgt_id, lbl)
            if strict:
                raise ValueError(err_msg)
            print('W: {}, using ROOT instead'.format(err_msg))
        dtree.set_root(gid2num[tgt_id])
    # add nuclearity: heuristic baseline
    dtree.nucs = nuc_classifier.predict([dtree])[0]
    # add rank: some strategies require a mapping from EDU to sentence
    # EXPERIMENTAL attach array of sentence index for each EDU in tree
    dtree.sent_idx = edu2sent
    # end EXPERIMENTAL
    dtree.ranks = rank_classifier.predict([dtree])[0]
    # end NEW

    # create pred ctree
    try:
        bin_srtrees = deptree_to_simple_rst_tree(dtree, allow_forest=True)
        if False:  # EXPERIMENTAL
            # currently False to run on output that already has
            # labels embedding nuclearity
            bin_srtrees = [SimpleRSTTree.incorporate_nuclearity_into_label(
                bin_srtree) for bin_srtree in bin_srtrees]
        bin_rtrees = [SimpleRSTTree.to_binary_rst_tree(bin_srtree)
                      for bin_srtree in bin_srtrees]
    except RstDtException as rst_e:
        print(rst_e)
        if False:
            # debug helper ; educe_edus is a plain list at this point
            print('\n'.join('{}: {}'.format(edu.text_span(), edu)
                            for edu in educe_edus))
        raise
    ctrees = bin_rtrees
    return ctrees
'corpus': os.path.relpath(rst_corpus_dir, start=DATA_DIR), 'strip_accents': strip_accents, 'lowercase': lowercase, 'stop_words': stop_words, 'n_jobs': n_jobs, 'verbose': verbose, } print('# parameters: ({})'.format(params), file=outfile) # do the real job corpus_items = sorted(rst_corpus.items()) doc_keys = [key.doc for key, doc in corpus_items] doc_key_dtrees = [ (doc_key.doc, RstDepTree.from_simple_rst_tree(SimpleRSTTree.from_rst_tree(doc))) for doc_key, doc in corpus_items ] edu_txts = list(e.text().replace('\n', ' ') for doc_key, dtree in doc_key_dtrees for e in dtree.edus) # vectorize each EDU using its text edu_vecs = vect.transform(edu_txts) # normalize each row of the count matrix using the l1 norm # (copy=False to perform in place) edu_vecs = normalize(edu_vecs, norm='l1', copy=False) # get all pairs of EDUs of interest, here as triples # (gov_idx, dep_idx, lbl) # TODO maybe sort edu pairs so that dependents with # the same governor are grouped (potential speed up?) edu_pairs = [
doc_name + '.out.xml') core_reader = PreprocessingSource() core_reader.read(core_fname, suffix='') corenlp_doc = read_corenlp_result(None, core_reader) core_toks = corenlp_doc.tokens core_toks_beg = [x.span.char_start for x in core_toks] core_toks_end = [x.span.char_end for x in core_toks] # PTB stuff # * create DocumentPlus (adapted from educe.rst_dt.corpus) rst_context = rst_tree.label().context ptb_docp = DocumentPlus(key, doc_name, rst_context) # * attach EDUs (yerk) # FIXME we currently get them via an RstDepTree created from # the original RSTTree, so as to get the left padding EDU rst_dtree = RstDepTree.from_rst_tree(rst_tree) ptb_docp.edus = rst_dtree.edus # * setup a PtbParser (re-yerk) ptb_parser = PtbParser(PTB_DIR) ptb_parser.tokenize(ptb_docp) # get PTB toks ; skip left padding token ptb_toks = ptb_docp.tkd_tokens[1:] ptb_toks_beg = ptb_docp.toks_beg[1:] ptb_toks_end = ptb_docp.toks_end[1:] # compare ! core2ptb_beg = np.searchsorted(ptb_toks_beg, core_toks_beg, side='left') core2ptb_end = np.searchsorted( ptb_toks_end, core_toks_end, side='right') - 1
def read_deps(corpus, section='all', nary_enc='chain',
              rew_pseudo_rels=False, mrg_same_units=False):
    """Collect dependencies from the corpus.

    Parameters
    ----------
    corpus : dict from str to dict from FileId to RSTTree
        Corpus of RST c-trees indexed by {'train', 'test'} then FileId.
        NB: if `rew_pseudo_rels` or `mrg_same_units` is set, the entries
        of this dict are replaced in place with the rewritten c-trees.
    section : str, one of {'train', 'test', 'all'}
        Section of interest in the RST-DT.
        NOTE(review): this parameter is not referenced in the body ;
        every section present in `corpus` is read. Confirm whether
        filtering is expected here or done by the caller.
    nary_enc : str, one of {'tree', 'chain'}
        Encoding of n-ary relations used in the c-to-d conversion.
    rew_pseudo_rels : boolean, defaults to False
        If True, rewrite pseudo relations ; see
        `educe.rst_dt.pseudo_relations`.
    mrg_same_units : boolean, defaults to False
        If True, merge fragmented EDUs ; see
        `educe.rst_dt.pseudo_relations`.

    Returns
    -------
    edu_df : pandas.DataFrame
        Table of EDUs read from the corpus ; one row per EDU with its
        section, document name, index, character span and the
        (approximate) indices of its sentence and paragraph.
    dep_df : pandas.DataFrame
        Table of dependencies read from the corpus ; one row per
        dependent with its head, relation, nuclearity and rank, plus
        the derived columns 'len_edu', 'len_edu_abs', 'rel_class' and
        'pseudo_rel'.
    """
    # experimental: rewrite pseudo-relations
    if rew_pseudo_rels:
        for sec_name, sec_corpus in corpus.items():
            corpus[sec_name] = {
                doc_id: rewrite_pseudo_rels(doc_id, rst_ctree)
                for doc_id, rst_ctree in sec_corpus.items()
            }
    if mrg_same_units:
        for sec_name, sec_corpus in corpus.items():
            corpus[sec_name] = {
                doc_id: merge_same_units(doc_id, rst_ctree)
                for doc_id, rst_ctree in sec_corpus.items()
            }
    # convert to d-trees, collect dependencies
    edus = []
    deps = []
    for sec_name, sec_corpus in corpus.items():
        for doc_id, rst_ctree in sorted(sec_corpus.items()):
            doc_name = doc_id.doc
            doc_text = rst_ctree.text()
            # DIRTY infer (approximate) sentence and paragraph indices
            # from newlines in the text (\n and \n\n)
            sent_idx = 0
            para_idx = 0
            # end DIRTY
            # use the encoding asked for by the caller ; the former
            # hard-coded 'chain' silently ignored the `nary_enc` argument
            rst_dtree = RstDepTree.from_rst_tree(
                rst_ctree, nary_enc=nary_enc)
            # position 0 of each array is skipped, hence the [1:] slices
            # and the numbering of dependents from 1
            for dep_idx, (edu, hd_idx, lbl, nuc, hd_order) in enumerate(
                    zip(rst_dtree.edus[1:], rst_dtree.heads[1:],
                        rst_dtree.labels[1:], rst_dtree.nucs[1:],
                        rst_dtree.ranks[1:]),
                    start=1):
                char_beg = edu.span.char_start
                char_end = edu.span.char_end
                edus.append(
                    (sec_name, doc_name, dep_idx, char_beg, char_end,
                     sent_idx, para_idx)
                )
                deps.append(
                    (doc_name, dep_idx, hd_idx, lbl, nuc, hd_order)
                )
                # DIRTY search for paragraph or sentence breaks in the
                # text of the EDU *plus the next three characters* (yerk)
                edu_txt_plus = doc_text[char_beg:char_end + 3]
                if '\n\n' in edu_txt_plus:
                    para_idx += 1
                    sent_idx += 1  # sometimes wrong ; to be fixed
                elif '\n' in edu_txt_plus:
                    sent_idx += 1
                # end DIRTY
    # turn into DataFrame
    edu_df = pd.DataFrame(edus, columns=[
        'section', 'doc_name', 'dep_idx', 'char_beg', 'char_end',
        'sent_idx', 'para_idx']
    )
    dep_df = pd.DataFrame(deps, columns=[
        'doc_name', 'dep_idx', 'hd_idx', 'rel', 'nuc', 'hd_order']
    )
    # additional columns
    # * attachment length in EDUs
    dep_df['len_edu'] = dep_df['dep_idx'] - dep_df['hd_idx']
    dep_df['len_edu_abs'] = abs(dep_df['len_edu'])
    # * attachment length, in sentences and paragraphs
    if False:
        # TODO rewrite in a pandas-ic manner ; my previous attempts have
        # failed but I think I got pretty close
        # NB: the current implementation is *extremely* slow: 155 seconds
        # on my laptop for the RST-DT, just for this (minor) computation
        len_sent = []
        len_para = []
        for _, row in dep_df[['doc_name', 'dep_idx', 'hd_idx']].iterrows():
            edu_dep = edu_df[
                (edu_df['doc_name'] == row['doc_name']) &
                (edu_df['dep_idx'] == row['dep_idx'])
            ]
            if row['hd_idx'] == 0:
                # {sent,para}_idx + 1 for dependents of the fake root
                lsent = edu_dep['sent_idx'].values[0] + 1
                lpara = edu_dep['para_idx'].values[0] + 1
            else:
                edu_hd = edu_df[
                    (edu_df['doc_name'] == row['doc_name']) &
                    (edu_df['dep_idx'] == row['hd_idx'])
                ]
                lsent = (edu_dep['sent_idx'].values[0] -
                         edu_hd['sent_idx'].values[0])
                lpara = (edu_dep['para_idx'].values[0] -
                         edu_hd['para_idx'].values[0])
            len_sent.append(lsent)
            len_para.append(lpara)
        dep_df['len_sent'] = pd.Series(len_sent)
        dep_df['len_sent_abs'] = abs(dep_df['len_sent'])
        dep_df['len_para'] = pd.Series(len_para)
        dep_df['len_para_abs'] = abs(dep_df['len_para'])
    # * class of relation (FIXME we need to handle interaction with
    #   rewrite_pseudo_rels)
    rel_conv = RstRelationConverter(RELMAP_112_18_FILE).convert_label
    dep_df['rel_class'] = dep_df['rel'].apply(rel_conv)
    # * boolean indicator for pseudo-relations ; NB: the 'Style-' prefix
    #   can only apply if rew_pseudo_rels (otherwise no occurrence)
    dep_df['pseudo_rel'] = (
        (dep_df['rel'].str.startswith('Style')) |
        (dep_df['rel'].str.endswith('Same-Unit')) |
        (dep_df['rel'].str.endswith('TextualOrganization'))
    )
    return edu_df, dep_df