def test_binarize(self):
    for tree in self._test_trees().values():
        bin_tree = annotation._binarize(tree)
        self.assertTrue(annotation.is_binary(bin_tree))
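# Hedged sketch, not the educe implementation: what right-heavy
# binarization is expected to do to an n-ary nltk.Tree. Children beyond
# the first two are folded into a right-branching chain, so every
# internal node ends up with at most two children. Both helper names
# are hypothetical and exist only for this illustration; the real
# annotation._binarize additionally handles RST nuclearity labels.
from nltk import Tree


def _binarize_sketch(tree):
    """Right-heavy binarization of a plain nltk.Tree (illustration)."""
    if not isinstance(tree, Tree):
        return tree  # leaf
    kids = [_binarize_sketch(kid) for kid in tree]
    # fold 3+ children into a right-branching chain of binary nodes
    while len(kids) > 2:
        kids = kids[:-2] + [Tree(tree.label(), kids[-2:])]
    return Tree(tree.label(), kids)


def _is_binary_sketch(tree):
    """True if every internal node has at most two children."""
    if not isinstance(tree, Tree):
        return True
    return len(tree) <= 2 and all(_is_binary_sketch(kid) for kid in tree)


# e.g. (S a b c) becomes (S a (S b c))
assert _is_binary_sketch(_binarize_sketch(Tree('S', ['a', 'b', 'c'])))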
def load_corpus_as_dataframe_new(selection='train', binarize=False,
                                 verbose=0):
    """Load a section of the RST-WSJ corpus as pandas DataFrames.

    Parameters
    ----------
    selection : one of {'train', 'test'}
        Select the part of the corpus to load.
        TODO: add 'both'.
    binarize : boolean, default: False
        If True, apply right-heavy binarization to the RST trees.

    Returns
    -------
    node_df : pandas.DataFrame
        DataFrame of all nodes from the constituency trees.
    rel_df : pandas.DataFrame
        DataFrame of all relations.
    edu_df : pandas.DataFrame
        DataFrame of all EDUs.
    sent_df : pandas.DataFrame
        DataFrame of all sentences.
    para_df : pandas.DataFrame
        DataFrame of all paragraphs.

    TODO
    ----
    [ ] intra-sentential-first right-heavy binarization
    [ ] left-heavy binarization (?)
    [ ] add selection='both'

    Notes
    -----
    `selection='both'` can currently be emulated as:

    ```
    train_df = load_corpus_as_dataframe_new(selection='train')
    test_df = load_corpus_as_dataframe_new(selection='test')
    both_df = train_df.append(test_df)
    ```
    """
    node_rows = []  # list of dicts, one dict per node
    rel_rows = []  # list of dicts, one dict per relation
    # edu_rows contains pre-EDUs rather than EDUs themselves, but maybe
    # conflating both does no harm
    edu_rows = []  # list of dicts, one dict per EDU
    sent_rows = []  # ibid
    para_rows = []  # ibid
    strad_rels_rows = []  # ibid (WIP, straddling relations)

    if selection == 'train':
        rst_reader = RstReader(CD_TRAIN)
    elif selection == 'test':
        rst_reader = RstReader(CD_TEST)
    else:
        raise ValueError('Unknown selection {}'.format(selection))
    rst_corpus = rst_reader.slurp()

    for doc_id, rtree_ref in sorted(rst_corpus.items()):
        doc_ctx = rtree_ref.label().context
        doc_text = doc_ctx.text()
        doc_edus = rtree_ref.leaves()

        # 0. collect EDUs
        doc_edu_rows = load_edus(doc_edus)

        # 1. collect spans (constituency nodes) from the gold RST trees
        #
        # transform the original RST tree: convert labels to their
        # coarse equivalent, binarize if required
        coarse_rtree_ref = REL_CONV(rtree_ref)
        if binarize:
            coarse_rtree_ref = _binarize(coarse_rtree_ref)
        # collect spans
        doc_span_rows = load_spans(coarse_rtree_ref)

        # prepare this info to find "leaky" substructures:
        # sentences and paragraphs

        # dict of EDU spans to constituent node from the RST tree
        rst_tree_node_spans = {
            (row['edu_start'], row['edu_end']): row['treepos']
            for row in doc_span_rows
        }
        # list of EDU spans of constituent nodes, sorted by length of
        # span, then start
        rst_tree_node_spans_by_len = sorted(
            rst_tree_node_spans, key=lambda x: (x[1] - x[0], x[0]))

        # 2. collect sentences
        doc_sent_rows = []

        # use dirty PTB tokenizer + parser
        # NB: the two following lines eat up 67% of total time
        doc_tkd_toks = tokenize_doc_ptb(doc_id, doc_text)
        doc_tkd_trees = parse_doc_ptb(doc_id, doc_tkd_toks)

        # the sentence <-> EDU mapping, and the information that
        # depends on it, might be more appropriate as a separate
        # DataFrame

        # align EDUs with sentences
        edu2sent = align_edus_with_sentences(doc_edus, doc_tkd_trees,
                                             strict=False)
        # get the codomain of edu2sent
        # if we want to be strict, we can assert that the codomain is
        # a gapless interval:
        # assert sent_idc == list(range(len(doc_tkd_trees)))
        # this assertion is currently known to fail on:
        # * RST-WSJ/TRAINING/wsj_0678.out: wrong sentence segmentation
        #   in PTB (1 sentence is split in 2)
        edu2sent_codom = set(sent_idx for sent_idx in edu2sent
                             if sent_idx is not None)

        # find the index of the first and last EDU of each sentence;
        # indices in both lists are offset by 1 to map to real EDU
        # numbering (which is 1-based)
        sent_edu_starts = [(edu2sent.index(i) + 1
                            if i in edu2sent_codom else None)
                           for i in range(len(doc_tkd_trees))]
        sent_edu_ends = [(len(edu2sent) - 1 - edu2sent[::-1].index(i) + 1
                          if i in edu2sent_codom else None)
                         for i in range(len(doc_tkd_trees))]
        # sentences that don't have their own RST subtree are 'leaky'

        # WIP propagate sentence-EDU mapping to RST tree spans
        for row in doc_span_rows:
            # offset by -1 because edu2sent is 0-based
            row['sent_start'] = edu2sent[row['edu_start'] - 1]
            row['sent_end'] = edu2sent[row['edu_end'] - 1]
            # inferred columns
            # special cases because edu2sent is None for EDUs whose
            # text is missing from PTB
            if (row['sent_start'] is not None
                    and row['sent_end'] is not None):
                row['sent_len'] = row['sent_end'] - row['sent_start'] + 1
                row['intra_sent'] = (row['sent_start'] == row['sent_end'])
                row['strad_sent'] = (
                    not row['intra_sent']
                    and (row['edu_start'] not in sent_edu_starts
                         or row['edu_end'] not in sent_edu_ends))
            else:
                row['sent_len'] = None
                row['intra_sent'] = None
                row['strad_sent'] = None
        # end WIP propagate
        # end of sentence <-> EDU mapping et al.
        # iterate over syntactic trees as proxy for sentences
        for sent_idx, tkd_tree in enumerate(doc_tkd_trees):
            row = {
                # data directly from the sentence segmenter
                'sent_id': '{}_sent{}'.format(doc_id.doc, sent_idx),
                'char_start': tkd_tree.span.char_start,
                'char_end': tkd_tree.span.char_end,
            }
            # data dependent on the sentence <-> EDU mapping; it should
            # probably have its own dataframe, to better handle
            # disagreement between sentence and EDU segmentation, which
            # translates into None entries for missing data below
            if sent_idx in edu2sent_codom:
                row.update({
                    'edu_start': sent_edu_starts[sent_idx],
                    'edu_end': sent_edu_ends[sent_idx],
                    # computed column
                    'edu_len': (sent_edu_ends[sent_idx]
                                - sent_edu_starts[sent_idx]) + 1,
                })

                # use alignment: sentence <-> RST tree spans (via EDUs)
                # leaky sentences: complex (2+ EDUs) sentences that
                # don't have a corresponding span in the RST tree
                leaky = (row['edu_len'] > 1
                         and ((sent_edu_starts[sent_idx],
                               sent_edu_ends[sent_idx])
                              not in rst_tree_node_spans))
                row.update({
                    'leaky': leaky,
                })
                # find for each leaky sentence the smallest RST subtree
                # that covers it
                if row['leaky']:
                    sent_edu_first = sent_edu_starts[sent_idx]
                    sent_edu_last = sent_edu_ends[sent_idx]
                    # find the parent span and the straddling spans;
                    # straddling spans exist for types 3 and 4
                    strad_spans = []
                    for edu_span in rst_tree_node_spans_by_len:
                        # parent span
                        if (edu_span[0] <= sent_edu_first
                                and sent_edu_last <= edu_span[1]):
                            parent_span = edu_span
                            break
                        # straddling spans
                        if ((edu_span[0] < sent_edu_first
                             and sent_edu_first <= edu_span[1])
                                or (edu_span[0] <= sent_edu_last
                                    and sent_edu_last < edu_span[1])):
                            strad_spans.append(edu_span)
                    else:
                        raise ValueError(
                            'No minimal spanning node for {}'.format(
                                row))

                    # leaky types {1, 2} vs {3, 4}:
                    # for types 1 and 2, members of the parent
                    # constituent are "pure" wrt the sentence span:
                    # each member is either fully inside or fully
                    # outside the sentence; no member straddles either
                    # of the sentence boundaries
                    leaky_type_12 = not strad_spans
                    # DEBUG
                    if verbose:
                        print(doc_id.doc)
                        print(parent_span,
                              strad_spans if strad_spans else '')
                    # end DEBUG

                    # leaky types {1, 3} vs {2, 4}:
                    # {1, 3} have at least one coordinative (aka
                    # multinuclear) relation in the chain of spans
                    # between the parent span and the EDU(s) of the
                    # leaky sentence;
                    # {2, 4} have only subordinative (aka mononuclear)
                    # relations
                    leaky_coord = False
                    # first, check the kids of the parent span
                    parent_tpos = rst_tree_node_spans[parent_span]
                    parent_subtree = coarse_rtree_ref[parent_tpos]
                    if all(kid.label().nuclearity == 'Nucleus'
                           for kid in parent_subtree):
                        leaky_coord = True
                    # then, check the kids of all straddling spans
                    strad_rels = []  # TEMPORARY
                    for strad_span in strad_spans:
                        strad_tpos = rst_tree_node_spans[strad_span]
                        strad_subtree = coarse_rtree_ref[strad_tpos]
                        if all(kid.label().nuclearity == 'Nucleus'
                               for kid in strad_subtree):
                            leaky_coord = True
                        # TEMPORARY: store straddling relations (from
                        # kids)
                        kid_rels = [kid.label().rel
                                    for kid in strad_subtree
                                    if kid.label().rel != 'span']
                        # if all kids bear the same relation label,
                        # store only this value
                        if len(set(kid_rels)) == 1:
                            kid_rels = kid_rels[0]
                        else:
                            kid_rels = '+'.join(kid_rels)
                        strad_rels.append(kid_rels)
                        # WIP list of straddling relations
                        strad_rels_rows.append({
                            'node_id': '{}_const{}'.format(
                                strad_subtree.origin.doc,
                                '-'.join(str(x) for x in strad_tpos)),
                            'sent_id': '{}_sent{}'.format(
                                doc_id.doc, sent_idx),
                            'kid_rels': kid_rels,
                        })
                        # end WIP running counter

                    # determine the type of leaky (ugly)
                    if leaky_type_12:
                        if leaky_coord:
                            leaky_type = 1
                        else:
                            leaky_type = 2
                    else:
                        if leaky_coord:
                            leaky_type = 3
                        else:
                            leaky_type = 4
                    # display the type of leaky
                    if verbose:
                        print('Type {} ({}-level {} structure)\t{}'.format(
                            leaky_type,
                            'Same' if leaky_type_12 else 'Multi',
                            ('coordination' if leaky_coord
                             else 'subordination'),
                            '; '.join(strad_rels)))
                        print()
                    # end WIP nuclearity of straddling spans

                    # add info to row
                    row.update({
                        # parent span, in EDUs
                        'parent_edu_start': parent_span[0],
                        'parent_edu_end': parent_span[1],
                        # length of the parent span, in sentences
                        'parent_sent_len': (
                            edu2sent[parent_span[1] - 1]
                            - edu2sent[parent_span[0] - 1] + 1),
                        # distance between the current sentence and the
                        # most remote sentence covered by the parent
                        # span, in sentences
                        'parent_sent_dist': max(
                            edu2sent[parent_span[1] - 1] - sent_idx,
                            sent_idx - edu2sent[parent_span[0] - 1]),
                        # type of leaky, in the taxonomy of
                        # (van der Vliet et al. 2011)
                        'leaky_type': leaky_type,
                    })
                else:
                    row.update({
                        # for non-leaky sentences, the parent span is
                        # the sentence span itself
                        'parent_edu_start': row['edu_start'],
                        'parent_edu_end': row['edu_end'],
                        # default value for leaky_type; provides a means
                        # for easy comparison, on complex sentences,
                        # between non-leaky sentences and the various
                        # types of leaky
                        'leaky_type': 0,
                    })
                # end WIP
            doc_sent_rows.append(row)

        # 3. collect paragraphs
        doc_para_rows = []
        doc_paras = doc_ctx.paragraphs
        doc_text = doc_ctx.text()
        # doc_paras is None when the original text has no explicit
        # marking for paragraphs; this is true for 'fileX' documents in
        # the RST-WSJ corpus
        if doc_paras is not None:
            # EDU to paragraph mapping
            edu2para = align_edus_with_paragraphs(doc_edus, doc_paras,
                                                  doc_text, strict=False)
            edu2para_codom = set(para_idx for para_idx in edu2para
                                 if para_idx is not None)
            # index of the first and last EDU of each paragraph
            para_edu_starts = [(edu2para.index(i) + 1
                                if i in edu2para_codom else None)
                               for i in range(len(doc_paras))]
            para_edu_ends = [(len(edu2para) - 1
                              - edu2para[::-1].index(i) + 1
                              if i in edu2para_codom else None)
                             for i in range(len(doc_paras))]
            # paragraphs that don't have their own RST subtree are
            # "leaky"
            # end of paragraph <-> EDU mapping et al.

            # iterate over paragraphs
            for para_idx, para in enumerate(doc_paras):
                # dirty: educe.rst_dt.text.Paragraph should have a span
                para_span = Span(para.sentences[0].span.char_start,
                                 para.sentences[-1].span.char_end)
                # end dirty
                row = {
                    # data directly from the paragraph segmenter
                    'para_id': '{}_para{}'.format(doc_id.doc, para_idx),
                    'char_start': para_span.char_start,
                    'char_end': para_span.char_end,
                }
                # data dependent on the paragraph <-> EDU mapping;
                # should probably have its own dataframe etc.
                if para_idx in edu2para_codom:
                    row.update({
                        'edu_start': para_edu_starts[para_idx],
                        'edu_end': para_edu_ends[para_idx],
                        # computed column
                        'edu_len': (para_edu_ends[para_idx]
                                    - para_edu_starts[para_idx]) + 1,
                    })

                    # use paragraph <-> RST tree alignment
                    if row['edu_len'] > 1:  # complex paragraphs only
                        row.update({
                            'leaky': ((para_edu_starts[para_idx],
                                       para_edu_ends[para_idx])
                                      not in rst_tree_node_spans),
                        })
                    else:
                        row.update({'leaky': False})
                    # WIP find for each leaky paragraph the smallest
                    # RST subtree that covers it
                    if row['leaky']:
                        for edu_span in rst_tree_node_spans_by_len:
                            if (edu_span[0] <= para_edu_starts[para_idx]
                                    and (para_edu_ends[para_idx]
                                         <= edu_span[1])):
                                parent_span = edu_span
                                break
                        else:
                            raise ValueError(
                                'No minimal spanning node for '
                                '{}'.format(row))
                        # add info to row
                        row.update({
                            # parent span, in EDUs
                            'parent_edu_start': parent_span[0],
                            'parent_edu_end': parent_span[1],
                        })
                        # length of the parent span, in paragraphs;
                        # guarded because edu2para is None for EDUs
                        # that could not be aligned with a paragraph
                        if (edu2para[parent_span[1] - 1] is not None
                                and (edu2para[parent_span[0] - 1]
                                     is not None)):
                            row.update({
                                'parent_para_len': (
                                    edu2para[parent_span[1] - 1]
                                    - edu2para[parent_span[0] - 1] + 1),
                                # distance between the current
                                # paragraph and the most remote
                                # paragraph covered by the parent span,
                                # in paragraphs
                                'parent_para_dist': max(
                                    edu2para[parent_span[1] - 1]
                                    - para_idx,
                                    para_idx
                                    - edu2para[parent_span[0] - 1]),
                            })
                    else:
                        row.update({
                            # for non-leaky paragraphs, the parent span
                            # is the paragraph span itself
                            'parent_edu_start': row['edu_start'],
                            'parent_edu_end': row['edu_end'],
                        })
                    # end WIP
                doc_para_rows.append(row)

        # NB: these are leaky sentences wrt the original constituency
        # trees; leaky sentences wrt the binarized constituency trees
        # might be different (TODO), and similarly for the dependency
        # trees (TODO too).
        # I should count them, see whether the ~5% Joty mentions is on
        # the original or the binarized ctrees, and compare with the
        # number of leaky sentences for deptrees; I suspect the latter
        # will be much lower...
        # HYPOTHESIS: (some or all?) leaky sentences in ctrees
        # correspond to cases where nodes that are not the head of
        # their sentence have dependents in other sentences.
        # This would capture the set (or a subset) of edges that fall
        # outside of the search space for the "iheads" intra/inter
        # strategy.

        # add doc entries to corpus entries
        para_rows.extend(doc_para_rows)
        sent_rows.extend(doc_sent_rows)
        rel_rows.extend(doc_span_rows)
        edu_rows.extend(doc_edu_rows)

    # turn lists into DataFrames
    # NB: node_rows is currently never filled, so node_df is returned
    # empty (TODO)
    node_df = pd.DataFrame(node_rows)
    rel_df = pd.DataFrame(rel_rows)
    edu_df = pd.DataFrame(edu_rows)
    sent_df = pd.DataFrame(sent_rows)
    para_df = pd.DataFrame(para_rows)
    # add calculated columns here? (leaky and complex sentences)
    return node_df, rel_df, edu_df, sent_df, para_df
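# Worked illustration, on hypothetical data, of the index arithmetic
# used in step 2 above: edu2sent maps 0-based EDU indices to 0-based
# sentence indices, while EDU numbering in the DataFrames is 1-based,
# hence the "+ 1" offsets; the last EDU of a sentence is found by
# searching the reversed list. This helper is not called by the loader.
def _sent_edu_bounds_example():
    edu2sent = [0, 0, 1, 1, 1, 2]  # 6 EDUs spread over 3 sentences
    n_sents = 3
    codom = set(i for i in edu2sent if i is not None)
    starts = [(edu2sent.index(i) + 1 if i in codom else None)
              for i in range(n_sents)]
    ends = [(len(edu2sent) - 1 - edu2sent[::-1].index(i) + 1
             if i in codom else None)
            for i in range(n_sents)]
    assert starts == [1, 3, 6]  # first EDU of each sentence, 1-based
    assert ends == [2, 5, 6]  # last EDU of each sentence, 1-based


# The leaky-type decision of step 2, restated as a pure function to
# make the (van der Vliet et al. 2011) taxonomy explicit; again a
# hypothetical summary, not code the loader calls.
def _leaky_type_example(same_level, coordinative):
    """same_level: no RST span straddles a sentence boundary (types 1,
    2); coordinative: a multinuclear relation occurs on the path from
    the parent span down to the sentence's EDUs (types 1, 3)."""
    if same_level:
        return 1 if coordinative else 2
    else:
        return 3 if coordinative else 4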
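# Hedged usage sketch for the loader above: the share of complex
# (2+ EDU) sentences that are leaky, to compare with the ~5% figure
# mentioned in the notes. Column names follow the rows built above;
# to_numeric/fillna guard against the None entries produced for
# sentences missing from the PTB alignment.
if __name__ == '__main__':
    node_df, rel_df, edu_df, sent_df, para_df = (
        load_corpus_as_dataframe_new(selection='train', binarize=False))
    edu_len = pd.to_numeric(sent_df['edu_len'], errors='coerce')
    complex_sents = sent_df[edu_len > 1]
    leaky = complex_sents['leaky'].fillna(False).astype(bool)
    print('leaky: {:.1%} of complex sentences'.format(leaky.mean()))
    # breakdown by leaky type (0 = non-leaky, 1-4 = taxonomy above)
    print(complex_sents['leaky_type'].value_counts(dropna=False))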