示例#1
0
文件: tests.py 项目: chloebt/educe
 def test_binarize(self):
     """Binarizing each test tree must yield a binary tree."""
     # the dict key (tree name) is not used, so bind it to '_' per convention
     for _, tree in self._test_trees().items():
         bin_tree = annotation._binarize(tree)
         self.assertTrue(annotation.is_binary(bin_tree))
示例#2
0
 def test_binarize(self):
     """Binarizing each test tree must yield a binary tree."""
     # keys are irrelevant here, so iterate directly over the trees
     for src_tree in self._test_trees().values():
         self.assertTrue(
             annotation.is_binary(annotation._binarize(src_tree)))
示例#3
0
def load_corpus_as_dataframe_new(selection='train', binarize=False,
                                 verbose=0):
    """Load a section of the RST-WSJ corpus as pandas.DataFrames.

    Parameters
    ----------
    selection : one of {'train', 'test'}  TODO: add 'both'
        Select the part of the corpus to load.

    binarize : boolean, default: False
        If True, apply right-heavy binarization on RST trees.

    verbose : int, default: 0
        If > 0, print debug information about leaky sentences.

    Returns
    -------
    node_df: pandas.DataFrame
        DataFrame of all nodes from the constituency trees.
        NOTE(review): currently always empty, node_rows is never filled.
    rel_df: pandas.DataFrame
        DataFrame of all relations.
    edu_df: pandas.DataFrame
        DataFrame of all EDUs.
    sent_df: pandas.DataFrame
        DataFrame of all sentences.
    para_df: pandas.DataFrame
        DataFrame of all paragraphs.

    Raises
    ------
    ValueError
        If `selection` is neither 'train' nor 'test', or if no RST node
        spans a (leaky) sentence or paragraph.

    TODO
    ----
    [ ] intra-sentential-first right-heavy binarization
    [ ] left-heavy binarization (?)
    [ ] add selection='both'

    Notes
    -----
    `selection='both'` can currently be done as:
    ```
    train_df = load_corpus_as_dataframe_new(selection='train')
    test_df = load_corpus_as_dataframe_new(selection='test')
    both_df = train_df.append(test_df)
    ```
    """
    node_rows = []  # list of dicts, one dict per node
    rel_rows = []  # list of dicts, one dict per relation
    # edu_rows contains pre-EDUs rather than EDUs themselves, but maybe
    # conflating both does no harm
    edu_rows = []  # list of dicts, one dict per EDU
    sent_rows = []  # ibid
    para_rows = []  # ibid
    # FIX: strad_rels_rows was appended to below but never initialized,
    # which raised NameError on the first leaky sentence with straddling
    # spans (leaky types 3 and 4)
    strad_rels_rows = []  # list of dicts, one per straddling relation

    if selection == 'train':
        rst_reader = RstReader(CD_TRAIN)
    elif selection == 'test':
        rst_reader = RstReader(CD_TEST)
    else:
        raise ValueError('Unknown selection {}'.format(selection))

    rst_corpus = rst_reader.slurp()

    for doc_id, rtree_ref in sorted(rst_corpus.items()):
        doc_ctx = rtree_ref.label().context
        doc_text = doc_ctx.text()
        doc_edus = rtree_ref.leaves()

        # 0. collect EDUs
        doc_edu_rows = load_edus(doc_edus)

        # 1. collect spans (constituency nodes) from the gold RST trees
        #
        # transform the original RST tree: convert labels to their
        # coarse equivalent, binarize if required
        coarse_rtree_ref = REL_CONV(rtree_ref)
        if binarize:
            coarse_rtree_ref = _binarize(coarse_rtree_ref)
        # collect spans
        doc_span_rows = load_spans(coarse_rtree_ref)

        # prepare this info to find "leaky" substructures:
        # sentences and paragraphs
        # dict of EDU spans to constituent node from the RST tree
        rst_tree_node_spans = {
            (row['edu_start'], row['edu_end']): row['treepos']
            for row in doc_span_rows
        }
        # list of EDU spans of constituent nodes, sorted by length of span
        # then start
        rst_tree_node_spans_by_len = list(sorted(
            rst_tree_node_spans, key=lambda x: (x[1] - x[0], x[0])))

        # 2. Collect sentences
        doc_sent_rows = []
        # use dirty PTB tokenizer + parser
        # NB: the two following lines eat up 67% of total time
        doc_tkd_toks = tokenize_doc_ptb(doc_id, doc_text)
        doc_tkd_trees = parse_doc_ptb(doc_id, doc_tkd_toks)

        # sentence <-> EDU mapping and the information that depends on this
        # mapping might be more appropriate as a separate DataFrame
        # align EDUs with sentences
        edu2sent = align_edus_with_sentences(doc_edus, doc_tkd_trees,
                                             strict=False)
        # get the codomain of edu2sent
        # if we want to be strict, we can assert that the codomain is
        # a gapless interval
        # assert sent_idc == list(range(len(doc_tkd_trees)))
        # this assertion is currently known to fail on:
        # * RST-WSJ/TRAINING/wsj_0678.out: wrong sentence segmentation in PTB
        #     (1 sentence is split in 2)
        edu2sent_codom = set([sent_idx for sent_idx in edu2sent
                              if sent_idx is not None])

        # find the index of the first and last EDU of each sentence
        # indices in both lists are offset by 1 to map to real EDU
        # numbering (which is 1-based)
        sent_edu_starts = [(edu2sent.index(i) + 1 if i in edu2sent_codom
                            else None)
                           for i in range(len(doc_tkd_trees))]
        sent_edu_ends = [(len(edu2sent) - 1 - edu2sent[::-1].index(i) + 1
                          if i in edu2sent_codom
                          else None)
                         for i in range(len(doc_tkd_trees))]
        # sentences that don't have their own RST subtree are 'leaky'

        # WIP propagate sentence-EDU mapping to RST tree spans
        for row in doc_span_rows:
            # offset by -1 because edu2sent is 0-based
            row['sent_start'] = edu2sent[row['edu_start'] - 1]
            row['sent_end'] = edu2sent[row['edu_end'] - 1]
            # inferred columns
            # special cases because edu2sent is None for EDUs whose text
            # is missing from PTB
            if ((row['sent_start'] is not None and
                 row['sent_end'] is not None)):
                row['sent_len'] = row['sent_end'] - row['sent_start'] + 1
                row['intra_sent'] = (row['sent_start'] == row['sent_end'])
                row['strad_sent'] = (not row['intra_sent'] and
                                     (not row['edu_start'] in sent_edu_starts or
                                      not row['edu_end'] in sent_edu_ends))
            else:
                row['sent_len'] = None
                row['intra_sent'] = None
                row['strad_sent'] = None
        # end WIP propagate

        # end of sentence <-> EDU mapping et al.

        # iterate over syntactic trees as proxy for sentences
        for sent_idx, tkd_tree in enumerate(doc_tkd_trees):
            row = {
                # data directly from the sentence segmenter
                'sent_id': '{}_sent{}'.format(doc_id.doc, sent_idx),
                'char_start': tkd_tree.span.char_start,
                'char_end': tkd_tree.span.char_end,
            }
            # sentence <-> EDU mapping dependent data
            # should probably have its own dataframe
            # to better handle disagreement between sentence and EDU
            # segmentation, that translates in the following entries
            # as None for missing data
            if sent_idx in edu2sent_codom:
                row.update({
                    'edu_start': sent_edu_starts[sent_idx],
                    'edu_end': sent_edu_ends[sent_idx],
                    # computed column
                    'edu_len': (sent_edu_ends[sent_idx] -
                                sent_edu_starts[sent_idx]) + 1,
                })

                # use alignment: sentence <-> RST tree spans (via EDUs)
                # leaky sentences: complex (2+ EDUs) sentences that don't
                # have a corresponding span in the RST tree
                leaky = (row['edu_len'] > 1 and
                         ((sent_edu_starts[sent_idx],
                           sent_edu_ends[sent_idx])
                          not in rst_tree_node_spans))
                row.update({
                    'leaky': leaky,
                })
                # find for each leaky sentence the smallest RST subtree
                # that covers it
                if row['leaky']:
                    sent_edu_first = sent_edu_starts[sent_idx]
                    sent_edu_last = sent_edu_ends[sent_idx]
                    # find parent span and straddling spans ;
                    # straddling spans exist for type 3 and 4
                    strad_spans = []
                    for edu_span in rst_tree_node_spans_by_len:
                        # parent span
                        if ((edu_span[0] <= sent_edu_first and
                             sent_edu_last <= edu_span[1])):
                            parent_span = edu_span
                            break
                        # straddling spans
                        if ((edu_span[0] < sent_edu_first and
                             sent_edu_first <= edu_span[1]) or
                            (edu_span[0] <= sent_edu_last and
                             sent_edu_last < edu_span[1])):
                            strad_spans.append(edu_span)
                    else:
                        raise ValueError(
                            'No minimal spanning node for {}'.format(row))
                    # leaky types {1, 2} vs {3, 4}:
                    # for types 1 and 2, members of the parent
                    # constituent are "pure" wrt sentence span:
                    # each member is either fully inside or fully
                    # outside the sentence ;
                    # no member straddles either of the sentence
                    # boundaries
                    leaky_type_12 = not strad_spans
                    # DEBUG
                    if verbose:
                        print(doc_id.doc)
                        print(parent_span, strad_spans if strad_spans else '')
                    # end DEBUG

                    # leaky types {1, 3} vs {2, 4}
                    # {1, 3} have at least one coordinative (aka multinuclear)
                    # relation in the chain of spans between the parent span
                    # and the EDU(s) of the leaky sentence ;
                    # {2, 4} have only subordinative (aka mononuclear)
                    # relations
                    leaky_coord = False
                    # first, check the kids of the parent span
                    parent_tpos = rst_tree_node_spans[parent_span]
                    parent_subtree = coarse_rtree_ref[parent_tpos]
                    if all(kid.label().nuclearity == 'Nucleus'
                           for kid in parent_subtree):
                        leaky_coord = True

                    # then, check the kids of all straddling spans
                    strad_rels = []  # TEMPORARY
                    for strad_span in strad_spans:
                        strad_tpos = rst_tree_node_spans[strad_span]
                        strad_subtree = coarse_rtree_ref[strad_tpos]
                        if all(kid.label().nuclearity == 'Nucleus'
                               for kid in strad_subtree):
                            leaky_coord = True
                        # TEMPORARY: store straddling relations (from kids)
                        kid_rels = [kid.label().rel
                                    for kid in strad_subtree
                                    if kid.label().rel != 'span']
                        # if all kids bear the same relation label, store
                        # only this value
                        if len(set(kid_rels)) == 1:
                            kid_rels = kid_rels[0]
                        else:
                            kid_rels = '+'.join(kid_rels)
                        strad_rels.append(kid_rels)
                        # WIP list of straddling relations
                        strad_rels_rows.append({
                            'node_id': '{}_const{}'.format(
                                strad_subtree.origin.doc,
                                '-'.join(str(x) for x in strad_tpos)),
                            'sent_id': '{}_sent{}'.format(
                                doc_id.doc, sent_idx),
                            'kid_rels': kid_rels,
                        })
                        # end WIP running counter
                    # determine type of leaky (ugly)
                    if leaky_type_12:
                        if leaky_coord:
                            leaky_type = 1
                        else:
                            leaky_type = 2
                    else:
                        if leaky_coord:
                            leaky_type = 3
                        else:
                            leaky_type = 4
                    # display type of leaky
                    if verbose:
                        print('Type {} ({}-level {} structure)\t{}'.format(
                            leaky_type,
                            'Same' if leaky_type_12 else 'Multi',
                            'coordination' if leaky_coord else 'subordination',
                            '; '.join(strad_rels)))
                        print()
                    # end WIP nuclearity of straddling spans

                    # add info to row
                    row.update({
                        # parent span, in EDUs
                        'parent_edu_start': parent_span[0],
                        'parent_edu_end': parent_span[1],
                        # length of parent span, in sentences
                        'parent_sent_len': (
                            edu2sent[parent_span[1] - 1] -
                            edu2sent[parent_span[0] - 1] + 1),
                        # distance between the current sentence and the most
                        # remote sentence covered by the parent span,
                        # in sentences
                        'parent_sent_dist': (
                            max([(edu2sent[parent_span[1] - 1] - sent_idx),
                                 (sent_idx - edu2sent[parent_span[0] - 1])])),
                        # types of leaky, in the taxonomy of
                        # (van der Vliet et al. 2011)
                        'leaky_type': leaky_type,
                    })
                else:
                    row.update({
                        'parent_span_start': row['edu_start'],
                        'parent_span_end': row['edu_end'],
                        # default value for leaky_type, provides a mean for
                        # easy comparison on complex sentences, between
                        # non-leaky and the various types of leaky
                        'leaky_type': 0,
                    })
                # end WIP
            doc_sent_rows.append(row)

        # 3. collect paragraphs
        doc_para_rows = []
        doc_paras = doc_ctx.paragraphs
        doc_text = doc_ctx.text()
        # doc_paras is None when the original text has no explicit marking
        # for paragraphs ; this is true for 'fileX' documents in the RST-WSJ
        # corpus
        if doc_paras is not None:
            # EDU to paragraph mapping
            edu2para = align_edus_with_paragraphs(doc_edus, doc_paras,
                                                  doc_text, strict=False)
            edu2para_codom = set([para_idx for para_idx in edu2para
                                  if para_idx is not None])
            # index of the first and last EDU of each paragraph
            para_edu_starts = [(edu2para.index(i) + 1 if i in edu2para_codom
                                else None)
                               for i in range(len(doc_paras))]
            para_edu_ends = [(len(edu2para) - 1 - edu2para[::-1].index(i) + 1
                              if i in edu2para_codom
                              else None)
                             for i in range(len(doc_paras))]
            # paragraphs that don't have their own RST subtree are "leaky" ;
            # end of paragraph <-> EDU mapping et al.

            # iterate over paragraphs
            for para_idx, para in enumerate(doc_paras):
                # dirty, educe.rst_dt.text.Paragraph should have a span
                para_span = Span(para.sentences[0].span.char_start,
                                 para.sentences[-1].span.char_end)
                # end dirty
                row = {
                    # data directly from the paragraph segmenter
                    'para_id': '{}_para{}'.format(doc_id.doc, para_idx),
                    'char_start': para_span.char_start,
                    'char_end': para_span.char_end,
                }
                # paragraph <-> EDU mapping dependent data
                # should probably have its own dataframe etc.
                if para_idx in edu2para_codom:
                    row.update({
                        'edu_start': para_edu_starts[para_idx],
                        'edu_end': para_edu_ends[para_idx],
                        # computed column
                        'edu_len': (para_edu_ends[para_idx] -
                                    para_edu_starts[para_idx]) + 1,
                    })
                    # use paragraph <-> RST tree alignment
                    if row['edu_len'] > 1:  # complex paragraphs only
                        row.update({
                            'leaky': ((para_edu_starts[para_idx],
                                       para_edu_ends[para_idx])
                                      not in rst_tree_node_spans),
                        })
                    else:
                        row.update({'leaky': False})
                    # WIP find for each leaky paragraph the smallest RST
                    # subtree that covers it
                    if row['leaky']:
                        for edu_span in rst_tree_node_spans_by_len:
                            if ((edu_span[0] <= para_edu_starts[para_idx] and
                                 para_edu_ends[para_idx] <= edu_span[1])):
                                parent_span = edu_span
                                break
                        else:
                            raise ValueError(
                                'No minimal spanning node for {}'.format(row))
                        # add info to row
                        row.update({
                            # parent span, on EDUs
                            'parent_edu_start': parent_span[0],
                            'parent_edu_end': parent_span[1]
                        })
                        # length of parent span, in paragraphs
                        if ((edu2para[parent_span[1] - 1] is not None and
                             edu2para[parent_span[0] - 1] is not None)):
                            row.update({
                                'parent_para_len': (
                                    edu2para[parent_span[1] - 1] -
                                    edu2para[parent_span[0] - 1] + 1),
                                # distance between the current paragraph and the
                                # most remote paragraph covered by the parent
                                # span, in paragraphs
                                'parent_para_dist': (
                                    max([(edu2para[parent_span[1] - 1] -
                                          para_idx),
                                         (para_idx -
                                          edu2para[parent_span[0] - 1])])),
                            })
                    else:
                        row.update({
                            'parent_edu_start': row['edu_start'],
                            'parent_edu_end': row['edu_end'],
                        })
                    # end WIP
                doc_para_rows.append(row)

        # NB: these are leaky sentences wrt the original constituency
        # trees ; leaky sentences wrt the binarized constituency trees
        # might be different (TODO), similarly for the dependency trees
        # (TODO too) ;
        # I should count them, see if the ~5% Joty mentions are on the
        # original or binarized ctrees, and compare with the number of
        # leaky for deptrees ; I suspect the latter will be much lower...
        # HYPOTHESIS: (some or all?) leaky sentences in ctrees correspond
        # to cases where nodes that are not the head of their sentence
        # have dependents in other sentences
        # this would capture the set (or a subset) of edges that fall
        # outside of the search space for the "iheads" intra/inter
        # strategy

        # add doc entries to corpus entries
        para_rows.extend(doc_para_rows)
        sent_rows.extend(doc_sent_rows)
        rel_rows.extend(doc_span_rows)
        edu_rows.extend(doc_edu_rows)

    # turn list into a DataFrame
    node_df = pd.DataFrame(node_rows)
    rel_df = pd.DataFrame(rel_rows)
    edu_df = pd.DataFrame(edu_rows)
    sent_df = pd.DataFrame(sent_rows)
    para_df = pd.DataFrame(para_rows)
    # add calculated columns here? (leaky and complex sentences)

    return node_df, rel_df, edu_df, sent_df, para_df
示例#4
0
def load_corpus_as_dataframe_new(selection='train', binarize=False, verbose=0):
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Parameters
    ----------
    selection : one of {'train', 'test'}  TODO: add 'both'
        Select the part of the corpus to load.

    binarize : boolean, default: False
        If True, apply right-heavy binarization on RST trees.

    Returns
    -------
    node_df: pandas.DataFrame
        DataFrame of all nodes from the constituency trees.
    rel_df: pandas.DataFrame
        DataFrame of all relations.
    edu_df: pandas.DataFrame
        DataFrame of all EDUs.

    TODO
    ----
    [ ] intra-sentential-first right-heavy binarization
    [ ] left-heavy binarization (?)
    [ ] add selection='both'

    Notes
    -----
    `selection='both'` can currently be done as:
    ```
    train_df = load_corpus_as_dataframe_new(selection='train')
    test_df = load_corpus_as_dataframe_new(selection='test')
    both_df = train_df.append(test_df)
    ```
    """
    node_rows = []  # list of dicts, one dict per node
    rel_rows = []  # list of dicts, one dict per relation
    # edu_rows contains pre-EDUs rather than EDUs themselves, but maybe
    # conflating both does no harm
    edu_rows = []  # list of dicts, one dict per EDU
    sent_rows = []  # ibid
    para_rows = []  # ibid

    if selection == 'train':
        rst_reader = RstReader(CD_TRAIN)
    elif selection == 'test':
        rst_reader = RstReader(CD_TEST)
    else:
        raise ValueError('Unknown selection {}'.format(selection))

    rst_corpus = rst_reader.slurp()

    for doc_id, rtree_ref in sorted(rst_corpus.items()):
        doc_ctx = rtree_ref.label().context
        doc_text = doc_ctx.text()
        doc_edus = rtree_ref.leaves()

        # 0. collect EDUs
        doc_edu_rows = load_edus(doc_edus)

        # 1. collect spans (constituency nodes) from the gold RST trees
        #
        # transform the original RST tree: convert labels to their
        # coarse equivalent, binarize if required
        coarse_rtree_ref = REL_CONV(rtree_ref)
        if binarize:
            coarse_rtree_ref = _binarize(coarse_rtree_ref)
        # collect spans
        doc_span_rows = load_spans(coarse_rtree_ref)

        # prepare this info to find "leaky" substructures:
        # sentences and paragraphs
        # dict of EDU spans to constituent node from the RST tree
        rst_tree_node_spans = {(row['edu_start'], row['edu_end']):
                               row['treepos']
                               for row in doc_span_rows}
        # list of EDU spans of constituent nodes, sorted by length of span
        # then start
        rst_tree_node_spans_by_len = list(
            sorted(rst_tree_node_spans, key=lambda x: (x[1] - x[0], x[0])))

        # 2. Collect sentences
        doc_sent_rows = []
        # use dirty PTB tokenizer + parser
        # NB: the two following lines eat up 67% of total time
        doc_tkd_toks = tokenize_doc_ptb(doc_id, doc_text)
        doc_tkd_trees = parse_doc_ptb(doc_id, doc_tkd_toks)

        # sentence <-> EDU mapping and the information that depends on this
        # mapping might be more appropriate as a separate DataFrame
        # align EDUs with sentences
        edu2sent = align_edus_with_sentences(doc_edus,
                                             doc_tkd_trees,
                                             strict=False)
        # get the codomain of edu2sent
        # if we want to be strict, we can assert that the codomain is
        # a gapless interval
        # assert sent_idc == list(range(len(doc_tkd_trees)))
        # this assertion is currently known to fail on:
        # * RST-WSJ/TRAINING/wsj_0678.out: wrong sentence segmentation in PTB
        #     (1 sentence is split in 2)
        edu2sent_codom = set(
            [sent_idx for sent_idx in edu2sent if sent_idx is not None])

        # find the index of the first and last EDU of each sentence
        # indices in both lists are offset by 1 to map to real EDU
        # numbering (which is 1-based)
        sent_edu_starts = [
            (edu2sent.index(i) + 1 if i in edu2sent_codom else None)
            for i in range(len(doc_tkd_trees))
        ]
        sent_edu_ends = [(len(edu2sent) - 1 - edu2sent[::-1].index(i) +
                          1 if i in edu2sent_codom else None)
                         for i in range(len(doc_tkd_trees))]
        # sentences that don't have their own RST subtree are 'leaky'

        # WIP propagate sentence-EDU mapping to RST tree spans
        for row in doc_span_rows:
            # offset by -1 because edu2sent is 0-based
            row['sent_start'] = edu2sent[row['edu_start'] - 1]
            row['sent_end'] = edu2sent[row['edu_end'] - 1]
            # inferred columns
            # special cases because edu2sent is None for EDUs whose text
            # is missing from PTB
            if ((row['sent_start'] is not None
                 and row['sent_end'] is not None)):
                row['sent_len'] = row['sent_end'] - row['sent_start'] + 1
                row['intra_sent'] = (row['sent_start'] == row['sent_end'])
                row['strad_sent'] = (not row['intra_sent'] and
                                     (not row['edu_start'] in sent_edu_starts
                                      or not row['edu_end'] in sent_edu_ends))
            else:
                row['sent_len'] = None
                row['intra_sent'] = None
                row['strad_sent'] = None
        # end WIP propagate

        # end of sentence <-> EDU mapping et al.

        # iterate over syntactic trees as proxy for sentences
        for sent_idx, tkd_tree in enumerate(doc_tkd_trees):
            row = {
                # data directly from the sentence segmenter
                'sent_id': '{}_sent{}'.format(doc_id.doc, sent_idx),
                'char_start': tkd_tree.span.char_start,
                'char_end': tkd_tree.span.char_end,
            }
            # sentence <-> EDU mapping dependent data
            # should probably have its own dataframe
            # to better handle disagreement between sentence and EDU
            # segmentation, that translates in the following entries
            # as None for missing data
            if sent_idx in edu2sent_codom:
                row.update({
                    'edu_start':
                    sent_edu_starts[sent_idx],
                    'edu_end':
                    sent_edu_ends[sent_idx],
                    # computed column
                    'edu_len':
                    (sent_edu_ends[sent_idx] - sent_edu_starts[sent_idx]) + 1,
                })

                # use alignment: sentence <-> RST tree spans (via EDUs)
                # leaky sentences: complex (2+ EDUs) sentences that don't
                # have a corresponding span in the RST tree
                leaky = (row['edu_len'] > 1 and
                         ((sent_edu_starts[sent_idx], sent_edu_ends[sent_idx])
                          not in rst_tree_node_spans))
                row.update({
                    'leaky': leaky,
                })
                # find for each leaky sentence the smallest RST subtree
                # that covers it
                if row['leaky']:
                    sent_edu_first = sent_edu_starts[sent_idx]
                    sent_edu_last = sent_edu_ends[sent_idx]
                    # find parent span and straddling spans ;
                    # straddling spans exist for type 3 and 4
                    strad_spans = []
                    for edu_span in rst_tree_node_spans_by_len:
                        # parent span
                        if ((edu_span[0] <= sent_edu_first
                             and sent_edu_last <= edu_span[1])):
                            parent_span = edu_span
                            break
                        # straddling spans
                        if ((edu_span[0] < sent_edu_first
                             and sent_edu_first <= edu_span[1])
                                or (edu_span[0] <= sent_edu_last
                                    and sent_edu_last < edu_span[1])):
                            strad_spans.append(edu_span)
                    else:
                        raise ValueError(
                            'No minimal spanning node for {}'.format(row))
                    # leaky types {1, 2} vs {3, 4}:
                    # for types 1 and 2, members of the parent
                    # constituent are "pure" wrt sentence span:
                    # each member is either fully inside or fully
                    # outside the sentence ;
                    # no member straddles either of the sentence
                    # boundaries
                    leaky_type_12 = not strad_spans
                    # DEBUG
                    if verbose:
                        print(doc_id.doc)
                        print(parent_span, strad_spans if strad_spans else '')
                    # end DEBUG

                    # leaky types {1, 3} vs {2, 4}
                    # {1, 3} have at least one coordinative (aka multinuclear)
                    # relation in the chain of spans between the parent span
                    # and the EDU(s) of the leaky sentence ;
                    # {2, 4} have only subordinative (aka mononuclear)
                    # relations
                    # leaky_coord <=> coordinative (multinuclear) case:
                    # the parent span, or any straddling span, has only
                    # 'Nucleus' kids
                    leaky_coord = False
                    # first, check the kids of the parent span
                    # NOTE(review): rst_tree_node_spans, parent_span,
                    # coarse_rtree_ref, strad_spans and leaky_type_12 are
                    # all defined earlier in this (long) function, outside
                    # this excerpt
                    parent_tpos = rst_tree_node_spans[parent_span]
                    parent_subtree = coarse_rtree_ref[parent_tpos]
                    if all(kid.label().nuclearity == 'Nucleus'
                           for kid in parent_subtree):
                        leaky_coord = True

                    # then, check the kids of all straddling spans
                    strad_rels = []  # TEMPORARY
                    for strad_span in strad_spans:
                        strad_tpos = rst_tree_node_spans[strad_span]
                        strad_subtree = coarse_rtree_ref[strad_tpos]
                        if all(kid.label().nuclearity == 'Nucleus'
                               for kid in strad_subtree):
                            leaky_coord = True
                        # TEMPORARY: store straddling relations (from kids)
                        # 'span' is the placeholder label on the Nucleus
                        # side of a mononuclear relation, hence excluded
                        kid_rels = [
                            kid.label().rel for kid in strad_subtree
                            if kid.label().rel != 'span'
                        ]
                        # if all kids bear the same relation label, store
                        # only this value
                        if len(set(kid_rels)) == 1:
                            kid_rels = kid_rels[0]
                        else:
                            # otherwise keep all labels, in kid order,
                            # joined with '+'
                            kid_rels = '+'.join(kid_rels)
                        strad_rels.append(kid_rels)
                        # WIP list of straddling relations
                        strad_rels_rows.append({
                            'node_id':
                            '{}_const{}'.format(
                                strad_subtree.origin.doc,
                                '-'.join(str(x) for x in strad_tpos)),
                            'sent_id':
                            '{}_sent{}'.format(doc_id.doc, sent_idx),
                            'kid_rels':
                            kid_rels,
                        })
                        # end WIP running counter
                    # determine type of leaky (ugly) ;
                    # taxonomy (van der Vliet et al. 2011, cf. the
                    # verbose display below):
                    #   1: same-level coordination
                    #   2: same-level subordination
                    #   3: multi-level coordination
                    #   4: multi-level subordination
                    if leaky_type_12:
                        if leaky_coord:
                            leaky_type = 1
                        else:
                            leaky_type = 2
                    else:
                        if leaky_coord:
                            leaky_type = 3
                        else:
                            leaky_type = 4
                    # display type of leaky
                    if verbose:
                        print('Type {} ({}-level {} structure)\t{}'.format(
                            leaky_type, 'Same' if leaky_type_12 else 'Multi',
                            'coordination' if leaky_coord else 'subordination',
                            '; '.join(strad_rels)))
                        print()
                    # end WIP nuclearity of straddling spans

                    # add info to row
                    row.update({
                        # parent span, in EDUs
                        'parent_edu_start':
                        parent_span[0],
                        'parent_edu_end':
                        parent_span[1],
                        # length of parent span, in sentences ;
                        # NB: parent_span EDU indices are 1-based, hence
                        # the - 1 when indexing into edu2sent
                        'parent_sent_len': (edu2sent[parent_span[1] - 1] -
                                            edu2sent[parent_span[0] - 1] + 1),
                        # distance between the current sentence and the most
                        # remote sentence covered by the parent span,
                        # in sentences
                        'parent_sent_dist':
                        (max([(edu2sent[parent_span[1] - 1] - sent_idx),
                              (sent_idx - edu2sent[parent_span[0] - 1])])),
                        # types of leaky, in the taxonomy of
                        # (van der Vliet et al. 2011)
                        'leaky_type':
                        leaky_type,
                    })
                else:
                    # non-leaky sentence: the sentence is its own parent
                    row.update({
                        'parent_span_start': row['edu_start'],
                        'parent_span_end': row['edu_end'],
                        # default value for leaky_type, provides a mean for
                        # easy comparison on complex sentences, between
                        # non-leaky and the various types of leaky
                        'leaky_type': 0,
                    })
                # end WIP
            doc_sent_rows.append(row)

        # 3. collect paragraphs
        doc_para_rows = []
        doc_paras = doc_ctx.paragraphs
        doc_text = doc_ctx.text()
        # doc_paras is None when the original text has no explicit marking
        # for paragraphs ; this is true for 'fileX' documents in the RST-WSJ
        # corpus
        if doc_paras is not None:
            # EDU to paragraph mapping ; entries of edu2para may be None
            # for EDUs that could not be aligned (strict=False), see the
            # filter just below
            edu2para = align_edus_with_paragraphs(doc_edus,
                                                  doc_paras,
                                                  doc_text,
                                                  strict=False)
            # set of paragraph indices that have at least one aligned EDU
            edu2para_codom = set(
                [para_idx for para_idx in edu2para if para_idx is not None])
            # index of the first and last EDU of each paragraph ;
            # NB: the stored EDU indices are 1-based (hence the +1), to
            # match the edu_start/edu_end convention used in these rows
            para_edu_starts = [
                (edu2para.index(i) + 1 if i in edu2para_codom else None)
                for i in range(len(doc_paras))
            ]
            # last EDU: position of the last occurrence of i, found by
            # searching the reversed list
            para_edu_ends = [(len(edu2para) - 1 - edu2para[::-1].index(i) +
                              1 if i in edu2para_codom else None)
                             for i in range(len(doc_paras))]
            # paragraphs that don't have their own RST subtree are "leaky" ;
            # end of paragraph <-> EDU mapping et al.

            # iterate over paragraphs
            for para_idx, para in enumerate(doc_paras):
                # dirty, educe.rst_dt.text.Paragraph should have a span
                para_span = Span(para.sentences[0].span.char_start,
                                 para.sentences[-1].span.char_end)
                # end dirty
                row = {
                    # data directly from the paragraph segmenter
                    'para_id': '{}_para{}'.format(doc_id.doc, para_idx),
                    'char_start': para_span.char_start,
                    'char_end': para_span.char_end,
                }
                # paragraph <-> EDU mapping dependent data
                # should probably have its own dataframe etc.
                if para_idx in edu2para_codom:
                    row.update({
                        'edu_start':
                        para_edu_starts[para_idx],
                        'edu_end':
                        para_edu_ends[para_idx],
                        # computed column
                        'edu_len':
                        (para_edu_ends[para_idx] - para_edu_starts[para_idx]) +
                        1,
                    })
                    # use paragraph <-> RST tree alignment ;
                    # a paragraph is "leaky" iff its exact EDU span is not
                    # a node of the RST tree
                    if row['edu_len'] > 1:  # complex paragraphs only
                        row.update({
                            'leaky': ((para_edu_starts[para_idx],
                                       para_edu_ends[para_idx])
                                      not in rst_tree_node_spans),
                        })
                    else:
                        row.update({'leaky': False})
                    # WIP find for each leaky paragraph the smallest RST
                    # subtree that covers it ; rst_tree_node_spans_by_len
                    # is assumed sorted by increasing span length (defined
                    # earlier, outside this excerpt), so the first hit is
                    # the minimal covering span
                    if row['leaky']:
                        for edu_span in rst_tree_node_spans_by_len:
                            if ((edu_span[0] <= para_edu_starts[para_idx]
                                 and para_edu_ends[para_idx] <= edu_span[1])):
                                parent_span = edu_span
                                break
                        else:
                            # for/else: no covering span found at all ;
                            # should not happen as the tree root covers
                            # every EDU
                            raise ValueError(
                                'No minimal spanning node for {}'.format(row))
                        # add info to row
                        row.update({
                            # parent span, on EDUs
                            'parent_edu_start': parent_span[0],
                            'parent_edu_end': parent_span[1]
                        })
                        # length of parent span, in paragraphs ;
                        # guard against EDUs unaligned with any paragraph
                        # (None entries in edu2para)
                        if ((edu2para[parent_span[1] - 1] is not None
                             and edu2para[parent_span[0] - 1] is not None)):
                            row.update({
                                'parent_para_len':
                                (edu2para[parent_span[1] - 1] -
                                 edu2para[parent_span[0] - 1] + 1),
                                # distance between the current paragraph and
                                # the most remote paragraph covered by the
                                # parent span, in paragraphs
                                'parent_para_dist': (max([
                                    (edu2para[parent_span[1] - 1] - para_idx),
                                    (para_idx - edu2para[parent_span[0] - 1])
                                ])),
                            })
                    else:
                        # non-leaky paragraph: the paragraph is its own
                        # parent span
                        row.update({
                            'parent_edu_start': row['edu_start'],
                            'parent_edu_end': row['edu_end'],
                        })
                    # end WIP
                doc_para_rows.append(row)

        # NB: these are leaky sentences wrt the original constituency
        # trees ; leaky sentences wrt the binarized constituency trees
        # might be different (TODO), similarly for the dependency trees
        # (TODO too) ;
        # I should count them, see if the ~5% Joty mentions are on the
        # original or binarized ctrees, and compare with the number of
        # leaky for deptrees ; I suspect the latter will be much lower...
        # HYPOTHESIS: (some or all?) leaky sentences in ctrees correspond
        # to cases where nodes that are not the head of their sentence
        # have dependents in other sentences
        # this would capture the set (or a subset) of edges that fall
        # outside of the search space for the "iheads" intra/inter
        # strategy

        # add doc entries to corpus entries ;
        # NOTE(review): doc_span_rows and doc_edu_rows are built earlier
        # in this function, outside this excerpt
        para_rows.extend(doc_para_rows)
        sent_rows.extend(doc_sent_rows)
        rel_rows.extend(doc_span_rows)
        edu_rows.extend(doc_edu_rows)

    # turn the row lists into DataFrames (one row per dict)
    node_df = pd.DataFrame(node_rows)
    rel_df = pd.DataFrame(rel_rows)
    edu_df = pd.DataFrame(edu_rows)
    sent_df = pd.DataFrame(sent_rows)
    para_df = pd.DataFrame(para_rows)
    # add calculated columns here? (leaky and complex sentences)

    # NOTE(review): the function docstring documents only node_df, rel_df
    # and edu_df in its "Returns" section ; sent_df and para_df are also
    # returned — the docstring should be updated accordingly
    return node_df, rel_df, edu_df, sent_df, para_df