示例#1
0
def dump_dep_rstdt(corpus_dir, out_dir, nary_enc):
    """Convert the RST-DT corpus to dependency trees and dump them.

    The train section is processed first, then the test section. Each
    section is read from its sub-folder of `corpus_dir`, converted, and
    dumped under the sub-folder of `out_dir` that has the same base name.

    Parameters
    ----------
    corpus_dir : str
        Folder that contains the `TRAIN_FOLDER` and `TEST_FOLDER`
        sub-folders.
    out_dir : str
        Folder under which the converted sections are dumped.
    nary_enc : str
        Encoding of n-ary relations, forwarded to
        `RstDepTree.from_rst_tree`.

    Raises
    ------
    ValueError
        If the folder of a section is not an existing directory.
    """
    for section_folder in (TRAIN_FOLDER, TEST_FOLDER):
        section_dir = os.path.join(corpus_dir, section_folder)
        if not os.path.isdir(section_dir):
            raise ValueError('No such folder: {}'.format(section_dir))
        # read all the RST trees of this section
        ctrees = Reader(section_dir).slurp()
        # convert each of them to a dependency tree
        dtrees = {}
        for doc_name, rst_tree in ctrees.items():
            dtrees[doc_name] = RstDepTree.from_rst_tree(
                rst_tree, nary_enc=nary_enc)
        # dump under a folder named after the section folder
        dump_disdep_files(
            dtrees.values(),
            os.path.join(out_dir, os.path.basename(section_dir)))
示例#2
0
文件: tests.py 项目: eipiplusun/educe
    def test_rst_to_dt(self):
        """Convert simple RST trees to dependency trees and back.

        The hand-written lightweight trees must come back unchanged.
        For the trees returned by `self._test_trees()`, only the `span`
        and `edu_span` of `treenode(...)` are compared before and after
        the round-trip.
        """
        lw_trees = [
            "(R:rel (S x) (N y))",
            """
                    (R:rel
                        (S x)
                        (N:rel (N h) (S t)))
                    """,
            """
                    (R:r
                        (S x)
                        (N:r (N:r (S t1) (N h))
                             (S t2)))
                    """,
        ]

        # hand-written trees: exact round-trip
        for lw_str in lw_trees:
            srtree_in = parse_lightweight_tree(lw_str)
            dtree = RstDepTree.from_simple_rst_tree(srtree_in)
            srtree_out = deptree_to_simple_rst_tree(dtree)
            self.assertEqual(srtree_in, srtree_out,
                             "round-trip on " + lw_str)

        # test trees: same span and same EDU span after the round-trip
        checked_attrs = (
            ("span", "span equality on "),
            ("edu_span", "edu span equality on "),
        )
        for doc_name, ctree in self._test_trees().items():
            srtree_in = SimpleRSTTree.from_rst_tree(ctree)
            dtree = RstDepTree.from_simple_rst_tree(srtree_in)
            srtree_out = deptree_to_simple_rst_tree(dtree)
            for attr, msg_prefix in checked_attrs:
                self.assertEqual(getattr(treenode(srtree_in), attr),
                                 getattr(treenode(srtree_out), attr),
                                 msg_prefix + doc_name)
示例#3
0
    def test_rst_to_dt(self):
        """Round-trip simple RST trees through dependency trees.

        Lightweight trees written by hand are expected to be equal to
        their round-tripped version ; for the trees provided by
        `self._test_trees()`, the comparison is restricted to the `span`
        and `edu_span` attributes of `treenode(...)`.
        """

        def round_trip(srtree):
            """Convert a simple RST tree to a d-tree, then back."""
            return deptree_to_simple_rst_tree(
                RstDepTree.from_simple_rst_tree(srtree))

        two_edus = "(R:rel (S x) (N y))"
        three_edus = """
                    (R:rel
                        (S x)
                        (N:rel (N h) (S t)))
                    """
        four_edus = """
                    (R:r
                        (S x)
                        (N:r (N:r (S t1) (N h))
                             (S t2)))
                    """

        for lw_str in (two_edus, three_edus, four_edus):
            expected = parse_lightweight_tree(lw_str)
            actual = round_trip(expected)
            self.assertEqual(expected, actual, "round-trip on " + lw_str)

        for doc_name, ctree in self._test_trees().items():
            before = SimpleRSTTree.from_rst_tree(ctree)
            after = round_trip(before)
            # text span
            self.assertEqual(treenode(before).span,
                             treenode(after).span,
                             "span equality on " + doc_name)
            # EDU span
            self.assertEqual(treenode(before).edu_span,
                             treenode(after).edu_span,
                             "edu span equality on " + doc_name)
示例#4
0
def convert(corpus, multinuclear, odir):
    """
    Dump every RST tree of the corpus in three successive forms,
    each in its own sub-folder of `odir`: the simple RST tree
    ("rst-binarised"), the dependency tree built from it
    ("rst-to-dt") and the simple RST tree rebuilt from this
    dependency tree ("dt-to-rst"), `multinuclear` being passed on
    to this reverse conversion.

    Missing sub-folders are created ; each file is named after the
    `doc` attribute of the corpus key, without its extension.
    """
    bin_dir, dt_dir, rst2_dir = [
        os.path.join(odir, name)
        for name in ("rst-binarised", "rst-to-dt", "dt-to-rst")
    ]
    # create the output folders that do not exist yet
    for folder in (bin_dir, dt_dir, rst2_dir):
        if os.path.exists(folder):
            continue
        os.makedirs(folder)

    def dump(tree, folder, fname):
        """Write the string form of `tree` to the file `folder/fname`."""
        with open(os.path.join(folder, fname), 'w') as fout:
            fout.write(str(tree))

    for key in corpus:
        fname = os.path.splitext(key.doc)[0]

        # c-tree => simple RST tree
        simple_tree = educe.rst_dt.SimpleRSTTree.from_rst_tree(corpus[key])
        dump(simple_tree, bin_dir, fname)

        # simple RST tree => d-tree
        dep_tree = RstDepTree.from_simple_rst_tree(simple_tree)
        dump(dep_tree, dt_dir, fname)

        # d-tree => simple RST tree
        simple_tree_back = deptree_to_simple_rst_tree(dep_tree, multinuclear)
        dump(simple_tree_back, rst2_dir, fname)
示例#5
0
def convert(corpus, multinuclear, odir):
    """
    Convert every RST tree of the corpus to a simple RST tree, then
    to a dependency tree, then back to a simple RST tree, and write
    the string form of each of the three to the sub-folders
    "rst-binarised", "rst-to-dt" and "dt-to-rst" of `odir`
    (created if missing).

    NB: `multinuclear` is accepted but not used in the body of this
    function.
    """
    subdirs = []
    for dir_name in ("rst-binarised", "rst-to-dt", "dt-to-rst"):
        subdir = os.path.join(odir, dir_name)
        if not os.path.exists(subdir):
            os.makedirs(subdir)
        subdirs.append(subdir)
    bin_dir, dt_dir, rst2_dir = subdirs

    for key in corpus:
        # output files are named after the document, minus its extension
        stem, _ = os.path.splitext(key.doc)
        bin_path = os.path.join(bin_dir, stem)
        dt_path = os.path.join(dt_dir, stem)
        rst2_path = os.path.join(rst2_dir, stem)

        simple_tree = SimpleRSTTree.from_rst_tree(corpus[key])
        with open(bin_path, 'w') as f_bin:
            f_bin.write(str(simple_tree))

        dep_tree = RstDepTree.from_simple_rst_tree(simple_tree)
        with open(dt_path, 'w') as f_dt:
            f_dt.write(str(dep_tree))

        simple_tree_back = deptree_to_simple_rst_tree(dep_tree)
        with open(rst2_path, 'w') as f_rst2:
            f_rst2.write(str(simple_tree_back))
示例#6
0
    def test_rst_to_dt_nuclearity_loss(self):
        """
        Check that converting to a dependency tree and back still
        yields a sane tree structure when nuclearity is lost
        """
        with_satellite = """
                 (R:r (S t) (N h))
                 """

        nuclei_only = """
                (R:r (N t) (N h))
                """

        # A deeper pair of trees, currently disabled:
        #
        # with_satellite = """
        #                  (R:r
        #                      (S x)
        #                      (N:r (N:r (S t1) (N h))
        #                           (S t2)))
        #                  """
        #
        # nuclei_only = """
        #               (R:r
        #                   (N x)
        #                   (N:r (N:r (N t1) (N h))
        #                        (N t2)))
        #               """

        srtree_nuc = parse_lightweight_tree(nuclei_only)
        srtree_sat = parse_lightweight_tree(with_satellite)

        # sanity check: the tree that only has nuclei must survive the
        # round-trip (the reverse conversion used to be called with
        # ['r'] as a second argument)
        dtree_nuc = RstDepTree.from_simple_rst_tree(srtree_nuc)
        back_nuc = deptree_to_simple_rst_tree(dtree_nuc)
        self.assertEqual(srtree_nuc, back_nuc,
                         "same structure " + nuclei_only)

        # the real test: round-trip on the tree that has a satellite ;
        # the result is not checked in the visible part of this test
        dtree_sat = RstDepTree.from_simple_rst_tree(srtree_sat)
        back_sat = deptree_to_simple_rst_tree(dtree_sat)
示例#7
0
文件: tests.py 项目: eipiplusun/educe
    def test_rst_to_dt_nuclearity_loss(self):
        """
        Round-trip two lightweight trees that only differ in the
        nuclearity of one node, to check that the tree structure
        stays sane when nuclearity is lost
        """
        lw_tricky = """
                 (R:r (S t) (N h))
                 """

        lw_nuked = """
                (R:r (N t) (N h))
                """

        # Disabled alternative: the same test on deeper trees.
        #
        #   lw_tricky = """
        #               (R:r
        #                   (S x)
        #                   (N:r (N:r (S t1) (N h))
        #                        (S t2)))
        #               """
        #
        #   lw_nuked = """
        #              (R:r
        #                  (N x)
        #                  (N:r (N:r (N t1) (N h))
        #                       (N t2)))
        #              """

        nuked_tree = parse_lightweight_tree(lw_nuked)
        tricky_tree = parse_lightweight_tree(lw_tricky)

        # first, make sure that the tree without any satellite is left
        # unchanged by the round-trip (the second conversion was formerly
        # given ['r'] as an extra argument)
        nuked_back = deptree_to_simple_rst_tree(
            RstDepTree.from_simple_rst_tree(nuked_tree))
        self.assertEqual(nuked_tree, nuked_back,
                         "same structure " + lw_nuked)

        # then run the round-trip on the tricky tree ; no assertion on
        # its result is visible in this part of the test
        tricky_dep = RstDepTree.from_simple_rst_tree(tricky_tree)
        tricky_back = deptree_to_simple_rst_tree(tricky_dep)
示例#8
0
    def test_dt_to_rst_order(self):
        """Convert d-trees back to simple RST trees under several orders.

        Each lightweight tree is converted to a dependency tree. The
        unmodified dependency tree must convert back to the original
        tree. The conversion is then also run on two deep copies, one
        whose `deps(0)` list is reversed and one whose `deps(0)` list is
        shuffled ; nothing is asserted on these two results.
        """
        nested_left_then_right = """
            (R:r
              (N:r
                (N:r (S l2)
                     (N:r (S l1)
                          (N h)))
                (S r1))
              (S r2))
            """  # ((l2 <- l1 <- h) -> r1 -> r2)
        interleaved = """
            (R:r
              (N:r
                (S l2)
                (N:r (N:r (S l1)
                          (N h))
                     (S r1)))
              (S r2))
            """  # (l2 <- ((l1 <- h) -> r1)) -> r2
        lw_trees = [
            "(R:r (N:r (N h) (S r1)) (S r2))",
            "(R:r (S:r (S l2) (N l1)) (N h))",
            "(R:r (N:r (S l1) (N h)) (S r1))",
            nested_left_then_right,
            interleaved,
        ]

        for lw_str in lw_trees:
            srtree = parse_lightweight_tree(lw_str)
            dtree = RstDepTree.from_simple_rst_tree(srtree)

            # original order: exact round-trip expected
            srtree_same = deptree_to_simple_rst_tree(dtree)
            self.assertEqual(srtree, srtree_same, "round-trip on " + lw_str)

            # reversed order
            dtree_rev = copy.deepcopy(dtree)
            deps_rev = dtree_rev.deps(0)
            deps_rev.reverse()
            srtree_rev = deptree_to_simple_rst_tree(dtree_rev)
            # TODO assertion on srtree_rev?

            # random order
            dtree_rnd = copy.deepcopy(dtree)
            deps_rnd = dtree_rnd.deps(0)
            random.shuffle(deps_rnd)
            srtree_rnd = deptree_to_simple_rst_tree(dtree_rnd)
示例#9
0
def read_deps(corpus,
              section='all',
              nary_enc='chain',
              rew_pseudo_rels=False,
              mrg_same_units=False):
    """Collect dependencies from the corpus.

    Each c-tree of the corpus is converted to a d-tree ; the EDUs and
    dependencies of all d-trees are gathered in two tables.

    Parameters
    ----------
    corpus : dict from str to dict from FileId to RSTTree
        Corpus of RST c-trees indexed by {'train', 'test'} then FileId.
        NB: if `rew_pseudo_rels` or `mrg_same_units` is True, the
        sections of this dict are replaced in place with dicts of
        rewritten trees.
    section : str, one of {'train', 'test', 'all'}
        Section of interest in the RST-DT.
        NOTE(review): this parameter is currently not used in the body
        of the function: all the sections of `corpus` are read.
    nary_enc : str, one of {'tree', 'chain'}
        Encoding of n-ary relations used in the c-to-d conversion.
    rew_pseudo_rels : boolean, defaults to False
        If True, rewrite pseudo relations ; see
        `educe.rst_dt.pseudo_relations`.
    mrg_same_units : boolean, defaults to False
        If True, merge fragmented EDUs ; see
        `educe.rst_dt.pseudo_relations`.

    Returns
    -------
    edu_df : pandas.DataFrame
        Table of EDUs read from the corpus, with columns 'section',
        'doc_name', 'dep_idx', 'char_beg', 'char_end', 'sent_idx',
        'para_idx'.
    dep_df : pandas.DataFrame
        Table of dependencies read from the corpus, with columns
        'doc_name', 'dep_idx', 'hd_idx', 'rel', 'nuc', 'hd_order',
        'len_edu', 'len_edu_abs', 'rel_class', 'pseudo_rel'.
    """
    # experimental: rewrite pseudo-relations
    if rew_pseudo_rels:
        for sec_name, sec_corpus in corpus.items():
            corpus[sec_name] = {
                doc_id: rewrite_pseudo_rels(doc_id, rst_ctree)
                for doc_id, rst_ctree in sec_corpus.items()
            }
    if mrg_same_units:
        for sec_name, sec_corpus in corpus.items():
            corpus[sec_name] = {
                doc_id: merge_same_units(doc_id, rst_ctree)
                for doc_id, rst_ctree in sec_corpus.items()
            }
    # convert to d-trees, collect dependencies
    edus = []
    deps = []
    for sec_name, sec_corpus in corpus.items():
        for doc_id, rst_ctree in sorted(sec_corpus.items()):
            doc_name = doc_id.doc
            doc_text = rst_ctree.text()
            # DIRTY infer (approximate) sentence and paragraph indices
            # from newlines in the text (\n and \n\n)
            sent_idx = 0
            para_idx = 0
            # end DIRTY
            # use the encoding requested by the caller (it used to be
            # hard-coded to 'chain', which silently ignored `nary_enc`)
            rst_dtree = RstDepTree.from_rst_tree(rst_ctree,
                                                 nary_enc=nary_enc)
            # skip index 0 of each array: the fake root
            for dep_idx, (edu, hd_idx, lbl, nuc, hd_order) in enumerate(
                    zip(rst_dtree.edus[1:], rst_dtree.heads[1:],
                        rst_dtree.labels[1:], rst_dtree.nucs[1:],
                        rst_dtree.ranks[1:]),
                    start=1):
                char_beg = edu.span.char_start
                char_end = edu.span.char_end
                edus.append((sec_name, doc_name, dep_idx, char_beg, char_end,
                             sent_idx, para_idx))
                deps.append((doc_name, dep_idx, hd_idx, lbl, nuc, hd_order))
                # DIRTY search for paragraph or sentence breaks in the
                # text of the EDU *plus the next three characters* (yerk)
                edu_txt_plus = doc_text[char_beg:char_end + 3]
                if '\n\n' in edu_txt_plus:
                    para_idx += 1
                    sent_idx += 1  # sometimes wrong ; to be fixed
                elif '\n' in edu_txt_plus:
                    sent_idx += 1
                # end DIRTY
    # turn into DataFrame
    edu_df = pd.DataFrame(edus,
                          columns=[
                              'section', 'doc_name', 'dep_idx', 'char_beg',
                              'char_end', 'sent_idx', 'para_idx'
                          ])
    dep_df = pd.DataFrame(
        deps,
        columns=['doc_name', 'dep_idx', 'hd_idx', 'rel', 'nuc', 'hd_order'])
    # additional columns
    # * attachment length in EDUs
    dep_df['len_edu'] = dep_df['dep_idx'] - dep_df['hd_idx']
    dep_df['len_edu_abs'] = abs(dep_df['len_edu'])
    # * attachment length, in sentences and paragraphs
    if False:
        # TODO rewrite in a pandas-ic manner ; my previous attempts have
        # failed but I think I got pretty close
        # NB: the current implementation is *extremely* slow: 155 seconds
        # on my laptop for the RST-DT, just for this (minor) computation
        len_sent = []
        len_para = []
        for _, row in dep_df[['doc_name', 'dep_idx', 'hd_idx']].iterrows():
            edu_dep = edu_df[(edu_df['doc_name'] == row['doc_name'])
                             & (edu_df['dep_idx'] == row['dep_idx'])]
            if row['hd_idx'] == 0:
                # {sent,para}_idx + 1 for dependents of the fake root
                lsent = edu_dep['sent_idx'].values[0] + 1
                lpara = edu_dep['para_idx'].values[0] + 1
            else:
                edu_hd = edu_df[(edu_df['doc_name'] == row['doc_name'])
                                & (edu_df['dep_idx'] == row['hd_idx'])]
                lsent = (edu_dep['sent_idx'].values[0] -
                         edu_hd['sent_idx'].values[0])
                lpara = (edu_dep['para_idx'].values[0] -
                         edu_hd['para_idx'].values[0])
            len_sent.append(lsent)
            len_para.append(lpara)
        dep_df['len_sent'] = pd.Series(len_sent)
        dep_df['len_sent_abs'] = abs(dep_df['len_sent'])
        dep_df['len_para'] = pd.Series(len_para)
        dep_df['len_para_abs'] = abs(dep_df['len_para'])
    # * class of relation (FIXME we need to handle interaction with
    #   rewrite_pseudo_rels)
    rel_conv = RstRelationConverter(RELMAP_112_18_FILE).convert_label
    dep_df['rel_class'] = dep_df['rel'].apply(rel_conv)
    # * boolean indicator for pseudo-relations ; NB: the 'Style-' prefix
    # can only apply if rew_pseudo_rels (otherwise no occurrence)
    dep_df['pseudo_rel'] = (
        (dep_df['rel'].str.startswith('Style')) |
        (dep_df['rel'].str.endswith('Same-Unit')) |
        (dep_df['rel'].str.endswith('TextualOrganization')))
    return edu_df, dep_df
示例#10
0
                                      doc_name + '.out.xml')
            core_reader = PreprocessingSource()
            core_reader.read(core_fname, suffix='')
            corenlp_doc = read_corenlp_result(None, core_reader)
            core_toks = corenlp_doc.tokens
            core_toks_beg = [x.span.char_start for x in core_toks]
            core_toks_end = [x.span.char_end for x in core_toks]

            # PTB stuff
            # * create DocumentPlus (adapted from educe.rst_dt.corpus)
            rst_context = rst_tree.label().context
            ptb_docp = DocumentPlus(key, doc_name, rst_context)
            # * attach EDUs (yerk)
            # FIXME we currently get them via an RstDepTree created from
            # the original RSTTree, so as to get the left padding EDU
            rst_dtree = RstDepTree.from_rst_tree(rst_tree)
            ptb_docp.edus = rst_dtree.edus
            # * setup a PtbParser (re-yerk)
            ptb_parser = PtbParser(PTB_DIR)
            ptb_parser.tokenize(ptb_docp)
            # get PTB toks ; skip left padding token
            ptb_toks = ptb_docp.tkd_tokens[1:]
            ptb_toks_beg = ptb_docp.toks_beg[1:]
            ptb_toks_end = ptb_docp.toks_end[1:]

            # compare !
            core2ptb_beg = np.searchsorted(ptb_toks_beg, core_toks_beg,
                                           side='left')
            core2ptb_end = np.searchsorted(ptb_toks_end, core_toks_end,
                                           side='right') - 1
            # TODO maybe use np.diff?
示例#11
0
文件: util.py 项目: moreymat/attelo
def get_oracle_ctrees(dep_edges,
                      att_edus,
                      nuc_strategy="unamb_else_most_frequent",
                      rank_strategy="closest-intra-rl-inter-rl",
                      prioritize_same_unit=True,
                      strict=False):
    """Build the oracle constituency tree(s) for a dependency tree.

    Parameters
    ----------
    dep_edges: iterable of (string, string, string)
        Edges of the dependency tree of one document, as triples
        (source id, target id, label) ; edges from the fake root have
        'ROOT' as their source id.
        Cf. type of return value from
        irit-rst-dt/ctree.py:load_attelo_output_file()
    att_edus: cf return type of attelo.io.load_edus
        EDUs as they are known to attelo ; apart from 'ROOT', they must
        all belong to the same document (same `grouping`).
    nuc_strategy: string
        Strategy of the `DummyNuclearityClassifier` used to assign
        nuclearity.
    rank_strategy: string
        Strategy of the `InsideOutAttachmentRanker` used to assign
        ranks.
    prioritize_same_unit: boolean, True by default
        Passed to the `InsideOutAttachmentRanker`.
    strict: boolean, False by default
        If True, any link from ROOT to an EDU whose label is neither
        'ROOT' nor UNKNOWN raises an exception, otherwise a warning is
        printed.

    Returns
    -------
    ctrees: list of RstTree
        There can be several e.g. for leaky sentences.

    Raises
    ------
    ValueError
        If `strict` and an edge from ROOT has an unexpected label.
    RstDtException
        If the dependency tree cannot be converted ; the exception is
        printed then re-raised.
    """
    # rebuild educe EDUs from their attelo description
    # and group them by doc_name
    educe_edus = defaultdict(list)
    edu2sent_idx = defaultdict(dict)
    gid2num = dict()
    for att_edu in att_edus:
        # doc name
        doc_name = att_edu.grouping
        # EDU info
        # skip ROOT (automatically added by RstDepTree.__init__)
        if att_edu.id == 'ROOT':
            continue
        edu_num = int(att_edu.id.rsplit('_', 1)[1])
        edu_span = EduceSpan(att_edu.start, att_edu.end)
        edu_text = att_edu.text
        educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text))
        # map global id of EDU to num of EDU inside doc
        gid2num[att_edu.id] = edu_num
        # map EDU to sentence
        try:
            sent_idx = int(att_edu.subgrouping.split('_sent')[1])
        except IndexError:
            # this EDU could not be attached to any sentence (ex: missing
            # text in the PTB), so a default subgrouping identifier was used ;
            # we aim for consistency with educe and map these to "None"
            sent_idx = None
        edu2sent_idx[doc_name][edu_num] = sent_idx
    # check that our info covers only one document
    assert len(educe_edus) == 1
    # then restrict to this document ; dict views cannot be indexed in
    # python 3, so take the first (and only) key from an iterator
    doc_name = next(iter(educe_edus))
    educe_edus = educe_edus[doc_name]
    edu2sent_idx = edu2sent_idx[doc_name]
    # sort EDUs by num
    educe_edus = sorted(educe_edus, key=lambda e: e.num)
    # rebuild educe-style edu2sent ; prepend 0 for the fake root
    edu2sent = [0] + [edu2sent_idx[e.num] for e in educe_edus]
    # classifiers for nuclearity and ranking
    # FIXME declare, fit and predict upstream...
    # nuclearity
    nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy)
    nuc_classifier.fit([], [])  # empty X and y for dummy fit
    # ranking classifier
    rank_classifier = InsideOutAttachmentRanker(
        strategy=rank_strategy, prioritize_same_unit=prioritize_same_unit)

    # rebuild RstDepTrees
    dtree = RstDepTree(educe_edus)
    for src_id, tgt_id, lbl in dep_edges:
        if src_id == 'ROOT':
            if lbl not in ['ROOT', UNKNOWN]:
                err_msg = 'weird root label: {} {} {}'.format(
                    src_id, tgt_id, lbl)
                if strict:
                    raise ValueError(err_msg)
                else:
                    print('W: {}, using ROOT instead'.format(err_msg))
            dtree.set_root(gid2num[tgt_id])
        else:
            dtree.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl)
    # add nuclearity: heuristic baseline
    dtree.nucs = nuc_classifier.predict([dtree])[0]
    # add rank: some strategies require a mapping from EDU to sentence
    # EXPERIMENTAL attach array of sentence index for each EDU in tree
    dtree.sent_idx = edu2sent
    # end EXPERIMENTAL
    dtree.ranks = rank_classifier.predict([dtree])[0]

    # create pred ctree
    try:
        bin_srtrees = deptree_to_simple_rst_tree(dtree, allow_forest=True)
        if False:  # EXPERIMENTAL
            # currently False to run on output that already has
            # labels embedding nuclearity
            bin_srtrees = [
                SimpleRSTTree.incorporate_nuclearity_into_label(bin_srtree)
                for bin_srtree in bin_srtrees
            ]
        bin_rtrees = [
            SimpleRSTTree.to_binary_rst_tree(bin_srtree)
            for bin_srtree in bin_srtrees
        ]
    except RstDtException as rst_e:
        print(rst_e)
        if False:
            # debug output ; educe_edus is already restricted to the
            # EDUs of the document at this point
            print('\n'.join('{}: {}'.format(edu.text_span(), edu)
                            for edu in educe_edus))
        raise
    ctrees = bin_rtrees

    return ctrees
示例#12
0
文件: util.py 项目: eipiplusun/attelo
def get_oracle_ctrees(dep_edges, att_edus,
                      nuc_strategy="unamb_else_most_frequent",
                      rank_strategy="closest-intra-rl-inter-rl",
                      prioritize_same_unit=True,
                      strict=False):
    """Build the oracle constituency tree(s) for a dependency tree.

    Parameters
    ----------
    dep_edges: iterable of (string, string, string)
        Edges of the dependency tree of one document, as triples
        (source id, target id, label) ; edges from the fake root have
        'ROOT' as their source id.
        Cf. type of return value from
        irit-rst-dt/ctree.py:load_attelo_output_file()
    att_edus: cf return type of attelo.io.load_edus
        EDUs as they are known to attelo ; apart from 'ROOT', they must
        all belong to the same document (same `grouping`).
    nuc_strategy: string
        Strategy of the `DummyNuclearityClassifier` used to assign
        nuclearity.
    rank_strategy: string
        Strategy of the `InsideOutAttachmentRanker` used to assign
        ranks.
    prioritize_same_unit: boolean, True by default
        Passed to the `InsideOutAttachmentRanker`.
    strict: boolean, False by default
        If True, any link from ROOT to an EDU whose label is neither
        'ROOT' nor UNKNOWN raises an exception, otherwise a warning is
        printed.

    Returns
    -------
    ctrees: list of RstTree
        There can be several e.g. for leaky sentences.

    Raises
    ------
    ValueError
        If `strict` and an edge from ROOT has an unexpected label.
    RstDtException
        If the dependency tree cannot be converted ; the exception is
        printed then re-raised.
    """
    # rebuild educe EDUs from their attelo description
    # and group them by doc_name
    educe_edus = defaultdict(list)
    edu2sent_idx = defaultdict(dict)
    gid2num = dict()
    for att_edu in att_edus:
        # doc name
        doc_name = att_edu.grouping
        # EDU info
        # skip ROOT (automatically added by RstDepTree.__init__)
        if att_edu.id == 'ROOT':
            continue
        edu_num = int(att_edu.id.rsplit('_', 1)[1])
        edu_span = EduceSpan(att_edu.start, att_edu.end)
        edu_text = att_edu.text
        educe_edus[doc_name].append(EduceEDU(edu_num, edu_span, edu_text))
        # map global id of EDU to num of EDU inside doc
        gid2num[att_edu.id] = edu_num
        # map EDU to sentence
        try:
            sent_idx = int(att_edu.subgrouping.split('_sent')[1])
        except IndexError:
            # this EDU could not be attached to any sentence (ex: missing
            # text in the PTB), so a default subgrouping identifier was used ;
            # we aim for consistency with educe and map these to "None"
            sent_idx = None
        edu2sent_idx[doc_name][edu_num] = sent_idx
    # check that our info covers only one document
    assert len(educe_edus) == 1
    # then restrict to this document ; dict views cannot be indexed in
    # python 3, so take the first (and only) key from an iterator
    doc_name = next(iter(educe_edus))
    educe_edus = educe_edus[doc_name]
    edu2sent_idx = edu2sent_idx[doc_name]
    # sort EDUs by num
    educe_edus = sorted(educe_edus, key=lambda e: e.num)
    # rebuild educe-style edu2sent ; prepend 0 for the fake root
    edu2sent = [0] + [edu2sent_idx[e.num] for e in educe_edus]
    # classifiers for nuclearity and ranking
    # FIXME declare, fit and predict upstream...
    # nuclearity
    nuc_classifier = DummyNuclearityClassifier(strategy=nuc_strategy)
    nuc_classifier.fit([], [])  # empty X and y for dummy fit
    # ranking classifier
    rank_classifier = InsideOutAttachmentRanker(
        strategy=rank_strategy,
        prioritize_same_unit=prioritize_same_unit)

    # rebuild RstDepTrees
    dtree = RstDepTree(educe_edus)
    for src_id, tgt_id, lbl in dep_edges:
        if src_id == 'ROOT':
            if lbl not in ['ROOT', UNKNOWN]:
                err_msg = 'weird root label: {} {} {}'.format(
                    src_id, tgt_id, lbl)
                if strict:
                    raise ValueError(err_msg)
                else:
                    print('W: {}, using ROOT instead'.format(err_msg))
            dtree.set_root(gid2num[tgt_id])
        else:
            dtree.add_dependency(gid2num[src_id], gid2num[tgt_id], lbl)
    # add nuclearity: heuristic baseline
    dtree.nucs = nuc_classifier.predict([dtree])[0]
    # add rank: some strategies require a mapping from EDU to sentence
    # EXPERIMENTAL attach array of sentence index for each EDU in tree
    dtree.sent_idx = edu2sent
    # end EXPERIMENTAL
    dtree.ranks = rank_classifier.predict([dtree])[0]

    # create pred ctree
    try:
        bin_srtrees = deptree_to_simple_rst_tree(dtree, allow_forest=True)
        if False:  # EXPERIMENTAL
            # currently False to run on output that already has
            # labels embedding nuclearity
            bin_srtrees = [SimpleRSTTree.incorporate_nuclearity_into_label(
                bin_srtree) for bin_srtree in bin_srtrees]
        bin_rtrees = [SimpleRSTTree.to_binary_rst_tree(bin_srtree)
                      for bin_srtree in bin_srtrees]
    except RstDtException as rst_e:
        print(rst_e)
        if False:
            # debug output ; educe_edus is already restricted to the
            # EDUs of the document at this point
            print('\n'.join('{}: {}'.format(edu.text_span(), edu)
                            for edu in educe_edus))
        raise
    ctrees = bin_rtrees

    return ctrees
示例#13
0
        'corpus': os.path.relpath(rst_corpus_dir, start=DATA_DIR),
        'strip_accents': strip_accents,
        'lowercase': lowercase,
        'stop_words': stop_words,
        'n_jobs': n_jobs,
        'verbose': verbose,
    }
    print('# parameters: ({})'.format(params),
          file=outfile)

    # do the real job
    corpus_items = sorted(rst_corpus.items())
    doc_keys = [key.doc for key, doc in corpus_items]
    doc_key_dtrees = [
        (doc_key.doc,
         RstDepTree.from_simple_rst_tree(SimpleRSTTree.from_rst_tree(doc)))
        for doc_key, doc in corpus_items
    ]
    edu_txts = list(e.text().replace('\n', ' ')
                    for doc_key, dtree in doc_key_dtrees
                    for e in dtree.edus)
    # vectorize each EDU using its text
    edu_vecs = vect.transform(edu_txts)
    # normalize each row of the count matrix using the l1 norm
    # (copy=False to perform in place)
    edu_vecs = normalize(edu_vecs, norm='l1', copy=False)
    # get all pairs of EDUs of interest, here as triples
    # (gov_idx, dep_idx, lbl)
    # TODO maybe sort edu pairs so that dependents with
    # the same governor are grouped (potential speed up?)
    edu_pairs = [
示例#14
0
        'corpus': os.path.relpath(rst_corpus_dir, start=DATA_DIR),
        'strip_accents': strip_accents,
        'lowercase': lowercase,
        'stop_words': stop_words,
        'n_jobs': n_jobs,
        'verbose': verbose,
    }
    print('# parameters: ({})'.format(params),
          file=outfile)

    # do the real job
    corpus_items = sorted(rst_corpus.items())
    doc_keys = [key.doc for key, doc in corpus_items]
    doc_key_dtrees = [
        (doc_key.doc,
         RstDepTree.from_simple_rst_tree(SimpleRSTTree.from_rst_tree(doc)))
        for doc_key, doc in corpus_items
    ]
    edu_txts = list(e.text().replace('\n', ' ')
                    for doc_key, dtree in doc_key_dtrees
                    for e in dtree.edus)
    # vectorize each EDU using its text
    edu_vecs = vect.transform(edu_txts)
    # normalize each row of the count matrix using the l1 norm
    # (copy=False to perform in place)
    edu_vecs = normalize(edu_vecs, norm='l1', copy=False)
    # get all pairs of EDUs of interest, here as triples
    # (gov_idx, dep_idx, lbl)
    # TODO maybe sort edu pairs so that dependents with
    # the same governor are grouped (potential speed up?)
    edu_pairs = [
示例#15
0
                                      doc_name + '.out.xml')
            core_reader = PreprocessingSource()
            core_reader.read(core_fname, suffix='')
            corenlp_doc = read_corenlp_result(None, core_reader)
            core_toks = corenlp_doc.tokens
            core_toks_beg = [x.span.char_start for x in core_toks]
            core_toks_end = [x.span.char_end for x in core_toks]

            # PTB stuff
            # * create DocumentPlus (adapted from educe.rst_dt.corpus)
            rst_context = rst_tree.label().context
            ptb_docp = DocumentPlus(key, doc_name, rst_context)
            # * attach EDUs (yerk)
            # FIXME we currently get them via an RstDepTree created from
            # the original RSTTree, so as to get the left padding EDU
            rst_dtree = RstDepTree.from_rst_tree(rst_tree)
            ptb_docp.edus = rst_dtree.edus
            # * setup a PtbParser (re-yerk)
            ptb_parser = PtbParser(PTB_DIR)
            ptb_parser.tokenize(ptb_docp)
            # get PTB toks ; skip left padding token
            ptb_toks = ptb_docp.tkd_tokens[1:]
            ptb_toks_beg = ptb_docp.toks_beg[1:]
            ptb_toks_end = ptb_docp.toks_end[1:]

            # compare !
            core2ptb_beg = np.searchsorted(ptb_toks_beg,
                                           core_toks_beg,
                                           side='left')
            core2ptb_end = np.searchsorted(
                ptb_toks_end, core_toks_end, side='right') - 1
示例#16
0
def read_deps(corpus, section='all', nary_enc='chain',
              rew_pseudo_rels=False, mrg_same_units=False):
    """Collect dependencies from the corpus.

    Each c-tree of the corpus is converted to a d-tree ; the EDUs and
    dependencies of all d-trees are gathered in two tables.

    Parameters
    ----------
    corpus : dict from str to dict from FileId to RSTTree
        Corpus of RST c-trees indexed by {'train', 'test'} then FileId.
        NB: if `rew_pseudo_rels` or `mrg_same_units` is True, the
        sections of this dict are replaced in place with dicts of
        rewritten trees.
    section : str, one of {'train', 'test', 'all'}
        Section of interest in the RST-DT.
        NOTE(review): this parameter is currently not used in the body
        of the function: all the sections of `corpus` are read.
    nary_enc : str, one of {'tree', 'chain'}
        Encoding of n-ary relations used in the c-to-d conversion.
    rew_pseudo_rels : boolean, defaults to False
        If True, rewrite pseudo relations ; see
        `educe.rst_dt.pseudo_relations`.
    mrg_same_units : boolean, defaults to False
        If True, merge fragmented EDUs ; see
        `educe.rst_dt.pseudo_relations`.

    Returns
    -------
    edu_df : pandas.DataFrame
        Table of EDUs read from the corpus, with columns 'section',
        'doc_name', 'dep_idx', 'char_beg', 'char_end', 'sent_idx',
        'para_idx'.
    dep_df : pandas.DataFrame
        Table of dependencies read from the corpus, with columns
        'doc_name', 'dep_idx', 'hd_idx', 'rel', 'nuc', 'hd_order',
        'len_edu', 'len_edu_abs', 'rel_class', 'pseudo_rel'.
    """
    # experimental: rewrite pseudo-relations
    if rew_pseudo_rels:
        for sec_name, sec_corpus in corpus.items():
            corpus[sec_name] = {
                doc_id: rewrite_pseudo_rels(doc_id, rst_ctree)
                for doc_id, rst_ctree in sec_corpus.items()
            }
    if mrg_same_units:
        for sec_name, sec_corpus in corpus.items():
            corpus[sec_name] = {
                doc_id: merge_same_units(doc_id, rst_ctree)
                for doc_id, rst_ctree in sec_corpus.items()
            }
    # convert to d-trees, collect dependencies
    edus = []
    deps = []
    for sec_name, sec_corpus in corpus.items():
        for doc_id, rst_ctree in sorted(sec_corpus.items()):
            doc_name = doc_id.doc
            doc_text = rst_ctree.text()
            # DIRTY infer (approximate) sentence and paragraph indices
            # from newlines in the text (\n and \n\n)
            sent_idx = 0
            para_idx = 0
            # end DIRTY
            # use the encoding requested by the caller (it used to be
            # hard-coded to 'chain', which silently ignored `nary_enc`)
            rst_dtree = RstDepTree.from_rst_tree(rst_ctree,
                                                 nary_enc=nary_enc)
            # skip index 0 of each array: the fake root
            for dep_idx, (edu, hd_idx, lbl, nuc, hd_order) in enumerate(
                    zip(rst_dtree.edus[1:],
                        rst_dtree.heads[1:], rst_dtree.labels[1:],
                        rst_dtree.nucs[1:], rst_dtree.ranks[1:]),
                    start=1):
                char_beg = edu.span.char_start
                char_end = edu.span.char_end
                edus.append(
                    (sec_name, doc_name,
                     dep_idx, char_beg, char_end, sent_idx, para_idx)
                )
                deps.append(
                    (doc_name,
                     dep_idx, hd_idx, lbl, nuc, hd_order)
                )
                # DIRTY search for paragraph or sentence breaks in the
                # text of the EDU *plus the next three characters* (yerk)
                edu_txt_plus = doc_text[char_beg:char_end + 3]
                if '\n\n' in edu_txt_plus:
                    para_idx += 1
                    sent_idx += 1  # sometimes wrong ; to be fixed
                elif '\n' in edu_txt_plus:
                    sent_idx += 1
                # end DIRTY
    # turn into DataFrame
    edu_df = pd.DataFrame(edus, columns=[
        'section', 'doc_name', 'dep_idx', 'char_beg', 'char_end',
        'sent_idx', 'para_idx']
    )
    dep_df = pd.DataFrame(deps, columns=[
        'doc_name', 'dep_idx',
        'hd_idx', 'rel', 'nuc', 'hd_order']
    )
    # additional columns
    # * attachment length in EDUs
    dep_df['len_edu'] = dep_df['dep_idx'] - dep_df['hd_idx']
    dep_df['len_edu_abs'] = abs(dep_df['len_edu'])
    # * attachment length, in sentences and paragraphs
    if False:
        # TODO rewrite in a pandas-ic manner ; my previous attempts have
        # failed but I think I got pretty close
        # NB: the current implementation is *extremely* slow: 155 seconds
        # on my laptop for the RST-DT, just for this (minor) computation
        len_sent = []
        len_para = []
        for _, row in dep_df[['doc_name', 'dep_idx', 'hd_idx']].iterrows():
            edu_dep = edu_df[
                (edu_df['doc_name'] == row['doc_name']) &
                (edu_df['dep_idx'] == row['dep_idx'])
            ]
            if row['hd_idx'] == 0:
                # {sent,para}_idx + 1 for dependents of the fake root
                lsent = edu_dep['sent_idx'].values[0] + 1
                lpara = edu_dep['para_idx'].values[0] + 1
            else:
                edu_hd = edu_df[
                    (edu_df['doc_name'] == row['doc_name']) &
                    (edu_df['dep_idx'] == row['hd_idx'])
                ]
                lsent = (edu_dep['sent_idx'].values[0] -
                         edu_hd['sent_idx'].values[0])
                lpara = (edu_dep['para_idx'].values[0] -
                         edu_hd['para_idx'].values[0])
            len_sent.append(lsent)
            len_para.append(lpara)
        dep_df['len_sent'] = pd.Series(len_sent)
        dep_df['len_sent_abs'] = abs(dep_df['len_sent'])
        dep_df['len_para'] = pd.Series(len_para)
        dep_df['len_para_abs'] = abs(dep_df['len_para'])
    # * class of relation (FIXME we need to handle interaction with
    #   rewrite_pseudo_rels)
    rel_conv = RstRelationConverter(RELMAP_112_18_FILE).convert_label
    dep_df['rel_class'] = dep_df['rel'].apply(rel_conv)
    # * boolean indicator for pseudo-relations ; NB: the 'Style-' prefix
    # can only apply if rew_pseudo_rels (otherwise no occurrence)
    dep_df['pseudo_rel'] = (
        (dep_df['rel'].str.startswith('Style')) |
        (dep_df['rel'].str.endswith('Same-Unit')) |
        (dep_df['rel'].str.endswith('TextualOrganization'))
    )
    return edu_df, dep_df