Example #1
    def parse(self, doc):
        """Parse
        """
        corenlp_out_name = _guess_corenlp_name(doc.key)
        if corenlp_out_name is None:
            return doc

        fname = os.path.join(self.corenlp_out_dir,
                             corenlp_out_name)
        if not os.path.exists(fname):
            raise ValueError('CoreNLP XML: no file {}'.format(fname))
        # CoreNLP XML output reader
        # FIXME the same file is read in tokenize(); we should find
        # a way to cache or share the result
        reader = PreprocessingSource()
        reader.read(fname, suffix='')
        corenlp_out = read_corenlp_result(doc, reader)

        # constituency trees and lexical heads for their nodes
        ctrees = corenlp_out.trees
        # strip function tags
        # TODO maybe this should be an internal preprocessing step in
        # find_lexical_heads(), so as to preserve the function tags that
        # the CoreNLP parser keeps by default because they were found
        # useful, e.g. `-retainTMPSubcategories`
        ctrees_no_gf = [transform_tree(ctree, strip_subcategory)
                        for ctree in ctrees]
        lex_heads = [find_lexical_heads(ctree_no_gf)
                     for ctree_no_gf in ctrees_no_gf]

        # store trees in doc
        doc.set_syn_ctrees(ctrees_no_gf, lex_heads=lex_heads)

        return doc
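
The guard at the top relies on `_guess_corenlp_name` to map a document key to the expected CoreNLP output file name, returning `None` (and leaving `doc` untouched) when the document has no CoreNLP output. A minimal self-contained sketch of that lookup pattern; the key format and the `.xml` naming scheme here are assumptions for illustration, not necessarily educe's:

def guess_corenlp_name(doc_key):
    # Hypothetical mapping: assume RST DT style keys like 'wsj_0614.out'
    # and CoreNLP XML output named '<input file>.xml'.
    if not doc_key.startswith('wsj_'):
        return None  # no associated CoreNLP output for this document
    return doc_key + '.xml'

print(guess_corenlp_name('wsj_0614.out'))  # -> 'wsj_0614.out.xml'
print(guess_corenlp_name('paddington'))    # -> None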
Example #2
def parse_doc_ptb(doc_id, doc_tkd_toks):
    """Dirty PTB parser"""
    # get PTB trees
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # use tweaked tokens
    tokens_iter = iter(doc_tkd_toks)

    trees = []
    lex_heads = []
    for tree in PTB_READER.parsed_sents(ptb_name):
        # apply standard cleaning to tree
        # strip function tags, remove empty nodes
        tree_no_empty = prune_tree(tree, is_non_empty)
        tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                             strip_subcategory)
        # align the cleaned tree's leaves with the next document tokens
        leaves = tree_no_empty_no_gf.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        clean_tree = ConstituencyTree.build(tree_no_empty_no_gf,
                                            tslice)
        trees.append(clean_tree)

        # lexicalize the PTB tree: find the head word of each constituent.
        # Constituents and their heads are designated by their Gorn address
        # ("tree position" in NLTK) in the tree.
        lheads = find_lexical_heads(clean_tree)
        lex_heads.append(lheads)
    return trees  # , lex_heads
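
The alignment hinges on a single shared `tokens_iter`: each `itertools.islice` call consumes exactly as many tokens as the current tree has leaves, so successive trees pick up where the previous one stopped. A toy, self-contained illustration of the pattern (plain strings instead of educe token objects):

import itertools

tokens_iter = iter(['The', 'cat', 'sat', '.', 'It', 'purred', '.'])
leaf_counts = [4, 3]  # leaves in each successive parse tree

for n_leaves in leaf_counts:
    # consume exactly n_leaves tokens from the shared iterator
    print(list(itertools.islice(tokens_iter, n_leaves)))
# ['The', 'cat', 'sat', '.']
# ['It', 'purred', '.']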
Example #3
    def parse(self, doc):
        """Parse a document, using the gold PTB annotation.

        Given a document, build a list of educified PTB parse trees
        (one per sentence) and attach them to the document.

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.

        Parameters
        ----------
        doc : DocumentPlus
            Rich representation of the document.

        Returns
        -------
        doc : DocumentPlus
            Rich representation of the document, with syntactic
            constituency trees.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            # align the cleaned tree's leaves with the next document tokens
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent.
            # Constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree.
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.set_syn_ctrees(trees, lex_heads=lex_heads)

        return doc
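
The two cleaning passes, `prune_tree(tree, is_non_empty)` and `transform_tree(tree, strip_subcategory)`, amount to standard PTB normalization: drop empty elements ('-NONE-' traces, plus any node left childless), then strip function tags from the labels. A self-contained sketch of the same normalization over plain NLTK trees; the helper names and the exact label-splitting rule are my own, not educe's:

from nltk.tree import Tree

def strip_function_tags(label):
    # 'NP-SBJ-1' -> 'NP'; leave labels like '-NONE-' intact
    return label if label.startswith('-') else label.split('-')[0].split('=')[0]

def clean_ptb_tree(tree):
    # Recursively drop '-NONE-' subtrees and nodes emptied by pruning,
    # then strip function tags from the surviving labels.
    if not isinstance(tree, Tree):
        return tree  # a leaf token
    kept = []
    for child in tree:
        if isinstance(child, Tree) and child.label() == '-NONE-':
            continue  # empty element (trace)
        cleaned = clean_ptb_tree(child)
        if isinstance(cleaned, Tree) and len(cleaned) == 0:
            continue  # node emptied by pruning
        kept.append(cleaned)
    return Tree(strip_function_tags(tree.label()), kept)

t = Tree.fromstring("(S (NP-SBJ (-NONE- *T*-1)) (VP (VBD sat)) (. .))")
print(clean_ptb_tree(t))  # (S (VP (VBD sat)) (. .))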
Example #4
File: ptb.py Project: fbuijs/educe
    def parse(self, doc):
        """
        Given a document, return a list of educified PTB parse trees
        (one per sentence).

        These are almost the same as the trees that would be returned by the
        `parsed_sents` method, except that each leaf/node is
        associated with a span within the RST DT text.

        Note: does nothing if there is no associated PTB corpus entry.
        """
        # get PTB trees
        ptb_name = _guess_ptb_name(doc.key)
        if ptb_name is None:
            return doc

        # get tokens from tokenized document
        # FIXME alignment/reconstruction should never have to deal
        # with the left padding token in the first place
        doc_tokens = doc.tkd_tokens[1:]  # skip left padding token
        tokens_iter = iter(doc_tokens)

        trees = []
        lex_heads = []
        for tree in self.reader.parsed_sents(ptb_name):
            # apply standard cleaning to tree
            # strip function tags, remove empty nodes
            tree_no_empty = prune_tree(tree, is_non_empty)
            tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                                 strip_subcategory)
            # align the cleaned tree's leaves with the next document tokens
            leaves = tree_no_empty_no_gf.leaves()
            tslice = itertools.islice(tokens_iter, len(leaves))
            clean_tree = ConstituencyTree.build(tree_no_empty_no_gf,
                                                tslice)
            trees.append(clean_tree)

            # lexicalize the PTB tree: find the head word of each constituent.
            # Constituents and their heads are designated by their Gorn address
            # ("tree position" in NLTK) in the tree.
            lheads = find_lexical_heads(clean_tree)
            lex_heads.append(lheads)

        # store trees in doc
        doc.tkd_trees.extend(trees)
        # store lexical heads in doc
        # TODO move to DocumentPlus
        doc.lex_heads = [None]  # slot 0 mirrors the left padding token
        # end TODO
        doc.lex_heads.extend(lex_heads)

        return doc
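
Unlike Example #3, which stores everything via `doc.set_syn_ctrees`, this variant extends `doc.tkd_trees` directly and prepends `None` to `doc.lex_heads`, keeping slot 0 reserved for the left padding token (the same convention the FIXME above mentions for `tkd_tokens`). A toy illustration of why the padding slot keeps the indices aligned; the plain lists are stand-ins for the educe attributes:

tkd_tokens = [None, 'The', 'cat', 'sat', '.']  # slot 0: left padding token
trees = ['tree_1']                             # stand-in ConstituencyTree list
lex_heads = ['heads_1']

tkd_trees = [None]       # mirror the padding convention for trees
tkd_trees.extend(trees)
doc_lex_heads = [None]   # ... and for lexical heads
doc_lex_heads.extend(lex_heads)

# sentence i lives at index i in every list (index 0 is padding)
assert tkd_trees[1] == 'tree_1' and doc_lex_heads[1] == 'heads_1'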