Example #1
File: query.py Project: haukurb/Reynir
    def _parse(toklist):
        """ Parse a token list as a query """

        # Parse with the nonterminal 'QueryRoot' as the grammar root
        with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:

            sent_begin = 0
            num_sent = 0
            num_parsed_sent = 0
            rdc = Reducer(bp.grammar)
            trees = dict()
            sent = []

            for ix, t in enumerate(toklist):
                if t[0] == TOK.S_BEGIN:
                    sent = []
                    sent_begin = ix
                elif t[0] == TOK.S_END:
                    slen = len(sent)
                    if not slen:
                        continue
                    num_sent += 1
                    # Parse the accumulated sentence
                    num = 0
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest is not None:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                    except ParseError:
                        forest = None
                        # Reset the combination count as well, so a failure
                        # during reduction doesn't leave a stale num > 0
                        num = 0
                    if num > 0:
                        num_parsed_sent += 1
                        # Obtain a text representation of the parse tree
                        trees[num_sent] = ParseForestDumper.dump_forest(forest)
                        # ParseForestPrinter.print_forest(forest)

                elif t[0] == TOK.P_BEGIN:
                    pass
                elif t[0] == TOK.P_END:
                    pass
                else:
                    sent.append(t)

        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees
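A minimal driver sketch for the _parse helper above. The tokenize import and the sample query are assumptions for illustration; in Reynir, tokenization is provided by the tokenizer package:

# Hypothetical usage; tokenize() and the query string are assumptions
from tokenizer import tokenize

toklist = list(tokenize("Hver er forseti Íslands?"))
result, trees = _parse(toklist)
# result summarizes how many sentences were seen and how many parsed
print(result["num_sent"], result["num_parsed_sent"])
# trees maps 1-based sentence indices to dumped parse-tree text
for ix, tree_text in trees.items():
    print("Sentence", ix)
    print(tree_text)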
Example #2
    def make_tree(text: str) -> Tree:
        toklist = tokenize(text)
        fp = Fast_Parser(verbose=False)
        ip = IncrementalParser(fp, toklist, verbose=False)
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        num_sent = 0
        for p in ip.paragraphs():
            for sent in p.sentences():
                num_sent += 1
                num_tokens = len(sent)
                assert sent.parse(), "Sentence does not parse: " + sent.text
                # Obtain a text representation of the parse tree
                token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
                # Create a verbose text representation of
                # the highest scoring parse tree
                tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
                # Add information about the sentence tree's score
                # and the number of tokens
                trees[num_sent] = "\n".join(
                    ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
                )
        # Create a tree representation string out of
        # all the accumulated parse trees
        tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())

        tree = Tree()
        tree.load(tree_string)
        return tree
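A short usage sketch for make_tree(); the sample sentence is arbitrary, and the processor call is indicated only as a comment, mirroring the entity test further down this page:

# Hypothetical usage of make_tree() above
tree = make_tree("Bygma er dönsk byggingavörukeðja.")
# The resulting Tree can then be run through a processor module,
# e.g. tree.process(session, entities) as in the entity test below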
Example #3
    def parse_text_with_full_tree(session, text, all_names=False):
        """ Parse plain text, assumed to contain one sentence only, and
            return its simplified form as well as its full form. """

        full_tree = None

        def xform(tokens, tree, err_index):
            """ Transformation function that yields a simplified parse tree
                with POS-tagged, normalized terminal leaves for the sentence """
            if err_index is not None:
                return TreeUtility.dump_tokens(tokens,
                                               tree,
                                               error_index=err_index)
            # Successfully parsed: return a simplified tree for the sentence
            nonlocal full_tree
            # We are assuming that there is only one parsed sentence
            if full_tree is None:
                # Note the full tree of the first parsed paragraph
                full_tree = tree
            return TreeUtility._simplify_tree(tokens, tree)

        with Fast_Parser(verbose=False) as parser:
            pgs, stats, _ = TreeUtility._process_text(parser, session, text,
                                                      all_names, xform)

        if (not pgs or stats["num_parsed"] == 0 or not pgs[0]
                or any("err" in t for t in pgs[0][0])):
            # The first sentence didn't parse: let's not beat around the bush with that fact
            return (None, None, stats)

        # Return the simplified tree, full tree and stats
        assert full_tree is not None
        return (pgs[0][0], full_tree, stats)
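A usage sketch for parse_text_with_full_tree(). The SessionContext name is an assumed database session factory, and the helper is assumed to be a static method on TreeUtility, as its internal calls suggest:

# Hypothetical usage; SessionContext is an assumption
with SessionContext(read_only=True) as session:
    simplified, full, stats = TreeUtility.parse_text_with_full_tree(
        session, "Hundurinn eltir köttinn."
    )
if simplified is None:
    print("The sentence did not parse")
else:
    print("Parsed OK;", stats["num_parsed"], "sentence(s)")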
Example #4
 def tag_text(
     session: Session, text: str, all_names: bool = False
 ) -> Tuple[PgsList, StatsDict, Optional["RegisterType"]]:
     """ Parse plain text and return the parsed paragraphs as lists of sentences
         where each sentence is a list of tagged tokens """
     # Don't emit diagnostic messages
     with Fast_Parser(verbose=False) as parser:
         return TreeUtility.raw_tag_text(parser, session, text, all_names=all_names)
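A usage sketch for tag_text(), assumed (like raw_tag_text, which it wraps) to live on TreeUtility; the session factory and sample text are illustrative:

# Hypothetical usage of tag_text() above
with SessionContext(read_only=True) as session:
    pgs, stats, register = TreeUtility.tag_text(session, "Þetta er prófun.")
for paragraph in pgs:
    for sentence in paragraph:
        # Each tagged token is a dict; "x" holds the token text
        print(" ".join(t.get("x", "") for t in sentence))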
Example #5
 def tag_text(session, text, all_names=False):
     """ Parse plain text and return the parsed paragraphs as lists of sentences
         where each sentence is a list of tagged tokens """
     # Don't emit diagnostic messages
     with Fast_Parser(verbose=False) as parser:
         return TreeUtility.raw_tag_text(parser,
                                         session,
                                         text,
                                         all_names=all_names)
Example #6
    def parse_text_to_bracket_form(session: Session, text: str):
        """ Parse plain text and return the parsed paragraphs as bracketed strings """

        def xform(
            tokens: List[Tok], tree: Optional[Node], err_index: Optional[int]
        ) -> str:
            """ Transformation function that yields a simplified parse tree
                with POS-tagged, normalized terminal leaves for the sentence """
            if err_index is not None:
                # Return an empty string for sentences that don't parse
                return ""
            # Successfully parsed: obtain a simplified tree for the sentence
            result = []

            def push(node: Optional[CanonicalTokenDict]) -> None:
                """ Append information about a node to the result list """
                if node is None:
                    return
                nonlocal result
                if node.get("k") == "NONTERMINAL":
                    node = cast(SimpleTreeNode, node)
                    result.append("(" + node.get("i", ""))
                    # Recursively add the children of this nonterminal
                    for child in node.get("p", []):
                        result.append(" ")
                        push(child)
                    result.append(")")
                elif node.get("k") == "PUNCTUATION":
                    pass
                    # Include punctuation?
                    # If so, do something like:
                    # result.append("(PUNCT |" + node["x"] + "|)")
                else:
                    # Terminal: append the text
                    result.append(node.get("x", "").replace(" ", "_"))

            # This uses a custom simplification scheme
            simple_tree = TreeUtility._simplify_tree(
                tokens,
                tree,
                nt_map=_TEST_NT_MAP,
                id_map=_TEST_ID_MAP,
                terminal_map=_TEST_TERMINAL_MAP,
            )
            push(simple_tree)
            return "".join(result)

        with Fast_Parser(verbose=False) as parser:
            # The cast(XformFunc, xform) type annotation is a hack
            pgs, stats, _ = TreeUtility._process_text(
                parser, session, text, all_names=False, xform=cast(XformFunc, xform)
            )
        # pgs is a list of paragraphs, each being a list of sentences
        # To access the first parsed sentence, use pgs[0][0]
        return (pgs, stats)
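A usage sketch for parse_text_to_bracket_form(). The bracket labels in the comment are only indicative, since the output depends on the custom _TEST_* maps above:

# Hypothetical usage; SessionContext is an assumption
with SessionContext(read_only=True) as session:
    pgs, stats = TreeUtility.parse_text_to_bracket_form(
        session, "Konan sá hundinn."
    )
# A parsed sentence yields something along the lines of
# "(S ... (NP ...) (VP ...))"; an unparsed sentence yields ""
print(pgs[0][0])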
Example #7
    def raw_tag_toklist(session, toklist, root=None):
        """ Parse plain text and return the parsed paragraphs as lists of sentences
            where each sentence is a list of tagged tokens. The result does not
            include a name register. """
        def xform(tokens, tree, err_index):
            """ Transformation function that simply returns a list of POS-tagged,
                normalized tokens for the sentence """
            return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

        with Fast_Parser(verbose=False, root=root) as parser:
            return TreeUtility._process_toklist(parser, session, toklist,
                                                xform)
Example #8
File: query.py Project: sultur/Greynir
    def _parse(toklist: Iterable[Tok]) -> Tuple[ResponseDict, Dict[int, str]]:
        """ Parse a token list as a query """
        bp = Query._parser
        assert bp is not None
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees: Dict[int, str] = dict()
        sent: List[Tok] = []

        for t in toklist:
            if t[0] == TOK.S_BEGIN:
                if num_sent > 0:
                    # A second sentence is beginning: this is not valid for a query
                    raise ParseError(
                        "A query cannot contain more than one sentence")
                sent = []
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                # Parse the accumulated sentence
                num = 0
                try:
                    # Parse the sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError:
                    forest = None
                    num = 0
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    assert forest is not None
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)

            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

        result: ResponseDict = dict(num_sent=num_sent,
                                    num_parsed_sent=num_parsed_sent)
        return result, trees
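This variant enforces the one-sentence-per-query constraint by raising ParseError. A usage sketch, assuming _parse is a static method on Query and that Query._parser has been initialized:

# Hypothetical usage of the typed _parse variant above
from tokenizer import tokenize

try:
    result, trees = Query._parse(list(tokenize("Hvað er klukkan?")))
    print(result["num_sent"], "sentence(s),", result["num_parsed_sent"], "parsed")
except ParseError as e:
    # Raised, for instance, when the input contains more than one sentence
    print("Invalid query:", e)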
Example #9
    def parse_text(session, text, all_names=False):
        """ Parse plain text and return the parsed paragraphs as simplified trees """
        def xform(tokens, tree, err_index):
            """ Transformation function that yields a simplified parse tree
                with POS-tagged, normalized terminal leaves for the sentence """
            if err_index is not None:
                return TreeUtility.dump_tokens(tokens, tree, None, err_index)
            # Successfully parsed: return a simplified tree for the sentence
            return TreeUtility._simplify_tree(tokens, tree)

        # Don't emit diagnostic messages
        with Fast_Parser(verbose=False) as parser:
            return TreeUtility._process_text(parser, session, text, all_names, xform)
Example #10
File: treeutil.py Project: Loknar/Greynir
    def tag_toklist(session, toklist, all_names=False):
        """ Parse plain text and return the parsed paragraphs as lists of sentences
            where each sentence is a list of tagged tokens """
        def xform(tokens, tree, err_index):
            """ Transformation function that simply returns a list of POS-tagged,
                normalized tokens for the sentence """
            return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

        # Don't emit diagnostic messages
        with Fast_Parser(verbose=False) as parser:
            pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
        from queries.builtin import create_name_register
        register = create_name_register(toklist, session, all_names=all_names)
        return pgs, stats, register
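A usage sketch for tag_toklist(); in addition to the tagged paragraphs and statistics it returns the name register built by create_name_register:

# Hypothetical usage of tag_toklist() above
from tokenizer import tokenize

toklist = list(tokenize("Guðrún Helgadóttir skrifaði bókina."))
with SessionContext(read_only=True) as session:
    pgs, stats, register = TreeUtility.tag_toklist(session, toklist)
# register maps person/entity names found in the text to their titles
print(register)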
Example #11
    def raw_tag_toklist(
        toklist: Iterable[Tok], root: Optional[str] = None
    ) -> Tuple[PgsList, StatsDict]:
        """ Parse plain text and return the parsed paragraphs as lists of sentences
            where each sentence is a list of tagged tokens. The result does not
            include a name register. """

        def xform(
            tokens: List[Tok], tree: Optional[Node], err_index: Optional[int]
        ) -> List[TokenDict]:
            """ Transformation function that simply returns a list of POS-tagged,
                normalized tokens for the sentence """
            return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)

        with Fast_Parser(verbose=False, root=root) as parser:
            return TreeUtility._process_toklist(parser, toklist, xform)
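A usage sketch for the typed raw_tag_toklist(); the root nonterminal name follows the 'QueryRoot' convention seen in the query-parsing examples above:

# Hypothetical usage of raw_tag_toklist() above
from tokenizer import tokenize

pgs, stats = TreeUtility.raw_tag_toklist(
    tokenize("Hver er forsætisráðherra?"), root="QueryRoot"
)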
Example #12
 def parse(self):
     """ Parse the sentence """
     num = 0
     score = 0
     try:
         forest = self._ip._parser.go(self._s)
         if forest is not None:
             num = Fast_Parser.num_combinations(forest)
             if num > 1:
                 forest, score = self._ip._reducer.go_with_score(forest)
     except ParseError as e:
         forest = None
         self._err_index = e.token_index
     self._tree = forest
     self._score = score
     self._ip._add_sentence(self, num)
     return num > 0
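The loop that drives parse() is the IncrementalParser pattern used in several other examples on this page; a sketch:

# Hypothetical driver for the parse() method above
from tokenizer import tokenize

fp = Fast_Parser(verbose=False)
ip = IncrementalParser(fp, tokenize("Hann las bókina."), verbose=False)
for p in ip.paragraphs():
    for sent in p.sentences():
        if sent.parse():
            # sent.tree now holds the reduced forest, sent.score its score
            print("Parsed, score", sent.score)
        else:
            print("Sentence did not parse")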
Example #13
    def _parse(toklist):
        """ Parse a token list as a query """
        bp = Query._parser
        assert bp is not None
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees = dict()
        sent = []

        for t in toklist:
            if t[0] == TOK.S_BEGIN:
                sent = []
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                # Parse the accumulated sentence
                num = 0
                try:
                    # Parse the sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError:
                    forest = None
                    num = 0
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)

            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees
Example #14
    def parse_text(
        session: Session, text: str, all_names: bool = False
    ) -> Tuple[PgsList, StatsDict, Optional["RegisterType"]]:
        """ Parse plain text and return the parsed paragraphs as simplified trees """

        def xform(
            tokens: List[Tok], tree: Optional[Node], err_index: Optional[int]
        ) -> Union[None, List[TokenDict], CanonicalTokenDict]:
            """ Transformation function that yields a simplified parse tree
                with POS-tagged, normalized terminal leaves for the sentence """
            if err_index is not None:
                return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)
            # Successfully parsed: return a simplified tree for the sentence
            return TreeUtility._simplify_tree(tokens, tree)

        with Fast_Parser(verbose=False) as parser:  # Don't emit diagnostic messages
            # The type annotation cast(XformFunc, xform) is a hack
            return TreeUtility._process_text(
                parser, session, text, all_names, cast(XformFunc, xform)
            )
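A usage sketch for the typed parse_text(); it returns paragraphs of simplified sentence trees, parse statistics, and an optional name register:

# Hypothetical usage of parse_text() above
with SessionContext(read_only=True) as session:
    pgs, stats, register = TreeUtility.parse_text(session, "Kötturinn svaf.")
print(stats["num_parsed"], "sentence(s) parsed")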
Example #15
    def parse_text_with_full_tree(
        session: Session, text: str, all_names: bool = False
    ) -> Tuple[Optional[List[TokenDict]], Optional[Node], StatsDict]:
        """ Parse plain text, assumed to contain one sentence only, and
            return its simplified form as well as its full form. """

        full_tree: Optional[Node] = None

        def xform(
            tokens: List[Tok], tree: Optional[Node], err_index: Optional[int]
        ) -> Union[None, List[TokenDict], CanonicalTokenDict]:
            """ Transformation function that yields a simplified parse tree
                with POS-tagged, normalized terminal leaves for the sentence """
            if err_index is not None:
                return TreeUtility.dump_tokens(tokens, tree, error_index=err_index)
            # Successfully parsed: return a simplified tree for the sentence
            nonlocal full_tree
            # We are assuming that there is only one parsed sentence
            if full_tree is None:
                # Note the full tree of the first parsed paragraph
                full_tree = tree
            return TreeUtility._simplify_tree(tokens, tree)

        with Fast_Parser(verbose=False) as parser:
            # The cast(XformFunction, xform) type annotation is a hack
            pgs, stats, _ = TreeUtility._process_text(
                parser, session, text, all_names, cast(XformFunc, xform)
            )

        if (
            not pgs
            or stats["num_parsed"] == 0
            or not pgs[0]
            or any("err" in t for t in pgs[0][0])
        ):
            # The first sentence didn't parse: let's not beat around the bush with that fact
            return (None, None, stats)

        # Return the simplified tree, full tree and stats
        assert full_tree is not None
        return (pgs[0][0], full_tree, stats)
Example #16
def _make_tree(text: str) -> Tuple[Tree, str]:
    """Tokenize and parse text, create tree representation string
    from all the parse trees, return Tree object and token JSON."""
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)

    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)

    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json
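A usage sketch for _make_tree(); it returns both the Tree and the token JSON, matching the corrected signature:

# Hypothetical usage of _make_tree() above
tree, tokens_json = _make_tree("Páll fór út í búð.")
# tokens_json is a JSON string: paragraphs -> sentences -> token dicts
print(tokens_json)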
Example #17
File: main.py Project: thorunna/Greynir
        else:
            raise
    finally:
        ArticleProxy.cleanup()
        BIN_Db.cleanup()

else:
    app.config["PRODUCTION"] = True

    # Suppress information log messages from Werkzeug
    werkzeug_log = logging.getLogger("werkzeug")
    if werkzeug_log:
        werkzeug_log.setLevel(logging.WARNING)

    # Log our startup
    log_str = ("Greynir instance starting with "
               "host={0}:{1}, db_host={2}:{3} on Python {4}".format(
                   Settings.HOST,
                   Settings.PORT,
                   Settings.DB_HOSTNAME,
                   Settings.DB_PORT,
                   sys.version.replace("\n", " "),
               ))
    logging.info(log_str)
    print(log_str)
    sys.stdout.flush()

    # Running as a server module: pre-load the grammar into memory
    with Fast_Parser() as fp:
        pass
Example #18
def test_entities():
    text = """

       Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota.

       Danska byggingavörukeðjan Bygma hefur keypt íslenska
       verslunarfyrirtækið Húsasmiðjuna.

       Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs
       eru hluthafar í Arion banka.

       Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra.
       Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær.
       Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær.

       Primera Air var íslenskt flugfélag.
       Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag.

       Villeneuve-Loubet er franskt þorp.

       Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda.
       
       Í miðbæ Reykjavíkur er herrafataverslunin Geysir.

       Mér er sagt að Geysir sé hættur að gjósa.
       
       Geysir er hættur að gjósa.
       
       Geysir er gamall goshver.
       
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
       
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse"
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree,
                                                 token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree])
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val)
                          for key, val in trees.items())

    tree = Tree()
    tree.load(tree_string)

    session = SessionShim()
    tree.process(session, entities)

    session.check(("Bygma", "er", "dönsk byggingavörukeðja"))
    session.check(("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"))
    session.check(("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"))
    session.check(
        ("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Primera Air", "var", "íslenskt flugfélag"))
    session.check(("Villeneuve-Loubet", "er", "franskt þorp"))
    session.check(("Valdís", "er", "ísbúð"))
    session.check(("Fosshótel", "var", "rekin með tapi"))
    session.check(("Fosshótel", "er", "stór hótelkeðja"))
    session.check(("Norðurál", "er", "álverksmiðjan í Hvalfirði"))
    session.check(("Lax", "er", "stór fiskur af ætt laxfiska"))
    session.check(("Geysir", "er", "gamall goshver"))
    session.check(("Eimskipafélag Íslands hf", "er", "skipafélag"))
    session.check(("Origo", "er", "fyrirtæki"))
    session.check(("AirBerlin", "er", "flugfélag"))

    assert session.is_empty()
Example #19
File: article.py Project: haukurb/Reynir
 def _init_class(cls):
     """ Initialize class attributes """
     if cls._parser is None:
         # Don't emit diagnostic messages
         cls._parser = Fast_Parser(verbose=False)
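A sketch of how this lazy initializer is typically reached; the enclosing class name Article is taken from the file name above, and the call site is an assumption:

# Hypothetical call site for _init_class() above
Article._init_class()
parser = Article._parser  # shared Fast_Parser instance, created on first use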