Exemplo n.º 1
0
def test_figure_matcher(doc_setup):
    """Test matchers for figures.

    Checks that ``LambdaFunctionFigureMatcher`` keeps only the figure
    mentions accepted by its predicate, that it rejects a misspelled
    keyword argument, and that it raises ``ValueError`` when applied to
    mentions produced by ``MentionNgrams``.

    :param doc_setup: The document under test (presumably supplied by a
        pytest fixture of the same name -- confirm in conftest).
    """
    doc = doc_setup
    # Create two dummy figures
    Figure(id=2, document=doc)
    Figure(id=3, document=doc)
    assert len(doc.figures) == 2

    space = MentionFigures()
    assert len(list(space.apply(doc))) == 2

    # Set up a matcher that matches figures with id==2.
    # The comparison already yields a bool, so no conditional expression
    # is needed around it.
    matcher = LambdaFunctionFigureMatcher(func=lambda tf: tf.figure.id == 2)

    # Test if matcher only matches the figure with id==2. Materialize the
    # matches once and check both their count and their identity.
    matched = list(matcher.apply(space.apply(doc)))
    assert len(matched) == 1
    assert {tf.figure.id for tf in matched} == {2}

    # The keyword arg should be "func"
    with pytest.raises(Exception):
        LambdaFunctionFigureMatcher(function=lambda tf: tf.figure.id == 2)

    # LambdaFunctionFigureMatcher only supports TemporaryFigureMention.
    space = MentionNgrams(n_min=1, n_max=2)
    with pytest.raises(ValueError):
        list(matcher.apply(space.apply(doc)))
    def _parse_figure(self, node, state):
        """Parse the figure node.

        Handles ``img`` and ``figure`` nodes; any other tag is ignored. When
        a Figure is created it is stored in ``state["context"][node]`` and
        ``state["figure"]["idx"]`` is incremented.

        A ``figure`` node is skipped (no Figure created, its ``figcaption``
        children marked as visited) unless it has exactly one direct ``img``
        child.

        :param node: The lxml img node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: The same ``state`` object, possibly mutated.
        """
        if node.tag not in ["img", "figure"]:
            return state

        parent = state["parent"][node]

        # img within a Figure gets processed in the parent Figure
        if node.tag == "img" and isinstance(parent, Figure):
            return state

        # NOTE: We currently do NOT support nested figures.
        parts = {}
        if isinstance(parent, Section):
            parts["section"] = parent
        elif isinstance(parent, Cell):
            parts["section"] = parent.table.section
            parts["cell"] = parent
        else:
            logger.warning(f"Figure is nested within {parent}")
            return state

        position = state["figure"]["idx"]
        parts["document"] = state["document"]
        parts["stable_id"] = f"{state['document'].name}::figure:{position}"
        # The name is optional; None when the attribute is absent.
        parts["name"] = node.attrib.get("name")
        parts["position"] = position

        if node.tag == "img":
            # Processing a raw img: the URL comes from the node itself.
            parts["url"] = node.get("src")
        else:
            # Pull the image from a child img node, if one exists
            imgs = [child for child in node if child.tag == "img"]

            if len(imgs) != 1:
                if imgs:
                    # Right now we don't support multiple URLs in the Figure
                    # context. As a workaround, just ignore the outer Figure
                    # and allow processing of the individual images.
                    logger.warning("Figure contains multiple images.")
                else:
                    # A <figure> is not required to contain an <img>;
                    # indexing imgs[0] here would raise IndexError.
                    logger.warning("Figure contains no image.")
                # The outer Figure is not created in either case, so ignore
                # the accompanying figcaption by marking it as visited.
                captions = [
                    child for child in node if child.tag == "figcaption"
                ]
                state["visited"].update(captions)
                return state

            img = imgs[0]
            # The img is consumed by this Figure; don't parse it again.
            state["visited"].add(img)
            parts["url"] = img.get("src")

        # Create the Figure entry in the DB
        state["context"][node] = Figure(**parts)

        state["figure"]["idx"] += 1
        return state
Exemplo n.º 3
0
    def _parse_figure(self, node, state):
        """Parse the figure node.

        Handles ``img`` and ``figure`` nodes; any other tag is ignored. A
        Figure is only created when the node's parent context is a Section.
        When a Figure is created it is stored in ``state["context"][node]``
        and ``state["figure"]["idx"]`` is incremented.

        A ``figure`` node is skipped (no Figure created, its ``figcaption``
        children marked as visited) unless it has exactly one direct ``img``
        child.

        :param node: The lxml img node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: The same ``state`` object, possibly mutated.
        """
        if node.tag not in ["img", "figure"]:
            return state

        parent = state["parent"][node]

        # img within a Figure gets processed in the parent Figure
        if node.tag == "img" and isinstance(parent, Figure):
            return state

        # NOTE: We currently do NOT support nested figures.
        if not isinstance(parent, Section):
            logger.warning("Figure is nested within {}".format(parent))
            return state

        if node.tag == "img":
            # Processing a raw img: the URL comes from the node itself.
            url = node.get("src")
        else:
            # Pull the image from a child img node, if one exists
            imgs = [child for child in node if child.tag == "img"]

            if len(imgs) != 1:
                if imgs:
                    # Right now we don't support multiple URLs in the Figure
                    # context. As a workaround, just ignore the outer Figure
                    # and allow processing of the individual images.
                    logger.warning("Figure contains multiple images.")
                else:
                    # A <figure> is not required to contain an <img>;
                    # indexing imgs[0] here would raise IndexError.
                    logger.warning("Figure contains no image.")
                # The outer Figure is not created in either case, so ignore
                # the accompanying figcaption by marking it as visited.
                captions = [child for child in node if child.tag == "figcaption"]
                state["visited"].update(captions)
                return state

            img = imgs[0]
            # The img is consumed by this Figure; don't parse it again.
            state["visited"].add(img)
            url = img.get("src")

        # Create the Figure entry in the DB
        position = state["figure"]["idx"]
        state["context"][node] = Figure(
            document=state["document"],
            section=parent,
            stable_id="{}::{}:{}".format(state["document"].name, "figure", position),
            position=position,
            url=url,
        )

        state["figure"]["idx"] += 1
        return state