예제 #1
0
 def test_filter_edges(self):
     """Check that utils.filter_edges rejects an unsupported second argument.

     Calls utils.filter_edges with an empty edge list and "x" as the second
     positional argument (presumably an invalid orientation -- confirm
     against utils.filter_edges) and expects a ValueError.
     """
     no_edges = []
     bad_orientation = "x"
     with pytest.raises(ValueError):
         utils.filter_edges(no_edges, bad_orientation)
예제 #2
0
    def get_edges(self):
        """Collect the candidate edges for this page as configured by self.settings.

        For each direction (vertical, horizontal) the base edges are chosen
        by the "<direction>_strategy" setting:

        - "lines": utils.filter_edges over self.page.edges for that
          orientation;
        - "lines_strict": the same, restricted with edge_type="lines";
        - "text": edges derived from the page's extracted words via
          words_to_edges_v / words_to_edges_h;
        - "explicit": no base edges.

        Edges built from "explicit_<direction>_lines" are always appended to
        the base edges, whatever the strategy. The combined edges are passed
        through merge_edges when either "snap_tolerance" or "join_tolerance"
        is positive, and finally filtered by "edge_min_length".

        Returns:
            The result of utils.filter_edges applied to the combined edges.

        Raises:
            ValueError: if a strategy setting is not in TABLE_STRATEGIES, or
                if a strategy is "explicit" and fewer than two explicit
                lines are supplied for that direction.
        """
        settings = self.settings

        # Validate both strategy settings before doing any work.
        for name in ("vertical", "horizontal"):
            strategy_key = name + "_strategy"
            strategy = settings[strategy_key]
            if strategy not in TABLE_STRATEGIES:
                raise ValueError("{0} must be one of {{{1}}}".format(
                    strategy_key,
                    ",".join(TABLE_STRATEGIES)
                ))
            if strategy == "explicit":
                lines_key = "explicit_" + name + "_lines"
                if len(settings[lines_key]) < 2:
                    # Report the actual setting name (e.g. "vertical_strategy"),
                    # not the strategy value with "_strategy" appended.
                    raise ValueError("If {0} == 'explicit', {1} must be specified as list/tuple of two or more floats/ints.".format(
                        strategy_key,
                        lines_key,
                    ))

        v_strat = settings["vertical_strategy"]
        h_strat = settings["horizontal_strategy"]

        # Words are only needed (and only extracted) for the "text" strategy.
        if v_strat == "text" or h_strat == "text":
            # Per-axis tolerances fall back to the shared "text_tolerance".
            xt = settings["text_x_tolerance"]
            if xt is None:
                xt = settings["text_tolerance"]
            yt = settings["text_y_tolerance"]
            if yt is None:
                yt = settings["text_tolerance"]
            words = self.page.extract_words(
                x_tolerance=xt,
                y_tolerance=yt,
                keep_blank_chars=settings["keep_blank_chars"]
            )

        def v_edge_desc_to_edge(desc):
            # Turn an explicit vertical line description (a dict, or a bare
            # x position) into an edge dict spanning the page bbox by default.
            if isinstance(desc, dict):
                edge = {
                    "x0": desc.get("x0", desc.get("x")),
                    "x1": desc.get("x1", desc.get("x")),
                    "top": desc.get("top", self.page.bbox[1]),
                    "bottom": desc.get("bottom", self.page.bbox[3]),
                }
            else:
                edge = {
                    "x0": desc,
                    "x1": desc,
                    "top": self.page.bbox[1],
                    "bottom": self.page.bbox[3],
                }
            edge["height"] = edge["bottom"] - edge["top"]
            edge["orientation"] = "v"
            return edge

        v_explicit = [
            v_edge_desc_to_edge(desc)
            for desc in settings["explicit_vertical_lines"]
        ]

        if v_strat == "lines":
            v_base = utils.filter_edges(self.page.edges, "v")
        elif v_strat == "lines_strict":
            v_base = utils.filter_edges(self.page.edges, "v",
                edge_type="lines")
        elif v_strat == "text":
            v_base = words_to_edges_v(words,
                word_threshold=settings["min_words_vertical"])
        elif v_strat == "explicit":
            v_base = []

        v = v_base + v_explicit

        def h_edge_desc_to_edge(desc):
            # Turn an explicit horizontal line description (a dict, or a bare
            # y position) into an edge dict spanning the page bbox by default.
            if isinstance(desc, dict):
                edge = {
                    "x0": desc.get("x0", self.page.bbox[0]),
                    "x1": desc.get("x1", self.page.bbox[2]),
                    "top": desc.get("top", desc.get("bottom")),
                    "bottom": desc.get("bottom", desc.get("top")),
                }
            else:
                edge = {
                    "x0": self.page.bbox[0],
                    "x1": self.page.bbox[2],
                    "top": desc,
                    "bottom": desc,
                }
            edge["width"] = edge["x1"] - edge["x0"]
            edge["orientation"] = "h"
            return edge

        h_explicit = [
            h_edge_desc_to_edge(desc)
            for desc in settings["explicit_horizontal_lines"]
        ]

        if h_strat == "lines":
            h_base = utils.filter_edges(self.page.edges, "h")
        elif h_strat == "lines_strict":
            h_base = utils.filter_edges(self.page.edges, "h",
                edge_type="lines")
        elif h_strat == "text":
            h_base = words_to_edges_h(words,
                word_threshold=settings["min_words_horizontal"])
        elif h_strat == "explicit":
            h_base = []

        h = h_base + h_explicit

        edges = list(v) + list(h)
        # Merging is skipped entirely when both tolerances are non-positive.
        if settings["snap_tolerance"] > 0 or settings["join_tolerance"] > 0:
            edges = merge_edges(edges,
                snap_tolerance=settings["snap_tolerance"],
                join_tolerance=settings["join_tolerance"],
            )
        return utils.filter_edges(edges,
            min_length=settings["edge_min_length"])