def test_filter_edges(self):
    """Check that ``utils.filter_edges`` rejects the argument ``"x"``.

    The call passes an empty edge list and ``"x"`` as the second positional
    argument and expects a ``ValueError``.
    """
    no_edges = []
    # Presumably an unsupported orientation value; confirm against
    # utils.filter_edges, which is not visible from this test.
    bad_value = "x"
    with pytest.raises(ValueError):
        utils.filter_edges(no_edges, bad_value)
def get_edges(self):
    """Build the list of edges selected by ``self.settings``.

    The vertical and horizontal strategies are validated first. For each
    orientation, a base set of edges is chosen according to its strategy
    ("lines", "lines_strict", "text" or "explicit") and the edges described
    by ``explicit_vertical_lines`` / ``explicit_horizontal_lines`` are
    appended to it. The combined edges are optionally passed through
    ``merge_edges`` and finally filtered by ``edge_min_length``.

    Returns:
        The result of ``utils.filter_edges`` applied to the combined edges
        with ``min_length=settings["edge_min_length"]``.

    Raises:
        ValueError: If a strategy is not in ``TABLE_STRATEGIES``, or if a
            strategy is "explicit" and fewer than two explicit lines were
            supplied for that orientation.
    """
    settings = self.settings

    for name in ["vertical", "horizontal"]:
        strategy_key = name + "_strategy"
        strategy = settings[strategy_key]
        if strategy not in TABLE_STRATEGIES:
            raise ValueError("{0} must be one of {{{1}}}".format(
                strategy_key,
                ",".join(TABLE_STRATEGIES)
            ))
        if strategy == "explicit":
            lines_key = "explicit_" + name + "_lines"
            if len(settings[lines_key]) < 2:
                # Report the real setting name (e.g. "vertical_strategy"),
                # not the strategy value with "_strategy" appended.
                raise ValueError(
                    "If {0} == 'explicit', {1} must be specified as "
                    "list/tuple of two or more floats/ints.".format(
                        strategy_key,
                        lines_key,
                    )
                )

    v_strat = settings["vertical_strategy"]
    h_strat = settings["horizontal_strategy"]

    # Words are only extracted when at least one orientation needs them.
    if v_strat == "text" or h_strat == "text":
        # The per-axis tolerances fall back to the shared "text_tolerance".
        xt = settings["text_x_tolerance"]
        if xt is None:
            xt = settings["text_tolerance"]
        yt = settings["text_y_tolerance"]
        if yt is None:
            yt = settings["text_tolerance"]
        words = self.page.extract_words(
            x_tolerance=xt,
            y_tolerance=yt,
            keep_blank_chars=settings["keep_blank_chars"]
        )

    def v_edge_desc_to_edge(desc):
        """Turn an explicit vertical line description into an edge dict.

        ``desc`` is either a dict (missing "x0"/"x1" fall back to "x",
        missing "top"/"bottom" fall back to the page bbox) or a bare value
        used as the x position of a line spanning the page bbox vertically.
        """
        if isinstance(desc, dict):
            edge = {
                "x0": desc.get("x0", desc.get("x")),
                "x1": desc.get("x1", desc.get("x")),
                "top": desc.get("top", self.page.bbox[1]),
                "bottom": desc.get("bottom", self.page.bbox[3]),
                "orientation": "v"
            }
        else:
            edge = {
                "x0": desc,
                "x1": desc,
                "top": self.page.bbox[1],
                "bottom": self.page.bbox[3],
            }
        edge["height"] = edge["bottom"] - edge["top"]
        edge["orientation"] = "v"
        return edge

    v_explicit = list(map(
        v_edge_desc_to_edge,
        settings["explicit_vertical_lines"]
    ))

    if v_strat == "lines":
        v_base = utils.filter_edges(self.page.edges, "v")
    elif v_strat == "lines_strict":
        v_base = utils.filter_edges(
            self.page.edges, "v", edge_type="lines"
        )
    elif v_strat == "text":
        v_base = words_to_edges_v(
            words, word_threshold=settings["min_words_vertical"]
        )
    elif v_strat == "explicit":
        v_base = []

    v = v_base + v_explicit

    def h_edge_desc_to_edge(desc):
        """Turn an explicit horizontal line description into an edge dict.

        ``desc`` is either a dict (missing "x0"/"x1" fall back to the page
        bbox, "top" and "bottom" fall back to each other) or a bare value
        used as the y position of a line spanning the page bbox
        horizontally.
        """
        if isinstance(desc, dict):
            edge = {
                "x0": desc.get("x0", self.page.bbox[0]),
                "x1": desc.get("x1", self.page.bbox[2]),
                "top": desc.get("top", desc.get("bottom")),
                "bottom": desc.get("bottom", desc.get("top")),
            }
        else:
            edge = {
                "x0": self.page.bbox[0],
                "x1": self.page.bbox[2],
                "top": desc,
                "bottom": desc,
            }
        edge["width"] = edge["x1"] - edge["x0"]
        edge["orientation"] = "h"
        return edge

    h_explicit = list(map(
        h_edge_desc_to_edge,
        settings["explicit_horizontal_lines"]
    ))

    if h_strat == "lines":
        h_base = utils.filter_edges(self.page.edges, "h")
    elif h_strat == "lines_strict":
        h_base = utils.filter_edges(
            self.page.edges, "h", edge_type="lines"
        )
    elif h_strat == "text":
        h_base = words_to_edges_h(
            words, word_threshold=settings["min_words_horizontal"]
        )
    elif h_strat == "explicit":
        h_base = []

    h = h_base + h_explicit

    edges = list(v) + list(h)

    # Merging is skipped entirely when both tolerances are non-positive.
    if settings["snap_tolerance"] > 0 or settings["join_tolerance"] > 0:
        edges = merge_edges(
            edges,
            snap_tolerance=settings["snap_tolerance"],
            join_tolerance=settings["join_tolerance"],
        )

    return utils.filter_edges(edges, min_length=settings["edge_min_length"])