def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower,
                          from_phrase):
    """Yield n-grams from table phrases aligned with the span(s) of ``c``.

    Only spans whose sentence is both tabular and visual are considered;
    for each such span, every phrase of the span's own table is inspected.

    :param direction: ``'vert'`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: A ``TemporarySpan``, or an object exposing ``get_contexts()``
        that returns the spans to process.
    :param attrib: Name of the phrase attribute whose value is handed to
        ``tokens_to_ngrams`` (used only when ``from_phrase`` is true).
    :param n_min: Minimum n-gram length (used only when ``from_phrase`` is
        true).
    :param n_max: Maximum n-gram length.
    :param lower: If true, yielded n-grams are lowercased.
    :param from_phrase: If true, alignment is tested on the whole phrase's
        bounding box; otherwise on each individual n-gram span's box.
    :rtype: a *generator* of n-grams
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == 'vert':
        aligned = bbox_vert_aligned
    else:
        aligned = bbox_horz_aligned
    span_ngrams = Ngrams(n_max=n_max, split_tokens=[])

    if isinstance(c, TemporarySpan):
        spans = [c]
    else:
        spans = c.get_contexts()

    for span in spans:
        # Alignment is only defined for visual sentences inside a table.
        if not (span.sentence.is_tabular() and span.sentence.is_visual()):
            continue
        for phrase in span.sentence.table.phrases:
            if from_phrase:
                # Whole-phrase alignment; the span's own sentence is skipped.
                phrase_aligned = aligned(bbox_from_phrase(phrase),
                                         bbox_from_span(span))
                if phrase_aligned and phrase is not span.sentence:
                    phrase_tokens = getattr(phrase, attrib)
                    for ngram in tokens_to_ngrams(phrase_tokens,
                                                  n_min=n_min,
                                                  n_max=n_max,
                                                  lower=lower):
                        yield ngram
            else:
                for ts in span_ngrams.apply(phrase):
                    if not aligned(bbox_from_span(ts), bbox_from_span(span)):
                        continue
                    # Skip n-grams of the span's own sentence whose text is
                    # contained in the span's text.
                    if (phrase == span.sentence
                            and ts.get_span() in span.get_span()):
                        continue
                    text = ts.get_span()
                    yield text.lower() if lower else text
def is_horz_aligned(c):
    """Return True if all the components of c are horizontally aligned.

    Horizontal alignment means that the bounding boxes of each Span of c
    shares a similar y-axis value in the visual rendering of the document.

    Evaluation stops at the first component that is not visual or not
    aligned with the first component, so later components (and the bounding
    box of ``c[0]``) are not touched once the answer is known to be False.
    An empty ``c`` yields True.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    # Index-based access is kept on purpose: c is only required to support
    # len() and integer indexing. A generator (rather than a list) lets
    # all() short-circuit on the first failing component.
    return all(
        c[i].sentence.is_visual()
        and bbox_horz_aligned(bbox_from_span(c[i]), bbox_from_span(c[0]))
        for i in range(len(c))
    )
def is_vert_aligned_center(c):
    """Return True if all the components of c are vertically aligned on
    their centers.

    Vertical alignment means that the bounding boxes of each Span of c
    shares a similar x-axis value in the visual rendering of the document.
    In this function the similarity of the x-axis value is based on the
    center of their bounding boxes.

    Evaluation stops at the first component that is not visual or not
    aligned with the first component, so later components (and the bounding
    box of ``c[0]``) are not touched once the answer is known to be False.
    An empty ``c`` yields True.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    # Index-based access is kept on purpose: c is only required to support
    # len() and integer indexing. A generator (rather than a list) lets
    # all() short-circuit on the first failing component.
    return all(
        c[i].sentence.is_visual()
        and bbox_vert_aligned_center(
            bbox_from_span(c[i]), bbox_from_span(c[0]))
        for i in range(len(c))
    )
def same_page(c):
    """Return True if all the components of c are on the same page of the
    document.

    Page numbers are based on the PDF rendering of the document. If a PDF
    file is provided, it is used. Otherwise, if only a HTML/XML document is
    provided, a PDF is created and then used to determine the page number of
    a Span.

    Evaluation stops at the first component that is not visual or is on a
    different page than the first component, so the bounding box of ``c[0]``
    is never dereferenced when ``c[0]`` itself is not visual. An empty ``c``
    yields True.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    # Index-based access is kept on purpose: c is only required to support
    # len() and integer indexing. A generator (rather than a list) lets
    # all() short-circuit on the first failing component.
    return all(
        c[i].sentence.is_visual()
        and bbox_from_span(c[i]).page == bbox_from_span(c[0]).page
        for i in range(len(c))
    )
def get_page_vert_percentile(span,
                             page_width=DEFAULT_WIDTH,
                             page_height=DEFAULT_HEIGHT):
    """Return which percentile from the TOP of the page the Span candidate
    is located in.

    The percentile is computed as the ``top`` coordinate of the Span's
    bounding box divided by ``page_height``, so the top of the page is 0.0
    and the bottom of the page is 1.0. For example, a Span one quarter of
    the way down the page will have a percentile of 0.25.

    Page width and height are based on pt values::

        Letter      612x792
        Tabloid     792x1224
        Ledger      1224x792
        Legal       612x1008
        Statement   396x612
        Executive   540x720
        A0          2384x3371
        A1          1685x2384
        A2          1190x1684
        A3          842x1190
        A4          595x842
        A4Small     595x842
        A5          420x595
        B4          729x1032
        B5          516x729
        Folio       612x936
        Quarto      610x780
        10x14       720x1008

    and should match the source documents. Letter size is used by default.

    Note that if a candidate is passed in, only the vertical percentile of
    its first Span is returned.

    :param span: The Span to evaluate
    :param page_width: The width of the page. Default to Letter paper width.
        Not used by the computation.
    :param page_height: The height of the page. Default to Letter paper
        height.
    :rtype: float in [0.0, 1.0]
    """
    if not isinstance(span, TemporarySpan):
        # A candidate was passed in: evaluate its first Span only.
        span = span[0]
    top = bbox_from_span(span).top
    return top / page_height