def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower, from_sentence):
    """Yield ngrams from table sentences that are bbox-aligned with ``c``.

    For every span of ``c`` whose sentence is both tabular and visual, the
    sentences of that span's table are scanned and ngrams are produced from
    the ones aligned with the span.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: a ``TemporarySpan`` (used directly) or an object exposing
        ``get_contexts()`` that returns the spans to inspect.
    :param attrib: sentence attribute holding the tokens; only read when
        ``from_sentence`` is true.
    :param n_min: minimum ngram length; only used when ``from_sentence`` is
        true.
    :param n_max: maximum ngram length (used in both modes).
    :param lower: whether the yielded ngrams are lower-cased.
    :param from_sentence: if true, alignment is tested per sentence and the
        ngrams of every aligned sentence other than the span's own are
        yielded; if false, alignment is tested per candidate ngram span.
    :return: a generator of ngram strings.
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned

    ngram_space = Ngrams(n_max=n_max, split_tokens=[])

    def normalize(text):
        return text.lower() if lower else text

    if isinstance(c, TemporarySpan):
        spans = [c]
    else:
        spans = c.get_contexts()

    for span in spans:
        own_sentence = span.sentence
        if not own_sentence.is_tabular() or not own_sentence.is_visual():
            continue

        for sentence in span.sentence.table.sentences:
            if from_sentence:
                # Whole-sentence mode: take every ngram of an aligned
                # sentence, except the sentence the span itself lives in.
                aligned = is_aligned(
                    bbox_from_sentence(sentence), bbox_from_span(span)
                )
                if aligned and sentence is not span.sentence:
                    yield from tokens_to_ngrams(
                        getattr(sentence, attrib),
                        n_min=n_min,
                        n_max=n_max,
                        lower=lower,
                    )
                continue

            # Per-ngram mode: test each candidate ngram span on its own.
            for candidate in ngram_space.apply(sentence):
                if not is_aligned(bbox_from_span(candidate), bbox_from_span(span)):
                    continue
                # Skip ngrams of the span's own sentence whose text is
                # contained in the span's text.
                if (
                    sentence == span.sentence
                    and candidate.get_span() in span.get_span()
                ):
                    continue
                yield normalize(candidate.get_span())
def _preprocess_visual_features(doc: Document) -> None:
    """Annotate the sentences of ``doc`` with per-page alignment data.

    The work is done at most once per document: the presence of the
    ``_visual_features`` attribute on ``doc`` marks it as already processed.

    Each sentence gets an empty ``_aligned_lemmas`` set and is grouped by
    ``sentence.page[0]``. Within each page, every sentence is given ``bbox``,
    ``x0``, ``x1``, ``xc`` and ``yc`` attributes and is indexed by its exact
    ``yc``, ``x0``, ``x1`` and ``xc`` values. The ``yc`` groups are ordered by
    ``xc`` and the other groups by ``yc``, and each index is then passed to
    ``_assign_alignment_features`` with the prefix ``"Y_"``, ``"LEFT_"``,
    ``"RIGHT_"`` or ``"CENTER_"`` respectively.

    :param doc: the document whose sentences are annotated in place.
    """
    if hasattr(doc, "_visual_features"):
        return
    # cache flag
    doc._visual_features = True

    by_page: DefaultDict[str, List[Sentence]] = defaultdict(list)
    for sent in doc.sentences:
        by_page[sent.page[0]].append(sent)
        sent._aligned_lemmas = set()

    # process per page alignments
    for page_sentences in by_page.values():
        same_yc: DefaultDict[int, List[Sentence]] = defaultdict(list)
        same_x0: DefaultDict[int, List[Sentence]] = defaultdict(list)
        same_xc: DefaultDict[int, List[Sentence]] = defaultdict(list)
        same_x1: DefaultDict[int, List[Sentence]] = defaultdict(list)

        for sent in page_sentences:
            sent.bbox = bbox_from_sentence(sent)
            box = sent.bbox
            sent.yc = (box.top + box.bottom) / 2
            sent.x0 = box.left
            sent.x1 = box.right
            sent.xc = (sent.x0 + sent.x1) / 2

            # index current sentence by different alignment keys
            same_yc[sent.yc].append(sent)
            same_x0[sent.x0].append(sent)
            same_x1[sent.x1].append(sent)
            same_xc[sent.xc].append(sent)

        # Sentences sharing a vertical centre are ordered by horizontal centre.
        for group in same_yc.values():
            group.sort(key=lambda s: s.xc)
        # Sentences sharing a left/right edge or horizontal centre are
        # ordered by vertical centre.
        for index in (same_x0, same_x1, same_xc):
            for group in index.values():
                group.sort(key=lambda s: s.yc)

        _assign_alignment_features(same_yc, "Y_")
        _assign_alignment_features(same_x0, "LEFT_")
        _assign_alignment_features(same_x1, "RIGHT_")
        _assign_alignment_features(same_xc, "CENTER_")
def _get_direction_ngrams(
    direction: str,
    c: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str,
    n_min: int,
    n_max: int,
    lower: bool,
    from_sentence: bool,
) -> Iterator[str]:
    """Yield ngrams from table sentences that are bbox-aligned with ``c``.

    For every span obtained from ``_to_spans(c)`` whose sentence is both
    tabular and visual, the sentences of that span's table are scanned and
    ngrams are produced from the ones aligned with the span.

    :param direction: ``"vert"`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: the candidate, mention or span to find aligned ngrams for.
    :param attrib: sentence attribute holding the tokens; only read when
        ``from_sentence`` is true.
    :param n_min: minimum ngram length; only used when ``from_sentence`` is
        true.
    :param n_max: maximum ngram length (used in both modes).
    :param lower: whether the yielded ngrams are lower-cased.
    :param from_sentence: if true, alignment is tested per sentence and the
        ngrams of every aligned sentence other than the span's own are
        yielded; if false, alignment is tested per candidate ngram span.
    :return: a generator of ngram strings.
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == "vert":
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned

    ngram_space = Ngrams(n_max=n_max, split_tokens=[])

    def normalize(text):
        return text.lower() if lower else text

    for span in _to_spans(c):
        own_sentence = span.sentence
        if not own_sentence.is_tabular() or not own_sentence.is_visual():
            continue

        for sentence in span.sentence.table.sentences:
            if from_sentence:
                # Whole-sentence mode: take every ngram of an aligned
                # sentence, except the sentence the span itself lives in.
                aligned = is_aligned(
                    bbox_from_sentence(sentence), bbox_from_span(span)
                )
                if aligned and sentence is not span.sentence:
                    yield from tokens_to_ngrams(
                        getattr(sentence, attrib),
                        n_min=n_min,
                        n_max=n_max,
                        lower=lower,
                    )
                continue

            # Per-ngram mode: test each candidate ngram span on its own.
            for candidate in ngram_space.apply(sentence):
                if not is_aligned(bbox_from_span(candidate), bbox_from_span(span)):
                    continue
                # Skip ngrams of the span's own sentence whose text is
                # contained in the span's text.
                if (
                    sentence == span.sentence
                    and candidate.get_span() in span.get_span()
                ):
                    continue
                yield normalize(candidate.get_span())