def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower, from_phrase):
    """Yield ngrams from table phrases that are aligned with the given span(s).

    This is a generator. For every span taken from ``c`` whose sentence is
    both tabular and visual, each phrase of that sentence's table is checked
    for alignment against the span's bounding box.

    :param direction: ``'vert'`` selects ``bbox_vert_aligned``; any other
        value selects ``bbox_horz_aligned``.
    :param c: either a ``TemporarySpan`` (used directly) or an object
        exposing ``get_contexts()`` that returns the spans to inspect.
    :param attrib: name of the phrase attribute handed to
        ``tokens_to_ngrams``; only read when ``from_phrase`` is truthy.
    :param n_min: minimum ngram length; only used when ``from_phrase`` is
        truthy (the span-based mode passes only ``n_max`` to ``Ngrams``).
    :param n_max: maximum ngram length.
    :param lower: if truthy, span-based results are lower-cased here; in
        phrase-based mode the flag is forwarded to ``tokens_to_ngrams``.
    :param from_phrase: if truthy, alignment is tested on whole phrases and
        the span's own sentence is skipped; otherwise alignment is tested
        per candidate sub-span produced by ``Ngrams.apply``.
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == 'vert':
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    span_generator = Ngrams(n_max=n_max, split_tokens=[])

    def normalize(text):
        # Lower-case the text only when the caller asked for it.
        return text.lower() if lower else text

    if isinstance(c, TemporarySpan):
        candidates = [c]
    else:
        candidates = c.get_contexts()

    for span in candidates:
        sentence = span.sentence
        # Only sentences that live in a table and carry visual info qualify.
        if not (sentence.is_tabular() and sentence.is_visual()):
            continue
        for phrase in sentence.table.phrases:
            if from_phrase:
                # Phrase-level mode: the whole phrase must be aligned and
                # must not be the sentence the span itself belongs to.
                if not is_aligned(bbox_from_phrase(phrase), bbox_from_span(span)):
                    continue
                if phrase is sentence:
                    continue
                phrase_ngrams = tokens_to_ngrams(
                    getattr(phrase, attrib),
                    n_min=n_min,
                    n_max=n_max,
                    lower=lower,
                )
                for ngram in phrase_ngrams:
                    yield ngram
                continue

            # Span-level mode: test every candidate sub-span of the phrase.
            for ts in span_generator.apply(phrase):
                if not is_aligned(bbox_from_span(ts), bbox_from_span(span)):
                    continue
                text = ts.get_span()
                # Skip text that is contained in the span's own text when
                # the phrase is the span's own sentence.
                if phrase == sentence and text in span.get_span():
                    continue
                yield normalize(text)
def _preprocess_visual_features(doc): if hasattr(doc, '_visual_features'): return # cache flag doc._visual_features = True phrase_by_page = defaultdict(list) for phrase in doc.phrases: phrase_by_page[phrase.page[0]].append(phrase) phrase._aligned_lemmas = set() for page, phrases in phrase_by_page.items(): # process per page alignments yc_aligned = defaultdict(list) x0_aligned = defaultdict(list) xc_aligned = defaultdict(list) x1_aligned = defaultdict(list) for phrase in phrases: phrase.bbox = bbox_from_phrase(phrase) phrase.yc = (phrase.bbox.top + phrase.bbox.bottom) / 2 phrase.x0 = phrase.bbox.left phrase.x1 = phrase.bbox.right phrase.xc = (phrase.x0 + phrase.x1) / 2 # index current phrase by different alignment keys yc_aligned[phrase.yc].append(phrase) x0_aligned[phrase.x0].append(phrase) x1_aligned[phrase.x1].append(phrase) xc_aligned[phrase.xc].append(phrase) for l in yc_aligned.values(): l.sort(key=lambda p: p.xc) for l in x0_aligned.values(): l.sort(key=lambda p: p.yc) for l in x1_aligned.values(): l.sort(key=lambda p: p.yc) for l in xc_aligned.values(): l.sort(key=lambda p: p.yc) _assign_alignment_features(yc_aligned, 'Y_') _assign_alignment_features(x0_aligned, 'LEFT_') _assign_alignment_features(x1_aligned, 'RIGHT_') _assign_alignment_features(xc_aligned, 'CENTER_')