def get_box(span: SpanMention) -> Tuple[int, int, int, int, int]:
    """Return the bounding box of ``span`` as a plain tuple.

    Each coordinate is reduced over the values returned by
    ``span.get_attrib_tokens`` for the matching attribute: the minimum is
    taken for ``page``, ``top`` and ``left``, the maximum for ``bottom`` and
    ``right``.

    :param span: The mention span whose token attributes are inspected.
    :return: ``(page, top, left, bottom, right)``.
    :raises ValueError: Propagated from ``min``/``max`` when one of the
        attribute sequences is empty.
    """
    page = min(span.get_attrib_tokens("page"))
    top = min(span.get_attrib_tokens("top"))
    left = min(span.get_attrib_tokens("left"))
    bottom = max(span.get_attrib_tokens("bottom"))
    right = max(span.get_attrib_tokens("right"))
    return page, top, left, bottom, right
def get_box(span: SpanMention) -> Bbox:
    """Get the bounding box.

    Deprecated: use ``span.get_bbox()`` instead. Every call emits a
    ``DeprecationWarning``.

    :param span: The mention span whose token attributes are inspected.
    :return: A ``Bbox`` built positionally from the minimum ``page``, minimum
        ``top``, maximum ``bottom``, minimum ``left`` and maximum ``right``
        values returned by ``span.get_attrib_tokens``.
    :raises ValueError: Propagated from ``min``/``max`` when one of the
        attribute sequences is empty.
    """
    warnings.warn(
        "get_box(span) is deprecated. Use span.get_bbox() instead.",
        DeprecationWarning,
        # Attribute the warning to the caller rather than to this helper:
        # the default filters only display a DeprecationWarning when it is
        # attributed to ``__main__``, so without this it stays invisible.
        stacklevel=2,
    )
    return Bbox(
        min(span.get_attrib_tokens("page")),
        min(span.get_attrib_tokens("top")),
        max(span.get_attrib_tokens("bottom")),
        min(span.get_attrib_tokens("left")),
        max(span.get_attrib_tokens("right")),
    )
def _get_word_feats(span: SpanMention) -> Iterator[str]:
    """Yield the unary word-level features of ``span``.

    Features are computed once per ``span.stable_id`` and memoized in the
    module-level ``unary_word_feats`` mapping; later calls for the same id
    replay the cached set.

    The feature set contains:

    * ``CONTAINS_WORDS_[...]`` for every n-gram produced by
      ``tokens_to_ngrams`` (``n_min=1``, ``n_max=2``) over the span's
      ``words`` tokens;
    * ``LEFT_WORDS_[...]`` / ``RIGHT_WORDS_[...]`` for every n-gram produced
      by ``get_left_ngrams`` / ``get_right_ngrams`` (``n_max=2``) using the
      window configured under
      ``settings["featurization"]["textual"]["word_feature"]["window"]``;
    * ``SPAN_TYPE_[IMPLICIT]`` when ``span`` is an ``ImplicitSpanMention``,
      otherwise ``SPAN_TYPE_[EXPLICIT]``;
    * ``STARTS_WITH_CAPITAL`` when the first element of ``span.get_span()``
      is upper case;
    * ``LENGTH_<n>`` where ``n`` is ``span.get_num_words()``.

    :param span: The mention span to featurize.
    :return: An iterator over the feature strings, in set iteration order.
    """
    attrib = "words"
    span_id = span.stable_id

    if span_id not in unary_word_feats:
        # Build into a local set and publish it only once it is complete, so
        # an exception part-way through cannot leave a truncated cache entry
        # that later calls would silently replay as if it were the full set.
        feats = set()
        prefix = attrib.upper()
        window = settings["featurization"]["textual"]["word_feature"]["window"]

        for ngram in tokens_to_ngrams(
            span.get_attrib_tokens(attrib), n_min=1, n_max=2
        ):
            feats.add(f"CONTAINS_{prefix}_[{ngram}]")

        for ngram in get_left_ngrams(span, window=window, n_max=2, attrib=attrib):
            feats.add(f"LEFT_{prefix}_[{ngram}]")

        for ngram in get_right_ngrams(span, window=window, n_max=2, attrib=attrib):
            feats.add(f"RIGHT_{prefix}_[{ngram}]")

        span_type = "IMPLICIT" if isinstance(span, ImplicitSpanMention) else "EXPLICIT"
        feats.add(f"SPAN_TYPE_[{span_type}]")

        # Guard against an empty span so indexing [0] cannot raise IndexError.
        text = span.get_span()
        if text and text[0].isupper():
            feats.add("STARTS_WITH_CAPITAL")

        feats.add(f"LENGTH_{span.get_num_words()}")

        unary_word_feats[span_id] = feats

    yield from unary_word_feats[span_id]