def _get_axis_ngrams(
    mention, axis, attrib="words", n_min=1, n_max=1, spread=None, lower=True
):
    """Yield ngrams aligned with the mention along the given table axis.

    Yields the mention's own sentence ngrams first, then the ngrams of every
    Sentence aligned with it on ``axis``. If the mention is not tabular, a
    single ``None`` is yielded (preserved for existing callers).

    :param mention: The Mention (or Candidate) to evaluate
    :param axis: Which axis {'row', 'col'} to search along
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: [before, after] spread passed to the alignment lookup;
        defaults to [0, 0]
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    # Bug fix: avoid a mutable default argument; [0, 0] stays the effective
    # default.
    if spread is None:
        spread = [0, 0]
    span = _to_span(mention)
    if not span.sentence.is_tabular():
        yield None
        return
    for ngram in get_sentence_ngrams(
        span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
    ):
        yield ngram
    # The early return above guarantees the sentence is tabular here, so the
    # original redundant is_tabular() re-check has been dropped (dead branch).
    for sentence in _get_aligned_sentences(span.sentence, axis, spread=spread):
        for ngram in tokens_to_ngrams(
            getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
        ):
            yield ngram
def get_cell_ngrams(mention, attrib="words", n_min=1, n_max=1, lower=True):
    """Get the ngrams that are in the Cell of the given mention, not including itself.

    Note that if a candidate is passed in, all of its Mentions will be searched.

    :param mention: The Mention whose Cell is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    for span in _to_spans(mention):
        # The mention's own sentence ngrams always come first.
        yield from get_sentence_ngrams(
            span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
        )
        if not span.sentence.is_tabular():
            continue
        # Then the ngrams of every *other* sentence sharing the same cell.
        cell_sentences = _get_table_cells(span.sentence.table)[span.sentence.cell]
        for other in cell_sentences:
            if other != span.sentence:
                yield from tokens_to_ngrams(
                    getattr(other, attrib), n_min=n_min, n_max=n_max, lower=lower
                )
def get_word_feats(span):
    """Yield word-level features for a span, memoized by ``span.stable_id``.

    Features cover the span's own 1- and 2-grams plus ngrams in a window to
    its left and right. Results are cached in the module-level
    ``unary_word_feats`` dict (mapping stable_id -> set of feature strings).

    :param span: The span to featurize
    :rtype: a *generator* of feature strings
    """
    attrib = "words"
    if span.stable_id not in unary_word_feats:
        unary_word_feats[span.stable_id] = set()
        # Bug fix: features were previously added via unary_word_feats.add(...)
        # which raises AttributeError on a dict; add to this span's set instead
        # (matching the corrected _get_word_feats implementation).
        feats = unary_word_feats[span.stable_id]
        for ngram in tokens_to_ngrams(
            span.get_attrib_tokens(attrib), n_min=1, n_max=2
        ):
            feats.add(f"CONTAINS_{attrib.upper()}_[{ngram}]")
        # Hoist the config lookup; both window scans use the same setting.
        window = settings["featurization"]["content"]["word_feature"]["window"]
        for ngram in get_left_ngrams(span, window=window, n_max=2, attrib=attrib):
            feats.add(f"LEFT_{attrib.upper()}_[{ngram}]")
        for ngram in get_right_ngrams(span, window=window, n_max=2, attrib=attrib):
            feats.add(f"RIGHT_{attrib.upper()}_[{ngram}]")
    for f in unary_word_feats[span.stable_id]:
        yield f
def get_neighbor_sentence_ngrams(mention, d=1, attrib="words", n_min=1, n_max=1, lower=True):
    """Get the ngrams that are in the neighboring Sentences of the given Mention.

    Note that if a candidate is passed in, all of its Mentions will be searched.

    :param mention: The Mention whose neighbor Sentences are being searched
    :param d: The maximum sentence-position distance for a Sentence to count
        as a neighbor
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    spans = _to_spans(mention)
    for span in spans:
        # Scan every sentence of the document; keep those within d positions
        # of the span's sentence, excluding the span's own sentence.
        for ngram in chain.from_iterable(
            [
                tokens_to_ngrams(
                    getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
                )
                for sentence in span.sentence.document.sentences
                if abs(sentence.position - span.sentence.position) <= d
                and sentence != span.sentence
            ]
        ):
            yield ngram
def _get_axis_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    axis: str,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    spread: List[int] = [0, 0],
    lower: bool = True,
) -> Iterator[str]:
    """Yield ngrams aligned with the mention along the given table axis.

    Yields the mention's own sentence ngrams first, then the ngrams of every
    Sentence aligned with it on ``axis``. Non-tabular mentions yield nothing.

    :param mention: The Mention (or Candidate) to evaluate
    :param axis: Which axis {'row', 'col'} to search along
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: [before, after] spread passed to the alignment lookup
        (never mutated here, so the shared default list is not modified)
    :param lower: If True, all ngrams will be returned in lower case
    """
    span = _to_span(mention)
    if not span.sentence.is_tabular():
        # Bug fix: the unreachable `yield` that followed this `return` was
        # removed; the function is already a generator thanks to the yields
        # below, so a bare `return` correctly yields nothing.
        return
    for ngram in get_sentence_ngrams(
        span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
    ):
        yield ngram
    for sentence in _get_aligned_sentences(span.sentence, axis, spread=spread):
        for ngram in tokens_to_ngrams(
            getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
        ):
            yield ngram
def get_left_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    window: int = 3,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Get the ngrams within a window to the *left* from the sentence Context.

    For higher-arity Candidates, defaults to the *first* argument.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its first Mention.
    :param window: The number of tokens to the left of the first argument to
        return.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    """
    span = _to_span(mention)
    start = span.get_word_start_index()
    # Slice out at most `window` tokens immediately left of the span,
    # clamping at the beginning of the sentence.
    left_tokens = getattr(span.sentence, attrib)[max(0, start - window):start]
    yield from tokens_to_ngrams(left_tokens, n_min=n_min, n_max=n_max, lower=lower)
def get_head_ngrams(mention, axis=None, attrib="words", n_min=1, n_max=1, lower=True):
    """Get the ngrams from the cell in the head of the row or column.

    More specifically, this returns the ngrams in the leftmost cell in a row
    and/or the ngrams in the topmost cell in the column, depending on the axis
    parameter.

    Note that if a candidate is passed in, all of its Mentions will be searched.

    :param mention: The Mention whose head Cells are being returned
    :param axis: Which axis {'row', 'col'} to search. If None, then both row
        and col are searched.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    spans = _to_spans(mention)
    axes = (axis,) if axis else ("row", "col")
    for span in spans:
        if not span.sentence.is_tabular():
            continue
        for axis in axes:
            # If the span is already in the head cell of this axis, skip this
            # axis only. Bug fix: the original `return` aborted the whole
            # generator, silently dropping the other axis and any remaining
            # spans of the candidate.
            if getattr(span.sentence, _other_axis(axis) + "_start") == 0:
                continue
            for sentence in getattr(
                _get_head_cell(span.sentence.cell, axis), "sentences", []
            ):
                for ngram in tokens_to_ngrams(
                    getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
                ):
                    yield ngram
def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower, from_sentence):
    """Yield ngrams in the current table that are visually aligned with *c*.

    :param direction: 'vert' selects vertical bbox alignment; any other value
        selects horizontal alignment.
    :param c: The Candidate/Mention/span whose aligned ngrams are collected.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :param from_sentence: If True, yield every ngram of each aligned Sentence
        (excluding the span's own Sentence); if False, yield only the
        individual ngram spans that are themselves bbox-aligned with the span.
    """
    # TODO: this currently looks only in current table;
    # precompute over the whole document/page instead
    bbox_direction_aligned = (
        bbox_vert_aligned if direction == "vert" else bbox_horz_aligned
    )
    ngrams_space = Ngrams(n_max=n_max, split_tokens=[])
    # Optional lower-casing applied to yielded ngram text in the span branch.
    f = (lambda w: w.lower()) if lower else (lambda w: w)
    spans = _to_spans(c)
    for span in spans:
        # Alignment needs both table membership and visual (bbox) info.
        if not span.sentence.is_tabular() or not span.sentence.is_visual():
            continue
        for sentence in span.sentence.table.sentences:
            if from_sentence:
                if (
                    bbox_direction_aligned(
                        bbox_from_sentence(sentence), bbox_from_span(span)
                    )
                    and sentence is not span.sentence
                ):
                    for ngram in tokens_to_ngrams(
                        getattr(sentence, attrib),
                        n_min=n_min,
                        n_max=n_max,
                        lower=lower,
                    ):
                        yield ngram
            else:
                for ts in ngrams_space.apply(sentence):
                    # Keep aligned ngram spans, except pieces of the span
                    # itself (same sentence AND textually contained in it).
                    if bbox_direction_aligned(
                        bbox_from_span(ts), bbox_from_span(span)
                    ) and not (
                        sentence == span.sentence
                        and ts.get_span() in span.get_span()
                    ):
                        yield f(ts.get_span())
def get_right_ngrams(mention, window=3, attrib="words", n_min=1, n_max=1, lower=True):
    """Get the ngrams within a window to the *right* from the sentence Context.

    For higher-arity Candidates, defaults to the *last* argument.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its last Mention.
    :param window: The number of tokens to the right of the last argument to
        return
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    span = _to_span(mention, idx=-1)
    # Index of the span's last word; the window starts just after it.
    i = span.get_word_end()
    for ngram in tokens_to_ngrams(
        getattr(span.sentence, attrib)[i + 1:i + 1 + window],
        n_min=n_min,
        n_max=n_max,
        lower=lower,
    ):
        yield ngram
def _get_word_feats(span: SpanMention) -> Iterator[str]:
    """Yield word-level features for *span*, memoized in ``unary_word_feats``.

    On the first call for a given ``span.stable_id`` the feature set is
    computed (own ngrams, left/right window ngrams, span type, leading-capital
    flag, and word count) and cached; later calls replay the cached set.

    :param span: The SpanMention to featurize.
    """
    attrib = "words"
    cache_key = span.stable_id
    if cache_key not in unary_word_feats:
        feats = set()
        unary_word_feats[cache_key] = feats
        for ngram in tokens_to_ngrams(
            span.get_attrib_tokens(attrib), n_min=1, n_max=2
        ):
            feats.add(f"CONTAINS_{attrib.upper()}_[{ngram}]")
        # Both window scans share the same configured window size.
        window = settings["featurization"]["textual"]["word_feature"]["window"]
        for ngram in get_left_ngrams(span, window=window, n_max=2, attrib=attrib):
            feats.add(f"LEFT_{attrib.upper()}_[{ngram}]")
        for ngram in get_right_ngrams(span, window=window, n_max=2, attrib=attrib):
            feats.add(f"RIGHT_{attrib.upper()}_[{ngram}]")
        span_type = "IMPLICIT" if isinstance(span, ImplicitSpanMention) else "EXPLICIT"
        feats.add(f"SPAN_TYPE_[{span_type}]")
        if span.get_span()[0].isupper():
            feats.add("STARTS_WITH_CAPITAL")
        feats.add(f"LENGTH_{span.get_num_words()}")
    for feature in unary_word_feats[cache_key]:
        yield feature
def get_cell_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Get the ngrams that are in the Cell of the given mention, not including itself.

    Note that if a candidate is passed in, all of its Mentions will be searched.
    Also note that if the mention is not tabular, nothing will be yielded.

    :param mention: The Mention whose Cell is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    """
    for span in _to_spans(mention):
        # Non-tabular mentions have no cell to search.
        if not span.sentence.is_tabular():
            continue
        yield from get_sentence_ngrams(
            span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
        )
        # Every other sentence that lives in the same cell.
        cell_mates = _get_table_cells(span.sentence.table)[span.sentence.cell]
        for neighbor in cell_mates:
            if neighbor != span.sentence:
                yield from tokens_to_ngrams(
                    getattr(neighbor, attrib), n_min=n_min, n_max=n_max, lower=lower
                )
def _get_direction_ngrams(
    direction: str,
    c: Union[Candidate, Mention, TemporarySpanMention],
    attrib: str,
    n_min: int,
    n_max: int,
    lower: bool,
    from_sentence: bool,
) -> Iterator[str]:
    """Yield ngrams in the document that are visually aligned with *c*.

    :param direction: 'vert' selects vertical bbox alignment; any other value
        selects horizontal alignment.
    :param c: The Candidate/Mention/span whose aligned ngrams are collected.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :param from_sentence: If True, yield every ngram of each aligned Sentence
        (excluding the span's own Sentence); if False, yield only the
        individual ngram spans that are themselves bbox-aligned with the span.
    """
    bbox_direction_aligned = (
        bbox_vert_aligned if direction == "vert" else bbox_horz_aligned
    )
    ngrams_space = Ngrams(n_max=n_max, split_tokens=[])
    # Optional lower-casing applied to yielded ngram text in the span branch.
    f = (lambda w: w.lower()) if lower else (lambda w: w)
    spans = _to_spans(c)
    for span in spans:
        # Visual alignment requires bounding-box information.
        if not span.sentence.is_visual():
            continue
        for sentence in span.sentence.document.sentences:
            # Skip if not in the same page.
            if span.sentence.get_bbox().page != sentence.get_bbox().page:
                continue
            if from_sentence:
                if (
                    bbox_direction_aligned(sentence.get_bbox(), span.get_bbox())
                    and sentence is not span.sentence  # not from its Sentence
                ):
                    for ngram in tokens_to_ngrams(
                        getattr(sentence, attrib),
                        n_min=n_min,
                        n_max=n_max,
                        lower=lower,
                    ):
                        yield ngram
            else:
                for ts in ngrams_space.apply(sentence):
                    if (
                        # True if visually aligned AND not from itself.
                        bbox_direction_aligned(ts.get_bbox(), span.get_bbox())
                        and ts not in span
                        and span not in ts
                    ):
                        yield f(ts.get_span())
def get_neighbor_cell_ngrams(mention, dist=1, directions=False, attrib="words", n_min=1, n_max=1, lower=True):
    """
    Get the ngrams from all Cells that are within a given Cell distance in one
    direction from the given Mention.

    Note that if a candidate is passed in, all of its Mentions will be
    searched. If ``directions=True``, each ngram will be returned with a
    direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}.

    :param mention: The Mention whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams (or (ngram, direction) tuples if
        directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    spans = _to_spans(mention)
    for span in spans:
        for ngram in get_sentence_ngrams(
            span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
        ):
            yield ngram
        if span.sentence.is_tabular():
            root_cell = span.sentence.cell
            for sentence in chain.from_iterable(
                [
                    _get_aligned_sentences(root_cell, "row"),
                    _get_aligned_sentences(root_cell, "col"),
                ]
            ):
                # Signed offsets of the candidate sentence's cell relative to
                # the root cell on each axis.
                row_diff = min_row_diff(sentence, root_cell, absolute=False)
                col_diff = min_col_diff(sentence, root_cell, absolute=False)
                # Keep cells offset on exactly one axis and within `dist`.
                if (
                    (row_diff or col_diff)
                    and not (row_diff and col_diff)
                    and abs(row_diff) + abs(col_diff) <= dist
                ):
                    if directions:
                        direction = ""
                        if col_diff == 0:
                            if 0 < row_diff and row_diff <= dist:
                                direction = "UP"
                            elif 0 > row_diff and row_diff >= -dist:
                                direction = "DOWN"
                        elif row_diff == 0:
                            if 0 < col_diff and col_diff <= dist:
                                direction = "RIGHT"
                            elif 0 > col_diff and col_diff >= -dist:
                                direction = "LEFT"
                        for ngram in tokens_to_ngrams(
                            getattr(sentence, attrib),
                            n_min=n_min,
                            n_max=n_max,
                            lower=lower,
                        ):
                            yield (ngram, direction)
                    else:
                        for ngram in tokens_to_ngrams(
                            getattr(sentence, attrib),
                            n_min=n_min,
                            n_max=n_max,
                            lower=lower,
                        ):
                            yield ngram
def get_neighbor_cell_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    dist: int = 1,
    directions: bool = False,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[Union[str, Tuple[str, str]]]:
    """Get ngrams from all neighbor Cells.

    Get the ngrams from all Cells that are within a given Cell distance in one
    direction from the given Mention.

    Note that if a candidate is passed in, all of its Mentions will be
    searched. If ``directions=True``, each ngram will be returned with a
    direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}. Also note that if the
    mention is not tabular, nothing will be yielded.

    :param mention: The Mention whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :return: a *generator* of ngrams (or (ngram, direction) tuples if
        directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    spans = _to_spans(mention)
    for span in spans:
        if not span.sentence.is_tabular():
            continue
        for ngram in get_sentence_ngrams(
            span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
        ):
            yield ngram
        root_cell = span.sentence.cell
        for sentence in chain.from_iterable(
            [
                _get_aligned_sentences(root_cell, "row"),
                _get_aligned_sentences(root_cell, "col"),
            ]
        ):
            # Signed offsets of the candidate sentence's cell relative to the
            # root cell on each axis.
            row_diff = min_row_diff([sentence, root_cell], absolute=False)
            col_diff = min_col_diff([sentence, root_cell], absolute=False)
            # Bug fix: the original used `row_diff ^ col_diff` as an
            # "exclusive or", but `^` is a *bitwise* XOR on ints — e.g.
            # 3 ^ 1 == 2 is truthy even though both diffs are nonzero (a
            # diagonal neighbor). We want "exactly one axis offset nonzero".
            if (
                bool(row_diff) != bool(col_diff)
                and abs(row_diff) + abs(col_diff) <= dist
            ):
                if directions:
                    if col_diff == 0:
                        direction = "DOWN" if 0 < row_diff else "UP"
                    else:
                        direction = "RIGHT" if 0 < col_diff else "LEFT"
                    for ngram in tokens_to_ngrams(
                        getattr(sentence, attrib),
                        n_min=n_min,
                        n_max=n_max,
                        lower=lower,
                    ):
                        yield (ngram, direction)
                else:
                    for ngram in tokens_to_ngrams(
                        getattr(sentence, attrib),
                        n_min=n_min,
                        n_max=n_max,
                        lower=lower,
                    ):
                        yield ngram