def lowest_common_ancestor_depth(c: Candidate) -> int:
    """Return the minimum distance between a binary-Mention Candidate to their
    lowest common ancestor.

    For example, if the tree looked like this::

        html
        ├──<div> Mention 1 </div>
        ├──table
        │    ├──tr
        │    │  └──<th> Mention 2 </th>

    we return 1, the distance from Mention 1 to the html root. Smaller values
    indicate that two Mentions are close structurally, while larger values
    indicate that two Mentions are spread far apart structurally in the
    document.

    The value is computed from the ``xpath`` of each Mention's sentence: the
    number of components of the shorter path minus the number of leading
    components both paths share. Two Mentions whose shorter path is entirely
    shared therefore yield 0.

    :param c: The binary-Mention Candidate to evaluate
    :rtype: integer
    """
    span1 = _to_span(c[0])
    span2 = _to_span(c[1])
    path1 = span1.sentence.xpath.split("/")
    path2 = span2.sentence.xpath.split("/")
    min_len = min(len(path1), len(path2))

    # Count the shared leading components explicitly. Locating the first
    # mismatch with argmin over an equality mask reports index 0 when there
    # is no mismatch at all, which would wrongly return min_len for Mentions
    # that share their whole (shorter) path.
    shared = 0
    for tag1, tag2 in zip(path1, path2):
        if tag1 != tag2:
            break
        shared += 1
    return min_len - shared
def same_col(c: Candidate) -> bool:
    """Return True if all Mentions in the given candidate are from the same Col.

    The Mentions must first pass ``same_table``. Each Mention's sentence is
    then checked with ``is_col_aligned`` against the sentence of the first
    Mention.

    :param c: The candidate whose Mentions are being compared
    """
    if not same_table(c):
        return False
    for idx in range(len(c)):
        current = _to_span(c[idx]).sentence
        anchor = _to_span(c[0]).sentence
        if not is_col_aligned(current, anchor):
            return False
    return True
def same_sentence(c: Candidate) -> bool:
    """Return True if all Mentions in the given candidate share one Sentence.

    A Mention whose sentence is ``None`` makes the result False. Every other
    Mention's sentence is compared for equality with the sentence of the
    first Mention.

    :param c: The candidate whose Mentions are being compared
    """
    for idx in range(len(c)):
        sentence = _to_span(c[idx]).sentence
        if sentence is None:
            return False
        if not (sentence == _to_span(c[0]).sentence):
            return False
    return True
def same_row(c):
    """Return True if all Mentions in the given candidate are from the same Row.

    The Mentions must first pass ``same_table``. Each Mention's sentence is
    then checked with ``is_row_aligned`` against the sentence of the first
    Mention.

    :param c: The candidate whose Mentions are being compared
    :rtype: boolean
    """
    if not same_table(c):
        return False
    position = 0
    total = len(c)
    while position < total:
        sentence = _to_span(c[position]).sentence
        reference = _to_span(c[0]).sentence
        if not is_row_aligned(sentence, reference):
            return False
        position += 1
    return True
def same_table(c: Candidate) -> bool:
    """Return True if all Mentions in the given candidate are from the same Table.

    Every Mention's sentence must report ``is_tabular()`` and its ``table``
    must equal the ``table`` of the first Mention's sentence.

    :param c: The candidate whose Mentions are being compared
    """
    for idx in range(len(c)):
        sentence = _to_span(c[idx]).sentence
        if not sentence.is_tabular():
            return False
        if not (sentence.table == _to_span(c[0]).sentence.table):
            return False
    return True
def same_cell(c):
    """Return True if all Mentions in the given candidate are from the same Cell.

    A Mention whose sentence has no cell (``cell is None``) makes the result
    False. Otherwise each cell is compared for equality with the cell of the
    first Mention's sentence.

    :param c: The candidate whose Mentions are being compared
    :rtype: boolean
    """
    for idx in range(len(c)):
        cell = _to_span(c[idx]).sentence.cell
        if cell is None:
            return False
        if not (cell == _to_span(c[0]).sentence.cell):
            return False
    return True
def is_tabular_aligned(c: Candidate) -> bool:
    """Return True if all Mentions in the given candidate are from the same Row
    or Col.

    The Mentions must first pass ``same_table``. Each Mention's sentence must
    then be column-aligned or row-aligned (``is_col_aligned`` /
    ``is_row_aligned``) with the sentence of the first Mention.

    :param c: The candidate whose Mentions are being compared
    :rtype: boolean
    """
    if not same_table(c):
        return False

    sentences = [_to_span(c[i]).sentence for i in range(len(c))]
    # The per-Mention checks must be reduced with all(); a bare generator
    # expression is always truthy, so the alignment tests would never run.
    return all(
        is_col_aligned(sentence, sentences[0])
        or is_row_aligned(sentence, sentences[0])
        for sentence in sentences
    )
def is_vert_aligned(c: Candidate) -> bool:
    """Return true if all the components of c are vertically aligned.

    Vertical alignment means that the bounding boxes of each Mention of c
    shares a similar x-axis value in the visual rendering of the document.

    Each Mention's sentence must report ``is_visual()``, and its bounding box
    is compared with the first Mention's bounding box via
    ``bbox_vert_aligned``.

    :param c: The candidate to evaluate
    """
    spans = [_to_span(c[i]) for i in range(len(c))]
    # A generator lets all() stop at the first failing Mention, so no further
    # bounding boxes are fetched once the answer is known to be False.
    return all(
        span.sentence.is_visual()
        and bbox_vert_aligned(span.get_bbox(), spans[0].get_bbox())
        for span in spans
    )
def same_page(c: Candidate) -> bool:
    """Return true if all the components of c are on the same page of the
    document.

    Page numbers are based on the PDF rendering of the document. If a PDF file
    is provided, it is used. Otherwise, if only a HTML/XML document is
    provided, a PDF is created and then used to determine the page number of a
    Mention.

    Each Mention's sentence must report ``is_visual()``, and the ``page`` of
    its bounding box must equal the ``page`` of the first Mention's bounding
    box.

    :param c: The candidate to evaluate
    """
    spans = [_to_span(c[i]) for i in range(len(c))]
    # A generator lets all() stop at the first failing Mention. In particular
    # the first Mention's bounding box is not dereferenced again after that
    # Mention itself was found to be non-visual.
    return all(
        span.sentence.is_visual()
        and span.get_bbox().page == spans[0].get_bbox().page
        for span in spans
    )
def common_ancestor(c: Tuple[SpanMention, SpanMention]) -> List[str]:
    """Return the path to the root that is shared between a binary-Mention
    Candidate.

    In particular, this is the common path of HTML tags: the leading
    components that the ``xpath`` of both Mentions' sentences have in common,
    as produced by splitting each xpath on ``"/"``. An xpath that begins with
    ``"/"`` therefore contributes an empty string as its first component.

    :param c: The binary-Mention Candidate to evaluate
    """
    span1 = _to_span(c[0])
    span2 = _to_span(c[1])
    path1 = span1.sentence.xpath.split("/")
    path2 = span2.sentence.xpath.split("/")

    # Count the shared leading components explicitly. Locating the first
    # mismatch with argmin over an equality mask reports index 0 when there
    # is no mismatch, which would return an empty path for Mentions whose
    # shorter xpath is fully shared.
    shared = 0
    for tag1, tag2 in zip(path1, path2):
        if tag1 != tag2:
            break
        shared += 1
    return path1[:shared]
def is_horz_aligned(c: Candidate) -> bool:
    """Return True if all the components of c are horizontally aligned.

    Horizontal alignment means that the bounding boxes of each Mention of c
    shares a similar y-axis value in the visual rendering of the document.

    Each Mention's sentence must report ``is_visual()``, and its bounding box
    (``bbox_from_span``) is compared with the first Mention's bounding box via
    ``bbox_horz_aligned``.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    spans = [_to_span(c[i]) for i in range(len(c))]
    # A generator lets all() stop at the first failing Mention, so no further
    # bounding boxes are computed once the answer is known to be False.
    return all(
        span.sentence.is_visual()
        and bbox_horz_aligned(bbox_from_span(span), bbox_from_span(spans[0]))
        for span in spans
    )
def is_vert_aligned_right(c: Candidate) -> bool:
    """Return true if all components vertically aligned on their right border.

    Vertical alignment means that the bounding boxes of each Mention of c
    shares a similar x-axis value in the visual rendering of the document. In
    this function the similarity of the x-axis value is based on the right
    border of their bounding boxes.

    Each Mention's sentence must report ``is_visual()``, and its bounding box
    is compared with the first Mention's bounding box via
    ``bbox_vert_aligned_right``.

    :param c: The candidate to evaluate
    """
    spans = [_to_span(c[i]) for i in range(len(c))]
    # A generator lets all() stop at the first failing Mention, so no further
    # bounding boxes are fetched once the answer is known to be False.
    return all(
        span.sentence.is_visual()
        and bbox_vert_aligned_right(span.get_bbox(), spans[0].get_bbox())
        for span in spans
    )
def get_between_ngrams(
    c: Candidate,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Return the ngrams *between* two unary Mentions of a binary-Mention
    Candidate.

    Get the ngrams *between* two unary Mentions of a binary-Mention Candidate,
    where both share the same sentence Context.

    This is a generator, so the ``ValueError`` checks below only fire once
    iteration starts, not when the function is called.

    :param c: The binary-Mention Candidate to evaluate.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If 'True', all ngrams will be returned in lower case
    :raises ValueError: If the Candidate is not binary, or if its two spans
        do not belong to the same sentence.
    """
    if len(c) != 2:
        raise ValueError("Only applicable to binary Candidates")

    span0 = _to_span(c[0])
    span1 = _to_span(c[1])
    if span0.sentence != span1.sentence:
        # Adjacent literals, not a backslash continuation inside the string:
        # the continuation would embed the next line's indentation in the
        # message.
        raise ValueError(
            "Only applicable to Candidates where both spans are "
            "from the same immediate Context."
        )

    start0 = span0.get_word_start_index()
    start1 = span1.get_word_start_index()
    distance = abs(start0 - start1)

    # Read rightwards from whichever span starts first. When both start at
    # the same index, span1 is used and the window is -1.
    left_span = span0 if start0 < start1 else span1

    # NOTE(review): ``distance`` is measured start-to-start. If
    # get_right_ngrams counts its window from the *end* of the span, a
    # multi-token left span may make the window reach past the start of the
    # right span -- TODO confirm the intended behavior.
    yield from get_right_ngrams(
        left_span,
        window=distance - 1,
        attrib=attrib,
        n_min=n_min,
        n_max=n_max,
        lower=lower,
    )
def is_vert_aligned_center(c: Candidate) -> bool:
    """Return true if all the components are vertically aligned on their
    center.

    Vertical alignment means that the bounding boxes of each Mention of c
    shares a similar x-axis value in the visual rendering of the document. In
    this function the similarity of the x-axis value is based on the center of
    their bounding boxes.

    Each Mention's sentence must report ``is_visual()``, and its bounding box
    (``bbox_from_span``) is compared with the first Mention's bounding box via
    ``bbox_vert_aligned_center``.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    spans = [_to_span(c[i]) for i in range(len(c))]
    # A generator lets all() stop at the first failing Mention, so no further
    # bounding boxes are computed once the answer is known to be False.
    return all(
        span.sentence.is_visual()
        and bbox_vert_aligned_center(
            bbox_from_span(span), bbox_from_span(spans[0])
        )
        for span in spans
    )
def get_left_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    window: int = 3,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Get the ngrams within a window to the *left* from the sentence Context.

    For higher-arity Candidates, defaults to the *first* argument. The ngrams
    are built from the up-to-``window`` tokens of the sentence that
    immediately precede the span's word start index.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its first Mention.
    :param window: The number of tokens to the left of the first argument to
        return.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    """
    span = _to_span(mention)
    start = span.get_word_start_index()
    tokens = getattr(span.sentence, attrib)
    # Clamp at 0 so a window larger than the available left context does not
    # turn into a negative slice index.
    left_context = tokens[max(0, start - window):start]
    yield from tokens_to_ngrams(
        left_context, n_min=n_min, n_max=n_max, lower=lower
    )
def get_right_ngrams(mention, window=3, attrib="words", n_min=1, n_max=1, lower=True):
    """Get the ngrams within a window to the *right* from the sentence Context.

    For higher-arity Candidates, defaults to the *last* argument. The ngrams
    are built from the up-to-``window`` tokens of the sentence that
    immediately follow the index returned by the span's ``get_word_end()``.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its last Mention.
    :param window: The number of tokens to the right of the last argument to
        return
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    last_span = _to_span(mention, idx=-1)
    first_right = last_span.get_word_end() + 1
    tokens = getattr(last_span.sentence, attrib)
    right_context = tokens[first_right:first_right + window]
    yield from tokens_to_ngrams(
        right_context, n_min=n_min, n_max=n_max, lower=lower
    )
def lowest_common_ancestor_depth(c: Tuple[SpanMention, ...]) -> int:
    """Return the lowest common ancestor depth.

    In particular, return the minimum distance between a multinary-Mention
    Candidate to their lowest common ancestor.

    For example, if the tree looked like this::

        html
        ├──<div> Mention 1 </div>
        ├──table
        │    ├──tr
        │    │  └──<th> Mention 2 </th>

    we return 1, the distance from Mention 1 to the html root. Smaller values
    indicate that two Mentions are close structurally, while larger values
    indicate that two Mentions are spread far apart structurally in the
    document.

    The value is the number of components in the shortest sentence xpath
    minus the number of leading components shared by all xpaths.

    :param c: The multinary-Mention Candidate to evaluate
    """
    mention_spans = [_to_span(mention) for mention in c]
    paths = [np.array(s.sentence.xpath.split("/")) for s in mention_spans]
    shortest = min([path.size for path in paths])

    # zip() walks the paths level by level and stops at the shortest one;
    # count levels until some path disagrees with the first path.
    shared = 0
    for level in zip(*paths):
        if not all([tag == level[0] for tag in level]):
            break
        shared += 1
    return shortest - shared
def _get_axis_ngrams(mention, axis, attrib="words", n_min=1, n_max=1, spread=None, lower=True):
    """Yield the ngrams of a tabular Mention and of its axis-aligned sentences.

    For a Mention whose sentence is not tabular, a single ``None`` is yielded
    and the generator ends. Otherwise the ngrams of the Mention's own
    sentence (``get_sentence_ngrams``) are yielded first, followed by the
    ngrams of every sentence returned by ``_get_aligned_sentences`` for the
    given axis.

    :param mention: The Mention to evaluate; converted with ``_to_span``.
    :param axis: Forwarded unchanged to ``_get_aligned_sentences``
        (presumably "row" or "col" -- TODO confirm).
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Forwarded to ``_get_aligned_sentences``; defaults to
        ``[0, 0]``.
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    # Build the default per call: a list literal in the signature would be
    # one object shared by every call.
    if spread is None:
        spread = [0, 0]

    span = _to_span(mention)
    if not span.sentence.is_tabular():
        # Kept from the original: non-tabular Mentions produce one None.
        yield None
        return

    yield from get_sentence_ngrams(
        span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
    )
    # The sentence is known to be tabular here, so no second check is needed.
    for sentence in _get_aligned_sentences(span.sentence, axis, spread=spread):
        yield from tokens_to_ngrams(
            getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
        )
def _get_axis_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    axis: str,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    spread: Optional[List[int]] = None,
    lower: bool = True,
) -> Iterator[str]:
    """Yield the ngrams of a tabular Mention and of its axis-aligned sentences.

    For a Mention whose sentence is not tabular, nothing is yielded.
    Otherwise the ngrams of the Mention's own sentence
    (``get_sentence_ngrams``) are yielded first, followed by the ngrams of
    every sentence returned by ``_get_aligned_sentences`` for the given axis.

    :param mention: The Mention to evaluate; converted with ``_to_span``.
    :param axis: Forwarded unchanged to ``_get_aligned_sentences``
        (presumably "row" or "col" -- TODO confirm).
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Forwarded to ``_get_aligned_sentences``; defaults to
        ``[0, 0]``.
    :param lower: If True, all ngrams will be returned in lower case
    """
    # Build the default per call: a list literal in the signature would be
    # one object shared by every call.
    if spread is None:
        spread = [0, 0]

    span = _to_span(mention)
    if not span.sentence.is_tabular():
        # The yields below already make this a generator, so a bare return
        # is enough to produce an empty iterator.
        return

    yield from get_sentence_ngrams(
        span, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower
    )
    for sentence in _get_aligned_sentences(span.sentence, axis, spread=spread):
        yield from tokens_to_ngrams(
            getattr(sentence, attrib), n_min=n_min, n_max=n_max, lower=lower
        )
def get_page(mention: Union[Candidate, Mention, TemporarySpanMention]) -> int:
    """Return the page number of the given mention.

    If a candidate is passed in, this returns the page of its first Mention.
    The value is the first entry of the span's ``"page"`` attribute tokens.

    :param mention: The Mention to get the page number of.
    """
    page_tokens = _to_span(mention).get_attrib_tokens("page")
    return page_tokens[0]
def get_page(mention):
    """Return the page number of the given mention.

    If a candidate is passed in, this returns the page of its first Mention.
    The value is the first entry of the span's ``"page"`` attribute tokens.

    :param mention: The Mention to get the page number of.
    :rtype: integer
    """
    first_page, *_ = _to_span(mention).get_attrib_tokens("page")[:1] or [None]
    if first_page is None:
        # Reproduce the original failure for an empty token list.
        return _to_span(mention).get_attrib_tokens("page")[0]
    return first_page
def get_tag(mention: Union[Candidate, Mention, TemporarySpanMention]) -> str:
    """Return the HTML tag of the Mention.

    If a candidate is passed in, only the tag of its first Mention is
    returned. These may be tags such as 'p', 'h2', 'table', 'div', etc. The
    value is the ``html_tag`` of the Mention's sentence, converted with
    ``str``.

    :param mention: The Mention to evaluate
    :rtype: string
    """
    sentence = _to_span(mention).sentence
    tag = sentence.html_tag
    return str(tag)
def get_parent_tag(mention):
    """Return the HTML tag of the Mention's parent.

    These may be tags such as 'p', 'h2', 'table', 'div', etc. If a candidate
    is passed in, only the tag of its first Mention is returned. ``None`` is
    returned when the sentence's node has no parent.

    :param mention: The Mention to evaluate
    :rtype: string
    """
    node = _get_node(_to_span(mention).sentence)
    parent = node.getparent()
    if parent is None:
        return None
    return str(parent.tag)
def get_min_row_num(mention):
    """Return the lowest row number that a Mention occupies.

    This is the ``row_start`` of the cell of the Mention's sentence, or
    ``None`` when that sentence is not tabular.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its first Mention.
    :rtype: integer or None
    """
    sentence = _to_span(mention).sentence
    if not sentence.is_tabular():
        return None
    return sentence.cell.row_start
def get_max_col_num(mention):
    """Return the largest column number that a Mention occupies.

    This is the ``col_end`` of the cell of the Mention's sentence, or ``None``
    when that sentence is not tabular.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its last Mention.
    :rtype: integer or None
    """
    sentence = _to_span(mention, idx=-1).sentence
    if not sentence.is_tabular():
        return None
    return sentence.cell.col_end
def get_parent_tag(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> Optional[str]:
    """Return the HTML tag of the Mention's parent.

    These may be tags such as 'p', 'h2', 'table', 'div', etc. If a candidate
    is passed in, only the tag of its first Mention is returned. ``None`` is
    returned when the sentence's node has no parent.

    :param mention: The Mention to evaluate
    """
    sentence = _to_span(mention).sentence
    parent = _get_node(sentence).getparent()
    return None if parent is None else str(parent.tag)
def get_max_col_num(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> Optional[int]:
    """Return the largest column number that a Mention occupies.

    This is the ``col_end`` of the cell of the Mention's sentence, or ``None``
    when that sentence is not tabular.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its last Mention.
    """
    last_sentence = _to_span(mention, idx=-1).sentence
    return last_sentence.cell.col_end if last_sentence.is_tabular() else None
def get_attributes(mention):
    """Return the HTML attributes of the Mention.

    If a candidate is passed in, only the attributes of its first Mention are
    returned. The value is the ``html_attrs`` of the Mention's sentence.

    A sample output of this function on a Mention in a paragraph tag is
    [u'style=padding-top: 8pt;padding-left: 20pt;text-indent: 0pt;text-align: left;']

    :param mention: The Mention to evaluate
    :rtype: list of strings representing HTML attributes
    """
    sentence = _to_span(mention).sentence
    return sentence.html_attrs
def get_min_row_num(
    mention: Union[Candidate, Mention, TemporarySpanMention]
) -> Optional[int]:
    """Return the lowest row number that a Mention occupies.

    This is the ``row_start`` of the cell of the Mention's sentence, or
    ``None`` when that sentence is not tabular.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its first Mention.
    """
    first_sentence = _to_span(mention).sentence
    return first_sentence.cell.row_start if first_sentence.is_tabular() else None
def get_ancestor_id_names(mention):
    """Return the HTML id's of the Mention's ancestors.

    If a candidate is passed in, only the ancestors of its first Mention are
    returned. The walk starts at the node of the Mention's sentence and
    follows ``getparent()`` until it returns ``None``; the result is ordered
    from the outermost node down to the sentence's own node.

    Each entry is ``str(node.get("id"))``, so a node without an id presumably
    appears as the string ``"None"`` -- TODO confirm against ``_get_node``'s
    node type.

    :param mention: The Mention to evaluate
    :rtype: list of strings
    """
    span = _to_span(mention)
    id_names = []
    node = _get_node(span.sentence)
    # Collect innermost-first with O(1) appends, then reverse once. Inserting
    # at the front on every step would shift the whole list each time.
    while node is not None:
        id_names.append(str(node.get("id")))
        node = node.getparent()
    id_names.reverse()
    return id_names