Exemplo n.º 1
0
def lowest_common_ancestor_depth(c: Candidate) -> int:
    """Return the minimum distance between a binary-Mention Candidate to their
    lowest common ancestor.

    The xpath of each Mention's sentence is split on ``/`` and the two paths
    are compared component by component, starting at the root. The result is
    the length of the shorter path minus the number of leading components
    that both paths share.

    For example, if the tree looked like this::

        html
        ├──<div> Mention 1 </div>
        ├──table
        │    ├──tr
        │    │  └──<th> Mention 2 </th>

    we return 1, the distance from Mention 1 to the html root. Smaller values
    indicate that two Mentions are close structurally, while larger values
    indicate that two Mentions are spread far apart structurally in the
    document.

    When the shorter path is entirely a prefix of the other one (which
    includes the case of two identical paths) the result is 0.

    :param c: The binary-Mention Candidate to evaluate
    :rtype: integer
    """
    span1 = _to_span(c[0])
    span2 = _to_span(c[1])
    path1 = span1.sentence.xpath.split("/")
    path2 = span2.sentence.xpath.split("/")
    min_len = min(len(path1), len(path2))

    # Count the leading components shared by both paths. An argmin over the
    # element-wise equality mask is not usable here: when there is no
    # mismatch at all it yields 0, which would report the full path length
    # instead of 0 for Mentions under the same node.
    shared = 0
    for tag1, tag2 in zip(path1, path2):
        if tag1 != tag2:
            break
        shared += 1
    return min_len - shared
Exemplo n.º 2
0
def same_col(c: Candidate) -> bool:
    """Tell whether every Mention of the candidate sits in the same Col.

    The candidate must first pass ``same_table``; after that, the sentence of
    each Mention is checked against the sentence of the first Mention with
    ``is_col_aligned``.

    :param c: The candidate whose Mentions are being compared
    """
    in_one_table = same_table(c)
    if not in_one_table:
        # Hand back the falsy result of same_table unchanged.
        return in_one_table
    for idx in range(len(c)):
        if not is_col_aligned(_to_span(c[idx]).sentence,
                              _to_span(c[0]).sentence):
            return False
    return True
Exemplo n.º 3
0
def same_sentence(c: Candidate) -> bool:
    """Tell whether every Mention of the candidate belongs to one Sentence.

    A Mention whose sentence is ``None`` makes the result False; otherwise
    each Mention's sentence must compare equal to the first Mention's.

    :param c: The candidate whose Mentions are being compared
    """
    for idx in range(len(c)):
        sentence = _to_span(c[idx]).sentence
        if sentence is None:
            return False
        if not (sentence == _to_span(c[0]).sentence):
            return False
    return True
Exemplo n.º 4
0
def same_row(c):
    """Tell whether every Mention of the candidate sits in the same Row.

    The candidate must first pass ``same_table``; after that, the sentence of
    each Mention is checked against the sentence of the first Mention with
    ``is_row_aligned``.

    :param c: The candidate whose Mentions are being compared
    :rtype: boolean
    """
    def _row_aligned_with_first(idx):
        # Compare the idx-th Mention's sentence with the first Mention's.
        return is_row_aligned(_to_span(c[idx]).sentence,
                              _to_span(c[0]).sentence)

    return same_table(c) and all(
        _row_aligned_with_first(idx) for idx in range(len(c)))
Exemplo n.º 5
0
def same_table(c: Candidate) -> bool:
    """Tell whether every Mention of the candidate belongs to one Table.

    Each Mention's sentence has to report ``is_tabular()`` and its ``table``
    has to compare equal to the table of the first Mention's sentence.

    :param c: The candidate whose Mentions are being compared
    """
    for idx in range(len(c)):
        sentence = _to_span(c[idx]).sentence
        if not sentence.is_tabular():
            return False
        if not (sentence.table == _to_span(c[0]).sentence.table):
            return False
    return True
Exemplo n.º 6
0
def same_cell(c):
    """Tell whether every Mention of the candidate belongs to one Cell.

    A Mention whose sentence has no cell (``cell is None``) makes the result
    False; otherwise each cell must compare equal to the cell of the first
    Mention's sentence.

    :param c: The candidate whose Mentions are being compared
    :rtype: boolean
    """
    for idx in range(len(c)):
        cell = _to_span(c[idx]).sentence.cell
        if cell is None:
            return False
        if not (cell == _to_span(c[0]).sentence.cell):
            return False
    return True
Exemplo n.º 7
0
def is_tabular_aligned(c: Candidate) -> bool:
    """Return True if all Mentions in the given candidate are from the same Row or Col.

    The candidate must pass ``same_table``. In addition, the sentence of every
    Mention must be aligned with the sentence of the first Mention according
    to ``is_col_aligned`` or ``is_row_aligned``; the choice between the two is
    made per Mention.

    :param c: The candidate whose Mentions are being compared
    :rtype: boolean
    """
    # The per-Mention checks must be reduced with all(); a bare generator
    # expression is always truthy and would make this function return a
    # generator object instead of a boolean.
    return same_table(c) and all(
        is_col_aligned(_to_span(c[i]).sentence,
                       _to_span(c[0]).sentence)
        or is_row_aligned(_to_span(c[i]).sentence,
                          _to_span(c[0]).sentence) for i in range(len(c)))
Exemplo n.º 8
0
def is_vert_aligned(c: Candidate) -> bool:
    """Tell whether all components of c are vertically aligned.

    Vertical alignment means that the bounding boxes of each Mention of c
    share a similar x-axis value in the visual rendering of the document.
    Every Mention's sentence must report ``is_visual()`` and its bounding box
    must satisfy ``bbox_vert_aligned`` against the first Mention's box.

    :param c: The candidate to evaluate
    """
    outcomes = []
    # Every Mention is evaluated before the results are combined.
    for idx in range(len(c)):
        span = _to_span(c[idx])
        outcomes.append(
            span.sentence.is_visual() and bbox_vert_aligned(
                span.get_bbox(), _to_span(c[0]).get_bbox()))
    return all(outcomes)
Exemplo n.º 9
0
def same_page(c: Candidate) -> bool:
    """Tell whether all components of c are on the same page of the document.

    Every Mention's sentence must report ``is_visual()`` and the ``page`` of
    its bounding box must equal the page of the first Mention's bounding box.

    As documented originally, page numbers are based on the PDF rendering of
    the document: a provided PDF file is used, otherwise one is created from
    the HTML/XML document.

    :param c: The candidate to evaluate
    """
    def _on_first_mentions_page(idx):
        span = _to_span(c[idx])
        return (span.sentence.is_visual()
                and span.get_bbox().page == _to_span(c[0]).get_bbox().page)

    # list() forces every Mention to be evaluated before all() combines them.
    return all(list(map(_on_first_mentions_page, range(len(c)))))
Exemplo n.º 10
0
def common_ancestor(c: Tuple[SpanMention, SpanMention]) -> List[str]:
    """Return the path to the root that is shared between a binary-Mention Candidate.

    In particular, this is the common path of HTML tags: the xpath of each
    Mention's sentence is split on ``/`` and the leading components on which
    both paths agree are returned, in root-first order.

    When one path is entirely a prefix of the other (which includes two
    identical paths), the whole shorter path is returned.

    :param c: The binary-Mention Candidate to evaluate
    """
    span1 = _to_span(c[0])
    span2 = _to_span(c[1])
    path1 = span1.sentence.xpath.split("/")
    path2 = span2.sentence.xpath.split("/")

    # Collect components until the first disagreement. An argmin over the
    # element-wise equality mask is not usable here: with no mismatch at all
    # it yields 0 and the shared path would wrongly come out empty.
    shared = []
    for tag1, tag2 in zip(path1, path2):
        if tag1 != tag2:
            break
        shared.append(tag1)
    return shared
Exemplo n.º 11
0
def is_horz_aligned(c: Candidate) -> bool:
    """Tell whether all components of c are horizontally aligned.

    Horizontal alignment means that the bounding boxes of each Mention of c
    share a similar y-axis value in the visual rendering of the document.
    Every Mention's sentence must report ``is_visual()`` and its bounding box
    (from ``bbox_from_span``) must satisfy ``bbox_horz_aligned`` against the
    first Mention's box.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    outcomes = []
    # Every Mention is evaluated before the results are combined.
    for idx in range(len(c)):
        span = _to_span(c[idx])
        outcomes.append(
            span.sentence.is_visual() and bbox_horz_aligned(
                bbox_from_span(span), bbox_from_span(_to_span(c[0]))))
    return all(outcomes)
Exemplo n.º 12
0
def is_vert_aligned_right(c: Candidate) -> bool:
    """Tell whether all components are vertically aligned on their right border.

    Vertical alignment means that the bounding boxes of each Mention of c
    share a similar x-axis value in the visual rendering of the document. In
    this function the similarity of the x-axis value is based on the right
    border of their bounding boxes, as judged by ``bbox_vert_aligned_right``.
    Every Mention's sentence must also report ``is_visual()``.

    :param c: The candidate to evaluate
    """
    def _right_aligned_with_first(idx):
        span = _to_span(c[idx])
        return span.sentence.is_visual() and bbox_vert_aligned_right(
            span.get_bbox(), _to_span(c[0]).get_bbox())

    # list() forces every Mention to be evaluated before all() combines them.
    return all(list(map(_right_aligned_with_first, range(len(c)))))
Exemplo n.º 13
0
def get_between_ngrams(
    c: Candidate,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Return the ngrams *between* two unary Mentions of a binary-Mention Candidate.

    Get the ngrams *between* two unary Mentions of a binary-Mention Candidate,
    where both share the same sentence Context. The ngrams are obtained from
    ``get_right_ngrams`` applied to whichever Mention starts first, with a
    window of one less than the difference of the two word start indexes.

    Because this is a generator, the validation errors below surface when
    iteration starts, not when the function is called.

    :param c: The binary-Mention Candidate to evaluate.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If 'True', all ngrams will be returned in lower case
    :raises ValueError: If ``c`` is not binary, or if its two Mentions do not
        share the same sentence.
    """
    if len(c) != 2:
        raise ValueError("Only applicable to binary Candidates")
    span0 = _to_span(c[0])
    span1 = _to_span(c[1])
    if span0.sentence != span1.sentence:
        # Adjacent literals keep the message on one line; a backslash
        # continuation inside the string embeds the source indentation.
        raise ValueError(
            "Only applicable to Candidates where both spans are "
            "from the same immediate Context."
        )
    start0 = span0.get_word_start_index()
    start1 = span1.get_word_start_index()
    distance = abs(start0 - start1)
    # Read rightwards from whichever Mention comes first; on a tie the second
    # Mention is used, and the window of -1 is passed through unchanged.
    leftmost = span0 if start0 < start1 else span1
    # NOTE(review): the window is derived from word *start* indexes. If
    # get_right_ngrams counts from the end of the left span, a multi-word
    # left span could let the window reach into the right span - confirm.
    yield from get_right_ngrams(
        leftmost,
        window=distance - 1,
        attrib=attrib,
        n_min=n_min,
        n_max=n_max,
        lower=lower,
    )
Exemplo n.º 14
0
def is_vert_aligned_center(c: Candidate) -> bool:
    """Tell whether all components are vertically aligned on their center.

    Vertical alignment means that the bounding boxes of each Mention of c
    share a similar x-axis value in the visual rendering of the document. In
    this function the similarity of the x-axis value is based on the center of
    their bounding boxes, as judged by ``bbox_vert_aligned_center``. Every
    Mention's sentence must also report ``is_visual()``.

    :param c: The candidate to evaluate
    :rtype: boolean
    """
    outcomes = []
    # Every Mention is evaluated before the results are combined.
    for idx in range(len(c)):
        span = _to_span(c[idx])
        outcomes.append(
            span.sentence.is_visual() and bbox_vert_aligned_center(
                bbox_from_span(span), bbox_from_span(_to_span(c[0]))))
    return all(outcomes)
Exemplo n.º 15
0
def get_left_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    window: int = 3,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    lower: bool = True,
) -> Iterator[str]:
    """Yield the ngrams found in a window to the *left* of a Mention.

    The tokens are taken from the Mention's sentence: up to ``window`` tokens
    that directly precede the Mention's first word, clipped at the start of
    the sentence.

    For higher-arity Candidates, defaults to the *first* argument.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its first Mention.
    :param window: The number of tokens to the left of the first argument to
        return.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    """
    span = _to_span(mention)
    start = span.get_word_start_index()
    tokens = getattr(span.sentence, attrib)
    # max(0, ...) keeps the slice from wrapping around to the sentence end.
    left_context = tokens[max(0, start - window):start]
    yield from tokens_to_ngrams(
        left_context, n_min=n_min, n_max=n_max, lower=lower)
Exemplo n.º 16
0
def get_right_ngrams(mention,
                     window=3,
                     attrib="words",
                     n_min=1,
                     n_max=1,
                     lower=True):
    """Yield the ngrams found in a window to the *right* of a Mention.

    The tokens are taken from the Mention's sentence: up to ``window`` tokens
    that directly follow the index returned by ``get_word_end()``.

    For higher-arity Candidates, defaults to the *last* argument.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its last Mention.
    :param window: The number of tokens to the right of the last argument to
        return
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    last_span = _to_span(mention, idx=-1)
    end = last_span.get_word_end()
    tokens = getattr(last_span.sentence, attrib)
    first = end + 1  # first token after the Mention
    yield from tokens_to_ngrams(
        tokens[first:first + window], n_min=n_min, n_max=n_max, lower=lower)
Exemplo n.º 17
0
def lowest_common_ancestor_depth(c: Tuple[SpanMention, ...]) -> int:
    """Return the lowest common ancestor depth.

    The xpath of every Mention's sentence is split on ``/``. The result is the
    length of the shortest of these paths minus the number of leading
    components on which all paths agree, i.e. the minimum distance between a
    multinary-Mention Candidate and their lowest common ancestor.

    For example, if the tree looked like this::

        html
        ├──<div> Mention 1 </div>
        ├──table
        │    ├──tr
        │    │  └──<th> Mention 2 </th>

    we return 1, the distance from Mention 1 to the html root. Smaller values
    indicate that the Mentions are close structurally, while larger values
    indicate that the Mentions are spread far apart structurally in the
    document.

    :param c: The multinary-Mention Candidate to evaluate
    """
    spans = list(map(_to_span, c))
    paths = [np.array(span.sentence.xpath.split("/")) for span in spans]
    shortest = min([path.size for path in paths])
    reference = paths[0]

    shared = 0  # number of leading components common to every path
    for depth in range(shortest):
        agreement = [path[depth] == reference[depth] for path in paths]
        if not all(agreement):
            break
        shared += 1
    return shortest - shared
Exemplo n.º 18
0
def _get_axis_ngrams(mention,
                     axis,
                     attrib="words",
                     n_min=1,
                     n_max=1,
                     spread=[0, 0],
                     lower=True):
    """Yield the ngrams of a Mention's sentence and of its axis-aligned sentences.

    For a Mention whose sentence is not tabular, a single ``None`` is yielded.
    Otherwise the ngrams from ``get_sentence_ngrams`` come first, followed by
    the ngrams of every sentence returned by ``_get_aligned_sentences``.

    :param mention: The Mention to evaluate (converted with ``_to_span``)
    :param axis: Forwarded to ``_get_aligned_sentences``; presumably names the
        table axis ("row" or "col") - TODO confirm against that helper
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Forwarded (as a copy) to ``_get_aligned_sentences``
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    span = _to_span(mention)

    if not span.sentence.is_tabular():
        # Existing contract: non-tabular Mentions produce exactly one None.
        yield None
        return

    yield from get_sentence_ngrams(span,
                                   attrib=attrib,
                                   n_min=n_min,
                                   n_max=n_max,
                                   lower=lower)

    # The sentence is known to be tabular here, so no second check is needed.
    # The helper receives its own copy of ``spread`` so that the shared
    # default list can never be modified in place across calls.
    for sentence in _get_aligned_sentences(span.sentence,
                                           axis,
                                           spread=list(spread)):
        yield from tokens_to_ngrams(getattr(sentence, attrib),
                                    n_min=n_min,
                                    n_max=n_max,
                                    lower=lower)
Exemplo n.º 19
0
def _get_axis_ngrams(
    mention: Union[Candidate, Mention, TemporarySpanMention],
    axis: str,
    attrib: str = "words",
    n_min: int = 1,
    n_max: int = 1,
    spread: List[int] = [0, 0],
    lower: bool = True,
) -> Iterator[str]:
    """Yield the ngrams of a Mention's sentence and of its axis-aligned sentences.

    For a Mention whose sentence is not tabular, nothing is yielded.
    Otherwise the ngrams from ``get_sentence_ngrams`` come first, followed by
    the ngrams of every sentence returned by ``_get_aligned_sentences``.

    :param mention: The Mention to evaluate (converted with ``_to_span``)
    :param axis: Forwarded to ``_get_aligned_sentences``; presumably names the
        table axis ("row" or "col") - TODO confirm against that helper
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Forwarded (as a copy) to ``_get_aligned_sentences``
    :param lower: If True, all ngrams will be returned in lower case
    """
    span = _to_span(mention)

    if not span.sentence.is_tabular():
        # The yields below already make this a generator, so a plain return
        # is enough to produce an empty iterator.
        return

    yield from get_sentence_ngrams(span,
                                   attrib=attrib,
                                   n_min=n_min,
                                   n_max=n_max,
                                   lower=lower)

    # The helper receives its own copy of ``spread`` so that the shared
    # default list can never be modified in place across calls.
    for sentence in _get_aligned_sentences(span.sentence,
                                           axis,
                                           spread=list(spread)):
        yield from tokens_to_ngrams(getattr(sentence, attrib),
                                    n_min=n_min,
                                    n_max=n_max,
                                    lower=lower)
Exemplo n.º 20
0
def get_page(mention: Union[Candidate, Mention, TemporarySpanMention]) -> int:
    """Give the page number of the given mention.

    The value is the first entry of the Mention's "page" attribute tokens.
    If a candidate is passed in, this returns the page of its first Mention.

    :param mention: The Mention to get the page number of.
    """
    page_tokens = _to_span(mention).get_attrib_tokens("page")
    return page_tokens[0]
Exemplo n.º 21
0
def get_page(mention):
    """Give the page number of the given mention.

    The value is the first entry of the Mention's "page" attribute tokens.
    If a candidate is passed in, this returns the page of its first Mention.

    :param mention: The Mention to get the page number of.
    :rtype: integer
    """
    page_tokens = _to_span(mention).get_attrib_tokens("page")
    first_page, = page_tokens[:1] or [page_tokens[0]]
    return first_page
Exemplo n.º 22
0
def get_tag(mention: Union[Candidate, Mention, TemporarySpanMention]) -> str:
    """Give the HTML tag of the Mention as a string.

    The tag is read from the ``html_tag`` of the Mention's sentence. These may
    be tags such as 'p', 'h2', 'table', 'div', etc.

    If a candidate is passed in, only the tag of its first Mention is returned.

    :param mention: The Mention to evaluate
    :rtype: string
    """
    sentence = _to_span(mention).sentence
    return str(sentence.html_tag)
Exemplo n.º 23
0
def get_parent_tag(mention):
    """Give the HTML tag of the Mention's parent node.

    The node of the Mention's sentence is looked up with ``_get_node``; when
    that node has no parent, ``None`` is returned instead of a tag.

    These may be tags such as 'p', 'h2', 'table', 'div', etc.
    If a candidate is passed in, only the tag of its first Mention is returned.

    :param mention: The Mention to evaluate
    :rtype: string
    """
    node = _get_node(_to_span(mention).sentence)
    parent = node.getparent()
    if parent is None:
        return None
    return str(parent.tag)
Exemplo n.º 24
0
def get_min_row_num(mention):
    """Give the lowest row number that a Mention occupies.

    This is the ``row_start`` of the cell holding the Mention's sentence, or
    ``None`` when that sentence is not tabular.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its first Mention.
    :rtype: integer or None
    """
    sentence = _to_span(mention).sentence
    if not sentence.is_tabular():
        return None
    return sentence.cell.row_start
Exemplo n.º 25
0
def get_max_col_num(mention):
    """Give the largest column number that a Mention occupies.

    This is the ``col_end`` of the cell holding the Mention's sentence, or
    ``None`` when that sentence is not tabular.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its last Mention.
    :rtype: integer or None
    """
    sentence = _to_span(mention, idx=-1).sentence
    if not sentence.is_tabular():
        return None
    return sentence.cell.col_end
Exemplo n.º 26
0
def get_parent_tag(
        mention: Union[Candidate, Mention,
                       TemporarySpanMention]) -> Optional[str]:
    """Give the HTML tag of the Mention's parent node.

    The node of the Mention's sentence is looked up with ``_get_node``; when
    that node has no parent, ``None`` is returned instead of a tag.

    These may be tags such as 'p', 'h2', 'table', 'div', etc.
    If a candidate is passed in, only the tag of its first Mention is returned.

    :param mention: The Mention to evaluate
    """
    sentence = _to_span(mention).sentence
    parent = _get_node(sentence).getparent()
    return None if parent is None else str(parent.tag)
Exemplo n.º 27
0
def get_max_col_num(
        mention: Union[Candidate, Mention,
                       TemporarySpanMention]) -> Optional[int]:
    """Give the largest column number that a Mention occupies.

    This is the ``col_end`` of the cell holding the Mention's sentence, or
    ``None`` when that sentence is not tabular.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its last Mention.
    """
    sentence = _to_span(mention, idx=-1).sentence
    return sentence.cell.col_end if sentence.is_tabular() else None
Exemplo n.º 28
0
def get_attributes(mention):
    """Give the HTML attributes of the Mention.

    The attributes are read from the ``html_attrs`` of the Mention's sentence.
    If a candidate is passed in, only the attributes of its first Mention are
    returned.

    A sample output of this function on a Mention in a paragraph tag is
    [u'style=padding-top: 8pt;padding-left: 20pt;text-indent: 0pt;text-align: left;']

    :param mention: The Mention to evaluate
    :rtype: list of strings representing HTML attributes
    """
    sentence = _to_span(mention).sentence
    return sentence.html_attrs
Exemplo n.º 29
0
def get_min_row_num(
        mention: Union[Candidate, Mention,
                       TemporarySpanMention]) -> Optional[int]:
    """Give the lowest row number that a Mention occupies.

    This is the ``row_start`` of the cell holding the Mention's sentence, or
    ``None`` when that sentence is not tabular.

    :param mention: The Mention to evaluate. If a candidate is given, default
        to its first Mention.
    """
    sentence = _to_span(mention).sentence
    return sentence.cell.row_start if sentence.is_tabular() else None
Exemplo n.º 30
0
def get_ancestor_id_names(mention):
    """Return the HTML id's of the Mention's ancestors.

    Starting at the node of the Mention's sentence (which is itself included)
    and following ``getparent()`` until there is no parent left, the ``id``
    attribute of every node is collected. The list is ordered from the
    outermost ancestor down to the sentence's own node.

    Each entry is ``str(node.get("id"))``, so a node without an id is
    presumably represented by the string ``"None"`` - confirm against the
    node type returned by ``_get_node``.

    If a candidate is passed in, only the ancestors of its first Mention are
    returned.

    :param mention: The Mention to evaluate
    :rtype: list of strings
    """
    span = _to_span(mention)
    id_names = []
    node = _get_node(span.sentence)
    # Collect bottom-up with O(1) appends and flip once at the end, instead
    # of shifting the whole list on every front insertion.
    while node is not None:
        id_names.append(str(node.get("id")))
        node = node.getparent()
    id_names.reverse()
    return id_names