Пример #1
0
def _get_direction_ngrams(direction, c, attrib, n_min, n_max, lower,
                          from_phrase):
    """Yield ngrams from table phrases whose bounding box is aligned with a span.

    Only the table of each span's own sentence is inspected. Spans whose
    sentence is not both tabular and visual are skipped.

    :param direction: 'vert' selects ``bbox_vert_aligned``; any other value
        selects ``bbox_horz_aligned``
    :param c: A TemporarySpan, or an object whose ``get_contexts()`` returns
        the spans to process
    :param attrib: The phrase attribute holding the tokens (read only when
        ``from_phrase`` is true)
    :param n_min: The minimum n of the ngrams (used only when ``from_phrase``
        is true)
    :param n_max: The maximum n of the ngrams
    :param lower: If True, ngrams are returned in lower case
    :param from_phrase: If true, alignment is tested once per phrase and the
        ngrams of every aligned phrase other than the span's own are yielded.
        Otherwise alignment is tested for each ngram span produced by
        ``Ngrams.apply`` on the phrase.
    :rtype: a *generator* of ngrams
    """
    # TODO: this currently looks only in current table;
    #   precompute over the whole document/page instead
    if direction == 'vert':
        is_aligned = bbox_vert_aligned
    else:
        is_aligned = bbox_horz_aligned
    space_ngrams = Ngrams(n_max=n_max, split_tokens=[])

    def normalize(text):
        return text.lower() if lower else text

    if isinstance(c, TemporarySpan):
        targets = [c]
    else:
        targets = c.get_contexts()

    for target in targets:
        home = target.sentence
        if not (home.is_tabular() and home.is_visual()):
            continue
        for phrase in home.table.phrases:
            if from_phrase:
                aligned = is_aligned(bbox_from_phrase(phrase),
                                     bbox_from_span(target))
                # Skip unaligned phrases and the span's own phrase.
                if not aligned or phrase is home:
                    continue
                for ngram in tokens_to_ngrams(getattr(phrase, attrib),
                                              n_min=n_min,
                                              n_max=n_max,
                                              lower=lower):
                    yield ngram
            else:
                for ts in space_ngrams.apply(phrase):
                    if not is_aligned(bbox_from_span(ts),
                                      bbox_from_span(target)):
                        continue
                    # Drop ngrams of the span's own phrase whose text is
                    # contained in the span's text.
                    if (phrase == home
                            and ts.get_span() in target.get_span()):
                        continue
                    yield normalize(ts.get_span())
Пример #2
0
def get_right_ngrams(span, window=3, attrib='words', n_min=1, n_max=1,
                     lower=True):
    """Get the ngrams in a token window to the *right* of a span in its sentence.

    When something other than a TemporarySpan is passed (e.g. a higher-arity
    Candidate), its *last* element (``span[-1]``) is used.

    :param span: The Span to evaluate. If a candidate is given, default to its last Span.
    :param window: The number of tokens after the span's last word to consider
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    if not isinstance(span, TemporarySpan):
        span = span[-1]  # fall back to the last Span
    # First token strictly after the span's final word.
    start = span.get_word_end() + 1
    tokens_after = getattr(span.sentence, attrib)[start:start + window]
    for ngram in tokens_to_ngrams(tokens_after,
                                  n_min=n_min,
                                  n_max=n_max,
                                  lower=lower):
        yield ngram
Пример #3
0
def get_cell_ngrams(span, attrib='words', n_min=1, n_max=1, lower=True):
    """Get the ngrams found in the Cell that contains the given span.

    For each span, whatever ``get_phrase_ngrams`` produces for it is yielded
    first. If the span's sentence is a Phrase with a Cell, the ngrams of every
    *other* Phrase in that Cell follow.

    Note that if a candidate is passed in, all of its Spans will be searched.

    :param span: The span whose Cell is being searched
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    ngram_opts = dict(n_min=n_min, n_max=n_max, lower=lower)
    if isinstance(span, TemporarySpan):
        candidates = [span]
    else:
        candidates = span.get_contexts()

    for current in candidates:
        for ngram in get_phrase_ngrams(current, attrib=attrib, **ngram_opts):
            yield ngram

        home = current.sentence
        if not isinstance(home, Phrase) or home.cell is None:
            continue

        # One ngram generator per sibling phrase; the span's own phrase is
        # excluded because it was already handled above.
        sibling_ngrams = [
            tokens_to_ngrams(getattr(phrase, attrib), **ngram_opts)
            for phrase in home.cell.phrases
            if phrase != home
        ]
        for ngram in chain.from_iterable(sibling_ngrams):
            yield ngram
Пример #4
0
def get_neighbor_phrase_ngrams(span, d=1, attrib='words', n_min=1, n_max=1,
                               lower=True):
    """Get the ngrams that are in the neighboring Phrases of the given Span.

    A Phrase of the span's document is a neighbor when its ``phrase_num``
    differs from that of the span's own Phrase by at most ``d``; the span's
    own Phrase is excluded.

    Note that if a candidate is passed in, all of its Spans will be searched.

    :param span: The span whose neighbor Phrases are being searched
    :param d: The maximum absolute difference in ``phrase_num`` for a Phrase
        to count as a neighbor
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    ngram_opts = dict(n_min=n_min, n_max=n_max, lower=lower)
    if isinstance(span, TemporarySpan):
        candidates = [span]
    else:
        candidates = span.get_contexts()

    for current in candidates:
        home = current.sentence
        # One ngram generator per neighboring phrase, in document order.
        neighbor_ngrams = [
            tokens_to_ngrams(getattr(phrase, attrib), **ngram_opts)
            for phrase in home.document.phrases
            if abs(phrase.phrase_num - home.phrase_num) <= d
            and phrase != home
        ]
        for per_phrase in neighbor_ngrams:
            for ngram in per_phrase:
                yield ngram
Пример #5
0
def get_word_feats(span):
    """Yield the unary word features of a span, computing them at most once.

    Features are cached in ``unary_word_feats`` under ``span.stable_id``.
    On a cache miss three groups of features are built from the "words"
    attribute:

    * ``CONTAINS_WORDS_[ngram]`` for the 1- and 2-grams of the span's tokens
    * ``LEFT_WORDS_[ngram]`` for the ngrams returned by ``get_left_ngrams``
    * ``RIGHT_WORDS_[ngram]`` for the ngrams returned by ``get_right_ngrams``

    The left/right window size is read from
    ``settings.featurization.content.word_feature.window``.

    :param span: The span to featurize; must expose ``stable_id`` and
        ``get_attrib_tokens``
    :rtype: a *generator* of feature strings
    """
    attrib = "words"
    key = span.stable_id

    if key not in unary_word_feats:
        window = settings.featurization.content.word_feature.window
        prefix = attrib.upper()

        # Build the set locally and publish it only once complete, so that an
        # exception part-way through never leaves a partial entry in the cache.
        feats = set()

        for ngram in tokens_to_ngrams(span.get_attrib_tokens(attrib),
                                      n_min=1,
                                      n_max=2):
            feats.add("CONTAINS_%s_[%s]" % (prefix, ngram))

        for ngram in get_left_ngrams(
                span,
                window=window,
                n_max=2,
                attrib=attrib,
        ):
            feats.add("LEFT_%s_[%s]" % (prefix, ngram))

        for ngram in get_right_ngrams(
                span,
                window=window,
                n_max=2,
                attrib=attrib,
        ):
            feats.add("RIGHT_%s_[%s]" % (prefix, ngram))

        # The features belong in the per-span set, not on the cache mapping
        # itself (a mapping has no ``add`` method).
        unary_word_feats[key] = feats

    for f in unary_word_feats[key]:
        yield f
Пример #6
0
def _get_axis_ngrams(span, axis, attrib='words', n_min=1, n_max=1,
                     spread=[0, 0], lower=True):
    """Yield the span's phrase ngrams, then those of its axis-aligned phrases.

    Whatever ``get_phrase_ngrams`` produces for the span is yielded first. If
    the span's sentence has a cell, the ngrams of every phrase returned by
    ``_get_aligned_phrases`` for that sentence and axis follow.

    :param span: The span to start from
    :param axis: The axis passed through to ``_get_aligned_phrases``
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Forwarded unchanged to ``_get_aligned_phrases``; the default
        list is shared between calls and is not modified here
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    ngram_opts = dict(n_min=n_min, n_max=n_max, lower=lower)

    for ngram in get_phrase_ngrams(span, attrib=attrib, **ngram_opts):
        yield ngram

    home = span.sentence
    if home.cell is None:
        return

    for phrase in _get_aligned_phrases(home, axis, spread=spread):
        for ngram in tokens_to_ngrams(getattr(phrase, attrib), **ngram_opts):
            yield ngram
Пример #7
0
def _get_axis_ngrams(span, axis, attrib="words", n_min=1, n_max=1,
                     spread=[0, 0], lower=True):
    """Yield the span's sentence ngrams, then those of its axis-aligned sentences.

    Whatever ``get_sentence_ngrams`` produces for the span is yielded first.
    If the span's sentence has a cell, the ngrams of every sentence returned
    by ``_get_aligned_sentences`` for that sentence and axis follow.

    :param span: The span to start from
    :param axis: The axis passed through to ``_get_aligned_sentences``
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param spread: Forwarded unchanged to ``_get_aligned_sentences``; the
        default list is shared between calls and is not modified here
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    shared = {"n_min": n_min, "n_max": n_max, "lower": lower}

    for ngram in get_sentence_ngrams(span, attrib=attrib, **shared):
        yield ngram

    own_sentence = span.sentence
    if own_sentence.cell is None:
        return

    aligned = _get_aligned_sentences(own_sentence, axis, spread=spread)
    for sentence in aligned:
        tokens = getattr(sentence, attrib)
        for ngram in tokens_to_ngrams(tokens, **shared):
            yield ngram
Пример #8
0
def get_head_ngrams(span,
                    axis=None,
                    attrib="words",
                    n_min=1,
                    n_max=1,
                    lower=True):
    """Get the ngrams from the cell in the head of the row or column.

    More specifically, this returns the ngrams in the leftmost cell in a row and/or the
    ngrams in the topmost cell in the column, depending on the axis parameter.

    Note that if a candidate is passed in, all of its Spans will be searched.
    A span that is not in a cell, or an axis that is skipped for one span,
    does not prevent the remaining spans and axes from being processed.

    :param span: The span whose head Cells are being returned
    :param axis: Which axis {'row', 'col'} to search. If None, then both row and col are searched.
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams
    """
    spans = [span] if isinstance(span, TemporarySpan) else span.get_contexts()
    axes = [axis] if axis else ["row", "col"]
    for current in spans:
        home = current.sentence
        if not home.cell:
            # No cell for this span; later spans may still have one, so skip
            # only this span instead of ending the whole generator.
            continue
        for current_axis in axes:
            # The sentence starts at index 0 on the other axis, so it
            # presumably occupies the head position for this axis itself.
            # Skip only this axis; the other axis and remaining spans must
            # still be searched.
            if getattr(home, _other_axis(current_axis) + "_start") == 0:
                continue
            head_cell = _get_head_cell(home.cell, current_axis)
            # A head cell without a "sentences" attribute contributes nothing.
            for sentence in getattr(head_cell, "sentences", []):
                for ngram in tokens_to_ngrams(getattr(sentence, attrib),
                                              n_min=n_min,
                                              n_max=n_max,
                                              lower=lower):
                    yield ngram
Пример #9
0
def get_neighbor_cell_ngrams(span, dist=1, directions=False, attrib='words',
                             n_min=1, n_max=1, lower=True):
    """Get the ngrams from Cells within a given Cell distance along one axis of the Span.

    For each span, whatever ``get_phrase_ngrams`` produces for it is yielded
    first, always as bare ngrams. Then, if the span's sentence is a Phrase
    with a Cell, the row- and column-aligned phrases of that Cell are
    examined. A phrase qualifies when exactly one of its signed row/column
    differences to the Cell is non-zero and the sum of their absolute values
    is at most ``dist``.

    Note that if a candidate is passed in, all of its Spans will be searched.
    If ``directions=True``, each neighbor ngram is returned with a direction
    in {'UP', 'DOWN', 'LEFT', 'RIGHT'}: 'UP'/'DOWN' for a positive/negative
    row difference, 'RIGHT'/'LEFT' for a positive/negative column difference.

    :param span: The span whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be considered
    :param directions: A Boolean expressing whether or not to return the direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a *generator* of ngrams (or (ngram, direction) tuples if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    ngram_opts = dict(n_min=n_min, n_max=n_max, lower=lower)
    if isinstance(span, TemporarySpan):
        candidates = [span]
    else:
        candidates = span.get_contexts()

    for current in candidates:
        for ngram in get_phrase_ngrams(current, attrib=attrib, **ngram_opts):
            yield ngram

        if (not isinstance(current.sentence, Phrase)
                or current.sentence.cell is None):
            continue

        root_cell = current.sentence.cell
        row_aligned = _get_aligned_phrases(root_cell, 'row')
        col_aligned = _get_aligned_phrases(root_cell, 'col')
        for phrase in chain(row_aligned, col_aligned):
            row_diff = min_row_diff(phrase, root_cell, absolute=False)
            col_diff = min_col_diff(phrase, root_cell, absolute=False)

            # Keep only phrases displaced along exactly one axis ...
            if not (row_diff or col_diff):
                continue
            if row_diff and col_diff:
                continue
            # ... and no further away than ``dist`` cells.
            within_reach = abs(row_diff) + abs(col_diff) <= dist
            if not within_reach:
                continue

            direction = ''
            if directions:
                if col_diff == 0:
                    if 0 < row_diff <= dist:
                        direction = "UP"
                    elif -dist <= row_diff < 0:
                        direction = "DOWN"
                elif row_diff == 0:
                    if 0 < col_diff <= dist:
                        direction = "RIGHT"
                    elif -dist <= col_diff < 0:
                        direction = "LEFT"

            for ngram in tokens_to_ngrams(getattr(phrase, attrib),
                                          **ngram_opts):
                if directions:
                    yield (ngram, direction)
                else:
                    yield ngram