예제 #1
0
def tablelib_binary_features(span1, span2):
    """
    Yield table-/structure-related features for a pair of spans.

    Nothing is produced unless the sentences of both spans report
    ``is_tabular()``. Every feature is emitted as a ``(name, DEF_VALUE)``
    pair. Row/column offsets come from ``min_row_diff`` / ``min_col_diff``,
    whose ``absolute`` flag is read from the featurization settings.

    :param span1: the first span of the pair
    :param span2: the second span of the pair
    :rtype: a _generator_ of (feature name, DEF_VALUE) tuples
    """
    # Guard: both sentences must be tabular (span2 is only inspected when
    # span1 already qualifies).
    if not span1.sentence.is_tabular() or not span2.sentence.is_tabular():
        return

    first = span1.sentence
    second = span2.sentence

    same_table = first.table == second.table
    if same_table:
        yield u"SAME_TABLE", DEF_VALUE

    # All remaining features need both sentences to sit inside a cell.
    if first.cell is None or second.cell is None:
        return

    if same_table:
        row_name = u"SAME_TABLE_ROW_DIFF_[%s]"
        col_name = u"SAME_TABLE_COL_DIFF_[%s]"
        manhattan_name = u"SAME_TABLE_MANHATTAN_DIST_[%s]"
    else:
        yield u"DIFF_TABLE", DEF_VALUE
        row_name = u"DIFF_TABLE_ROW_DIFF_[%s]"
        col_name = u"DIFF_TABLE_COL_DIFF_[%s]"
        manhattan_name = u"DIFF_TABLE_MANHATTAN_DIST_[%s]"

    binary_cfg = settings.featurization.table.binary_features
    row_diff = min_row_diff(
        first, second, absolute=binary_cfg.min_row_diff.absolute)
    col_diff = min_col_diff(
        first, second, absolute=binary_cfg.min_col_diff.absolute)

    yield row_name % row_diff, DEF_VALUE
    yield col_name % col_diff, DEF_VALUE
    yield manhattan_name % str(abs(row_diff) + abs(col_diff)), DEF_VALUE

    # Cell-level and phrase-level features only exist within one table.
    if not same_table:
        return

    if first.cell == second.cell:
        yield u"SAME_CELL", DEF_VALUE
        word_gap = span1.get_word_start() - span2.get_word_start()
        yield u"WORD_DIFF_[%s]" % (word_gap), DEF_VALUE
        char_gap = span1.char_start - span2.char_start
        yield u"CHAR_DIFF_[%s]" % (char_gap), DEF_VALUE
        if first == second:
            yield u"SAME_PHRASE", DEF_VALUE
예제 #2
0
def get_neighbor_cell_ngrams(span,
                             dist=1,
                             directions=False,
                             attrib='words',
                             n_min=1,
                             n_max=1,
                             lower=True):
    """Get the ngrams of Cells lying within a given Cell distance, along a single axis, of the given Span.

    For every Span examined, the ngrams of the Span's own phrase (via
    ``get_phrase_ngrams``) are yielded first, always as plain ngrams. Then,
    if the Span's sentence is a Phrase inside a Cell, the row- and
    column-aligned phrases are scanned and those offset along exactly one
    axis by at most ``dist`` contribute their ngrams.

    Note that if a candidate is passed in, all of its Spans will be searched.
    If ``directions=True``, each neighbor ngram will be returned with a
    direction in {'UP', 'DOWN', 'LEFT', 'RIGHT'}.

    :param span: The span whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be considered
    :param directions: A Boolean expressing whether or not to return the direction of each ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams (or (ngram, direction) tuples if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    if isinstance(span, TemporarySpan):
        targets = [span]
    else:
        targets = span.get_contexts()

    for target in targets:
        # The span's own phrase ngrams come first.
        for own_ngram in get_phrase_ngrams(
                target, attrib=attrib, n_min=n_min, n_max=n_max, lower=lower):
            yield own_ngram

        sentence = target.sentence
        if not isinstance(sentence, Phrase) or sentence.cell is None:
            continue

        root_cell = sentence.cell
        aligned_phrases = chain(
            _get_aligned_phrases(root_cell, 'row'),
            _get_aligned_phrases(root_cell, 'col'))

        for phrase in aligned_phrases:
            row_diff = min_row_diff(phrase, root_cell, absolute=False)
            col_diff = min_col_diff(phrase, root_cell, absolute=False)

            # Keep only phrases displaced along exactly one axis, and no
            # farther than `dist` cells away.
            single_axis = bool(row_diff) != bool(col_diff)
            if not (single_axis and abs(row_diff) + abs(col_diff) <= dist):
                continue

            direction = ''
            if directions:
                if col_diff == 0:
                    if 0 < row_diff <= dist:
                        direction = "UP"
                    elif -dist <= row_diff < 0:
                        direction = "DOWN"
                elif row_diff == 0:
                    if 0 < col_diff <= dist:
                        direction = "RIGHT"
                    elif -dist <= col_diff < 0:
                        direction = "LEFT"

            for ngram in tokens_to_ngrams(
                    getattr(phrase, attrib),
                    n_min=n_min,
                    n_max=n_max,
                    lower=lower):
                if directions:
                    yield (ngram, direction)
                else:
                    yield ngram