def _binary_cell_offsets(sentence1, sentence2):
    """Return ``(row_diff, col_diff)`` between two sentences.

    Both values come from ``min_row_diff`` / ``min_col_diff``; whether they
    are absolute is read from
    ``settings.featurization.table.binary_features``.
    """
    cfg = settings.featurization.table.binary_features
    row_diff = min_row_diff(
        sentence1, sentence2, absolute=cfg.min_row_diff.absolute)
    col_diff = min_col_diff(
        sentence1, sentence2, absolute=cfg.min_col_diff.absolute)
    return row_diff, col_diff


def tablelib_binary_features(span1, span2):
    """Yield table-/structure-related features for a pair of spans.

    Nothing is yielded unless the sentences of both spans report
    ``is_tabular()``.

    Sentences in the same table produce, in order:

    * ``SAME_TABLE`` (always);
    * when both sentences have a cell: ``SAME_TABLE_ROW_DIFF_[r]``,
      ``SAME_TABLE_COL_DIFF_[c]`` and ``SAME_TABLE_MANHATTAN_DIST_[|r|+|c|]``;
    * when, additionally, the two cells compare equal: ``SAME_CELL``,
      ``WORD_DIFF_[..]`` and ``CHAR_DIFF_[..]`` (start of ``span1`` minus
      start of ``span2``), plus ``SAME_PHRASE`` if the sentences themselves
      compare equal.

    Sentences in different tables produce ``DIFF_TABLE`` and the
    ``DIFF_TABLE_*`` counterparts of the three distance features, but only
    when both sentences have a cell.

    :param span1: The first span; its ``sentence``, ``get_word_start()`` and
        ``char_start`` are read
    :param span2: The second span, used the same way as ``span1``
    :rtype: a _generator_ of (feature name, DEF_VALUE) tuples
    """
    if not (span1.sentence.is_tabular() and span2.sentence.is_tabular()):
        return
    sent1, sent2 = span1.sentence, span2.sentence

    if not (sent1.table == sent2.table):
        # Different tables: features exist only when both sides sit in a cell.
        if sent1.cell is None or sent2.cell is None:
            return
        yield u"DIFF_TABLE", DEF_VALUE
        row_diff, col_diff = _binary_cell_offsets(sent1, sent2)
        manhattan = abs(row_diff) + abs(col_diff)
        yield u"DIFF_TABLE_ROW_DIFF_[%s]" % row_diff, DEF_VALUE
        yield u"DIFF_TABLE_COL_DIFF_[%s]" % col_diff, DEF_VALUE
        yield u"DIFF_TABLE_MANHATTAN_DIST_[%s]" % str(manhattan), DEF_VALUE
        return

    yield u"SAME_TABLE", DEF_VALUE
    if sent1.cell is None or sent2.cell is None:
        return

    row_diff, col_diff = _binary_cell_offsets(sent1, sent2)
    manhattan = abs(row_diff) + abs(col_diff)
    yield u"SAME_TABLE_ROW_DIFF_[%s]" % row_diff, DEF_VALUE
    yield u"SAME_TABLE_COL_DIFF_[%s]" % col_diff, DEF_VALUE
    yield u"SAME_TABLE_MANHATTAN_DIST_[%s]" % str(manhattan), DEF_VALUE

    if not (sent1.cell == sent2.cell):
        return

    # Same cell: add the signed word/character offsets between the spans.
    word_offset = span1.get_word_start() - span2.get_word_start()
    char_offset = span1.char_start - span2.char_start
    yield u"SAME_CELL", DEF_VALUE
    yield u"WORD_DIFF_[%s]" % word_offset, DEF_VALUE
    yield u"CHAR_DIFF_[%s]" % char_offset, DEF_VALUE

    if sent1 == sent2:
        yield u"SAME_PHRASE", DEF_VALUE


def _neighbor_direction(row_diff, col_diff, dist):
    """Translate a (row_diff, col_diff) offset into a direction label.

    With ``col_diff == 0`` the result is "UP" for a positive ``row_diff`` and
    "DOWN" for a negative one; otherwise, with ``row_diff == 0``, it is
    "RIGHT" for a positive ``col_diff`` and "LEFT" for a negative one. A label
    is only given while the non-zero offset lies within ``dist``; in every
    other case the empty string is returned.
    """
    # NOTE(review): whether a positive row_diff really means "above" depends
    # on the sign convention of min_row_diff/min_col_diff -- confirm there.
    if col_diff == 0:
        if 0 < row_diff <= dist:
            return "UP"
        if 0 > row_diff >= -dist:
            return "DOWN"
        return ''
    if row_diff == 0:
        if 0 < col_diff <= dist:
            return "RIGHT"
        if 0 > col_diff >= -dist:
            return "LEFT"
    return ''


def get_neighbor_cell_ngrams(span,
                             dist=1,
                             directions=False,
                             attrib='words',
                             n_min=1,
                             n_max=1,
                             lower=True):
    """Get the ngrams of the given Span and of its neighboring Cells.

    For every Span searched, the ngrams of the Span's own phrase are yielded
    first; these are always bare ngrams, even when ``directions=True``. If
    the Span's sentence is a Phrase inside a Cell, the phrases row- and
    column-aligned with that Cell are then scanned, and ngrams are yielded
    for each phrase whose offset from the Cell is non-zero along exactly one
    axis and at most ``dist``.

    Note that a TemporarySpan is searched directly; anything else (e.g. a
    candidate) is expanded through ``get_contexts()`` and all of its Spans
    are searched.

    If ``directions=True``, each neighbor ngram is returned with a direction
    in {'UP', 'DOWN', 'LEFT', 'RIGHT'}.

    :param span: The span whose neighbor Cells are being searched
    :param dist: The Cell distance within which a neighbor Cell must be to be
        considered
    :param directions: A Boolean expressing whether or not to return the
        direction of each neighbor ngram
    :param attrib: The token attribute type (e.g. words, lemmas, poses)
    :param n_min: The minimum n of the ngrams that should be returned
    :param n_max: The maximum n of the ngrams that should be returned
    :param lower: If True, all ngrams will be returned in lower case
    :rtype: a _generator_ of ngrams (with (ngram, direction) tuples for the
        neighbor ngrams if directions=True)
    """
    # TODO: Fix this to be more efficient (optimize with SQL query)
    if isinstance(span, TemporarySpan):
        contexts = [span]
    else:
        contexts = span.get_contexts()

    ngram_kwargs = dict(n_min=n_min, n_max=n_max, lower=lower)

    for context in contexts:
        # The span's own phrase comes first, never tagged with a direction.
        for ngram in get_phrase_ngrams(
                context, attrib=attrib, **ngram_kwargs):
            yield ngram

        sentence = context.sentence
        if not isinstance(sentence, Phrase) or sentence.cell is None:
            continue
        root_cell = sentence.cell

        aligned_phrases = chain(
            _get_aligned_phrases(root_cell, 'row'),
            _get_aligned_phrases(root_cell, 'col'))
        for phrase in aligned_phrases:
            row_diff = min_row_diff(phrase, root_cell, absolute=False)
            col_diff = min_col_diff(phrase, root_cell, absolute=False)

            # Keep only phrases displaced along exactly one axis ...
            if not (row_diff or col_diff):
                continue
            if row_diff and col_diff:
                continue
            # ... and no further away than `dist`.
            if not (abs(row_diff) + abs(col_diff) <= dist):
                continue

            direction = (_neighbor_direction(row_diff, col_diff, dist)
                         if directions else None)
            for ngram in tokens_to_ngrams(
                    getattr(phrase, attrib), **ngram_kwargs):
                if directions:
                    yield (ngram, direction)
                else:
                    yield ngram