예제 #1
0
def LF_part_ce_keywords_in_rows_cols_prefix(c):
    """Return TRUE when CE keywords and a matching part prefix surround ``c[1]``.

    The row and column n-grams (up to trigrams) of ``c[1]`` are pooled.
    TRUE is returned only when both of the following hold:

    * the pooled n-grams overlap ``_CE_KEYWORDS`` or ``_CE_ABBREVS``, and
    * the lower-cased span text of ``c.part`` starts with at least one of
      the n-grams kept by ``_filter_non_parts``.

    Otherwise ABSTAIN is returned.
    """
    row_ngrams = set(get_row_ngrams(c[1], n_max=3))
    col_ngrams = set(get_col_ngrams(c[1], n_max=3))
    pooled_ngrams = row_ngrams.union(col_ngrams)
    part_like_ngrams = _filter_non_parts(pooled_ngrams)

    ce_terms = _CE_KEYWORDS.union(_CE_ABBREVS)
    if not overlap(ce_terms, pooled_ngrams):
        return ABSTAIN

    # Evaluate every prefix test before reducing, as a list comprehension
    # would; the span text is looked up per n-gram on purpose.
    prefix_hits = [
        c.part.context.get_span().lower().startswith(prefix)
        for prefix in part_like_ngrams
    ]
    if any(prefix_hits):
        return TRUE
    return ABSTAIN
예제 #2
0
def LF_ce_keywords_not_part_in_row_col_prefix(c):
    """Return TRUE for a CE-keyword row with no part-like n-grams nearby.

    TRUE is returned only when every check below passes, evaluated in this
    order (the first failing check yields ABSTAIN):

    1. ``same_table(c)`` is falsy.
    2. The row n-grams of ``c[1]`` overlap ``_CE_KEYWORDS`` or
       ``_CE_ABBREVS``.
    3. ``_filter_non_parts`` keeps nothing from the case-preserved row and
       column n-grams of ``c[1]``.
    4. The row n-grams of ``c.part`` do not overlap ``_NON_CEV_KEYWORDS``.
    5. The row n-grams of ``c[1]`` do not overlap ``_NON_CEV_KEYWORDS``.
    6. ``LF_current_in_row(c)`` is falsy.
    """
    # Case is preserved here (lower=False) before filtering for part-like
    # n-grams; this set is always computed, even if an earlier check fails.
    cased_ngrams = set(get_col_ngrams(c[1], n_max=3, lower=False))
    cased_ngrams |= set(get_row_ngrams(c[1], n_max=3, lower=False))
    part_like_ngrams = _filter_non_parts(cased_ngrams)

    if same_table(c):
        return ABSTAIN
    if not overlap(
        _CE_KEYWORDS.union(_CE_ABBREVS), get_row_ngrams(c[1], n_max=3)
    ):
        return ABSTAIN
    if len(part_like_ngrams) != 0:
        return ABSTAIN
    if overlap(_NON_CEV_KEYWORDS, get_row_ngrams(c.part, n_max=3)):
        return ABSTAIN
    if overlap(_NON_CEV_KEYWORDS, get_row_ngrams(c[1], n_max=3)):
        return ABSTAIN
    if LF_current_in_row(c):
        return ABSTAIN
    return TRUE
예제 #3
0
 def get_row_and_column_ngrams(mention):
     """Sanity-check the row/column n-grams of ``mention`` and look for "birth_place".

     Both n-gram sequences are materialised and checked with assertions:

     * for a tabular sentence, neither sequence may contain ``None``;
     * for a non-tabular sentence, each sequence must be exactly ``[None]``.

     Returns:
         True if the string "birth_place" is among the row n-grams,
         False otherwise.

     Raises:
         AssertionError: if the n-grams violate the expectations above.
     """
     row_ngrams = list(get_row_ngrams(mention))
     col_ngrams = list(get_col_ngrams(mention))

     if mention.sentence.is_tabular():
         assert all(x is not None for x in row_ngrams)
         assert all(x is not None for x in col_ngrams)
     else:
         # Outside a table each helper is expected to give a lone None.
         assert len(row_ngrams) == 1 and row_ngrams[0] is None
         assert len(col_ngrams) == 1 and col_ngrams[0] is None

     return "birth_place" in row_ngrams
예제 #4
0
def tablelib_unary_features(span):
    """
    Yield table-/structure-related features for a single span.

    Each item is a ``(feature_name, DEF_VALUE)`` pair. Nothing is yielded
    when the span's sentence is not tabular. The attributes to featurize
    and the n-gram size limits are read from
    ``settings["featurization"]["table"]["unary_features"]``.

    For every configured attribute, in order, the generator emits:
    cell n-grams, row numbers, column numbers, row span, column span,
    row/column head n-grams, row n-grams and column n-grams.
    """
    sentence = span.sentence
    if not sentence.is_tabular():
        return

    unary_cfg = settings["featurization"]["table"]["unary_features"]
    for attrib in unary_cfg["attrib"]:
        cell_n_max = unary_cfg["get_cell_ngrams"]["max"]
        cell_ngrams = get_cell_ngrams(span, n_max=cell_n_max, attrib=attrib)
        for ngram in cell_ngrams:
            yield f"CELL_{attrib.upper()}_[{ngram}]", DEF_VALUE

        # NOTE(review): the position and span features below do not depend
        # on `attrib`, so they are emitted once per configured attribute.
        # Confirm that consumers tolerate the repeated feature names.
        row_numbers = range(sentence.row_start, sentence.row_end + 1)
        for row_num in row_numbers:
            yield f"ROW_NUM_[{row_num}]", DEF_VALUE
        col_numbers = range(sentence.col_start, sentence.col_end + 1)
        for col_num in col_numbers:
            yield f"COL_NUM_[{col_num}]", DEF_VALUE

        # NOTE: These two features could be accounted for by HTML_ATTR in
        # structural features
        yield f"ROW_SPAN_[{num_rows(sentence)}]", DEF_VALUE
        yield f"COL_SPAN_[{num_cols(sentence)}]", DEF_VALUE

        for axis in ["row", "col"]:
            head_n_max = unary_cfg["get_head_ngrams"]["max"]
            head_ngrams = get_head_ngrams(
                span, axis, n_max=head_n_max, attrib=attrib
            )
            for ngram in head_ngrams:
                yield f"{axis.upper()}_HEAD_{attrib.upper()}_[{ngram}]", DEF_VALUE

        row_n_max = unary_cfg["get_row_ngrams"]["max"]
        for ngram in get_row_ngrams(span, n_max=row_n_max, attrib=attrib):
            yield f"ROW_{attrib.upper()}_[{ngram}]", DEF_VALUE

        col_n_max = unary_cfg["get_col_ngrams"]["max"]
        for ngram in get_col_ngrams(span, n_max=col_n_max, attrib=attrib):
            yield f"COL_{attrib.upper()}_[{ngram}]", DEF_VALUE
예제 #5
0
def neg_gain_keywords_in_column(c):
    """Return FALSE when the column of ``c.gain`` contains a listed keyword.

    The lower-cased unigrams of the column of ``c.gain`` are compared
    against a fixed keyword list. Any overlap yields FALSE; otherwise the
    function returns ABSTAIN.
    """
    column_unigrams = set(get_col_ngrams(c.gain, n_max=1, lower=True))
    negative_keywords = [
        "max",
        "min",
        "test",
        "condition",
        "conditions",
        "vgn",
        "f",
        "-3",
        "db",
        "dbc",
    ]
    return FALSE if overlap(negative_keywords, column_unigrams) else ABSTAIN
예제 #6
0
def _condition(attr):
    """Return False if the column of ``attr`` mentions "condition(s)", else True.

    Only unigrams of the column (``n_max=1``) are compared against the
    words "condition" and "conditions".
    """
    column_unigrams = get_col_ngrams(attr, n_max=1)
    return not overlap(["condition", "conditions"], column_unigrams)
예제 #7
0
def LF_part_mismatch_col(c):
    """Return FALSE when the column of ``c[1]`` names parts but none match ``c.part``.

    The column unigrams of ``c[1]`` are reduced by ``_filter_non_parts``.
    ABSTAIN is returned when nothing remains, or when the lower-cased span
    text of ``c.part`` starts with any remaining n-gram (compared in lower
    case). Otherwise FALSE is returned.
    """
    part_like_ngrams = _filter_non_parts(set(get_col_ngrams(c[1], n_max=1)))
    if len(part_like_ngrams) == 0:
        return ABSTAIN

    part_text = c.part.context.get_span().lower()
    # Evaluate every prefix test before reducing, as a list comprehension would.
    prefix_hits = [
        part_text.startswith(ngram.lower()) for ngram in part_like_ngrams
    ]
    return ABSTAIN if any(prefix_hits) else FALSE
예제 #8
0
def neg_current_keywords_in_column(c):
    """Return FALSE when the column of ``c.supply_current`` has a listed keyword.

    The lower-cased column n-grams of ``c.supply_current`` are compared
    against a fixed keyword list. Any overlap yields FALSE; otherwise the
    function returns ABSTAIN.
    """
    negative_keywords = [
        "over", "temperature", "vgn", "f", "-3", "db", "dbc", "min", "max",
    ]
    column_ngrams = get_col_ngrams(c.supply_current, lower=True)
    if overlap(negative_keywords, column_ngrams):
        return FALSE
    return ABSTAIN
예제 #9
0
def pos_current_typ(c):
    """Return TRUE when the column of ``c.supply_current`` contains "typ" or "typ.".

    The lower-cased column n-grams of ``c.supply_current`` are compared
    against the two spellings. Any overlap yields TRUE; otherwise the
    function returns ABSTAIN.
    """
    column_ngrams = get_col_ngrams(c.supply_current, lower=True)
    if overlap(["typ", "typ."], column_ngrams):
        return TRUE
    return ABSTAIN