Exemplo n.º 1
0
    def hertz_units(attr):
        hertz_units = ["mhz", "khz"]
        keywords = [
            "product",
            "gain",
            "gain",
            "unity",
            "bandwidth",
            "gbp",
            "gbw",
            "gbwp",
        ]
        filter_keywords = ["-3 db", "maximum", "minimum", "impedance"]
        related_ngrams = set(get_right_ngrams(attr, n_max=1, lower=True))
        related_ngrams.update(get_row_ngrams(attr, n_max=1, spread=[-2, 2], lower=True))
        cell_ngrams = set(get_cell_ngrams(attr, n_max=1, lower=True))

        if "f" in cell_ngrams and "=" in cell_ngrams:
            return False

        if attr.get_span().strip() == "0":
            return False

        if overlap(filter_keywords, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        if overlap(hertz_units, related_ngrams) and overlap(keywords, related_ngrams):
            return True

        return False
Exemplo n.º 2
0
def tablelib_unary_features(span):
    """
    Table-/structure-related features for a single span
    """
    if not span.sentence.is_tabular():
        return
    sentence = span.sentence
    for attrib in settings["featurization"]["table"]["unary_features"]["attrib"]:
        for ngram in get_cell_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_cell_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"CELL_{attrib.upper()}_[{ngram}]", DEF_VALUE
        for row_num in range(sentence.row_start, sentence.row_end + 1):
            yield f"ROW_NUM_[{row_num}]", DEF_VALUE
        for col_num in range(sentence.col_start, sentence.col_end + 1):
            yield f"COL_NUM_[{col_num}]", DEF_VALUE
        # NOTE: These two features could be accounted for by HTML_ATTR in
        # structural features
        yield f"ROW_SPAN_[{num_rows(sentence)}]", DEF_VALUE
        yield f"COL_SPAN_[{num_cols(sentence)}]", DEF_VALUE
        for axis in ["row", "col"]:
            for ngram in get_head_ngrams(
                span,
                axis,
                n_max=settings["featurization"]["table"]["unary_features"][
                    "get_head_ngrams"
                ]["max"],
                attrib=attrib,
            ):
                yield f"{axis.upper()}_HEAD_{attrib.upper()}_[{ngram}]", DEF_VALUE
        for ngram in get_row_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_row_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"ROW_{attrib.upper()}_[{ngram}]", DEF_VALUE
        for ngram in get_col_ngrams(
            span,
            n_max=settings["featurization"]["table"]["unary_features"][
                "get_col_ngrams"
            ]["max"],
            attrib=attrib,
        ):
            yield f"COL_{attrib.upper()}_[{ngram}]", DEF_VALUE
Exemplo n.º 3
0
def neg_gain_too_many_words_in_cell(c):
    cell_ngrams = list(get_cell_ngrams(c.gain))
    if len(cell_ngrams) >= 4:
        return FALSE
    else:
        return ABSTAIN
Exemplo n.º 4
0
def neg_gain_keywords_in_cell(c):
    cell_ngrams = set(get_cell_ngrams(c.gain, n_max=1, lower=True))
    if overlap(["g", "vo", "vpp", "f=", "f", "="], cell_ngrams):
        return FALSE
    else:
        return ABSTAIN