def hertz_units(attr):
    """Decide whether ``attr`` sits next to a hertz unit and a gain/bandwidth keyword.

    Returns ``True`` only when the lower-cased unigrams gathered from
    ``get_right_ngrams`` and ``get_row_ngrams`` (``spread=[-2, 2]``) overlap
    with both the unit list and the keyword list, and none of the vetoes
    below fired. Returns ``False`` otherwise.

    Vetoes, checked in this order:
      * the cell unigrams contain both ``"f"`` and ``"="``;
      * the stripped span text is exactly ``"0"``;
      * the row unigrams overlap with the filter keywords.
    """
    units = ["mhz", "khz"]
    # NOTE(review): "gain" is listed twice; presumably harmless for an
    # overlap check -- kept as-is.
    topic_words = [
        "product",
        "gain",
        "gain",
        "unity",
        "bandwidth",
        "gbp",
        "gbw",
        "gbwp",
    ]
    veto_words = ["-3 db", "maximum", "minimum", "impedance"]

    # Context unigrams: whatever lies to the right plus the (spread) row.
    context = set(get_right_ngrams(attr, n_max=1, lower=True))
    context.update(get_row_ngrams(attr, n_max=1, spread=[-2, 2], lower=True))
    in_cell = set(get_cell_ngrams(attr, n_max=1, lower=True))

    # Veto 1: the cell contains both "f" and "=" as separate unigrams.
    if {"f", "="} <= in_cell:
        return False

    # Veto 2: the span text is a bare zero.
    if attr.get_span().strip() == "0":
        return False

    # Veto 3: the un-spread row mentions one of the filter keywords.
    if overlap(veto_words, get_row_ngrams(attr, n_max=1, lower=True)):
        return False

    # Both a unit and a topic keyword must appear in the context.
    return bool(overlap(units, context) and overlap(topic_words, context))
def tablelib_unary_features(span):
    """Yield table-/structure-related features for a single span.

    This is a generator of ``(feature_name, DEF_VALUE)`` pairs. It yields
    nothing when ``span.sentence.is_tabular()`` is falsy. Otherwise, for each
    ``attrib`` configured under
    ``settings["featurization"]["table"]["unary_features"]["attrib"]`` it
    emits cell n-gram, row/column number, row/column span, row/column header
    n-gram, row n-gram and column n-gram features.
    """
    if not span.sentence.is_tabular():
        return

    sent = span.sentence
    unary_cfg = settings["featurization"]["table"]["unary_features"]

    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; every feature group below is assumed to live inside the attrib
    # loop (the later loops all reference ``attrib``) -- confirm. Under this
    # nesting the ROW_NUM/COL_NUM/*_SPAN features repeat once per attrib.
    for attrib in unary_cfg["attrib"]:
        cell_grams = get_cell_ngrams(
            span,
            n_max=unary_cfg["get_cell_ngrams"]["max"],
            attrib=attrib,
        )
        for ngram in cell_grams:
            yield f"CELL_{attrib.upper()}_[{ngram}]", DEF_VALUE

        for row_num in range(sent.row_start, sent.row_end + 1):
            yield f"ROW_NUM_[{row_num}]", DEF_VALUE

        for col_num in range(sent.col_start, sent.col_end + 1):
            yield f"COL_NUM_[{col_num}]", DEF_VALUE

        # NOTE: These two features could be accounted for by HTML_ATTR in
        # structural features
        yield f"ROW_SPAN_[{num_rows(sent)}]", DEF_VALUE
        yield f"COL_SPAN_[{num_cols(sent)}]", DEF_VALUE

        for axis in ("row", "col"):
            head_grams = get_head_ngrams(
                span,
                axis,
                n_max=unary_cfg["get_head_ngrams"]["max"],
                attrib=attrib,
            )
            for ngram in head_grams:
                yield f"{axis.upper()}_HEAD_{attrib.upper()}_[{ngram}]", DEF_VALUE

        row_grams = get_row_ngrams(
            span,
            n_max=unary_cfg["get_row_ngrams"]["max"],
            attrib=attrib,
        )
        for ngram in row_grams:
            yield f"ROW_{attrib.upper()}_[{ngram}]", DEF_VALUE

        col_grams = get_col_ngrams(
            span,
            n_max=unary_cfg["get_col_ngrams"]["max"],
            attrib=attrib,
        )
        for ngram in col_grams:
            yield f"COL_{attrib.upper()}_[{ngram}]", DEF_VALUE
def neg_gain_too_many_words_in_cell(c):
    """Vote ``FALSE`` when the cell of ``c.gain`` yields four or more n-grams.

    The n-grams come from ``get_cell_ngrams(c.gain)`` with its default
    arguments. Returns ``ABSTAIN`` when fewer than four are produced.
    """
    ngram_count = len(list(get_cell_ngrams(c.gain)))
    return FALSE if ngram_count >= 4 else ABSTAIN
def neg_gain_keywords_in_cell(c):
    """Vote ``FALSE`` when the cell of ``c.gain`` contains a listed token.

    The cell's lower-cased unigrams are compared with a fixed token list via
    ``overlap``. Returns ``FALSE`` on any overlap and ``ABSTAIN`` otherwise.
    """
    flagged_tokens = ["g", "vo", "vpp", "f=", "f", "="]
    unigrams = set(get_cell_ngrams(c.gain, n_max=1, lower=True))

    if not overlap(flagged_tokens, unigrams):
        return ABSTAIN
    return FALSE