示例#1
0
def get_subclasses(experiment):
  """Build the mention/candidate machinery for a table-extraction experiment.

  Returns a 5-tuple of (mention classes, mention spaces, matchers,
  candidate classes, throttlers) wired for Data/Row/Col extraction.
  """
  # Mention subclasses for the three roles in a table.
  Data = mention_subclass("Data")
  Row = mention_subclass("Row")
  Col = mention_subclass("Col")

  # One sentence-level mention space per role (ngram alternatives kept for reference).
  data_space = MentionSentences()  # MentionNgrams(n_max=3)
  row_space = MentionSentences()  # MentionNgrams(n_min=1, n_max=8)
  col_space = MentionSentences()  # MentionNgrams(n_min=1, n_max=8)

  # Each role's matcher intersects a span regex with an experiment label check.
  data_matcher = Intersect(
      RegexMatchSpan(rgx=r"[0-9-,.%$#]+( to | )?[0-9-,.%$#]*|^x$", longest_match_only=True),
      LambdaFunctionMatcher(func=get_label_matcher("Data", experiment)),
  )
  row_matcher = Intersect(
      RegexMatchSpan(rgx=r"^.*$", longest_match_only=True),
      LambdaFunctionMatcher(func=get_label_matcher("Header", experiment)),
  )
  col_matcher = Intersect(
      RegexMatchSpan(rgx=r"^.*$", longest_match_only=True),
      LambdaFunctionMatcher(func=get_label_matcher("Header", experiment)),
  )

  # Candidate classes pair a data cell with a row or column header.
  RowCandidate = candidate_subclass("RowCandidate", [Data, Row])
  ColCandidate = candidate_subclass("ColCandidate", [Data, Col])

  return (
      [Data, Row, Col],
      [data_space, row_space, col_space],
      [data_matcher, row_matcher, col_matcher],
      [RowCandidate, ColCandidate],
      [row_filter, col_filter],
  )
示例#2
0
def get_supply_current_matcher():
    """Return a matcher for supply-current values.

    Intersects a numeric regex with a lambda matcher that checks the
    surrounding context for current units and supply-related keywords,
    plus the shared condition/location filters.
    """

    def current_units(attr):
        # NOTE: These two symbols for mu are unique, not duplicates.
        current_units = ["ma", "μa", "ua", "µa", "\uf06da"]
        keywords = ["supply", "quiescent", "iq", "is", "idd", "icc"]
        filter_keywords = ["offset", "bias", "logic", "shutdown"]

        # Context: unigrams to the right plus unigrams in rows within +/-5.
        related_ngrams = set(get_right_ngrams(attr, n_max=1, lower=True))
        related_ngrams.update(
            get_row_ngrams(attr, n_max=1, spread=[-5, 5], lower=True))

        # Reject literal zeros.
        if attr.get_span().strip() == "0":
            return False

        # Reject rows mentioning unrelated specs.
        if overlap(filter_keywords, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        # Require both a current unit and a supply keyword nearby.
        return bool(
            overlap(current_units, related_ngrams)
            and overlap(keywords, related_ngrams))

    # match 4-digit integers, or two-digit floats up with 2 points of precision
    current_rgx = RegexMatchSpan(
        rgx=r"(±?\d{1,2}\.\d{1,2}|±?\d{1,4})", longest_match_only=False
    )

    return Intersect(
        LambdaFunctionMatcher(func=_condition),
        LambdaFunctionMatcher(func=_first_page_or_table),
        current_rgx,
        LambdaFunctionMatcher(func=current_units),
    )
示例#3
0
def _get_part_matcher():
    """Return the part matcher.

    Unions three strategies: a regex over standard transistor naming
    conventions, a dictionary of known Digi-Key parts, and a filename-based
    heuristic matcher.
    """
    # Transistor naming conventions as regular expressions.
    naming_rgxs = [
        # EECA-style names.
        (r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
         r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)"),
        # JEDEC-style names.
        r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)",
        # JIS-style names.
        r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})",
        # Other manufacturer prefixes.
        (r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
         r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
         r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)"),
    ]

    convention_matcher = RegexMatchSpan(rgx="|".join(naming_rgxs),
                                        longest_match_only=True)
    dictionary_matcher = DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH))
    # Filename heuristic: plausible part-number shape AND filename conditions.
    file_name_matcher = Intersect(
        RegexMatchSpan(rgx=r"^[A-Z0-9\-]{5,15}$", longest_match_only=True),
        LambdaFunctionMatcher(func=_part_file_name_conditions),
    )
    return Union(convention_matcher, dictionary_matcher, file_name_matcher)
def birthday_extract_server(document, birthday_subclass):
    """Extract birthday mentions from one document via the in-process UDF path."""
    # Both lambda checks must agree on a date span.
    matcher = Intersect(
        LambdaFunctionMatcher(func=filter_birthday, longest_match_only=True),
        LambdaFunctionMatcher(func=birthday_conditions,
                              longest_match_only=True),
    )
    # Candidate spans come from the date mention space.
    return MentionExtractorUDF([birthday_subclass], [MentionDates()],
                               [matcher]).apply(document)
示例#5
0
def address_extract_server(document, address_subclass):
    """Extract address mentions from one document via the in-process UDF path."""
    province = LambdaFunctionMatcher(func=has_province_address)
    geo_term = LambdaFunctionMatcher(func=has_geographic_term_address)
    prefix = LambdaFunctionMatcher(func=address_prefix)
    composite = LambdaFunctionMatcher(
        func=is_collection_of_number_and_geographical_term_and_provinces_name_address)
    no_ignored = LambdaFunctionMatcher(func=hasnt_ignor_words)

    # Any of the three positive signals, and both structural checks.
    matcher = Intersect(Union(province, geo_term, prefix), composite,
                        no_ignored)

    return MentionExtractorUDF([address_subclass], [MentionSentences()],
                               [matcher]).apply(document)
示例#6
0
def address_extract(docs, session, address_subclass, parallelism, clear=True):
    """Run address mention extraction over docs with the given parallelism."""
    province = LambdaFunctionMatcher(func=has_province_address)
    geo_term = LambdaFunctionMatcher(func=has_geographic_term_address)
    prefix = LambdaFunctionMatcher(func=address_prefix)
    composite = LambdaFunctionMatcher(
        func=is_collection_of_number_and_geographical_term_and_provinces_name_address)
    no_ignored = LambdaFunctionMatcher(func=hasnt_ignor_words)

    # Any of the three positive signals, and both structural checks.
    matcher = Intersect(Union(province, geo_term, prefix), composite,
                        no_ignored)

    extractor = MentionExtractor(session, [address_subclass],
                                 [MentionSentences()], [matcher])
    extractor.apply(docs, parallelism=parallelism, clear=clear)
示例#7
0
def name_extract(docs, session, name_subclass, parallelism, clear=True):
    """Run person-name mention extraction over docs with the given parallelism."""
    # Structural form checks: length, position, and capitalization must all hold.
    form_matcher = Intersect(
        LambdaFunctionMatcher(func=length_name),
        LambdaFunctionMatcher(func=position_name),
        LambdaFunctionMatcher(func=capitalize_name),
    )
    # Accept a well-formed span backed by a last name or a common name, or any
    # span with a name prefix; always require the final name sanity check.
    matcher = Intersect(
        Union(
            Intersect(LambdaFunctionMatcher(func=last_name), form_matcher),
            Intersect(LambdaFunctionMatcher(func=name_common), form_matcher),
            LambdaFunctionMatcher(func=prefix_name),
        ),
        LambdaFunctionMatcher(func=check_name),
    )

    extractor = MentionExtractor(session, [name_subclass], [MentionName()],
                                 [matcher])
    extractor.apply(docs, parallelism=parallelism, clear=clear)
示例#8
0
def test_intersect(doc_setup):
    """Test intersect matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=3)
    tc: TemporarySpanMention

    def spans_of(matcher):
        # Collect the span texts a matcher produces over the doc's ngrams.
        return set(tc.get_span() for tc in matcher.apply(space.apply(doc)))

    # Match any span that contains "apple"
    matcher0 = RegexMatchSpan(rgx=r"apple",
                              search=True,
                              full_match=True,
                              longest_match_only=False)
    assert spans_of(matcher0) == {"This is apple", "is apple", "apple"}

    # Match any span that contains "this" (case insensitive)
    matcher1 = RegexMatchSpan(rgx=r"this",
                              search=False,
                              full_match=False,
                              longest_match_only=False)
    assert spans_of(matcher1) == {"This is apple", "This is", "This"}

    # Intersection of matcher0 and matcher1
    assert spans_of(Intersect(matcher0, matcher1,
                              longest_match_only=False)) == {"This is apple"}

    # Intersection of matcher0 and matcher0
    assert spans_of(Intersect(matcher0, matcher0,
                              longest_match_only=False)) == {
                                  "This is apple",
                                  "is apple",
                                  "apple",
                              }

    # longest_match_only=True overrides that of child matchers.
    assert spans_of(Intersect(matcher0, matcher0,
                              longest_match_only=True)) == {"This is apple"}
示例#9
0
def _get_polarity_matcher():
    """Return the polarity matcher."""

    def polarity_conditions(attr):
        # Skip sentences that are about complementary pairs.
        sentence_words = get_sentence_ngrams(attr)
        return not overlap(["complement", "complementary"], sentence_words)

    rgx_matcher = RegexMatchSpan(rgx=r"NPN|PNP",
                                 longest_match_only=False,
                                 ignore_case=True)
    lambda_matcher = LambdaFunctionMatcher(func=polarity_conditions)

    return Intersect(rgx_matcher, lambda_matcher)
示例#10
0
def name_extract_server(document, name_subclass):
    """Extract person-name mentions from one document via the in-process UDF path."""
    # Structural form checks: length, position, and capitalization must all hold.
    form_matcher = Intersect(
        LambdaFunctionMatcher(func=length_name),
        LambdaFunctionMatcher(func=position_name),
        LambdaFunctionMatcher(func=capitalize_name),
    )
    # Accept a well-formed span backed by a last name or a common name, or any
    # span with a name prefix; always require the final name sanity check.
    matcher = Intersect(
        Union(
            Intersect(LambdaFunctionMatcher(func=last_name), form_matcher),
            Intersect(LambdaFunctionMatcher(func=name_common), form_matcher),
            LambdaFunctionMatcher(func=prefix_name),
        ),
        LambdaFunctionMatcher(func=check_name),
    )

    return MentionExtractorUDF([name_subclass], [MentionName()],
                               [matcher]).apply(document)
def birthday_extract(docs,
                     session,
                     birthday_subclass,
                     parallelism,
                     clear=True):
    """Run birthday mention extraction over docs with the given parallelism."""
    # Both lambda checks must agree on a date span.
    matcher = Intersect(
        LambdaFunctionMatcher(func=filter_birthday, longest_match_only=True),
        LambdaFunctionMatcher(func=birthday_conditions,
                              longest_match_only=True),
    )
    extractor = MentionExtractor(session, [birthday_subclass],
                                 [MentionDates()], [matcher])
    extractor.apply(docs, parallelism=parallelism, clear=clear)
示例#12
0
def get_gain_matcher():
    """Return a matcher for gain-bandwidth-product values.

    Intersects a numeric regex with a lambda matcher that checks the
    surrounding context for hertz units and gain-related keywords, plus the
    shared condition/location filters.
    """

    def hertz_units(attr):
        """Accept spans whose nearby context has a hertz unit and a gain keyword."""
        hertz_units = ["mhz", "khz"]
        # FIX: "gain" was listed twice; the duplicate is redundant for the
        # overlap() membership test and has been removed.
        keywords = [
            "product",
            "gain",
            "unity",
            "bandwidth",
            "gbp",
            "gbw",
            "gbwp",
        ]
        filter_keywords = ["-3 db", "maximum", "minimum", "impedance"]
        # Context: unigrams to the right plus unigrams in rows within +/-2.
        related_ngrams = set(get_right_ngrams(attr, n_max=1, lower=True))
        related_ngrams.update(
            get_row_ngrams(attr, n_max=1, spread=[-2, 2], lower=True))
        cell_ngrams = set(get_cell_ngrams(attr, n_max=1, lower=True))

        # Reject cells that look like a frequency condition ("f = ...").
        if "f" in cell_ngrams and "=" in cell_ngrams:
            return False

        # Reject literal zeros.
        if attr.get_span().strip() == "0":
            return False

        # Reject rows mentioning unrelated specs.
        if overlap(filter_keywords, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        # Require both a hertz unit and a gain keyword nearby.
        if overlap(hertz_units, related_ngrams) and overlap(
                keywords, related_ngrams):
            return True

        return False

    # match 3-digit integers, or two-digit floats up with 2 points of precision
    gain_rgx = RegexMatchSpan(rgx=r"^(?:\d{1,2}\.\d{1,2}|\d{1,3})$",
                              longest_match_only=False)

    hertz_lambda = LambdaFunctionMatcher(func=hertz_units)
    condition_lambda = LambdaFunctionMatcher(func=_condition)
    location_lambda = LambdaFunctionMatcher(func=_first_page_or_table)

    return Intersect(hertz_lambda, gain_rgx, location_lambda, condition_lambda)
示例#13
0
def _get_ce_v_max_matcher():
    """Return a matcher for maximum collector-emitter voltage values."""
    keyword_terms = {"collector emitter", "collector-emitter",
                     "collector - emitter"}
    abbrev_terms = {"ceo", "vceo"}

    def ce_v_max_conditions(attr):
        # The surrounding row must mention a CE-voltage keyword or abbreviation,
        # and the sentence must not be about collector-base/base voltage.
        row_words = set(get_row_ngrams(attr, n_max=1))
        if not overlap(keyword_terms | abbrev_terms, row_words):
            return False
        sentence_text = attr.sentence.text.lower()
        return not any(term in sentence_text for term in ["vcb", "base"])

    return Intersect(
        RegexMatchSpan(rgx=r"\b\d{1,2}[05]", longest_match_only=False),
        LambdaFunctionMatcher(func=ce_v_max_conditions),
        LambdaFunctionMatcher(func=_attr_in_table),
    )
示例#14
0
    if len(file_name.split("_")) != 2:
        return False
    if attr.get_span()[0] == "-":
        return False
    name = attr.get_span().replace("-", "")
    return (any(char.isdigit() for char in name)
            and any(char.isalpha() for char in name)
            and common_prefix_length_diff(file_name.split("_")[1], name) <= 2)


# Part numbers in filenames: 5-15 uppercase alphanumerics/hyphens, full span.
add_rgx = r"^[A-Z0-9\-]{5,15}$"

part_file_name_lambda_matcher = LambdaFunctionMatcher(
    func=part_file_name_conditions)
# Filename-based part matcher: the regex shape AND the filename conditions
# must both hold for a span to match.
part_file_name_matcher = Intersect(
    RegexMatchSpan(rgx=add_rgx, longest_match_only=True),
    part_file_name_lambda_matcher)

# Final part matcher: any of the three strategies (naming-convention regex,
# parts dictionary, filename heuristic) may fire.
part_matcher = Union(part_rgx_matcher, part_dict_matcher,
                     part_file_name_matcher)

# CE Voltage Matcher
# Keywords/abbreviations identifying collector-emitter voltage rows.
ce_keywords = set(
    ["collector emitter", "collector-emitter", "collector - emitter"])
ce_abbrevs = set(["ceo", "vceo"])
# Two- or three-digit values ending in 0 or 5 (e.g. 10, 25, 45).
ce_v_max_rgx_matcher = RegexMatchSpan(rgx=r"\d{1,2}[05]",
                                      longest_match_only=False)


def ce_v_max_conditions(attr):
    """Check ce_v_max conditions."""
示例#15
0
# After defining all functions to capture the properties of a birth place mention, we combine them via an `Intersect` matcher. This matcher will only select a `span`, if all three functions agree.

# In[12]:

birth_place_in_labeled_row_matcher = LambdaFunctionMatcher(
    func=is_in_birthplace_table_row)
# Allow every matching span in the labeled row, not just the longest one.
birth_place_in_labeled_row_matcher.longest_match_only = False
birth_place_no_commas_matcher = LambdaFunctionMatcher(
    func=no_commas_in_birth_place)
birth_place_left_aligned_matcher = LambdaFunctionMatcher(
    func=birthplace_left_aligned_to_punctuation)

# A span is a place of birth only if all three heuristics agree.
place_of_birth_matcher = Intersect(
    birth_place_in_labeled_row_matcher,
    birth_place_no_commas_matcher,
    birth_place_left_aligned_matcher,
)

from fonduer.candidates import MentionNgrams

# Mention spaces: 2-4 word ngrams for names, up to 3 words for birth places.
presname_ngrams = MentionNgrams(n_max=4, n_min=2)
placeofbirth_ngrams = MentionNgrams(n_max=3)

from fonduer.candidates.models import candidate_subclass

# Candidate class pairing a president name with a place of birth.
PresidentnamePlaceofbirth = candidate_subclass("PresidentnamePlaceofbirth",
                                               [Presidentname, Placeofbirth])

mention_classes = [Presidentname, Placeofbirth]
mention_spaces = [presname_ngrams, placeofbirth_ngrams]
示例#16
0

def post_matcher_fun(m):
    """Return True iff the mention's span text appears in the cached posting
    text for its document.

    FIX: removed the unused local `term` (a location/age regex referenced only
    by a stale commented-out call to get_posting_html_fast) and collapsed the
    redundant if/else around the membership test into a direct return.
    """
    return m.get_span() in text_dict[m.sentence.document.name]


post_matcher = LambdaFunctionMatcher(func=post_matcher_fun)

#spacy_location_matcher = LocationMatcher(longest_match_only=True)
#matchers = Union(geo_location_matcher)
# Require both the geo-location heuristic and the posting-text check.
matchers = Intersect(geo_location_matcher, post_matcher)

# Union matchers and create candidate extractor
print("Extracting candidates...")
LocationMention = mention_subclass("LocationMention")
mention_extractor = MentionExtractor(session, [LocationMention], [ngrams],
                                     [matchers])
# Drop any previously extracted mentions before re-applying.
mention_extractor.clear_all()
mention_extractor.apply(docs, parallelism=parallelism)
candidate_class = candidate_subclass("Location", [LocationMention])
candidate_extractor = CandidateExtractor(session, [candidate_class])

# Applying candidate extractors
candidate_extractor.apply(docs, split=0, parallelism=parallelism)
print("==============================")
print(f"Candidate extraction results for {postgres_db_name}:")