def get_subclasses(experiment):
    """Build the mention classes, spaces, matchers, candidate classes, and
    throttlers used by *experiment*.

    Returns:
        Tuple of (mention_classes, mention_spaces, matchers,
        candidate_classes, throttlers), aligned index-by-index.
    """
    # Mention subclasses.
    data_cls = mention_subclass("Data")
    row_cls = mention_subclass("Row")
    col_cls = mention_subclass("Col")

    # Mention spaces: full sentences (earlier n-gram spaces kept for reference).
    data_space = MentionSentences()  # MentionNgrams(n_max=3)
    row_space = MentionSentences()  # MentionNgrams(n_min=1, n_max=8)
    col_space = MentionSentences()  # MentionNgrams(n_min=1, n_max=8)

    # Matchers: each is a regex filter intersected with a label-based filter.
    data_matcher = Intersect(
        RegexMatchSpan(rgx=r"[0-9-,.%$#]+( to | )?[0-9-,.%$#]*|^x$",
                       longest_match_only=True),
        LambdaFunctionMatcher(func=get_label_matcher("Data", experiment)),
    )
    row_matcher = Intersect(
        RegexMatchSpan(rgx=r"^.*$", longest_match_only=True),
        LambdaFunctionMatcher(func=get_label_matcher("Header", experiment)),
    )
    col_matcher = Intersect(
        RegexMatchSpan(rgx=r"^.*$", longest_match_only=True),
        LambdaFunctionMatcher(func=get_label_matcher("Header", experiment)),
    )

    # Candidate classes pair Data with Row / Col respectively.
    row_candidate_cls = candidate_subclass("RowCandidate", [data_cls, row_cls])
    col_candidate_cls = candidate_subclass("ColCandidate", [data_cls, col_cls])

    return (
        [data_cls, row_cls, col_cls],
        [data_space, row_space, col_space],
        [data_matcher, row_matcher, col_matcher],
        [row_candidate_cls, col_candidate_cls],
        # Throttlers come from the enclosing module scope.
        [row_filter, col_filter],
    )
def get_supply_current_matcher():
    """Return a matcher for supply/quiescent current values."""

    def current_units(attr):
        """True when *attr* looks like a supply-current value in context."""
        # NOTE: These two symbols for mu are unique, not duplicates.
        units = ["ma", "μa", "ua", "µa", "\uf06da"]
        wanted = ["supply", "quiescent", "iq", "is", "idd", "icc"]
        unwanted = ["offset", "bias", "logic", "shutdown"]

        # Context n-grams: the token to the right plus nearby row tokens.
        context = set(get_right_ngrams(attr, n_max=1, lower=True))
        context.update(get_row_ngrams(attr, n_max=1, spread=[-5, 5], lower=True))

        if attr.get_span().strip() == "0":
            return False
        if overlap(unwanted, get_row_ngrams(attr, n_max=1, lower=True)):
            return False
        # Require both a current unit and a supply-current keyword nearby.
        return bool(overlap(units, context) and overlap(wanted, context))

    # Match 4-digit integers, or two-digit floats with up to 2 digits of
    # precision; an optional leading ± is allowed.
    value_rgx = RegexMatchSpan(
        rgx=r"(±?\d{1,2}\.\d{1,2}|±?\d{1,4})", longest_match_only=False
    )
    return Intersect(
        LambdaFunctionMatcher(func=_condition),
        LambdaFunctionMatcher(func=_first_page_or_table),
        value_rgx,
        LambdaFunctionMatcher(func=current_units),
    )
def _get_part_matcher():
    """Return the part matcher (regex ∪ dictionary ∪ file-name heuristics)."""
    # Transistor naming conventions as regular expressions.
    naming_rgxs = [
        # EECA
        (r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
         r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)"),
        # JEDEC
        r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)",
        # JIS
        r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})",
        # Other manufacturer prefixes
        (r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
         r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
         r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)"),
    ]
    part_rgx = "|".join(naming_rgxs)
    file_name_rgx = r"^[A-Z0-9\-]{5,15}$"

    by_regex = RegexMatchSpan(rgx=part_rgx, longest_match_only=True)
    by_dictionary = DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH))
    # File-name based match: part-number shape AND the file-name conditions.
    by_file_name = Intersect(
        RegexMatchSpan(rgx=file_name_rgx, longest_match_only=True),
        LambdaFunctionMatcher(func=_part_file_name_conditions),
    )
    return Union(by_regex, by_dictionary, by_file_name)
def birthday_extract_server(document, birthday_subclass):
    """Run in-process birthday mention extraction over *document*.

    Returns the document with birthday mentions attached.
    """
    # Both the date filter and the contextual conditions must agree.
    matcher = Intersect(
        LambdaFunctionMatcher(func=filter_birthday, longest_match_only=True),
        LambdaFunctionMatcher(func=birthday_conditions, longest_match_only=True),
    )
    extractor = MentionExtractorUDF(
        [birthday_subclass], [MentionDates()], [matcher])
    return extractor.apply(document)
def address_extract_server(document, address_subclass):
    """Run in-process address mention extraction over *document*.

    Returns the document with address mentions attached.
    """
    # Any one of these location cues suffices...
    location_cues = Union(
        LambdaFunctionMatcher(func=has_province_address),
        LambdaFunctionMatcher(func=has_geographic_term_address),
        LambdaFunctionMatcher(func=address_prefix),
    )
    # ...but the sentence must also pass both structural filters.
    full_matcher = Intersect(
        location_cues,
        LambdaFunctionMatcher(
            func=is_collection_of_number_and_geographical_term_and_provinces_name_address),
        LambdaFunctionMatcher(func=hasnt_ignor_words),
    )
    extractor = MentionExtractorUDF(
        [address_subclass], [MentionSentences()], [full_matcher])
    return extractor.apply(document)
def address_extract(docs, session, address_subclass, parallelism, clear=True):
    """Extract address mentions from *docs* and persist them via Fonduer.

    Args:
        docs: Parsed documents to process.
        session: Fonduer database session.
        address_subclass: Mention subclass to populate.
        parallelism: Worker count for the extractor.
        clear: Whether to clear existing mentions first.
    """
    # Any one of these location cues suffices...
    location_cues = Union(
        LambdaFunctionMatcher(func=has_province_address),
        LambdaFunctionMatcher(func=has_geographic_term_address),
        LambdaFunctionMatcher(func=address_prefix),
    )
    # ...but the sentence must also pass both structural filters.
    full_matcher = Intersect(
        location_cues,
        LambdaFunctionMatcher(
            func=is_collection_of_number_and_geographical_term_and_provinces_name_address),
        LambdaFunctionMatcher(func=hasnt_ignor_words),
    )
    extractor = MentionExtractor(
        session, [address_subclass], [MentionSentences()], [full_matcher])
    extractor.apply(docs, parallelism=parallelism, clear=clear)
def name_extract(docs, session, name_subclass, parallelism, clear=True):
    """Extract person-name mentions from *docs* and persist them via Fonduer.

    Args:
        docs: Parsed documents to process.
        session: Fonduer database session.
        name_subclass: Mention subclass to populate.
        parallelism: Worker count for the extractor.
        clear: Whether to clear existing mentions first.
    """
    # Basic "shape" of a name: length, position, and capitalization all hold.
    form_ok = Intersect(
        LambdaFunctionMatcher(func=length_name),
        LambdaFunctionMatcher(func=position_name),
        LambdaFunctionMatcher(func=capitalize_name),
    )
    surname = LambdaFunctionMatcher(func=last_name)
    common = LambdaFunctionMatcher(func=name_common)
    prefixed = LambdaFunctionMatcher(func=prefix_name)
    # A candidate is a well-formed surname, a well-formed common name,
    # or a prefixed name — and must also pass the final check.
    name_candidates = Union(
        Intersect(surname, form_ok),
        Intersect(common, form_ok),
        prefixed,
    )
    full_matcher = Intersect(name_candidates,
                             LambdaFunctionMatcher(func=check_name))
    extractor = MentionExtractor(
        session, [name_subclass], [MentionName()], [full_matcher])
    extractor.apply(docs, parallelism=parallelism, clear=clear)
def test_intersect(doc_setup):
    """Test intersect matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=3)

    def spans(matcher):
        # Collect the span text of every TemporarySpanMention the matcher keeps.
        return {tc.get_span() for tc in matcher.apply(space.apply(doc))}

    # Match any span that contains "apple"
    apple_matcher = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False)
    assert spans(apple_matcher) == {"This is apple", "is apple", "apple"}

    # Match any span that contains "this" (case insensitive)
    this_matcher = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False)
    assert spans(this_matcher) == {"This is apple", "This is", "This"}

    # Intersection of apple_matcher and this_matcher
    both = Intersect(apple_matcher, this_matcher, longest_match_only=False)
    assert spans(both) == {"This is apple"}

    # Intersection of apple_matcher with itself
    self_both = Intersect(apple_matcher, apple_matcher, longest_match_only=False)
    assert spans(self_both) == {"This is apple", "is apple", "apple"}

    # longest_match_only=True overrides that of child matchers.
    longest_only = Intersect(apple_matcher, apple_matcher, longest_match_only=True)
    assert spans(longest_only) == {"This is apple"}
def _get_polarity_matcher():
    """Return the polarity matcher."""

    def not_complementary(attr):
        # Reject spans whose sentence talks about complementary pairs.
        return not overlap(["complement", "complementary"],
                           get_sentence_ngrams(attr))

    polarity_rgx = RegexMatchSpan(rgx=r"NPN|PNP", longest_match_only=False,
                                  ignore_case=True)
    return Intersect(polarity_rgx, LambdaFunctionMatcher(func=not_complementary))
def name_extract_server(document, name_subclass):
    """Run in-process person-name mention extraction over *document*.

    Returns the document with name mentions attached.
    """
    # Basic "shape" of a name: length, position, and capitalization all hold.
    form_ok = Intersect(
        LambdaFunctionMatcher(func=length_name),
        LambdaFunctionMatcher(func=position_name),
        LambdaFunctionMatcher(func=capitalize_name),
    )
    surname = LambdaFunctionMatcher(func=last_name)
    common = LambdaFunctionMatcher(func=name_common)
    prefixed = LambdaFunctionMatcher(func=prefix_name)
    # A candidate is a well-formed surname, a well-formed common name,
    # or a prefixed name — and must also pass the final check.
    name_candidates = Union(
        Intersect(surname, form_ok),
        Intersect(common, form_ok),
        prefixed,
    )
    full_matcher = Intersect(name_candidates,
                             LambdaFunctionMatcher(func=check_name))
    extractor = MentionExtractorUDF(
        [name_subclass], [MentionName()], [full_matcher])
    return extractor.apply(document)
def birthday_extract(docs, session, birthday_subclass, parallelism, clear=True):
    """Extract birthday mentions from *docs* and persist them via Fonduer.

    Args:
        docs: Parsed documents to process.
        session: Fonduer database session.
        birthday_subclass: Mention subclass to populate.
        parallelism: Worker count for the extractor.
        clear: Whether to clear existing mentions first.
    """
    # Both the date filter and the contextual conditions must agree.
    matcher = Intersect(
        LambdaFunctionMatcher(func=filter_birthday, longest_match_only=True),
        LambdaFunctionMatcher(func=birthday_conditions, longest_match_only=True),
    )
    extractor = MentionExtractor(
        session, [birthday_subclass], [MentionDates()], [matcher])
    extractor.apply(docs, parallelism=parallelism, clear=clear)
def get_gain_matcher():
    """Return a matcher for gain-bandwidth-product values.

    The matcher intersects:
      * a lambda requiring hertz units and gain-related keywords nearby,
      * a regex for bare numeric values,
      * shared location/condition filters (``_first_page_or_table``,
        ``_condition``).
    """

    def hertz_units(attr):
        """True when *attr* sits near hertz units and gain keywords."""
        hertz_units = ["mhz", "khz"]
        # Membership is tested via overlap(), so each keyword appears once
        # (the original list carried a redundant duplicate of "gain").
        keywords = [
            "product",
            "gain",
            "unity",
            "bandwidth",
            "gbp",
            "gbw",
            "gbwp",
        ]
        filter_keywords = ["-3 db", "maximum", "minimum", "impedance"]
        related_ngrams = set(get_right_ngrams(attr, n_max=1, lower=True))
        related_ngrams.update(
            get_row_ngrams(attr, n_max=1, spread=[-2, 2], lower=True))
        cell_ngrams = set(get_cell_ngrams(attr, n_max=1, lower=True))
        # "f = ..." cells describe a frequency condition, not a gain value.
        if "f" in cell_ngrams and "=" in cell_ngrams:
            return False
        if attr.get_span().strip() == "0":
            return False
        if overlap(filter_keywords, get_row_ngrams(attr, n_max=1, lower=True)):
            return False
        if overlap(hertz_units, related_ngrams) and overlap(
                keywords, related_ngrams):
            return True
        return False

    # Match 3-digit integers, or two-digit floats with up to 2 digits of
    # precision.
    gain_rgx = RegexMatchSpan(rgx=r"^(?:\d{1,2}\.\d{1,2}|\d{1,3})$",
                              longest_match_only=False)
    hertz_lambda = LambdaFunctionMatcher(func=hertz_units)
    condition_lambda = LambdaFunctionMatcher(func=_condition)
    location_lambda = LambdaFunctionMatcher(func=_first_page_or_table)
    return Intersect(hertz_lambda, gain_rgx, location_lambda, condition_lambda)
def _get_ce_v_max_matcher():
    """Return a collector-emitter voltage max matcher."""
    # Row must mention collector-emitter terminology or its abbreviations.
    ce_terms = set([
        "collector emitter", "collector-emitter", "collector - emitter",
        "ceo", "vceo",
    ])

    def ce_v_max_conditions(attr):
        # Require a collector-emitter term somewhere in the row.
        if not overlap(ce_terms, set(get_row_ngrams(attr, n_max=1))):
            return False
        # Exclude collector-base voltage rows.
        return not any(term in attr.sentence.text.lower()
                       for term in ["vcb", "base"])

    return Intersect(
        RegexMatchSpan(rgx=r"\b\d{1,2}[05]", longest_match_only=False),
        LambdaFunctionMatcher(func=ce_v_max_conditions),
        LambdaFunctionMatcher(func=_attr_in_table),
    )
if len(file_name.split("_")) != 2: return False if attr.get_span()[0] == "-": return False name = attr.get_span().replace("-", "") return (any(char.isdigit() for char in name) and any(char.isalpha() for char in name) and common_prefix_length_diff(file_name.split("_")[1], name) <= 2) add_rgx = r"^[A-Z0-9\-]{5,15}$" part_file_name_lambda_matcher = LambdaFunctionMatcher( func=part_file_name_conditions) part_file_name_matcher = Intersect( RegexMatchSpan(rgx=add_rgx, longest_match_only=True), part_file_name_lambda_matcher) part_matcher = Union(part_rgx_matcher, part_dict_matcher, part_file_name_matcher) # CE Voltage Matcher ce_keywords = set( ["collector emitter", "collector-emitter", "collector - emitter"]) ce_abbrevs = set(["ceo", "vceo"]) ce_v_max_rgx_matcher = RegexMatchSpan(rgx=r"\d{1,2}[05]", longest_match_only=False) def ce_v_max_conditions(attr): """Check ce_v_max conditions."""
# After defining all functions to capture the properties of a birth place mention, we combine them via an `Intersect` matcher. This matcher will only select a `span`, if all three functions agree. # In[12]: birth_place_in_labeled_row_matcher = LambdaFunctionMatcher( func=is_in_birthplace_table_row) birth_place_in_labeled_row_matcher.longest_match_only = False birth_place_no_commas_matcher = LambdaFunctionMatcher( func=no_commas_in_birth_place) birth_place_left_aligned_matcher = LambdaFunctionMatcher( func=birthplace_left_aligned_to_punctuation) place_of_birth_matcher = Intersect( birth_place_in_labeled_row_matcher, birth_place_no_commas_matcher, birth_place_left_aligned_matcher, ) from fonduer.candidates import MentionNgrams presname_ngrams = MentionNgrams(n_max=4, n_min=2) placeofbirth_ngrams = MentionNgrams(n_max=3) from fonduer.candidates.models import candidate_subclass PresidentnamePlaceofbirth = candidate_subclass("PresidentnamePlaceofbirth", [Presidentname, Placeofbirth]) mention_classes = [Presidentname, Placeofbirth] mention_spaces = [presname_ngrams, placeofbirth_ngrams]
def post_matcher_fun(m): term = r"([Ll]ocation:[\w\W]{1,200}</.{0,20}>|\W[cC]ity:[\w\W]{1,200}</.{0,20}>|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)" #if m.get_span() in get_posting_html_fast(m.sentence.document.text, term): if m.get_span() in text_dict[m.sentence.document.name]: return True else: return False post_matcher = LambdaFunctionMatcher(func=post_matcher_fun) #spacy_location_matcher = LocationMatcher(longest_match_only=True) #matchers = Union(geo_location_matcher) matchers = Intersect(geo_location_matcher, post_matcher) # Union matchers and create candidate extractor print("Extracting candidates...") LocationMention = mention_subclass("LocationMention") mention_extractor = MentionExtractor(session, [LocationMention], [ngrams], [matchers]) mention_extractor.clear_all() mention_extractor.apply(docs, parallelism=parallelism) candidate_class = candidate_subclass("Location", [LocationMention]) candidate_extractor = CandidateExtractor(session, [candidate_class]) # Applying candidate extractors candidate_extractor.apply(docs, split=0, parallelism=parallelism) print("==============================") print(f"Candidate extraction results for {postgres_db_name}:")