def test_union(doc_setup):
    """Test union matcher."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(mentions):
        """Collect the span text of each temporary mention."""
        return {m.get_span() for m in mentions}

    assert spans(space.apply(doc)) == {"This is", "is apple", "This", "is", "apple"}

    # Match any span that contains "apple"
    apple_matcher = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert spans(apple_matcher.apply(space.apply(doc))) == {"is apple", "apple"}

    # Match any span that contains "this" (case insensitive)
    this_matcher = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert spans(this_matcher.apply(space.apply(doc))) == {"This is", "This"}

    union = Union(apple_matcher, this_matcher, longest_match_only=False)
    assert spans(union.apply(space.apply(doc))) == {
        "is apple",
        "apple",
        "This is",
        "This",
    }

    # longest_match_only of each sub-matcher is ignored by the union.
    union = Union(apple_matcher, this_matcher, longest_match_only=True)
    assert spans(union.apply(space.apply(doc))) == {"This is", "is apple"}

    # Unsupported option should raise an exception
    with pytest.raises(Exception):
        Union(apple_matcher, this_matcher, long_match_only=False)
def _get_part_matcher():
    """Return the part matcher."""
    # Transistor naming conventions expressed as regular expressions.
    eeca_rgx = (
        r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
        r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)"
    )
    jedec_rgx = r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)"
    jis_rgx = r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})"
    others_rgx = (
        r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
        r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
        r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)"
    )
    # Any of the naming conventions qualifies a span as a part number.
    part_rgx = "|".join([eeca_rgx, jedec_rgx, jis_rgx, others_rgx])
    add_rgx = r"^[A-Z0-9\-]{5,15}$"

    # Candidate sources: regex conventions, a curated dictionary, and
    # file-name-derived spans that satisfy extra conditions.
    matchers = [
        RegexMatchSpan(rgx=part_rgx, longest_match_only=True),
        DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH)),
        Intersect(
            RegexMatchSpan(rgx=add_rgx, longest_match_only=True),
            LambdaFunctionMatcher(func=_part_file_name_conditions),
        ),
    ]
    return Union(*matchers)
def test_union(caplog, doc_setup):
    """Test the union matcher over unigram/bigram mention spans."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(mentions):
        """Collect the span text of each temporary mention."""
        return {m.get_span() for m in mentions}

    assert spans(space.apply(doc)) == {"This is", "is apple", "This", "is", "apple"}

    # Match any span that contains "apple"
    apple_matcher = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert spans(apple_matcher.apply(space.apply(doc))) == {"is apple", "apple"}

    # Match any span that contains "this" (case insensitive)
    this_matcher = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert spans(this_matcher.apply(space.apply(doc))) == {"This is", "This"}

    union = Union(apple_matcher, this_matcher, longest_match_only=False)
    assert spans(union.apply(space.apply(doc))) == {
        "is apple",
        "apple",
        "This is",
        "This",
    }

    # longest_match_only of each sub-matcher is ignored by the union.
    union = Union(apple_matcher, this_matcher, longest_match_only=True)
    assert spans(union.apply(space.apply(doc))) == {"This is", "is apple"}
def phone_extract_server(document, phone_subclass):
    """Run phone-number mention extraction on a single document and return it."""
    # Either the lambda-based check or the regex-based check may fire.
    number_matcher = LambdaFunctionMatcher(func=matcher_number_phone)
    regex_matcher = LambdaFunctionMatcher(func=regexMatch)
    phone_matcher = Union(regex_matcher, number_matcher)

    extractor = MentionExtractorUDF(
        [phone_subclass], [MentionPhoneNumber()], [phone_matcher]
    )
    return extractor.apply(document)
def phone_extract(docs, session, phone_subclass, parallelism, clear=True):
    """Extract phone-number mentions from docs and persist them via session."""
    # Either the lambda-based check or the regex-based check may fire.
    number_matcher = LambdaFunctionMatcher(func=matcher_number_phone)
    regex_matcher = LambdaFunctionMatcher(func=regexMatch)
    phone_matcher = Union(regex_matcher, number_matcher)

    extractor = MentionExtractor(
        session, [phone_subclass], [MentionPhoneNumber()], [phone_matcher]
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
def address_extract_server(document, address_subclass):
    """Run address mention extraction on a single document and return it."""
    province_m = LambdaFunctionMatcher(func=has_province_address)
    geo_term_m = LambdaFunctionMatcher(func=has_geographic_term_address)
    prefix_m = LambdaFunctionMatcher(func=address_prefix)
    composite_m = LambdaFunctionMatcher(
        func=is_collection_of_number_and_geographical_term_and_provinces_name_address
    )
    not_ignored_m = LambdaFunctionMatcher(func=hasnt_ignor_words)

    # A sentence is an address when at least one cue fires AND both
    # structural filters pass.
    address_matcher = Intersect(
        Union(province_m, geo_term_m, prefix_m), composite_m, not_ignored_m
    )

    extractor = MentionExtractorUDF(
        [address_subclass], [MentionSentences()], [address_matcher]
    )
    return extractor.apply(document)
def address_extract(docs, session, address_subclass, parallelism, clear=True):
    """Extract address mentions from docs and persist them via session."""
    province_m = LambdaFunctionMatcher(func=has_province_address)
    geo_term_m = LambdaFunctionMatcher(func=has_geographic_term_address)
    prefix_m = LambdaFunctionMatcher(func=address_prefix)
    composite_m = LambdaFunctionMatcher(
        func=is_collection_of_number_and_geographical_term_and_provinces_name_address
    )
    not_ignored_m = LambdaFunctionMatcher(func=hasnt_ignor_words)

    # A sentence is an address when at least one cue fires AND both
    # structural filters pass.
    address_matcher = Intersect(
        Union(province_m, geo_term_m, prefix_m), composite_m, not_ignored_m
    )

    extractor = MentionExtractor(
        session, [address_subclass], [MentionSentences()], [address_matcher]
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
def name_extract(docs, session, name_subclass, parallelism, clear=True):
    """Extract person-name mentions from docs and persist them via session."""
    length_m = LambdaFunctionMatcher(func=length_name)
    position_m = LambdaFunctionMatcher(func=position_name)
    capitalize_m = LambdaFunctionMatcher(func=capitalize_name)
    last_m = LambdaFunctionMatcher(func=last_name)
    common_m = LambdaFunctionMatcher(func=name_common)
    check_m = LambdaFunctionMatcher(func=check_name)
    prefix_m = LambdaFunctionMatcher(func=prefix_name)

    # Shape constraints every well-formed candidate must satisfy.
    form_m = Intersect(length_m, position_m, capitalize_m)

    # Accept a span when it matches one of the name patterns AND passes
    # the final check filter.
    name_matcher = Intersect(
        Union(
            Intersect(last_m, form_m),
            Intersect(common_m, form_m),
            prefix_m,
        ),
        check_m,
    )

    extractor = MentionExtractor(
        session, [name_subclass], [MentionName()], [name_matcher]
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
def name_extract_server(document, name_subclass):
    """Run person-name mention extraction on a single document and return it."""
    length_m = LambdaFunctionMatcher(func=length_name)
    position_m = LambdaFunctionMatcher(func=position_name)
    capitalize_m = LambdaFunctionMatcher(func=capitalize_name)
    last_m = LambdaFunctionMatcher(func=last_name)
    common_m = LambdaFunctionMatcher(func=name_common)
    check_m = LambdaFunctionMatcher(func=check_name)
    prefix_m = LambdaFunctionMatcher(func=prefix_name)

    # Shape constraints every well-formed candidate must satisfy.
    form_m = Intersect(length_m, position_m, capitalize_m)

    # Accept a span when it matches one of the name patterns AND passes
    # the final check filter.
    name_matcher = Intersect(
        Union(
            Intersect(last_m, form_m),
            Intersect(common_m, form_m),
            prefix_m,
        ),
        check_m,
    )

    extractor = MentionExtractorUDF([name_subclass], [MentionName()], [name_matcher])
    return extractor.apply(document)
# Defining ngrams for candidates extraction_name = "age" age_ngrams = MentionNgrams(n_max=3) # Define matchers m = RegexMatchSpan(rgx=r'.*(I|He|She) (is|am) ^([0-9]{2})*') p = RegexMatchSpan(rgx=r'.*(age|is|@|was) ^([0-9]{2})*') q = RegexMatchSpan(rgx=r'.*(age:) ^([0-9]{2})*') r = RegexMatchSpan( rgx=r'.*^([0-9]{2}) (yrs|years|year|yr|old|year-old|yr-old|Years|Year|Yr)*' ) s = RegexMatchSpan(rgx=r'(^|\W)age\W{0,4}[1-9]\d(\W|$)') # Union matchers and create candidate extractor age_matchers = Union(m, p, r, q, s) # Getting candidates AgeMention = mention_subclass("AgeMention") mention_extractor = MentionExtractor(session, [AgeMention], [age_ngrams], [age_matchers]) mention_extractor.clear_all() mention_extractor.apply(docs, parallelism=parallelism) candidate_class = candidate_subclass("Age", [AgeMention]) candidate_extractor = CandidateExtractor(session, [candidate_class]) # Applying candidate extractors candidate_extractor.apply(docs, split=0, parallelism=parallelism) print("==============================") print(f"Candidate extraction results for {postgres_db_name}:") print(
return False name = attr.get_span().replace("-", "") return (any(char.isdigit() for char in name) and any(char.isalpha() for char in name) and common_prefix_length_diff(file_name.split("_")[1], name) <= 2) add_rgx = r"^[A-Z0-9\-]{5,15}$" part_file_name_lambda_matcher = LambdaFunctionMatcher( func=part_file_name_conditions) part_file_name_matcher = Intersect( RegexMatchSpan(rgx=add_rgx, longest_match_only=True), part_file_name_lambda_matcher) part_matcher = Union(part_rgx_matcher, part_dict_matcher, part_file_name_matcher) # CE Voltage Matcher ce_keywords = set( ["collector emitter", "collector-emitter", "collector - emitter"]) ce_abbrevs = set(["ceo", "vceo"]) ce_v_max_rgx_matcher = RegexMatchSpan(rgx=r"\d{1,2}[05]", longest_match_only=False) def ce_v_max_conditions(attr): """Check ce_v_max conditions.""" return overlap(ce_keywords.union(ce_abbrevs), get_row_ngrams(attr, spread=[0, 3], n_max=3))