Exemplo n.º 1
0
def test_union(doc_setup):
    """Exercise Union over two regex matchers on the fixture document."""
    document = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=2)

    def matched_spans(matcher=None):
        """Return the span texts of the space, filtered by matcher if given."""
        mentions = ngram_space.apply(document)
        if matcher is not None:
            mentions = matcher.apply(mentions)
        return {mention.get_span() for mention in mentions}

    # With n_min=1 and n_max=2 the unfiltered space should yield these spans.
    assert matched_spans() == {
        "This",
        "is",
        "apple",
        "This is",
        "is apple",
    }

    # Regex "apple" with search=True and full_match=True: per the assertion
    # below, only "is apple" and "apple" are expected to survive.
    matcher0 = RegexMatchSpan(
        rgx=r"apple",
        search=True,
        full_match=True,
        longest_match_only=False,
    )
    assert matched_spans(matcher0) == {"is apple", "apple"}

    # Regex "this" with search=False and full_match=False: the lowercase
    # pattern is expected to accept the capitalised spans "This is" and "This".
    matcher1 = RegexMatchSpan(
        rgx=r"this",
        search=False,
        full_match=False,
        longest_match_only=False,
    )
    assert matched_spans(matcher1) == {"This is", "This"}

    # Without longest_match_only the union keeps everything either child keeps.
    keep_all = Union(matcher0, matcher1, longest_match_only=False)
    assert matched_spans(keep_all) == {
        "This",
        "This is",
        "apple",
        "is apple",
    }

    # longest_match_only of each matcher is ignored; the Union's own setting
    # decides, so only the two-word spans remain.
    keep_longest = Union(matcher0, matcher1, longest_match_only=True)
    assert matched_spans(keep_longest) == {"This is", "is apple"}

    # An unsupported option (note the misspelt keyword) should raise.
    with pytest.raises(Exception):
        Union(matcher0, matcher1, long_match_only=False)
Exemplo n.º 2
0
def _get_part_matcher():
    """Assemble and return the matcher for part mentions.

    The result is a ``Union`` of three matchers:

    * a ``RegexMatchSpan`` over the transistor naming patterns listed below,
    * a ``DictionaryMatch`` built from ``_get_digikey_parts_set(DICT_PATH)``,
    * an ``Intersect`` of a ``RegexMatchSpan`` with a ``LambdaFunctionMatcher``
      wrapping ``_part_file_name_conditions``.
    """
    # Transistor naming conventions as regular expressions, one per family.
    naming_patterns = [
        # EECA
        (r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
         r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)"),
        # JEDEC
        r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)",
        # JIS
        r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})",
        # Other prefixes
        (r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
         r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
         r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)"),
    ]

    file_name_condition = LambdaFunctionMatcher(
        func=_part_file_name_conditions)

    # Any one of the naming patterns is enough for the regex matcher.
    naming_matcher = RegexMatchSpan(
        rgx="|".join(naming_patterns),
        longest_match_only=True,
    )
    dictionary_matcher = DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH))

    # Spans must satisfy both the character-class pattern and the predicate.
    file_name_matcher = Intersect(
        RegexMatchSpan(rgx=r"^[A-Z0-9\-]{5,15}$", longest_match_only=True),
        file_name_condition,
    )

    return Union(naming_matcher, dictionary_matcher, file_name_matcher)
Exemplo n.º 3
0
def test_union(caplog, doc_setup):
    """Exercise Union over two regex matchers on the fixture document."""
    document = doc_setup
    ngram_space = MentionNgrams(n_min=1, n_max=2)

    def matched_spans(matcher=None):
        """Return the span texts of the space, filtered by matcher if given."""
        mentions = ngram_space.apply(document)
        if matcher is not None:
            mentions = matcher.apply(mentions)
        return {mention.get_span() for mention in mentions}

    # With n_min=1 and n_max=2 the unfiltered space should yield these spans.
    assert matched_spans() == {
        "This",
        "is",
        "apple",
        "This is",
        "is apple",
    }

    # Regex "apple" with search=True and full_match=True: per the assertion
    # below, only "is apple" and "apple" are expected to survive.
    matcher0 = RegexMatchSpan(
        rgx=r"apple",
        search=True,
        full_match=True,
        longest_match_only=False,
    )
    assert matched_spans(matcher0) == {"is apple", "apple"}

    # Regex "this" with search=False and full_match=False: the lowercase
    # pattern is expected to accept the capitalised spans "This is" and "This".
    matcher1 = RegexMatchSpan(
        rgx=r"this",
        search=False,
        full_match=False,
        longest_match_only=False,
    )
    assert matched_spans(matcher1) == {"This is", "This"}

    # Without longest_match_only the union keeps everything either child keeps.
    keep_all = Union(matcher0, matcher1, longest_match_only=False)
    assert matched_spans(keep_all) == {
        "This",
        "This is",
        "apple",
        "is apple",
    }

    # longest_match_only of each matcher is ignored; the Union's own setting
    # decides, so only the two-word spans remain.
    keep_longest = Union(matcher0, matcher1, longest_match_only=True)
    assert matched_spans(keep_longest) == {"This is", "is apple"}
Exemplo n.º 4
0
def phone_extract_server(document, phone_subclass):
    """Apply phone mention extraction to ``document`` and return the result.

    Args:
        document: Object handed to ``MentionExtractorUDF.apply``.
        phone_subclass: Mention subclass that receives the extracted mentions.

    Returns:
        Whatever ``MentionExtractorUDF.apply`` returns for ``document``.
    """
    number_matcher = LambdaFunctionMatcher(func=matcher_number_phone)
    pattern_matcher = LambdaFunctionMatcher(func=regexMatch)
    # Either predicate is sufficient for a span to be kept.
    either_matcher = Union(pattern_matcher, number_matcher)

    candidate_space = MentionPhoneNumber()

    extractor = MentionExtractorUDF(
        [phone_subclass],
        [candidate_space],
        [either_matcher],
    )
    return extractor.apply(document)
Exemplo n.º 5
0
def phone_extract(docs, session, phone_subclass, parallelism, clear=True):
    """Run phone mention extraction over ``docs`` through ``session``.

    Args:
        docs: Documents handed to ``MentionExtractor.apply``.
        session: Session object passed to the ``MentionExtractor``.
        phone_subclass: Mention subclass that receives the extracted mentions.
        parallelism: Forwarded to ``MentionExtractor.apply``.
        clear: Forwarded to ``MentionExtractor.apply``; defaults to True.
    """
    number_matcher = LambdaFunctionMatcher(func=matcher_number_phone)
    pattern_matcher = LambdaFunctionMatcher(func=regexMatch)
    # Either predicate is sufficient for a span to be kept.
    either_matcher = Union(pattern_matcher, number_matcher)

    candidate_space = MentionPhoneNumber()

    extractor = MentionExtractor(
        session,
        [phone_subclass],
        [candidate_space],
        [either_matcher],
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
Exemplo n.º 6
0
def address_extract_server(document, address_subclass):
    """Apply address mention extraction to ``document`` and return the result.

    The matcher is ``Intersect(Union(a, b, c), d, e)``, where each operand is
    a ``LambdaFunctionMatcher`` around one of the address predicates.

    Args:
        document: Object handed to ``MentionExtractorUDF.apply``.
        address_subclass: Mention subclass that receives the mentions.

    Returns:
        Whatever ``MentionExtractorUDF.apply`` returns for ``document``.
    """
    # Predicates grouped under the Union.
    alternatives = [
        LambdaFunctionMatcher(func=predicate)
        for predicate in (
            has_province_address,
            has_geographic_term_address,
            address_prefix,
        )
    ]
    # Predicates intersected with the Union.
    composition_check = LambdaFunctionMatcher(
        func=is_collection_of_number_and_geographical_term_and_provinces_name_address
    )
    ignore_words_check = LambdaFunctionMatcher(func=hasnt_ignor_words)
    combined_matcher = Intersect(
        Union(*alternatives),
        composition_check,
        ignore_words_check,
    )

    sentence_space = MentionSentences()

    extractor = MentionExtractorUDF(
        [address_subclass],
        [sentence_space],
        [combined_matcher],
    )
    return extractor.apply(document)
Exemplo n.º 7
0
def address_extract(docs, session, address_subclass, parallelism, clear=True):
    """Run address mention extraction over ``docs`` through ``session``.

    The matcher is ``Intersect(Union(a, b, c), d, e)``, where each operand is
    a ``LambdaFunctionMatcher`` around one of the address predicates.

    Args:
        docs: Documents handed to ``MentionExtractor.apply``.
        session: Session object passed to the ``MentionExtractor``.
        address_subclass: Mention subclass that receives the mentions.
        parallelism: Forwarded to ``MentionExtractor.apply``.
        clear: Forwarded to ``MentionExtractor.apply``; defaults to True.
    """
    # Predicates grouped under the Union.
    alternatives = [
        LambdaFunctionMatcher(func=predicate)
        for predicate in (
            has_province_address,
            has_geographic_term_address,
            address_prefix,
        )
    ]
    # Predicates intersected with the Union.
    composition_check = LambdaFunctionMatcher(
        func=is_collection_of_number_and_geographical_term_and_provinces_name_address
    )
    ignore_words_check = LambdaFunctionMatcher(func=hasnt_ignor_words)
    combined_matcher = Intersect(
        Union(*alternatives),
        composition_check,
        ignore_words_check,
    )

    sentence_space = MentionSentences()

    extractor = MentionExtractor(
        session,
        [address_subclass],
        [sentence_space],
        [combined_matcher],
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
Exemplo n.º 8
0
def name_extract(docs, session, name_subclass, parallelism, clear=True):
    """Run name mention extraction over ``docs`` through ``session``.

    The matcher has the shape
    ``Intersect(Union(Intersect(last, form), Intersect(common, form), prefix),
    check)``, where ``form`` is the intersection of the length, position and
    capitalisation predicates.

    Args:
        docs: Documents handed to ``MentionExtractor.apply``.
        session: Session object passed to the ``MentionExtractor``.
        name_subclass: Mention subclass that receives the extracted mentions.
        parallelism: Forwarded to ``MentionExtractor.apply``.
        clear: Forwarded to ``MentionExtractor.apply``; defaults to True.
    """
    # Predicates that make up the shared "form" check.
    by_length = LambdaFunctionMatcher(func=length_name)
    by_position = LambdaFunctionMatcher(func=position_name)
    by_capitalisation = LambdaFunctionMatcher(func=capitalize_name)

    by_last_name = LambdaFunctionMatcher(func=last_name)
    by_common_name = LambdaFunctionMatcher(func=name_common)
    final_check = LambdaFunctionMatcher(func=check_name)
    by_prefix = LambdaFunctionMatcher(func=prefix_name)

    form_check = Intersect(by_length, by_position, by_capitalisation)

    # The same form check is reused under both name-list branches.
    accepted_shapes = Union(
        Intersect(by_last_name, form_check),
        Intersect(by_common_name, form_check),
        by_prefix,
    )
    combined_matcher = Intersect(accepted_shapes, final_check)
    candidate_space = MentionName()

    extractor = MentionExtractor(
        session,
        [name_subclass],
        [candidate_space],
        [combined_matcher],
    )
    extractor.apply(docs, parallelism=parallelism, clear=clear)
Exemplo n.º 9
0
def name_extract_server(document, name_subclass):
    """Apply name mention extraction to ``document`` and return the result.

    The matcher has the shape
    ``Intersect(Union(Intersect(last, form), Intersect(common, form), prefix),
    check)``, where ``form`` is the intersection of the length, position and
    capitalisation predicates.

    Args:
        document: Object handed to ``MentionExtractorUDF.apply``.
        name_subclass: Mention subclass that receives the extracted mentions.

    Returns:
        Whatever ``MentionExtractorUDF.apply`` returns for ``document``.
    """
    # Predicates that make up the shared "form" check.
    by_length = LambdaFunctionMatcher(func=length_name)
    by_position = LambdaFunctionMatcher(func=position_name)
    by_capitalisation = LambdaFunctionMatcher(func=capitalize_name)

    by_last_name = LambdaFunctionMatcher(func=last_name)
    by_common_name = LambdaFunctionMatcher(func=name_common)
    final_check = LambdaFunctionMatcher(func=check_name)
    by_prefix = LambdaFunctionMatcher(func=prefix_name)

    form_check = Intersect(by_length, by_position, by_capitalisation)

    # The same form check is reused under both name-list branches.
    accepted_shapes = Union(
        Intersect(by_last_name, form_check),
        Intersect(by_common_name, form_check),
        by_prefix,
    )
    combined_matcher = Intersect(accepted_shapes, final_check)

    candidate_space = MentionName()

    extractor = MentionExtractorUDF(
        [name_subclass],
        [candidate_space],
        [combined_matcher],
    )
    return extractor.apply(document)
Exemplo n.º 10
0
# Age extraction pipeline: build the mention space and matchers, extract
# AgeMention mentions, then extract "Age" candidates from them.
# Relies on `session`, `docs`, `parallelism` and `postgres_db_name` being
# defined before this point (not visible in this fragment).

# Define the n-gram space that candidate spans are drawn from (up to 3 tokens).
extraction_name = "age"
age_ngrams = MentionNgrams(n_max=3)

# Define matchers
# NOTE(review): in `m`, `p` and `q` the `^` anchor follows literal text. With
# Python's `re`, `^` only matches at the start of the string unless
# re.MULTILINE is set, so these three patterns may never match - confirm the
# intent and how RegexMatchSpan compiles its pattern.
m = RegexMatchSpan(rgx=r'.*(I|He|She) (is|am) ^([0-9]{2})*')
p = RegexMatchSpan(rgx=r'.*(age|is|@|was) ^([0-9]{2})*')
q = RegexMatchSpan(rgx=r'.*(age:) ^([0-9]{2})*')
# Two digits followed by a space and zero or more unit words (yrs, years, ...).
r = RegexMatchSpan(
    rgx=r'.*^([0-9]{2}) (yrs|years|year|yr|old|year-old|yr-old|Years|Year|Yr)*'
)
# The word "age", up to four non-word characters, then a two-digit number
# whose first digit is 1-9, delimited by non-word characters or string edges.
s = RegexMatchSpan(rgx=r'(^|\W)age\W{0,4}[1-9]\d(\W|$)')

# Combine all five matchers into a single Union matcher.
age_matchers = Union(m, p, r, q, s)

# Extract mentions: create the mention subclass, clear existing mentions via
# clear_all(), then run the extractor over the documents.
AgeMention = mention_subclass("AgeMention")
mention_extractor = MentionExtractor(session, [AgeMention], [age_ngrams],
                                     [age_matchers])
mention_extractor.clear_all()
mention_extractor.apply(docs, parallelism=parallelism)
# Candidates of class "Age" are built from a single AgeMention each.
candidate_class = candidate_subclass("Age", [AgeMention])
candidate_extractor = CandidateExtractor(session, [candidate_class])

# Apply the candidate extractor, storing results under split 0.
candidate_extractor.apply(docs, split=0, parallelism=parallelism)
print("==============================")
print(f"Candidate extraction results for {postgres_db_name}:")
print(
Exemplo n.º 11
0
        return False
    name = attr.get_span().replace("-", "")
    return (any(char.isdigit() for char in name)
            and any(char.isalpha() for char in name)
            and common_prefix_length_diff(file_name.split("_")[1], name) <= 2)


# Pattern for an entire span of 5 to 15 characters drawn from uppercase
# letters, digits and hyphens (anchored at both ends).
add_rgx = r"^[A-Z0-9\-]{5,15}$"

# Wrap the file-name predicate so it can be composed with other matchers.
part_file_name_lambda_matcher = LambdaFunctionMatcher(
    func=part_file_name_conditions)
# Spans must satisfy both the character-class pattern and the predicate.
part_file_name_matcher = Intersect(
    RegexMatchSpan(rgx=add_rgx, longest_match_only=True),
    part_file_name_lambda_matcher)

# Final part matcher: the union of the regex matcher, the dictionary matcher
# and the file-name matcher. `part_rgx_matcher` and `part_dict_matcher` are
# presumably defined earlier in the module - they are not visible here.
part_matcher = Union(part_rgx_matcher, part_dict_matcher,
                     part_file_name_matcher)

# CE Voltage Matcher
# Phrases and abbreviations looked up by ce_v_max_conditions.
ce_keywords = {
    "collector emitter",
    "collector-emitter",
    "collector - emitter",
}
ce_abbrevs = {"ceo", "vceo"}
# One or two digits followed by a 0 or a 5.
ce_v_max_rgx_matcher = RegexMatchSpan(
    rgx=r"\d{1,2}[05]",
    longest_match_only=False,
)


def ce_v_max_conditions(attr):
    """Return the result of overlapping the CE terms with attr's row n-grams.

    The CE terms are the union of ``ce_keywords`` and ``ce_abbrevs``; the row
    n-grams come from ``get_row_ngrams(attr, spread=[0, 3], n_max=3)``.
    """
    ce_terms = ce_keywords.union(ce_abbrevs)
    row_ngrams = get_row_ngrams(attr, spread=[0, 3], n_max=3)
    return overlap(ce_terms, row_ngrams)