Пример #1
0
def test_regex_match(doc_setup):
    """Exercise RegexMatchSpan and RegexMatchEach on the fixture document."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(m):
        """Return the set of span texts ``m`` keeps from the n-gram space."""
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # A wrong option name (``regex`` rather than ``rgx``) should raise an
    # exception.
    with pytest.raises(Exception):
        RegexMatchSpan(regex=r"apple")

    # Both matcher flavours must raise when fed something other than
    # TemporarySpanMention objects.
    span_matcher = RegexMatchSpan(rgx=r"apple")
    with pytest.raises(ValueError):
        list(span_matcher.apply(doc.sentences[0].words))

    each_matcher = RegexMatchEach(rgx=r"apple")
    with pytest.raises(ValueError):
        list(each_matcher.apply(doc.sentences[0].words))

    # RegexMatchEach on proper mentions keeps only "apple".
    assert spans(each_matcher) == {"apple"}

    # ignore_case=False: the capitalised pattern finds nothing.
    case_sensitive = RegexMatchEach(rgx=r"Apple", ignore_case=False)
    assert list(case_sensitive.apply(space.apply(doc))) == []

    # sep option: with sep=" " the pattern "isapple" keeps the span "is apple".
    sep_matcher = RegexMatchSpan(rgx=r"isapple", sep=" ")
    assert spans(sep_matcher) == {"is apple"}
Пример #2
0
def get_subclasses(experiment):
  """Build the Data/Row/Col mention setup and the two candidate classes.

  Args:
    experiment: Passed through unchanged to ``get_label_matcher`` for each
      label-based matcher.

  Returns:
    The tuple ``(mention_classes, mention_spaces, matchers,
    candidate_classes, throttlers)``. The first three lists are ordered
    Data, Row, Col; the last two are ordered Row, Col.
  """

  def labelled_matcher(rgx, label):
    # A span is kept only if it satisfies both the regex and the
    # label predicate returned by get_label_matcher.
    by_regex = RegexMatchSpan(rgx=rgx, longest_match_only=True)
    by_label = LambdaFunctionMatcher(func=get_label_matcher(label, experiment))
    return Intersect(by_regex, by_label)

  # 1.) Mention subclasses
  Data = mention_subclass("Data")
  Row = mention_subclass("Row")
  Col = mention_subclass("Col")

  # 2.) Mention spaces (the trailing notes record the n-gram alternatives
  # the author left commented out)
  data_space = MentionSentences()  # MentionNgrams(n_max=3)
  row_space = MentionSentences()  # MentionNgrams(n_min=1, n_max=8)
  col_space = MentionSentences()  # MentionNgrams(n_min=1, n_max=8)

  # 3.) Matchers: Data uses a numeric-looking pattern; Row and Col accept
  # any text and rely on the "Header" label predicate.
  data_matcher = labelled_matcher(
      r"[0-9-,.%$#]+( to | )?[0-9-,.%$#]*|^x$", "Data")
  row_matcher = labelled_matcher(r"^.*$", "Header")
  col_matcher = labelled_matcher(r"^.*$", "Header")

  # 4.) Candidate classes
  RowCandidate = candidate_subclass("RowCandidate", [Data, Row])
  ColCandidate = candidate_subclass("ColCandidate", [Data, Col])

  # 5.) Throttlers, bundled together with everything built above
  return (
      [Data, Row, Col],
      [data_space, row_space, col_space],
      [data_matcher, row_matcher, col_matcher],
      [RowCandidate, ColCandidate],
      [row_filter, col_filter],
  )
Пример #3
0
def _get_part_matcher():
    """Return the part matcher.

    The result is a ``Union`` of three alternatives: a regex over the
    transistor naming conventions below, a dictionary built from
    ``_get_digikey_parts_set(DICT_PATH)``, and a generic part-like pattern
    that must also pass ``_part_file_name_conditions``.
    """
    # Transistor naming conventions, one regular expression each.
    naming_conventions = (
        # EECA
        (r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}[0-9]?[A-Z]?"
         r"(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)"),
        # JEDEC
        r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)",
        # JIS
        r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})",
        # Others
        (r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|TIPL|DTC|MMBT"
         r"|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}[\d]{2,4}[A-Z]{0,5}"
         r"(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)"),
    )

    # Alternative 1: any of the naming conventions.
    by_convention = RegexMatchSpan(
        rgx="|".join(naming_conventions), longest_match_only=True)

    # Alternative 2: membership in the parts dictionary.
    by_dictionary = DictionaryMatch(d=_get_digikey_parts_set(DICT_PATH))

    # Alternative 3: 5-15 characters of upper-case letters, digits or
    # hyphens that additionally satisfy the file-name predicate.
    by_file_name = Intersect(
        RegexMatchSpan(rgx=r"^[A-Z0-9\-]{5,15}$", longest_match_only=True),
        LambdaFunctionMatcher(func=_part_file_name_conditions),
    )

    return Union(by_convention, by_dictionary, by_file_name)
Пример #4
0
def test_inverse(doc_setup):
    """Check Inverse against a child matcher under both longest_match_only settings."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(m):
        """Return the set of span texts ``m`` keeps from the n-gram space."""
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # Sanity check: the full 1- and 2-gram space of the fixture document.
    every_span = {mention.get_span() for mention in space.apply(doc)}
    assert every_span == {"This is", "is apple", "This", "is", "apple"}

    # Expected complements of the "apple" matches.
    complement_all = {"This is", "This", "is"}
    complement_longest = {"This is"}

    # Child matcher for "apple" with longest_match_only=False
    child = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert spans(child) == {"is apple", "apple"}

    # Inverse keeps everything the child rejects ...
    assert spans(Inverse(child, longest_match_only=False)) == complement_all
    # ... and only the longest of those when longest_match_only=True.
    assert spans(Inverse(child, longest_match_only=True)) == complement_longest

    # Same child pattern, now with longest_match_only=True
    child = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=True
    )
    assert spans(child) == {"is apple"}

    # The flag set on Inverse itself is the one in effect: False ...
    assert spans(Inverse(child, longest_match_only=False)) == complement_all
    # ... and True.
    assert spans(Inverse(child, longest_match_only=True)) == complement_longest

    # Inverse must raise when no child matcher is provided.
    with pytest.raises(ValueError):
        Inverse()

    # Inverse must raise when two child matchers are provided.
    with pytest.raises(ValueError):
        Inverse(child, child)
Пример #5
0
def _get_temp_matcher(temp_type):
    """Return the temperature matcher for ``temp_type``.

    ``"max"`` and ``"min"`` each select their own pattern. Any other value
    is reported through ``logger.warning`` and ``None`` is returned.
    """
    if temp_type == "max":
        pattern = r"(?:[1][5-9]|20)[05]"
    elif temp_type == "min":
        pattern = r"-[56][05]"
    else:
        logger.warning(f"{temp_type} is not a valid temperature type.")
        return None

    return RegexMatchSpan(rgx=pattern, longest_match_only=False)
Пример #6
0
def test_union(doc_setup):
    """Check Union over two regex matchers, including option validation."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(m):
        """Return the set of span texts ``m`` keeps from the n-gram space."""
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # Sanity check: the full 1- and 2-gram space of the fixture document.
    every_span = {mention.get_span() for mention in space.apply(doc)}
    assert every_span == {"This is", "is apple", "This", "is", "apple"}

    # Matcher for "apple"
    apple = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert spans(apple) == {"is apple", "apple"}

    # Matcher for "this"; the lower-case pattern matches "This", so the
    # comparison is case insensitive.
    this = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert spans(this) == {"This is", "This"}

    # Union keeps whatever either child keeps.
    both = Union(apple, this, longest_match_only=False)
    assert spans(both) == {"is apple", "apple", "This is", "This"}

    # longest_match_only on the Union overrides the children's setting.
    both = Union(apple, this, longest_match_only=True)
    assert spans(both) == {"This is", "is apple"}

    # An unsupported option name should raise an exception.
    with pytest.raises(Exception):
        Union(apple, this, long_match_only=False)
Пример #7
0
def get_supply_current_matcher():
    """Build the supply-current matcher.

    Returns:
        An ``Intersect`` of the ``_condition`` and ``_first_page_or_table``
        predicates, a numeric regex, and the unit/keyword predicate
        ``current_units`` defined below.
    """

    def current_units(attr):
        """Return True when nearby n-grams name a current unit and a keyword."""
        # NOTE: These two symbols for mu are unique, not duplicates.
        units = ["ma", "μa", "ua", "µa", "\uf06da"]
        keywords = ["supply", "quiescent", "iq", "is", "idd", "icc"]
        rejected = ["offset", "bias", "logic", "shutdown"]

        # Context: n-grams to the right plus those from get_row_ngrams with
        # spread=[-5, 5].
        nearby = set(get_right_ngrams(attr, n_max=1, lower=True))
        nearby.update(get_row_ngrams(attr, n_max=1, spread=[-5, 5], lower=True))

        # A bare zero is never accepted.
        if attr.get_span().strip() == "0":
            return False

        # Any rejected keyword among the row n-grams disqualifies the mention.
        if overlap(rejected, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        # Both a unit and a keyword must appear in the context.
        return bool(overlap(units, nearby) and overlap(keywords, nearby))

    # Decimals with 1-2 digits on each side of the point, or integers of
    # 1-4 digits; either form may carry a leading "±".
    number = RegexMatchSpan(
        rgx=r"(±?\d{1,2}\.\d{1,2}|±?\d{1,4})", longest_match_only=False
    )

    return Intersect(
        LambdaFunctionMatcher(func=_condition),
        LambdaFunctionMatcher(func=_first_page_or_table),
        number,
        LambdaFunctionMatcher(func=current_units),
    )
Пример #8
0
def test_cancat(doc_setup):
    """Check the Concat matcher and its left/right/permutations options."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(m):
        """Return the set of span texts ``m`` keeps from the n-gram space."""
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # Child matcher for the pattern "this"
    this = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    # Child matcher for the pattern "is"
    is_ = RegexMatchSpan(
        rgx=r"is", search=False, full_match=False, longest_match_only=False
    )

    concat = Concat(this, is_)
    assert spans(concat) == {"This is"}

    # The matcher must raise when given something other than
    # TemporarySpanMention objects.
    with pytest.raises(ValueError):
        list(concat.apply(doc.sentences[0].words))

    # A child count other than 2 is reported when the matcher is applied.
    concat = Concat(this)
    with pytest.raises(ValueError):
        list(concat.apply(space.apply(doc)))

    # left_required=False
    concat = Concat(this, is_, left_required=False)
    assert spans(concat) == {"This is", "is apple"}

    # right_required=False
    concat = Concat(this, is_, right_required=False)
    assert spans(concat) == {"This is"}

    # permutations=False: with the children swapped nothing matches.
    concat = Concat(is_, this, permutations=False)
    assert set(concat.apply(space.apply(doc))) == set()

    # permutations=True: the swapped order still finds "This is".
    concat = Concat(is_, this, permutations=True)
    assert spans(concat) == {"This is"}
Пример #9
0
def test_union(caplog, doc_setup):
    """Check Union over two regex matchers (``caplog`` is requested but unused)."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=2)

    def spans(m):
        """Return the set of span texts ``m`` keeps from the n-gram space."""
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # Sanity check: the full 1- and 2-gram space of the fixture document.
    every_span = {mention.get_span() for mention in space.apply(doc)}
    assert every_span == {"This is", "is apple", "This", "is", "apple"}

    # Matcher for "apple"
    apple = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    assert spans(apple) == {"is apple", "apple"}

    # Matcher for "this"; the lower-case pattern matches "This", so the
    # comparison is case insensitive.
    this = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert spans(this) == {"This is", "This"}

    # Union keeps whatever either child keeps.
    both = Union(apple, this, longest_match_only=False)
    assert spans(both) == {"is apple", "apple", "This is", "This"}

    # longest_match_only on the Union overrides the children's setting.
    both = Union(apple, this, longest_match_only=True)
    assert spans(both) == {"This is", "is apple"}
Пример #10
0
def test_intersect(doc_setup):
    """Check Intersect over regex matchers on 1- to 3-gram spans."""
    doc = doc_setup
    space = MentionNgrams(n_min=1, n_max=3)

    def spans(m):
        """Return the set of span texts ``m`` keeps from the n-gram space."""
        return {mention.get_span() for mention in m.apply(space.apply(doc))}

    # Matcher for "apple"
    apple = RegexMatchSpan(
        rgx=r"apple", search=True, full_match=True, longest_match_only=False
    )
    apple_spans = {"This is apple", "is apple", "apple"}
    assert spans(apple) == apple_spans

    # Matcher for "this"; the lower-case pattern matches "This", so the
    # comparison is case insensitive.
    this = RegexMatchSpan(
        rgx=r"this", search=False, full_match=False, longest_match_only=False
    )
    assert spans(this) == {"This is apple", "This is", "This"}

    # Only the span accepted by both children survives the intersection.
    both = Intersect(apple, this, longest_match_only=False)
    assert spans(both) == {"This is apple"}

    # Intersecting a matcher with itself changes nothing.
    same = Intersect(apple, apple, longest_match_only=False)
    assert spans(same) == apple_spans

    # longest_match_only=True on Intersect overrides the children's setting.
    same = Intersect(apple, apple, longest_match_only=True)
    assert spans(same) == {"This is apple"}
Пример #11
0
def _get_polarity_matcher():
    """Return the polarity matcher.

    A span must match ``NPN`` or ``PNP`` (case insensitive) and its sentence
    n-grams must not include "complement" or "complementary".
    """

    def polarity_conditions(attr):
        # True only when neither excluded term is among the sentence n-grams.
        sentence_ngrams = get_sentence_ngrams(attr)
        excluded = ["complement", "complementary"]
        return not overlap(excluded, sentence_ngrams)

    by_regex = RegexMatchSpan(
        rgx=r"NPN|PNP", longest_match_only=False, ignore_case=True
    )
    by_context = LambdaFunctionMatcher(func=polarity_conditions)
    return Intersect(by_regex, by_context)
Пример #12
0
def get_gain_matcher():
    """Build the gain matcher.

    Returns:
        An ``Intersect`` of the unit/keyword predicate ``hertz_units``
        defined below, a numeric regex, and the ``_first_page_or_table``
        and ``_condition`` predicates.
    """

    def hertz_units(attr):
        """Return True when nearby n-grams name a hertz unit and a keyword."""
        units = ["mhz", "khz"]
        # NOTE(review): "gain" is listed twice in the original; kept as-is.
        keywords = [
            "product", "gain", "gain", "unity",
            "bandwidth", "gbp", "gbw", "gbwp",
        ]
        rejected = ["-3 db", "maximum", "minimum", "impedance"]

        # Context: n-grams to the right plus those from get_row_ngrams with
        # spread=[-2, 2].
        nearby = set(get_right_ngrams(attr, n_max=1, lower=True))
        nearby.update(
            get_row_ngrams(attr, n_max=1, spread=[-2, 2], lower=True))
        in_cell = set(get_cell_ngrams(attr, n_max=1, lower=True))

        # Cells containing both "f" and "=" are rejected.
        if "f" in in_cell and "=" in in_cell:
            return False

        # A bare zero is never accepted.
        if attr.get_span().strip() == "0":
            return False

        # Any rejected keyword among the row n-grams disqualifies the mention.
        if overlap(rejected, get_row_ngrams(attr, n_max=1, lower=True)):
            return False

        # Both a unit and a keyword must appear in the context.
        return bool(overlap(units, nearby) and overlap(keywords, nearby))

    # Integers of 1-3 digits, or decimals with 1-2 digits on each side of
    # the point.
    number = RegexMatchSpan(
        rgx=r"^(?:\d{1,2}\.\d{1,2}|\d{1,3})$", longest_match_only=False
    )

    by_units = LambdaFunctionMatcher(func=hertz_units)
    by_condition = LambdaFunctionMatcher(func=_condition)
    by_location = LambdaFunctionMatcher(func=_first_page_or_table)

    return Intersect(by_units, number, by_location, by_condition)
Пример #13
0
def _get_ce_v_max_matcher():
    """Return a collector-emitter voltage max matcher.

    The result is an ``Intersect`` of a numeric regex, a row/sentence
    predicate defined below, and the ``_attr_in_table`` predicate.
    """
    # Keywords and abbreviations, merged once instead of on every call.
    # NOTE(review): the row n-grams are requested with n_max=1 and without
    # lower-casing, so the multi-word keywords may never match; confirm
    # against get_row_ngrams.
    ce_terms = {
        "collector emitter",
        "collector-emitter",
        "collector - emitter",
        "ceo",
        "vceo",
    }
    base_terms = ("vcb", "base")

    def ce_v_max_conditions(attr):
        # The row must mention one of the collector-emitter terms ...
        row_ngrams = set(get_row_ngrams(attr, n_max=1))
        if not overlap(ce_terms, row_ngrams):
            return False

        # ... and the sentence text must not contain "vcb" or "base".
        sentence_text = attr.sentence.text.lower()
        return not any(term in sentence_text for term in base_terms)

    return Intersect(
        RegexMatchSpan(rgx=r"\b\d{1,2}[05]", longest_match_only=False),
        LambdaFunctionMatcher(func=ce_v_max_conditions),
        LambdaFunctionMatcher(func=_attr_in_table),
    )
# Script: extract "call" mentions and candidates from every stored document.
# `session`, `Document` and `parallelism` are not defined in this snippet and
# must already be in scope.

# Fetch every Document available through the session
print("Getting documents and sentences...")
docs = session.query(Document).all()
#sents = session.query(Sentence).all()

# NOTE: `Union` is imported but not used in this snippet.
from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from fonduer.candidates.matchers import RegexMatchSpan, Union

# Mention space: n-grams with n_max=1
extraction_name = 'call'
ngrams = MentionNgrams(n_max=1)

# Define the matcher
# NOTE(review): the "in calls?" / "out calls?" alternatives contain a space;
# with n_max=1 above they can presumably never match a span - confirm, and
# raise n_max if two-word forms are wanted.
regex_matcher_1=RegexMatchSpan(rgx = r'(incalls?|outcalls?|incalls?outcalls?|in calls?|out calls?)')

# Only one matcher is defined, so it is used directly (no Union is built)
matchers = regex_matcher_1
# Create the mention class, clear the extractor's previous output, then
# extract mentions
CallMention = mention_subclass("CallMention")
mention_extractor = MentionExtractor(
        session, [CallMention], [ngrams], [matchers]
    )
mention_extractor.clear_all()
mention_extractor.apply(docs, parallelism=parallelism)
# Candidate class built from the single mention class
candidate_class = candidate_subclass("Call", [CallMention])
candidate_extractor = CandidateExtractor(session, [candidate_class])

# Apply the candidate extractor, assigning everything to split 0
candidate_extractor.apply(docs, split=0, parallelism=parallelism)
Пример #15
0
# Script: set up "age" mention extraction over every stored document.
# `session` and `Document` are not defined in this snippet and must already
# be in scope.

# Fetch every Document available through the session
print("Getting documents and sentences...")
docs = session.query(Document).all()
#sents = session.query(Sentence).all()

from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from fonduer.candidates.matchers import RegexMatchSpan, Union

# Mention space: n-grams of up to three tokens
extraction_name = "age"
age_ngrams = MentionNgrams(n_max=3)

# Define matchers
# FIX: m, p and q used to contain "^" right after a literal space
# (e.g. r'... (is|am) ^([0-9]{2})*'). A start-of-string anchor can never
# match once characters have been consumed, so those three matchers could
# not match anything. The stray anchors are removed; the rest of each
# pattern is unchanged.
# NOTE(review): the trailing "*" still makes the digit group optional, so the
# number itself is not required - confirm whether that is intended.
m = RegexMatchSpan(rgx=r'.*(I|He|She) (is|am) ([0-9]{2})*')
p = RegexMatchSpan(rgx=r'.*(age|is|@|was) ([0-9]{2})*')
q = RegexMatchSpan(rgx=r'.*(age:) ([0-9]{2})*')
# Here the "^" follows ".*", which can match the empty string, so the
# pattern is satisfiable: the span must begin with the two digits.
r = RegexMatchSpan(
    rgx=r'.*^([0-9]{2}) (yrs|years|year|yr|old|year-old|yr-old|Years|Year|Yr)*'
)
# "age", up to four non-word characters, then a two-digit number (10-99).
s = RegexMatchSpan(rgx=r'(^|\W)age\W{0,4}[1-9]\d(\W|$)')

# Accept a span if any of the patterns matches
age_matchers = Union(m, p, r, q, s)

# Create the mention class and clear the extractor's previous output
AgeMention = mention_subclass("AgeMention")
mention_extractor = MentionExtractor(session, [AgeMention], [age_ngrams],
                                     [age_matchers])
mention_extractor.clear_all()
Пример #16
0
"""Hardware matchers."""
import csv

from fonduer.candidates.matchers import (
    DictionaryMatch,
    Intersect,
    LambdaFunctionMatcher,
    RegexMatchSpan,
    Union,
)
from fonduer.utils.data_model_utils import get_row_ngrams, overlap

# Temperature pattern: "15".."19" or "20", followed by a final 0 or 5.
temp_matcher = RegexMatchSpan(
    rgx=r"(?:[1][5-9]|20)[05]",
    longest_match_only=False,
)

# Transistor naming conventions, one regular expression per convention.
eeca_rgx = (
    r"([ABC][A-Z][WXYZ]?[0-9]{3,5}(?:[A-Z]){0,5}"
    r"[0-9]?[A-Z]?(?:-[A-Z0-9]{1,7})?(?:[-][A-Z0-9]{1,2})?(?:\/DG)?)"
)
jedec_rgx = r"(2N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)"
jis_rgx = r"(2S[ABCDEFGHJKMQRSTVZ]{1}[\d]{2,4})"
others_rgx = (
    r"((?:NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|ZXT|TIS|"
    r"TIPL|DTC|MMBT|SMMBT|PZT|FZT|STD|BUV|PBSS|KSC|CXT|FCX|CMPT){1}"
    r"[\d]{2,4}[A-Z]{0,5}(?:-[A-Z0-9]{0,6})?(?:[-][A-Z0-9]{0,1})?)"
)

# A part number may follow any one convention, so the patterns are OR-ed.
part_rgx = "|".join((eeca_rgx, jedec_rgx, jis_rgx, others_rgx))
part_rgx_matcher = RegexMatchSpan(longest_match_only=True, rgx=part_rgx)


def get_digikey_parts_set(path):
    """Get all transistor parts from digikey part dictionary."""
    all_parts = set()
Пример #17
0
# 2.) Mention spaces
# `station_rgx`, `Station` and `Price` are not defined in this snippet and
# must already be in scope.
# FIX: the split token was written as "\." in a normal string literal, which
# is an invalid escape sequence (DeprecationWarning, SyntaxWarning on
# Python 3.12+). The raw string below has exactly the same runtime value.
station_ngrams = MentionNgrams(
    n_max=4, split_tokens=[" ", "_", r"\.",
                           "%"])  # StationMentionSpace(n_max=4) #
price_ngrams = MentionNgrams(n_max=1)

# 3.) Matcher functions
station_matcher = RegexMatchFull(
    rgx=station_rgx,
    ignore_case=True,
    # search=True,
    # full_match=False,
    # longest_match_only=False,
)  # DictionaryMatch(d=stations_list)
# Price pattern: 1-4 digits followed by a mandatory decimal part of 1-5 digits.
price_matcher = RegexMatchSpan(rgx=r"\d{1,4}(\.\d{1,5})",
                               longest_match_only=True)

# 4.) Candidate classes
StationPrice = candidate_subclass("StationPrice", [Station, Price])


# 5.) Throttlers
def my_throttler(c):
    (station, price) = c
    if 'volume' in get_aligned_ngrams(price, lower=True):
        return False
    if 'date' in get_aligned_ngrams(price, lower=True):
        return False
    if 'non' in get_aligned_ngrams(price, lower=True):
        return False
    html_tags = get_ancestor_tag_names(station)
Пример #18
0
# Script: extract "ethnicity" mentions and candidates from every stored
# document. `session`, `Document` and `parallelism` are not defined in this
# snippet and must already be in scope.

# Fetch every Document available through the session
print("Getting documents and sentences...")
docs = session.query(Document).all()
#sents = session.query(Sentence).all()

# NOTE: `Union` is imported but not used in this snippet.
from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from fonduer.candidates.matchers import RegexMatchSpan, Union

# Mention space: n-grams with n_max=1
extraction_name = 'ethnicity'
ngrams = MentionNgrams(n_max=1)

# Define the matcher: a fixed list of single-word alternatives
regex_matcher_1 = RegexMatchSpan(
    rgx=r'(black|ebony|chocolate|mocha|cocoa|white|blonde|asian|latina|arab)')

# Only one matcher is defined, so it is used directly (no Union is built)
matchers = regex_matcher_1

# Create the mention class, clear the extractor's previous output, then
# extract mentions
EthnicityMention = mention_subclass("EthnicityMention")
mention_extractor = MentionExtractor(session, [EthnicityMention], [ngrams],
                                     [matchers])
mention_extractor.clear_all()
mention_extractor.apply(docs, parallelism=parallelism)
# Candidate class built from the single mention class
candidate_class = candidate_subclass("Ethnicity", [EthnicityMention])
candidate_extractor = CandidateExtractor(session, [candidate_class])

# Apply the candidate extractor, assigning everything to split 0
candidate_extractor.apply(docs, split=0, parallelism=parallelism)