예제 #1
0
def test_mention_longest_match():
    """Check the effect of ``longest_match_only`` on Place mention extraction.

    One document is parsed, then mention extraction is run twice with a
    birthplace matcher: once with ``longest_match_only=False`` and once with
    ``longest_match_only=True``. The expected span counts (23 and 4) are
    specific to the ``lincoln_short.html`` fixture.
    """
    # SpaCy has issues with parallel parsing on macOS, so stay single-process.
    PARALLEL = 1

    session = Meta.init(CONN_STRING).Session()

    # Parsing
    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(
        "tests/data/pure_html/lincoln_short.html", max_docs=1
    )
    corpus_parser = Parser(session, structural=True, lingual=True)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
    docs = session.query(Document).order_by(Document.name).all()

    # Mention Extraction
    name_ngrams = MentionNgramsPart(n_max=3)
    place_ngrams = MentionNgramsTemp(n_max=4)

    Name = mention_subclass("Name")
    Place = mention_subclass("Place")

    def is_birthplace_table_row(mention):
        """Accept tabular mentions whose row n-grams include "birth_place"."""
        if not mention.sentence.is_tabular():
            return False
        return "birth_place" in get_row_ngrams(mention, lower=True)

    def extract_place_spans(longest_match_only):
        """Run mention extraction and return the spans of all Place mentions."""
        place_matcher = LambdaFunctionMatcher(
            func=is_birthplace_table_row, longest_match_only=longest_match_only
        )
        extractor = MentionExtractor(
            session,
            [Name, Place],
            [name_ngrams, place_ngrams],
            [PersonMatcher(), place_matcher],
        )
        extractor.apply(docs, parallelism=PARALLEL)
        return [m.context.get_span() for m in session.query(Place).all()]

    # Without longest-match filtering the nested span "Farm" is also present.
    all_spans = extract_place_spans(longest_match_only=False)
    assert "Sinking Spring Farm" in all_spans
    assert "Farm" in all_spans
    assert len(all_spans) == 23

    # With longest-match filtering the nested span "Farm" is gone.
    longest_spans = extract_place_spans(longest_match_only=True)
    assert "Sinking Spring Farm" in longest_spans
    assert "Farm" not in longest_spans
    assert len(longest_spans) == 4
예제 #2
0
# Mention classes for the two entity types, created with mention_subclass
# (which is not defined in this excerpt).
Presidentname = mention_subclass("Presidentname")
Placeofbirth = mention_subclass("Placeofbirth")


def mention_span_matches_file_name(mention):
    """Tell whether a mention's text equals the name of its document.

    Underscores in the document name are replaced by spaces before the
    comparison; the comparison itself is exact (case-sensitive, no trimming).

    Args:
        mention: Object exposing ``get_span()`` and
            ``sentence.document.name``.

    Returns:
        ``True`` when the span text equals the normalized document name,
        ``False`` otherwise.
    """
    span_text = mention.get_span()
    normalized_doc_name = mention.sentence.document.name.replace("_", " ")
    return span_text == normalized_doc_name


from fonduer.candidates.matchers import LambdaFunctionMatcher, Intersect, Union

# Wrap mention_span_matches_file_name in a LambdaFunctionMatcher by passing
# it as the matcher's ``func``.
president_name_matcher = LambdaFunctionMatcher(
    func=mention_span_matches_file_name)

from fonduer.utils.data_model_utils import get_row_ngrams


def is_in_birthplace_table_row(mention):
    """Tell whether a tabular mention's row n-grams include "birth" and "place".

    Args:
        mention: Object exposing ``sentence.is_tabular()``; it is also passed
            to ``get_row_ngrams``.

    Returns:
        ``False`` when the mention's sentence is not tabular. Otherwise
        ``True`` only if both "birth" and "place" are among the values
        produced by ``get_row_ngrams(mention, lower=True)``.
    """
    if not mention.sentence.is_tabular():
        return False
    row_ngrams = set(get_row_ngrams(mention, lower=True))
    return row_ngrams.issuperset({"birth", "place"})

예제 #3
0
def part_file_name_conditions(attr):
    """Decide whether a span resembles the second part of its file name.

    The span is accepted only when all of the following hold:

    * the document name splits on ``"_"`` into exactly two pieces;
    * the span does not start with a hyphen;
    * with hyphens removed, the span contains at least one digit and at
      least one letter;
    * ``common_prefix_length_diff`` between the second piece of the document
      name and the hyphen-free span is at most 2.

    Args:
        attr: Object exposing ``get_span()`` and ``sentence.document.name``.

    Returns:
        ``True`` when every condition above is met, ``False`` otherwise.
        An empty span yields ``False``.
    """
    name_parts = attr.sentence.document.name.split("_")
    if len(name_parts) != 2:
        return False

    span = attr.get_span()
    # startswith() rather than span[0]: an empty span is rejected instead of
    # raising IndexError.
    if span.startswith("-"):
        return False

    name = span.replace("-", "")
    return (any(char.isdigit() for char in name)
            and any(char.isalpha() for char in name)
            and common_prefix_length_diff(name_parts[1], name) <= 2)


# Pattern for spans of 5 to 15 characters drawn from uppercase letters,
# digits, and hyphens (anchored at both ends).
add_rgx = r"^[A-Z0-9\-]{5,15}$"

# Wrap the file-name heuristic as a matcher by passing it as ``func``.
part_file_name_lambda_matcher = LambdaFunctionMatcher(
    func=part_file_name_conditions)
# Combine the regex matcher and the file-name heuristic with Intersect.
part_file_name_matcher = Intersect(
    RegexMatchSpan(rgx=add_rgx, longest_match_only=True),
    part_file_name_lambda_matcher)

# Combine the part matchers with Union; part_rgx_matcher and
# part_dict_matcher are not defined in this excerpt.
part_matcher = Union(part_rgx_matcher, part_dict_matcher,
                     part_file_name_matcher)

# CE Voltage Matcher
# Spelled-out keyword variants and abbreviations, kept as sets.
ce_keywords = {"collector emitter", "collector-emitter", "collector - emitter"}
ce_abbrevs = {"ceo", "vceo"}
# The pattern is one or two digits followed by a 0 or a 5.
ce_v_max_rgx_matcher = RegexMatchSpan(
    rgx=r"\d{1,2}[05]",
    longest_match_only=False,
)

예제 #4
0
from fonduer.candidates.matchers import LambdaFunctionMatcher


def person_name_matcher(mention):
    """Tell whether every NER tag of the mention's sentence is 'PERSON'.

    Args:
        mention: Object exposing ``sentence.ner_tags`` as an iterable of
            hashable tags.

    Returns:
        ``True`` when the sentence has at least one tag and all of its tags
        are ``'PERSON'``; ``False`` otherwise, including when there are no
        tags at all.
    """
    # NOTE(review): the tags inspected belong to the whole sentence, not only
    # to the mention's own tokens -- confirm this is intended.
    return set(mention.sentence.ner_tags) == {'PERSON'}


# Wrap person_name_matcher in a LambdaFunctionMatcher by passing it as the
# matcher's ``func``.
person_name_function = LambdaFunctionMatcher(func=person_name_matcher)


def get_matchers() -> list:
    """Return the module's matchers as a new list.

    The list currently holds only ``person_name_function``.
    """
    return [person_name_function]
예제 #5
0
#sents = session.query(Sentence).all()

from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams
from fonduer.candidates.models import mention_subclass, candidate_subclass
from dataset_utils import LocationMatcher, city_index
from fonduer.candidates.matchers import Union, LambdaFunctionMatcher, Intersect
from emmental_utils import get_posting_html_fast

# Defining ngrams for candidates
extraction_name = 'location'
# n-gram generator configured with n_max=3.
ngrams = MentionNgrams(n_max=3)

# Define matchers
# Geolocation matcher
# Build the city index from cities15000.txt and pass its fast_loc method to
# the matcher as ``func``.
cities = city_index('../utils/data/cities15000.txt')
geo_location_matcher = LambdaFunctionMatcher(func=cities.fast_loc)

# In raw text matcher
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# char_dict.pkl must come from a trusted source.
with open(f"{config['prediction_model_path']}/char_dict.pkl", 'rb') as fl:
    char_dict = pickle.load(fl)
# config, pickle, load_data_from_db and postgres_db_name are not defined in
# this excerpt.
dataset = load_data_from_db(postgres_db_name,
                            config['postgres_location'], {},
                            char_dict=char_dict,
                            clobber_label=True)
# Map each entry's 'uid' to its 'text', reading both from the first element
# of every dataset entry.
text_dict = {a[0]['uid']: a[0]['text'] for a in dataset}


def post_matcher_fun(m):
    term = r"([Ll]ocation:[\w\W]{1,200}</.{0,20}>|\W[cC]ity:[\w\W]{1,200}</.{0,20}>|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)"
    #if m.get_span() in get_posting_html_fast(m.sentence.document.text, term):
    if m.get_span() in text_dict[m.sentence.document.name]: