def test_mention_longest_match():
    """Test longest match filtering in mention extraction.

    Parses a short Lincoln HTML document, then extracts Place mentions
    twice -- once with ``longest_match_only=False`` and once with
    ``True`` -- and checks that the flag collapses overlapping spans
    (e.g. "Farm" is dropped in favor of "Sinking Spring Farm").
    """
    # SpaCy on mac has issue on parallel parsing
    PARALLEL = 1
    max_docs = 1
    session = Meta.init(CONN_STRING).Session()
    docs_path = "tests/data/pure_html/lincoln_short.html"

    # Parsing
    logger.info("Parsing...")
    doc_preprocessor = HTMLDocPreprocessor(docs_path, max_docs=max_docs)
    corpus_parser = Parser(session, structural=True, lingual=True)
    corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)
    docs = session.query(Document).order_by(Document.name).all()

    # Mention Extraction
    name_ngrams = MentionNgramsPart(n_max=3)
    place_ngrams = MentionNgramsTemp(n_max=4)
    Name = mention_subclass("Name")
    Place = mention_subclass("Place")

    def is_birthplace_table_row(mention):
        # Keep only tabular mentions whose table row contains "birth_place".
        if not mention.sentence.is_tabular():
            return False
        ngrams = get_row_ngrams(mention, lower=True)
        return "birth_place" in ngrams

    def extract_place_spans(longest_match_only):
        # Run the full extraction pipeline with the given flag and
        # return the resulting Place spans (shared by both assertions below).
        birthplace_matcher = LambdaFunctionMatcher(
            func=is_birthplace_table_row,
            longest_match_only=longest_match_only,
        )
        mention_extractor = MentionExtractor(
            session,
            [Name, Place],
            [name_ngrams, place_ngrams],
            [PersonMatcher(), birthplace_matcher],
        )
        mention_extractor.apply(docs, parallelism=PARALLEL)
        return [x.context.get_span() for x in session.query(Place).all()]

    # Without longest-match filtering, overlapping sub-spans survive.
    mention_spans = extract_place_spans(longest_match_only=False)
    assert "Sinking Spring Farm" in mention_spans
    assert "Farm" in mention_spans
    assert len(mention_spans) == 23

    # With longest-match filtering, only the maximal spans remain.
    mention_spans = extract_place_spans(longest_match_only=True)
    assert "Sinking Spring Farm" in mention_spans
    assert "Farm" not in mention_spans
    assert len(mention_spans) == 4
Presidentname = mention_subclass("Presidentname")
Placeofbirth = mention_subclass("Placeofbirth")


def mention_span_matches_file_name(mention):
    """Return True iff the mention's span equals the document's name.

    The document name uses underscores where the span has spaces, so the
    name is normalized before the comparison.
    """
    president_name_string = mention.get_span()
    file_name = mention.sentence.document.name.replace("_", " ")
    return president_name_string == file_name


from fonduer.candidates.matchers import LambdaFunctionMatcher, Intersect, Union

president_name_matcher = LambdaFunctionMatcher(func=mention_span_matches_file_name)

from fonduer.utils.data_model_utils import get_row_ngrams


def is_in_birthplace_table_row(mention):
    """Return True iff the mention sits in a table row about birth place.

    Non-tabular mentions are rejected outright; otherwise the row's
    lowercased n-grams must contain both "birth" and "place".
    """
    if not mention.sentence.is_tabular():
        return False
    ngrams = get_row_ngrams(mention, lower=True)
    birth_place_words = {"birth", "place"}
    # Subset test: every keyword must appear somewhere in the row.
    return birth_place_words <= set(ngrams)
def part_file_name_conditions(attr):
    """Return True iff the span looks like a part number from the file name.

    Requires a two-component "<prefix>_<part>" document name and a span
    that does not start with "-"; after stripping hyphens, the span must
    mix digits and letters and nearly share a prefix (difference <= 2)
    with the file name's part component.
    """
    # Hoisted: the split is needed for both the arity check and the
    # prefix comparison below.
    name_parts = attr.sentence.document.name.split("_")
    if len(name_parts) != 2:
        return False
    span = attr.get_span()
    # startswith is safe on an empty span (indexing [0] would raise).
    if span.startswith("-"):
        return False
    name = span.replace("-", "")
    return (
        any(char.isdigit() for char in name)
        and any(char.isalpha() for char in name)
        and common_prefix_length_diff(name_parts[1], name) <= 2
    )


# Spans of 5-15 uppercase letters, digits, or hyphens.
add_rgx = r"^[A-Z0-9\-]{5,15}$"
part_file_name_lambda_matcher = LambdaFunctionMatcher(func=part_file_name_conditions)
# A part-number-from-file-name mention must satisfy both the regex and
# the file-name heuristic above.
part_file_name_matcher = Intersect(
    RegexMatchSpan(rgx=add_rgx, longest_match_only=True),
    part_file_name_lambda_matcher,
)
part_matcher = Union(part_rgx_matcher, part_dict_matcher, part_file_name_matcher)

# CE Voltage Matcher
ce_keywords = {"collector emitter", "collector-emitter", "collector - emitter"}
ce_abbrevs = {"ceo", "vceo"}
# One- or two-digit values ending in 0 or 5 (e.g. 10, 25, 40).
ce_v_max_rgx_matcher = RegexMatchSpan(rgx=r"\d{1,2}[05]", longest_match_only=False)
from fonduer.candidates.matchers import LambdaFunctionMatcher


def person_name_matcher(mention):
    """Return True iff every token in the mention is NER-tagged PERSON.

    The mention's sentence must carry exactly one distinct NER tag, and
    that tag must be 'PERSON'.
    """
    mention_set = set(mention.sentence.ner_tags)
    return len(mention_set) == 1 and 'PERSON' in mention_set


person_name_function = LambdaFunctionMatcher(func=person_name_matcher)


def get_matchers():
    """Return the list of matchers used for person-name extraction."""
    return [person_name_function]
#sents = session.query(Sentence).all() from fonduer.candidates import CandidateExtractor, MentionExtractor, MentionNgrams from fonduer.candidates.models import mention_subclass, candidate_subclass from dataset_utils import LocationMatcher, city_index from fonduer.candidates.matchers import Union, LambdaFunctionMatcher, Intersect from emmental_utils import get_posting_html_fast # Defining ngrams for candidates extraction_name = 'location' ngrams = MentionNgrams(n_max=3) # Define matchers # Geolocation matcher cities = city_index('../utils/data/cities15000.txt') geo_location_matcher = LambdaFunctionMatcher(func=cities.fast_loc) # In raw text matcher with open(f"{config['prediction_model_path']}/char_dict.pkl", 'rb') as fl: char_dict = pickle.load(fl) dataset = load_data_from_db(postgres_db_name, config['postgres_location'], {}, char_dict=char_dict, clobber_label=True) text_dict = {a[0]['uid']: a[0]['text'] for a in dataset} def post_matcher_fun(m): term = r"([Ll]ocation:[\w\W]{1,200}</.{0,20}>|\W[cC]ity:[\w\W]{1,200}</.{0,20}>|\d\dyo\W|\d\d.{0,10}\Wyo\W|\d\d.{0,10}\Wold\W|\d\d.{0,10}\Wyoung\W|\Wage\W.{0,10}\d\d)" #if m.get_span() in get_posting_html_fast(m.sentence.document.text, term): if m.get_span() in text_dict[m.sentence.document.name]: