def en_parsers_speed(self):
    """Run every English extractor over one long sample text via ``check_time``.

    The sample text is read from ``long_parsed_text.txt`` under
    ``lexnlp_test_path``; the geo-entity dictionary is loaded from the
    ``geoentities.csv`` / ``geoaliases.csv`` pair in the geoentities test
    folder. Each extractor is wrapped so that its result is fully
    materialised with ``list(...)`` before ``check_time`` returns.

    NOTE(review): ``check_time`` is defined outside this block; presumably it
    stores the elapsed time for each label in ``times`` -- the final
    assertion only verifies that the ``'get_amounts'`` key is present.
    """
    sample_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
    with codecs.open(sample_path, 'r', encoding='utf-8') as sample_file:
        text = sample_file.read()

    geo_dir = os.path.join(
        lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
    geo_config = list(DictionaryEntry.load_entities_from_files(
        geo_dir + 'geoentities.csv',
        geo_dir + 'geoaliases.csv'))

    # (label, runner) pairs, executed in exactly this order. The extractors
    # are referenced from inside lambdas so that each name is only looked up
    # when its benchmark actually runs.
    benchmarks = (
        ('get_amounts', lambda s: list(get_amounts(s))),
        ('get_acts', lambda s: list(get_acts(s))),
        ('get_citations', lambda s: list(get_citations(s))),
        ('get_conditions', lambda s: list(get_conditions(s))),
        ('get_constraints', lambda s: list(get_constraints(s))),
        ('get_copyright', lambda s: list(get_copyright(s))),
        # the courts extractor is the private ``_get_courts`` helper, but it
        # is reported under the public-looking label
        ('get_courts', lambda s: list(_get_courts(s))),
        ('get_cusip', lambda s: list(get_cusip(s))),
        ('get_dates', lambda s: list(get_dates(s))),
        ('get_definitions', lambda s: list(get_definitions(s))),
        ('get_distances', lambda s: list(get_distances(s))),
        ('get_durations', lambda s: list(get_durations(s))),
        ('get_geoentities', lambda s: list(get_geoentities(s, geo_config))),
        ('get_money', lambda s: list(get_money(s))),
        ('get_percents', lambda s: list(get_percents(s))),
        ('get_pii', lambda s: list(get_pii(s))),
        ('get_ratios', lambda s: list(get_ratios(s))),
        ('get_regulations', lambda s: list(get_regulations(s))),
        ('get_trademarks', lambda s: list(get_trademarks(s))),
        ('get_urls', lambda s: list(get_urls(s))),
    )

    times = {}  # type: Dict[str, float]
    for label, runner in benchmarks:
        self.check_time(text, runner, label, times)

    self.assertTrue('get_amounts' in times)
def get_employer_name(text, return_source=False):
    """Return the first company found in ``text`` when it defines both parties.

    Companies are only looked up when the defined terms of ``text`` contain at
    least one entry of ``TRIGGER_LIST_COMPANY`` and at least one entry of
    ``TRIGGER_LIST_EMPLOYEE`` (compared in lower case).

    :param text: text to inspect
    :param return_source: when truthy, return ``(name, text)`` instead of
        just ``name``
    :return: the parts of the first item yielded by ``get_companies`` joined
        with ``', '``, or ``None`` when the triggers are missing or no
        company was found
    """
    defined_terms = list(get_definitions(text))
    employer_defined = any(
        term.lower() in TRIGGER_LIST_COMPANY for term in defined_terms)
    employee_defined = any(
        term.lower() in TRIGGER_LIST_EMPLOYEE for term in defined_terms)

    employer = None
    if employer_defined and employee_defined:
        found_companies = list(get_companies(text))
        if found_companies:
            # only the first company is reported
            employer = ', '.join(map(str, found_companies[0]))

    return (employer, text) if return_source else employer
def parse(self, text, text_unit_id, _text_unit_lang, **kwargs) -> ParseResults:
    """Count how often each definition term occurs in ``text``.

    :param text: text to run ``definitions.get_definitions`` on
    :param text_unit_id: value stored as ``text_unit_id`` on every result row
    :param _text_unit_lang: unused
    :param kwargs: unused
    :return: ``ParseResults`` wrapping one ``DefinitionUsage`` per distinct
        term, in order of first occurrence; ``None`` when no definition is
        found (kept for backward compatibility despite the annotation)
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # A single pass replaces the former ``found.count(item)`` per unique item
    # (quadratic), and Counter preserves first-occurrence order where
    # iterating a ``set`` gave an arbitrary one.
    occurrences = Counter(definitions.get_definitions(text))
    if not occurrences:
        return None

    return ParseResults({
        DefinitionUsage: [
            DefinitionUsage(text_unit_id=text_unit_id,
                            definition=term,
                            count=count)
            for term, count in occurrences.items()
        ]
    })
def get_employee_name(text, return_source=False):
    """Return the first person in ``text`` who looks like the employee.

    Persons are only considered when one of the defined terms of ``text``
    (lower-cased) is listed in ``TRIGGER_LIST_EMPLOYEE``. A person is skipped
    when its lower-cased string contains any entry of ``FALSE_PEOPLE`` or when
    its cleaned string is contained in the cleaned string of any company
    found in ``text``.

    :param text: text to inspect
    :param return_source: when truthy, return ``(name, text)`` instead of
        just ``name``
    :return: ``str`` of the first qualifying person, or ``None``
    """
    employee_name = None
    defined_terms = list(get_definitions(text))

    if any(term.lower() in TRIGGER_LIST_EMPLOYEE for term in defined_terms):
        candidates = list(get_persons(text))
        organizations = list(get_companies(text))

        for candidate in candidates:
            lowered = str(candidate).lower()
            if any(marker in lowered for marker in FALSE_PEOPLE):
                # known false positive, try the next person
                continue

            matches_company = False
            # Every company is inspected (no early exit), exactly as before.
            for org in organizations:
                if len(org) == 0:
                    continue
                # persons and companies return slightly different values for
                # the same text, so both are standardized before comparing
                name_part, type_part = org[0], org[1]
                if type_part is not None and name_part is not None:
                    company_text = str(clean(name_part) + clean(type_part))
                else:
                    company_text = str(clean(name_part))
                candidate_text = str(clean(candidate))
                # Handles get_companies picking up more surrounding text
                # than get_persons, e.g.:
                #   EMPLOYMENT AGREEMENT WHEREAS, Kensey Nash Corporation,
                #   a Delaware corporation (the “Company”) and Todd M. DeWitt
                #   (the “Executive”) entered into that certain Amended
                #   and Restated Employment Agreement,...
                # Both operands are ``str``, so the containment test also
                # covers plain equality.
                if candidate_text in company_text:
                    matches_company = True

            if not matches_company:
                # take the first person meeting the employee criteria
                employee_name = str(candidate)
                break

    return (employee_name, text) if return_source else employee_name
def _matches_definition_words(self, text: str, text_is_sentence: bool) -> bool:
    """Tell whether ``text`` defines any of the detector's definition words.

    :param text: text (or single sentence) to look for definitions in
    :param text_is_sentence: when true ``get_definitions_in_sentence`` is
        used, otherwise ``get_definitions``
    :return: ``True`` when at least one entry of
        ``self.detector.detector_definition_words`` equals a cleaned
        definition term found in ``text``; ``False`` otherwise, including
        when the detector has no definition words
    :raises Exception: whatever the extractor raised, re-raised unchanged
        with a ``detailed_error`` attribute describing the failing call
    """
    wanted_words = self.detector.detector_definition_words
    if not wanted_words:
        return False

    try:
        raw_terms = get_definitions_in_sentence(text) \
            if text_is_sentence else get_definitions(text)
        # Materialise inside the ``try``: a lazy iterable would otherwise
        # raise outside of it and would never look empty in the check below.
        found_terms = list(raw_terms) if raw_terms is not None else []
    except Exception as e:
        # The extractor name is chosen first; previously the conditional
        # expression swallowed the whole concatenation, so the non-sentence
        # branch lost the detector code and the offending text.
        extractor_name = 'get_definitions_in_sentence' \
            if text_is_sentence else 'get_definitions'
        e.detailed_error = (
            f'{self.get_detector_code()}: error in '
            f'_matches_definition_words("{text}"), '
            f'in {extractor_name}')
        raise

    if not found_terms:
        return False

    cleaned_terms = {self._clean_def_words(term) for term in found_terms}
    return any(word in cleaned_terms for word in wanted_words)
def extract_definitions(self, text=None):
    """Return the definitions found in ``text`` as a list.

    :param text: text to analyse; when falsy (``None`` or empty),
        ``self.text`` is analysed instead
    :return: list of everything yielded by ``lex_definitions.get_definitions``
    """
    source_text = text or self.text
    return [*lex_definitions.get_definitions(source_text)]
def test_overlapping_defs(self):
    """Expect more than 12 definitions in the ``bad_def.txt`` resource."""
    document = load_resource_document(
        'lexnlp/extract/en/tests/test_definitions/bad_def.txt',
        'utf-8')
    definition_count = sum(1 for _ in get_definitions(document))
    self.assertGreater(definition_count, 12)
def test_definitions_simple():
    """Print the definitions extracted from one sample sentence.

    Nothing is asserted; the call only has to complete without raising.
    """
    sentence = '''Visual Networks Operations, Inc., a Delaware corporation with offices at 2092 Gaither Road, Rockville, Maryland 20850("Licensor.") and is made retroactive to December 3, 2002 ("Effective Date").'''
    extracted = [*get_definitions(sentence)]
    print(extracted)