Пример #1
0
    def en_parsers_speed(self):
        """Run every English extractor over one long sample through ``check_time``.

        Reads ``long_parsed_text.txt`` from ``lexnlp_test_path``, loads the
        geo-entity dictionary from the test CSV files, and hands each extractor
        to ``self.check_time`` together with a label and the shared ``times``
        dict. Finally asserts that ``'get_amounts'`` is a key of ``times``
        (presumably filled in by ``check_time`` -- confirm against its code).
        """
        sample_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
        with codecs.open(sample_path, 'r', encoding='utf-8') as sample_file:
            text = sample_file.read()

        geo_dir = os.path.join(lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
        geo_config = list(DictionaryEntry.load_entities_from_files(
            geo_dir + 'geoentities.csv', geo_dir + 'geoaliases.csv'))

        times = {}  # type: Dict[str, float]

        def timed(label, extractor):
            # Every extractor result is wrapped in list() before being returned
            # to check_time, exactly one call per label.
            self.check_time(text, lambda s: list(extractor(s)), label, times)

        timed('get_amounts', get_amounts)
        timed('get_acts', get_acts)
        timed('get_citations', get_citations)
        timed('get_conditions', get_conditions)
        timed('get_constraints', get_constraints)
        timed('get_copyright', get_copyright)
        # The private helper is reported under the public-looking label.
        timed('get_courts', _get_courts)
        timed('get_cusip', get_cusip)
        timed('get_dates', get_dates)
        timed('get_definitions', get_definitions)
        timed('get_distances', get_distances)
        timed('get_durations', get_durations)
        # Geo-entity extraction needs the dictionary loaded above.
        timed('get_geoentities', lambda s: get_geoentities(s, geo_config))
        timed('get_money', get_money)
        timed('get_percents', get_percents)
        timed('get_pii', get_pii)
        timed('get_ratios', get_ratios)
        timed('get_regulations', get_regulations)
        timed('get_trademarks', get_trademarks)
        timed('get_urls', get_urls)

        self.assertTrue('get_amounts' in times)
Пример #2
0
def get_employer_name(text, return_source=False):
    """Return the first company in ``text`` when it defines employer and employee.

    The terms yielded by ``get_definitions`` are lower-cased and checked
    against ``TRIGGER_LIST_COMPANY`` and ``TRIGGER_LIST_EMPLOYEE``. Only when
    both lists are hit is ``get_companies`` run; the elements of its first
    result are stringified and joined with ``', '``.

    Args:
        text: text to analyse.
        return_source: when truthy, return ``(name, text)`` instead of ``name``.

    Returns:
        The joined company string, or ``None`` when either trigger is missing
        or no company is found; paired with ``text`` if ``return_source``.
    """
    defined_terms = list(get_definitions(text))

    has_employer = any(term.lower() in TRIGGER_LIST_COMPANY for term in defined_terms)
    has_employee = any(term.lower() in TRIGGER_LIST_EMPLOYEE for term in defined_terms)

    employer_name = None
    if has_employer and has_employee:
        found_companies = list(get_companies(text))
        if found_companies:
            # Only the first company found is reported.
            employer_name = ', '.join(str(part) for part in found_companies[0])

    return (employer_name, text) if return_source else employer_name
Пример #3
0
 def parse(self, text, text_unit_id, _text_unit_lang,
           **kwargs) -> ParseResults:
     """Build definition-usage records for one text unit.

     Args:
         text: text passed to ``definitions.get_definitions``.
         text_unit_id: identifier stored on every ``DefinitionUsage`` created.
         _text_unit_lang: not used by this method.
         **kwargs: not used by this method.

     Returns:
         ``ParseResults`` wrapping a dict that maps ``DefinitionUsage`` to a
         list holding one instance per distinct definition, with ``count``
         set to the number of times it was found. ``None`` when nothing is
         found.
     """
     # Function-scope import keeps this block self-contained.
     from collections import Counter

     # One counting pass replaces calling list.count() once per unique item,
     # which was quadratic in the number of definitions found.
     occurrences = Counter(definitions.get_definitions(text))
     if not occurrences:
         return None
     return ParseResults({
         DefinitionUsage: [
             DefinitionUsage(text_unit_id=text_unit_id,
                             definition=item,
                             count=count) for item, count in occurrences.items()
         ]
     })
Пример #4
0
def get_employee_name(text, return_source=False):
    """Return the first person in ``text`` that qualifies as the employee.

    Nothing is searched unless a term from ``get_definitions``, lower-cased,
    is in ``TRIGGER_LIST_EMPLOYEE``. A person from ``get_persons`` is skipped
    when its lower-cased string contains any ``FALSE_PEOPLE`` entry, or when
    its cleaned form is contained in the cleaned name of any company from
    ``get_companies``.

    Args:
        text: text to analyse.
        return_source: when truthy, return ``(name, text)`` instead of ``name``.

    Returns:
        ``str(person)`` for the first accepted person, or ``None``; paired
        with ``text`` if ``return_source``.
    """
    defined_terms = list(get_definitions(text))
    employee_name = None

    if any(term.lower() in TRIGGER_LIST_EMPLOYEE for term in defined_terms):
        people = list(get_persons(text))
        organizations = list(get_companies(text))

        for person in people:
            lowered_person = str(person).lower()
            if any(marker in lowered_person for marker in FALSE_PEOPLE):
                continue

            overlaps_company = False
            for org in organizations:
                if len(org) == 0:
                    continue
                # Persons and companies come back in slightly different shapes
                # for the same text, so both are cleaned before comparing.
                if org[1] is None or org[0] is None:
                    org_key = str(clean(org[0]))
                else:
                    org_key = str(clean(org[0]) + clean(org[1]))
                person_key = str(clean(person))
                # Containment (which also covers equality) handles the case
                # where get_companies captures more surrounding text than
                # get_persons, e.g. "... Kensey Nash Corporation, a Delaware
                # corporation (the "Company") and Todd M. DeWitt (the
                # "Executive") entered into ...".
                if person_key in org_key:
                    overlaps_company = True

            if not overlaps_company:
                # First person meeting the employee criteria wins.
                employee_name = str(person)
                break

    return (employee_name, text) if return_source else employee_name
Пример #5
0
    def _matches_definition_words(self, text: str, text_is_sentence: bool) -> bool:
        """Tell whether ``text`` defines one of the detector's definition words.

        Args:
            text: text to scan for definitions.
            text_is_sentence: when true, ``get_definitions_in_sentence`` is
                used; otherwise ``get_definitions``.

        Returns:
            ``True`` if any entry of ``self.detector.detector_definition_words``
            equals a definition found in ``text`` after ``_clean_def_words``;
            ``False`` otherwise, including when the detector has no definition
            words or nothing is found.

        Raises:
            Exception: whatever the extractor raises is re-raised unchanged,
                with a ``detailed_error`` attribute describing the call.
        """
        wanted_words = self.detector.detector_definition_words
        if not wanted_words:
            return False

        extractor_name = 'get_definitions_in_sentence' if text_is_sentence \
            else 'get_definitions'
        try:
            found = get_definitions_in_sentence(text) \
                if text_is_sentence else get_definitions(text)
            # Materialize inside the try: if the extractor returns a lazy
            # iterable (assumption -- confirm), errors raised while iterating
            # it are annotated too.
            terms = list(found or ())
        except Exception as e:
            # Build the full message first and only then pick the extractor
            # name. The former `a + b + c if cond else d` expression dropped
            # the whole prefix whenever text_is_sentence was False.
            e.detailed_error = (
                f'{self.get_detector_code()}: error in '
                f'_matches_definition_words("{text}"), '
                f'in {extractor_name}')
            raise
        if not terms:
            return False

        cleaned_terms = {self._clean_def_words(t) for t in terms}
        return any(w in cleaned_terms for w in wanted_words)
Пример #6
0
 def extract_definitions(self, text=None):
     """Return the definitions found in ``text`` as a list.

     Falls back to ``self.text`` when ``text`` is falsy (``None`` or empty).
     """
     source = text or self.text
     return [*lex_definitions.get_definitions(source)]
Пример #7
0
 def test_overlapping_defs(self):
     """``bad_def.txt`` must yield more than 12 definitions."""
     resource = 'lexnlp/extract/en/tests/test_definitions/bad_def.txt'
     document = load_resource_document(resource, 'utf-8')
     found = [*get_definitions(document)]
     self.assertGreater(len(found), 12)
Пример #8
0
def test_definitions_simple():
    """Run ``get_definitions`` on a short sample and print what it finds.

    NOTE(review): the visible body only prints the result; no assertion is
    made in the lines shown here.
    """
    # The literal spans three lines, so its embedded newlines and leading
    # indentation are part of the text handed to get_definitions.
    sentence = '''Visual Networks Operations, Inc., a Delaware corporation with offices at 2092 Gaither 
                             Road, Rockville, Maryland 20850("Licensor.") and is made retroactive to December 3, 2002 
                             ("Effective Date").'''
    # list() exhausts whatever iterable get_definitions returns before printing.
    definitions = list(get_definitions(sentence))
    print(definitions)