Example #1
    def en_parsers_speed(self):
        file_path = os.path.join(lexnlp_test_path, 'long_parsed_text.txt')
        with codecs.open(file_path, 'r', encoding='utf-8') as fr:
            text = fr.read()

        ge_path = os.path.join(lexnlp_test_path, 'lexnlp/extract/en/tests/test_geoentities/')
        entities_fn = ge_path + 'geoentities.csv'
        aliases_fn = ge_path + 'geoaliases.csv'
        geo_config = list(DictionaryEntry.load_entities_from_files(entities_fn, aliases_fn))

        times = {}  # type: Dict[str, float]
        self.check_time(text, lambda s: list(get_amounts(s)), 'get_amounts', times)
        self.check_time(text, lambda s: list(get_acts(s)), 'get_acts', times)
        self.check_time(text, lambda s: list(get_citations(s)), 'get_citations', times)
        self.check_time(text, lambda s: list(get_conditions(s)), 'get_conditions', times)
        self.check_time(text, lambda s: list(get_constraints(s)), 'get_constraints', times)
        self.check_time(text, lambda s: list(get_copyright(s)), 'get_copyright', times)
        self.check_time(text, lambda s: list(_get_courts(s)), 'get_courts', times)
        self.check_time(text, lambda s: list(get_cusip(s)), 'get_cusip', times)
        self.check_time(text, lambda s: list(get_dates(s)), 'get_dates', times)
        self.check_time(text, lambda s: list(get_definitions(s)), 'get_definitions', times)
        self.check_time(text, lambda s: list(get_distances(s)), 'get_distances', times)
        self.check_time(text, lambda s: list(get_durations(s)), 'get_durations', times)
        self.check_time(text, lambda s: list(get_geoentities(s, geo_config)), 'get_geoentities', times)
        self.check_time(text, lambda s: list(get_money(s)), 'get_money', times)
        self.check_time(text, lambda s: list(get_percents(s)), 'get_percents', times)
        self.check_time(text, lambda s: list(get_pii(s)), 'get_pii', times)
        self.check_time(text, lambda s: list(get_ratios(s)), 'get_ratios', times)
        self.check_time(text, lambda s: list(get_regulations(s)), 'get_regulations', times)
        self.check_time(text, lambda s: list(get_trademarks(s)), 'get_trademarks', times)
        self.check_time(text, lambda s: list(get_urls(s)), 'get_urls', times)

        self.assertTrue('get_amounts' in times)
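
Example #1 relies on a check_time helper that is not included in the snippet. Purely as a minimal sketch of what such a helper might do (the name comes from the call sites above; the standalone signature and the use of time.time() are assumptions, not LexNLP API), it could run one parser over the text and record the elapsed seconds:

import time
from typing import Callable, Dict


def check_time(text: str, func: Callable[[str], list], name: str,
               times: Dict[str, float]) -> None:
    # Hypothetical helper: time a single extraction pass and store the result.
    start = time.time()
    func(text)
    times[name] = time.time() - start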
Example #2
    def test_parse_commission(self):
        text = """
Pursuant to section 10(d) of the Federal Advisory Committee Act, as amended, notice is hereby given of the following meetings.
The meetings will be closed to the public in accordance with the provisions set forth in sections 552b(c)(4) and 552b(c)(6), Title 5 U.S.C., as amended.
The grant applications and the discussions could disclose confidential trade secrets or commercial property such as patentable material,
and personal information concerning individuals associated with the grant applications, the disclosure of which would constitute a clearly unwarranted invasion of personal privacy.
Name of Committee: Center for Scientific Review Special Emphasis Panel; Small Business: Cancer Biotherapeutics Development.
"""
        ret = list(get_regulations(text))
        self.assertEqual(0, len(ret))
Example #3
    def test_regulations(self):
        text = 'test 123 U.S.C § 456, code'
        rs = list(get_regulations(text))
        self.assertEqual(1, len(rs))
        self.assertEqual('United States Code', rs[0][0])
        self.assertEqual('123 USC § 456', rs[0][1])

        rs = list(get_regulations(text, as_dict=True))
        self.assertEqual(1, len(rs))
        self.assertEqual('United States Code', rs[0]['regulation_type'])
        self.assertEqual('123 USC § 456', rs[0]['regulation_code'])

        ants = list(get_regulation_annotations(text))
        self.assertEqual(1, len(ants))
        self.assertEqual('en', ants[0].locale)
        self.assertEqual('123 USC § 456', ants[0].name)
        self.assertEqual('United States Code', ants[0].source)

        start = text.find('123')
        self.assertGreater(ants[0].coords[1], ants[0].coords[0])
        self.assertEqual((start, ants[0].coords[1]), ants[0].coords)
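
Example #3 pins down the shapes that get_regulations can return: (regulation_type, regulation_code) tuples by default, dicts keyed by regulation_type and regulation_code when as_dict=True, and annotation objects from get_regulation_annotations. A minimal standalone sketch, assuming the import path lexnlp.extract.en.regulations suggested by the test layout in Example #1:

# Minimal sketch; the import path is inferred from the test paths above.
from lexnlp.extract.en.regulations import get_regulations

text = 'test 123 U.S.C § 456, code'

# Default output: (regulation_type, regulation_code) tuples.
for reg in get_regulations(text):
    print(reg[0], '->', reg[1])  # e.g. United States Code -> 123 USC § 456

# Dict output, matching the as_dict=True branch of Example #3.
for item in get_regulations(text, as_dict=True):
    print(item['regulation_type'], item['regulation_code'])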
Example #4
    def parse(self, text, text_unit_id, _text_unit_lang,
              **kwargs) -> ParseResults:
        # Extract every regulation reference from the text unit.
        found = list(regulations.get_regulations(text))
        if found:
            # set() removes duplicates; found.count() keeps per-regulation counts.
            unique = set(found)
            return ParseResults({
                RegulationUsage: [
                    RegulationUsage(text_unit_id=text_unit_id,
                                    regulation_type=item[0],
                                    regulation_name=item[1],
                                    count=found.count(item)) for item in unique
                ]
            })
        # No explicit return: the method falls through to None when nothing is found.
Example #5
    def extract_features(self,
                         ex_words: List[str],
                         add_to_indexer: bool = False) -> Counter:
        stop_words = set(EN_STOPWORDS)
        regulations = list(lexnlp.get_regulations(" ".join(ex_words)))

        base_filtered = [
            w for w in ex_words
            if w not in stop_words and not any(i.isdigit() for i in w)
        ]
        filtered = []

        for item in base_filtered:
            filtered.append(item)
            if add_to_indexer:
                self.indexer.add_and_get_index(item)

        for item in regulations:
            reg = item[1]
            filtered.append(reg)
            if add_to_indexer:
                self.indexer.add_and_get_index(reg)

        return Counter(filtered)
Example #6
    def getRegulations(self):
        mem = []
        regulations = list(get_regulations(self.bill_text))
        for reg in regulations:
            mem.append(str(reg[1]))
        self.bill.info['regulations'] = mem