def test_estimate_dense_text(self):
    """A malformed PDF extract should score high extra-line-break probability;
    a pre-stripped version of the same text should score low."""
    estimator = ParsedTextQualityEstimator()

    malformed_text = load_resource_document(
        'lexnlp/utils/parsing/pdf_malformat_parsed_default.txt', 'utf-8')
    estimate = estimator.estimate_text(malformed_text)
    self.assertGreater(estimate.extra_line_breaks_prob, 50)

    stripped_text = load_resource_document(
        'lexnlp/utils/parsing/pdf_malformat_parsed_stripper.txt', 'utf-8')
    estimate = estimator.estimate_text(stripped_text)
    self.assertLess(estimate.extra_line_breaks_prob, 30)
def test_definitions_in_sentences_text(self):
    """Definition count for the sentence-level sample must fall in (16, 25)."""
    sample = load_resource_document(
        'lexnlp/extract/en/tests/test_definitions/test_definition_in_sentences.csv',
        'utf-8')
    found = list(get_definition_annotations(sample))
    self.assertGreater(len(found), 16)
    self.assertLess(len(found), 25)
def test_estimate_text_abusing_headers(self):
    """After pre-processing, header-heavy text should not look like it has
    excessive extra line breaks."""
    raw = load_resource_document(
        'lexnlp/utils/parsing/text_abusing_headers.txt', 'utf-8')
    cleaned = pre_process_document(raw)
    estimate = ParsedTextQualityEstimator().estimate_text(cleaned)
    self.assertLess(estimate.extra_line_breaks_prob, 50)
def test_parse_large_text(self):
    """Parsing a large ES regulations sample yields many matches; the
    annotated HTML is saved for manual inspection."""
    sample = load_resource_document(
        'lexnlp/extract/es/sample_es_regulations.txt', 'utf-8')
    matches = parser.parse(sample)
    self.assertGreater(len(matches), 100)
    rendered = annotate_text(sample, matches)
    save_test_document('sample_es_regulations.html', rendered)
def test_definitions_sample_doc(self):
    """Smoke-check definition extraction on two samples; annotated HTML is
    written out for manual review."""
    sample = load_resource_document(
        'lexnlp/extract/en/definitions/en_definitions_sample_doc.txt', 'utf-8')
    found = self.parse(sample)
    # NOTE(review): original comment suggests the threshold was once 10,
    # later relaxed to 2.
    self.assertGreater(len(found), 2)
    self.annotate_document(sample, found,
                           'output/en_definitions_sample_doc.html')

    # Second sample: one definition per blank-line-separated paragraph,
    # so the match count must exceed the paragraph count.
    sample = load_resource_document(
        'lexnlp/extract/en/definitions/pure_definitions.txt', 'utf-8')
    paragraphs = sample.count('\n\n') + 1
    found = self.parse(sample)
    self.assertGreater(len(found), paragraphs)
    self.annotate_document(sample, found, 'output/pure_definitions.html')
def test_parse_de_definitions_simple(self):
    """EULA sample should produce more than four definitions.

    NOTE(review): the method name says "de" but it builds the *Spanish*
    definitions parser and writes an es_ output file — confirm which
    language this was meant to cover.
    """
    es_parser = make_es_definitions_parser()
    sample = load_resource_document(
        'lexnlp/extract/es/definitions/eula.txt', 'utf-8')
    found = es_parser.parse(sample)
    self.assertGreater(len(found), 4)
    annotate_definitions_text(sample, found, 'output/es_definitions_01.html')
def test_load_courts_with_toponims(self):
    """Two courts are expected; the first must carry a Federal jurisdiction tag."""
    sample = load_resource_document(
        'lexnlp/extract/de/sample_de_courts02.txt', 'utf-8')
    courts = list(get_courts(sample))
    self.assertEqual(2, len(courts))
    first_jurisdiction = courts[0]["tags"]["Extracted Entity Court Jurisdiction"]
    self.assertEqual("Federal", first_jurisdiction)
def test_hit_or_miss_samples(self):
    """At least one definition must be found in the hit-or-miss sample."""
    sample = load_resource_document(
        'lexnlp/extract/en/definitions/definitions_hit_or_miss.txt', 'utf-8')
    found = self.parse(sample)
    self.assertGreater(len(found), 0)
    self.annotate_document(sample, found,
                           'output/definitions_hit_or_miss.html')
def test_legacy_parse_court_annotations(self):
    """Custom court-annotation parsing with the EN court config finds three
    records, each typed 'court'."""
    config = self.load_en_courts()
    sample = load_resource_document(
        'lexnlp/extract/en/courts/courts_sample_01.txt', 'utf-8')
    annotations = list(get_court_annotations_custom('en', sample, config))
    self.assertEqual(3, len(annotations))
    self.assertEqual('court', annotations[0].record_type)
def process_big_document_with_false_positives(self):
    """Parse a large document known to trigger false positives.

    NOTE(review): the name lacks the ``test_`` prefix, so unittest discovery
    never runs this — confirm whether it was deliberately disabled or should
    be renamed to ``test_process_big_document_with_false_positives``.
    """
    sample = load_resource_document(
        'lexnlp/extract/en/definitions/definitions_fp_collections.txt', 'utf-8')
    found = self.parse(sample)
    self.assertGreater(len(found), 0)
    self.annotate_document(sample, found,
                           'output/definitions_fp_collections.html')
def test_parse_de_definitions_simple(self):
    """More than five DE definitions expected; the first must be named
    'Diensteanbieter' and its coords must bracket the name in the text."""
    sample = load_resource_document(
        'lexnlp/extract/de/sample_de_definitions01.txt', 'utf-8')
    found = get_definition_list(sample)
    self.assertGreater(len(found), 5)

    first = found[0]
    begin, finish = first.coords[0], first.coords[1]
    self.assertTrue("Diensteanbieter" in first.name)
    # The slice addressed by coords must actually contain the definition name.
    self.assertTrue(first.name in sample[begin:finish])

    annotate_definitions_text(sample, found, 'output/de_definitions_01.html')
def test_check_match_attrs(self):
    """Every match dictionary must carry sane start/end offsets and a
    non-empty entity-type tag; offsets must index into the source text."""
    en_parser = self.make_en_parser()
    sample = load_resource_document(
        'lexnlp/extract/en/courts/courts_sample_01.txt', 'utf-8')
    matches = en_parser.parse(sample)
    self.assertEqual(4, len(matches))

    for record in (m.to_dictionary() for m in matches):
        attrs = record["attrs"]
        self.assertGreater(attrs["end"], attrs["start"])
        self.assertGreater(attrs["end"], 0)
        self.assertGreater(len(record["tags"]["Extracted Entity Type"]), 0)
        # Slicing must not raise — offsets are valid for the source text.
        _ = sample[attrs["start"]:attrs["end"]]
def test_definition_fixed(self):
    """Definition count for the fixed sample must fall in (12, 25) and every
    definition name must be non-empty after stripping wrapper punctuation.

    Fix: the original repeated the exact same ``df.name.strip(...)`` pair a
    second time — a copy of the dict-based variant of this test, which checks
    a *second* field ('Extracted Entity Text') there. The annotation objects
    here expose no such second field in this test, so the redundant duplicate
    is removed; observable behavior is unchanged.
    """
    sample = load_resource_document(
        'lexnlp/extract/en/tests/test_definitions/test_definition_fixed.csv',
        'utf-8')
    found = list(get_definition_annotations(sample))
    self.assertGreater(len(found), 12)
    self.assertLess(len(found), 25)
    for df in found:
        name = df.name.strip('''"[]'{}.\t ''')
        self.assertGreater(len(name), 0)
def test_definition_fixed(self):
    """Definition count for the fixed sample must fall in (12, 25); both the
    definition-name tag and the entity-text tag must be non-empty once
    wrapper punctuation is stripped."""
    sample = load_resource_document(
        'lexnlp/extract/en/tests/test_definitions/test_definition_fixed.csv',
        'utf-8')
    found = self.parse(sample)
    self.assertGreater(len(found), 12)
    self.assertLess(len(found), 25)

    junk = '''"[]'{}.\t '''
    for record in found:
        tags = record["tags"]
        self.assertGreater(
            len(tags["Extracted Entity Definition Name"].strip(junk)), 0)
        self.assertGreater(
            len(tags["Extracted Entity Text"].strip(junk)), 0)
def test_compare_to_legacy_parser(self):
    """New parser finds 4 courts where the legacy function finds 3 on the
    same sample.

    Fix: removed dead timing scaffolding — the original measured
    ``time.time()`` deltas into throwaway ``_``/``__`` locals and never
    used them; the assertions are unaffected.
    """
    en_parser = self.make_en_parser()
    sample = load_resource_document(
        'lexnlp/extract/en/courts/courts_sample_01.txt', 'utf-8')

    new_matches = en_parser.parse(sample)
    self.assertEqual(4, len(new_matches))

    legacy_matches = list(self.parse_courts_legacy_function(sample))
    self.assertEqual(3, len(legacy_matches))
def test_overlapping_defs(self):
    """The known-bad sample must still yield more than a dozen definitions."""
    sample = load_resource_document(
        'lexnlp/extract/en/tests/test_definitions/bad_def.txt', 'utf-8')
    found = list(get_definitions(sample))
    self.assertGreater(len(found), 12)
def test_estimate_dense_text(self):
    """Correcting line breaks in a malformed PDF extract must shorten it.

    NOTE(review): despite the name, this exercises ``ParsedTextCorrector``,
    not the estimator — consider renaming (kept as-is: discovery interface).
    """
    sample = load_resource_document(
        'lexnlp/utils/parsing/pdf_malformat_parsed_default.txt', 'utf-8')
    corrected = ParsedTextCorrector().correct_line_breaks(sample)
    self.assertLess(len(corrected), len(sample))
def test_long_doc(self):
    """Two court citations expected; the requested locale must be echoed back
    on the extracted items."""
    sample = load_resource_document(
        'lexnlp/extract/de/sample_de_court_citations01.txt', 'utf-8')
    citations = get_court_citation_list(sample, "xz")
    self.assertEqual(2, len(citations))
    self.assertEqual("xz", citations[0].locale)
def test_load_courts(self):
    """Four courts expected; the requested locale must be echoed back on the
    extracted items."""
    sample = load_resource_document(
        'lexnlp/extract/de/sample_de_courts01.txt', 'utf-8')
    courts = get_court_list(sample, "y")
    self.assertEqual(4, len(courts))
    self.assertEqual("y", courts[0].locale)