def test_save_soup(self):
    soup = create_soup(self.source_filename)
    save_soup(soup, self.xml_filename)
    soup = create_soup(self.xml_filename)
    self.assertEqual(soup.book.attrs["heading"], "X")
    with self.assertRaises(OSError):
        save_soup(soup, 100)

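# Hypothetical fixture for test_save_soup above: self.source_filename must point
# to an XML file whose <book> element carries heading="X", and self.xml_filename
# is a temporary output path. Names and file contents here are illustrative only;
# the actual test setup may differ.
def setUp(self):
    self.source_filename = "temp_source.xml"
    self.xml_filename = "temp_output.xml"
    with open(self.source_filename, "w", encoding="utf8") as f:
        f.write('<book heading="X"></book>')
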
def find_references(decision):
    try:
        logs = []
        areas_exists = os.path.exists(f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}")
        parsed_exists = os.path.exists(
            f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}"
        )
        if not (areas_exists and parsed_exists):
            # General preparation
            with open(f"{DE_DECISIONS_HIERARCHY}/{decision}", encoding="utf8") as f:
                file_content = f.read()
            file_content = file_content.replace(
                "<!DOCTYPE dokument SYSTEM "
                '"http://www.rechtsprechung-im-internet.de/dtd/v1/rii-dok.dtd">',
                "",
            )
            soup = BeautifulSoup(file_content, "lxml-xml")

            # Get the Entscheidungsdatum (decision date)
            date = get_lawnames_date(soup.document.attrs["datum"])

            # Get laws in effect at the time of the decision
            laws_lookup = get_stemmed_law_names(date, law_names)
            parser = StatutesParser(laws_lookup)
            extractor = StatutesExtractor(laws_lookup)

            if not areas_exists:
                # Set para and art to 0 so that references that only name a law
                # are ignored.
                logs.append(
                    find_references_in_soup(
                        soup,
                        extractor,
                        para=0,
                        art=0,
                        text_tag_name=["text", "norm"],
                    )
                )
                save_soup(soup, f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}")

            if not parsed_exists:
                with open(
                    f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}", encoding="utf8"
                ) as f:
                    soup = BeautifulSoup(f.read(), "lxml-xml")
                parse_reference_content_in_soup(soup, parser, decision)
                identify_reference_law_name_in_soup(
                    soup, parser, current_lawid=None, skip_errors=True
                )
                identify_lawreference_law_name_in_soup(soup, laws_lookup)
                save_soup(soup, f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}")

    except Exception:
        print("-----", decision, "-----")
        the_type, the_value, the_traceback = sys.exc_info()
        traceback.print_exception(the_type, the_value, the_traceback)
        raise

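# Hypothetical driver for find_references above. It assumes the decision files
# are the entries of DE_DECISIONS_HIERARCHY and that law_names is available at
# module level; the actual pipeline may iterate or parallelize differently.
def find_references_for_all_decisions():
    for decision in sorted(os.listdir(DE_DECISIONS_HIERARCHY)):
        find_references(decision)
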
def test_save_soup_failed(self):
    class TestException(Exception):
        pass

    class FailingTestObj:
        def __str__(self):
            raise TestException()

    with self.assertRaises(TestException):
        save_soup(FailingTestObj(), "temp2.xml")
    self.assertFalse(os.path.exists("temp2.xml"))

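# Minimal sketch of the two helpers exercised by the tests above, assuming they
# are thin wrappers around BeautifulSoup; the real implementations may differ.
from bs4 import BeautifulSoup


def create_soup(path):
    # Read an XML file and parse it with the lxml XML parser.
    with open(path, encoding="utf8") as f:
        return BeautifulSoup(f.read(), "lxml-xml")


def save_soup(soup, path):
    # Serialize before opening the target file so that a failing __str__
    # (see test_save_soup_failed) does not leave a partially written file.
    content = str(soup)
    with open(path, "w", encoding="utf8") as f:
        f.write(content)
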
def execute_item(self, item):
    src = US_REG_XML_PATH if self.regulations else US_XML_PATH
    dest = (
        US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH
    )
    soup = create_soup(f"{src}/{item}")
    logs = find_references(soup, usc_pattern, {"pattern": "block"})
    logs += find_references(soup, inline_pattern, {"pattern": "inline"})
    if self.regulations:
        logs += find_authority_references(soup, usc_pattern)
    save_soup(soup, f"{dest}/{item}")
    return logs

def clean_decision(decision):
    if not os.path.exists(f"{DE_DECISIONS_XML}/{decision}"):
        with open(f"{DE_DECISIONS_DOWNLOAD_XML}/{decision}", encoding="utf8") as f:
            content = f.read()
        content = content.replace("\xa0", " ")
        soup = BeautifulSoup(content, "lxml-xml")
        for doc_parts in get_docparts_with_p(soup):
            contents = clean_abs(doc_parts)
            replace_tag_with_content(doc_parts, contents, soup)
        soup_str = fix_data(decision, str(soup))
        save_soup(soup_str, f"{DE_DECISIONS_XML}/{decision}")

def execute_item(self, item):
    from statutes_pipeline_steps.us_reference_reg import parse_authority_references

    src = (
        US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH
    )
    dest = (
        US_REG_REFERENCE_PARSED_PATH if self.regulations else US_REFERENCE_PARSED_PATH
    )
    soup = create_soup(f"{src}/{item}")
    this_title = self.get_title_from_filename(item)
    try:
        logs = parse_references(soup, this_title, this_usc=not self.regulations)
        logs += parse_authority_references(soup)
    except Exception:
        print(item)
        raise
    save_soup(soup, f"{dest}/{item}")
    return logs

def execute_item(self, item):
    src = (
        DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH
    )
    dest = (
        DE_REG_REFERENCE_PARSED_PATH if self.regulations else DE_REFERENCE_PARSED_PATH
    )
    laws_lookup = get_stemmed_law_names_for_filename(item, self.law_names)
    parser = StatutesParser(laws_lookup)

    logs = list()  # for debug
    logs.append(f"Start file - {item}")

    soup = create_soup(f"{src}/{item}")
    parse_reference_content_in_soup(soup, parser, debug_context=item)
    current_lawid = soup.document.attrs["key"].split("_")[1]
    identify_reference_law_name_in_soup(soup, parser, current_lawid)
    identify_lawreference_law_name_in_soup(soup, laws_lookup)
    identify_reference_in_juris_vso_list(soup, parser)
    save_soup(soup, f"{dest}/{item}")
    return logs

def simplify_gii_xml(source, destination):
    soup = create_soup(source)
    simplify(soup)
    save_soup(soup, destination)