Exemplo n.º 1
0
 def test_save_soup(self):
     """Save a soup, reload it, and check that an invalid target raises OSError."""
     original = create_soup(self.source_filename)
     save_soup(original, self.xml_filename)

     # Read the freshly written file back and inspect the <book> heading.
     reloaded = create_soup(self.xml_filename)
     heading = reloaded.book.attrs["heading"]
     self.assertEqual(heading, "X")

     # Passing the integer 100 as the target is expected to raise OSError.
     self.assertRaises(OSError, save_soup, reloaded, 100)
Exemplo n.º 2
0
def find_references(decision):
    """Locate and parse references for one decision file.

    Two output files are produced, each only when it does not exist yet:
    one under ``DE_DECISIONS_REFERENCE_AREAS`` and one under
    ``DE_DECISIONS_REFERENCE_PARSED_XML``. The input is read from
    ``DE_DECISIONS_HIERARCHY``.

    Any exception is reported (decision name plus traceback) and re-raised.
    The function returns ``None``.
    """
    try:
        # NOTE(review): the log entries collected here are never returned.
        logs = []
        areas_path = f"{DE_DECISIONS_REFERENCE_AREAS}/{decision}"
        parsed_path = f"{DE_DECISIONS_REFERENCE_PARSED_XML}/{decision}"
        needs_areas = not os.path.exists(areas_path)
        needs_parsed = not os.path.exists(parsed_path)

        if needs_areas or needs_parsed:
            # Shared preparation for both steps.
            with open(f"{DE_DECISIONS_HIERARCHY}/{decision}", encoding="utf8") as f:
                raw_xml = f.read()

            # Strip the DOCTYPE declaration before handing the text to the parser.
            doctype_decl = (
                "<!DOCTYPE dokument SYSTEM "
                '"http://www.rechtsprechung-im-internet.de/dtd/v1/rii-dok.dtd">'
            )
            soup = BeautifulSoup(raw_xml.replace(doctype_decl, ""), "lxml-xml")

            # Decision date, read from the "datum" attribute of <document>.
            date = get_lawnames_date(soup.document.attrs["datum"])

            # Law names for that date drive both the parser and the extractor.
            laws_lookup = get_stemmed_law_names(date, law_names)
            parser = StatutesParser(laws_lookup)
            extractor = StatutesExtractor(laws_lookup)

        if needs_areas:
            # para and art are set to 0; per the original author's note this
            # makes references "with naming a law" be ignored.
            # NOTE(review): the note may have meant "without" -- confirm.
            area_logs = find_references_in_soup(
                soup,
                extractor,
                para=0,
                art=0,
                text_tag_name=["text", "norm"],
            )
            logs.append(area_logs)
            save_soup(soup, areas_path)

        if needs_parsed:
            # Always start from the saved reference-areas file.
            with open(areas_path, encoding="utf8") as f:
                soup = BeautifulSoup(f.read(), "lxml-xml")

            parse_reference_content_in_soup(soup, parser, decision)
            identify_reference_law_name_in_soup(
                soup, parser, current_lawid=None, skip_errors=True
            )
            identify_lawreference_law_name_in_soup(soup, laws_lookup)

            save_soup(soup, parsed_path)
    except Exception:
        # Name the failing decision next to the traceback, then propagate.
        print("-----", decision, "-----")
        traceback.print_exception(*sys.exc_info())
        raise
Exemplo n.º 3
0
    def test_save_soup_failed(self):
        """Check that a failing ``__str__`` propagates and leaves no temp2.xml.

        The object passed to ``save_soup`` raises from ``__str__``. The test
        expects that exception to reach the caller and ``temp2.xml`` not to
        exist afterwards.
        """

        class StringifyError(Exception):
            pass

        class Unprintable:
            def __str__(self):
                raise StringifyError()

        target = "temp2.xml"
        self.assertRaises(StringifyError, save_soup, Unprintable(), target)

        self.assertFalse(os.path.exists(target))
Exemplo n.º 4
0
    def execute_item(self, item):
        """Mark reference areas in one file and save the result.

        Source and destination directories depend on ``self.regulations``.
        Returns the combined logs of all reference searches.
        """
        if self.regulations:
            src, dest = US_REG_XML_PATH, US_REG_REFERENCE_AREAS_PATH
        else:
            src, dest = US_XML_PATH, US_REFERENCE_AREAS_PATH

        document = create_soup(f"{src}/{item}")

        # Two passes over the same soup: block pattern first, then inline.
        logs = find_references(document, usc_pattern, {"pattern": "block"})
        logs += find_references(document, inline_pattern, {"pattern": "inline"})

        # Authority references are only searched for regulations.
        if self.regulations:
            logs += find_authority_references(document, usc_pattern)

        save_soup(document, f"{dest}/{item}")
        return logs
Exemplo n.º 5
0
def clean_decision(decision):
    """Clean one downloaded decision and store it under ``DE_DECISIONS_XML``.

    Does nothing when the target file already exists.
    """
    target_path = f"{DE_DECISIONS_XML}/{decision}"
    if os.path.exists(target_path):
        return

    with open(f"{DE_DECISIONS_DOWNLOAD_XML}/{decision}", encoding="utf8") as f:
        raw = f.read()

    # Replace non-breaking spaces with plain spaces before parsing.
    soup = BeautifulSoup(raw.replace("\xa0", " "), "lxml-xml")

    for doc_part in get_docparts_with_p(soup):
        cleaned = clean_abs(doc_part)
        replace_tag_with_content(doc_part, cleaned, soup)

    # fix_data works on the serialized document, so a string is saved.
    fixed_xml = fix_data(decision, str(soup))
    save_soup(fixed_xml, target_path)
    def execute_item(self, item):
        """Parse the references of one file and save the result.

        Source and destination directories depend on ``self.regulations``.
        If parsing raises, the item name is printed and the exception is
        re-raised. Returns the combined parser logs.
        """
        from statutes_pipeline_steps.us_reference_reg import parse_authority_references

        if self.regulations:
            src = US_REG_REFERENCE_AREAS_PATH
            dest = US_REG_REFERENCE_PARSED_PATH
        else:
            src = US_REFERENCE_AREAS_PATH
            dest = US_REFERENCE_PARSED_PATH

        document = create_soup(f"{src}/{item}")
        title = self.get_title_from_filename(item)

        try:
            logs = parse_references(document, title, this_usc=not self.regulations)
            logs += parse_authority_references(document)
        except Exception:
            # Identify the failing file before the error propagates.
            print(item)
            raise
        else:
            save_soup(document, f"{dest}/{item}")
            return logs
Exemplo n.º 7
0
    def execute_item(self, item):
        """Parse the references of one file and save the result.

        Source and destination directories depend on ``self.regulations``.
        Returns a list holding a single "Start file" debug entry.
        """
        if self.regulations:
            src, dest = DE_REG_REFERENCE_AREAS_PATH, DE_REG_REFERENCE_PARSED_PATH
        else:
            src, dest = DE_REFERENCE_AREAS_PATH, DE_REFERENCE_PARSED_PATH

        laws_lookup = get_stemmed_law_names_for_filename(item, self.law_names)
        parser = StatutesParser(laws_lookup)

        # Debug marker; nothing else is added to the logs in this method.
        logs = [f"Start file - {item}"]

        document = create_soup(f"{src}/{item}")
        parse_reference_content_in_soup(document, parser, debug_context=item)

        # The law id is the second "_"-separated part of the document key.
        document_key = document.document.attrs["key"]
        current_lawid = document_key.split("_")[1]

        identify_reference_law_name_in_soup(document, parser, current_lawid)
        identify_lawreference_law_name_in_soup(document, laws_lookup)
        identify_reference_in_juris_vso_list(document, parser)

        save_soup(document, f"{dest}/{item}")
        return logs
def simplify_gii_xml(source, destination):
    """Load *source* as a soup, run ``simplify`` on it, and save to *destination*."""
    document = create_soup(source)
    # The return value of simplify() is not used; the same soup object is saved.
    simplify(document)
    save_soup(document, destination)