Example #1
    def get_dict_law_name_len(self, test_str):
        """
        Determines if the test_str starts with a law name given with self.laws_lookup.

        Returns: The length matched law name or 0.
        """

        # Stem the test_str as the law names are already stemmed
        test_str_stem = stem_law_name(test_str)

        # Look for matching law names
        match = self.match_law_name(test_str_stem)
        if not match:
            return 0

        # Transpose the span of the matched law name in the stemmed text back
        # to the original text: split both the original text and the stemmed
        # match into word/non-word tokens, then take the prefix of the original
        # string that contains the same number of tokens as the match.
        test_str_splitted = regex.findall(r"[\w']+|[\W']+", test_str)
        match_splitted = regex.findall(r"[\w']+|[\W']+", match)
        match_raw = "".join(test_str_splitted[:len(match_splitted)])
        assert len(test_str_splitted[0].strip()) > 0, (match, test_str,
                                                       test_str_stem)

        # If the last matched word continues in the original text beyond the
        # match (i.e., its stemmed form differs from the last token of the
        # matched law name), report no match.
        # TODO look for other matches before returning no match
        last_word_test_stemmed = stem_law_name(
            test_str_splitted[len(match_splitted) - 1])
        last_word_match = match_splitted[-1]
        if last_word_match != last_word_test_stemmed:
            return 0

        return len(match_raw)
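The transposition works because the pattern splits text into alternating runs of word and non-word characters, so the n-th token of the stemmed match lines up with the n-th token of the original. A minimal standalone sketch of that tokenization, using the third-party regex module as above (the example string is illustrative):

import regex

text = "Bürgerliches Gesetzbuch (BGB) gilt"
tokens = regex.findall(r"[\w']+|[\W']+", text)
# ['Bürgerliches', ' ', 'Gesetzbuch', ' (', 'BGB', ') ', 'gilt']

# Joining the first k tokens reconstructs the original prefix verbatim,
# which is what get_dict_law_name_len relies on:
assert "".join(tokens[:3]) == "Bürgerliches Gesetzbuch"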
Example #2
import json
import os
import re
import shutil
import zipfile

import lxml.etree
import requests


def load_law_names(date, path):
    """
    Download the gesetze-im-internet archive for the given date, collect the
    law name variants (jurabk, amtabk, langue, kurzue) of each law, and write
    a JSON file mapping each stemmed name to a filename-safe stem of the
    official abbreviation.
    """
    r = requests.get(
        f"https://github.com/QuantLaw/gesetze-im-internet/archive/{date}.zip",
        stream=True,
    )
    assert r.status_code == 200
    with open(path + ".zip", "wb") as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    law_names = {}

    with zipfile.ZipFile(path + ".zip") as zip_file:
        for member_info in sorted(zip_file.namelist()):
            if member_info.endswith(".xml"):
                with zip_file.open(member_info) as member_file:
                    node = lxml.etree.parse(member_file)
                first_norm_nodes = node.xpath("(//norm)[1]")
                if not first_norm_nodes:
                    continue
                abk_nodes = first_norm_nodes[0].xpath(".//jurabk | .//amtabk")
                if not abk_nodes:
                    continue
                abk = (lxml.etree.tostring(
                    abk_nodes[0], method="text",
                    encoding="utf8").decode("utf8").strip())
                abk_stem = re.sub(r"[^a-z0-9\-]", "_", abk.lower())

                law_names[stem_law_name(abk)] = abk_stem

                heading_nodes = first_norm_nodes[0].xpath(
                    ".//jurabk | .//amtabk | "
                    ".//langue | .//kurzue")
                for heading_node in heading_nodes:
                    text = (lxml.etree.tostring(
                        heading_node, method="text",
                        encoding="utf8").decode("utf8").strip())
                    text = stem_law_name(text)
                    law_names[text] = abk_stem

    with open(path, "w", encoding="utf8") as f:
        json.dump(law_names, f, ensure_ascii=False, indent=0)

    os.remove(path + ".zip")
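A usage sketch, assuming the mirror repository tags its archives by ISO date (the date below is illustrative, not a verified tag):

law_names_path = "law_names.json"
load_law_names("2020-01-01", law_names_path)

with open(law_names_path, encoding="utf8") as f:
    law_names = json.load(f)  # stemmed law name -> filename-safe abbreviation stem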
Example #3
    def parse_law(self,
                  law_text: str,
                  match_type: str,
                  current_lawid: str = None):
        """
        Parses the law information from a references found by StatutesMatchWithMainArea

        Args:
            main_text: E.g. "§ 123 Abs. 4 und 5 Nr. 6"
            law_text: E.g. "BGB"
            match_type: E.g. "dict"

        Returns: The key of a parse law.

        """

        if match_type == "dict":
            lawname_stem = stem_law_name(law_text)
            match = self.match_law_name(lawname_stem)
            return self.laws_lookup[match]

        elif match_type == "sgb":
            lawid = sgb_dict[stem_law_name(law_text)]
            if isinstance(lawid, tuple):
                assert len(lawid) == 2
                # Prefer the first candidate if it is a known law;
                # otherwise fall back to the second.
                if lawid[0] in self.laws_lookup.values():
                    return lawid[0]
                return lawid[1]
            else:
                return lawid

        elif match_type == "internal":
            if current_lawid is None:
                raise Exception(
                    "Current law id must be set for internal reference")
            return current_lawid

        else:
            return None  # match_type: ignore or unknown
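The "sgb" branch prefers the first of two candidate ids when it denotes a known law. A standalone sketch of that preference rule (the function name and ids below are illustrative, not from the library):

def prefer_known_lawid(lawid, known_ids):
    # Mirrors the tuple handling above: take the first candidate if it
    # is a known law id, otherwise fall back to the second.
    if isinstance(lawid, tuple):
        first, second = lawid
        return first if first in known_ids else second
    return lawid

assert prefer_known_lawid(("sgb-5", "sgb-v"), {"sgb-v"}) == "sgb-v"
assert prefer_known_lawid("bgb", {"bgb"}) == "bgb"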
Example #4
    def execute_item(self, item):
        src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH
        soup = create_soup(f"{src}/{item}")
        document = soup.find("document", recursive=False)
        result = set()
        citekey = document.attrs["key"].split("_")[1]

        if "heading" in document.attrs:
            law_name = stem_law_name(document.attrs["heading"])
            result.add((law_name, citekey, item))

        if "heading_short" in document.attrs:
            law_name = stem_law_name(document.attrs["heading_short"])
            result.add((law_name, citekey, item))

        if "abbr_1" in document.attrs:
            law_name = stem_law_name(document.attrs["abbr_1"])
            result.add((law_name, citekey, item))

        if "abbr_2" in document.attrs:
            law_name = stem_law_name(document.attrs["abbr_2"])
            result.add((law_name, citekey, item))
        return result
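A sketch of the attribute extraction on a hand-built document, assuming create_soup wraps BeautifulSoup with an XML parser (the markup and key format below are illustrative):

from bs4 import BeautifulSoup

xml = '<document key="de_bgb" heading="Bürgerliches Gesetzbuch" abbr_1="BGB"/>'
document = BeautifulSoup(xml, "lxml-xml").find("document")
citekey = document.attrs["key"].split("_")[1]  # "bgb"
names = [
    document.attrs[attr]
    for attr in ("heading", "heading_short", "abbr_1", "abbr_2")
    if attr in document.attrs
]  # ['Bürgerliches Gesetzbuch', 'BGB']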
Example #5
def identify_lawreference_law_name_in_soup(soup, laws_lookup):
    # Resolve each generic reference to its law key via the stemmed name.
    for reference in soup.find_all("reference", {"pattern": "generic"}):
        reference["parsed"] = [[laws_lookup[stem_law_name(reference.string)]]]
Example #6
def identify_reference_in_juris_vso_list(soup, parser: StatutesParser):
    """
    Resolve the juris "verweise" annotations on document and seqitem tags
    and store the results in the "parsed" and "parsed_verbose" attributes.
    """
    vso_tags = soup.find_all(["document", "seqitem"], attrs={"verweise": True})
    for vso_tag in vso_tags:
        parsed_vso_refs = []
        parsed_vso_refs_simple = []
        verweise = (
            []
            if vso_tag.attrs["verweise"] == "[]"
            else json.loads(vso_tag.attrs["verweise"])
        )
        for verweis in verweise:
            if verweis["typ"] not in [
                "Ermächtigung",
                "Rechtsgrundlage",
                "Durchführungsvorschrift",
            ]:
                # 'Vertragsgesetz', 'Sonderregelung', 'GLIEDERUNG', 'SAMMELVERWEISUNG',
                # 'Einführungsvorschrift', 'InnerstaatlDurchfVorschr' will be ignored
                continue
            if not verweis["normabk"]:
                continue
            lawname_stem = stem_law_name(verweis["normabk"])
            match = parser.match_law_name(lawname_stem)
            if match:
                lawid = parser.laws_lookup[match]
                parsed_vso_ref = [[["Gesetz", lawid]]]
                parsed_vso_ref_simple = [[lawid]]

                # Append reference details if present in the raw data
                enbez = verweis["enbez"]
                if enbez and reference_trigger_pattern.match(enbez):
                    try:
                        (
                            reference_paths,
                            reference_paths_simple,
                        ) = parse_reference_string(enbez, debug_context=None)

                        parsed_vso_ref = [
                            parsed_vso_ref[0] + r for r in reference_paths
                        ]
                        parsed_vso_ref_simple = [
                            parsed_vso_ref_simple[0] + r
                            for r in reference_paths_simple
                        ]
                    except StringCaseException as error:
                        print(error, "context", enbez)

                parsed_vso_refs.extend(parsed_vso_ref)
                parsed_vso_refs_simple.extend(parsed_vso_ref_simple)

        # Remove duplicates
        parsed_vso_refs = remove_duplicate_references(parsed_vso_refs)
        parsed_vso_refs_simple = remove_duplicate_references(parsed_vso_refs_simple)

        vso_tag.attrs["parsed_verbose"] = json.dumps(
            parsed_vso_refs, ensure_ascii=False
        )
        vso_tag.attrs["parsed"] = json.dumps(parsed_vso_refs_simple, ensure_ascii=False)