def get_dict_law_name_len(self, test_str):
    """
    Check whether test_str begins with a law name known to self.match_law_name.

    The lookup runs on the stemmed text, so the matched area has to be mapped
    back onto the original text to obtain a length that is valid for test_str.

    Returns:
        The number of characters of test_str covered by the matched law name,
        or 0 if no law name matches.
    """
    # Law names are looked up in stemmed form, so stem the input first.
    stemmed_input = stem_law_name(test_str)

    match = self.match_law_name(stemmed_input)
    if not match:
        return 0

    # Map the match from the stemmed text back onto the original text: split
    # both into alternating word / non-word tokens and take as many tokens of
    # the original text as the stemmed match consists of.
    token_pattern = r"[\w']+|[\W']+"
    raw_tokens = regex.findall(token_pattern, test_str)
    matched_tokens = regex.findall(token_pattern, match)
    token_count = len(matched_tokens)

    # The original text is expected to start with a non-blank token.
    assert len(raw_tokens[0].strip()) > 0, (match, test_str, stemmed_input)

    # If the last word of the matched law name continues in the original text
    # with characters that stemming would not remove, the match only covers a
    # part of a longer word: report no match.
    # TODO look for other matches before returning no match
    last_raw_token_stemmed = stem_law_name(raw_tokens[token_count - 1])
    if matched_tokens[-1] != last_raw_token_stemmed:
        return 0

    return len("".join(raw_tokens[:token_count]))
def load_law_names(date, path):
    """
    Download an archive of the QuantLaw/gesetze-im-internet repository and
    write a JSON lookup from stemmed law names to abbreviation keys.

    For every ``.xml`` member of the archive the first ``norm`` element is
    inspected. Its first abbreviation node yields the key (lower-cased, with
    every character outside ``a-z0-9-`` replaced by ``_``); the stemmed texts
    of the abbreviation and heading nodes are mapped to that key.

    Args:
        date: Git reference used in the archive URL (``.../archive/{date}.zip``).
        path: Target path of the JSON file. ``path + ".zip"`` is used as a
            temporary download location and removed afterwards.

    Raises:
        requests.HTTPError: If the download does not answer with status 200.
    """

    def node_text(xml_node):
        # Text content of an element without markup, stripped of whitespace.
        return (
            lxml.etree.tostring(xml_node, method="text", encoding="utf8")
            .decode("utf8")
            .strip()
        )

    archive_path = path + ".zip"
    url = f"https://github.com/QuantLaw/gesetze-im-internet/archive/{date}.zip"

    try:
        # Use the response as a context manager so the streamed connection is
        # released even if writing the archive fails.
        with requests.get(url, stream=True) as r:
            # Explicit check instead of ``assert`` (asserts vanish with -O).
            if r.status_code != 200:
                raise requests.HTTPError(
                    f"Download of {url} failed with status {r.status_code}",
                    response=r,
                )
            with open(archive_path, "wb") as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)

        law_names = {}
        with zipfile.ZipFile(archive_path) as zip_file:
            for member_info in sorted(zip_file.namelist()):
                if not member_info.endswith(".xml"):
                    continue

                with zip_file.open(member_info) as member_file:
                    node = lxml.etree.parse(member_file)

                first_norm_nodes = node.xpath("(//norm)[1]")
                if not first_norm_nodes:
                    continue
                first_norm = first_norm_nodes[0]

                # NOTE(review): "//amtabk" is an absolute path and therefore
                # selects amtabk elements of the whole document, unlike the
                # relative ".//" parts. Looks unintended - confirm before
                # changing, as it affects which names end up in the lookup.
                abk_nodes = first_norm.xpath(".//jurabk | //amtabk")
                if not abk_nodes:
                    continue

                abk = node_text(abk_nodes[0])
                abk_stem = re.sub(r"[^a-z0-9\-]", "_", abk.lower())
                law_names[stem_law_name(abk)] = abk_stem

                heading_nodes = first_norm.xpath(
                    ".//jurabk | //amtabk | " ".//langue | .//kurzue"
                )
                for heading_node in heading_nodes:
                    law_names[stem_law_name(node_text(heading_node))] = abk_stem

        with open(path, "w", encoding="utf8") as f:
            json.dump(law_names, f, ensure_ascii=False, indent=0)
    finally:
        # Always drop the temporary archive, also when parsing or writing the
        # result failed half-way.
        if os.path.exists(archive_path):
            os.remove(archive_path)
def parse_law(self, law_text: str, match_type: str, current_lawid: str = None):
    """
    Resolve the key of the law a reference points to.

    The original docstring states that the input stems from references found
    by StatutesMatchWithMainArea.

    Args:
        law_text: Name of the referenced law as found in the text. E.g. "BGB"
        match_type: How the law was identified. E.g. "dict". Handled values
            are "dict" (lookup via self.match_law_name / self.laws_lookup),
            "sgb" (lookup via sgb_dict) and "internal" (reference within the
            current law).
        current_lawid: Key of the law that contains the reference. Required
            for match_type "internal", unused otherwise.

    Returns:
        The key of the parsed law, or None if match_type is none of the
        handled values (ignore or unknown).

    Raises:
        Exception: If match_type is "internal" and current_lawid is None.
    """
    if match_type == "dict":
        lawname_stem = stem_law_name(law_text)
        match = self.match_law_name(lawname_stem)
        return self.laws_lookup[match]

    if match_type == "sgb":
        lawid = sgb_dict[stem_law_name(law_text)]
        if not isinstance(lawid, tuple):
            return lawid

        # A tuple holds two candidate keys: use the first if it is a known
        # law, otherwise fall back to the second (whether known or not).
        assert len(lawid) == 2
        preferred, fallback = lawid
        if preferred in self.laws_lookup.values():
            return preferred
        return fallback

    if match_type == "internal":
        if current_lawid is None:
            raise Exception("Current law id must be set for internal reference")
        return current_lawid

    return None  # match_type: ignore or unknown
def execute_item(self, item):
    """
    Collect the stemmed names of the law stored in a single XML file.

    The file is read from DE_REG_XML_PATH if self.regulations is truthy,
    otherwise from DE_XML_PATH.

    Args:
        item: File name of the XML document inside the source folder.

    Returns:
        A set of (stemmed law name, citekey, item) tuples, one for each of the
        document attributes "heading", "heading_short", "abbr_1" and "abbr_2"
        that is present. The citekey is the second "_"-separated part of the
        document's "key" attribute.
    """
    base_path = DE_REG_XML_PATH if self.regulations else DE_XML_PATH
    document = create_soup(f"{base_path}/{item}").find("document", recursive=False)
    attributes = document.attrs

    citekey = attributes["key"].split("_")[1]

    # Every attribute that may carry a name or an abbreviation of the law.
    name_attributes = ("heading", "heading_short", "abbr_1", "abbr_2")
    return {
        (stem_law_name(attributes[attribute]), citekey, item)
        for attribute in name_attributes
        if attribute in attributes
    }
def identify_lawreference_law_name_in_soup(soup, laws_lookup):
    """
    Annotate every generic reference tag in soup with the key of its law.

    For each ``reference`` tag whose ``pattern`` attribute is ``generic`` the
    tag's text is stemmed and looked up in laws_lookup; the result is stored
    in the tag's ``parsed`` attribute as ``[[law_key]]``. The soup is modified
    in place.

    Args:
        soup: Parsed document to search and annotate.
        laws_lookup: Mapping from stemmed law names to law keys.

    Raises:
        KeyError: If a stemmed reference text is not contained in laws_lookup.
    """
    generic_references = soup.find_all("reference", {"pattern": "generic"})
    for reference_tag in generic_references:
        stemmed_name = stem_law_name(reference_tag.string)
        law_key = laws_lookup[stemmed_name]
        reference_tag["parsed"] = [[law_key]]
def identify_reference_in_juris_vso_list(soup, parser: StatutesParser):
    """
    Process the ``verweise`` attribute of ``document`` and ``seqitem`` tags.

    For every such tag the JSON list in ``verweise`` is read. Entries whose
    ``typ`` is "Ermächtigung", "Rechtsgrundlage" or "Durchführungsvorschrift"
    and whose ``normabk`` is non-empty are matched against the law names known
    to the parser, and the match is printed.

    NOTE(review): the code that turns a match into parsed references is
    currently commented out. As a consequence the ``parsed_verbose`` and
    ``parsed`` attributes written to each tag are always derived from empty
    lists, and ``print(match)`` is the only visible effect of the matching.

    Args:
        soup: Parsed document; tags are modified in place.
        parser: Parser providing ``match_law_name``.
    """
    # All other types ('Vertragsgesetz', 'Sonderregelung', 'GLIEDERUNG',
    # 'SAMMELVERWEISUNG', 'Einführungsvorschrift', 'InnerstaatlDurchfVorschr')
    # will be ignored.
    considered_types = [
        "Ermächtigung",
        "Rechtsgrundlage",
        "Durchführungsvorschrift",
    ]

    for vso_tag in soup.find_all(["document", "seqitem"], attrs={"verweise": True}):
        parsed_vso_refs = []
        parsed_vso_refs_simple = []

        raw_verweise = vso_tag.attrs["verweise"]
        verweise = json.loads(raw_verweise) if raw_verweise != "[]" else []

        for verweis in verweise:
            if verweis["typ"] not in considered_types or not verweis["normabk"]:
                continue

            match = parser.match_law_name(stem_law_name(verweis["normabk"]))
            print(match)

            # Disabled: build the parsed references from the match.
            #
            # if match:
            #     lawid = parser.laws_lookup[match]
            #     parsed_vso_ref = [[["Gesetz", lawid]]]
            #     parsed_vso_ref_simple = [[lawid]]
            #
            #     # Append ref. details if present in raw data
            #     enbez = verweis["enbez"]
            #     if enbez and reference_trigger_pattern.match(enbez):
            #
            #         try:
            #             (
            #                 reference_paths,
            #                 reference_paths_simple,
            #             ) = parse_reference_string(enbez, debug_context=None)
            #
            #             parsed_vso_ref = [
            #                 parsed_vso_ref[0] + r for r in reference_paths
            #             ]
            #             parsed_vso_ref_simple = [
            #                 parsed_vso_ref_simple[0] + r
            #                 for r in reference_paths_simple
            #             ]
            #
            #         except StringCaseException as error:
            #             print(error, "context", enbez)
            #
            #     parsed_vso_refs.extend(parsed_vso_ref)
            #     parsed_vso_refs_simple.extend(parsed_vso_ref_simple)

        # Remove duplicates
        unique_refs = remove_duplicate_references(parsed_vso_refs)
        unique_refs_simple = remove_duplicate_references(parsed_vso_refs_simple)

        vso_tag.attrs["parsed_verbose"] = json.dumps(unique_refs, ensure_ascii=False)
        vso_tag.attrs["parsed"] = json.dumps(unique_refs_simple, ensure_ascii=False)