def cd_cluster_evolution_inspection_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    crossreference_graph_folder,
    target_folder,
):
    ensure_exists(target_folder)
    configs = get_configs(cluster_mapping_configs)

    existing_files = set(list_dir(target_folder, ".htm"))
    if not overwrite:
        configs = [
            config
            for config in configs
            if filename_for_pp_config(snapshot="all", **config, file_ext=".htm")
            not in existing_files
        ]

    if configs:
        global cd_cluster_evolution_inspection_graphs
        cd_cluster_evolution_inspection_graphs = {
            f[: -len(".gpickle.gz")]: hierarchy_graph(
                nx.read_gpickle(os.path.join(crossreference_graph_folder, f))
            )
            for f in list_dir(crossreference_graph_folder, ".gpickle.gz")
        }

    return configs

def cd_preprocessing_prepare(
    overwrite, snapshots, pp_configs, source_folder, target_folder
):
    ensure_exists(target_folder)
    items = [
        dict(
            snapshot=snapshot,
            pp_ratio=pp_ratio,
            pp_decay=pp_decay,
            pp_merge=pp_merge,
            pp_co_occurrence=pp_co_occurrence,
            pp_co_occurrence_type=pp_co_occurrence_type,
        )
        for snapshot in snapshots
        for pp_ratio in pp_configs["pp_ratios"]
        for pp_decay in pp_configs["pp_decays"]
        for pp_merge in pp_configs["pp_merges"]
        for pp_co_occurrence in pp_configs["pp_co_occurrences"]
        for pp_co_occurrence_type in pp_configs["pp_co_occurrence_types"]
    ]

    # Check that the source graphs exist
    existing_source_files = set(list_dir(f"{source_folder}/seqitems", ".gpickle.gz"))
    required_source_files = {f"{snapshot}.gpickle.gz" for snapshot in snapshots}
    check_for_missing_files(required_source_files, existing_source_files, "graphs")

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items

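# A hedged sketch of the grid expansion above (toy values, not the project's
# real configuration): every snapshot is combined with every element of the
# Cartesian product of the pp_* parameter lists, so the item count multiplies.
#
# >>> import itertools
# >>> pp_configs = {
# ...     "pp_ratios": [0.5],
# ...     "pp_decays": [1.0],
# ...     "pp_merges": [0, 100],
# ...     "pp_co_occurrences": [0],
# ...     "pp_co_occurrence_types": [None],
# ... }
# >>> len(list(itertools.product(
# ...     ["2010", "2011"],
# ...     pp_configs["pp_ratios"],
# ...     pp_configs["pp_decays"],
# ...     pp_configs["pp_merges"],
# ...     pp_configs["pp_co_occurrences"],
# ...     pp_configs["pp_co_occurrence_types"],
# ... )))
# 4
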
def cd_cluster_evolution_mappings_prepare(
    overwrite, cluster_mapping_configs, source_folder, target_folder, snapshots
):
    ensure_exists(target_folder)

    subseqitems_snapshots = [
        f.split(".")[0] for f in list_dir(f"{source_folder}/", ".edges.csv.gz")
    ]

    # Restrict to the requested snapshots, if any were given
    if snapshots:
        subseqitems_snapshots = [s for s in subseqitems_snapshots if s in snapshots]

    # Build the mapping configs
    mappings = [
        dict(
            pp_merge=pp_merge,
            snapshot=subseqitems_snapshot,
        )
        for pp_merge in cluster_mapping_configs["pp_merges"]
        for subseqitems_snapshot in subseqitems_snapshots
    ]

    existing_files = set(list_dir(target_folder, ".pickle"))
    if not overwrite:
        mappings = [
            mapping
            for mapping in mappings
            if filename_for_mapping(mapping) not in existing_files
        ]

    return sorted(mappings, key=str)

def cd_cluster_evolution_graph_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    snapshot_mapping_folder,
    subseqitem_mapping_folder,
    target_folder,
):
    ensure_exists(target_folder)
    configs = get_configs(cluster_mapping_configs)

    # Check that the clusterings and their mapping files exist
    for config in configs:
        config_clustering_files, snapshots = get_config_clustering_files(
            config, source_folder
        )

        mapping_files = list_dir(snapshot_mapping_folder, ".json")
        check_mapping_files(mapping_files, snapshots, config, ".json")

        mapping_files = list_dir(subseqitem_mapping_folder, ".pickle")
        check_mapping_files(mapping_files, snapshots, config, ".pickle")

    existing_files = set(list_dir(target_folder, ".gpickle.gz"))
    if not overwrite:
        configs = get_configs_no_overwrite(configs, existing_files)

    return configs

def cd_cluster_prepare(overwrite, snapshots, pp_configs, source_folder, target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check that the preprocessed source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(
            **{
                **item,
                "seed": None,
                "markov_time": None,
                "number_of_modules": None,
                "consensus": None,
                "method": None,
            },
            file_ext=source_file_ext,
        )
        for item in items
    }
    check_for_missing_files(
        required_source_files, existing_source_files, "preprocessed graphs"
    )

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items

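# The {**item, "seed": None, ...} idiom above overrides the clustering-specific
# keys of each item before deriving the source filename. A minimal illustration
# of the same merge semantics (toy keys, not the real config schema):
#
# >>> item = {"snapshot": "2010", "pp_ratio": 0.5, "seed": 42}
# >>> {**item, "seed": None}
# {'snapshot': '2010', 'pp_ratio': 0.5, 'seed': None}
#
# Later keys win, so the clustering parameters are neutralized while the
# preprocessing parameters are kept.
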
def reference_parse_areas(regulations):
    global law_names
    law_names = load_law_names_compiled(regulations)

    ensure_exists(DE_DECISIONS_REFERENCE_AREAS)
    ensure_exists(DE_DECISIONS_REFERENCE_PARSED_XML)

    decisions = list_dir(DE_DECISIONS_HIERARCHY, ".xml")
    with multiprocessing.Pool() as p:
        p.map(find_references, decisions)

def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(self.destination)
    items = snapshots
    if not overwrite:
        existing_files = list_dir(self.destination, ".pickle")
        items = list(filter(lambda x: (x + ".pickle") not in existing_files, items))
    return items

def finish_execution(self, results):
    logs = list(itertools.chain.from_iterable(results))
    ensure_exists(US_REG_HELPERS_PATH if self.regulations else US_HELPERS_PATH)
    log_path = (
        US_REG_REFERENCE_AREAS_LOG_PATH
        if self.regulations
        else US_REFERENCE_AREAS_LOG_PATH
    )
    with open(log_path, mode="w") as f:
        f.write("\n".join(sorted(logs, key=lambda x: x.lower())))

def get_items(self, snapshots) -> list:
    ensure_exists(
        DE_REG_CROSSREFERENCE_LOOKUP_PATH
        if self.regulations
        else DE_CROSSREFERENCE_LOOKUP_PATH
    )
    files = []
    law_names_data = load_law_names(self.regulations)
    for snapshot in snapshots:
        files.append((snapshot, get_snapshot_law_list(snapshot, law_names_data)))
    return files

def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(DE_REG_AUTHORITY_EDGELIST_PATH)
    if not overwrite:
        existing_files = os.listdir(DE_REG_AUTHORITY_EDGELIST_PATH)
        snapshots = list(
            filter(lambda f: get_filename(f) not in existing_files, snapshots)
        )
    return snapshots

def get_items(self, overwrite) -> list:
    ensure_exists(self.destination)
    files = list_dir(self.source, ".xml")
    if not overwrite:
        existing_files = list_dir(self.destination, ".gpickle")
        files = list(
            filter(lambda f: get_gpickle_filename(f) not in existing_files, files)
        )
    return files

def get_items(self, overwrite) -> list:
    src = US_REG_XML_PATH if self.regulations else US_XML_PATH
    dest = (
        US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH
    )
    ensure_exists(dest)
    files = list_dir(src, ".xml")
    if not overwrite:
        existing_files = os.listdir(dest)
        files = list(filter(lambda f: f not in existing_files, files))
    return files

def get_items(self, overwrite, snapshots) -> list:
    target_folder = (
        DE_REG_CROSSREFERENCE_EDGELIST_PATH
        if self.regulations
        else DE_CROSSREFERENCE_EDGELIST_PATH
    )
    ensure_exists(target_folder)
    if not overwrite:
        existing_files = os.listdir(target_folder)
        snapshots = list(
            filter(lambda f: get_filename(f) not in existing_files, snapshots)
        )
    return snapshots

def copy_selected_doknrs(selection_list, target_dir):
    ensure_exists(target_dir)
    for doknr in selection_list:
        version_filenames = [
            f
            for f in os.listdir(f"{JURIS_EXPORT_PATH}/{doknr}")
            if f.endswith(".xml")
        ]
        for version_filename in version_filenames:
            assert len(version_filename.split("_")) == 3
            shutil.copy(
                f"{JURIS_EXPORT_PATH}/{doknr}/{version_filename}",
                f"{target_dir}/{version_filename}",
            )

def us_prepare_input():
    """
    Moves the source files into the main directory and roughly validates them.
    """
    ensure_exists(US_ORIGINAL_PATH)
    subfolders = [f.name for f in os.scandir(US_INPUT_PATH) if f.is_dir()]
    for subfolder in subfolders:
        for item in os.listdir(f"{US_INPUT_PATH}/{subfolder}"):
            # Filter by filename pattern
            pattern = re.compile(r"(\d+)usc(\d+)(a)?\.html?", flags=re.IGNORECASE)
            match = pattern.fullmatch(item)
            if not match:
                continue
            new_name = f'{match[2]}{"1" if match[3] else "0"}_{match[1]}.htm'

            # Prevent overwriting files
            if os.path.exists(f"{US_ORIGINAL_PATH}/{new_name}"):
                print(f"{US_ORIGINAL_PATH}/{new_name} already exists")
            else:
                shutil.copy(
                    f"{US_INPUT_PATH}/{subfolder}/{item}",
                    f"{US_ORIGINAL_PATH}/{new_name}",
                )

    files = [f for f in os.listdir(US_ORIGINAL_PATH) if f.endswith(".htm")]

    # Group the collected titles by year for a rough sanity report
    pattern = re.compile(r"(\d+)_(\d+)\.htm")
    years = {}
    for file in files:
        match = pattern.fullmatch(file)
        year = match[2]
        title = match[1]
        years.setdefault(year, []).append(title)

    for idx in list(years.keys()):
        years[idx] = sorted(years[idx])

    print(f"{len(files)} files found")
    print(f"{len(years)} years found")
    for year in sorted(years.keys()):
        titles = years[year]
        print(f"{year}: n={len(titles)}, max='{max(titles)}'")

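# A hedged example of the renaming scheme above (hypothetical input names):
#
# >>> import re
# >>> pattern = re.compile(r"(\d+)usc(\d+)(a)?\.html?", flags=re.IGNORECASE)
# >>> m = pattern.fullmatch("1994usc05a.htm")
# >>> f'{m[2]}{"1" if m[3] else "0"}_{m[1]}.htm'
# '051_1994.htm'
# >>> m = pattern.fullmatch("1994usc05.htm")
# >>> f'{m[2]}{"1" if m[3] else "0"}_{m[1]}.htm'
# '050_1994.htm'
#
# The trailing "a" marks a title appendix; it is encoded as a 1/0 flag after
# the title number, and the year moves to the end of the filename.
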
def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(self.dest)
    if not snapshots:
        snapshots = sorted(
            set(os.path.splitext(x)[0] for x in list_dir(self.lookup, ".csv"))
        )
    if not overwrite:
        existing_files = os.listdir(self.dest)
        snapshots = list(
            filter(lambda f: get_filename(f) not in existing_files, snapshots)
        )
    return snapshots

def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(self.destination)
    items = sorted(list_dir(self.source, ".pickle"))
    items = [i[: -len(".pickle")] for i in items]

    # Create mappings to draw the edges: pair each snapshot with the
    # one `interval` steps later
    mappings = [
        (file1, file2)
        for file1, file2 in zip(items[: -self.interval], items[self.interval :])
    ]

    if snapshots:
        mappings = list(filter(lambda f: f[0] in snapshots, mappings))

    if not overwrite:
        existing_files = list_dir(self.destination, ".json")
        mappings = list(
            filter(lambda x: mapping_filename(x) not in existing_files, mappings)
        )

    return mappings

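# A minimal sketch of the zip pairing above (toy snapshot names, interval=1):
#
# >>> items = ["2010", "2011", "2012", "2013"]
# >>> interval = 1
# >>> list(zip(items[:-interval], items[interval:]))
# [('2010', '2011'), ('2011', '2012'), ('2012', '2013')]
#
# With interval=2 the pairs would skip a snapshot: ('2010', '2012'), ...
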
def get_items(self, overwrite) -> list:
    src = (
        DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH
    )
    dest = (
        DE_REG_REFERENCE_PARSED_PATH if self.regulations else DE_REFERENCE_PARSED_PATH
    )
    ensure_exists(dest)
    files = list_dir(src, ".xml")
    if not overwrite:
        existing_files = os.listdir(dest)
        files = list(filter(lambda f: f not in existing_files, files))
    copy_xml_schema_to_data_folder()
    return files

def cd_cluster_texts_prepare(
    overwrite, snapshots, pp_configs, source_folder, target_folder
):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check that the source clusterings exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(**item, file_ext=source_file_ext) for item in items
    }
    check_for_missing_files(
        required_source_files, existing_source_files, "clustering"
    )

    if not overwrite:
        existing_files = os.listdir(target_folder)
        items = get_no_overwrite_items(items, "", existing_files)

    return items

def get_items(self, overwrite) -> list:
    src = DE_REG_ORIGINAL_PATH if self.regulations else DE_ORIGINAL_PATH
    dest = DE_REG_XML_PATH if self.regulations else DE_XML_PATH
    ensure_exists(dest)
    files = list_dir(src, ".xml")
    if not overwrite:
        existing_files = list_dir(dest, ".xml")
        # Remove cite_key
        converted_existing_files = [
            f.split("_")[0] + "_" + "_".join(f.split("_")[2:])
            for f in existing_files
        ]
        files = list(filter(lambda f: f not in converted_existing_files, files))
    return sorted(files)

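# A hedged illustration of the cite_key removal above (hypothetical filename;
# the real naming convention comes from the DE pipeline):
#
# >>> f = "BJNR001950896_BGB_1994-01-01.xml"
# >>> f.split("_")[0] + "_" + "_".join(f.split("_")[2:])
# 'BJNR001950896_1994-01-01.xml'
#
# Dropping the second segment makes converted target filenames comparable to
# the source filenames, which carry no cite_key.
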
def get_items(self, overwrite, snapshots) -> list:
    dest = (
        US_REG_CROSSREFERENCE_LOOKUP_PATH
        if self.regulations
        else US_CROSSREFERENCE_LOOKUP_PATH
    )
    ensure_exists(dest)

    # If snapshots are not set, create a list of all years
    if not snapshots:
        snapshots = sorted(
            set(
                x.split(".")[0].split("_")[-1]
                for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
            )
        )

    if not overwrite:
        existing_files = os.listdir(dest)
        snapshots = list(
            filter(lambda f: get_filename(f) not in existing_files, snapshots)
        )
    return snapshots

def get_items(self, overwrite) -> list:
    # Create the target folder
    ensure_exists(US_XML_PATH)

    # Get the source files
    files = list_dir(US_ORIGINAL_PATH, ".htm")

    # Filter out appendices (appendix flag "1" in the filename)
    pattern = re.compile(r"\d+0_\d+\.htm")
    html_files = list(filter(pattern.fullmatch, files))

    # Prevent file overwrite
    if not overwrite:
        existing_files = list_dir(US_XML_PATH, ".xml")
        existing_files_sources = list(
            map(lambda x: x.replace(".xml", ".htm"), existing_files)
        )
        html_files = list(
            filter(lambda f: f not in existing_files_sources, html_files)
        )

    return html_files

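# A quick sketch of the appendix filter above, using the naming scheme from
# us_prepare_input (hypothetical filenames):
#
# >>> import re
# >>> pattern = re.compile(r"\d+0_\d+\.htm")
# >>> [f for f in ["050_1994.htm", "051_1994.htm"] if pattern.fullmatch(f)]
# ['050_1994.htm']
#
# "051_1994.htm" is the appendix of title 5 and is excluded.
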
def download():
    ensure_exists(DE_DECISIONS_TEMP_DATA_PATH)

    toc = requests.get("https://www.rechtsprechung-im-internet.de/rii-toc.xml").text
    with open(DE_DECISIONS_DOWNLOAD_TOC, "w") as f:
        f.write(toc)

    with open(DE_DECISIONS_DOWNLOAD_TOC) as f:
        toc = f.read()
    soup = BeautifulSoup(toc, "lxml-xml")

    ensure_exists(DE_DECISIONS_DOWNLOAD_ZIP)
    items = [i.link.text for i in soup.findAll("item")]
    with Pool(4) as p:
        p.map(download_item, items)

    ensure_exists(DE_DECISIONS_DOWNLOAD_XML)
    i = 0
    for filename in os.listdir(DE_DECISIONS_DOWNLOAD_ZIP):
        if os.path.splitext(filename)[1] == ".zip":
            with zipfile.ZipFile(
                f"{DE_DECISIONS_DOWNLOAD_ZIP}/{filename}", "r"
            ) as zip_ref:
                zip_ref.extractall(DE_DECISIONS_DOWNLOAD_XML)
            i += 1
            print(f"\r{i} unpacked", end="")

def cd_cluster_texts(
    config,
    dataset,
    source_folder,
    target_folder,
    reference_parsed_folders,
    regulations,
):
    source_filename_base = filename_for_pp_config(**config, file_ext="")

    clustering = get_clustering_result(
        f"{source_folder}/{source_filename_base}{source_file_ext}",
        dataset,
        graph_type="clustering",
        regulations=regulations,
    )
    result_path = ensure_exists(f"{target_folder}/{source_filename_base}")

    reference_parsed_files = {
        os.path.splitext(f)[0]: f
        for reference_parsed_folder in reference_parsed_folders
        for f in list_dir(reference_parsed_folder, ".xml")
    }
    # Normalize four-part keys to three parts by dropping the third segment
    reference_parsed_files = {
        (
            "_".join(k.split("_")[:2] + k.split("_")[-1:])
            if len(k.split("_")) == 4
            else k
        ): f
        for k, f in reference_parsed_files.items()
    }
    # The normalization must not collapse distinct files onto the same key
    assert len(
        [
            file
            for reference_parsed_folder in reference_parsed_folders
            for file in list_dir(reference_parsed_folder, ".xml")
        ]
    ) == len(reference_parsed_files)

    for idx, community_nodes in enumerate(clustering.communities):
        community_text = get_community_text(
            community_nodes, reference_parsed_folders, reference_parsed_files
        )
        write_community_text(result_path, idx, community_text)

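# A hedged example of the key normalization above (abstract key, purely to
# show the string mechanics):
#
# >>> k = "a_b_c_d"
# >>> "_".join(k.split("_")[:2] + k.split("_")[-1:])
# 'a_b_d'
#
# Keys with fewer than four underscore-separated parts pass through unchanged;
# the assert then guarantees that no two files collapsed onto one key.
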
def hierarchy():
    ensure_exists(DE_DECISIONS_HIERARCHY)
    decisions = list_dir(DE_DECISIONS_XML, ".xml")
    with multiprocessing.Pool() as p:
        p.map(extract_hierarchy, decisions)

def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(self.destination + "/seqitems")
    if not snapshots:
        snapshots = sorted(
            set(
                os.path.splitext(x)[0]
                for x in list_dir(self.edgelist_folder, ".csv")
            )
        )

    if not overwrite:
        existing_files = list_dir(
            os.path.join(self.destination, "seqitems"), ".gpickle.gz"
        )
        snapshots = list(
            filter(
                lambda year: f"{year}.gpickle.gz" not in existing_files, snapshots
            )
        )

    if not snapshots:
        return []

    if self.dataset == "us":
        files = []
        for snapshot in snapshots:
            statute_files = [
                f"{self.source}/subseqitems/{x}"
                for x in os.listdir(os.path.join(self.source, "subseqitems"))
                if str(snapshot) in x
            ]
            regulation_files = (
                [
                    f"{self.source_regulation}/subseqitems/{x}"
                    for x in os.listdir(
                        os.path.join(self.source_regulation, "subseqitems")
                    )
                    if str(snapshot) in x
                ]
                if self.regulations
                else None
            )
            files.append(
                (
                    snapshot,
                    statute_files,
                    regulation_files,
                )
            )
    else:  # is DE
        files = []
        law_names_data = load_law_names(self.regulations)
        for snapshot in snapshots:
            graph_files = get_snapshot_law_list(snapshot, law_names_data)
            files.append(
                (
                    snapshot,
                    [
                        f'{self.source}/subseqitems/{x.replace(".xml", ".gpickle")}'
                        for x in graph_files
                    ],
                    None,
                )
            )
    return files

def copy_xml_schema_to_data_folder():
    ensure_exists(DATA_PATH)
    shutil.copyfile("xml-schema.xsd", os.path.join(DATA_PATH, "xml-schema.xsd"))
    shutil.copyfile("xml-styles.css", os.path.join(DATA_PATH, "xml-styles.css"))

import os
import shutil
from multiprocessing.pool import Pool

import requests
from quantlaw.utils.files import ensure_exists

from statics import US_REG_INPUT_PATH

DOWNLOAD_BASE_URL = "https://www.govinfo.gov/bulkdata/CFR/{}/CFR-{}.zip"


def download(year):
    zip_path = f"{US_REG_INPUT_PATH}/{year}.zip"
    if not os.path.exists(zip_path):
        print("loading", year)
        r = requests.get(DOWNLOAD_BASE_URL.format(year, year), stream=True)
        if r.status_code == 200:
            with open(zip_path, "wb") as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            print("downloaded", year)


if __name__ == "__main__":
    ensure_exists(US_REG_INPUT_PATH)
    with Pool(4) as p:
        p.map(download, list(range(1996, 2020 + 1)))

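# Note on the streaming pattern in download(): requests.get(..., stream=True)
# defers reading the body; r.raw.decode_content = True lets the underlying
# urllib3 stream transparently decompress any gzip/deflate transfer encoding;
# shutil.copyfileobj then copies the stream to disk in chunks, so a zip is
# never held in memory in full. A hedged, roughly equivalent sketch using
# requests' own chunking (iter_content is part of the requests API):
#
#     with requests.get(url, stream=True) as r, open(zip_path, "wb") as f:
#         for chunk in r.iter_content(chunk_size=1 << 20):
#             f.write(chunk)
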
def clean():
    ensure_exists(DE_DECISIONS_XML)
    decisions = list_dir(DE_DECISIONS_DOWNLOAD_XML, ".xml")
    with multiprocessing.Pool() as p:
        p.map(clean_decision, decisions)