def cd_preprocessing_prepare(
    overwrite, snapshots, pp_configs, source_folder, target_folder
):
    ensure_exists(target_folder)
    items = [
        dict(
            snapshot=snapshot,
            pp_ratio=pp_ratio,
            pp_decay=pp_decay,
            pp_merge=pp_merge,
            pp_co_occurrence=pp_co_occurrence,
            pp_co_occurrence_type=pp_co_occurrence_type,
        )
        for snapshot in snapshots
        for pp_ratio in pp_configs["pp_ratios"]
        for pp_decay in pp_configs["pp_decays"]
        for pp_merge in pp_configs["pp_merges"]
        for pp_co_occurrence in pp_configs["pp_co_occurrences"]
        for pp_co_occurrence_type in pp_configs["pp_co_occurrence_types"]
    ]

    # Check if source graphs exist
    existing_source_files = set(list_dir(f"{source_folder}/seqitems", ".gpickle.gz"))
    required_source_files = {f"{snapshot}.gpickle.gz" for snapshot in snapshots}
    check_for_missing_files(required_source_files, existing_source_files, "graphs")

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items
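# The nested comprehension above enumerates the full cartesian product of the
# preprocessing parameters. A minimal, equivalent sketch using itertools.product;
# the helper name below is illustrative only and not part of the original code:
import itertools


def build_preprocessing_items_sketch(snapshots, pp_configs):
    keys = ("pp_ratio", "pp_decay", "pp_merge", "pp_co_occurrence", "pp_co_occurrence_type")
    value_lists = (
        pp_configs["pp_ratios"],
        pp_configs["pp_decays"],
        pp_configs["pp_merges"],
        pp_configs["pp_co_occurrences"],
        pp_configs["pp_co_occurrence_types"],
    )
    return [
        dict(snapshot=snapshot, **dict(zip(keys, values)))
        for snapshot in snapshots
        for values in itertools.product(*value_lists)
    ]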
def cd_cluster_prepare(overwrite, snapshots, pp_configs, source_folder, target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(
            **{
                **item,
                "seed": None,
                "markov_time": None,
                "number_of_modules": None,
                "consensus": None,
                "method": None,
            },
            file_ext=source_file_ext,
        )
        for item in items
    }
    check_for_missing_files(
        required_source_files, existing_source_files, "preprocessed graphs"
    )

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items
def cd_cluster_evolution_inspection_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    crossreference_graph_folder,
    target_folder,
):
    ensure_exists(target_folder)
    configs = get_configs(cluster_mapping_configs)

    existing_files = set(list_dir(target_folder, ".htm"))
    if not overwrite:
        configs = [
            config
            for config in configs
            if filename_for_pp_config(snapshot="all", **config, file_ext=".htm")
            not in existing_files
        ]

    if configs:
        global cd_cluster_evolution_inspection_graphs
        cd_cluster_evolution_inspection_graphs = {
            f[: -len(".gpickle.gz")]: hierarchy_graph(
                nx.read_gpickle(os.path.join(crossreference_graph_folder, f))
            )
            for f in list_dir(crossreference_graph_folder, ".gpickle.gz")
        }

    return configs
def cd_cluster_evolution_graph_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    snaphot_mapping_folder,
    subseqitem_mapping_folder,
    target_folder,
):
    ensure_exists(target_folder)
    configs = get_configs(cluster_mapping_configs)

    # Check if clusterings exist
    for config in configs:
        config_clustering_files, snapshots = get_config_clustering_files(
            config, source_folder
        )

        mapping_files = list_dir(snaphot_mapping_folder, ".json")
        check_mapping_files(mapping_files, snapshots, config, ".json")

        mapping_files = list_dir(subseqitem_mapping_folder, ".pickle")
        check_mapping_files(mapping_files, snapshots, config, ".pickle")

    existing_files = set(list_dir(target_folder, ".gpickle.gz"))
    if not overwrite:
        get_configs_no_overwrite(configs, existing_files)

    return configs
def execute_item(self, item):
    yearfiles = [
        os.path.join(US_REFERENCE_PARSED_PATH, x)
        for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
        if str(item) in x
    ]
    if self.regulations:
        yearfiles += [
            os.path.join(US_REG_REFERENCE_PARSED_PATH, x)
            for x in list_dir(US_REG_REFERENCE_PARSED_PATH, ".xml")
            if str(item) in x
        ]

    key_df = pd.read_csv(f"{self.lookup}/{item}.csv").dropna().set_index("citekey")
    key_dict = {}
    # Keep only the first key encountered for each citekey
    for idx, val in key_df.key.items():
        if idx not in key_dict:
            key_dict[idx] = val

    edge_list = []
    for yearfile_path in yearfiles:
        edge_list_file = self.make_edge_list(yearfile_path, key_dict)
        edge_list.extend(edge_list_file)

    if edge_list:
        df = pd.DataFrame(edge_list, columns=["out_node", "in_node"])
        df.to_csv(f"{self.dest}/{item}.csv", index=False)
def cd_cluster_evolution_mappings_prepare(
    overwrite, cluster_mapping_configs, source_folder, target_folder, snapshots
):
    ensure_exists(target_folder)
    subseqitems_snapshots = [
        f.split(".")[0] for f in list_dir(f"{source_folder}/", ".edges.csv.gz")
    ]

    # Restrict to the requested snapshots, if any were given
    if snapshots:
        subseqitems_snapshots = [s for s in subseqitems_snapshots if s in snapshots]

    # Build one config per (pp_merge, snapshot) combination
    mappings = [
        dict(
            pp_merge=pp_merge,
            snapshot=subseqitems_snapshot,
        )
        for pp_merge in cluster_mapping_configs["pp_merges"]
        for subseqitems_snapshot in subseqitems_snapshots
    ]

    existing_files = set(list_dir(target_folder, ".pickle"))
    if not overwrite:
        mappings = [
            mapping
            for mapping in mappings
            if filename_for_mapping(mapping) not in existing_files
        ]

    return sorted(mappings, key=str)
def get_items(self, overwrite) -> list:
    ensure_exists(self.destination)
    files = list_dir(self.source, ".xml")

    if not overwrite:
        existing_files = list_dir(self.destination, ".gpickle")
        files = list(
            filter(lambda f: get_gpickle_filename(f) not in existing_files, files)
        )

    return files
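# Several snippets in this section implement the same two-method pattern:
# get_items(...) enumerates the work that still needs to be done (respecting the
# overwrite flag), and execute_item(item) processes a single work item. A minimal
# driver sketch for that pattern; the run_step name, its signature, and the optional
# multiprocessing fan-out are illustrative assumptions, not part of the original code
# (some get_items variants also take a snapshots argument):
import multiprocessing


def run_step_sketch(step, overwrite=False, processes=None):
    items = step.get_items(overwrite)
    if not items:
        return
    if processes and processes > 1:
        with multiprocessing.Pool(processes) as pool:
            pool.map(step.execute_item, items)
    else:
        for item in items:
            step.execute_item(item)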
def network():
    decisions = list_dir(DE_DECISIONS_REFERENCE_PARSED_XML, ".xml")
    with multiprocessing.Pool() as p:
        results = p.map(get_graph_data_from_decision, decisions)

    node_dicts = list(itertools.chain.from_iterable([x[0] for x in results]))
    containment_edges = list(itertools.chain.from_iterable([x[1] for x in results]))
    reference_edges = list(itertools.chain.from_iterable([x[2] for x in results]))

    hierarchy_G = nx.DiGraph()
    hierarchy_G.add_node("root", level=-1, key="root", bipartite="decision")
    hierarchy_G.add_nodes_from([(x["key"], x) for x in node_dicts], bipartite="decision")
    hierarchy_G.add_edges_from(containment_edges, edge_type="containment")

    reference_G = nx.MultiDiGraph(hierarchy_G)
    print("created")
    reference_G.add_nodes_from(
        sorted({x[-1] for x in reference_edges}), bipartite="statute"
    )
    print("Statute nodes added")
    reference_G.add_edges_from(reference_edges, edge_type="reference")
    print("Reference edges added")

    reference_weighted_G = multi_to_weighted(reference_G)
    nx.write_gpickle(reference_weighted_G, DE_DECISIONS_NETWORK)
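# multi_to_weighted is defined elsewhere in the codebase. Judging by its name and its
# use above, it presumably collapses the parallel edges of a MultiDiGraph into single
# weighted edges of a DiGraph. A minimal sketch under that assumption; the attribute
# handling here is illustrative, not the original implementation:
def multi_to_weighted_sketch(multi_graph):
    import networkx as nx

    weighted = nx.DiGraph()
    weighted.add_nodes_from(multi_graph.nodes(data=True))
    for u, v in multi_graph.edges():
        if weighted.has_edge(u, v):
            # One more parallel edge between u and v: increment the weight
            weighted[u][v]["weight"] += 1
        else:
            weighted.add_edge(u, v, weight=1)
    return weighted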
def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(self.destination)
    items = snapshots

    if not overwrite:
        existing_files = list_dir(self.destination, ".pickle")
        items = list(filter(lambda x: (x + ".pickle") not in existing_files, items))

    return items
def reference_parse_areas(regulations):
    global law_names
    law_names = load_law_names_compiled(regulations)

    ensure_exists(DE_DECISIONS_REFERENCE_AREAS)
    ensure_exists(DE_DECISIONS_REFERENCE_PARSED_XML)
    decisions = list_dir(DE_DECISIONS_HIERARCHY, ".xml")
    with multiprocessing.Pool() as p:
        p.map(find_references, decisions)
def get_items(self, overwrite) -> list:
    src = DE_REG_ORIGINAL_PATH if self.regulations else DE_ORIGINAL_PATH
    dest = DE_REG_XML_PATH if self.regulations else DE_XML_PATH
    ensure_exists(dest)
    files = list_dir(src, ".xml")

    if not overwrite:
        existing_files = list_dir(dest, ".xml")

        # Remove cite_key
        converted_existing_files = [
            f.split("_")[0] + "_" + "_".join(f.split("_")[2:]) for f in existing_files
        ]
        files = list(filter(lambda f: f not in converted_existing_files, files))

    return sorted(files)
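# The comprehension above drops the second underscore-separated component (the cite_key)
# from each existing target filename so it can be compared against the source filenames.
# A tiny illustration; the filename below is purely hypothetical:
def strip_cite_key_sketch(filename):
    parts = filename.split("_")
    return parts[0] + "_" + "_".join(parts[2:])


assert strip_cite_key_sketch("bgb_BJNR001950896_1994-01-01.xml") == "bgb_1994-01-01.xml"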
def get_items(self, overwrite) -> list:
    src = DE_REG_REFERENCE_AREAS_PATH if self.regulations else DE_REFERENCE_AREAS_PATH
    dest = (
        DE_REG_REFERENCE_PARSED_PATH if self.regulations else DE_REFERENCE_PARSED_PATH
    )
    ensure_exists(dest)
    files = list_dir(src, ".xml")

    if not overwrite:
        existing_files = os.listdir(dest)
        files = list(filter(lambda f: f not in existing_files, files))

    copy_xml_schema_to_data_folder()

    return files
def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(self.destination)
    items = sorted(list_dir(self.source, ".pickle"))
    items = [i[: -len(".pickle")] for i in items]

    # Create mappings to draw the edges
    mappings = [
        (file1, file2)
        for file1, file2 in zip(items[: -self.interval], items[self.interval :])
    ]

    if snapshots:
        mappings = list(filter(lambda f: f[0] in snapshots, mappings))

    if not overwrite:
        existing_files = list_dir(self.destination, ".json")
        mappings = list(
            filter(lambda x: mapping_filename(x) not in existing_files, mappings)
        )

    return mappings
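# The zip above pairs each snapshot with the one self.interval positions later,
# producing the (earlier, later) pairs whose mappings are then computed.
# A quick illustration with hypothetical snapshot names and an interval of 1:
items_example = ["1994", "1995", "1996", "1997"]
interval_example = 1
pairs_example = list(zip(items_example[:-interval_example], items_example[interval_example:]))
# pairs_example == [("1994", "1995"), ("1995", "1996"), ("1996", "1997")]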
def get_items(self, overwrite) -> list:
    # Create target folder
    ensure_exists(US_XML_PATH)

    # Get source files
    files = list_dir(US_ORIGINAL_PATH, ".htm")

    # Filter appendices
    pattern = re.compile(r"\d+0_\d+\.htm")
    html_files = list(filter(pattern.fullmatch, files))

    # Prevent file overwrite
    if not overwrite:
        existing_files = list_dir(US_XML_PATH, ".xml")
        existing_files_sources = list(
            map(lambda x: x.replace(".xml", ".htm"), existing_files)
        )
        html_files = list(
            filter(lambda f: f not in existing_files_sources, html_files)
        )

    return html_files
def get_items(self, overwrite) -> list:
    src = US_REG_XML_PATH if self.regulations else US_XML_PATH
    dest = US_REG_REFERENCE_AREAS_PATH if self.regulations else US_REFERENCE_AREAS_PATH
    ensure_exists(dest)
    files = list_dir(src, ".xml")

    if not overwrite:
        existing_files = os.listdir(dest)
        files = list(filter(lambda f: f not in existing_files, files))

    return files
def get_leaf_texts_to_compare(
    graph_filename, G, source_text, source_text_reg, law_names_data, dataset
):
    """
    Get the text for the leaves of a hierarchy graph. Can be a seqitem or
    subseqitem graph; leaves are only seqitems or subseqitems.
    """
    leaf_keys = get_leaves(G)

    snapshot = graph_filename[: -len(".gpickle.gz")]

    if dataset == "us":
        files = [
            os.path.join(source_text, x)
            for x in list_dir(source_text, ".xml")
            if x.split(".")[0].split("_")[-1] == snapshot
        ]
        if source_text_reg:
            files += [
                os.path.join(source_text_reg, x)
                for x in list_dir(source_text_reg, ".xml")
                if x.split(".")[0].split("_")[-1] == snapshot
            ]
        files.sort()
    else:  # is DE
        files = get_snapshot_law_list(snapshot, law_names_data)
        files = [os.path.join(source_text, f) for f in files]

    whitespace_pattern = regex.compile(r"[\s\n]+")
    texts = {}
    for file in files:
        print(f"\r{files.index(file)} / {len(files)}", end="")
        soup = create_soup(file)
        tags = soup.find_all(["seqitem", "subseqitem"])
        for tag in tags:
            if tag["key"] in leaf_keys:
                text = tag.get_text(" ")
                text = whitespace_pattern.sub(" ", text).lower().strip()
                texts[tag["key"]] = text.lower()
    return texts
def cd_cluster_texts(
    config,
    dataset,
    source_folder,
    target_folder,
    reference_parsed_folders,
    regulations,
):
    source_filename_base = filename_for_pp_config(**config, file_ext="")

    clustering = get_clustering_result(
        f"{source_folder}/{source_filename_base}{source_file_ext}",
        dataset,
        graph_type="clustering",
        regulations=regulations,
    )
    result_path = ensure_exists(f"{target_folder}/{source_filename_base}")

    reference_parsed_files = {
        os.path.splitext(f)[0]: f
        for reference_parsed_folder in reference_parsed_folders
        for f in list_dir(reference_parsed_folder, ".xml")
    }
    reference_parsed_files = {
        (
            "_".join(k.split("_")[:2] + k.split("_")[-1:])
            if len(k.split("_")) == 4
            else k
        ): f
        for k, f in reference_parsed_files.items()
    }
    assert len(
        [
            file
            for reference_parsed_folder in reference_parsed_folders
            for file in list_dir(reference_parsed_folder, ".xml")
        ]
    ) == len(reference_parsed_files)

    for idx, community_nodes in enumerate(clustering.communities):
        community_text = get_community_text(
            community_nodes, reference_parsed_folders, reference_parsed_files
        )
        write_community_text(result_path, idx, community_text)
def execute_item(self, item):
    dest = (
        US_REG_CROSSREFERENCE_LOOKUP_PATH
        if self.regulations
        else US_CROSSREFERENCE_LOOKUP_PATH
    )
    yearfiles = [
        os.path.join(US_REFERENCE_PARSED_PATH, x)
        for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
        if str(item) in x
    ]
    if self.regulations:
        yearfiles += [
            os.path.join(US_REG_REFERENCE_PARSED_PATH, x)
            for x in list_dir(US_REG_REFERENCE_PARSED_PATH, ".xml")
            if str(item) in x
        ]

    data = []
    for file in yearfiles:
        with open(file, encoding="utf8") as f:
            file_elem = lxml.etree.parse(f)
            for node in file_elem.xpath("//*[@citekey]"):
                data.append([node.attrib["key"], node.attrib["citekey"]])

    df = pd.DataFrame(data, columns=["key", "citekey"])
    destination_file = f"{dest}/{get_filename(item)}"
    df.to_csv(destination_file, index=False)
def get_items(self, overwrite) -> list:
    # Create target folder
    ensure_exists(US_REG_XML_PATH)

    # Get source files
    years = sorted(
        [
            f
            for f in os.listdir(US_REG_ORIGINAL_PATH)
            if os.path.isdir(os.path.join(US_REG_ORIGINAL_PATH, f))
        ]
    )

    if not overwrite:
        existing_files = set(list_dir(US_REG_XML_PATH, ".xml"))
        years = [f for f in years if item_not_complete(f, existing_files)]

    return years
def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(self.dest)
    if not snapshots:
        snapshots = sorted(
            set([os.path.splitext(x)[0] for x in list_dir(self.lookup, ".csv")])
        )

    if not overwrite:
        existing_files = os.listdir(self.dest)
        snapshots = list(
            filter(lambda f: get_filename(f) not in existing_files, snapshots)
        )

    return snapshots
def get_config_clustering_files(config, source_folder):
    """
    Get all clusterings for a given config (multiple snapshots to be mapped).
    :return: filenames, snapshots
    """
    existing_clustering = set(list_dir(source_folder, ".json"))
    config_filename_part = filename_for_pp_config(
        snapshot="", **config, file_ext=".json"
    )
    config_clustering_files = sorted(
        [x for x in existing_clustering if x.endswith(config_filename_part)]
    )
    snapshots = sorted(
        [
            config_clustering_file.split("_")[0]
            for config_clustering_file in config_clustering_files
        ]
    )
    return config_clustering_files, snapshots
def cd_cluster_texts_prepare(
    overwrite, snapshots, pp_configs, source_folder, target_folder
):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source clusterings exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(**item, file_ext=source_file_ext) for item in items
    }
    check_for_missing_files(required_source_files, existing_source_files, "clustering")

    if not overwrite:
        existing_files = os.listdir(target_folder)
        items = get_no_overwrite_items(items, "", existing_files)

    return items
def get_items(self, overwrite, snapshots) -> list:
    dest = (
        US_REG_CROSSREFERENCE_LOOKUP_PATH
        if self.regulations
        else US_CROSSREFERENCE_LOOKUP_PATH
    )
    ensure_exists(dest)

    # If snapshots not set, create list of all years
    if not snapshots:
        snapshots = sorted(
            set(
                [
                    x.split(".")[0].split("_")[-1]
                    for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
                ]
            )
        )

    if not overwrite:
        existing_files = os.listdir(dest)
        snapshots = list(
            filter(lambda f: get_filename(f) not in existing_files, snapshots)
        )

    return snapshots
def get_texttags_to_compare(snapshot, source_texts, law_names_data, dataset):
    if dataset == "us":
        if type(source_texts) is str:
            source_texts = [source_texts]

        files = sorted(
            [
                os.path.join(source_text, x)
                for source_text in source_texts
                for x in list_dir(source_text, ".xml")
                if x.split(".")[0].split("_")[-1] == snapshot
            ]
        )
    else:  # is DE
        assert type(source_texts) is str
        files = get_snapshot_law_list(snapshot, law_names_data)
        files = [os.path.join(source_texts, f) for f in files]

    whitespace_pattern = regex.compile(r"[\s\n]+")

    for file in files:
        tree = etree.parse(file)
        for text_tag in tree.xpath("//text"):
            item = text_tag.getparent()
            text_elems = [e for e in item.getchildren() if e.tag == "text"]
            pos_in_item = text_elems.index(text_tag)
            text_key = item.attrib["key"] + f"_{pos_in_item}"

            seqitem = get_seqitem(item)
            if seqitem is not None:
                citekey = seqitem.attrib.get("citekey")
            else:
                citekey = None

            text = etree.tostring(text_tag, method="text", encoding="utf8").decode(
                "utf-8"
            )
            text = whitespace_pattern.sub(" ", text).lower().strip()

            yield text_key, citekey, text
def hierarchy():
    ensure_exists(DE_DECISIONS_HIERARCHY)
    decisions = list_dir(DE_DECISIONS_XML, ".xml")
    with multiprocessing.Pool() as p:
        p.map(extract_hierarchy, decisions)
def get_items(self, overwrite, snapshots) -> list:
    ensure_exists(self.destination + "/seqitems")
    if not snapshots:
        snapshots = sorted(
            set(
                [
                    os.path.splitext(x)[0]
                    for x in list_dir(self.edgelist_folder, ".csv")
                ]
            )
        )

    if not overwrite:
        existing_files = list_dir(
            os.path.join(self.destination, "seqitems"), ".gpickle.gz"
        )
        snapshots = list(
            filter(lambda year: f"{year}.gpickle.gz" not in existing_files, snapshots)
        )

    if not len(snapshots):
        return []

    if self.dataset == "us":
        files = []
        for snapshot in snapshots:
            statute_files = [
                f"{self.source}/subseqitems/{x}"
                for x in os.listdir(os.path.join(self.source, "subseqitems"))
                if str(snapshot) in x
            ]
            regulation_files = (
                [
                    f"{self.source_regulation}/subseqitems/{x}"
                    for x in os.listdir(
                        os.path.join(self.source_regulation, "subseqitems")
                    )
                    if str(snapshot) in x
                ]
                if self.regulations
                else None
            )
            files.append(
                (
                    snapshot,
                    statute_files,
                    regulation_files,
                )
            )
    else:  # is DE
        files = []
        law_names_data = load_law_names(self.regulations)
        for snapshot in snapshots:
            graph_files = get_snapshot_law_list(snapshot, law_names_data)
            files.append(
                (
                    snapshot,
                    [
                        f'{self.source}/subseqitems/{x.replace(".xml", ".gpickle")}'
                        for x in graph_files
                    ],
                    None,
                )
            )

    return files
def create_cluster_volume_df(country, n_clusters):
    cluster_result_path = get_cluster_result_path(country)
    cluster_result_files = [
        f"{cluster_result_path}/{f}"
        for f in list_dir(cluster_result_path, f"_n{n_clusters}_m1-0_s0_c1000.json")
        if int(f[:4]) in YEARS
    ]

    cluster_evolution_path = get_cluster_evolution_path(country)
    cluster_evolution_file = [
        f"{cluster_evolution_path}/{f}"
        for f in list_dir(
            cluster_evolution_path,
            f"_n{n_clusters}_m1-0_s0_c1000.families.json",
        )
    ][0]

    crossreference_path = get_crossreference_path(country)
    nodefiles, _ = get_node_and_edge_files(crossreference_path, YEARS)

    cluster_families = {
        idx: sorted(
            # this needs to match the -MM-DD part of the DE snapshots
            [x.replace("-12-31", "") for x in content]
        )
        for idx, content in enumerate(load_json(cluster_evolution_file))
    }
    cluster_families_inverted = {
        cluster: idx
        for idx, clusters in cluster_families.items()
        for cluster in clusters
    }

    cluster_results = {
        int(fn.split("/")[-1][:4]): {  # the key is the snapshot year
            idx: content for idx, content in enumerate(load_json(fn)["communities"])
        }
        for fn in cluster_result_files
    }

    cluster_volumes = pd.DataFrame(columns=["statute", "regulation"])
    for year, file in zip(YEARS, nodefiles):
        print(f"Starting {year}...", end="\r")
        assert file.startswith(str(year))
        nodes = pd.read_csv(
            f"{crossreference_path}/{file}",
            usecols=["key", "document_type", "tokens_n"],
            low_memory=True,
            skiprows=[1],  # the global root is at position 1 and we don't need it
        ).set_index("key")
        clusters = cluster_results[year]
        dfs = []
        for idx in list(clusters.keys()):
            dfs.append(
                nodes.loc[clusters[idx]]
                .groupby("document_type")
                .sum()
                .T.rename(dict(tokens_n=f"{year}_{idx}"))
            )
        cluster_volumes = (
            pd.concat([cluster_volumes, *dfs], ignore_index=False)
            .fillna(0)
            .astype(int)
        )

    cluster_volumes["year"] = cluster_volumes.index.map(lambda x: int(x[:4]))
    cluster_volumes["total"] = cluster_volumes.statute + cluster_volumes.regulation
    cluster_volumes["family"] = cluster_volumes.index.map(cluster_families_inverted)
    cluster_family_volumes = (
        cluster_volumes.groupby(["family", "year"]).sum().reset_index()
    )
    return cluster_family_volumes
def get_items(self) -> list:
    src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH
    files = list_dir(src, ".xml")
    return files
def clean():
    ensure_exists(DE_DECISIONS_XML)
    decisions = list_dir(DE_DECISIONS_DOWNLOAD_XML, ".xml")
    with multiprocessing.Pool() as p:
        p.map(clean_decision, decisions)