def cd_preprocessing_prepare(
    overwrite, snapshots, pp_configs, source_folder, target_folder
):
    ensure_exists(target_folder)
    items = [
        dict(
            snapshot=snapshot,
            pp_ratio=pp_ratio,
            pp_decay=pp_decay,
            pp_merge=pp_merge,
            pp_co_occurrence=pp_co_occurrence,
            pp_co_occurrence_type=pp_co_occurrence_type,
        )
        for snapshot in snapshots
        for pp_ratio in pp_configs["pp_ratios"]
        for pp_decay in pp_configs["pp_decays"]
        for pp_merge in pp_configs["pp_merges"]
        for pp_co_occurrence in pp_configs["pp_co_occurrences"]
        for pp_co_occurrence_type in pp_configs["pp_co_occurrence_types"]
    ]

    # Check if source graphs exist
    existing_source_files = set(list_dir(f"{source_folder}/seqitems", ".gpickle.gz"))
    required_source_files = {f"{snapshot}.gpickle.gz" for snapshot in snapshots}
    check_for_missing_files(required_source_files, existing_source_files, "graphs")

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items
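# The prepare functions in these examples share a pattern: enumerate work
# items, check that the required source files exist, and (unless overwrite is
# set) drop items whose output is already present. The helpers are not shown
# here; a minimal sketch of how check_for_missing_files and
# get_no_overwrite_items could behave, assuming filename_for_pp_config maps an
# item dict plus a file extension to its output filename:
def check_for_missing_files(required_files, existing_files, description):
    # Hypothetical sketch, not the repository's implementation.
    missing = set(required_files) - set(existing_files)
    if missing:
        raise FileNotFoundError(
            f"Missing {description}: {', '.join(sorted(missing))}")


def get_no_overwrite_items(items, file_ext, existing_files):
    # Hypothetical sketch: keep only items whose target file does not exist yet.
    existing = set(existing_files)
    return [
        item for item in items
        if filename_for_pp_config(**item, file_ext=file_ext) not in existing
    ]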
Example #2
def cd_cluster_prepare(overwrite, snapshots, pp_configs, source_folder,
                       target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(
            **{
                **item,
                "seed": None,
                "markov_time": None,
                "number_of_modules": None,
                "consensus": None,
                "method": None,
            },
            file_ext=source_file_ext,
        )
        for item in items
    }
    check_for_missing_files(required_source_files, existing_source_files,
                            "preprocessed graphs")

    if not overwrite:
        existing_files = list_dir(target_folder, target_file_ext)
        items = get_no_overwrite_items(items, target_file_ext, existing_files)

    return items
def cd_cluster_evolution_inspection_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    crossreference_graph_folder,
    target_folder,
):
    ensure_exists(target_folder)

    configs = get_configs(cluster_mapping_configs)

    existing_files = set(list_dir(target_folder, ".htm"))
    if not overwrite:
        configs = [
            config for config in configs
            if filename_for_pp_config(snapshot="all",
                                      **config,
                                      file_ext=".htm") not in existing_files
        ]
    if configs:
        global cd_cluster_evolution_inspection_graphs
        cd_cluster_evolution_inspection_graphs = {
            f[:-len(".gpickle.gz")]: hierarchy_graph(
                nx.read_gpickle(os.path.join(crossreference_graph_folder, f)))
            for f in list_dir(crossreference_graph_folder, ".gpickle.gz")
        }

    return configs
def cd_cluster_evolution_graph_prepare(
    overwrite,
    cluster_mapping_configs,
    source_folder,
    snaphot_mapping_folder,
    subseqitem_mapping_folder,
    target_folder,
):
    ensure_exists(target_folder)
    configs = get_configs(cluster_mapping_configs)

    # Check if clusterings exist
    for config in configs:
        config_clustering_files, snapshots = get_config_clustering_files(
            config, source_folder)

        mapping_files = list_dir(snaphot_mapping_folder, ".json")
        check_mapping_files(mapping_files, snapshots, config, ".json")

        mapping_files = list_dir(subseqitem_mapping_folder, ".pickle")
        check_mapping_files(mapping_files, snapshots, config, ".pickle")

    existing_files = set(list_dir(target_folder, ".gpickle.gz"))
    if not overwrite:
        configs = get_configs_no_overwrite(configs, existing_files)

    return configs
    def execute_item(self, item):
        yearfiles = [
            os.path.join(US_REFERENCE_PARSED_PATH, x)
            for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
            if str(item) in x
        ]
        if self.regulations:
            yearfiles += [
                os.path.join(US_REG_REFERENCE_PARSED_PATH, x)
                for x in list_dir(US_REG_REFERENCE_PARSED_PATH, ".xml")
                if str(item) in x
            ]

        key_df = pd.read_csv(f"{self.lookup}/{item}.csv").dropna().set_index(
            "citekey")
        key_dict = {}
        for idx, val in key_df.key.items():
            if idx not in key_dict:
                key_dict[idx] = val
        edge_list = []
        for yearfile_path in yearfiles:
            edge_list_file = self.make_edge_list(yearfile_path, key_dict)
            edge_list.extend(edge_list_file)
        if edge_list:
            df = pd.DataFrame(edge_list, columns=["out_node", "in_node"])
            df.to_csv(f"{self.dest}/{item}.csv", index=False)
Example #6
def cd_cluster_evolution_mappings_prepare(overwrite, cluster_mapping_configs,
                                          source_folder, target_folder,
                                          snapshots):
    ensure_exists(target_folder)

    subseqitems_snapshots = [
        f.split(".")[0] for f in list_dir(f"{source_folder}/", ".edges.csv.gz")
    ]  # fix

    if snapshots:
        subseqitems_snapshots = [
            s for s in subseqitems_snapshots if s in snapshots
        ]

    # get configs
    mappings = [
        dict(
            pp_merge=pp_merge,
            snapshot=subseqitems_snapshot,
        ) for pp_merge in cluster_mapping_configs["pp_merges"]
        for subseqitems_snapshot in subseqitems_snapshots
    ]

    existing_files = set(list_dir(target_folder, ".pickle"))
    if not overwrite:
        mappings = [
            mapping for mapping in mappings
            if filename_for_mapping(mapping) not in existing_files
        ]

    return sorted(mappings, key=str)
Example #7
    def get_items(self, overwrite) -> list:
        ensure_exists(self.destination)
        files = list_dir(self.source, ".xml")

        if not overwrite:
            existing_files = list_dir(self.destination, ".gpickle")
            files = list(
                filter(lambda f: get_gpickle_filename(f) not in existing_files,
                       files))

        return files
Example #8
def network():
    decisions = list_dir(DE_DECISIONS_REFERENCE_PARSED_XML, ".xml")
    with multiprocessing.Pool() as p:
        results = p.map(get_graph_data_from_decision, decisions)

    node_dicts = list(itertools.chain.from_iterable([x[0] for x in results]))
    containment_edges = list(
        itertools.chain.from_iterable([x[1] for x in results]))
    reference_edges = list(
        itertools.chain.from_iterable([x[2] for x in results]))

    hierarchy_G = nx.DiGraph()
    hierarchy_G.add_node("root", level=-1, key="root", bipartite="decision")
    hierarchy_G.add_nodes_from([(x["key"], x) for x in node_dicts],
                               bipartite="decision")
    hierarchy_G.add_edges_from(containment_edges, edge_type="containment")

    reference_G = nx.MultiDiGraph(hierarchy_G)
    print("created")
    reference_G.add_nodes_from(sorted({x[-1]
                                       for x in reference_edges}),
                               bipartite="statute")
    print("Statute nodes added")
    reference_G.add_edges_from(reference_edges, edge_type="reference")
    print("Reference edges added")

    reference_weighted_G = multi_to_weighted(reference_G)

    nx.write_gpickle(reference_weighted_G, DE_DECISIONS_NETWORK)
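# multi_to_weighted is used above but not defined in these examples. A minimal
# sketch, assuming it collapses parallel MultiDiGraph edges into single edges
# whose "weight" counts the duplicates (the actual implementation may differ):
def multi_to_weighted(multi_graph):
    weighted = nx.DiGraph()
    weighted.graph.update(multi_graph.graph)
    weighted.add_nodes_from(multi_graph.nodes(data=True))
    for u, v, data in multi_graph.edges(data=True):
        if weighted.has_edge(u, v):
            weighted[u][v]["weight"] += 1
        else:
            weighted.add_edge(u, v, weight=1, edge_type=data.get("edge_type"))
    return weighted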
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(self.destination)
        items = snapshots
        if not overwrite:
            existing_files = list_dir(self.destination, ".pickle")
            items = list(
                filter(lambda x: (x + ".pickle") not in existing_files, items))
        return items
Example #10
def reference_parse_areas(regulations):
    global law_names
    law_names = load_law_names_compiled(regulations)
    ensure_exists(DE_DECISIONS_REFERENCE_AREAS)
    ensure_exists(DE_DECISIONS_REFERENCE_PARSED_XML)
    decisions = list_dir(DE_DECISIONS_HIERARCHY, ".xml")
    with multiprocessing.Pool() as p:
        p.map(find_references, decisions)
Example #11
    def get_items(self, overwrite) -> list:
        src = DE_REG_ORIGINAL_PATH if self.regulations else DE_ORIGINAL_PATH
        dest = DE_REG_XML_PATH if self.regulations else DE_XML_PATH

        ensure_exists(dest)
        files = list_dir(src, ".xml")

        if not overwrite:
            existing_files = list_dir(dest, ".xml")

            # Remove cite_key
            converted_existing_files = [
                f.split("_")[0] + "_" + "_".join(f.split("_")[2:])
                for f in existing_files
            ]
            files = list(
                filter(lambda f: f not in converted_existing_files, files))

        return sorted(files)
Example #12
    def get_items(self, overwrite) -> list:
        src = (DE_REG_REFERENCE_AREAS_PATH
               if self.regulations else DE_REFERENCE_AREAS_PATH)
        dest = (DE_REG_REFERENCE_PARSED_PATH
                if self.regulations else DE_REFERENCE_PARSED_PATH)

        ensure_exists(dest)
        files = list_dir(src, ".xml")

        if not overwrite:
            existing_files = os.listdir(dest)
            files = list(filter(lambda f: f not in existing_files, files))

        copy_xml_schema_to_data_folder()

        return files
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(self.destination)
        items = sorted(list_dir(self.source, ".pickle"))
        items = [i[:-len(".pickle")] for i in items]

        # Create mappings to draw the edges
        mappings = [(file1, file2) for file1, file2 in zip(
            items[:-self.interval], items[self.interval:])]

        if snapshots:
            mappings = list(filter(lambda f: f[0] in snapshots, mappings))

        if not overwrite:
            existing_files = list_dir(self.destination, ".json")
            mappings = list(
                filter(lambda x: mapping_filename(x) not in existing_files,
                       mappings))

        return mappings
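# Worked example for the pairing above (hypothetical snapshot names): with
# items = ["2010", "2011", "2012", "2013"] and self.interval = 1,
# zip(items[:-1], items[1:]) yields ("2010", "2011"), ("2011", "2012"),
# ("2012", "2013"); with self.interval = 2 it yields ("2010", "2012") and
# ("2011", "2013").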
Example #14
    def get_items(self, overwrite) -> list:
        # Create target folder
        ensure_exists(US_XML_PATH)

        # Get source files
        files = list_dir(US_ORIGINAL_PATH, ".htm")

        # Filter appendices
        pattern = re.compile(r"\d+0_\d+\.htm")
        html_files = list(filter(pattern.fullmatch, files))

        # Prevent file overwrite
        if not overwrite:
            existing_files = list_dir(US_XML_PATH, ".xml")
            existing_files_sources = list(
                map(lambda x: x.replace(".xml", ".htm"), existing_files))

            html_files = list(
                filter(lambda f: f not in existing_files_sources, html_files))

        return html_files
Example #15
    def get_items(self, overwrite) -> list:
        src = US_REG_XML_PATH if self.regulations else US_XML_PATH
        dest = (US_REG_REFERENCE_AREAS_PATH
                if self.regulations else US_REFERENCE_AREAS_PATH)
        ensure_exists(dest)
        files = list_dir(src, ".xml")

        if not overwrite:
            existing_files = os.listdir(dest)
            files = list(filter(lambda f: f not in existing_files, files))

        return files
def get_leaf_texts_to_compare(graph_filename, G, source_text, source_text_reg,
                              law_names_data, dataset):
    """
    get text for leaves of a hierarchy graph. Can be seqitem or supseqitem graph.
    Leaves are only seqitems or supseqitems.
    """
    leaf_keys = get_leaves(G)

    snapshot = graph_filename[:-len(".gpickle.gz")]

    if dataset == "us":
        files = [
            os.path.join(source_text, x)
            for x in list_dir(source_text, ".xml")
            if x.split(".")[0].split("_")[-1] == snapshot
        ]
        if source_text_reg:
            files += [
                os.path.join(source_text_reg, x)
                for x in list_dir(source_text_reg, ".xml")
                if x.split(".")[0].split("_")[-1] == snapshot
            ]
        files.sort()
    else:  # is DE
        files = get_snapshot_law_list(snapshot, law_names_data)
        files = [os.path.join(source_text, f) for f in files]

    whitespace_pattern = regex.compile(r"[\s\n]+")
    texts = {}
    for file_idx, file in enumerate(files):
        print(f"\r{file_idx} / {len(files)}", end="")
        soup = create_soup(file)
        tags = soup.find_all(["seqitem", "subseqitem"])
        for tag in tags:
            if tag["key"] in leaf_keys:
                text = tag.get_text(" ")
                text = whitespace_pattern.sub(" ", text).lower().strip()
                texts[tag["key"]] = text
    return texts
Example #17
def cd_cluster_texts(
    config,
    dataset,
    source_folder,
    target_folder,
    reference_parsed_folders,
    regulations,
):
    source_filename_base = filename_for_pp_config(**config, file_ext="")

    clustering = get_clustering_result(
        f"{source_folder}/{source_filename_base}{source_file_ext}",
        dataset,
        graph_type="clustering",
        regulations=regulations,
    )
    result_path = ensure_exists(f"{target_folder}/{source_filename_base}")

    reference_parsed_files = {
        os.path.splitext(f)[0]: f
        for reference_parsed_folder in reference_parsed_folders
        for f in list_dir(reference_parsed_folder, ".xml")
    }
    reference_parsed_files = {
        ("_".join(k.split("_")[:2] +
                  k.split("_")[-1:]) if len(k.split("_")) == 4 else k): f
        for k, f in reference_parsed_files.items()
    }
    assert len([
        file for reference_parsed_folder in reference_parsed_folders
        for file in list_dir(reference_parsed_folder, ".xml")
    ]) == len(reference_parsed_files)

    for idx, community_nodes in enumerate(clustering.communities):
        community_text = get_community_text(community_nodes,
                                            reference_parsed_folders,
                                            reference_parsed_files)
        write_community_text(result_path, idx, community_text)
Example #18
    def execute_item(self, item):
        dest = (US_REG_CROSSREFERENCE_LOOKUP_PATH
                if self.regulations else US_CROSSREFERENCE_LOOKUP_PATH)

        yearfiles = [
            os.path.join(US_REFERENCE_PARSED_PATH, x)
            for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
            if str(item) in x
        ]
        if self.regulations:
            yearfiles += [
                os.path.join(US_REG_REFERENCE_PARSED_PATH, x)
                for x in list_dir(US_REG_REFERENCE_PARSED_PATH, ".xml")
                if str(item) in x
            ]
        data = []
        for file in yearfiles:
            with open(file, encoding="utf8") as f:
                file_elem = lxml.etree.parse(f)
            for node in file_elem.xpath("//*[@citekey]"):
                data.append([node.attrib["key"], node.attrib["citekey"]])
        df = pd.DataFrame(data, columns=["key", "citekey"])
        destination_file = f"{dest}/{get_filename(item)}"
        df.to_csv(destination_file, index=False)
Example #19
    def get_items(self, overwrite) -> list:
        # Create target folder
        ensure_exists(US_REG_XML_PATH)

        # Get source files
        years = sorted([
            f for f in os.listdir(US_REG_ORIGINAL_PATH)
            if os.path.isdir(os.path.join(US_REG_ORIGINAL_PATH, f))
        ])

        if not overwrite:
            existing_files = set(list_dir(US_REG_XML_PATH, ".xml"))
            years = [f for f in years if item_not_complete(f, existing_files)]

        return years
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(self.dest)
        if not snapshots:
            snapshots = sorted(
                set([
                    os.path.splitext(x)[0]
                    for x in list_dir(self.lookup, ".csv")
                ]))

        if not overwrite:
            existing_files = os.listdir(self.dest)
            snapshots = list(
                filter(lambda f: get_filename(f) not in existing_files,
                       snapshots))

        return snapshots
def get_config_clustering_files(config, source_folder):
    """
    get all clusterings for a given config. (Multiple snapshots to be mapped)
    ::return filenames, snapshots
    """
    existing_clustering = set(list_dir(source_folder, ".json"))
    config_filename_part = filename_for_pp_config(snapshot="",
                                                  **config,
                                                  file_ext=".json")
    config_clustering_files = sorted(
        [x for x in existing_clustering if x.endswith(config_filename_part)])
    snapshots = sorted([
        config_clustering_file.split("_")[0]
        for config_clustering_file in config_clustering_files
    ])
    return config_clustering_files, snapshots
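# Illustration of the suffix matching above with hypothetical filenames: if
# filename_for_pp_config(snapshot="", **config, file_ext=".json") returned
# "_config-a.json", clustering files such as "2010-01-01_config-a.json" and
# "2011-01-01_config-a.json" would end with that suffix, and split("_")[0]
# would recover the snapshots "2010-01-01" and "2011-01-01".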
Example #22
def cd_cluster_texts_prepare(overwrite, snapshots, pp_configs, source_folder,
                             target_folder):
    ensure_exists(target_folder)
    items = get_configs_for_snapshots(snapshots, pp_configs)

    # Check if source graphs exist
    existing_source_files = set(list_dir(source_folder, source_file_ext))
    required_source_files = {
        filename_for_pp_config(**item, file_ext=source_file_ext)
        for item in items
    }
    check_for_missing_files(required_source_files, existing_source_files,
                            "clustering")

    if not overwrite:
        existing_files = os.listdir(target_folder)
        items = get_no_overwrite_items(items, "", existing_files)

    return items
Example #23
    def get_items(self, overwrite, snapshots) -> list:
        dest = (US_REG_CROSSREFERENCE_LOOKUP_PATH
                if self.regulations else US_CROSSREFERENCE_LOOKUP_PATH)
        ensure_exists(dest)

        # If snapshots not set, create list of all years
        if not snapshots:
            snapshots = sorted(
                set([
                    x.split(".")[0].split("_")[-1]
                    for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
                ]))

        if not overwrite:
            existing_files = os.listdir(dest)
            snapshots = list(
                filter(lambda f: get_filename(f) not in existing_files,
                       snapshots))

        return snapshots
def get_texttags_to_compare(snapshot, source_texts, law_names_data, dataset):

    if dataset == "us":
        if type(source_texts) is str:
            source_texts = [source_texts]

        files = sorted([
            os.path.join(source_text, x) for source_text in source_texts
            for x in list_dir(source_text, ".xml")
            if x.split(".")[0].split("_")[-1] == snapshot
        ])
    else:  # is DE
        assert type(source_texts) is str
        files = get_snapshot_law_list(snapshot, law_names_data)
        files = [os.path.join(source_texts, f) for f in files]

    whitespace_pattern = regex.compile(r"[\s\n]+")

    for file in files:
        tree = etree.parse(file)
        for text_tag in tree.xpath("//text"):
            item = text_tag.getparent()

            text_elems = [e for e in item.getchildren() if e.tag == "text"]
            pos_in_item = text_elems.index(text_tag)
            text_key = item.attrib["key"] + f"_{pos_in_item}"

            seqitem = get_seqitem(item)
            if seqitem is not None:
                citekey = seqitem.attrib.get("citekey")
            else:
                citekey = None

            text = etree.tostring(text_tag, method="text",
                                  encoding="utf8").decode("utf-8")
            text = whitespace_pattern.sub(" ", text).lower().strip()

            yield text_key, citekey, text
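# Minimal usage sketch for the generator above (arguments are hypothetical):
# it yields one (text_key, citekey, text) triple per <text> tag, e.g.
#
# for text_key, citekey, text in get_texttags_to_compare(
#         "2010", ["us/reference_parsed"], law_names_data=None, dataset="us"):
#     print(text_key, citekey, text[:40])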
def hierarchy():
    ensure_exists(DE_DECISIONS_HIERARCHY)
    decisions = list_dir(DE_DECISIONS_XML, ".xml")

    with multiprocessing.Pool() as p:
        p.map(extract_hierarchy, decisions)
    def get_items(self, overwrite, snapshots) -> list:
        ensure_exists(self.destination + "/seqitems")
        if not snapshots:
            snapshots = sorted(
                set(
                    [
                        os.path.splitext(x)[0]
                        for x in list_dir(self.edgelist_folder, ".csv")
                    ]
                )
            )

        if not overwrite:
            existing_files = list_dir(
                os.path.join(self.destination, "seqitems"), ".gpickle.gz"
            )
            snapshots = list(
                filter(
                    lambda year: f"{year}.gpickle.gz" not in existing_files, snapshots
                )
            )

        if not len(snapshots):
            return []

        if self.dataset == "us":
            files = []
            for snapshot in snapshots:
                statute_files = [
                    f"{self.source}/subseqitems/{x}"
                    for x in os.listdir(os.path.join(self.source, "subseqitems"))
                    if str(snapshot) in x
                ]
                regulation_files = (
                    [
                        f"{self.source_regulation}/subseqitems/{x}"
                        for x in os.listdir(
                            os.path.join(self.source_regulation, "subseqitems")
                        )
                        if str(snapshot) in x
                    ]
                    if self.regulations
                    else None
                )
                files.append(
                    (
                        snapshot,
                        statute_files,
                        regulation_files,
                    )
                )
        else:  # is DE
            files = []
            law_names_data = load_law_names(self.regulations)
            for snapshot in snapshots:
                graph_files = get_snapshot_law_list(snapshot, law_names_data)
                files.append(
                    (
                        snapshot,
                        [
                            f'{self.source}/subseqitems/{x.replace(".xml", ".gpickle")}'
                            for x in graph_files
                        ],
                        None,
                    )
                )

        return files
Example #27
def create_cluster_volume_df(country, n_clusters):
    cluster_result_path = get_cluster_result_path(country)
    cluster_result_files = [
        f"{cluster_result_path}/{f}" for f in list_dir(
            cluster_result_path, f"_n{n_clusters}_m1-0_s0_c1000.json")
        if int(f[:4]) in YEARS
    ]
    cluster_evolution_path = get_cluster_evolution_path(country)
    cluster_evolution_file = [
        f"{cluster_evolution_path}/{f}" for f in list_dir(
            cluster_evolution_path,
            f"_n{n_clusters}_m1-0_s0_c1000.families.json",
        )
    ][0]
    crossreference_path = get_crossreference_path(country)
    nodefiles, _ = get_node_and_edge_files(crossreference_path, YEARS)

    cluster_families = {
        # stripping "-12-31" needs to match the -MM-DD part of the DE snapshots
        idx: sorted([x.replace("-12-31", "") for x in content])
        for idx, content in enumerate(load_json(cluster_evolution_file))
    }
    cluster_families_inverted = {
        cluster: idx
        for idx, clusters in cluster_families.items() for cluster in clusters
    }

    cluster_results = {
        int(fn.split("/")[-1][:4]): {  # the key is the snapshot year
            idx: content
            for idx, content in enumerate(load_json(fn)["communities"])
        }
        for fn in cluster_result_files
    }

    cluster_volumes = pd.DataFrame(columns=["statute", "regulation"])
    for year, file in zip(YEARS, nodefiles):
        print(f"Starting {year}...", end="\r")
        assert file.startswith(str(year))
        nodes = pd.read_csv(
            f"{crossreference_path}/{file}",
            usecols=["key", "document_type", "tokens_n"],
            low_memory=True,
            skiprows=[1],  # the global root is at position 1 and we don't need it
        ).set_index("key")
        clusters = cluster_results[year]
        dfs = []
        for idx in list(clusters.keys()):
            dfs.append(nodes.loc[
                clusters[idx]].groupby("document_type").sum().T.rename(
                    dict(tokens_n=f"{year}_{idx}")))
        cluster_volumes = (pd.concat([cluster_volumes, *dfs],
                                     ignore_index=False).fillna(0).astype(int))

    cluster_volumes["year"] = cluster_volumes.index.map(lambda x: int(x[:4]))
    cluster_volumes["total"] = (cluster_volumes.statute +
                                cluster_volumes.regulation)
    cluster_volumes["family"] = cluster_volumes.index.map(
        cluster_families_inverted)

    cluster_family_volumes = (
        cluster_volumes.groupby(["family", "year"]).sum().reset_index())
    return cluster_family_volumes
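# Usage sketch (hypothetical arguments): the returned DataFrame has one row per
# (family, year) combination with summed token counts per document type, e.g.
#
# cluster_family_volumes = create_cluster_volume_df("de", n_clusters=100)
# print(cluster_family_volumes.head())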
    def get_items(self) -> list:
        src = DE_REG_XML_PATH if self.regulations else DE_XML_PATH
        files = list_dir(src, ".xml")
        return files
Example #29
def clean():
    ensure_exists(DE_DECISIONS_XML)
    decisions = list_dir(DE_DECISIONS_DOWNLOAD_XML, ".xml")
    with multiprocessing.Pool() as p:
        p.map(clean_decision, decisions)