Example #1
    def test_read_write_json(self):
        g = nx.karate_club_graph()

        # Crisp partition (Louvain): plain JSON round-trip.
        communities = algorithms.louvain(g)
        readwrite.write_community_json(communities, "coms.json")
        communities_r = readwrite.read_community_json("coms.json")
        self.assertListEqual(communities.communities,
                             communities_r.communities)
        os.remove("coms.json")

        # Same round-trip with compression enabled.
        communities = algorithms.louvain(g)
        readwrite.write_community_json(communities, "coms.gzip", zip=True)
        communities_r = readwrite.read_community_json("coms.gzip", zip=True)
        self.assertListEqual(communities.communities,
                             communities_r.communities)
        os.remove("coms.gzip")

        # Fuzzy node clustering (FRC-FGSN) serializes the same way.
        communities = algorithms.frc_fgsn(g, 1, 0.5, 3)
        readwrite.write_community_json(communities, "coms.json")
        communities_r = readwrite.read_community_json("coms.json")
        self.assertListEqual(communities.communities,
                             communities_r.communities)
        os.remove("coms.json")

        # Edge clustering (hierarchical link community); also parse the raw
        # JSON string directly.
        communities = algorithms.hierarchical_link_community(g)
        readwrite.write_community_json(communities, "coms.json")
        communities_r = readwrite.read_community_json("coms.json")
        self.assertListEqual(communities.communities,
                             communities_r.communities)

        with open("coms.json") as f:
            cr = f.read()
        readwrite.read_community_from_json_string(cr)
        os.remove("coms.json")
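
The test above round-trips several cdlib clustering types (crisp, fuzzy, and edge-based) through JSON. For reference, a minimal standalone sketch of the same round-trip outside a test class could look like this; the file name is chosen arbitrarily:

import os

import networkx as nx
from cdlib import algorithms, readwrite

# Detect communities on a small example graph.
g = nx.karate_club_graph()
communities = algorithms.louvain(g)

# Write the clustering to JSON and read it back.
readwrite.write_community_json(communities, "communities.json")
restored = readwrite.read_community_json("communities.json")

# The community node lists survive the round-trip unchanged.
assert communities.communities == restored.communities
os.remove("communities.json")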
Example #2
def get_clustering_result(cluster_path,
                          dataset,
                          graph_type,
                          path_prefix="",
                          regulations=False):
    """
    read the clustering result and the respective graph.
    ::param cluster_path: path of the cdlib.readwrite.write_community_json output
    ::param dataset: 'de' or 'us'
    ::param graph_type: 'clustering' for the rolled up graph.
        Other options: subseqitems, seqitems
    """

    filename_base = os.path.splitext(os.path.split(cluster_path)[-1])[0]
    snapshot = filename_base.split("_")[0]

    if graph_type == "clustering":
        config = get_config_from_filename(filename_base)
        graph_filename = filename_for_pp_config(
            **simplify_config_for_preprocessed_graph(config))
        graph_path = path_prefix + (
            (US_REG_CD_PREPROCESSED_GRAPH_PATH if regulations else
             US_CD_PREPROCESSED_GRAPH_PATH) if dataset.lower() == "us" else
            (DE_REG_CD_PREPROCESSED_GRAPH_PATH
             if regulations else DE_CD_PREPROCESSED_GRAPH_PATH))
        graph_path += f"/{graph_filename}"
        G = nx.read_gpickle(graph_path)
    elif graph_type in ["seqitems", "subseqitems"]:
        graph_path = path_prefix + (
            (US_REG_CROSSREFERENCE_GRAPH_PATH if regulations else
             US_CROSSREFERENCE_GRAPH_PATH) if dataset.lower() == "us" else
            (DE_REG_CROSSREFERENCE_GRAPH_PATH
             if regulations else DE_CROSSREFERENCE_GRAPH_PATH))

        graph_path += f"/{graph_type}/{snapshot}.gpickle.gz"
        G = nx.read_gpickle(graph_path)

    else:
        raise Exception(f"graph_type {graph_type} not allowed")

    cluster_dir = path_prefix + (
        (US_REG_CD_CLUSTER_PATH if regulations else US_CD_CLUSTER_PATH)
        if dataset.lower() == "us"
        else (DE_REG_CD_CLUSTER_PATH if regulations else DE_CD_CLUSTER_PATH))
    clustering = readwrite.read_community_json(
        cluster_dir + "/" + os.path.split(cluster_path)[-1])
    clustering.graph = G

    add_communities_to_graph(clustering)

    return clustering
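
The helper above resolves data paths from a few path constants and attaches the matching graph to the clustering. A hypothetical call could look like the sketch below; the cluster file name and path prefix are placeholders that depend on the local data layout and the upstream preprocessing configuration:

# Placeholder paths; real cluster file names encode the snapshot and the
# preprocessing parameters.
clustering = get_clustering_result(
    cluster_path="13_cluster_results/2010_some-config.json",
    dataset="us",
    graph_type="clustering",
    path_prefix="../legal-networks-data/",
    regulations=False,
)
print(len(clustering.communities), "communities on",
      clustering.graph.number_of_nodes(), "nodes")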
def cd_cluster_evolution_graph(
    config,
    source_folder,
    snaphot_mapping_folder,
    subseqitem_mapping_folder,
    target_folder,
    regulations,
):
    config_clustering_files, snapshots = get_config_clustering_files(
        config, source_folder)

    first = True

    B = nx.DiGraph()

    prev_community_id_for_rolled_down = None
    prev_preprocessed_mappings = None
    prev_snapshot = None

    for config_clustering_file, snapshot in zip(config_clustering_files,
                                                snapshots):
        # Add nodes to graph

        # Load this snapshot's clustering ...
        clustering = readwrite.read_community_json(
            os.path.join(source_folder, config_clustering_file))

        # ... and the preprocessed mappings (node sizes, rolled-up item lists).
        with open(
                os.path.join(
                    subseqitem_mapping_folder,
                    f'{snapshot}_{config["pp_merge"]}.pickle',
                ),
                "rb",
        ) as f:
            preprocessed_mappings = pickle.load(f)

        counters_dict = get_cluster_law_names_counting_seqitems(
            preprocessed_mappings, clustering.communities)
        most_common_dict = {
            k: ",".join(
                [f"{elem_k},{count}" for elem_k, count in v.most_common()])
            for k, v in counters_dict.items()
        }
        chars_n_dict = get_community_sizes(
            clustering.communities,
            preprocessed_mappings["chars_n"],
        )
        tokens_n_dict = get_community_sizes(clustering.communities,
                                            preprocessed_mappings["tokens_n"])

        for community_key, community_nodes in enumerate(
                clustering.communities):
            community_nodes_sorted = sorted(
                community_nodes,
                key=lambda n: preprocessed_mappings["tokens_n"].get(n, 0),
                reverse=True,
            )
            for n in community_nodes_sorted:
                assert "," not in n
            B.add_node(
                f"{snapshot}_{community_key}",
                bipartite=snapshot,
                chars_n=chars_n_dict[community_key],
                tokens_n=tokens_n_dict[community_key],
                law_names=most_common_dict[community_key],
                nodes_contained=",".join(community_nodes_sorted),
            )

        # Expand each community from rolled-up nodes to the underlying items ...
        communities_rolled_down = [[
            n for rolled_up_node in community_nodes
            for n in preprocessed_mappings["items_mapping"][rolled_up_node]
        ] for community_nodes in clustering.communities]

        # ... and map every expanded node to its community id.
        community_id_for_rolled_down = {
            n: community_id
            for community_id, nodes in enumerate(communities_rolled_down)
            for n in nodes
        }

        if not first:
            # Load the node mapping between the previous and current snapshot.
            with open(
                    os.path.join(snaphot_mapping_folder,
                                 f"{prev_snapshot}_{snapshot}.json")) as f:
                mapping = json.load(f)

            # draw edges
            edges_tokens_n = defaultdict(int)
            edges_chars_n = defaultdict(int)
            for prev_leaf_and_text_idx, leaf_and_text_idx in mapping.items():
                prev_leaf, prev_text_idx = prev_leaf_and_text_idx.rsplit(
                    "_", 1)
                leaf, text_idx = leaf_and_text_idx.rsplit("_", 1)

                text_idx = int(text_idx)

                try:
                    prev_community_id = prev_community_id_for_rolled_down[
                        prev_leaf]
                except KeyError as err:
                    report_mapping_error(
                        err, prev_preprocessed_mappings["tokens_n"])
                    continue

                try:
                    community_id = community_id_for_rolled_down[leaf]
                except KeyError as err:
                    report_mapping_error(err,
                                         preprocessed_mappings["tokens_n"])
                    continue

                prev_community_name = f"{prev_snapshot}_{prev_community_id}"
                community_name = f"{snapshot}_{community_id}"
                edge = (prev_community_name, community_name)

                if leaf in preprocessed_mappings["texts_tokens_n"]:
                    texts_tokens_n = preprocessed_mappings["texts_tokens_n"][
                        leaf]
                    texts_chars_n = preprocessed_mappings["texts_chars_n"][
                        leaf]
                    tokens_n = texts_tokens_n[text_idx]
                    chars_n = texts_chars_n[text_idx]
                else:
                    assert text_idx == 0
                    tokens_n = preprocessed_mappings["tokens_n"][leaf]
                    chars_n = preprocessed_mappings["chars_n"][leaf]

                # Use the tokens_n and chars_n values of the later year
                edges_tokens_n[edge] += tokens_n
                edges_chars_n[edge] += chars_n

            B.add_edges_from(edges_tokens_n.keys())
            nx.set_edge_attributes(B, edges_tokens_n, "tokens_n")
            nx.set_edge_attributes(B, edges_chars_n, "chars_n")

        first = False
        prev_snapshot = snapshot
        prev_community_id_for_rolled_down = community_id_for_rolled_down
        prev_preprocessed_mappings = preprocessed_mappings

    # Write the full evolution graph covering all snapshots.
    nx.write_gpickle(
        B,
        f"{target_folder}/"
        f'{filename_for_pp_config(snapshot="all", **config, file_ext=".gpickle.gz")}',
    )

    # Write families
    families = cluster_families(B, threshold=0.15)
    path = (
        f"{target_folder}/"
        f'{filename_for_pp_config(snapshot="all", **config, file_ext=".families.json")}'
    )
    with open(path, "w") as f:
        json.dump(families, f)
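
For downstream analysis, the evolution graph and the family assignment written above can be loaded back as in the following sketch; the file names are placeholders standing in for whatever filename_for_pp_config produces for the given config:

import json

import networkx as nx

# Placeholder names; the real ones are derived from the clustering config.
B = nx.read_gpickle("evolution/all_some-config.gpickle.gz")
with open("evolution/all_some-config.families.json") as f:
    families = json.load(f)

# Nodes are named "<snapshot>_<community id>" and carry size and content
# attributes; edges between consecutive snapshots carry tokens_n / chars_n
# attributes for the content that moved between the two communities.
node, data = next(iter(B.nodes(data=True)))
print(node, data["tokens_n"], data["law_names"][:80])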