def main():
    """Main execution steps
    Reads arguments, fixes random seeding, executes the HOPE model
    and saves resulting embeddings.
    """
    args = parse_args()

    # Set random seed if specified
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)
    # numpy seeds need to be in [0, 2^32-1]
    args.seed = [
        np.random.randint(4294967296 - 1) for i in range(args.num_embeddings)
    ]

    # Parse graph data
    graph_name, graph = parse_graph(args.dataset, args.largest_cc)
    graph = graph.to_directed()

    # Compute embeddings
    # Use the requested dimensionality (the metadata saved below records args.dimensions)
    model = HOPE(d=args.dimensions, beta=args.beta)

    print("Num nodes: %d, num edges: %d" %
          (graph.number_of_nodes(), graph.number_of_edges()))
    times = []
    for i in range(args.num_embeddings):
        t1 = time()

        # Set the seed before learning
        np.random.seed(args.seed[i])
        random.seed(args.seed[i])

        Y, t = model.learn_embedding(graph=graph,
                                     edge_f=None,
                                     is_weighted=True,
                                     no_python=True)
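        # Y is the learned node-embedding matrix (one row per node); t is the
        # training time reported by the model itself. The wall-clock time of
        # this run is measured separately via the surrounding time() calls.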
        times.append(time() - t1)

        # save embedding
        file_path = (f"{args.outputdir}/hope_{graph_name}_" f"{i:03d}.emb")
        print(f"Saving embedding to {file_path}")
        save_embedding(
            Y, file_path, {
                "algorithm": "hope",
                "dimension": args.dimensions,
                "beta": args.beta,
                "seed": args.seed[i],
            })

    print(f"{model._method_name}\n\tAverage training time: "
          f"{sum(times) / len(times):f}")
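A minimal usage sketch (not part of the original example): load one of the embeddings saved above and sanity-check its shape. It assumes the project's read_embedding helper behaves as in the later examples, i.e. it takes an open file handle and returns a numpy array; outputdir and graph_name stand in for the values used inside main().

# Hedged read-back sketch for a saved HOPE embedding.
with open(f"{outputdir}/hope_{graph_name}_000.emb", "r") as f:
    emb = read_embedding(f)
print(emb.shape)  # expected: (number_of_nodes, embedding_dimension)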
Example #2
                                "random_forest"
                            ],
                            choices=[
                                "adaboost", "decision_tree", "neural_network",
                                "random_forest"
                            ])
    arg_parser.add_argument(
        "-n",
        default=10,
        type=int,
        help=
        "How often the classifier should be trained. The performances of these different runs are then compared."
    )
    args = arg_parser.parse_args()

    _, graph = parse_graph(args.dataset)
    node_labels, _ = prepare_classification(graph)
    del graph  # free memory

    for embedding_file in args.embeddings:
        embedding_name = embedding_file.name.split("/")[-1].split(".")[0]
        print("Start embedding", embedding_name)

        embedding = read_embedding(embedding_file)
        node_labels_train, node_labels_test, embedding_train, embedding_test = train_test_split(
            node_labels, embedding)

        for clf in args.classifiers:
            print("Start classifier", clf)

            # We train n classifiers on the same embedding and always use the same
Example #3
                            ],
                            default=[
                                "adaboost", "decision_tree", "neural_network",
                                "random_forest"
                            ])
    arg_parser.add_argument("-splits", default=10, type=int)
    arg_parser.add_argument("-repeats", default=10, type=int)
    arg_parser.add_argument(
        "-processes",
        type=int,
        help=
        "Number of processes to use in parallel, defaults to number of cpus.")

    args = arg_parser.parse_args()

    graph_name, graph = parse_graph(args.dataset)
    node_labels, distinct_node_labels = prepare_classification(graph)
    del graph  # free memory

    files = {}
    for filename in args.embeddings:
        with open(filename, "r") as file:
            node_embeddings = read_embedding(file)

        files[filename] = {
            "embedding_name": os.path.basename(filename).split(".")[0],
            "node_embeddings": node_embeddings,
            "node_labels": node_labels,
            "distinct_node_labels": distinct_node_labels
        }
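The snippet stops after building the per-file dictionary. Below is a minimal, hypothetical sketch of how such a dictionary might be consumed in parallel; the worker evaluate_file and its body are assumptions and not part of the original code, only the -processes argument above is (Pool(processes=None) falls back to the CPU count, matching the help text).

import multiprocessing

def evaluate_file(entry):
    # Hypothetical worker: evaluate the classifiers on one embedding using
    # entry["node_embeddings"] and entry["node_labels"].
    return entry["embedding_name"]

with multiprocessing.Pool(processes=args.processes) as pool:
    results = pool.map(evaluate_file, files.values())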
Example #4
    arg_parser.add_argument("-q", type=float, default=1.0)

    args = arg_parser.parse_args()

    if args.seed is not None:
        assert "PYTHONHASHSEED" in os.environ, (
            "Execution is only deterministic if (next to the -seed parameter) "
            "the environment variable PYTHONHASHSEED is set! "
            "Either remove -seed or set PYTHONHASHSEED")
        print(f"Setting seed to {args.seed}")
        # Set initial seeds
        random.seed(args.seed)
        np.random.seed(args.seed)

    # Get a list of seeds for 'num_embeddings' many embeddings
    args.seed = [np.random.randint(4294967296 - 1) for i in range(args.num_embeddings)]

    for dataset in args.dataset:
        graph_name, graph = parse_graph(dataset, args.largest_cc)

        node2vec_graph = Graph(graph, args.p, args.q, not args.no_alias)

        for i in range(args.num_embeddings):
            print("Generating embedding", i)
            embedding = generate(node2vec_graph, args.dimensions, args.walk_length, args.walk_number, seed=args.seed[i])
            if args.output and args.num_embeddings == 1:
                file_name = args.output
            else:
                file_name = (os.path.dirname(os.path.abspath(__file__)) +
                             f"/results/node2vec_{graph_name}_{i}.emb")
            print("Save embedding at", file_name)
            save_embedding(embedding, file_name, {
                "algorithm": "node2vec",
                "walk-length": args.walk_length,
                "walk-number": args.walk_number,
Example #5
def main():
    # Load arguments
    args = parse_args()

    # Parse graph data
    print(f"Parsing graph dataset from {args.dataset}")
    graph_name, graph = parse_graph(args.dataset, args.largest_cc)
    print(
        f"{graph_name} has {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges"
    )
    # Create an undirected copy of the graph for the first-order embedding.
    # A directed graph changes under to_undirected(), whereas an undirected
    # graph stays the same.
    if not graph.is_directed():
        print("Graph is undirected")
        graph = graph.to_directed()
        undirected_graph = graph
    else:
        print("Graph is directed")
        undirected_graph = graph.to_undirected()
        undirected_graph = undirected_graph.to_directed()
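    # Note: both copies end up directed so that every undirected edge is
    # written in both directions in the edge lists below, which is how the
    # LINE binaries are usually fed (assumption, not stated in the original
    # code).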

    node_count = len(graph)

    # Densify graph
    if args.threshold is None:
        # calculate the threshold from graph characteristics
        print("Calculating threshold...")
        avg_deg = 2 * graph.number_of_edges() / graph.number_of_nodes()
        args.threshold = max(0, int(avg_deg))
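        # Worked example (illustrative numbers only): a graph with 1,000 nodes
        # and 5,000 edges has avg_deg = 2 * 5000 / 1000 = 10, so the densify
        # threshold becomes 10.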
        print("nodes", node_count)
        print("threshold", args.threshold)
    if os.path.isdir(args.dataset):
        path = args.dataset
    else:
        path = os.path.split(args.dataset)[0]
    if args.densify:
        print("Densifying graph...")
        nx_to_weighted_edgelist(graph, f"{path}/{graph_name}.Edgelist")
        subprocess.call(
            f"{args.executables}/reconstruct -train {path}/{graph_name}.Edgelist "
            f"-output {path}/{graph_name}.denseEdgelist -depth 2 -threshold {args.threshold}",
            shell=True)
        nx_to_weighted_edgelist(undirected_graph,
                                f"{path}/{graph_name}_undir.Edgelist")
        subprocess.call(
            f"{args.executables}/reconstruct -train {path}/{graph_name}_undir.Edgelist "
            f"-output {path}/{graph_name}_undir.denseEdgelist -depth 2 -threshold {args.threshold}",
            shell=True)

    # Handle seed
    if args.seed is None:
        # srand(0) --> random seed for every iteration
        args.seed = [0 for i in range(args.num_embeddings)]
    else:
        np.random.seed(args.seed)
        args.seed = [
            np.random.randint(4294967296 - 1)
            for i in range(args.num_embeddings)
        ]

    # Learn embeddings
    if args.samples is None:
        args.samples = max(node_count // 1000, 1)
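        # This value is handed to the LINE binary's -samples flag, which the
        # reference LINE implementation interprets in millions of edge
        # samples (assumption); i.e. roughly one million samples per 1,000
        # nodes here.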
    timings = []
    for i in range(args.num_embeddings):
        t1 = time()
        # 1st order
        subprocess.call(
            f"{args.executables}/line -train {path}/{graph_name}_undir.denseEdgelist "
            f"-output {args.outputdir}/line_1st_order_{graph_name}_{i:03d} -binary 1 -size {args.dimensions//2} "
            f"-order 1 -negative {args.negative_samples} -samples {args.samples} -threads {args.num_threads}"
            f"-srand {args.seed[i]}",
            shell=True)
        file_path_1 = f"{args.outputdir}/line_1st_order_norm_{graph_name}_{i:03d}.emb"
        subprocess.call(
            f"{args.executables}/normalize -input {args.outputdir}/line_1st_order_{graph_name}_{i:03d} "
            f"-output {file_path_1} -binary 0",
            shell=True)

        # 2nd order
        subprocess.call(
            f"{args.executables}/line -train {path}/{graph_name}.denseEdgelist "
            f"-output {args.outputdir}/line_2nd_order_{graph_name}_{i:03d} -binary 1 -size {args.dimensions//2} "
            f"-order 2 -negative {args.negative_samples} -samples {args.samples} -threads {args.num_threads}"
            f"-srand {args.seed[i]}",
            shell=True)
        file_path_2 = f"{args.outputdir}/line_2nd_order_norm_{graph_name}_{i:03d}.emb"
        subprocess.call(
            f"{args.executables}/normalize -input {args.outputdir}/line_2nd_order_{graph_name}_{i:03d} "
            f"-output {file_path_2} -binary 0",
            shell=True)

        # add meta information
        param_lines = create_param_lines({
            "node_count": node_count,
            "embedding_dimension": f"{args.dimensions//2}",
        })
        prepend_param_lines(file_path_1, param_lines)
        prepend_param_lines(file_path_2, param_lines)

        # Concatenate embeddings
        print("Concatenating embeddings...")
        with open(file_path_1, "r") as f:
            first_order = read_embedding(f)
        with open(file_path_2, "r") as f:
            second_order = read_embedding(f)
        concat = np.concatenate((first_order, second_order), axis=1)
        save_embedding(
            concat, f"{args.outputdir}/line_{graph_name}_{i:03d}.emb", {
                "algorithm": "line",
                "order": "first+second",
                "negative_samples": args.negative_samples,
                "samples": args.samples,
                "threshold": args.threshold,
                "densify": args.densify,
                "node_count": node_count,
                "embedding_dimension": args.dimensions,
                "threads": args.num_threads,
                "seed": args.seed[i],
                "comment":
                "1st-order-and-2nd-order-get-the-same-dimensionality"
            })
        timings.append(time() - t1)

        # Clean up
        subprocess.call(
            f'rm {args.outputdir}/line_1st_order_{graph_name}_{i:03d}',
            shell=True)
        subprocess.call(
            f'rm {args.outputdir}/line_2nd_order_{graph_name}_{i:03d}',
            shell=True)
        subprocess.call(
            f'rm {args.outputdir}/line_1st_order_norm_{graph_name}_{i:03d}.emb',
            shell=True)
        subprocess.call(
            f'rm {args.outputdir}/line_2nd_order_norm_{graph_name}_{i:03d}.emb',
            shell=True)

    # Clean up
    subprocess.call(f"rm {path}/{graph_name}.Edgelist", shell=True)
    subprocess.call(f"rm {path}/{graph_name}_undir.Edgelist", shell=True)

    print(f"\nDone!\nAverage training time: {sum(timings)/len(timings)}")
Example #6
import argparse
import random
from pickle import load  # assumed: the .pickle files below are read via pickle.load

import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument("graph", type=str)
parser.add_argument("--use-already-shrunken-graph", action="store_true")
parser.add_argument("--relabel-nodes", action="store_true")
args = parser.parse_args()

# just to have a fixed seed
np.random.seed(0)
random.seed(0)

graph_name = args.graph
_, graph = parse_graph(f'./graphs/{graph_name}/', largest_cc=True)

if args.relabel_nodes:
    mapping = dict(zip(graph.nodes(), range(graph.number_of_nodes())))
    with open(f'./graphs/{graph_name}/{graph_name}_reduced_edges.pickle',
              'rb') as file:
        true_edges = load(file)
    with open(f'./graphs/{graph_name}/{graph_name}_reduced_nonedges.pickle',
              'rb') as file:
        false_edges = load(file)
    with open(
            f'./graphs/{graph_name}/{graph_name}_reduced_nonedges_train.pickle',
            'rb') as file:
        false_edges_train = load(file)
    with open(f'./graphs/{graph_name}/{graph_name}_reduced_edges_train.pickle',
              'rb') as file:
Example #7
def neighbor_variance(graphdir, k, embedding_dir, embedding_list, dataset, nodes_dict=None, seed=None):
    """ 
    Compute similarity from nodes to specific kind of neighbors: 1-hop, 2-hop neighbors and 
    distant nodes (more than 2-Hops distance). The nodes will be sampled from the graph. Since some nodes
    may not have all required kinds of neighbors, the actual size of the experiment might be lower than expected.

    Args:
        graphdir: str, path to graph file (e.g. edgelist)
        k: int, number of nodes for which the similarities will be computed
        embedding_dir: str, path to directory of saved embeddings
        embedding_list: list, list of all embedding names that will be considered
        dataset: str, graph identifier
        nodes_dict: dict (optional), a dict of 3-tuples or lists of node ids. The tuples are ordered as 1-hop,
                    2-hop, distant. The keys are the origin node corresponding to the tuple. No nodes will be sampled.
        seed: int, sets the random seed for numpy

    Returns:
        agg_results: numpy array of size k x 3. Every row correponds to one sampled node and its mean 
            similarities to its pair nodes.
        DataFrame: pandas DataFrame that hold information about the dataset, algorithm, node and similarity 
            measure to a neighbor node. (neighbor_type: 0 -> 1-neighbor, 1 -> 2-neighbor, 2 -> distant node)
        di: dict with sampled nodes as keys and corresponding lists of 1-neighbor, 2-neighbor and distant node. 
            An entry is None if there could not be found a node with the correct characteristics.
    """

    if seed is not None:
        np.random.seed(seed)

    if nodes_dict is None:
        graph_name, graph = parse_graph(graphdir)

        # Sample k nodes if not specified by arguments
        vertices = np.array(list(graph))
        np.random.shuffle(vertices)
        nodes = vertices[:min(k, graph.number_of_nodes())].copy()
        
        # Find 1-hop, 2-hop and distant nodes for every sampled node
        di = defaultdict(list)
        for node in nodes:
            # 1-hop: sample neighbor that is not the node itself
            one_neighbors = list(graph.neighbors(node))
            if node in one_neighbors:
                one_neighbors.remove(node)
            # If there is no neighbor, just skip the node
            if not one_neighbors:
                di[node].extend([None, None, None])
                continue
            else:
                di[node].append(np.random.choice(one_neighbors))
            # 2-hop: sample neighbor; from there another node, that has no edge to the origin
            one_neighbor = np.random.choice(one_neighbors)
            one_neighbors.append(node)
            two_n = None
            for two_n_candidate in graph.neighbors(one_neighbor):
                if two_n_candidate not in one_neighbors:
                    two_n = two_n_candidate
                    break
            di[node].append(two_n)
            # distant node: sample random node, compute shortest path, if length is more than 2 accept the sample
            np.random.shuffle(vertices)
            distant_node = None
            for distant_node_candidate in vertices:
                try:
                    if nx.shortest_path_length(graph, distant_node_candidate, node) > 2:
                        distant_node = distant_node_candidate
                        break
                except nx.NetworkXNoPath:
                    distant_node = distant_node_candidate
                    break
            di[node].append(distant_node)
            assert len(di[node]) == 3, f"{node}: {di[node]}"
    else:
        di = nodes_dict
        nodes = np.asarray(list(nodes_dict.keys()))
    
    # Aggregate all node for comparison
    cossim_nodes = []
    for origin_node, node_list in di.items():
        cossim_nodes.append(origin_node)
        cossim_nodes.extend(node_list)
    cossim_nodes = sorted({node for node in cossim_nodes if node is not None})
    array_map = {node: i for i, node in enumerate(cossim_nodes)}

    # Start Comparison.cosine_similarity with selected nodes
    comparison = Comparison(embedding_dir, embedding_list)
    comp_result = comparison.cosine_similarity(cossim_nodes)["sims"]

    # Analyse the result
    # Create a matrix, with columns 1-hop, 2-hop, distant neighbor;
    # rows are nodes from the sample; in every cell the mean is noted
    agg_results = np.empty((len(nodes), 3))
    # Construct a DataFrame that holds the detailed results
    # DataFrame has columns: node, neighbor_type, similarity, algorithm, dataset
    df_dict = defaultdict(list)
    algorithm = embedding_list[0].split("_")[0]
    
    for index, node in enumerate(nodes):
        nr = {0: [], 1: [], 2: []}
        for i, v in enumerate(di[node]):
            if v is None:
                nr[i] = [np.nan]
                df_dict["node"].append(node)
                df_dict["neighbor_type"].append(i)
                df_dict["algorithm"].append(algorithm)
                df_dict["dataset"].append(dataset)
                df_dict["similarity"].append(np.nan)
                continue
            for arr in comp_result.values():
                nr[i].append(arr[array_map[node], array_map[v]])
                df_dict["node"].append(node)
                df_dict["neighbor_type"].append(i)
                df_dict["algorithm"].append(algorithm)
                df_dict["dataset"].append(dataset)
                df_dict["similarity"].append(arr[array_map[node], array_map[v]])
        agg_results[index, :] = [np.mean(nr[0]), np.mean(nr[1]), np.mean(nr[2])]

    # Remove rows with NaNs
    agg_results = agg_results[~np.isnan(agg_results).any(axis=1)]
    print(f"Removed {len(nodes) - len(agg_results)} nodes from the sample due to them not having correct neighbors.")
    return agg_results, pd.DataFrame(df_dict), di
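A minimal usage sketch for neighbor_variance (paths, file names and the dataset name are placeholders, not taken from the original code):

# Hypothetical call: sample 100 nodes from an edge-list graph and compare
# their 1-hop / 2-hop / distant similarities across five saved embeddings.
agg, detail_df, sampled_nodes = neighbor_variance(
    graphdir="./graphs/example/",
    k=100,
    embedding_dir="./results/",
    embedding_list=[f"hope_example_{i:03d}.emb" for i in range(5)],
    dataset="example",
    seed=42)
# Rows of agg are sampled nodes; columns are mean similarities to the 1-hop,
# 2-hop and distant partner. detail_df has one row per comparison.
print(detail_df.groupby("neighbor_type")["similarity"].mean())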