def main():
    """Main execution steps.

    Reads arguments, fixes random seeding, executes the HOPE model and saves
    the resulting embeddings.
    """
    args = parse_args()

    # Set random seed if specified
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)
        # numpy seeds need to be in [0, 2^32-1]
        args.seed = [
            np.random.randint(4294967296 - 1)
            for i in range(args.num_embeddings)
        ]

    # Parse graph data
    graph_name, graph = parse_graph(args.dataset, args.largest_cc)
    graph = graph.to_directed()

    # Compute embeddings
    model = HOPE(d=args.dimensions, beta=args.beta)
    print("Num nodes: %d, num edges: %d" %
          (graph.number_of_nodes(), graph.number_of_edges()))

    times = []
    for i in range(args.num_embeddings):
        t1 = time()
        # Set the seed before learning
        np.random.seed(args.seed[i])
        random.seed(args.seed[i])
        Y, t = model.learn_embedding(graph=graph,
                                     edge_f=None,
                                     is_weighted=True,
                                     no_python=True)
        times.append(time() - t1)

        # Save embedding
        file_path = f"{args.outputdir}/hope_{graph_name}_{i:03d}.emb"
        print(f"Saving embedding to {file_path}")
        save_embedding(
            Y, file_path, {
                "algorithm": "hope",
                "dimension": args.dimensions,
                "beta": args.beta,
                "seed": args.seed[i],
            })

    print(model._method_name +
          "\n\tAverage training time: %f" % (sum(times) / len(times)))
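

# Hedged sketch (not part of the pipeline above): roughly what HOPE's
# learn_embedding computes internally, assuming the Katz proximity variant.
# It factorizes S = (I - beta*A)^{-1} * (beta*A) with an SVD and concatenates
# source and target embeddings. It uses a dense SVD, so it is only meant as
# an illustration for small graphs.
def hope_katz_sketch(graph, d=128, beta=0.01):
    import networkx as nx

    A = nx.to_numpy_array(graph)                          # adjacency matrix
    n = A.shape[0]
    S = np.linalg.inv(np.eye(n) - beta * A) @ (beta * A)  # Katz proximity
    U, sigma, Vt = np.linalg.svd(S)
    half = d // 2
    source = U[:, :half] * np.sqrt(sigma[:half])          # source embedding
    target = Vt[:half, :].T * np.sqrt(sigma[:half])       # target embedding
    return np.concatenate((source, target), axis=1)       # n x d matrix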
"random_forest" ], choices=[ "adaboost", "decision_tree", "neural_network", "random_forest" ]) arg_parser.add_argument( "-n", default=10, type=int, help= "How often the classifier should be trained. The performances of these different runs are then compared." ) args = arg_parser.parse_args() _, graph = parse_graph(args.dataset) node_labels, _ = prepare_classification(graph) del graph # free memory for embedding_file in args.embeddings: embedding_name = embedding_file.name.split("/")[-1].split(".")[0] print("Start embedding", embedding_name) embedding = read_embedding(embedding_file) node_labels_train, node_labels_test, embedding_train, embedding_test = train_test_split( node_labels, embedding) for clf in args.classifiers: print("Start classifier", clf) # We train n classifiers on the same embedding and always use the same
],
                        default=[
                            "adaboost", "decision_tree", "neural_network",
                            "random_forest"
                        ])
arg_parser.add_argument("-splits", default=10, type=int)
arg_parser.add_argument("-repeats", default=10, type=int)
arg_parser.add_argument(
    "-processes",
    type=int,
    help="Number of processes to use in parallel, defaults to number of cpus.")
args = arg_parser.parse_args()

graph_name, graph = parse_graph(args.dataset)
node_labels, distinct_node_labels = prepare_classification(graph)
del graph  # free memory

files = {}
for filename in args.embeddings:
    with open(filename, "r") as file:
        node_embeddings = read_embedding(file)
    files[filename] = {
        "embedding_name": os.path.basename(filename).split(".")[0],
        "node_embeddings": node_embeddings,
        "node_labels": node_labels,
        "distinct_node_labels": distinct_node_labels
    }
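

# Hedged sketch of how the -splits, -repeats and -processes options could be
# combined, assuming numpy arrays, scikit-learn's RepeatedStratifiedKFold and
# a multiprocessing pool; the worker function `evaluate_fold` is hypothetical.
def parallel_cross_validation_sketch(node_embeddings, node_labels, splits,
                                     repeats, processes, evaluate_fold):
    from multiprocessing import Pool
    from sklearn.model_selection import RepeatedStratifiedKFold

    rskf = RepeatedStratifiedKFold(n_splits=splits, n_repeats=repeats)
    fold_args = [(node_embeddings[train], node_labels[train],
                  node_embeddings[test], node_labels[test])
                 for train, test in rskf.split(node_embeddings, node_labels)]
    # processes=None falls back to the number of cpus
    with Pool(processes) as pool:
        return pool.starmap(evaluate_fold, fold_args)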
arg_parser.add_argument("-q", type=float, default=1.0) args = arg_parser.parse_args() if (args.seed is not None): assert "PYTHONHASHSEED" in os.environ, ("Execution is only deterministic if (next to the -seed parameter) the environmental variable PYTHONHASHSEED is set! Either remove -seed or set PYTHONHASHSEED") print(f"Setting seed to {args.seed}") # Set inital seeds random.seed(args.seed) np.random.seed(args.seed) # Get a list of seeds for 'num_embeddings' many embeddings args.seed = [np.random.randint(4294967296 - 1) for i in range(args.num_embeddings)] for dataset in args.dataset: graph_name, graph = parse_graph(dataset, args.largest_cc) node2vec_graph = Graph(graph, args.p, args.q, not args.no_alias) for i in range(args.num_embeddings): print("Generating embedding", i) embedding = generate(node2vec_graph, args.dimensions, args.walk_length, args.walk_number, seed=args.seed[i]) if args.output and args.num_embeddings == 1: file_name = args.output else: file_name = os.path.dirname(os.path.abspath(__file__)) + ("/results/node2vec_{0}_{1}.emb").format(graph_name, i) print("Save embedding at", file_name) save_embedding(embedding, file_name, { "algorithm": "node2vec", "walk-length": args.walk_length, "walk-number": args.walk_number,
def main():
    # Load arguments
    args = parse_args()

    # Parse graph data
    print(f"Parsing graph dataset from {args.dataset}")
    graph_name, graph = parse_graph(args.dataset, args.largest_cc)
    print(f"{graph_name} has {graph.number_of_nodes()} nodes "
          f"and {graph.number_of_edges()} edges")

    # Create an undirected copy of the graph for the first-order embedding.
    # Directed graphs will change whereas an undirected graph stays the same.
    if type(graph) == nx.Graph:
        print("Graph is undirected")
        graph = graph.to_directed()
        undirected_graph = graph
    else:
        print("Graph is directed")
        undirected_graph = graph.to_undirected()
        undirected_graph = undirected_graph.to_directed()
    node_count = len(graph)

    # Densify graph
    if args.threshold is None:
        # Calculate the threshold from graph characteristics
        print("Calculating threshold...")
        avg_deg = 2 * graph.number_of_edges() / graph.number_of_nodes()
        args.threshold = max(0, int(avg_deg))
    print("nodes", node_count)
    print("threshold", args.threshold)

    if os.path.isdir(args.dataset):
        path = args.dataset
    else:
        path = os.path.split(args.dataset)[0]

    if args.densify:
        print("Densifying graph...")
        nx_to_weighted_edgelist(graph, f"{path}/{graph_name}.Edgelist")
        subprocess.call(
            f"{args.executables}/reconstruct -train {path}/{graph_name}.Edgelist "
            f"-output {path}/{graph_name}.denseEdgelist -depth 2 -threshold {args.threshold}",
            shell=True)
        nx_to_weighted_edgelist(undirected_graph,
                                f"{path}/{graph_name}_undir.Edgelist")
        subprocess.call(
            f"{args.executables}/reconstruct -train {path}/{graph_name}_undir.Edgelist "
            f"-output {path}/{graph_name}_undir.denseEdgelist -depth 2 -threshold {args.threshold}",
            shell=True)

    # Handle seed
    if args.seed is None:
        # srand(0) --> random seed for every iteration
        args.seed = [0 for i in range(args.num_embeddings)]
    else:
        np.random.seed(args.seed)
        args.seed = [
            np.random.randint(4294967296 - 1)
            for i in range(args.num_embeddings)
        ]

    # Learn embeddings
    if args.samples is None:
        args.samples = max(node_count // 1000, 1)

    timings = []
    for i in range(args.num_embeddings):
        t1 = time()

        # 1st order
        subprocess.call(
            f"{args.executables}/line -train {path}/{graph_name}_undir.denseEdgelist "
            f"-output {args.outputdir}/line_1st_order_{graph_name}_{i:03d} -binary 1 -size {args.dimensions//2} "
            f"-order 1 -negative {args.negative_samples} -samples {args.samples} -threads {args.num_threads} "
            f"-srand {args.seed[i]}",
            shell=True)
        file_path_1 = f"{args.outputdir}/line_1st_order_norm_{graph_name}_{i:03d}.emb"
        subprocess.call(
            f"{args.executables}/normalize -input {args.outputdir}/line_1st_order_{graph_name}_{i:03d} "
            f"-output {file_path_1} -binary 0",
            shell=True)

        # 2nd order
        subprocess.call(
            f"{args.executables}/line -train {path}/{graph_name}.denseEdgelist "
            f"-output {args.outputdir}/line_2nd_order_{graph_name}_{i:03d} -binary 1 -size {args.dimensions//2} "
            f"-order 2 -negative {args.negative_samples} -samples {args.samples} -threads {args.num_threads} "
            f"-srand {args.seed[i]}",
            shell=True)
        file_path_2 = f"{args.outputdir}/line_2nd_order_norm_{graph_name}_{i:03d}.emb"
        subprocess.call(
            f"{args.executables}/normalize -input {args.outputdir}/line_2nd_order_{graph_name}_{i:03d} "
            f"-output {file_path_2} -binary 0",
            shell=True)

        # Add meta information
        param_lines = create_param_lines({
            "node_count": node_count,
            "embedding_dimension": f"{args.dimensions//2}"
        })
        prepend_param_lines(file_path_1, param_lines)
        prepend_param_lines(file_path_2, param_lines)

        # Concatenate embeddings
        print("Concatenating embeddings...")
        with open(file_path_1, "r") as f:
            first_order = read_embedding(f)
        with open(file_path_2, "r") as f:
            second_order = read_embedding(f)
        concat = np.concatenate((first_order, second_order), axis=1)
        save_embedding(
            concat, f"{args.outputdir}/line_{graph_name}_{i:03d}.emb", {
                "algorithm": "line",
                "order": "first+second",
                "negative_samples": args.negative_samples,
                "samples": args.samples,
                "threshold": args.threshold,
                "densify": args.densify,
                "node_count": node_count,
                "embedding_dimension": args.dimensions,
                "threads": args.num_threads,
                "seed": args.seed[i],
                "comment": "1st-order-and-2nd-order-get-the-same-dimensionality"
            })
        timings.append(time() - t1)

        # Clean up intermediate files
        subprocess.call(
            f"rm {args.outputdir}/line_1st_order_{graph_name}_{i:03d}",
            shell=True)
        subprocess.call(
            f"rm {args.outputdir}/line_2nd_order_{graph_name}_{i:03d}",
            shell=True)
        subprocess.call(
            f"rm {args.outputdir}/line_1st_order_norm_{graph_name}_{i:03d}.emb",
            shell=True)
        subprocess.call(
            f"rm {args.outputdir}/line_2nd_order_norm_{graph_name}_{i:03d}.emb",
            shell=True)

    # Clean up
    subprocess.call(f"rm {path}/{graph_name}.Edgelist", shell=True)
    subprocess.call(f"rm {path}/{graph_name}_undir.Edgelist", shell=True)

    print(f"\nDone!\nAverage training time: {sum(timings)/len(timings)}")
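

# Hedged sketch of the nx_to_weighted_edgelist helper used above, assuming the
# LINE/reconstruct binaries expect whitespace-separated "src dst weight" lines
# with a default weight of 1; the actual helper in this repo may differ.
def nx_to_weighted_edgelist_sketch(graph, path):
    with open(path, "w") as f:
        for u, v, data in graph.edges(data=True):
            f.write(f"{u} {v} {data.get('weight', 1)}\n")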
import argparse
import random
from pickle import load

import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument("graph", type=str)
parser.add_argument("--use-already-shrunken-graph", action="store_true")
parser.add_argument("--relabel-nodes", action="store_true")
args = parser.parse_args()

# Just to have a fixed seed
np.random.seed(0)
random.seed(0)

graph_name = args.graph
_, graph = parse_graph(f'./graphs/{graph_name}/', largest_cc=True)

if args.relabel_nodes:
    mapping = dict(zip(graph.nodes(), range(graph.number_of_nodes())))

with open(f'./graphs/{graph_name}/{graph_name}_reduced_edges.pickle',
          'rb') as file:
    true_edges = load(file)
with open(f'./graphs/{graph_name}/{graph_name}_reduced_nonedges.pickle',
          'rb') as file:
    false_edges = load(file)
with open(
        f'./graphs/{graph_name}/{graph_name}_reduced_nonedges_train.pickle',
        'rb') as file:
    false_edges_train = load(file)
with open(f'./graphs/{graph_name}/{graph_name}_reduced_edges_train.pickle',
          'rb') as file:
def neighbor_variance(graphdir,
                      k,
                      embedding_dir,
                      embedding_list,
                      dataset,
                      nodes_dict=None,
                      seed=None):
    """Compute similarity from nodes to specific kinds of neighbors: 1-hop
    neighbors, 2-hop neighbors and distant nodes (more than 2 hops away).

    The nodes are sampled from the graph. Since some nodes may not have all
    required kinds of neighbors, the actual size of the experiment might be
    lower than expected.

    Args:
        graphdir: str, path to graph file (e.g. edgelist)
        k: int, number of nodes for which the similarities will be computed
        embedding_dir: str, path to directory of saved embeddings
        embedding_list: list, list of all embedding names that will be considered
        dataset: str, graph identifier
        nodes_dict: dict (optional), a dict of 3-tuples or lists of node ids.
            The tuples are ordered as 1-hop, 2-hop, distant. The keys are the
            origin nodes corresponding to the tuples. If given, no nodes will
            be sampled.
        seed: int, sets the random seed for numpy

    Returns:
        agg_results: numpy array of size k x 3. Every row corresponds to one
            sampled node and its mean similarities to its pair nodes.
        DataFrame: pandas DataFrame that holds information about the dataset,
            algorithm, node and similarity measure to a neighbor node.
            (neighbor_type: 0 -> 1-hop neighbor, 1 -> 2-hop neighbor,
            2 -> distant node)
        di: dict with sampled nodes as keys and corresponding lists of 1-hop
            neighbor, 2-hop neighbor and distant node. An entry is None if no
            node with the required characteristics could be found.
    """
    if seed is not None:
        np.random.seed(seed)

    if nodes_dict is None:
        graph_name, graph = parse_graph(graphdir)

        # Sample k nodes if not specified by arguments
        vertices = np.array(list(graph))
        np.random.shuffle(vertices)
        nodes = vertices[:min(k, graph.number_of_nodes())].copy()

        # Find 1-hop, 2-hop and distant nodes for every sampled node
        di = defaultdict(list)
        for node in nodes:
            # 1-hop: sample a neighbor that is not the node itself
            one_neighbors = list(graph.neighbors(node))
            if node in one_neighbors:
                one_neighbors.remove(node)
            # If there is no neighbor, just skip the node
            if not one_neighbors:
                di[node].extend([None, None, None])
                continue
            else:
                di[node].append(np.random.choice(one_neighbors))

            # 2-hop: sample a neighbor; from there another node that has no
            # edge to the origin
            one_neighbor = np.random.choice(one_neighbors)
            one_neighbors.append(node)
            two_n = None
            for two_n_candidate in graph.neighbors(one_neighbor):
                if two_n_candidate not in one_neighbors:
                    two_n = two_n_candidate
                    break
            di[node].append(two_n)

            # Distant node: sample a random node, compute the shortest path
            # and accept the sample if its length is more than 2
            np.random.shuffle(vertices)
            distant_node = None
            for distant_node_candidate in vertices:
                try:
                    if nx.shortest_path_length(graph, distant_node_candidate,
                                               node) > 2:
                        distant_node = distant_node_candidate
                        break
                except nx.NetworkXNoPath:
                    distant_node = distant_node_candidate
                    break
            di[node].append(distant_node)
            assert len(di[node]) == 3, f"{node}: {di[node]}"
    else:
        di = nodes_dict
        nodes = np.asarray(list(nodes_dict.keys()))

    # Aggregate all nodes for comparison
    cossim_nodes = []
    for origin_node, node_list in di.items():
        cossim_nodes.append(origin_node)
        cossim_nodes.extend(node_list)
    cossim_nodes = sorted(
        set(node for node in cossim_nodes if node is not None))
    array_map = {node: i for i, node in enumerate(cossim_nodes)}

    # Start Comparison.cosine_similarity with selected nodes
    comparison = Comparison(embedding_dir, embedding_list)
    comp_result = comparison.cosine_similarity(cossim_nodes)["sims"]

    # Analyse the result.
    # Create a matrix with columns 1-hop, 2-hop, distant neighbor;
    # rows are nodes from the sample; every cell holds the mean similarity
    agg_results = np.empty((len(nodes), 3))

    # Construct a DataFrame that holds the detailed results.
    # It has columns: node, neighbor_type, similarity, algorithm, dataset
    df_dict = defaultdict(list)
    algorithm = embedding_list[0].split("_")[0]
    for index, node in enumerate(nodes):
        nr = {0: [], 1: [], 2: []}
        for i, v in enumerate(di[node]):
            if v is None:
                nr[i] = [np.nan]
                df_dict["node"].append(node)
                df_dict["neighbor_type"].append(i)
                df_dict["algorithm"].append(algorithm)
                df_dict["dataset"].append(dataset)
                df_dict["similarity"].append(np.nan)
                continue
            for arr in comp_result.values():
                nr[i].append(arr[array_map[node], array_map[v]])
                df_dict["node"].append(node)
                df_dict["neighbor_type"].append(i)
                df_dict["algorithm"].append(algorithm)
                df_dict["dataset"].append(dataset)
                df_dict["similarity"].append(arr[array_map[node],
                                                 array_map[v]])
        agg_results[index, :] = [
            np.mean(nr[0]), np.mean(nr[1]), np.mean(nr[2])
        ]

    # Remove rows with NaNs
    agg_results = agg_results[~np.isnan(agg_results).any(axis=1)]
    print(f"Removed {len(nodes) - len(agg_results)} nodes from the sample "
          f"because they did not have all required kinds of neighbors.")

    return agg_results, pd.DataFrame(df_dict), di
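

# Hedged sketch of the per-embedding cosine-similarity matrix that
# Comparison.cosine_similarity is assumed to return for the selected nodes,
# using scikit-learn and assuming node ids index rows of the embedding matrix;
# the actual Comparison class may differ.
def cosine_similarity_sketch(embedding, selected_nodes):
    from sklearn.metrics.pairwise import cosine_similarity

    rows = embedding[np.asarray(selected_nodes)]  # one row per selected node
    return cosine_similarity(rows)  # len(selected_nodes) x len(selected_nodes)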