# Imports: the utils module is used exactly as in the other example scripts; the
# wl_corpus and InMemoryTrainer paths are assumed from the geometric2dr package layout.
import geometric2dr.embedding_methods.utils as utils
from geometric2dr.decomposition.weisfeiler_lehman_patterns import wl_corpus
from geometric2dr.embedding_methods.pvdbow_trainer import InMemoryTrainer

# Input data paths
dataset = "MUTAG"
corpus_data_dir = "data/" + dataset

# Desired output paths
output_embedding_fh = "Graph2Vec_Embeddings.json"

# Hyperparameters
wl_depth = 2
min_count_patterns = 0  # min number of occurrences for a subgraph pattern to enter the vocabulary

#######
# Step 1: Create the corpus data for the neural language model.
# We keep the generated files on disk for the sake of deeper post-hoc studies and testing.
#######
graph_files = utils.get_files(corpus_data_dir, ".gexf", max_files=0)
wl_corpus(graph_files, wl_depth)
extension = ".wld" + str(wl_depth)  # Extension of the per-graph document files

######
# Step 2: Train a neural language model to learn distributed representations
# of the graphs directly, or of their substructures. Here we learn them directly;
# for an example of the latter, check out the DGK models.
######
# Instantiate a PV-DBOW trainer to learn distributed representations directly.
trainer = InMemoryTrainer(corpus_dir=corpus_data_dir, extension=extension,
                          max_files=0, output_fh=output_embedding_fh,
                          emb_dimension=32, batch_size=128,
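                          # The remaining keyword arguments and the training call below are a
                          # hedged sketch: epochs, initial_lr, min_count, and the train()
                          # method are assumptions about the PV-DBOW trainer API, not taken
                          # from the original fragment.
                          epochs=100, initial_lr=0.001,
                          min_count=min_count_patterns)
trainer.train()  # run PV-DBOW training; embeddings are expected to be written to output_embedding_fh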
import os
import pandas
import geometric2dr.embedding_methods.utils as utils

# Setup parameters
perf_folder = "DGK_WL_Performance_MUTAG"
dgk, mode, performance, dataset = perf_folder.strip().split("_")

if mode == "GK":
    # Dataframe setup
    data = []
    header = [
        "dataset", "num_graphlet", "sample_size", "emb_dimension",
        "batch_size", "epochs", "run", "accuracy", "std"
    ]

    perf_files = utils.get_files(perf_folder, "", max_files=0)
    for perf_file in perf_files:
        perf_file_basename = os.path.basename(perf_file)
        dataset, num_graphlet, sample_size, emb_dimension, batch_size, epochs, run = perf_file_basename.strip().split("_")
        print(perf_file_basename.strip().split("_"))

        # Read the mean accuracy and the standard deviation recorded in the file
        with open(perf_file, "r") as fh:
            lines = fh.readlines()
            for line in lines:
                mean_acc, std = line.strip().split(",")
                mean_acc = float(mean_acc)
                std = float(std)
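                # Hedged sketch of how the script likely finishes: the row order
                # follows `header`, and aggregating the runs into a pandas DataFrame
                # is an assumption, not taken from the original fragment.
                data.append([dataset, num_graphlet, sample_size, emb_dimension,
                             batch_size, epochs, run, mean_acc, std])

    # Collect every parsed run into one table for further analysis
    df = pandas.DataFrame(data, columns=header)
    print(df)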
import os
import numpy as np
import networkx as nx
import geometric2dr.embedding_methods.utils as utils


def load_graph(file_handle):
    """Load a .gexf graph and return it together with its adjacency matrix."""
    graph = nx.read_gexf(file_handle)
    adj_matrix = nx.to_numpy_matrix(graph)  # removed in networkx >= 3.0, where to_numpy_array is the replacement
    return graph, adj_matrix


dataset = "MUTAG"
path_to_gexf_data = "data/"
graph_class_labels_fh = path_to_gexf_data + dataset + ".Labels"
dataset_path = path_to_gexf_data + dataset

# Yanardag-style dataset
data = {}
labels = []
graph_files = {}

graph_files = utils.get_files(dataset_path, extension=".gexf", max_files=0)
label_tuples = utils.get_class_labels_tuples(graph_files, graph_class_labels_fh)
graph_classes = np.array([y for z, y in sorted(label_tuples, key=lambda x: x[0])])
data['labels'] = graph_classes

gf = graph_files[0]
graph_data = {}
for gf in graph_files:
    # Graph files are named by 1-based index (e.g. "1.gexf"); convert to a 0-based index
    gindex = int(os.path.basename(gf).split(".")[0]) - 1
    nx_graph, adj_matrix = load_graph(gf)
    graph_data[gindex] = {}
    for node_string in nx_graph.nodes():
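        # Hedged sketch of the remainder of the loop: the "Label" node attribute
        # name and the per-node dictionary layout are assumptions about the .gexf
        # files and the Yanardag-style format, not taken from the original fragment.
        node_attrs = nx_graph.nodes[node_string]
        graph_data[gindex][node_string] = {
            "label": node_attrs.get("Label", node_string),
            "neighbors": list(nx_graph.neighbors(node_string)),
        }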
corpus_data_dir = "data/" + dataset class_labels_fh = data_path + dataset + ".Labels" if method == "graph2vec": embeddings_folder = "Graph2vec_Embeddings_" + dataset csv_fh = method + "_" + dataset + "_results.csv" csv_fh_avg = method + "_" + dataset + "fullCV_results.csv" # Dataframe setup data = [] header = [ "dataset", "wl_depth", "embedding_dimension", "batch_size", "epochs", "initial_lr", "run", "accuracy", "std" ] # if method == "graph2vec" embedding_files = utils.get_files(embeddings_folder, "", max_files=0) for embedding_file in embedding_files: embedding_file_basename = os.path.basename(embedding_file) dataset, wl_depth, embedding_dimension, batch_size, epochs, initial_lr, run = embedding_file_basename.split( "_") print(embedding_file_basename.strip().split("_")) extension = ".wld" + wl_depth classify_scores = cross_val_accuracy( corpus_dir=corpus_data_dir, extension=extension, embedding_fname=embedding_file, class_labels_fname=class_labels_fh) mean_acc, std_dev = classify_scores print("Mean accuracy using 10 cross fold accuracy: %s with std %s" %