def save_tr_graph(self, output_path, delimiter, write_stats=False, write_weights=False, write_dir=True):
    """
    Writes the graph stored in `self._TG` to a file by delegating to `pp.save_graph`.

    Parameters
    ----------
    output_path : file or string
        File or filename to write. If a file is provided, it must be opened in 'wb' mode.
    delimiter : string
        The string used to separate values. This argument is required (it has no default).
    write_stats : bool, optional
        Sets if graph statistics should be added to the edgelist or not. Default is False.
    write_weights : bool, optional
        If True data will be stored as weighted edgelist (e.g. triplets src, dst, weight) otherwise as normal
        edgelist. If the graph edges have no weight attribute and this parameter is set to True, a weight of 1
        will be assigned to each edge. Default is False.
    write_dir : bool, optional
        This option is only relevant for undirected graphs. If False, the train graph will be stored with a
        single direction of the edges. If True, both directions of edges will be stored. Default is True.
    """
    # Gather the writer options once and forward them untouched to the shared save routine.
    save_options = {
        'output_path': output_path,
        'delimiter': delimiter,
        'write_stats': write_stats,
        'write_weights': write_weights,
        'write_dir': write_dir,
    }
    pp.save_graph(self._TG, **save_options)
def main():
    """
    Preprocesses the Facebook and Webspam graphs whose paths are given on the command line.

    Exactly two command line arguments are expected: the Facebook dataset path and the Webspam dataset path.
    Each graph is preprocessed (`prep_fb` / `prep_ws`) and stored, with statistics and ',' as delimiter, in a
    file named 'prep_graph_slfloops.edgelist' placed in the same directory as the corresponding input file.
    If the number of arguments is wrong, a usage message is printed and the script exits with code -1.
    """
    # Check cmd args
    if len(argv) != 3:
        print("ERROR: wrong number of parameters")
        print("Usage: prep_data_prune.py <facebook_path> <webspam_path>")
        exit(-1)

    out_name = "prep_graph_slfloops.edgelist"

    # Only the directory part of each input is needed. os.path.join is used instead of string concatenation
    # so that an input given without a directory component (e.g. "facebook.txt") writes next to it in the
    # current directory rather than to "/prep_graph_slfloops.edgelist" at the filesystem root.
    fb_dir = os.path.dirname(argv[1])
    ws_dir = os.path.dirname(argv[2])

    # Preprocess FB graph
    G1 = prep_fb(argv[1])

    # Store FB graph to a file
    pp.save_graph(G1, output_path=os.path.join(fb_dir, out_name), delimiter=',', write_stats=True)

    # Preprocess WS graph
    G2 = prep_ws(argv[2])

    # Store preprocessed graph to a file
    pp.save_graph(G2, output_path=os.path.join(ws_dir, out_name), delimiter=',', write_stats=True)

    print("Preprocessing finished.")
def preprocess(setup, nw_outpath, i):
    """
    Loads the i-th input network described by `setup`, preprocesses it and optionally stores the result.

    Parameters
    ----------
    setup : object
        Configuration object. The attributes read here are: task, inpaths, separators, comments, directed,
        split_alg, relabel, del_selfloops, save_prep_nw, delimiter and write_stats.
    nw_outpath : string
        Directory where 'prep_nw.edgelist' is written when `setup.save_prep_nw` is truthy.
    i : int
        Index used to select the entry of `setup.inpaths`, `setup.separators` and `setup.comments`.

    Returns
    -------
    G, ids
        The two values returned by `pp.prep_graph`.
    """
    print('Preprocessing graph...')

    # The 'sp' task loads the graph with datatype=int, every other task with datatype=float
    load_dtype = int if setup.task == 'sp' else float
    G = pp.load_graph(setup.inpaths[i], delimiter=setup.separators[i], comments=setup.comments[i],
                      directed=setup.directed, datatype=load_dtype)

    # For 'lp' with the 'random' split algorithm maincc=False is passed explicitly; in every other
    # case the argument is omitted so that prep_graph applies its own default.
    pass_maincc_false = setup.task == 'lp' and setup.split_alg == 'random'
    prep_args = {'relabel': setup.relabel, 'del_self_loops': setup.del_selfloops}
    if pass_maincc_false:
        prep_args['maincc'] = False
    G, ids = pp.prep_graph(G, **prep_args)

    # Save preprocessed graph to a file
    if setup.save_prep_nw:
        prep_file = os.path.join(nw_outpath, 'prep_nw.edgelist')
        pp.save_graph(G, output_path=prep_file, delimiter=setup.delimiter,
                      write_stats=setup.write_stats, write_weights=False, write_dir=True)

    return G, ids
def preprocess(inpath, outpath, delimiter, directed, relabel, del_self_loops):
    """
    Loads a graph from `inpath`, preprocesses it, stores it and returns the preprocessed graph.

    Parameters
    ----------
    inpath : string
        Path of the input edgelist. Lines starting with '#' are treated as comments when loading.
    outpath : string
        Prefix to which 'prep_graph.edgelist' is appended to obtain the output file name.
    delimiter : string
        Delimiter of the input file. The output file is always written with ' ' as delimiter.
    directed : bool
        Forwarded to `pp.load_graph`.
    relabel : bool
        Forwarded to `pp.prep_graph`.
    del_self_loops : bool
        Forwarded to `pp.prep_graph`.

    Returns
    -------
    The preprocessed graph (first value returned by `pp.prep_graph`).
    """
    print('Preprocessing graph...')

    graph = pp.load_graph(inpath, delimiter=delimiter, comments='#', directed=directed)

    # The second value returned by prep_graph is not needed here
    graph, _ = pp.prep_graph(graph, relabel=relabel, del_self_loops=del_self_loops)

    # NOTE: plain string concatenation, no path separator is inserted between outpath and the file name
    target_file = outpath + "prep_graph.edgelist"
    pp.save_graph(graph, output_path=target_file, delimiter=' ', write_stats=False)

    return graph
def test():
    """
    Round-trip check of the load / save / preprocess helpers on './data/network.edgelist'.

    The graph is loaded as directed, stored and re-loaded, then preprocessed (relabeled, self loops kept).
    Statistics are printed after each stage, the preprocessed graph is stored, the first 10 entries of the
    ids returned by `pp.prep_graph` are printed and `pp.get_redges_false` is run with a csv output path.
    """
    data_dir = "./data/"
    out_dir = "./data/"
    network_file = "network.edgelist"
    rule = "-----------------------------------------"

    def show_stats(title, graph):
        # Heading, separator line and then the statistics of the given graph
        print(title)
        print(rule)
        pp.get_stats(graph)

    orig_file = out_dir + "orig_graph.edgelist"

    # Load the graph and report its stats
    G = pp.load_graph(data_dir + network_file, delimiter=',', comments='#', directed=True)
    print("")
    show_stats("Original graph stats:", G)

    # Store the graph and read it back to compare the stats
    pp.save_graph(G, orig_file, delimiter=",")
    G2 = pp.load_graph(orig_file, delimiter=",", comments='#', directed=True)
    show_stats("Has the same stats after being loaded?:", G2)

    # Preprocess the re-loaded graph
    GP, ids = pp.prep_graph(G2, del_self_loops=False, relabel=True)
    show_stats("Preprocessed graph stats (restricted to main cc):", GP)
    pp.save_graph(GP, out_dir + "prep_graph.edgelist", delimiter=",")

    print("Sample of 10 (oldNodeID, newNodeID):")
    print(rule)
    print(ids[0:10])

    pp.get_redges_false(GP, out_dir + "redges_false.csv")
def save_tr_graph(self, output_path, delimiter, write_stats=False, write_weights=False, write_dir=True):
    """
    Saves the TG graph (`self._TG`) to a file.

    Parameters
    ----------
    output_path : file or string
        File or filename to write. If a file is provided, it must be opened in 'wb' mode.
    delimiter : string
        The string used to separate values. This argument is required (it has no default).
    write_stats : bool, optional
        Adds basic graph statistics to the file as a header or not. Default is False.
    write_weights : bool, optional
        If True data will be stored as weighted edgelist i.e. triplets (src, dst, weight), otherwise, as regular
        (src, dst) pairs. For unweighted graphs, setting this parameter to True will add weight 1 to all edges.
        Default is False.
    write_dir : bool, optional
        This parameter is only relevant for undirected graphs. If True, it forces the method to write both edge
        directions in the file i.e. (src, dst) and (dst, src). If False, only one direction is stored.
        Default is True.

    See also
    --------
    evalne.utils.preprocess.save_graph
    """
    # All options are passed through unchanged; the actual writing is done by pp.save_graph.
    writer_kwargs = dict(output_path=output_path, delimiter=delimiter, write_stats=write_stats,
                         write_weights=write_weights, write_dir=write_dir)
    pp.save_graph(self._TG, **writer_kwargs)
def _report_train_graph(tag, train_edges, test_edges, directed):
    """Builds the train graph from `train_edges` and prints its component, edge and node counts for `tag`."""
    if directed:
        TG = nx.DiGraph()
        TG.add_edges_from(train_edges)
        print("Number of weakly CCs with {}: {}".format(tag, nx.number_weakly_connected_components(TG)))
    else:
        TG = nx.Graph()
        TG.add_edges_from(train_edges)
        print("Number of CCs with {}: {}".format(tag, nx.number_connected_components(TG)))
    print("Number train edges {}: {}".format(tag, len(train_edges)))
    print("Number test edges {}: {}".format(tag, len(test_edges)))
    print("Number of nodes in train graph: {}".format(len(TG.nodes)))


def test_split():
    """
    Times and sanity-checks the two train/test split routines on a random sub-graph.

    Loads './data/network.edgelist' as a directed graph, restricts it to a random sub-graph of at most
    400 nodes, and then:

    1. preprocesses it with maincc=True, stores it to './data/prep_graph.edgelist', runs
       `stt.split_train_test` and prints timing plus component / edge / node counts of the train graph;
    2. preprocesses the same sub-graph with maincc=False, runs `stt.rand_split_train_test` and prints
       the same information.
    """
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 400
    train_frac = 0.5
    directed = True

    # Load a graph
    G = pp.load_graph(dataset_path + test_name, delimiter=",", comments='#', directed=directed)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes.
    # random.sample requires a sequence (node views / sets are rejected since Python 3.11), so the nodes
    # are materialized first. The sample size is clamped so graphs smaller than 'subgraph_size' still work.
    nodes = list(G.nodes)
    SG = G.subgraph(random.sample(nodes, min(subgraph_size, len(nodes)))).copy()

    # Preprocess the graph
    PSG, _ = pp.prep_graph(SG, relabel=True, del_self_loops=True, maincc=True)

    # Save the preprocessed graph
    pp.save_graph(PSG, output_path + "prep_graph.edgelist", delimiter=",")

    # Compute train/test splits
    start = time.time()
    train_stt, test_stt = stt.split_train_test(PSG, train_frac=train_frac)
    elapsed = time.time() - start
    print("Exec time stt: {}".format(elapsed))

    # Check that the train graph generated with stt has one single cc
    _report_train_graph("stt", train_stt, test_stt, directed)

    # Preprocess the graph again, this time without restricting it to the main cc
    PSG, _ = pp.prep_graph(SG, relabel=True, del_self_loops=True, maincc=False)

    # Compute train/test splits
    start = time.time()
    train_rstt, test_rstt = stt.rand_split_train_test(PSG, train_frac=train_frac)
    elapsed = time.time() - start
    print("\nExec time rand_stt: {}".format(elapsed))

    # Report the number of ccs of the train graph generated with rstt
    _report_train_graph("rstt", train_rstt, test_rstt, directed)
# Print the graph statistics, then write them to a file as well
pp.get_stats(G)
pp.get_stats(G, os.path.join(output_path, "stats.txt"))

# Preprocess the graph
SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True)

# Get non-edges so that the reversed edge exists in the graph
if directed:
    redges = pp.get_redges_false(SG, output_path=os.path.join(output_path, "redges.csv"))

# Store the graph to a file
pp.save_graph(SG, output_path=os.path.join(output_path, "network_prep.edgelist"), delimiter=',',
              write_stats=True)

# ----------------
# Split train test
# ----------------

# Directory shared by the split computation and the overlap checks below
_splits_dir = os.path.join(traintest_path, "network_prep_51")

# Compute train/test splits and false edges in parallel
stt.compute_splits_parallel(SG, _splits_dir, owa=True, train_frac=0.51, num_fe_train=None,
                            num_fe_test=None, num_splits=5)

# The overlap between the 5 generated sets can be easily checked
print("Overlap check for train sets: ")
stt.check_overlap(filename=os.path.join(_splits_dir, "trE"), num_sets=5)
print("Overlap check for test sets: ")
stt.check_overlap(filename=os.path.join(_splits_dir, "teE"), num_sets=5)