def main():
    """
    Preprocess the Facebook and WebSpam graphs given on the command line.

    Expects exactly two command-line arguments: the path of the Facebook dataset and the
    path of the WebSpam dataset. Each dataset is preprocessed by its dedicated routine
    (`prep_fb` / `prep_ws`) and the resulting graph is written, with `write_stats` enabled,
    to 'prep_graph_slfloops.edgelist' in the directory that contains the input dataset.

    If the number of arguments is wrong, a usage message is printed and the process exits
    with status -1.
    """
    # Check cmd args
    if len(argv) != 3:
        print("ERROR: wrong number of parameters")
        print("Usage: prep_data_prune.py <facebook_path> <webspam_path>")
        exit(-1)

    out_name = "prep_graph_slfloops.edgelist"

    # Directory holding each dataset. For a bare filename this is '' and os.path.join then
    # yields a path relative to the current directory; concatenating with "/" instead would
    # silently point at the filesystem root.
    fb_dir = os.path.dirname(argv[1])
    ws_dir = os.path.dirname(argv[2])

    # Preprocess FB graph and store it next to the input file
    G1 = prep_fb(argv[1])
    pp.save_graph(G1, output_path=os.path.join(fb_dir, out_name), delimiter=',', write_stats=True)

    # Preprocess WS graph and store it next to the input file
    G2 = prep_ws(argv[2])
    pp.save_graph(G2, output_path=os.path.join(ws_dir, out_name), delimiter=',', write_stats=True)

    print("Preprocessing finished.")
def test_split():
    """
    Exercise the train/test split utilities on a random subgraph.

    Loads './data/network.edgelist' as a directed, tab-separated edgelist, restricts it to a
    random subgraph of at most `subgraph_size` nodes, preprocesses that subgraph, stores it
    to './data/prep_graph.edgelist', computes a single train/test edge split, prints the
    train edges and finally generates the corresponding sets of false edges.
    """
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 1000

    # Load a graph
    G = pp.load_graph(dataset_path + test_name, delimiter="\t", comments='#', directed=True)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes.
    # random.sample() only accepts sequences from Python 3.11 on, so the nodes are
    # materialised as a list first (G.nodes is presumably a networkx node view, which is
    # not a sequence -- confirm against pp.load_graph). The sample size is clamped so that
    # graphs with fewer than 'subgraph_size' nodes are used whole instead of raising
    # ValueError.
    nodes = list(G.nodes)
    sample_size = min(subgraph_size, len(nodes))
    SG = G.subgraph(random.sample(nodes, sample_size)).copy()

    # Preprocess the graph
    SG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True)

    # Store the preprocessed subgraph to a file
    pp.save_graph(SG, output_path + "prep_graph.edgelist", delimiter=",")

    # Alternatively, train/test splits can be computed one at a time
    train_E, test_E = stt.split_train_test(SG, train_frac=0.51, seed=99)

    print(train_E)

    # Compute set of false edges (the results are not used further here; the call itself
    # is what is being exercised)
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        SG, train_E=train_E, test_E=test_E, num_fe_train=None, num_fe_test=None, seed=99)
def preprocess(setup, i):
    """
    Graph preprocessing routine.

    Loads the i-th input network described by `setup`, preprocesses it and, when
    `setup.prep_nw_name` is set, stores the result under the i-th output path.

    Parameters
    ----------
    setup : object
        Configuration holder. The attributes read here are `verbose`, `inpaths`,
        `separators`, `comments`, `directed`, `relabel`, `del_selfloops`, `prep_nw_name`,
        `outpaths`, `delimiter` and `write_stats`.
    i : int
        Index of the network to process within the per-network lists of `setup`.

    Returns
    -------
    G : graph
        The preprocessed graph, as returned by `pp.prep_graph`.
    """
    if setup.verbose:
        print('Preprocesing graph...')

    # Read the raw network using the per-network parsing options
    graph = pp.load_graph(
        setup.inpaths[i],
        delimiter=setup.separators[i],
        comments=setup.comments[i],
        directed=setup.directed[i],
    )

    # Only the graph is needed; the second value returned by prep_graph is discarded
    graph, _ = pp.prep_graph(graph, relabel=setup.relabel, del_self_loops=setup.del_selfloops)

    # No file name configured: nothing to persist
    if setup.prep_nw_name is None:
        return graph

    # The output location is the plain concatenation of output path and file name
    pp.save_graph(
        graph,
        output_path=setup.outpaths[i] + setup.prep_nw_name,
        delimiter=setup.delimiter,
        write_stats=setup.write_stats,
    )
    return graph
def save_tr_graph(self, output_path, delimiter, write_stats=False, write_weights=False, write_dir=True):
    """
    Saves the graph stored in `self._TG` to a file by delegating to `pp.save_graph`.

    Parameters
    ----------
    output_path : file or string
        File or filename to write. If a file is provided, it must be opened in 'wb' mode.
    delimiter : string
        The string used to separate values.
    write_stats : bool, optional
        Sets if graph statistics should be added to the edgelist or not. Default is False.
    write_weights : bool, optional
        If True data will be stored as weighted edgelist (e.g. triplets src, dst, weight)
        otherwise as normal edgelist. If the graph edges have no weight attribute and this
        parameter is set to True, a weight of 1 will be assigned to each edge.
        Default is False.
    write_dir : bool, optional
        This option is only relevant for undirected graphs. If False, the train graph will
        be stored with a single direction of the edges. If True, both directions of edges
        will be stored. Default is True.
    """
    # Every argument is forwarded unchanged, by keyword
    save_options = {
        'output_path': output_path,
        'delimiter': delimiter,
        'write_stats': write_stats,
        'write_weights': write_weights,
        'write_dir': write_dir,
    }
    pp.save_graph(self._TG, **save_options)
def test():
    """
    Round-trip and preprocessing check on './data/network.edgelist'.

    Loads the network, prints its statistics, saves and reloads it, prints the statistics
    of the reloaded copy, preprocesses that copy (relabelling nodes, keeping self loops),
    saves the preprocessed graph, prints the first ten entries of the id list returned by
    `pp.prep_graph` and finally calls `pp.get_redges_false` with 'redges_false.csv' as
    output file.
    """
    # Locations and shared formatting
    data_dir = "./data/"
    out_dir = "./data/"
    network_file = "network.edgelist"
    rule = "-----------------------------------------"
    saved_copy = out_dir + "orig_graph.edgelist"

    # Read the original network
    original = pp.load_graph(data_dir + network_file, delimiter=',', comments='#', directed=True)

    # Report on the graph as loaded
    print("")
    print("Original graph stats:")
    print(rule)
    pp.get_stats(original)

    # Write the graph out and read it straight back
    pp.save_graph(original, saved_copy, delimiter=",")
    reloaded = pp.load_graph(saved_copy, delimiter=",", comments='#', directed=True)

    # The reloaded copy is expected to show the same statistics
    print("Has the same stats after being loaded?:")
    print(rule)
    pp.get_stats(reloaded)

    # Preprocess the reloaded graph, keeping self loops and relabelling nodes
    prepped, ids = pp.prep_graph(reloaded, del_self_loops=False, relabel=True)

    print("Preprocessed graph stats (restricted to main cc):")
    print(rule)
    pp.get_stats(prepped)

    pp.save_graph(prepped, out_dir + "prep_graph.edgelist", delimiter=",")

    print("Sample of 10 (oldNodeID, newNodeID):")
    print(rule)
    print(ids[0:10])

    pp.get_redges_false(prepped, out_dir + "redges_false.csv")
def save_tr_graph(self, output_path, delimiter, write_stats=False):
    """
    Saves the graph stored in `self._TG` to a file by delegating to `pp.save_graph`.

    Parameters
    ----------
    output_path : file or string
        File or filename to write. If a file is provided, it must be opened in 'wb' mode.
    delimiter : string
        The string used to separate values.
    write_stats : bool, optional
        Sets if graph statistics should be added to the edgelist or not. Default is False.
    """
    # Every argument is forwarded unchanged, by keyword
    save_options = {
        'output_path': output_path,
        'delimiter': delimiter,
        'write_stats': write_stats,
    }
    pp.save_graph(self._TG, **save_options)
def preprocess(inpath, outpath, delimiter, directed):
    """
    Graph preprocessing routine.

    Loads the network found at `inpath`, preprocesses it (relabelling nodes and deleting
    self loops) and stores the result as a comma-separated edgelist.

    Parameters
    ----------
    inpath : string
        Path of the edgelist to load. Lines starting with '#' are treated as comments.
    outpath : string
        Prefix of the output location. The file name 'prep_graph.edgelist' is appended to
        it verbatim, so a directory must be given with its trailing separator.
    delimiter : string
        The string separating values in the input file.
    directed : bool
        Forwarded to `pp.load_graph` as its `directed` argument.

    Returns
    -------
    G : graph
        The preprocessed graph, as returned by `pp.prep_graph`.
    """
    print('Preprocessing graph...')

    # Read the raw network
    graph = pp.load_graph(inpath, delimiter=delimiter, comments='#', directed=directed)

    # Only the graph is needed; the second value returned by prep_graph is discarded
    graph, _ = pp.prep_graph(graph, relabel=True, del_self_loops=True)

    # Persist the preprocessed network with write_stats enabled
    prep_file = outpath + "prep_graph.edgelist"
    pp.save_graph(graph, output_path=prep_file, delimiter=',', write_stats=True)

    return graph
# Report statistics of the loaded graph, then write them to a file as well
pp.get_stats(G)
pp.get_stats(G, output_path + "stats.txt")

# Preprocess the graph: relabel nodes and delete self loops
SG, ids = pp.prep_graph(
    G,
    relabel=True,
    del_self_loops=True,
)

# For directed graphs, get non-edges so that the reversed edge exists in the graph
if directed:
    redges = pp.get_redges_false(
        SG,
        output_path=output_path + "redges.csv",
    )

# Persist the preprocessed graph with write_stats enabled
pp.save_graph(
    SG,
    output_path=output_path + "network_prep.edgelist",
    delimiter=',',
    write_stats=True,
)

# ----------------
# Split train test
# ----------------

# Compute train/test splits and false edges in parallel
stt.compute_splits_parallel(
    SG,
    output_path + "lp_train_test_splits/network_prep_51",
    owa=True,
    train_frac=0.51,
    num_fe_train=None,
    num_fe_test=None,
    num_splits=5,
)