def main(args=None):
    """Evaluate MultiVERSE embeddings on bipartite link prediction.

    The pipeline is:

    1. Read the bipartite edge list given by ``-b``, keep its two node
       columns and split it into train/test edges with EvalNE.
    2. Write the training graph in the extended edge-list format
       ``layer n1 n2 weight`` under ``./Generated_graphs/``.
    3. Run ``./RWR/GenerateSimMatrix_MH.R`` on the two multiplex networks
       (``-n``, ``-m``) and the training graph, then load the resulting
       similarity matrix from ``./ResultsRWR/``.
    4. Train the embeddings and save them under ``./ResultsMultiVERSE/``.
    5. Evaluate the embeddings with several edge-embedding operators and
       write the parameters and scores to a result file under
       ``./ResultsMultiVERSE/``.

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments to parse. ``None`` lets ``argparse`` read
        ``sys.argv``.

    Raises
    ------
    subprocess.CalledProcessError
        If the Rscript step exits with a non-zero status.

    Notes
    -----
    The directories ``./Generated_graphs/``, ``./ResultsRWR/`` and
    ``./ResultsMultiVERSE/`` are used as-is; nothing here creates them.
    """
    cpu_number = multiprocessing.cpu_count()

    parser = argparse.ArgumentParser(description='Path of networks')
    parser.add_argument('-n', type=str, help='Multiplex 1')
    parser.add_argument('-m', type=str, help='Multiplex 2')
    parser.add_argument('-b', type=str, help='Bipartite')
    args = parser.parse_args(args)
    print(args)

    ########################################################################
    # Parameters multiverse and train/test
    ########################################################################
    # For a toy example, use CLOSEST_NODES = np.int64(2) and
    # CHUNK_SIZE = np.int64(2); every other value stays the same.
    EMBED_DIMENSION = 128
    CLOSEST_NODES = np.int64(300)
    NUM_SAMPLED = np.int64(10)
    LEARNING_RATE = np.float64(0.01)
    KL = False  # not read anywhere in this function
    NB_CHUNK = np.int64(1)
    CHUNK_SIZE = np.int64(100)
    NUM_STEPS_1 = np.int64(100 * 10**6 / CHUNK_SIZE)

    train_frac = 0.7
    solver = 'lbfgs'  # only reported in the result file
    max_iter = 1000  # only reported in the result file
    split_alg = 'random'

    # 'sqrt' is what the former 'auto' value meant for classifiers, and
    # `min_impurity_split` is left at its default: both spellings used
    # previously are rejected by recent scikit-learn releases.
    lp_model = RandomForestClassifier(
        n_estimators=400,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=True,
        oob_score=True,
        n_jobs=cpu_number,
        random_state=777,
        verbose=0,
        warm_start=False,
    )

    graph_name = 'Test_Eval'

    ########################################################################
    # !! Careful !!
    # Check that the nodes of the bipartite network are the same as the
    # nodes of the multiplex networks. If they are not, remove from the
    # multiplexes the nodes that do not appear in the bipartite network.
    ########################################################################

    ########################################################################
    # EvalNE Link prediction processing
    ########################################################################
    data_bipartite = pd.read_csv(args.b, delimiter=' ', header=None)
    # Keep only columns 1 and 2 (the two node columns).
    data_bipartite = data_bipartite.drop(columns=[0, 3])
    data_bipartite.to_csv('bipartite_2colformat.csv', header=None,
                          index=None, sep=' ')

    G_hetereogeneous = f.preprocess('bipartite_2colformat.csv', '.', ' ',
                                    False, False, True)
    print('Preprocessing done')

    G_hetereogeneous_traintest_split = EvalSplit()
    G_hetereogeneous_traintest_split.compute_splits(
        G_hetereogeneous, split_alg=split_alg, train_frac=train_frac,
        owa=False)
    nee = LPEvaluator(G_hetereogeneous_traintest_split, dim=EMBED_DIMENSION,
                      lp_model=lp_model)
    G_heterogeneous_split = G_hetereogeneous_traintest_split.TG
    os.replace('bipartite_2colformat.csv',
               './Generated_graphs/' + 'bipartite_2colformat.csv')
    print('Splitting done')

    # Write the bipartite training graph for multiverse in extended
    # edgelist format 'layer n1 n2 weight'.
    training_graph_name = 'bipartite_training_graph_' + '_' + graph_name
    edges = np.asarray(G_heterogeneous_split.edges)

    # Orient every edge so that a node listed in column 2 of the bipartite
    # file ends up in the second position. The lookup set is built once
    # instead of converting the column to a list on every iteration.
    second_column_nodes = set(data_bipartite[2])
    for row in range(len(edges)):
        if edges[row, 0] in second_column_nodes:
            edges[row, 0], edges[row, 1] = edges[row, 1], edges[row, 0]

    # Prepend the layer id (1) and append the weight (1) to every edge.
    ones_column = np.ones((len(edges), 1))
    edges = np.int_(np.hstack((ones_column, edges, ones_column)))

    with open(training_graph_name, 'w+') as file_multi:
        np.savetxt(file_multi, edges, fmt='%s', delimiter=' ',
                   newline=os.linesep)
    os.replace(training_graph_name,
               './Generated_graphs/' + training_graph_name + '.txt')

    ########################################################################
    # MULTIVERSE
    ########################################################################
    r_readRDS = robjects.r['readRDS']

    print('RWR-MH')
    # check=True: a failed R run must stop the pipeline here, otherwise the
    # readRDS call below could load a stale matrix from a previous run.
    subprocess.run(
        ['Rscript', './RWR/GenerateSimMatrix_MH.R',
         '-n', '.' + args.n,
         '-m', '.' + args.m,
         '-b', '../Generated_graphs/' + 'bipartite_training_graph_' + '_'
         + graph_name + '.txt',
         '-o', '../ResultsRWR/MatrixSimilarityMultiplexHet' + graph_name,
         '-c', str(cpu_number)],
        check=True)
    print('RWR done')

    r_DistancematrixPPI = r_readRDS(
        './ResultsRWR/MatrixSimilarityMultiplexHet' + graph_name + '.rds')

    import gc
    gc.collect()

    ########################################################################
    # Processing of the network
    ########################################################################
    (reverse_data_DistancematrixPPI, list_neighbours, nodes,
     data_DistancematrixPPI, neighborhood, nodesstr) = \
        f.netpreprocess_hetero(r_DistancematrixPPI, CLOSEST_NODES)

    ########################################################################
    # Initialization
    ########################################################################
    embeddings = np.random.normal(0, 1, [np.size(nodes), EMBED_DIMENSION])

    ########################################################################
    # Training and saving best embeddings
    ########################################################################
    neighborhood = np.asarray(neighborhood)
    nodes = np.asarray(nodes)

    embeddings = f.train(neighborhood, nodes, list_neighbours, NUM_STEPS_1,
                         NUM_SAMPLED, LEARNING_RATE, CLOSEST_NODES,
                         CHUNK_SIZE, NB_CHUNK, embeddings,
                         reverse_data_DistancematrixPPI)

    # Key each embedding row by its node label shifted by one, as a string.
    X = {str(int(nodesstr[index]) + 1): vector
         for index, vector in enumerate(embeddings)}
    np.save('embeddings_MH', X)
    date = datetime.datetime.now()
    os.replace('embeddings_MH.npy',
               './ResultsMultiVERSE/' + 'embeddings_MH.npy')

    ########################################################################
    # Link prediction for evaluation of MH
    ########################################################################
    edge_emb = ['hadamard', 'weighted_l1', 'weighted_l2', 'average',
                'cosine']
    results_embeddings_methods = dict()

    for edge_method in edge_emb:
        tmp_result_multiverse = nee.evaluate_ne(
            data_split=nee.traintest_split, X=X, method="Multiverse",
            edge_embed_method=edge_method, label_binarizer=lp_model)
        # NOTE(review): get_all()[1][4] is reported as the AUC below;
        # confirm the index against the EvalNE Results API.
        results_embeddings_methods[
            tmp_result_multiverse.method + '_' + str(edge_method)
        ] = tmp_result_multiverse.get_all()[1][4]

    ########################################################################
    # Analysis and saving of the results
    ########################################################################
    Result_file = ('Result_LinkpredMultiplexHet_' + graph_name + '_'
                   + str(date) + '.txt')

    # NOTE(review): the indentation that followed each line continuation
    # inside the original header literal is not recoverable; a single
    # space is used between entries.
    header = ("%s: \n"
              " EMBED_DIMENSION: %s \n"
              " CLOSEST_NODES: %s \n"
              " NUM_STEPS_1: %s \n"
              " NUM_SAMPLED: %s \n"
              " LEARNING_RATE: %s \n"
              " CHUNK_SIZE: %s \n"
              " NB_CHUNK: %s \n"
              " train_frac: %s \n"
              " solver: %s \n"
              " max_iter: %s \n"
              " split_alg: %s \n"
              " ")

    with open(Result_file, "w+") as overall_result:
        print(header % (str(date), EMBED_DIMENSION, CLOSEST_NODES,
                        NUM_STEPS_1, NUM_SAMPLED, LEARNING_RATE, CHUNK_SIZE,
                        NB_CHUNK, train_frac, solver, max_iter, split_alg),
              file=overall_result)
        for edge_method in edge_emb:
            print('Overall MULTIVERSE AUC %s:' % edge_method,
                  results_embeddings_methods['Multiverse_' + edge_method],
                  file=overall_result)

    # Move the report only after the `with` block has closed the file.
    os.replace(Result_file, './ResultsMultiVERSE/' + Result_file)

    print('End')
class LinkPredictionTuning(Tuning):
    r"""
    General class for training and testing graph embeddings on the link
    prediction task.

    Parameters
    ----------
    G: NetworkX graph
        Training graph.
    G_test: NetworkX graph
        Test graph.
    root: str
        Directory in which the results will be saved.
    """

    def __init__(self, G, G_test, root="results/lp/"):
        super(LinkPredictionTuning, self).__init__(G, root=root)
        self.task = "lp"

        # Positive edges come from the graphs; an equal number of negative
        # edges is sampled for each of them.
        train_E = G.edges
        train_E_false = self.GetNegativeEdges(G, len(train_E))

        test_E = G_test.edges
        test_E_false = self.GetNegativeEdges(G_test, len(test_E))

        self.split = EvalSplit()
        self.split.set_splits(train_E,
                              train_E_false=train_E_false,
                              test_E=test_E,
                              test_E_false=test_E_false,
                              TG=G)

        # create_self_defined_dataset returns a callable that is invoked
        # immediately to obtain the dataset object.
        self.training_graph = create_self_defined_dataset(
            root_dir="",
            name_dict={},
            name="training " + self.tipo,
            weighted=True,
            directed=False,
            attributed=True)()
        self.training_graph.set_g(G)

        self.evaluator = LPEvaluator(self.split)

    def GetNegativeEdges(self, G, n):
        r"""
        Helper method that samples negative edges.

        Pairs are drawn uniformly at random, one node with
        ``bipartite == 0`` and one with ``bipartite == 1``, and kept when
        the graph has no edge between them. The same pair may be drawn
        more than once.

        Parameters
        ----------
        G: NetworkX graph
            Bipartite graph whose nodes carry a ``bipartite`` attribute.
        n: int
            Number of edges to sample.

        Returns
        -------
        list of tuple
            Exactly ``n`` node pairs that are not edges of ``G``.

        Notes
        -----
        The loop does not terminate if ``n > 0`` and every pair of nodes
        across the two sets is already an edge.
        """
        prop_nodes = [
            node for node, attrs in G.nodes(data=True)
            if attrs['bipartite'] == 0
        ]
        user_nodes = [
            node for node, attrs in G.nodes(data=True)
            if attrs['bipartite'] == 1
        ]

        non_edges = []
        # Strict comparison: stop as soon as n edges have been collected.
        while len(non_edges) < n:
            random_prop = random.choice(prop_nodes)
            random_user = random.choice(user_nodes)
            edge = (random_prop, random_user)
            if not G.has_edge(*edge):
                non_edges.append(edge)
        return non_edges

    def TestModel(self, emb, time=-1, method_name="method_name"):
        r"""
        Test an embedding and store the results in the scoresheet.

        Parameters
        ----------
        emb: dict
            Dictionary of embeddings; keys are the nodes and values are a
            list with the embedding.
        time: float
            Execution time of the method, recorded in the scoresheet.
        method_name: str
            Name under which the method is stored.

        Returns
        -------
        list
            One evaluation result per edge-embedding operator.
        """
        df = pd.DataFrame(emb).T
        X = df.T.to_dict("list")
        # Values must be arrays because they are added together downstream.
        X = {str(k): np.array(v) for k, v in X.items()}

        self.evaluator.dim = df.shape[1]

        reslp = []
        # TODO: avoid evaluating on all four edge embeddings.
        for edge_method in [
                "weighted_l1", "weighted_l2", "hadamard", "average"
        ]:
            res = self.evaluator.evaluate_ne(self.split,
                                             X=X,
                                             method=method_name,
                                             edge_embed_method=edge_method,
                                             params={"nw_name": "GPI"})
            res.params.update({'eval_time': time})
            reslp.append(res)

        self.scoresheet.log_results(reslp)
        return reslp