def graphCreationForSingleStudent(transitionRow, activityCodeList, mode='networkx'):
    """Build an undirected activity-transition graph for a single student.

    Parameters
    ----------
    transitionRow : pandas.Series
        One row of transition data. It is indexed both by the transition
        keys and by the activity names produced by ``generateTransition``.
    activityCodeList :
        Passed straight to ``generateTransition`` to enumerate candidate
        transitions.
    mode : str
        ``'networkx'`` returns the ``nx.Graph``; any other value returns the
        graph converted with ``StellarGraph.from_networkx``.

    Each item yielded by ``generateTransition`` is read as
    ``(node_id_pair, transition_key, activity_name_pair)``. A transition is
    added only when its key is present in ``transitionRow`` and its value is
    strictly positive.
    """
    graph = nx.Graph()
    for transition in generateTransition(activityCodeList):
        transition_key = transition[1]
        if transition_key not in transitionRow.index:
            continue
        transition_weight = transitionRow[transition_key]
        if not transition_weight > 0:
            continue

        node_ids = transition[0]
        activity_names = transition[2]
        for position in (0, 1):
            node_id = node_ids[position]
            # Nodes only enter the graph through this branch, so asking the
            # graph itself replaces the separate (linear-time) tracking list.
            if node_id not in graph:
                activity_name = activity_names[position]
                graph.add_node(node_id,
                               weight=transitionRow[activity_name],
                               name=activity_name)
        graph.add_edge(node_ids[0], node_ids[1], weight=transition_weight)

    if mode == 'networkx':
        return graph
    return StellarGraph.from_networkx(graph)
def eda(graph):
    '''
    Compute summary statistics for one apk graph and write them to a CSV.

    graph --> filepath to a GML graph

    Returns a short status string when the CSV already exists in either
    output directory or when the graph cannot be loaded; otherwise returns
    the result of ``DataFrame.to_csv`` for the one-row summary.
    '''
    app_dir, app_filename = os.path.split(graph)

    # building output
    target = "/teams/DSC180A_FA20_A00/a04malware/personal-group03/eda_sab/features1/"
    out_csv = os.path.join(target, (app_filename + ".csv"))
    target1 = "/teams/DSC180A_FA20_A00/a04malware/personal-group03/eda/features/"
    others = os.path.join(target1, (app_filename + ".csv"))

    if os.path.exists(out_csv):
        print("csv exists already")
        return "csv exists already"
    if os.path.exists(others):
        print("csv others exists already")
        return "csv others exists already"

    try:
        nx_graph = nx.read_gml(graph)
    except Exception:
        # Any load failure is reported to the caller as a status string, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return graph + " might be broken!"

    stellar = StellarGraph.from_networkx(nx_graph, node_type_attr="type")
    node_types = {
        node_type: len(stellar.nodes_of_type(node_type=node_type))
        for node_type in stellar.node_types
    }

    # The label is derived from the directory name: paths containing
    # "benign" get 0, everything else gets 1.
    label = 0 if "benign" in app_dir else 1

    data = {
        "app": graph,
        "node_types_counts": len(stellar.node_types),
        "node_types": node_types,
        # number of nodes and edges
        "number_nodes": len(stellar.nodes()),
        "number_edges": len(stellar.edges()),
        "label": label,
    }
    df = pd.DataFrame([data])
    return df.to_csv(out_csv)
def __init__(self, edges_path, lables_path):
    """
    Initialise with hard-coded search settings and data loaded from disk.

    The graph and labels come from ``self.read_data``; the walker is a
    ``BiasedRandomWalk`` over the StellarGraph conversion of that graph.
    """
    graph, labels = self.read_data(edges_path, lables_path)
    walker = BiasedRandomWalk(StellarGraph.from_networkx(graph))

    fstar = 1
    # p, q search interval: (left bound, right bound)
    left_bound, right_bound = 0.125, 4.125
    super().__init__(fstar, left_bound, right_bound, graph, labels, walker)
def preprocessing(self, g, train_node, file_emb_output="./emb/100_900_nede2vec.emb"):
    """
    Prepare a feature-carrying StellarGraph and full-batch GCN data flows.

    g               --> networkx graph (a copy is converted, g is not passed directly)
    train_node      --> table whose 'values' column holds the node labels
    file_emb_output --> word2vec-format embedding file used as node features

    Returns, in this order: G, train_gen, train_targets, val_gen,
    val_targets, test_targets, test_gen, all_gen, generator.
    """
    node_subjects = train_node['values'].astype(str)
    print(Counter(node_subjects))

    # Load the pre-computed embeddings and index them by integer node id.
    model = KeyedVectors.load_word2vec_format(file_emb_output)
    embedded_ids = model.wv.index2word
    embedding_matrix = model.wv.vectors  # num
    print("Embedding load success.")
    embedding_frame = pd.DataFrame(embedding_matrix, index=map(int, embedded_ids))

    G = StellarGraph.from_networkx(
        g.copy(),
        node_features=embedding_frame,
        node_type_default="n",
        edge_type_default="e",
    )
    print(G.info())

    # Stratified split: 160 training labels, then 20 validation labels,
    # with the remainder used for testing.
    train_subjects, held_out = model_selection.train_test_split(
        node_subjects, train_size=160, test_size=None, stratify=node_subjects
    )
    val_subjects, test_subjects = model_selection.train_test_split(
        held_out, train_size=20, test_size=None, stratify=held_out
    )
    # Result is discarded; kept so the executed calls match the original.
    train_subjects.value_counts().to_frame()

    target_encoding = preprocessing.LabelBinarizer()
    train_targets = target_encoding.fit_transform(train_subjects)
    val_targets = target_encoding.transform(val_subjects)
    test_targets = target_encoding.transform(test_subjects)

    generator = FullBatchNodeGenerator(G, method="gcn")
    train_gen = generator.flow(train_subjects.index, train_targets)
    val_gen = generator.flow(val_subjects.index, val_targets)
    test_gen = generator.flow(test_subjects.index, test_targets)
    all_gen = generator.flow(node_subjects.index)

    return (G, train_gen, train_targets, val_gen, val_targets,
            test_targets, test_gen, all_gen, generator)
def createGraphFromCounter(dfg, mode='networkx'):
    """Build a directed graph from an iterable of (source, target) transitions.

    Parameters
    ----------
    dfg :
        Iterable whose items are indexable, with the source activity at
        position 0 and the target activity at position 1.
    mode : str
        ``'networkx'`` returns the ``nx.DiGraph``; any other value returns the
        graph converted with ``StellarGraph.from_networkx``.

    Every activity becomes a node carrying its own id as the ``name``
    attribute, and every transition becomes an edge with ``weight=1``.
    """
    graph = nx.DiGraph()
    for transition in dfg:
        source, target = transition[0], transition[1]
        for activity in (source, target):
            # The graph is the single source of truth for "already added",
            # which avoids scanning a separate tracking list per transition.
            if activity not in graph:
                graph.add_node(activity, name=activity)
        graph.add_edge(source, target, weight=1)

    if mode == 'networkx':
        return graph
    return StellarGraph.from_networkx(graph)
def API_abstraction_vectorized(inFP, outFP, kind, to_return, truename=False):
    """
    abstracts edges and nodes of ONE APP to some level

    inFP      --> input file path (should be .gml.bz2)
    outFP     --> output directory (not used by this function)
    kind      --> (str) FAMILY or PACKAGE or CLASS
    to_return --> "NX" for a networkx MultiDiGraph, "SG" for a StellarGraph
    truename  --> when False the apk node is added with an empty name,
                  otherwise with the app name

    Returns ``[graph, metapaths]``. If the input cannot be read, a
    ``"<path> might be broken!"`` string is returned instead, and ``None`` is
    returned for any other ``to_return`` value.
    """
    # getting the app name
    direc, app_name = utils.dir_and_app(inFP)

    try:
        nx_graph = nx.read_gml(inFP)
    except Exception:
        # Report unreadable inputs to the caller without swallowing
        # KeyboardInterrupt/SystemExit.
        return inFP + " might be broken!"

    nx_nodes = np.array(nx_graph.nodes(data=True))
    nx_edges = np.array(nx_graph.edges, dtype=object)

    newnodes = [API_abstraction(kind, node) for node in nx_nodes]
    newedges = [edge_processing(kind, edge) for edge in nx_edges]

    G = nx.MultiDiGraph()
    G.add_nodes_from(newnodes)
    G.add_edges_from(newedges)

    # Equality (not truthiness) is kept on purpose so that values such as
    # None keep selecting the app-name branch exactly as before.
    if truename == False:  # noqa: E712
        G = add_apk_node(G, "")
    else:
        G = add_apk_node(G, app_name)

    metapaths = dfs(G, app_name)

    if to_return == "NX":
        return [G, metapaths]
    if to_return == "SG":
        stellar = StellarGraph.from_networkx(G, node_type_attr="type")
        return [stellar, metapaths]
    # Unknown `to_return` values have always produced None; keep that explicit.
    return None
# Load the node-feature tensor from the MATLAB file.
# NOTE(review): `A` (the adjacency data) is not defined in this part of the
# script — presumably it is loaded just above; confirm.
X = sio.loadmat('X_vector_Full_.mat')
X = X['X_vec']

# Create labels: 3648 zeros followed by 2156 ones (5804 in total).
# Per the original author, the first 3648 entries of A and X are healthy and
# the remaining 2156 are depressed.
Y = np.append(np.zeros(shape=(3648,1)), np.ones(shape=(2156,1)))

# Reshape to a common leading dimension of 5804 and shuffle A, X and Y
# together so that rows stay aligned across the three arrays.
A, X, Y = shuffle(np.reshape(A, (5804,62,62)), np.reshape(X, (5804, 62, 2)), np.reshape(Y, (5804,1,1)))
Y = np.reshape(Y, (5804,))
graph_labels = pd.DataFrame(Y)  # labels as a single-column DataFrame

# Format input data (A, X) as one StellarGraph object per participant.
graphs = []
for participant in tqdm(range(len(X))):
    # Build a networkx graph from this participant's 62x62 adjacency matrix.
    # NOTE(review): `nx.from_numpy_matrix` was removed in networkx 3.0 in
    # favour of `nx.from_numpy_array` — confirm the pinned networkx version.
    G = nx.from_numpy_matrix(A[participant])
    for node in range(62):
        # Attach this node's feature vector under the attribute name 'x'.
        G.nodes[node]['x'] = X[participant, node, :]
    # Convert to StellarGraph, reading node features from attribute 'x'.
    graphs.append(StellarGraph.from_networkx(G, node_features="x"))

generator = PaddedGraphGenerator(graphs=graphs)

k = 10  # the number of rows for the output tensor
layer_sizes = [64, 32, 16, 8, 4, 2, 1]  # GCN layer sizes (one activation per layer below)
dgcnn_model = DeepGraphCNN( layer_sizes=layer_sizes, activations=["relu", "relu", "tanh", "tanh", "tanh","tanh","tanh"], k=k, bias=True, generator=generator, )
x_inp, x_out = dgcnn_model.in_out_tensors()
def barbell():
    """Return networkx's barbell graph (m1=10, m2=11) as a StellarGraph."""
    nx_barbell = nx.barbell_graph(m1=10, m2=11)
    return StellarGraph.from_networkx(nx_barbell)
def petersen_graph() -> StellarGraph:
    """Return the Petersen graph as a StellarGraph with ``node_features()`` attached."""
    return StellarGraph.from_networkx(
        nx.petersen_graph(),
        node_features=node_features(),
    )
# Split the candidate links and their labels into train and test subsets.
link_ids_train = link_ids[train_indices]
link_ids_test = link_ids[test_indices]
link_labels_train = link_labels[train_indices]
link_labels_test = link_labels[test_indices]

# Training graph: a copy of `g` with the test links removed.
# NOTE(review): presumably link_ids also contains negative (non-existent)
# pairs; networkx's remove_edges_from ignores edges that are absent — confirm.
g_train = g.copy()
edgelist = [(start, end) for start, end in zip(link_ids_test[:,0], link_ids_test[:,1]) ]
g_train.remove_edges_from(edgelist)

# Test graph: a copy of `g` with the training links removed.
g_test = g.copy()
edgelist = [(start, end) for start, end in zip(link_ids_train[:,0], link_ids_train[:,1]) ]
g_test.remove_edges_from(edgelist)

# Convert the full, train and test graphs; node features are read from the
# "feature" node attribute.
G = StellarGraph.from_networkx(g, node_features="feature")
g_train = StellarGraph.from_networkx(g_train, node_features="feature")
g_test = StellarGraph.from_networkx(g_test, node_features="feature")
print(g_train.info())
print(g_test.info())

## GraphSAGE link generators
batch_size = 40
num_samples = [15, 10, 5]  # one sample size per hop (three hops)
train_gen = GraphSAGELinkGenerator(g_train, batch_size, num_samples)
train_flow = train_gen.flow(link_ids_train, link_labels_train, shuffle=True)
test_gen = GraphSAGELinkGenerator(g_test, batch_size, num_samples)
test_flow = test_gen.flow(link_ids_test, link_labels_test)
def barbell():
    """Return a barbell graph (m1=10, m2=11) with increasing edge weights.

    The i-th edge in ``graph.edges`` order (counting from 1) gets weight i / 5.
    """
    graph = nx.barbell_graph(m1=10, m2=11)
    for position, (src, tgt) in enumerate(graph.edges, start=1):
        graph.edges[src, tgt]["weight"] = position / 5
    return StellarGraph.from_networkx(graph)
from stellargraph.layer import GraphSAGE
import stellargraph as sg
from tensorflow.keras.models import load_model
from tensorflow.keras import layers, optimizers, losses, metrics, Model, models
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from sklearn import preprocessing, feature_extraction, model_selection
import numpy as np
import tensorflow as tf
import scipy.stats as stats
from src.visualization import visualize as vs
from src.features import build_features as bf

## ---------------------------------------------------------------------------
## Build graph
## ---------------------------------------------------------------------------
# NOTE(review): `StellarGraph`, `gest` and `targetdf` are not defined in this
# part of the file — presumably imported/created earlier; confirm.
# Node features are read from the "feature" node attribute.
G = StellarGraph.from_networkx(gest, node_features="feature")
print(G.info())

# 80% of the target rows go to training; the rest are used for testing.
train_subjects, test_subjects = model_selection.train_test_split(targetdf, train_size=0.8, test_size=None)
train_targets = np.array(train_subjects)
test_targets = np.array(test_subjects)


##
def get_dropout(input_tensor, p=0.1, mc=False):
    """Apply a Dropout layer with rate ``p`` to ``input_tensor``.

    When ``mc`` is true the layer is called with ``training=True``; otherwise
    the ``training`` argument is left to Keras.
    NOTE(review): `mc` presumably stands for Monte Carlo dropout — confirm.
    """
    if mc:
        return Dropout(p)(input_tensor, training=True)
    else:
        return Dropout(p)(input_tensor)


## ---------------------------------------------------------------------------
## GraphSAGE model building
## ---------------------------------------------------------------------------
# NOTE(review): this statement presumably belongs to an edge-building loop
# that starts above this part of the file — confirm its nesting.
# Adds a weighted edge using the 'r.weight' entry of the current edge record.
G.add_edge(node_id_m, node_id_n, weight=edge_dict['r.weight'])
# Drop the per-edge temporaries.
del node_id_m, node_id_n, edge_dict

# Check the graph exists. This looks suspicious in that the in- and
# out-degree are the same to within 4 decimal places.
# (The result of this bare expression is only shown in an interactive session.)
nx.info(G)

# The fact that "not_found_dict" is not empty means that the two datasets
# contain different data. The next step is to create a more complete dataset
# by downloading everything at once from neo4j.
# (Bare expressions: only displayed in an interactive session.)
len(found_dict)
len(not_found_dict)

# Now build the Deep Graph Infomax embeddings model:
# https://stellargraph.readthedocs.io/en/v1.2.1/demos/embeddings/deep-graph-infomax-embeddings.html
# Note that there is an alternative graph class called StellarDiGraph, but no
# difference in performance was seen. The lack of difference suggests that
# the networkx graph is not really taking account of direction.
# Node features are read from the "v" node attribute.
stellar_G = StellarGraph.from_networkx(graph=G, node_features="v")
print(stellar_G.info())

# https://stellargraph.readthedocs.io/en/stable/api.html#stellargraph.mapper.FullBatchNodeGenerator
# Dense (non-sparse) full-batch generator with GCN preprocessing; edge
# weights are used (weighted=True).
fullbatch_generator = FullBatchNodeGenerator(G=stellar_G, sparse=False, weighted=True, method='gcn')

# Intuition for GNNs:
# https://medium.com/analytics-vidhya/getting-the-intuition-of-graph-neural-networks-a30a2c34280d
# Understanding GCN:
# https://towardsdatascience.com/understanding-graph-convolutional-networks-for-node-classification-a2bfdb7aba7b
# StellarGraph implementation:
# https://medium.com/stellargraph/do-i-know-you-flexible-unsupervised-and-semi-supervised-graph-models-with-deep-graph-infomax-96fbfd63ec31 # noqa: E501
# 2-layer GCN model
def node2vec():
    """Run the Node2Vec edge-weight regression experiment and print metrics.

    For every dataset (all six built-in names when ``args.dataset == 'all'``,
    otherwise just ``args.dataset``) the experiment is repeated
    ``args.exp_number`` times: the data is split randomly, weighted biased
    random walks are embedded with Word2Vec, a ``Node2Vec`` model is trained
    with an MSE loss, and test MSE / Pearson correlation / MAE are recorded.
    Per-dataset means and standard deviations are printed at the end, after
    which the process exits via ``exit()``.

    Reads configuration from the module-level ``args`` and moves tensors to
    the GPU with ``.cuda()``, so a CUDA device is required.
    """
    print('Training Node2Vec mode!')
    # initialize results arrays (one slot per repetition, reset per dataset)
    total_mse = np.zeros(args.exp_number)
    total_pcc = np.zeros(args.exp_number)
    total_mae = np.zeros(args.exp_number)
    # per-dataset aggregates: mean and standard deviation of each metric
    mse_datasets = {}
    std_datasets = {}
    pcc_datasets = {}
    pcc_std_datasets = {}
    mae_datasets = {}
    mae_std_datasets = {}
    t_total = time.time()
    if args.dataset == 'all':
        datasets = [ 'airport', 'collaboration', 'congress', 'forum', 'geom', 'astro' ]
    else:
        datasets = [args.dataset]
    for dataset in datasets:
        for exp_number in range(args.exp_number):
            print("%s: experiment number %d" % (dataset, exp_number + 1))
            data = preprocess_dataset.clean_data(dataset)
            # Every dataset except 'usair' has its weight column normalised.
            if dataset != 'usair':
                data['weights'] = preprocessing.normalize([data['weights']])[0]
            # random split of data: 20% test, then 8% of the remainder for validation
            data_train, data_test = train_test_split(data, test_size=0.2)
            data_train, data_val = train_test_split(data_train, test_size=0.08)
            data = data.reset_index()
            data_train = data_train.reset_index()
            data_val = data_val.reset_index()
            data_test = data_test.reset_index()
            G = preprocess_dataset.create_graph_gcn(dataset, data, data_train)
            val_G = preprocess_dataset.create_graph_gcn( dataset, data, data_val)
            test_G = preprocess_dataset.create_graph_gcn( dataset, data, data_test)
            nodes_len = len(G.nodes)
            # Map each node id of the training graph to its enumeration position.
            node_ids_to_index = {}
            for i, node_id in enumerate(G.nodes):
                node_ids_to_index[node_id] = i
            # NOTE(review): the adjacency tensors and `nodes_len` are not read
            # again anywhere in this function — confirm whether they are needed.
            train_A = nx.adjacency_matrix(G)
            val_A = nx.adjacency_matrix(val_G)
            test_A = nx.adjacency_matrix(test_G)
            train_labels = torch.FloatTensor( data_train['weights'].values).cuda()
            val_labels = torch.FloatTensor(data_val['weights'].values).cuda()
            test_labels = torch.FloatTensor(data_test['weights'].values).cuda()
            train_A = sparse_mx_to_torch_sparse_tensor(train_A).cuda()
            val_A = sparse_mx_to_torch_sparse_tensor(val_A).cuda()
            test_A = sparse_mx_to_torch_sparse_tensor(test_A).cuda()
            # From here on `G` is rebound to the converted graph used for walking.
            G = sg.from_networkx(G)
            rw = BiasedRandomWalk(G)
            weighted_walks = rw.run(
                nodes=G.nodes(),  # root nodes
                length=args.length,  # maximum length of a random walk
                n=args.n_size,  # number of random walks per root node
                p=args. p,  # Defines (unnormalised) probability, 1/p, of returning to source node
                q=args. q,  # Defines (unnormalised) probability, 1/q, for moving away from source node
                weighted=True,  # for weighted random walks
                seed=42,  # random seed fixed for reproducibility
            )
            print("Number of random walks: {}".format(len(weighted_walks)))
            weighted_model = Word2Vec(weighted_walks, vector_size=args.vector_size, window=5, min_count=0, sg=1, workers=4)
            # NOTE(review): the rows of `wv.vectors` follow the Word2Vec
            # vocabulary order, while the index tensors built below come from
            # `node_ids_to_index` (enumeration order of the networkx nodes) —
            # confirm that these two orders agree.
            weights = torch.FloatTensor(weighted_model.wv.vectors).cuda()
            ######################################## training edge endpoints
            train_n1 = torch.tensor(data_train['A'].values).cuda()
            train_n2 = torch.tensor(data_train['B'].values).cuda()
            # Translate endpoint node ids (columns 'A' and 'B') into positions.
            train_n1_indices = torch.ones(train_n1.shape[0])
            for i, value in enumerate(train_n1):
                train_n1_indices[i] = node_ids_to_index[value.item()]
            train_n1_indices = train_n1_indices.cuda().long()
            train_n2_indices = torch.ones(train_n1.shape[0])
            for i, value in enumerate(train_n2):
                train_n2_indices[i] = node_ids_to_index[value.item()]
            train_n2_indices = train_n2_indices.cuda().long()
            ######################################## validation edge endpoints
            val_n1 = torch.tensor(data_val['A'].values).cuda()
            val_n2 = torch.tensor(data_val['B'].values).cuda()
            val_n1_indices = torch.ones(val_n1.shape[0])
            for i, value in enumerate(val_n1):
                val_n1_indices[i] = node_ids_to_index[value.item()]
            val_n1_indices = val_n1_indices.cuda().long()
            val_n2_indices = torch.ones(val_n1.shape[0])
            for i, value in enumerate(val_n2):
                val_n2_indices[i] = node_ids_to_index[value.item()]
            val_n2_indices = val_n2_indices.cuda().long()
            ######################################## test edge endpoints
            test_n1 = torch.tensor(data_test['A'].values).cuda()
            test_n2 = torch.tensor(data_test['B'].values).cuda()
            test_n1_indices = torch.ones(test_n1.shape[0])
            for i, value in enumerate(test_n1):
                test_n1_indices[i] = node_ids_to_index[value.item()]
            test_n1_indices = test_n1_indices.cuda().long()
            test_n2_indices = torch.ones(test_n1.shape[0])
            for i, value in enumerate(test_n2):
                test_n2_indices[i] = node_ids_to_index[value.item()]
            test_n2_indices = test_n2_indices.cuda().long()
            ######################################## model and optimiser
            model = Node2Vec(weights, 0.5)
            optimizer = optim.Adam(model.parameters(), lr=args.lr)
            model.train()
            model = model.to(args.device)
            # train: one full-batch optimisation step per epoch
            for epoch in range(args.epochs):
                t = time.time()
                model.train()
                optimizer.zero_grad()
                output = model(train_n1_indices, train_n2_indices)
                loss_train = F.mse_loss(output, train_labels)
                loss_train.backward()
                optimizer.step()
                # validation
                # NOTE(review): unlike the test pass below, this forward pass
                # is not wrapped in torch.no_grad() — confirm that is intended.
                model.eval()
                output = model(val_n1_indices, val_n2_indices)
                loss_val = F.mse_loss(output, val_labels)
                if args.verbose:
                    print('Epoch: {:04d}'.format(epoch + 1), 'loss_train: {:.4f}'.format(loss_train.item()), 'loss_val: {:.4f}'.format(loss_val.item()), 'time: {:.4f}s'.format(time.time() - t))
            # test
            model.eval()
            with torch.no_grad():
                output = model(test_n1_indices, test_n2_indices)
                # NOTE(review): only the MSE flattens `output`; the PCC and
                # MAE below use it as returned — confirm the shapes agree.
                loss_test = F.mse_loss(torch.flatten(output), test_labels)
                pcc_test = pearson_correlation(test_labels, output)
                mae_test = F.l1_loss(output, test_labels)
                print("Test set results:", "loss= {:.10f}".format(loss_test.item()), "pcc= {:.10f}".format(pcc_test), "mae= {:.10f}".format(mae_test.item()))
            total_mse[exp_number] = loss_test
            total_pcc[exp_number] = pcc_test
            total_mae[exp_number] = mae_test
        # results: aggregate over repetitions, then reset the per-run arrays
        mse_datasets[dataset] = np.mean(total_mse)
        std_datasets[dataset] = np.std(total_mse)
        total_mse = np.zeros(args.exp_number)
        # NaN correlations are excluded from the PCC statistics.
        pcc_datasets[dataset] = np.mean(total_pcc[~np.isnan(total_pcc)])
        pcc_std_datasets[dataset] = np.std(total_pcc[~np.isnan(total_pcc)])
        total_pcc = np.zeros(args.exp_number)
        mae_datasets[dataset] = np.mean(total_mae)
        mae_std_datasets[dataset] = np.std(total_mae)
        total_mae = np.zeros(args.exp_number)
    # Summary: the value is inserted with str.format first, then the dataset
    # name is substituted into the remaining %s placeholder.
    for dataset in datasets:
        print("MSE %s: {:,f}".format(mse_datasets[dataset]) % dataset)
        print("MSE_STD %s: {:,f}".format(std_datasets[dataset]) % dataset)
        print("PCC %s: {:,f}".format(pcc_datasets[dataset]) % dataset)
        print("PCC_STD %s: {:,f}".format(pcc_std_datasets[dataset]) % dataset)
        print("MAE %s: {:,f}".format(mae_datasets[dataset]) % dataset)
        print("MAE_STD %s: {:,f}".format(mae_std_datasets[dataset]) % dataset)
    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    # Terminates the interpreter once the experiment has been reported.
    exit()
# Register the pre-configured TF1-compat session with Keras.
# NOTE(review): `tf` and `session` are used here before the `import tensorflow`
# below, so they are presumably set up earlier in the file — confirm.
tf.compat.v1.keras.backend.set_session(session)
from tensorflow.keras import layers, optimizers, losses, metrics, Model, models
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from sklearn import preprocessing, feature_extraction, model_selection
import matplotlib
# Select the Tk backend before pyplot is imported.
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report

## ---------------------------------------------------------------------------
## Build graph
## ---------------------------------------------------------------------------
#%%
# Node features are read from the "feature" node attribute.
# NOTE(review): `StellarGraph`, `gobsnoise` and `targetdf` are not defined in
# this part of the file — presumably created earlier; confirm.
G = StellarGraph.from_networkx(gobsnoise, node_features="feature")
print(G.info())

# 80% of the target rows go to training; the rest are used for testing.
train_subjects, test_subjects = model_selection.train_test_split( targetdf, train_size=0.8, test_size=None)

# Disabled alternative that encoded the targets with `target_encoding`:
# temp_train_subjects = np.reshape(np.array(train_subjects), (train_subjects.shape[0],1))
# temp_test_subjects = np.reshape(np.array(test_subjects), (test_subjects.shape[0],1))
# train_targets = target_encoding.fit_transform(temp_train_subjects).toarray()
# test_targets = target_encoding.transform(temp_test_subjects).toarray()

# Targets are used as plain arrays.
train_targets = np.array(train_subjects)
test_targets = np.array(test_subjects)

## ---------------------------------------------------------------------------
## GraphSAGE model building
## ---------------------------------------------------------------------------
#%%
def _load_edge_examples(csv_path):
    """Read an edge-sample CSV and return ``(examples, labels)``.

    The CSV must have ``node1``, ``node2`` and ``labels`` columns. Missing
    values are replaced by the string ``'nan'`` before extraction. Each
    example is a two-element list ``[node1, node2]``.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.replace(np.nan, 'nan', regex=True)
    labels = list(frame['labels'])
    examples = [[first, second]
                for first, second in zip(frame['node1'], frame['node2'])]
    return examples, labels


def applyLogisticRegression(save_dir):
    """Embed the sampled graph and evaluate link prediction on the test edges.

    The graph is read from ``../../graphs/graph_sampled_5.gml.gz``, the best
    random-walk parameters are chosen by ``findBestRandWalkParams``, and the
    resulting classifier is scored on the test edge sample. ROC AUC and
    accuracy are printed.

    save_dir --> prefix for the prediction output file; it is concatenated
                 directly with the file-name stem, so it should end with a
                 path separator.
    """
    print('Load the data from files')
    # NOTE(review): the train and model-selection samples are loaded but not
    # used below; the loads are kept so that missing input files still fail
    # here, as before. Confirm whether they can be dropped.
    examples_train, labels_train = _load_edge_examples(
        "../../graphs/graph_train_edges_sampled_5.csv")
    examples_model_selection, labels_model_selection = _load_edge_examples(
        "../../graphs/graph_val_edges_sampled_5.csv")
    examples_test, labels_test = _load_edge_examples(
        "../../graphs/graph_test_edges_sampled_5.csv")

    graph_to_embed = ("../../graphs/graph_sampled_5.gml.gz")
    g = nx.read_gml(graph_to_embed)

    # Assign networkx data utilizing Stellargraph
    ntxStgrph = StellarGraph.from_networkx(g)

    print(
        '\nGet the best fitting random walk parameters based on pre-selected others'
    )
    p, q, best_result, embedding_train = findBestRandWalkParams(ntxStgrph)
    print('Best p parameter ' + str(p))
    print('Best q parameter ' + str(q))
    print('Best binary operator ' +
          str(best_result['binary_operator'].__name__))

    print('\nEmbedding is ready, collect performance evaluation results')
    test_score_auc, test_score_acc = evaluate_link_prediction_model(
        best_result["classifier"], examples_test, labels_test,
        embedding_train, best_result["binary_operator"],
        save_dir + 'predicted_test_result_logit_nogrid_split5_')
    print(
        f"ROC AUC score on test set using '{best_result['binary_operator'].__name__}': {test_score_auc}"
    )
    print(
        f"Accuracy score on test set using '{best_result['binary_operator'].__name__}': {test_score_acc}"
    )
# before running the embedding a check is done to see if the file is completed completed_file_path = scratch_folder + "/" + use_model_type + "_" + uni_name + ".csv" # load path of the university path load_path = file_folder + "/" + uni_name + ".graphml" # save path of the embedded data save_path = project_folder + "/" + use_model_type + "_" + uni_name + ".csv" G_graphml = nx.read_graphml(load_path) # get the node features as a dataframe, these will then be added to the stellar graph. # This seems to work better than trying to put them in directly nodefeatures = pd.DataFrame.from_dict(dict(G_graphml.nodes(data=True)), orient='index') # print(nodefeatures) # Convert the networkx graph to a Stellargraph G = StellarGraph.from_networkx(G_graphml, node_features=nodefeatures) # We create and train our DeepGraphInfomax model (docs). Note that the loss used here must always be # tf.nn.sigmoid_cross_entropy_with_logits. fullbatch_generator = FullBatchNodeGenerator(G, sparse=False) gcn_model = GCN(layer_sizes=[2], activations=["relu"], generator=fullbatch_generator) corrupted_generator = CorruptedGenerator(fullbatch_generator) gen = corrupted_generator.flow(G.nodes()) infomax = DeepGraphInfomax(gcn_model, corrupted_generator) x_in, x_out = infomax.in_out_tensors()
plt.show() if args.sampling_method == "local": print_distance_probabilities( edge_splitter_train.negative_edge_node_distances) # this is so that Node2Vec works because it expects Graph not MultiGraph type g_test = nx.Graph(g_test) g_train = nx.Graph(g_train) if args.hin: # prepare the metapaths if given in the command line metapaths = get_metapaths_from_str(args.metapaths) train_heterogeneous_graph( g_train=StellarGraph.from_networkx(g_train), g_test=StellarGraph.from_networkx(g_test), output_node_features=args.output_node_features, edge_data_ids_train=edge_data_ids_train, edge_data_labels_train=edge_data_labels_train, edge_data_ids_test=edge_data_ids_test, edge_data_labels_test=edge_data_labels_test, metapaths=metapaths, parameters=parameters, ) else: train_homogeneous_graph( g_train=g_train, g_test=g_test, output_node_features=args.output_node_features, edge_data_ids_train=edge_data_ids_train,
def wrapper(apk, target, metapathsFP, walksFP):
    """
    wrapper to build features for doc2vec, metapath2vec.

    apk         --> filepath to the apk
    target      --> filepath to store common graph txts (for metapath2vec)
    metapathsFP --> filepath to store metapath2vec txts (for metapath2vec)
    walksFP     --> filepath to store metapths2vec walks txt (for doc2vec)

    Outputs that already exist are not regenerated. Any failure while
    processing the app is reported on stdout instead of being raised.
    Returns None.
    """
    if ".gml.bz2" in apk:
        direc, appname = utils.dir_and_app(apk)
    else:
        appname = apk

    document_out = os.path.join(walksFP, (appname + "m2v_walks.txt"))
    metapath_out = os.path.join(metapathsFP, (appname + "m2v_metapaths.txt"))
    graph_out = os.path.join(target, (appname + "graph.txt"))

    if (os.path.exists(document_out) and os.path.exists(metapath_out)
            and os.path.exists(graph_out)):
        print("the app: ", apk, " is already done!")
        return

    # A single handler covers the whole pipeline: the former inner handler
    # printed the same message and nothing ran after it.
    try:
        nx_graph, metapaths = API_abstraction_vectorized(
            apk, "", "CLASS", "NX", True)
        stellar = StellarGraph.from_networkx(nx_graph, node_type_attr="type")

        ################## COMMON GRAPH INFORMATION ##################
        if not os.path.exists(graph_out):
            # The context manager closes the file; no explicit close needed.
            with open(graph_out, 'a') as handle:
                for edge in np.array(nx_graph.edges):
                    node1, node2, weight = edge
                    type1 = nx_graph.nodes[node1]["type"]
                    type2 = nx_graph.nodes[node2]["type"]
                    # columns are: ["node1", "node2", "weight", "type1", "type2"]
                    row = " ".join([node1, node2, weight, type1, type2]) + "\n"
                    handle.write(row)

        ################## DOC2VEC AND METAPATH2VEC INFORMATION ##################
        # OUTPUT WALKS OF ONE APP
        if not os.path.exists(document_out):
            document = metapath2vec(stellar, 500, metapaths)
            np.savetxt(document_out, np.hstack(document), fmt="%s")

        # OUTPUT METAPATHS OF ONE APP
        if not os.path.exists(metapath_out):
            joined = ["->".join(lst) for lst in metapaths]
            np.savetxt(metapath_out, joined, fmt="%s")

        print("the app: ", apk, " has finished!")
    except Exception:
        # Best-effort batch processing: report and move on, but let
        # KeyboardInterrupt/SystemExit propagate.
        print("The app: ", apk, " seems to be broken!")
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import matplotlib.pyplot as plt

# Leftover from the example this script was adapted from:
# A dataset with 80 samples, each graph is
# of size [10, 20]
#dataset = MiniGCDataset(80, 10, 20)
#graph, label = dataset[0]

fig, ax = plt.subplots()

# Load every stored graph and convert it to a StellarGraph. The first element
# of each loaded object is the one passed to the conversion.
# NOTE(review): torch.load unpickles the file contents — only use it on
# trusted files.
dataset = []
count1 = 0
count2 = 0
for filename in glob.glob("ant_graphs/*.pt"):
    graph = torch.load(filename)
    s = StellarGraph.from_networkx(graph[0])
    count1 += 1
    dataset.append(s)
# NOTE(review): these files match "*_aagraph.gml" but are read with
# torch.load, exactly like the .pt files above — confirm the file format.
for filename in glob.glob("ago_graphs/*_aagraph.gml"):
    graph = torch.load(filename)
    s = StellarGraph.from_networkx(graph[0])
    count2 += 1
    dataset.append(s)

# Labels: -1 for the first 161 graphs, +1 for the following 396.
# NOTE(review): these counts are hard-coded rather than taken from
# count1/count2 — confirm they match the number of files actually loaded.
labels = []
for i in range(0, 161):
    labels.append(-1)
for i in range(0, 396):
    labels.append(1)
graph_labels = pd.DataFrame(labels)
#nx.draw(graph, ax=ax)
## The motivation was to utilize the embedded nodes for further ML processing. However, the embeddings were of poor quality. ## Not because of the algorithm, but the graph itself was noisy. import networkx as nx from stellargraph.mapper import GraphWaveGenerator from stellargraph import StellarGraph from sklearn.decomposition import PCA import numpy as np from matplotlib import pyplot as plt from scipy.sparse.linalg import eigs import tensorflow as tf from tensorflow.keras import backend as K from sklearn.manifold import TSNE G = nx.read_edgelist("/home/spirpinias/Desktop/MEGENAgraph") G = StellarGraph.from_networkx(G) sample_points = np.linspace(0, 100, 50).astype(np.float32) #degree20 and scales 5,10 degree = 10 scales = [5, 10] generator = GraphWaveGenerator(G, scales=scales, degree=degree) embeddings_dataset = generator.flow(node_ids=G.nodes(), sample_points=sample_points, batch_size=10, repeat=False) embeddings = [x.numpy() for x in embeddings_dataset]
def get_stellar_graph(self):
    """Return ``self.g`` as a StellarGraph, reading features from the 'feature' node attribute."""
    stellar_graph = StellarGraph.from_networkx(self.g, node_features='feature')
    return stellar_graph
def get_markov(inFP, outFP, kind):
    """
    obtains the markov chain for one app

    inFP  --> input file path (should be .gml.bz2)
    outFP --> output directory
    kind  --> (str) FAMILY or PACKAGE

    Returns "<inFP> IS FINISHED!" when the chain was written, a
    "<inFP> might be broken!" string when the input cannot be read, and None
    otherwise (output already present, probabilities not summing to 1, or a
    failure while saving). Raises ValueError for an unsupported ``kind``.
    """
    from collections import Counter

    direc, app_name = utils.dir_and_app(inFP)
    outputfp = os.path.join(outFP, (app_name + "_" + kind + ".txt"))
    if os.path.exists(outputfp):
        print("app ", inFP, " is already done!")
        return None

    try:
        nx_graph = nx.read_gml(inFP)
    except Exception:
        # Report unreadable inputs without swallowing KeyboardInterrupt.
        return inFP + " might be broken!"

    nx_nodes = np.array(nx_graph.nodes())
    nx_edges = np.array(nx_graph.edges, dtype=object)

    # convert to package/family mode
    vfunc = np.vectorize(get_package_family)
    newnodes = vfunc(kind, nx_nodes)
    # NOTE(review): the argument order here is (edge, kind), whereas
    # API_abstraction_vectorized calls edge_processing(kind, edge) — confirm
    # which order edge_processing expects.
    new_edges = [edge_processing(edge, kind) for edge in nx_edges]

    G = nx.MultiDiGraph()
    G.add_nodes_from(newnodes)
    G.add_edges_from(new_edges)
    stellar = StellarGraph.from_networkx(G)

    # step2: markov chain
    ## Set of possible states of the Markov chain is denoted as S
    ## If Sj and Sk are two connected states, Pjk denotes P(transition from Sj to Sk)
    ## Pjk = # of Edge(j, k) / # total edges
    if kind == "PACKAGE":
        possible_packages = get_possible_packages()
        S = ["/".join(item).strip() for item in possible_packages] + ["self_defined"]
        # NOTE(review): result unused; call kept in case it has side effects.
        possible_edges = get_possible_edges()
    elif kind == "FAMILY":
        possible_packages = POSSIBLE_FAMILIES
        # NOTE(review): result unused; call kept in case it has side effects.
        possible_edges = get_possible_family_edges()
        S = possible_packages + ["self_defined"]
    else:
        # Previously this surfaced later as an UnboundLocalError on `S`.
        raise ValueError(
            "kind must be 'PACKAGE' or 'FAMILY', got {!r}".format(kind))

    total_edges = stellar.number_of_edges()
    # Count occurrences of each (source, target) pair. A Counter gives a plain
    # "0 when absent" lookup; the previous pandas tuple lookup relied on a
    # ValueError for missing pairs, which presumably depends on the pandas
    # version (a KeyError would have escaped the handler).
    edge_counts = Counter(stellar.edges())

    markov = []
    for j in S:
        for k in S:
            ## we might have self calling loops
            occurrences = edge_counts.get((j, k), 0)
            # Only divide for pairs that occur, so an edgeless graph cannot
            # trigger a division by zero.
            markov.append(occurrences / total_edges if occurrences else 0)

    # build output fp and save (only when the probabilities sum to ~1)
    if round(sum(markov)) == 1 and not os.path.exists(outputfp):
        try:
            np.savetxt(outputfp, markov, fmt="%s")
            print("the app: ", inFP, " is done!", "mode: ", kind)
            return (inFP + " IS FINISHED!")
        except Exception:
            print("the app: ", inFP, " encountered errors!")
    return None
# before running the embedding a check is done to see if the file is completed completed_file_path = scratch_folder + "/" + use_model_type + "_" + uni_name + ".csv" # load path of the university path load_path = file_folder + "/" + uni_name + ".graphml" # save path of the embedded data save_path = project_folder + "/" + use_model_type + "_" + uni_name + ".csv" G_graphml = nx.read_graphml(load_path) # get the node features as a dataframe, these will then be added to the stellar graph. # This seems to work better than trying to put them in directly # nodefeatures = pd.DataFrame.from_dict(dict(G_graphml.nodes(data=True)), orient='index') # print(nodefeatures) # Convert the networkx graph to a Stellargraph G = StellarGraph.from_networkx(G_graphml) rw = BiasedRandomWalk(G) walks = rw.run( nodes=list(G.nodes()), # root nodes length=30, # maximum length of a random walk n=100, # number of random walks per root node p=0.5, # Defines (unormalised) probability, 1/p, of returning to source node q=2.0, # Defines (unormalised) probability, 1/q, for moving away from source node ) print("Number of random walks: {}".format(len(walks))) str_walks = [[str(n) for n in walk] for walk in walks] model = Word2Vec(str_walks, size=dims,