Example No. 1
def graphCreationForSingleStudent(transitionRow,
                                  activityCodeList,
                                  mode='networkx'):
    # transitionRow: pandas Series holding transition counts and activity counts
    transitionList = generateTransition(activityCodeList)
    checkActivityList = []
    G = nx.Graph()
    for i in transitionList:
        if i[1] in transitionRow.index:
            if transitionRow[i[1]] > 0:
                if i[0][0] not in checkActivityList:
                    G.add_node(i[0][0],
                               weight=transitionRow[i[2][0]],
                               name=i[2][0])
                    checkActivityList.append(i[0][0])
                if i[0][1] not in checkActivityList:
                    G.add_node(i[0][1],
                               weight=transitionRow[i[2][1]],
                               name=i[2][1])
                    checkActivityList.append(i[0][1])
                G.add_edge(i[0][0], i[0][1], weight=transitionRow[i[1]])
    if mode == 'networkx':
        return G
    else:
        return StellarGraph.from_networkx(G)
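A minimal usage sketch (not from the source). The tuple layout of generateTransition's output is inferred from the i[0]/i[1]/i[2] indexing above, and the sample Series is invented; it mixes transition counts and activity counts because the function indexes transitionRow with both.

import networkx as nx
import pandas as pd

# Hypothetical: generateTransition is assumed to yield tuples shaped like
# ((code_a, code_b), transition_column_name, (activity_a, activity_b)).
transitionRow = pd.Series({"A->B": 3, "A": 5, "B": 2})
G = graphCreationForSingleStudent(transitionRow, activityCodeList=["A", "B"])
print(G.nodes(data=True))
print(G.edges(data=True))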
Example No. 2
def eda(graph):
    '''
    eda for an apk

    graph --> filepath to a .gml graph
    builds a one-row summary (node/edge counts, node types, label) and writes
    it to a csv; returns a status string if the csv already exists or the
    graph cannot be read
    '''

    app_dir, app_filename = os.path.split(graph)
    
    #building output
    target = "/teams/DSC180A_FA20_A00/a04malware/personal-group03/eda_sab/features1/"
    out_csv = os.path.join(target, (app_filename + ".csv"))
    target1 = "/teams/DSC180A_FA20_A00/a04malware/personal-group03/eda/features/"
    others = os.path.join(target1, (app_filename + ".csv"))

    if os.path.exists(out_csv):
        print("csv exists already")
        return "csv exists already"
    if os.path.exists(others):
        print("csv others exists already")
        return "csv others exists already"

    try:
        nx_graph = nx.read_gml(graph)  # avoid shadowing the networkx module name
    except Exception:
        return graph + " might be broken!"

    stellar = StellarGraph.from_networkx(nx_graph, node_type_attr="type")

    nodes = stellar.node_types
    node_types = {}
    for node in nodes:
        node_types[node] = len(stellar.nodes_of_type(node_type=node))

    data = {}
    
    # get number of nodes and edges
    data["app"] = graph
    data["node_types_counts"] = len(stellar.node_types)
    data["node_types"] = node_types
    data["number_nodes"] = len(stellar.nodes())
    data["number_edges"] = len(stellar.edges())

    if "benign" in app_dir:
        label = 0
    else:
        label = 1

    data["label"] = label
    
    df = pd.DataFrame.from_dict([data])
    
    return df.to_csv(out_csv)
Example No. 3
    def __init__(self, edges_path, labels_path):
        """
        Hard-coded initialization
        """
        fstar = 1
        a = 0.125 # p, q left bound
        b = 4.125 # p, q right bound
        graph, labels = self.read_data(edges_path, labels_path)
        rw = BiasedRandomWalk(StellarGraph.from_networkx(graph))

        super().__init__(fstar, a, b, graph, labels, rw)
Example No. 4
    def preprocessing(self, g, train_node, file_emb_output="./emb/100_900_nede2vec.emb"):

        node_subjects = train_node['values']

        node_subjects = node_subjects.astype(str)
        print(Counter(node_subjects))

        #file_emb_output = "./emb/100_900_nede2vec.emb"
        model = KeyedVectors.load_word2vec_format(file_emb_output)
        node_ids = model.index2word  # index2word/vectors live directly on KeyedVectors (gensim 3.x)
        node_embeddings = model.vectors  # numpy array of embedding vectors, aligned with node_ids
        print("Embedding load success.")

        reindexed_node_embedding = pd.DataFrame(node_embeddings, index=map(int, node_ids))
        g_feature_attr = g.copy()

        G = StellarGraph.from_networkx(
            g_feature_attr, node_features=reindexed_node_embedding, node_type_default="n", edge_type_default="e"
        )
        print(G.info())

        train_subjects, test_subjects = model_selection.train_test_split(
            node_subjects, train_size=160, test_size=None, stratify=node_subjects
        )
        val_subjects, test_subjects = model_selection.train_test_split(
            test_subjects, train_size=20, test_size=None, stratify=test_subjects
        )

        train_subjects.value_counts().to_frame()

        target_encoding = preprocessing.LabelBinarizer()
        # target_encoding = preprocessing.OneHotEncoder()

        train_targets = target_encoding.fit_transform(train_subjects)
        val_targets = target_encoding.transform(val_subjects)
        test_targets = target_encoding.transform(test_subjects)

        generator = FullBatchNodeGenerator(G, method="gcn")
        train_gen = generator.flow(train_subjects.index, train_targets)
        val_gen = generator.flow(val_subjects.index, val_targets)
        test_gen = generator.flow(test_subjects.index, test_targets)

        all_nodes = node_subjects.index
        all_gen = generator.flow(all_nodes)

        return G, train_gen, train_targets, val_gen, val_targets, test_targets, test_gen, all_gen, generator
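A sketch (not in the source) of how the returned generators are typically consumed, following the standard StellarGraph GCN node-classification pattern; layer sizes, dropout, and epoch count are illustrative placeholders.

from tensorflow.keras import Model, layers, losses, optimizers
from stellargraph.layer import GCN

gcn = GCN(layer_sizes=[16, 16], activations=["relu", "relu"], generator=generator, dropout=0.5)
x_inp, x_out = gcn.in_out_tensors()
predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.01),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)
model.fit(train_gen, epochs=50, validation_data=val_gen, verbose=2, shuffle=False)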
Example No. 5
def createGraphFromCounter(dfg, mode='networkx'):
    transitionList = list(dfg)  # the counter's keys: (source, target) activity pairs
    checkActivityList = []
    G = nx.DiGraph()
    for i in transitionList:
        # if i[0] != i[1]:
        if i[0] not in checkActivityList:
            G.add_node(i[0], name=i[0])
            checkActivityList.append(i[0])
        if i[1] not in checkActivityList:
            G.add_node(i[1], name=i[1])
            checkActivityList.append(i[1])
        G.add_edge(i[0], i[1], weight=1)  # unit weight; the counter's frequency is not used
    if mode == 'networkx':
        return G
    else:
        return StellarGraph.from_networkx(G)
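A minimal usage sketch, assuming dfg is a collections.Counter keyed by directly-follows activity pairs (as produced by process-mining tooling); the sample data is invented.

from collections import Counter

dfg = Counter({("register", "review"): 3, ("review", "approve"): 2})
G = createGraphFromCounter(dfg)
print(G.edges(data=True))  # every observed transition appears with weight=1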
Example No. 6
def API_abstraction_vectorized(inFP, outFP, kind, to_return, truename=False):
    """
    abstracts edges and nodes of ONE APP to some level
    
    returns a graph that is abstracted (WILL CHANGE)
    
    inFP --> input file path (should be .gml.bz2)
    outFP --> output directory
    kind --> (str) FAMILY or PACKAGE or CLASS
    to_return --> (str) "NX" for a networkx graph, "SG" for a StellarGraph
    truename --> if True, label the added APK node with the real app name
    """

    # getting the app name
    direc, app_name = utils.dir_and_app(inFP)

    try:
        nx_graph = nx.read_gml(inFP)  # avoid shadowing the networkx module name
    except Exception:
        return inFP + " might be broken!"

    nx_nodes = np.array(nx_graph.nodes(data=True))
    nx_edges = np.array(nx_graph.edges, dtype=object)

    # abstract every node and edge to the requested level
    newnodes = [API_abstraction(kind, node) for node in nx_nodes]
    newedges = [edge_processing(kind, edge) for edge in nx_edges]

    G = nx.MultiDiGraph()
    G.add_nodes_from(newnodes)
    G.add_edges_from(newedges)
    if not truename:
        G = add_apk_node(G, "")
    else:
        G = add_apk_node(G, app_name)
    metapaths = dfs(G, app_name)

    if to_return == "NX":
        return [G, metapaths]
    elif to_return == "SG":
        stellar = StellarGraph.from_networkx(G, node_type_attr="type")
        return [stellar, metapaths]
Example No. 7
X = sio.loadmat('X_vector_Full_.mat')
X = X['X_vec']
#Create Labels
Y = np.append(np.zeros(shape=(3648,1)), np.ones(shape=(2156,1))) #First 3648 entries of A & X are healthy, rest 2156 are depressed

A, X, Y = shuffle(np.reshape(A, (5804,62,62)), np.reshape(X, (5804, 62, 2)), np.reshape(Y, (5804,1,1))) #Shuffle to randomize order and reshape for consistent shuffle across A, X & Y
Y = np.reshape(Y, (5804,))
graph_labels = pd.DataFrame(Y)  #Store labels in a pd DataFrame

# Format input data (A, X) for GCN input as StellarGraph object 
graphs = []
for participant in tqdm(range(len(X))):
  G = nx.from_numpy_matrix(A[participant])  #Create networkx graph object from this participant's adjacency matrix
  for node in range(62): #For each of the 62 nodes
    G.nodes[node]['x'] = X[participant, node, :]  #Add node features to networkx Graph
  graphs.append(StellarGraph.from_networkx(G, node_features="x")) #Create and store StellarGraph objects for training

generator = PaddedGraphGenerator(graphs=graphs)

k = 10  # the number of rows for the output tensor
layer_sizes = [64, 32, 16, 8, 4, 2, 1]  #GCN Layer Size

dgcnn_model = DeepGraphCNN(
    layer_sizes=layer_sizes,
    activations=["relu", "relu", "tanh", "tanh", "tanh","tanh","tanh"],
    k=k,
    bias=True,
    generator=generator,
)

x_inp, x_out = dgcnn_model.in_out_tensors()
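The in/out tensors above are usually completed with the 1-D convolutional readout from the StellarGraph DGCNN demo; a sketch under that assumption, with illustrative filter counts and training settings.

from tensorflow.keras import Model
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten, MaxPool1D
from tensorflow.keras.optimizers import Adam

x = Conv1D(filters=16, kernel_size=sum(layer_sizes), strides=sum(layer_sizes))(x_out)
x = MaxPool1D(pool_size=2)(x)
x = Conv1D(filters=32, kernel_size=5, strides=1)(x)
x = Flatten()(x)
x = Dense(units=128, activation="relu")(x)
x = Dropout(rate=0.5)(x)
predictions = Dense(units=1, activation="sigmoid")(x)  # binary healthy/depressed output

model = Model(inputs=x_inp, outputs=predictions)
model.compile(optimizer=Adam(learning_rate=0.0001), loss="binary_crossentropy", metrics=["acc"])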
Example No. 8
def barbell():
    return StellarGraph.from_networkx(nx.barbell_graph(m1=10, m2=11))
Example No. 9
def petersen_graph() -> StellarGraph:
    nxg = nx.petersen_graph()
    return StellarGraph.from_networkx(nxg, node_features=node_features())
Example No. 10
link_ids_train = link_ids[train_indices]
link_ids_test = link_ids[test_indices]

link_labels_train = link_labels[train_indices]
link_labels_test = link_labels[test_indices]

g_train = g.copy()
edgelist = [(start, end) for start, end in zip(link_ids_test[:,0], link_ids_test[:,1]) ]
g_train.remove_edges_from(edgelist)

g_test = g.copy()
edgelist = [(start, end) for start, end in zip(link_ids_train[:,0], link_ids_train[:,1]) ]
g_test.remove_edges_from(edgelist)

G = StellarGraph.from_networkx(g, node_features="feature")
g_train = StellarGraph.from_networkx(g_train, node_features="feature")
g_test = StellarGraph.from_networkx(g_test, node_features="feature")

print(g_train.info())
print(g_test.info())

##
batch_size = 40
num_samples = [15, 10, 5]

train_gen = GraphSAGELinkGenerator(g_train, batch_size, num_samples)
train_flow = train_gen.flow(link_ids_train, link_labels_train, shuffle=True)

test_gen = GraphSAGELinkGenerator(g_test, batch_size, num_samples)
test_flow = test_gen.flow(link_ids_test, link_labels_test)
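A sketch of the link-prediction model these flows usually feed, per the standard StellarGraph GraphSAGE recipe; the three layer sizes match the three-hop num_samples above, and all hyperparameters are illustrative.

from tensorflow.keras import Model, losses, optimizers
from stellargraph.layer import GraphSAGE, link_classification

graphsage = GraphSAGE(layer_sizes=[32, 32, 32], generator=train_gen, bias=True, dropout=0.3)
x_inp, x_out = graphsage.in_out_tensors()

# combine the two endpoint embeddings into an edge score via inner product
prediction = link_classification(output_dim=1, output_act="sigmoid", edge_embedding_method="ip")(x_out)

model = Model(inputs=x_inp, outputs=prediction)
model.compile(optimizer=optimizers.Adam(learning_rate=1e-3), loss=losses.binary_crossentropy, metrics=["acc"])
model.fit(train_flow, epochs=10, validation_data=test_flow, verbose=2)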
Example No. 11
def barbell():
    graph = nx.barbell_graph(m1=10, m2=11)
    for i, (src, tgt) in enumerate(graph.edges):
        graph[src][tgt]["weight"] = (i + 1) / 5
    return StellarGraph.from_networkx(graph)
Example No. 12
from stellargraph.layer import GraphSAGE
import stellargraph as sg
from tensorflow.keras.models import load_model
from tensorflow.keras import layers, optimizers, losses, metrics, Model, models
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from sklearn import preprocessing, feature_extraction, model_selection
import numpy as np
import tensorflow as tf
import scipy.stats as stats
from src.visualization import visualize as vs
from src.features import build_features as bf

## ########################################### build graph ###############################################

G = StellarGraph.from_networkx(gest, node_features="feature")
print(G.info())

train_subjects, test_subjects = model_selection.train_test_split(targetdf, train_size=0.8, test_size=None)

train_targets = np.array(train_subjects)
test_targets = np.array(test_subjects)
##
def get_dropout(input_tensor, p=0.1, mc=False):
    if mc:
        return Dropout(p)(input_tensor, training=True)
    else:
        return Dropout(p)(input_tensor)
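With mc=True the Dropout layer stays active at inference time (Monte Carlo dropout), so repeated forward passes sample a predictive distribution. A hedged sketch of that use with an invented toy model; x_batch is an assumed input array.

inputs = Input(shape=(32,))
hidden = Dense(64, activation="relu")(inputs)
outputs = Dense(1)(get_dropout(hidden, p=0.1, mc=True))
mc_model = Model(inputs=inputs, outputs=outputs)

# dropout is live in every call, so each prediction differs
preds = np.stack([mc_model.predict(x_batch) for _ in range(30)])
mean, std = preds.mean(axis=0), preds.std(axis=0)  # predictive mean and uncertainty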

## ======================== Graphsage Model building ===========================
Example No. 13
        G.add_edge(node_id_m, node_id_n, weight=edge_dict['r.weight'])

del node_id_m, node_id_n, edge_dict
# check the graph exists: this looks suspicious in that the in and out degree are the same to within 4dp
nx.info(G)

# the fact that "not_found_dict" is not empty means that the two datasets contain different data
# the next steps here are to create a more complete dataset by downloading everything at once from neo4j
len(found_dict)
len(not_found_dict)

# now the build of the Deep Graph Infomax embeddings model
# https://stellargraph.readthedocs.io/en/v1.2.1/demos/embeddings/deep-graph-infomax-embeddings.html
# note that there is an alternative graph called the StellarDiGraph, but no difference in performance was seen
# the lack in difference in performance suggests that the networkx graph is not really taking account of direction
stellar_G = StellarGraph.from_networkx(graph=G, node_features="v")
print(stellar_G.info())

# https://stellargraph.readthedocs.io/en/stable/api.html#stellargraph.mapper.FullBatchNodeGenerator
fullbatch_generator = FullBatchNodeGenerator(G=stellar_G,
                                             sparse=False,
                                             weighted=True,
                                             method='gcn')

# intuition for GNN:
# https://medium.com/analytics-vidhya/getting-the-intuition-of-graph-neural-networks-a30a2c34280d
# understanding GCN:
# https://towardsdatascience.com/understanding-graph-convolutional-networks-for-node-classification-a2bfdb7aba7b
# stellargraph implementation
# https://medium.com/stellargraph/do-i-know-you-flexible-unsupervised-and-semi-supervised-graph-models-with-deep-graph-infomax-96fbfd63ec31  # noqa: E501
# 2-layer GCN model
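Following the linked demo, the 2-layer GCN and the Deep Graph Infomax training step would look roughly like this; layer sizes, learning rate, and epochs are illustrative, not from the source.

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from stellargraph.layer import GCN, DeepGraphInfomax
from stellargraph.mapper import CorruptedGenerator

gcn_model = GCN(layer_sizes=[128, 128], activations=["relu", "relu"], generator=fullbatch_generator)

corrupted_generator = CorruptedGenerator(fullbatch_generator)
train_gen = corrupted_generator.flow(stellar_G.nodes())

infomax = DeepGraphInfomax(gcn_model, corrupted_generator)
x_in, x_out = infomax.in_out_tensors()

model = Model(inputs=x_in, outputs=x_out)
# DGI must use the logits-based sigmoid cross-entropy loss
model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer=Adam(learning_rate=1e-3))
model.fit(train_gen, epochs=100, verbose=1)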
Example No. 14
def node2vec():
    print('Training Node2Vec mode!')

    # initialize results arrays
    total_mse = np.zeros(args.exp_number)

    total_pcc = np.zeros(args.exp_number)
    total_mae = np.zeros(args.exp_number)
    mse_datasets = {}
    std_datasets = {}
    pcc_datasets = {}
    pcc_std_datasets = {}
    mae_datasets = {}
    mae_std_datasets = {}

    t_total = time.time()

    if args.dataset == 'all':
        datasets = [
            'airport', 'collaboration', 'congress', 'forum', 'geom', 'astro'
        ]
    else:
        datasets = [args.dataset]

    for dataset in datasets:
        for exp_number in range(args.exp_number):
            print("%s: experiment number %d" % (dataset, exp_number + 1))

            data = preprocess_dataset.clean_data(dataset)
            if dataset != 'usair':
                data['weights'] = preprocessing.normalize([data['weights']])[0]

            # random split of data
            data_train, data_test = train_test_split(data, test_size=0.2)
            data_train, data_val = train_test_split(data_train, test_size=0.08)

            data = data.reset_index()
            data_train = data_train.reset_index()
            data_val = data_val.reset_index()
            data_test = data_test.reset_index()

            G = preprocess_dataset.create_graph_gcn(dataset, data, data_train)
            val_G = preprocess_dataset.create_graph_gcn(
                dataset, data, data_val)
            test_G = preprocess_dataset.create_graph_gcn(
                dataset, data, data_test)

            nodes_len = len(G.nodes)
            node_ids_to_index = {}
            for i, node_id in enumerate(G.nodes):
                node_ids_to_index[node_id] = i

            train_A = nx.adjacency_matrix(G)
            val_A = nx.adjacency_matrix(val_G)
            test_A = nx.adjacency_matrix(test_G)

            train_labels = torch.FloatTensor(
                data_train['weights'].values).cuda()
            val_labels = torch.FloatTensor(data_val['weights'].values).cuda()
            test_labels = torch.FloatTensor(data_test['weights'].values).cuda()

            train_A = sparse_mx_to_torch_sparse_tensor(train_A).cuda()
            val_A = sparse_mx_to_torch_sparse_tensor(val_A).cuda()
            test_A = sparse_mx_to_torch_sparse_tensor(test_A).cuda()

            G = sg.from_networkx(G)
            rw = BiasedRandomWalk(G)
            weighted_walks = rw.run(
                nodes=G.nodes(),  # root nodes
                length=args.length,  # maximum length of a random walk
                n=args.n_size,  # number of random walks per root node
                p=args.p,  # defines (unnormalised) probability, 1/p, of returning to source node
                q=args.q,  # defines (unnormalised) probability, 1/q, for moving away from source node
                weighted=True,  # for weighted random walks
                seed=42,  # random seed fixed for reproducibility
            )
            print("Number of random walks: {}".format(len(weighted_walks)))
            weighted_model = Word2Vec(weighted_walks,
                                      vector_size=args.vector_size,
                                      window=5,
                                      min_count=0,
                                      sg=1,
                                      workers=4)
            weights = torch.FloatTensor(weighted_model.wv.vectors).cuda()

            ########################################

            train_n1 = torch.tensor(data_train['A'].values).cuda()
            train_n2 = torch.tensor(data_train['B'].values).cuda()

            train_n1_indices = torch.ones(train_n1.shape[0])
            for i, value in enumerate(train_n1):
                train_n1_indices[i] = node_ids_to_index[value.item()]
            train_n1_indices = train_n1_indices.cuda().long()

            train_n2_indices = torch.ones(train_n1.shape[0])
            for i, value in enumerate(train_n2):
                train_n2_indices[i] = node_ids_to_index[value.item()]
            train_n2_indices = train_n2_indices.cuda().long()

            ########################################

            val_n1 = torch.tensor(data_val['A'].values).cuda()
            val_n2 = torch.tensor(data_val['B'].values).cuda()

            val_n1_indices = torch.ones(val_n1.shape[0])
            for i, value in enumerate(val_n1):
                val_n1_indices[i] = node_ids_to_index[value.item()]
            val_n1_indices = val_n1_indices.cuda().long()

            val_n2_indices = torch.ones(val_n1.shape[0])
            for i, value in enumerate(val_n2):
                val_n2_indices[i] = node_ids_to_index[value.item()]
            val_n2_indices = val_n2_indices.cuda().long()

            ########################################

            test_n1 = torch.tensor(data_test['A'].values).cuda()
            test_n2 = torch.tensor(data_test['B'].values).cuda()

            test_n1_indices = torch.ones(test_n1.shape[0])
            for i, value in enumerate(test_n1):
                test_n1_indices[i] = node_ids_to_index[value.item()]
            test_n1_indices = test_n1_indices.cuda().long()

            test_n2_indices = torch.ones(test_n1.shape[0])
            for i, value in enumerate(test_n2):
                test_n2_indices[i] = node_ids_to_index[value.item()]
            test_n2_indices = test_n2_indices.cuda().long()

            ########################################

            model = Node2Vec(weights, 0.5)
            optimizer = optim.Adam(model.parameters(), lr=args.lr)

            model.train()
            model = model.to(args.device)

            # train
            for epoch in range(args.epochs):
                t = time.time()
                model.train()
                optimizer.zero_grad()

                output = model(train_n1_indices, train_n2_indices)

                loss_train = F.mse_loss(output, train_labels)
                loss_train.backward()
                optimizer.step()

                # validation
                model.eval()
                output = model(val_n1_indices, val_n2_indices)
                loss_val = F.mse_loss(output, val_labels)

                if args.verbose:
                    print('Epoch: {:04d}'.format(epoch + 1),
                          'loss_train: {:.4f}'.format(loss_train.item()),
                          'loss_val: {:.4f}'.format(loss_val.item()),
                          'time: {:.4f}s'.format(time.time() - t))

            # test
            model.eval()
            with torch.no_grad():
                output = model(test_n1_indices, test_n2_indices)

                loss_test = F.mse_loss(torch.flatten(output), test_labels)
                pcc_test = pearson_correlation(test_labels, output)
                mae_test = F.l1_loss(output, test_labels)
                print("Test set results:",
                      "loss= {:.10f}".format(loss_test.item()),
                      "pcc= {:.10f}".format(pcc_test),
                      "mae= {:.10f}".format(mae_test.item()))

                total_mse[exp_number] = loss_test
                total_pcc[exp_number] = pcc_test
                total_mae[exp_number] = mae_test

        # results
        mse_datasets[dataset] = np.mean(total_mse)
        std_datasets[dataset] = np.std(total_mse)
        total_mse = np.zeros(args.exp_number)

        pcc_datasets[dataset] = np.mean(total_pcc[~np.isnan(total_pcc)])
        pcc_std_datasets[dataset] = np.std(total_pcc[~np.isnan(total_pcc)])
        total_pcc = np.zeros(args.exp_number)

        mae_datasets[dataset] = np.mean(total_mae)
        mae_std_datasets[dataset] = np.std(total_mae)
        total_mae = np.zeros(args.exp_number)

    for dataset in datasets:
        print("MSE {}: {:,f}".format(dataset, mse_datasets[dataset]))
        print("MSE_STD {}: {:,f}".format(dataset, std_datasets[dataset]))

        print("PCC {}: {:,f}".format(dataset, pcc_datasets[dataset]))
        print("PCC_STD {}: {:,f}".format(dataset, pcc_std_datasets[dataset]))

        print("MAE {}: {:,f}".format(dataset, mae_datasets[dataset]))
        print("MAE_STD {}: {:,f}".format(dataset, mae_std_datasets[dataset]))

    print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

    exit()
Example No. 15
tf.compat.v1.keras.backend.set_session(session)

from tensorflow.keras import layers, optimizers, losses, metrics, Model, models
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from sklearn import preprocessing, feature_extraction, model_selection
import matplotlib
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report

## = ########################################### build graph ###############################################
#%% ############################################################################################################

G = StellarGraph.from_networkx(gobsnoise, node_features="feature")
print(G.info())

train_subjects, test_subjects = model_selection.train_test_split(
    targetdf, train_size=0.8, test_size=None)

# temp_train_subjects = np.reshape(np.array(train_subjects), (train_subjects.shape[0],1))
# temp_test_subjects = np.reshape(np.array(test_subjects), (test_subjects.shape[0],1))
# train_targets = target_encoding.fit_transform(temp_train_subjects).toarray()
# test_targets = target_encoding.transform(temp_test_subjects).toarray()

train_targets = np.array(train_subjects)
test_targets = np.array(test_subjects)

## #################################### Graphsage Model building ###########################################
#%% ############################################################################################################
Example No. 16
def applyLogisticRegression(save_dir):

    print('Load the data from files')

    # examples_train comes from the sampled-edges .csv file below
    #examples_train_df = pd.read_csv("/home/ubuntu/ssl/COVID_Data/Updated Paper Graph_node2vec/Train_Graphs/Graph_train_edges/graph_train_edges_sampled_1.csv")
    examples_train_df = pd.read_csv(
        "../../graphs/graph_train_edges_sampled_5.csv")
    examples_train_df = examples_train_df.replace(np.nan, 'nan', regex=True)
    labels_train = list(examples_train_df['labels'])
    examples_train = [[i, j] for i, j in zip(list(examples_train_df['node1']),
                                             list(examples_train_df['node2']))]

    #----------------------------------------------------------------------------------------

    #examples_model_selection_df = pd.read_csv("/home/ubuntu/ssl/COVID_Data/Updated Paper Graph_node2vec/Validation_Graphs/Graph_validation_edges/graph_val_edges_sampled_1.csv")
    examples_model_selection_df = pd.read_csv(
        "../../graphs/graph_val_edges_sampled_5.csv")

    examples_model_selection_df = examples_model_selection_df.replace(
        np.nan, 'nan', regex=True)
    labels_model_selection = list(examples_model_selection_df['labels'])
    examples_model_selection = [[
        i, j
    ] for i, j in zip(list(examples_model_selection_df['node1']),
                      list(examples_model_selection_df['node2']))]
    #----------------------------------------------------------------------------------------

    #examples_test_df = pd.read_csv("/home/ubuntu/ssl/COVID_Data/Updated Paper Graph_node2vec/Test_Graphs/Graph_test_edges/graph_test_edges_sampled_1.csv")
    examples_test_df = pd.read_csv(
        "../../graphs/graph_test_edges_sampled_5.csv")

    examples_test_df = examples_test_df.replace(np.nan, 'nan', regex=True)
    labels_test = list(examples_test_df['labels'])
    examples_test = [[i, j] for i, j in zip(list(examples_test_df['node1']),
                                            list(examples_test_df['node2']))]
    #----------------------------------------------------------------------------------------

    #graph_to_embed = ("/home/ubuntu/ssl/COVID_Data/Updated Paper Graph_node2vec/Train_Graphs/graph_sampled_1.gml.gz")
    graph_to_embed = ("../../graphs/graph_sampled_5.gml.gz")
    g = nx.read_gml(graph_to_embed)

    #Assign networkx data utilizing Stellargraph
    ntxStgrph = StellarGraph.from_networkx(g)

    print(
        '\nGet the best fitting random walk parameters based on pre-selected others'
    )
    p, q, best_result, embedding_train = findBestRandWalkParams(ntxStgrph)

    print('Best p parameter ' + str(p))
    print('Best q parameter ' + str(q))
    print('Best binary operator ' +
          str(best_result['binary_operator'].__name__))

    print('\nEmbedding is ready, collect performance evaluation results')
    test_score_auc, test_score_acc = evaluate_link_prediction_model(
        best_result["classifier"], examples_test, labels_test, embedding_train,
        best_result["binary_operator"],
        save_dir + 'predicted_test_result_logit_nogrid_split5_')
    print(
        f"ROC AUC score on test set using '{best_result['binary_operator'].__name__}': {test_score_auc}"
    )
    print(
        f"Accuracy score on test set using '{best_result['binary_operator'].__name__}': {test_score_acc}"
    )
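findBestRandWalkParams and evaluate_link_prediction_model are defined elsewhere; in the usual node2vec link-prediction recipe the binary operator turns two endpoint embeddings into one edge feature. A sketch of that convention (all names and operators here are illustrative):

import numpy as np

def operator_hadamard(u, v):
    return u * v

def operator_l2(u, v):
    return (u - v) ** 2

def link_examples_to_features(link_examples, transform_node, binary_operator):
    # one feature vector per candidate edge
    return [binary_operator(transform_node(src), transform_node(dst))
            for src, dst in link_examples]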
Example No. 17
# before running the embedding, a check is done to see if the file is completed
completed_file_path = scratch_folder + "/" + use_model_type + "_" + uni_name + ".csv"

# load path of the university graph
load_path = file_folder + "/" + uni_name + ".graphml"
# save path of the embedded data
save_path = project_folder + "/" + use_model_type + "_" + uni_name + ".csv"

G_graphml = nx.read_graphml(load_path)
# get the node features as a dataframe, these will then be added to the stellar graph.
# This seems to work better than trying to put them in directly
nodefeatures = pd.DataFrame.from_dict(dict(G_graphml.nodes(data=True)),
                                      orient='index')
# print(nodefeatures)
# Convert the networkx graph to a Stellargraph
G = StellarGraph.from_networkx(G_graphml, node_features=nodefeatures)

# We create and train our DeepGraphInfomax model; see the StellarGraph docs. Note that the loss used here
# must always be tf.nn.sigmoid_cross_entropy_with_logits.

fullbatch_generator = FullBatchNodeGenerator(G, sparse=False)
gcn_model = GCN(layer_sizes=[2],
                activations=["relu"],
                generator=fullbatch_generator)

corrupted_generator = CorruptedGenerator(fullbatch_generator)
gen = corrupted_generator.flow(G.nodes())

infomax = DeepGraphInfomax(gcn_model, corrupted_generator)
x_in, x_out = infomax.in_out_tensors()
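As the comment above requires, the DGI model is compiled with the logits-based sigmoid cross-entropy; a sketch of the training step with illustrative settings.

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam

model = Model(inputs=x_in, outputs=x_out)
model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer=Adam(learning_rate=1e-3))
model.fit(gen, epochs=100, verbose=1)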
Example No. 18
        plt.show()

    if args.sampling_method == "local":
        print_distance_probabilities(
            edge_splitter_train.negative_edge_node_distances)

    # this is so that Node2Vec works because it expects Graph not MultiGraph type
    g_test = nx.Graph(g_test)
    g_train = nx.Graph(g_train)

    if args.hin:
        # prepare the metapaths if given in the command line
        metapaths = get_metapaths_from_str(args.metapaths)

        train_heterogeneous_graph(
            g_train=StellarGraph.from_networkx(g_train),
            g_test=StellarGraph.from_networkx(g_test),
            output_node_features=args.output_node_features,
            edge_data_ids_train=edge_data_ids_train,
            edge_data_labels_train=edge_data_labels_train,
            edge_data_ids_test=edge_data_ids_test,
            edge_data_labels_test=edge_data_labels_test,
            metapaths=metapaths,
            parameters=parameters,
        )
    else:
        train_homogeneous_graph(
            g_train=g_train,
            g_test=g_test,
            output_node_features=args.output_node_features,
            edge_data_ids_train=edge_data_ids_train,
Example No. 19
def wrapper(apk, target, metapathsFP, walksFP):
    """
    wrapper to build features for doc2vec, metapath2vec. 
    
    apk --> filepath to the apk
    target --> filepath to store common graph txts (for metapath2vec)
    metapathsFP --> filepath to store metapath2vec txts (for metapath2vec)
    walksFP --> filepath to store metapath2vec walks txt (for doc2vec)
    
    """
    if ".gml.bz2" in apk:
        direc, appname = utils.dir_and_app(apk)
    else:
        appname = apk

    document_out = os.path.join(walksFP, (appname + "m2v_walks.txt"))
    metapath_out = os.path.join(metapathsFP, (appname + "m2v_metapaths.txt"))
    graph_out = os.path.join(target, (appname + "graph.txt"))

    if os.path.exists(document_out) and os.path.exists(metapath_out) and os.path.exists(graph_out):
        print("the app: ", apk, " is already done!")
    else:
        try:
            networkx, metapaths = API_abstraction_vectorized(
                apk, "", "CLASS", "NX", True)
            stellar = StellarGraph.from_networkx(networkx,
                                                 node_type_attr="type")
            ################## COMMON GRAPH INFORMATION ##################
            if not os.path.exists(graph_out):
                # the with-statement closes the file; no explicit close needed
                with open(graph_out, 'a') as file:
                    for edge in np.array(networkx.edges):
                        node1, node2, weight = edge
                        type1 = networkx.nodes[node1]["type"]
                        type2 = networkx.nodes[node2]["type"]

                        # columns are: ["node1", "node2", "weight", "type1", "type2"]
                        row = " ".join([node1, node2, weight, type1, type2]) + "\n"

                        file.write(row)
            ##############################################################

            ################## DOC2VEC AND METAPATH2VEC INFORMATION ##################
            try:
                # OUTPUT WALKS OF ONE APP
                if not os.path.exists(document_out):
                    document = metapath2vec(stellar, 500, metapaths)
                    np.savetxt(document_out, np.hstack(document), fmt="%s")

                # OUTPUT METAPATHS OF ONE APP
                if not os.path.exists(metapath_out):
                    joined = ["->".join(lst) for lst in metapaths]
                    np.savetxt(metapath_out, joined, fmt="%s")
                print("the app: ", apk, " has finished!")
            except Exception:
                print("The app: ", apk, " seems to be broken!")

        except Exception:
            print("The app: ", apk, " seems to be broken!")
Example No. 20
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import matplotlib.pyplot as plt
# A dataset with 80 samples, each graph is
# of size [10, 20]
#dataset = MiniGCDataset(80, 10, 20)
#graph, label = dataset[0]
fig, ax = plt.subplots()
dataset = []
count1 = 0
count2 = 0
for filename in glob.glob("ant_graphs/*.pt"):
    graph = torch.load(filename)
    s = StellarGraph.from_networkx(graph[0])
    count1 += 1
    dataset.append(s)
for filename in glob.glob("ago_graphs/*_aagraph.gml"):
    graph = torch.load(filename)
    s = StellarGraph.from_networkx(graph[0])
    count2 += 1
    dataset.append(s)
labels = []
for i in range(0, 161):
    labels.append(-1)
for i in range(0, 396):
    labels.append(1)

graph_labels = pd.DataFrame(labels)
#nx.draw(graph, ax=ax)
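A sketch (not from the source) of feeding the collected graphs and labels into a padded generator for graph classification; split proportions and batch size are illustrative.

from sklearn import model_selection
from stellargraph.mapper import PaddedGraphGenerator

generator = PaddedGraphGenerator(graphs=dataset)

train_idx, test_idx = model_selection.train_test_split(
    graph_labels.index, train_size=0.8, stratify=graph_labels[0])
train_flow = generator.flow(train_idx, targets=graph_labels.loc[train_idx].values, batch_size=16)
test_flow = generator.flow(test_idx, targets=graph_labels.loc[test_idx].values, batch_size=16)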
Example No. 21
## The motivation was to utilize the embedded nodes for further ML processing. However, the embeddings were of poor quality.
## Not because of the algorithm, but the graph itself was noisy.

import networkx as nx
from stellargraph.mapper import GraphWaveGenerator
from stellargraph import StellarGraph
from sklearn.decomposition import PCA
import numpy as np
from matplotlib import pyplot as plt
from scipy.sparse.linalg import eigs
import tensorflow as tf
from tensorflow.keras import backend as K
from sklearn.manifold import TSNE

G = nx.read_edgelist("/home/spirpinias/Desktop/MEGENAgraph")
G = StellarGraph.from_networkx(G)
sample_points = np.linspace(0, 100, 50).astype(np.float32)
#degree20 and scales 5,10

degree = 10
scales = [5, 10]

generator = GraphWaveGenerator(G, scales=scales, degree=degree)

embeddings_dataset = generator.flow(node_ids=G.nodes(),
                                    sample_points=sample_points,
                                    batch_size=10,
                                    repeat=False)

embeddings = [x.numpy() for x in embeddings_dataset]
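PCA and TSNE are imported above but unused in the excerpt; a sketch of the dimensionality-reduction step they were presumably intended for, with illustrative settings.

embedding_matrix = np.vstack(embeddings)  # one GraphWave vector per node

coords_pca = PCA(n_components=2).fit_transform(embedding_matrix)
coords_tsne = TSNE(n_components=2).fit_transform(embedding_matrix)

plt.scatter(coords_pca[:, 0], coords_pca[:, 1], s=5)
plt.title("GraphWave embeddings (PCA projection)")
plt.show()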
Example No. 22
    def get_stellar_graph(self):
        return StellarGraph.from_networkx(self.g, node_features='feature')
Example No. 23
def get_markov(inFP, outFP, kind):
    """
    obtains the markov chain for one app
    
    inFP --> input file path (should be .gml.bz2)
    outFP --> output directory
    kind --> (str) FAMILY or PACKAGE
    
    """

    direc, app_name = utils.dir_and_app(inFP)
    outputfp = os.path.join(outFP, (app_name + "_" + kind + ".txt"))
    if os.path.exists(outputfp):
        print("app ", inFP, " is already done!")

    else:
        try:
            networkx = nx.read_gml(inFP)
        except:
            return inFP + " might be broken!"

        nx_nodes = np.array(networkx.nodes())
        nx_edges = np.array(networkx.edges, dtype=object)

        # convert to package/family mode
        vfunc = np.vectorize(get_package_family)
        newnodes = vfunc(kind, nx_nodes)
        new_edges = []
        for edge in nx_edges:
            new_edges.append(edge_processing(edge, kind))
        G = nx.MultiDiGraph()
        G.add_nodes_from(newnodes)
        G.add_edges_from(new_edges)
        stellar = StellarGraph.from_networkx(G)

        # step2: markov chain
        ## Set of possible states of the Markov chain is denoted as S
        ## If Sj and Sk are two connected states, Pjk denotes P(transition from Sj to Sk)
        ## Pjk is the number of occurrences Ojk, i.e. edges from j to k, divided by the number of all occurrences
        ## Pjk = # of Edge(j, k) / # total edges
        if kind == "PACKAGE":
            possible_packages = get_possible_packages()
            S = ["/".join(item).strip()
                 for item in possible_packages] + ["self_defined"]
            possible_edges = get_possible_edges()
        elif kind == "FAMILY":
            possible_packages = POSSIBLE_FAMILIES
            possible_edges = get_possible_family_edges()
            S = possible_packages + ["self_defined"]
        total_edges = stellar.number_of_edges()
        markov = []
        counts_nd_stuff = pd.Series(stellar.edges()).value_counts()

        for j in S:
            for k in S:  ## we might have self calling loops
                edge = (j, k)
                try:
                    Pjk = counts_nd_stuff[edge] / total_edges
                    markov.append(Pjk)
                except KeyError:  # pandas raises KeyError, not ValueError, for a missing edge
                    markov.append(0)

        # build output fp and save
        if (round(sum(markov)) == 1) & (not os.path.exists(outputfp)):
            try:
                np.savetxt(outputfp, markov, fmt="%s")
                print("the app: ", inFP, " is done!", "mode: ", kind)
                return (inFP + " IS FINISHED!")
            except Exception:
                print("the app: ", inFP, " encountered errors!")
Example No. 24
# before running the embedding, a check is done to see if the file is completed
completed_file_path = scratch_folder + "/" + use_model_type + "_" + uni_name + ".csv"

# load path of the university graph
load_path = file_folder + "/" + uni_name + ".graphml"
# save path of the embedded data
save_path = project_folder + "/" + use_model_type + "_" + uni_name + ".csv"

G_graphml = nx.read_graphml(load_path)
# get the node features as a dataframe, these will then be added to the stellar graph.
# This seems to work better than trying to put them in directly
# nodefeatures = pd.DataFrame.from_dict(dict(G_graphml.nodes(data=True)), orient='index')
# print(nodefeatures)
# Convert the networkx graph to a Stellargraph
G = StellarGraph.from_networkx(G_graphml)

rw = BiasedRandomWalk(G)

walks = rw.run(
    nodes=list(G.nodes()),  # root nodes
    length=30,  # maximum length of a random walk
    n=100,  # number of random walks per root node
    p=0.5,  # defines (unnormalised) probability, 1/p, of returning to source node
    q=2.0,  # defines (unnormalised) probability, 1/q, for moving away from source node
)
print("Number of random walks: {}".format(len(walks)))

str_walks = [[str(n) for n in walk] for walk in walks]
model = Word2Vec(str_walks,
                 size=dims,