Exemplo n.º 1
0
def compare_node_embedding(vary_q=True):
    """Plot how one node's embedding changes while ``p`` or ``q`` is swept.

    A node is drawn at random from a fixed benign graph. For every offset in
    the module-level ``ran``, the node2vec mapping is recomputed with the
    offset added to ``q`` (``vary_q=True``) or to ``p`` (``vary_q=False``),
    and the node's vector becomes one heatmap row. The figure is saved in the
    working directory and then cleared.

    Args:
        vary_q: Sweep ``q`` when true, otherwise sweep ``p``.
    """
    base = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)

    # The graph file is located relative to two levels above the cwd.
    project_root = Path(os.getcwd()).parent.parent
    dot_path = str(project_root) + os.sep + 'data/graphs/benign/3abfa08b4e1de7195c8e9fe52796a37f9a275cb47f6d0fc904eed172061cd56a.apk.top.dot'
    G = Graph(dot_file=dot_path, config=default_config)

    # Pick the node whose embedding will be tracked across the sweep.
    nodes = list(G.nodes)
    target = nodes[np.random.randint(len(nodes))]

    rows = []
    for offset in ran:
        if vary_q:
            p_val, q_val = base.p, base.q + offset
        else:
            p_val, q_val = base.p + offset, base.q

        swept = parSet(dim=base.dim,
                       walk=base.walk,
                       num_walk=base.num_walk,
                       p=p_val,
                       q=q_val)
        rows.append(node2vec_mapping(FILE, G, swept)[target])

    sns.set()
    if vary_q:
        tick_labels = np.array(base.q + np.array(ran))
    else:
        tick_labels = [round(offset + base.p, 2) for offset in ran]

    heatmap = sns.heatmap(np.array(rows), yticklabels=tick_labels)
    # NOTE(review): this x label is replaced by 'dimension' a few lines below.
    heatmap.set(xlabel=base.__str__())
    plt.title('Different node embedding on the same node')
    plt.ylabel('q' if vary_q else 'p')
    plt.xlabel('dimension')

    fig = heatmap.get_figure()
    prefix = 'q: ' if vary_q else 'p: '
    fig.savefig(prefix + base.__str__() + '.png')
    fig.clf()
Exemplo n.º 2
0
def main():
    """Record how far several nodes are from node '1' in embedding space.

    Calls ``fabricate()`` and ``to_vector(par)``, reads the result with
    ``read_p()``, and writes one ``candidate: distance`` line per candidate to
    ``distance_test/result.txt``. The distance is the norm of the difference
    between the vector stored under ``'1'`` and the candidate's vector.
    """
    # Arbitrary parameter set. An earlier run used
    # dim=250, walk=15, num_walk=30, p=5.0, q=0.05.
    par = parSet(dim=250, walk=15, num_walk=100, p=0.5, q=0.8)

    fabricate()
    to_vector(par)
    d = read_p()

    candidates = [2, 5, 10, 15, 20]
    res = [
        np.linalg.norm(np.array(d['1']) - np.array(d[str(cand)]))
        for cand in candidates
    ]

    with open('distance_test/result.txt', 'w') as filehandle:
        for cand, dis in zip(candidates, res):
            filehandle.write('%s: %s\n' % (cand, dis))
Exemplo n.º 3
0
def grid_search():
    """Sweep the embedding dimension and plot the resulting accuracy.

    For each dimension in ``dimSet`` a parameter set is built with that
    dimension (walk length, number of walks, ``p`` and ``q`` stay fixed) and
    passed to ``test`` together with the labels from ``true_val``. ``test`` is
    expected to return a ``(mean, std)`` pair. Two figures are shown:
    dimension vs. mean accuracy and dimension vs. standard deviation. Both
    lists are also printed.
    """
    t = true_val(src='metadata', merge=True)

    dimSet = [5, 10, 25, 50, 70, 100, 128, 200, 250]
    mean = []
    std = []
    for dim in dimSet:
        # Use the swept dimension; a constant here would evaluate the same
        # configuration for every point on the x axis.
        par = parSet(dim=dim, walk=15, num_walk=30, p=0.2, q=0.5)
        mean_t, std_t = test(par, t, src="metadata")
        mean.append(mean_t)
        std.append(std_t)

    plt.figure(1)
    plt.plot(dimSet, mean)
    plt.xlabel('dimension')
    plt.ylabel('accuracy')
    plt.title('dimension vs accuracy')
    plt.show()

    plt.figure(2)
    plt.plot(dimSet, std)
    plt.xlabel('dimension')
    plt.ylabel('standard deviation of accuracy')
    plt.title('dimension vs standard deviation of accuracy')
    plt.show()

    print(mean)
    print(std)
Exemplo n.º 4
0
def compare_graph(vary_q=True):
    """Run ``compare`` on the default parameter set.

    Args:
        vary_q: Forwarded unchanged to ``compare``.
    """
    defaults = dict(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)
    compare(parSet(**defaults), vary_q=vary_q)
Exemplo n.º 5
0
def dataset_test_binary(src='metadata',
                        fn=tru_bin,
                        cv=10,
                        name='Binary',
                        ran=(50, 100, 150, 200, 250, 314)):
    """Compare cross-validated accuracy of the new and previous features.

    Runs ``main`` and ``prev_method`` on ``src`` and then loads
    ``final_result.pickle`` (new) and ``final_result_prev.pickle`` (previous).
    Presumably those two calls produce the pickles; confirm against their
    definitions. For every dataset size in ``ran``, 100 random subsets are
    drawn. A random forest is cross-validated on each representation, and the
    mean accuracy over the 100 draws is plotted against the dataset size.

    Args:
        src: Data source name forwarded to ``main``, ``prev_method`` and ``fn``.
        fn: Callable returning the label mapping; called as
            ``fn(src=src, merge=False)``.
        cv: Number of cross-validation folds.
        name: Prefix used in the plot title.
        ran: Dataset sizes to evaluate. The default is an immutable tuple so
            it cannot be altered between calls.
    """
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)
    main(par, src=src)
    prev_method(src=src)
    t = fn(src=src, merge=False)
    lib_prev = reading_lib('final_result_prev.pickle')
    lib_new = reading_lib('final_result.pickle')

    # Drop samples that have no label. The keys come from lib_prev, so lib_new
    # is cleaned with pop() to tolerate a key it never contained.
    diff = list(set(lib_prev.keys()) - set(t.keys()))
    for d in diff:
        del lib_prev[d]
        lib_new.pop(d, None)

    new = []
    prev = []
    for ran_1 in ran:
        temp1 = []
        temp2 = []
        for _ in range(100):
            selected_vecs, selected_prev, selected_tru = draw(
                size=ran_1, lib_new=lib_new, lib_prev=lib_prev, trueval=t)
            selected_vecs, selected_prev, selected_tru_w = process_data(
                selected_vecs, selected_tru, selected_prev)

            # Identical classifier settings for both feature sets.
            clf = RandomForestClassifier(n_estimators=100,
                                         max_depth=50,
                                         random_state=0)
            scores_new = cross_val_score(clf,
                                         selected_vecs,
                                         selected_tru_w,
                                         cv=cv)

            clf_2 = RandomForestClassifier(n_estimators=100,
                                           max_depth=50,
                                           random_state=0)
            scores_prev = cross_val_score(clf_2,
                                          selected_prev,
                                          selected_tru_w,
                                          cv=cv)
            temp1.append(scores_new.mean())
            temp2.append(scores_prev.mean())

        # Average over the repeated draws for this dataset size.
        new.append(sum(temp1) / len(temp1))
        prev.append(sum(temp2) / len(temp2))

    plt.plot(ran, new, '-g', label='new method')
    plt.plot(ran, prev, '-b', label='previous method')
    plt.legend()
    plt.xlabel("size of dataset")
    plt.ylabel(str(cv) + "-fold cross validation accuracy")
    plt.title(name + " Classification")
    plt.show()
Exemplo n.º 6
0
def compare(src='data/graphs'):
    """Evaluate the new and previous methods and write the statistics to disk.

    Loads the labels with ``true_val``, runs ``main`` and ``prev_method`` on
    ``src``, evaluates ``final_result.pickle`` and
    ``final_result_prev.pickle`` against the labels, and writes the four
    resulting numbers to ``compare_result.txt``.

    Args:
        src: Data source forwarded to ``true_val``, ``main`` and
            ``prev_method``.
    """
    # An arbitrary parameter set that is known to work well.
    settings = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)
    labels = true_val(src=src)

    main(settings, src=src)
    prev_method(src=src)
    new_mean, new_std = evaluate('final_result.pickle', labels)
    prev_mean, prev_std = evaluate('final_result_prev.pickle', labels)

    with open('compare_result.txt', 'w') as out:
        out.write(
            "new_mean: {}, new_std: {}, prev_mean: {}, prev_std: {}".format(
                new_mean, new_std, prev_mean, prev_std))
Exemplo n.º 7
0
def main():
    """Record how far nodes 1-4 are from node '0' in embedding space.

    Calls ``fabricate_adjacent()`` and ``to_vector(par)``, reads the result
    with ``read_p()``, and writes one ``candidate: distance`` line per
    candidate to ``distance_test/result_2.txt``.
    """
    # Arbitrary parameter set.
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)
    fabricate_adjacent()
    to_vector(par)
    d = read_p()

    candidates = [1, 2, 3, 4]
    res = [
        np.linalg.norm(np.array(d['0']) - np.array(d[str(cand)]))
        for cand in candidates
    ]

    with open('distance_test/result_2.txt', 'w') as filehandle:
        for cand, dis in zip(candidates, res):
            filehandle.write('%s: %s\n' % (cand, dis))
Exemplo n.º 8
0
def adj_distribution():
    """Plot a histogram of the node '0' to node '1' distance over 10000 runs.

    Each run regenerates ``distance_test/adj_dist.edgelist``, embeds it into
    ``distance_test/adj_dist.emb``, and records the norm of the difference
    between the vectors stored under ``'0'`` and ``'1'``. The number of
    samples is printed and a 20-bin histogram is shown.
    """
    edgelist_path = "distance_test/adj_dist.edgelist"
    emb_path = "distance_test/adj_dist.emb"
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)

    res = []
    for _ in range(10000):
        fabricate_adjacent(edgelist_path)
        to_vector(par, edgelist=edgelist_path, output=emb_path)
        d = read_p(emb_path)
        res.append(np.linalg.norm(np.array(d['0']) - np.array(d['1'])))
    print(len(res))

    # The histogram's return values are not used further.
    _counts, _edges, _patches = plt.hist(res, 20, facecolor='blue', alpha=0.5)
    plt.xlabel('distance')
    plt.ylabel('count')
    plt.title("adjacent distance distribution")
    plt.show()
Exemplo n.º 9
0
def compare(params, vary_q=True):
    """Plot how a graph's featurization changes while ``p`` or ``q`` is swept.

    For every offset in the module-level ``ran``, a parameter set is built
    from ``params`` with the offset added to ``q`` (``vary_q=True``) or to
    ``p`` (``vary_q=False``). The library is regenerated with ``lib_gen`` and
    the distance vector of graph ``FILE`` becomes one heatmap row. The figure
    is saved in the working directory and then cleared.

    Args:
        params: Base parameter set providing ``dim``, ``walk``, ``num_walk``,
            ``p`` and ``q``.
        vary_q: Sweep ``q`` when true, otherwise sweep ``p``.
    """
    res = []
    for ran_1 in ran:
        if vary_q:
            q = params.q + ran_1
            p = params.p
        else:
            q = params.q
            p = params.p + ran_1
        par = parSet(
            dim=params.dim,
            walk=params.walk,
            num_walk=params.num_walk,
            q=q,
            p=p
        )

        # the number of node distances is 2485 in this dataset
        d, node_lib, edge_lib = lib_gen(par)
        g = d[FILE]
        vec3 = g.distance(node_lib)
        # Entries equal to 2.0 are discarded.
        # NOTE(review): presumably a placeholder value; confirm against
        # distance().
        vec = [x for x in vec3 if x != 2.0]
        res.append(vec)
        # 71 nodes

    sns.set()
    # Label each row with the value of the parameter that was swept, so the
    # ticks agree with the axis title set below.
    swept_base = params.q if vary_q else params.p
    ylabel = np.array(ran) + swept_base
    pl = sns.heatmap(np.array(res), yticklabels=ylabel, xticklabels=False)
    fig = pl.get_figure()
    plt.xlabel('dimension')
    plt.title('Different featurization on the same graph')
    # The file name uses the parameter set of the last sweep step.
    if vary_q:
        plt.ylabel('q')
        fig.savefig('q: ' + par.__str__() + '.png')
    else:
        plt.ylabel('p')
        fig.savefig('p: ' + par.__str__() + '.png')
    fig.clf()
Exemplo n.º 10
0
def adjacent_test():
    """Plot distances from node '0' to nodes 1-4 over 50 repeated trials.

    Each trial calls ``fabricate_adjacent()``, embeds
    ``distance_test/adjacent.edgelist`` into ``distance_test/adjacent.emb``,
    and records the norm of the difference between node ``'0'`` and each
    candidate. The heatmap has one row per candidate and one column per trial,
    and is saved to ``distance_test/adjacent_graph.png``.
    """
    # Arbitrary parameter set.
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)

    candidates = [1, 2, 3, 4]
    n_trials = 50

    distances = np.zeros((n_trials, len(candidates)))
    for trial in range(n_trials):
        fabricate_adjacent()
        to_vector(par,
                  edgelist="distance_test/adjacent.edgelist",
                  output="distance_test/adjacent.emb")
        d = read_p("distance_test/adjacent.emb")
        for col, cand in enumerate(candidates):
            gap = np.array(d['0']) - np.array(d[str(cand)])
            distances[trial, col] = np.linalg.norm(gap)

    sns.set()
    # Transpose so candidates run along the y axis and trials along the x axis.
    pl = sns.heatmap(distances.T, yticklabels=candidates)
    plt.title("Distance between adjacent nodes under same setting")
    plt.ylabel('Distance from node 0')
    plt.xlabel('trials')
    pl.get_figure().savefig("distance_test/adjacent_graph.png")
Exemplo n.º 11
0
def distance_test():
    """Plot distances from node '0' to five candidates over 50 trials.

    Each trial calls ``fabricate_distance()``, embeds
    ``distance_test/distance.edgelist`` into ``distance_test/distance.emb``,
    and records the norm of the difference between node ``'0'`` and each
    candidate. The heatmap has one row per candidate and one column per trial,
    and is saved to ``distance_test/distance_graph.png``.
    """
    par = parSet(dim=250, walk=15, num_walk=100, p=0.5, q=0.8)
    candidates = [1, 5, 10, 15, 20]
    n_trials = 50

    distances = np.zeros((n_trials, len(candidates)))
    for trial in range(n_trials):
        fabricate_distance()
        to_vector(par,
                  edgelist="distance_test/distance.edgelist",
                  output="distance_test/distance.emb")
        d = read_p(read_path="distance_test/distance.emb")

        for col, cand in enumerate(candidates):
            gap = np.array(d['0']) - np.array(d[str(cand)])
            distances[trial, col] = np.linalg.norm(gap)

    sns.set()
    # Transpose so candidates run along the y axis and trials along the x axis.
    pl = sns.heatmap(distances.T, yticklabels=candidates)
    plt.title("Distance between loosely conneceted nodes under same setting")
    plt.ylabel('Distance from node 0')
    plt.xlabel('trials')
    pl.get_figure().savefig("distance_test/distance_graph.png")