def generate_positive_features():
    """Build one feature vector per edge in the global ``positive_samples``.

    For each candidate edge, seven link-prediction scores are computed on
    the global undirected graph ``UG`` (in a fixed order that defines the
    column layout) and the class label ``1`` is appended.

    Returns:
        list[list[float]]: one vector per sample, in ``positive_samples``
        order.  A sample whose scoring raises keeps whatever scores were
        computed before the failure (possibly an empty vector), matching
        the original best-effort behaviour.
    """
    # Predictors applied in a fixed order; the feature-matrix column order
    # depends on this tuple.  The last three require node 'community'
    # attributes to be present on UG -- presumably set by the caller.
    metrics = (
        nx.resource_allocation_index,
        nx.jaccard_coefficient,
        nx.adamic_adar_index,
        nx.preferential_attachment,
        nx.cn_soundarajan_hopcroft,
        nx.ra_index_soundarajan_hopcroft,
        nx.within_inter_cluster,
    )
    features = []
    print("Generating positive features......")
    for count, sample in enumerate(positive_samples):
        if count % 100 == 0:
            print(count)  # coarse progress indicator
        feature = []
        try:
            for metric in metrics:
                # Each predictor yields (u, v, score) triples for the ebunch.
                for _u, _v, p in metric(UG, [sample]):
                    feature.append(p)
            feature.append(1)  # label=1
        except Exception as e:
            # Narrowed from a bare ``except``: keep the best-effort contract
            # but report what actually failed instead of swallowing it.
            print("one error at: " + str(count) + " (" + repr(e) + ")")
        features.append(feature)
    print("positive features: " + str(len(features)))
    return features
示例#2
0
def community_features(U, node_features, edges, sinks, sources):
    """Return a (num_edges, 2) array of community-based edge features.

    Column 0: 1 if the edge's source and sink land in the same fluid
    community, else 0.  Column 1: the within-inter-cluster score of the
    edge as returned by networkx.

    Args:
        U: undirected networkx graph whose nodes are integer indices
            0..node_features.shape[0]-1 (assumed from the indexing below --
            TODO confirm).
        node_features: array whose first dimension is the node count.
        edges: ebunch of (u, v) pairs scored by within_inter_cluster.
        sinks, sources: index arrays selecting the edge endpoints.

    NOTE(review): nx.within_inter_cluster requires a 'community' attribute
    on every node, yet it is called *before* this function assigns
    communities below -- presumably the caller set them earlier; verify.
    """
    within_inter_cluster = lisify_links(
        nx.within_inter_cluster(U, ebunch=edges))

    # asyn_fluidc yields an iterable of node sets -- one per community.
    communities_dict = nx.algorithms.community.asyn_fluidc(U, k=5)
    communities = []
    for community in communities_dict:
        communities.append(list(community))
    # Map every node index to the id of the community containing it
    # (fancy-indexed assignment, one community at a time).
    community_ids = np.zeros(node_features.shape[0])
    for i in range(5):
        community_ids[communities[i]] = i
    # 1 where both endpoints share a community, 0 otherwise.
    communities_same = (
        community_ids[sources] == community_ids[sinks]).astype(int)
    # Persist the assignment on the graph.  ``G.node`` is the pre-2.4
    # networkx API (removed in 2.4+; newer code uses ``G.nodes``), kept
    # here for consistency with the rest of this file.
    for i in range(node_features.shape[0]):
        U.node[i]['community'] = int(community_ids[i])
    community_features = np.vstack([communities_same, within_inter_cluster]).T
    logger.info(f'community_features generated: {community_features.shape}')
    return community_features
示例#3
0
def get_features(L, flag):
    """Compute a feature vector for every candidate node pair in ``L``.

    Each enabled flag appends exactly one numeric column to every pair's
    vector, in the fixed order g0..g12, c0..c12, h0..h18, so the matrix
    layout depends only on which flags are set.

    Args:
        L: list of (u, v) node pairs.
        flag: dict mapping feature names ('g0', ..., 'h18') to booleans
            (checked with ``is True``, matching the original contract).

    Returns:
        list[list[float]]: one feature vector per pair, in the order of L.

    Relies on module-level state: graphs G (social), B (bipartite),
    H / HB (heterogeneous), LG (location), and dicts U (users),
    C (check-ins per user), S (spots), plus ``calculate_CCC``.
    """
    X = [[] for _ in range(len(L))]

    # ------------------------- local helpers -------------------------

    def _append_column(values):
        """Append one value per pair, preserving the order of L."""
        for i, val in enumerate(values):
            X[i].append(val)

    def _without_edge(graph, u, v, compute, *args):
        """Run ``compute(*args)`` with the edge (u, v) temporarily removed.

        The edge is restored even if ``compute`` raises -- the original
        copy-pasted pattern would have lost the edge in that case.
        """
        had = graph.has_edge(u, v)
        if had:
            graph.remove_edge(u, v)
        try:
            return compute(*args)
        finally:
            if had:
                graph.add_edge(u, v)

    def _katz1(graph, u, v):
        """One-step Katz approximation: neighbour pairs that touch."""
        p = 0
        for x in graph.neighbors(u):
            for y in graph.neighbors(v):
                if x == y or graph.has_edge(x, y):
                    p += 1
        return p

    def _count_common(graph, u, v):
        """Number of common neighbours of u and v."""
        return sum(1 for _ in nx.common_neighbors(graph, u, v))

    def _scaled_sp(graph, u, v):
        """Shortest-path length scaled by 1/50000, or 1 when disconnected."""
        if nx.has_path(graph, u, v):
            return nx.shortest_path_length(graph, source=u, target=v) / 50000
        return 1

    def _euclid(a, b):
        """Euclidean distance between two (lat, lon) pairs."""
        return np.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2)

    def _load_communities(graph, fname):
        """Read 'node community' lines and tag nodes with their community.

        ``graph.node`` is the pre-2.4 networkx API, kept for consistency
        with the rest of this file.
        """
        with open(fname, "r") as f:
            for line in f:
                v, c = line.split()
                graph.node[v]['community'] = int(c)

    def _community_column(graph, fname, pred_fn, **kwargs):
        """Load a partition file, then score every pair in L with pred_fn."""
        _load_communities(graph, fname)
        preds = {(u, v): p for (u, v, p) in pred_fn(graph, L, **kwargs)}
        return [preds[pair] for pair in L]

    def _dendrogram_column(graph, fname, pred_fn):
        """Sum pred_fn scores over every non-trivial dendrogram level.

        The file interleaves 'level k' headers with 'node community'
        lines; each level except level 0 re-scores L with the partition
        accumulated so far, and one final scoring pass is added on top.
        """
        p_sum = {pair: 0.0 for pair in L}
        with open(fname, "r") as f:
            f.readline()  # skip the header line (matches original behaviour)
            for line in f:
                if 'level' in line:
                    if int(line.split()[1]) != 0:
                        for (u, v, p) in pred_fn(graph, L):
                            p_sum[(u, v)] += p
                else:
                    v, c = line.split()
                    graph.node[v]['community'] = int(c)
        final = {(u, v): p for (u, v, p) in pred_fn(graph, L)}
        return [p_sum[pair] + final[pair] for pair in L]

    #=====================Social features(user-to-user graph)======================

    #g0.adamic adar score
    if flag['g0'] is True:
        print("get feature g0")
        _append_column(p for _, _, p in nx.adamic_adar_index(G, L))

    #g1.jaccard coefficient
    if flag['g1'] is True:
        print("get feature g1")
        _append_column(p for _, _, p in nx.jaccard_coefficient(G, L))

    #g2.resource_allocation
    if flag['g2'] is True:
        print("get feature g2")
        _append_column(p for _, _, p in nx.resource_allocation_index(G, L))

    #g3.preferential_attachment
    if flag['g3'] is True:
        print("get feature g3")
        _append_column(p for _, _, p in nx.preferential_attachment(G, L))

    #g4.shortest path length (edge itself excluded)
    if flag['g4'] is True:
        print("get feature g4")
        _append_column(_without_edge(G, u, v, _scaled_sp, G, u, v)
                       for (u, v) in L)

    #g5.common neighbors (edge itself excluded)
    if flag['g5'] is True:
        print("get feature g5")
        _append_column(_without_edge(G, u, v, _count_common, G, u, v)
                       for (u, v) in L)

    #g6.Approximate katz for social graph
    if flag['g6'] is True:
        print("get feature g6")
        _append_column(_without_edge(G, u, v, _katz1, G, u, v)
                       for (u, v) in L)

    #g7-g9.community-aware scores on the best partition of G
    if flag['g7'] is True:
        print("get feature g7")
        _append_column(_community_column(
            G, "best_part_G.txt", nx.cn_soundarajan_hopcroft))

    if flag['g8'] is True:
        print("get feature g8")
        _append_column(_community_column(
            G, "best_part_G.txt", nx.ra_index_soundarajan_hopcroft))

    if flag['g9'] is True:
        print("get feature g9")
        _append_column(_community_column(
            G, "best_part_G.txt", nx.within_inter_cluster, delta=0.5))

    #g10-g12.community-aware scores summed over the dendrogram of G
    if flag['g10'] is True:
        print("get feature g10")
        _append_column(_dendrogram_column(
            G, "dendo_G.txt", nx.cn_soundarajan_hopcroft))

    if flag['g11'] is True:
        print("get feature g11")
        _append_column(_dendrogram_column(
            G, "dendo_G.txt", nx.ra_index_soundarajan_hopcroft))

    if flag['g12'] is True:
        print("get feature g12")
        _append_column(_dendrogram_column(
            G, "dendo_G.txt", nx.within_inter_cluster))

    #=========================checkin features=========================================

    #c0.follower number (fu*fv)
    if flag['c0'] is True:
        print("get feature c0")
        _append_column(U[u]['follow_cnt'] * U[v]['follow_cnt']
                       for (u, v) in L)

    #c1.same time same location
    if flag['c1'] is True:
        print("get feature c1")
        _append_column(calculate_CCC(G, u, v) for (u, v) in L)

    #c2.same time same distinct spot
    if flag['c2'] is True:
        print("get feature c2")  # typo "deature" fixed
        col = []
        for (u, v) in L:
            p = 0
            seen_spots = []
            for k in C[u]:
                # identical (time, spot) check-in in v's history, each
                # distinct spot counted once
                if k[1] not in seen_spots and k in C[v]:
                    seen_spots.append(k[1])
                    p += 1
            col.append(p)
        _append_column(col)

    #c3.same distinct spot (not necessarily same time)
    if flag['c3'] is True:
        print("get feature c3")
        col = []
        for (u, v) in L:
            p = 0
            seen_spots = []
            for k in C[u]:
                if k[1] not in seen_spots:
                    for m in C[v]:
                        if k[1] == m[1]:
                            seen_spots.append(k[1])
                            p += 1
                            break
            col.append(p)
        _append_column(col)

    #c4.min Entropy over shared check-ins (0 when none qualify)
    if flag['c4'] is True:
        print("get feature c4")
        col = []
        for (u, v) in L:
            entropies = [S[k[1]]['entropy'] for k in C[u]
                         if k in C[v] and k[1] in S
                         and S[k[1]]['entropy'] > 0]
            col.append(min(entropies) if entropies else 0)
        _append_column(col)

    #c5.distance of mean_LL
    if flag['c5'] is True:
        print("get feature c5")
        _append_column(_euclid(U[u]['mean_LL'], U[v]['mean_LL'])
                       for (u, v) in L)

    #c6.weighted same location
    if flag['c6'] is True:
        print("get feature c6")
        col = []
        for (u, v) in L:
            p = 0
            for k in C[u]:
                if k in C[v] and k[1] in S:
                    # distance of the shared spot from each user's mean
                    p += _euclid(S[k[1]]['LL'], U[u]['mean_LL'])
                    p += _euclid(S[k[1]]['LL'], U[v]['mean_LL'])
            col.append(p)
        _append_column(col)

    #c7.PP (check-in count product)
    if flag['c7'] is True:
        print("get feature c7")
        _append_column(len(C[u]) * len(C[v]) for (u, v) in L)

    #c8.Total Common Friend Closeness (TCFC)
    if flag['c8'] is True:
        print("get feature c8")

        def _tcfc(graph, u, v):
            # For each common friend w, multiply |CN(u,w)| * |CN(v,w)|.
            return sum(_count_common(graph, u, w) * _count_common(graph, v, w)
                       for w in nx.common_neighbors(graph, u, v))

        _append_column(_without_edge(G, u, v, _tcfc, G, u, v)
                       for (u, v) in L)

    #c9.Total Common friend Checkin Count (TCFCC)
    if flag['c9'] is True:
        print("get feature c9")

        def _tcfcc(graph, u, v):
            return sum(calculate_CCC(graph, u, w) * calculate_CCC(graph, v, w)
                       for w in nx.common_neighbors(graph, u, v))

        _append_column(_without_edge(G, u, v, _tcfcc, G, u, v)
                       for (u, v) in L)

    #c10.Common Category Checkin Counts Product (CCCP)
    if flag['c10'] is True:
        print("get feature c10")
        col = []
        for (u, v) in L:
            p = 0
            for cat, cnt_u in U[u]['cate'].items():
                if cat in U[v]['cate']:
                    p += cnt_u * U[v]['cate'][cat]
            col.append(p)
        _append_column(col)

    #c11.Common Category Checkin Counts Product Ratio (CCCPR)
    if flag['c11'] is True:
        print("get feature c11")
        col = []
        for (u, v) in L:
            # L2-style normalisation over each user's category counts
            u_norm = sum(cnt**2 for cnt in U[u]['cate'].values())
            v_norm = sum(cnt**2 for cnt in U[v]['cate'].values())
            p = 0
            for cat, cnt_u in U[u]['cate'].items():
                if cat in U[v]['cate']:
                    p += cnt_u * U[v]['cate'][cat] / np.sqrt(u_norm * v_norm)
            col.append(p)
        _append_column(col)

    #c12.trip route length all
    if flag['c12'] is True:
        print("get feature c12")

        def _trip_length(checkins):
            """Total Euclidean route length over known check-in spots.

            Spots missing from S are skipped.  Uses None (not the (0, 0)
            origin) as the 'no previous spot' sentinel, so a spot located
            exactly at (0.0, 0.0) no longer restarts the route.
            """
            total = 0.0
            last = None
            for k in checkins:
                if k[1] not in S:
                    continue
                ll = S[k[1]]['LL']
                if last is not None:
                    total += _euclid(last, ll)
                last = ll
            return total

        _append_column(_trip_length(C[u]) + _trip_length(C[v])
                       for (u, v) in L)

    #=========================Heter Graph features=====================================

    #h0.Approximate katz for bipartite graph
    if flag['h0'] is True:
        print("get feature h0")
        _append_column(_katz1(B, u, v) for (u, v) in L)

    #h1.Approximate katz on HB
    if flag['h1'] is True:
        print("get feature h1")
        _append_column(_without_edge(HB, u, v, _katz1, HB, u, v)
                       for (u, v) in L)

    #h2.Approximate katz on H
    if flag['h2'] is True:
        print("get feature h2")
        _append_column(_without_edge(H, u, v, _katz1, H, u, v)
                       for (u, v) in L)

    #h3.shortest path length on B
    if flag['h3'] is True:
        print("get feature h3")
        _append_column(_scaled_sp(B, u, v) for (u, v) in L)

    #h4.clustering coefficient on H
    if flag['h4'] is True:
        print("get feature h4")

        def _clustering_product(graph, u, v):
            return nx.clustering(graph, u) * nx.clustering(graph, v)

        _append_column(_without_edge(H, u, v, _clustering_product, H, u, v)
                       for (u, v) in L)

    #h5.number of (user's loc friends)'s loc friends
    if flag['h5'] is True:
        print("get feature h5")

        def _loc_friend_fanout(node):
            # Count second-hop neighbours reached through non-numeric
            # (location) neighbours.  NOTE(review): the original inner
            # condition re-tested the *outer* neighbour (always true at
            # that point), so every second-hop neighbour is counted;
            # ``neighbor2.isnumeric()`` may have been intended -- behaviour
            # preserved pending confirmation.
            total = 0
            for nb in H.neighbors(node):
                if not nb.isnumeric():
                    for _nb2 in H.neighbors(nb):
                        total += 1
            return total

        _append_column(_loc_friend_fanout(u) * _loc_friend_fanout(v)
                       for (u, v) in L)

    #h6.location friends' degree sum
    if flag['h6'] is True:
        print("get feature h6")

        def _loc_degree_sum(node):
            # Sum LG-degrees of the node's location (non-numeric) neighbours.
            total = 0
            for nb in H.neighbors(node):
                if not nb.isnumeric() and nb in LG:
                    total += LG.degree(nb)
            return total

        _append_column(_loc_degree_sum(u) * _loc_degree_sum(v)
                       for (u, v) in L)

    #h7.Approximate katz restricted to location (non-numeric) nodes
    if flag['h7'] is True:
        print("get feature h7")
        col = []
        for (u, v) in L:
            counter = 0
            for a in H.neighbors(u):
                if a.isnumeric():
                    continue
                for b in H.neighbors(v):
                    if not b.isnumeric() and (a == b or H.has_edge(a, b)):
                        counter += 1
            col.append(counter)
        _append_column(col)

    #h8.adamic adar score on H
    if flag['h8'] is True:
        print("get feature h8")
        _append_column(p for _, _, p in nx.adamic_adar_index(H, L))

    #h9.resource_allocation on H
    if flag['h9'] is True:
        print("get feature h9")
        _append_column(p for _, _, p in nx.resource_allocation_index(H, L))

    #h10.shortest path length on H (edge itself excluded)
    if flag['h10'] is True:
        print("get feature h10")
        _append_column(_without_edge(H, u, v, _scaled_sp, H, u, v)
                       for (u, v) in L)

    #h11.common neighbors on H (edge itself excluded)
    if flag['h11'] is True:
        print("get feature h11")
        _append_column(_without_edge(H, u, v, _count_common, H, u, v)
                       for (u, v) in L)

    #h12.Approximate katz on H (duplicate of h2, kept for column layout)
    if flag['h12'] is True:
        print("get feature h12")
        _append_column(_without_edge(H, u, v, _katz1, H, u, v)
                       for (u, v) in L)

    #h13-h15.community-aware scores on the best partition of HB
    if flag['h13'] is True:
        print("get feature h13")
        _append_column(_community_column(
            HB, "best_part_HB.txt", nx.cn_soundarajan_hopcroft))

    if flag['h14'] is True:
        print("get feature h14")
        _append_column(_community_column(
            HB, "best_part_HB.txt", nx.ra_index_soundarajan_hopcroft))

    if flag['h15'] is True:
        print("get feature h15")
        _append_column(_community_column(
            HB, "best_part_HB.txt", nx.within_inter_cluster, delta=0.5))

    #h16-h18.community-aware scores summed over the dendrogram of HB
    if flag['h16'] is True:
        print("get feature h16")
        _append_column(_dendrogram_column(
            HB, "dendo_HB.txt", nx.cn_soundarajan_hopcroft))

    if flag['h17'] is True:
        print("get feature h17")
        _append_column(_dendrogram_column(
            HB, "dendo_HB.txt", nx.ra_index_soundarajan_hopcroft))

    if flag['h18'] is True:
        print("get feature h18")
        _append_column(_dendrogram_column(
            HB, "dendo_HB.txt", nx.within_inter_cluster))

    return X
示例#4
0
                     delimiter="\t",
                     nodetype=str)

# Tag every node with its best-partition community (needed by the
# community-aware predictors below).
partition = community.best_partition(G)
nx.set_node_attributes(G, name='community', values=partition)

# Score every candidate node pair with eight link-prediction measures.
ap = list(all_pairs(G.nodes()))

cn = cn.cnbors(G, ap)
rai = nx.resource_allocation_index(G, ap)
jc = nx.jaccard_coefficient(G, ap)
aai = nx.adamic_adar_index(G, ap)
pa = nx.preferential_attachment(G, ap)
ccn = nx.cn_soundarajan_hopcroft(G, ap)
cra = nx.ra_index_soundarajan_hopcroft(G, ap)
wic = nx.within_inter_cluster(G, ap, community='community')

# Collect the aligned (u, v, score) streams into parallel columns; all
# generators iterate ``ap`` in the same order, so row i of every stream
# describes the same pair.
u, v, s1, s2, s3, s4, s5, s6, s7, s8, has_edge = ([] for i in range(11))
for m1, m2, m3, m4, m5, m6, m7, m8 in zip(cn, rai, jc, aai, pa, ccn, cra, wic):
    u.append(m1[0])
    v.append(m1[1])
    s1.append(m1[2])
    s2.append(m2[2])
    s3.append(m3[2])
    s4.append(m4[2])
    s5.append(m5[2])
    s6.append(m6[2])
    s7.append(m7[2])
    s8.append(m8[2])
    # BUG FIX: the label must test the pair's own endpoints (m1[0], m1[1]);
    # the original mixed endpoints from two different streams (m1[0], m2[1]).
    has_edge.append(int(G.has_edge(m1[0], m1[1])))
 positive_predictions_proba_jcc.append(
     list(nx.jaccard_coefficient(G, [edge]))[0][2])
 positive_predictions_proba_ra.append(
     list(nx.resource_allocation_index(G, [edge]))[0][2])
 positive_predictions_proba_aa.append(
     list(nx.adamic_adar_index(G, [edge]))[0][2])
 positive_predictions_proba_pa.append(
     list(nx.preferential_attachment(G, [edge]))[0][2])
 positive_predictions_proba_cnsh.append(
     list(nx.cn_soundarajan_hopcroft(
         G, [edge]))[0][2])  # needs community information
 positive_predictions_proba_rash.append(
     list(nx.ra_index_soundarajan_hopcroft(
         G, [edge]))[0][2])  # needs community information
 positive_predictions_proba_wic.append(
     list(nx.within_inter_cluster(
         G, [edge]))[0][2])  # needs community information
 positive_predictions_proba_slp_DegCent.append(
     list(SLP_prediction(G, [edge], centrality="DegCent"))[0][2])
 positive_predictions_proba_slp_EigenCent.append(
     list(SLP_prediction(G, [edge], centrality="EigenCent"))[0][2])
 positive_predictions_proba_slp_ClosenessCent.append(
     list(SLP_prediction(G, [edge], centrality="ClosenessCent"))[0][2])
 positive_predictions_proba_slp_BetweenCent.append(
     list(SLP_prediction(G, [edge], centrality="BetweenCent"))[0][2])
 positive_predictions_proba_slp_PageRank.append(
     list(SLP_prediction(G, [edge], centrality="PageRank"))[0][2])
 positive_predictions_proba_slpc_DegCent.append(
     list(SLPC_prediction(
         G, [edge],
         centrality="DegCent"))[0][2])  # needs community information
 positive_predictions_proba_slpc_EigenCent.append(
    def set_edge_weight(self, edge_weight_method='weight'):
        """Recompute the 'weight' attribute of every edge of ``self.G``.

        Args:
            edge_weight_method: one of the centrality / link-prediction
                method names below, ``'weight'`` to leave the existing
                weights untouched, or ``None`` to reset every weight to 1.

        Returns:
            1 after the weights are written; ``None`` (implicit) when the
            method is ``'weight'`` and nothing is changed -- matching the
            original contract.

        Raises:
            ValueError: for an unrecognised method name (the original code
                failed later with a NameError on the undefined score table).
        """
        if edge_weight_method == 'weight':
            return

        def _pred_scores(pred_fn):
            """Map each existing edge to its link-prediction score."""
            return {(u, v): p
                    for u, v, p in pred_fn(self.G, self.G.edges())}

        if edge_weight_method is None:
            C = None  # every edge simply gets weight 1 below

        # Centrality based methods

        elif edge_weight_method == 'edge_betweenness_centrality':
            print("computing edge_betweenness_centrality..")
            C = nx.edge_betweenness_centrality(self.G, weight='weight')
            print("done!")

        elif edge_weight_method == 'edge_betweenness_centrality_subset':
            # NOTE(review): despite the name, the original computed
            # edge_current_flow_betweenness_centrality here (no *_subset);
            # behaviour preserved pending confirmation.
            print("computing edge_betweenness_centrality_subset..")
            C = nx.edge_current_flow_betweenness_centrality(self.G,
                                                            weight='weight')
            print('done')

        elif edge_weight_method == 'edge_current_flow_betweenness_centrality_subset':
            print(
                "computing edge_current_flow_betweenness_centrality_subset..")
            C = nx.edge_current_flow_betweenness_centrality_subset(
                self.G, weight='weight')
            print('done')

        elif edge_weight_method == 'edge_load_centrality':
            print("computing edge_load_centrality..")
            C = nx.edge_load_centrality(self.G)
            print('done!')

        # Link Prediction based methods

        elif edge_weight_method == 'adamic_adar_index':
            print("computing adamic_adar_index ..")
            C = _pred_scores(nx.adamic_adar_index)

        elif edge_weight_method == 'ra_index_soundarajan_hopcroft':
            print("computing ra_index_soundarajan_hopcroft ..")
            C = _pred_scores(nx.ra_index_soundarajan_hopcroft)

        elif edge_weight_method == 'preferential_attachment':
            print("computing preferential_attachment ..")
            C = _pred_scores(nx.preferential_attachment)

        #elif edge_weight_method=='cn_soundarajan_hopcroft':
        #    C = _pred_scores(nx.cn_soundarajan_hopcroft)

        elif edge_weight_method == 'within_inter_cluster':
            print("computing within_inter_cluster ..")
            C = _pred_scores(nx.within_inter_cluster)

        elif edge_weight_method == 'resource_allocation_index':
            print("computing resource allocation index ..")
            C = _pred_scores(nx.resource_allocation_index)

        elif edge_weight_method == 'jaccard_coefficient':
            print("computing jaccard_coefficient..")
            C = _pred_scores(nx.jaccard_coefficient)
            print('done!')

        else:
            # Fail fast instead of the original NameError in the loop below.
            raise ValueError(
                "unknown edge_weight_method: %r" % (edge_weight_method,))

        for u, v, d in self.G.edges(data=True):
            d['weight'] = 1 if C is None else C[(u, v)]

        return 1
示例#7
0
 def within_inter_cluster(uG, ni, nj, rand_node):
     """Return the within-inter-cluster scores of (ni, nj) and (ni, rand_node) on graph uG."""
     scores = [p for _, _, p in
               nx.within_inter_cluster(uG, [(ni, nj), (ni, rand_node)])]
     return scores[0], scores[1]
示例#8
0
    def compute_variable(self,
                         variable_name,
                         train: bool,
                         load=True,
                         path_to_file=None,
                         save=True):
        """Compute (or load from a cached ``.npy`` file) one feature column.

        Parameters
        ----------
        variable_name : str
            Feature name; must belong to ``self.handled_variables``.
        train : bool
            If True, iterate over ``self.train_array`` (whose rows carry a
            link label in ``t[2]``); otherwise over ``self.test_array``.
        load : bool
            If True, try to return a previously saved column first.
        path_to_file : str or None
            Optional custom path to a saved ``.npy`` column.
        save : bool
            If True, persist the freshly computed column under ``variables/``.

        Returns
        -------
        numpy.ndarray
            One float per sample; NaN entries are zeroed before returning.
        """
        assert variable_name in self.handled_variables, "Variable %s is not handled. Handled variables are : %s" % (
            variable_name, str(self.handled_variables))

        # --- Cache lookup (train) --------------------------------------
        if load and train:
            if path_to_file is None and os.path.isfile(
                    "variables/%s.npy" % variable_name):
                print("Loading STANDARD %s file!" % variable_name)
                result = np.load("variables/%s.npy" % variable_name)
                return result[:self.nb_training_samples]
            elif path_to_file is not None and os.path.isfile(path_to_file):
                print("Loading CUSTOM %s file!" % variable_name)
                result = np.load(path_to_file)
                return result[:self.nb_training_samples]
            print("Did not find saved %s in `variables` folder." %
                  variable_name)

        # --- Cache lookup (test) ---------------------------------------
        if load and not train:
            if path_to_file is None and os.path.isfile(
                    "variables/TEST_%s.npy" % variable_name):
                print("Loading STANDARD TEST_%s file!" % variable_name)
                result = np.load("variables/TEST_%s.npy" % variable_name)
                # BUG FIX: test columns were truncated to the number of
                # *training* samples; use the testing count instead.
                return result[:self.nb_testing_samples]
            elif path_to_file is not None and os.path.isfile(path_to_file):
                print("Loading CUSTOM %s file!" % variable_name)
                result = np.load(path_to_file)
                return result[:self.nb_testing_samples]
            print("Did not find saved TEST_%s in `variables` folder." %
                  variable_name)

        print("Starting computation of %s..." % variable_name)
        t1 = time()
        gd = self.graph_structure.graph_dicts  # per-node graph dictionaries
        nb_of_samples = (self.nb_training_samples
                         if train else self.nb_testing_samples)
        result = np.zeros(shape=nb_of_samples)

        def _lp_score(predictor, t):
            # Link-prediction score of the pair (t[0], t[1]).  For a positive
            # training pair the edge is temporarily removed so the score is
            # not computed on a graph that already contains the answer.
            g = self.graph_structure.g
            if train and t[2] == 1:
                g.remove_edge(t[0], t[1])
                score = next(predictor(g, [(t[0], t[1])]))[2]
                g.add_edge(t[0], t[1])
            else:
                score = next(predictor(g, [(t[0], t[1])]))[2]
            return score

        for i in range(nb_of_samples):
            t = self.train_array[i] if train else self.test_array[i]

            if variable_name == "publication_2":
                # Log of (1 + shared "publication_2" entries between nodes).
                result[i] = np.log(
                    len(
                        set(self.node_information.loc[t[0], "publication_2"])
                        & set(self.node_information.loc[t[1],
                                                        "publication_2"])) + 1)
            elif variable_name == "adam_coeff":
                result[i] = _lp_score(
                    nx.algorithms.link_prediction.adamic_adar_index, t)
            elif variable_name == "overlapping_words_in_title":
                result[i] = compute_intersection(
                    self.node_information.loc[t[0], "title"],
                    self.node_information.loc[t[1], "title"], self.stemmer,
                    self.stpwds)
            elif variable_name == "number_of_common_authors":
                result[i] = nbr_common_authors(
                    self.node_information.loc[t[0], "author"],
                    self.node_information.loc[t[1], "author"])

            elif variable_name == "difference_of_years":
                result[i] = abs(self.node_information.loc[t[0], 'year'] -
                                self.node_information.loc[t[1], 'year'])

            elif variable_name == "affinity_between_authors":
                result[i] = compute_affinity_between_authors(
                    self.node_information.loc[t[0], 'author'],
                    self.node_information.loc[t[1], 'author'],
                    self.authors_dict)
            elif variable_name == "identical_journal":
                # int() instead of np.int: the np.int alias was removed in
                # NumPy 1.24.
                result[i] = int(self.node_information.loc[t[0], 'journal'] ==
                                self.node_information.loc[t[1], 'journal'])

            elif variable_name == "l2_distance":
                result[i] = np.linalg.norm(
                    self.node_information.loc[t[0], 'wv'] -
                    self.node_information.loc[t[1], 'wv'])

            elif variable_name == "cosine_distance_tfid":
                v1 = self.node_information.loc[t[0], "wv_tfid"]
                v2 = self.node_information.loc[t[1], "wv_tfid"]
                # A TypeError from np.isnan means the value is a proper
                # vector (not a NaN placeholder), hence usable.
                try:
                    b1 = np.isnan(v1)
                except TypeError:
                    b1 = False
                try:
                    b2 = np.isnan(v2)
                except TypeError:
                    b2 = False
                if not b1 and not b2:
                    result[i] = cosine_similarity(v1, v2)
                else:
                    result[i] = 0

            elif variable_name == "l2_distance_between_titles":
                dst = np.linalg.norm(
                    self.node_information.loc[t[0], 'title_wv'] -
                    self.node_information.loc[t[1], 'title_wv'])
                result[i] = 0 if np.isnan(dst) else dst

            # elif variable_name == "cosine_distance_between_titles":
            #     result[i] = cosine_distances(
            #         np.nan_to_num(self.node_information.loc[t[0], 'title_wv']).reshape(-1, 1) - (self.node_information.loc[t[1], 'title_wv']).reshape(-1, 1)
            #     )[0][0]

            elif variable_name == "common_neighbors":
                result[i] = len(
                    sorted(
                        nx.common_neighbors(self.graph_structure.g, t[0],
                                            t[1])))

            elif variable_name == "clustering_coeff":
                result[i] = gd["clustering_coeff"][
                    t[0]] * gd["clustering_coeff"][t[1]]

            elif variable_name == "betweenness":
                result[i] = gd["betweenness"][t[0]] * gd["betweenness"][t[1]]

            elif variable_name == "closeness":
                result[i] = gd["closeness"][t[0]] * gd["closeness"][t[1]]

            elif variable_name == "degree":
                result[i] = gd["degree"][t[0]] * gd["degree"][t[1]]

            elif variable_name == "eigenvector":
                result[i] = gd["eigenvector"][t[0]] * gd["eigenvector"][t[1]]

            elif variable_name == "jaccard_coeff":
                result[i] = _lp_score(nx.jaccard_coefficient, t)

            elif variable_name == "shortest_path":
                g = self.graph_structure.g

                def _inv_spl():
                    # Inverse shortest-path length; 0 when no path exists.
                    try:
                        return 1 / nx.algorithms.shortest_paths.generic.shortest_path_length(
                            g, t[0], t[1])
                    except nx.NetworkXNoPath:
                        return 0

                if train and t[2] == 1:
                    assert g.has_edge(
                        t[0], t[1]
                    ), "There's a problem with the structure of the graph for id %i and %i" % (
                        t[0], t[1])
                    g.remove_edge(t[0], t[1])
                    result[i] = _inv_spl()
                    g.add_edge(t[0], t[1])
                else:
                    result[i] = _inv_spl()

            elif variable_name == "pagerank":
                result[i] = gd["pagerank"][t[0]] * gd["pagerank"][t[1]]

            elif variable_name == "community":
                # 1 if both endpoints fall in the same community partition.
                partition = self.graph_structure.partition
                result[i] = 1 if partition[t[0]] == partition[t[1]] else 0

            elif variable_name == "lp_resource_allocation_index":
                result[i] = _lp_score(nx.resource_allocation_index, t)

            elif variable_name == "lp_preferential_attachment":
                result[i] = _lp_score(nx.preferential_attachment, t)

            elif variable_name == "lp_cn_soundarajan":
                result[i] = _lp_score(nx.cn_soundarajan_hopcroft, t)

            elif variable_name == "lp_ra_index_soundarajan":
                result[i] = _lp_score(nx.ra_index_soundarajan_hopcroft, t)

            elif variable_name == "lp_within_inter_cluster":
                result[i] = _lp_score(nx.within_inter_cluster, t)

        print("Did %s column in %5.1fs" % (variable_name, time() - t1))
        if save and train:
            print("Saved variable %s in `variables` directory." %
                  variable_name)
            np.save("variables/" + variable_name, result)
        if save and not train:
            np.save("variables/TEST_" + variable_name, result)
            print("Saved variable TEST_%s in `variables` directory." %
                  variable_name)
        # BUG FIX: `np.isnan(result).shape[0] >= 1` was always true for any
        # non-empty column; warn only when a NaN is actually present.
        if np.isnan(result).any():
            print("Careful, you have nan values !")
            result[np.isnan(result)] = 0
        return result