Example #1
import numpy as np
from networkx.algorithms import link_prediction
from sklearn.metrics import roc_auc_score


def drop_edge(graph):
    """Hold out 20% of edges, then score Adamic-Adar and Jaccard on the remainder."""
    edges = list(graph.edges)
    indices = np.random.permutation(len(edges))
    bound = int(len(edges) * 0.8)
    training_idx, test_idx = indices[:bound], indices[bound:]

    test_edges = np.array(edges)[test_idx].tolist()
    # Remove the held-out edges so the predictors treat them as candidates.
    for u, v in test_edges:
        graph.remove_edge(u, v)

    pred_adamic = list(link_prediction.adamic_adar_index(graph))

    unpre_edges = [(u, v) for u, v, p in pred_adamic]
    score_adamic = [p for u, v, p in pred_adamic]
    pred_jaccard = list(link_prediction.jaccard_coefficient(graph))
    score_jaccard = [p for u, v, p in pred_jaccard]
    # Compare pairs order-insensitively: a held-out (u, v) may come back as (v, u).
    test_set = {frozenset(e) for e in test_edges}
    label = [1 if frozenset((u, v)) in test_set else 0 for u, v in unpre_edges]

    adamic_results = roc_auc_score(label, score_adamic)
    jaccard_results = roc_auc_score(label, score_jaccard)

    return adamic_results, jaccard_results
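
A minimal way to exercise this function, assuming NetworkX's bundled karate-club graph as stand-in data (the function mutates the graph it receives, so pass a throwaway copy):

import networkx as nx

g = nx.karate_club_graph()  # hypothetical toy input
adamic_auc, jaccard_auc = drop_edge(g)
print('Adamic-Adar AUC: %.3f, Jaccard AUC: %.3f' % (adamic_auc, jaccard_auc))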
Example #2
import logging

from networkx.algorithms.link_prediction import jaccard_coefficient


def jaccard_sims(g, bipartite_mode, pairs):
    '''
    Return a generator that yields tuples of the form (label1, label2, sim) for all
    similarities in the given node pairs. Nodes are (mode, label) tuples, hence the
    u[1]/v[1] indexing below.
    :param g: the artists-tags graph
    :param bipartite_mode: which set of nodes to calculate similarity for: ARTIST_MODE or TAG_MODE
    :param pairs: iterable of pairs of artist or tag nodes (an ebunch in networkx)
    '''

    if bipartite_mode not in [ARTIST_MODE, TAG_MODE]:
        logging.error('invalid value for bipartite mode: %d' % bipartite_mode)
        return

    for counter, (a1, a2) in enumerate(pairs):
        sim_iter = jaccard_coefficient(g, [(a1, a2)])
        # Python 3: next(iterator) replaces the Python 2 .next() method.
        (u, v, sim) = next(sim_iter)
        yield (u[1], v[1], sim)

        if counter % 10000 == 0:
            logging.info('Calculated similarity for pair %d, mode %d' % (counter, bipartite_mode))
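
A sketch of how this generator might be driven, assuming hypothetical values for the ARTIST_MODE/TAG_MODE module constants and the (mode, label) node tuples that the indexing above implies:

import itertools
import networkx as nx

ARTIST_MODE, TAG_MODE = 0, 1  # assumed values, not from the original module
g = nx.Graph()
g.add_edges_from([((ARTIST_MODE, 'nirvana'), (TAG_MODE, 'grunge')),
                  ((ARTIST_MODE, 'pixies'), (TAG_MODE, 'grunge'))])
artists = [n for n in g if n[0] == ARTIST_MODE]
for label1, label2, sim in jaccard_sims(g, ARTIST_MODE, itertools.combinations(artists, 2)):
    print(label1, label2, sim)  # -> nirvana pixies 1.0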
Example #4
import gc
import logging

import pandas as pd
from networkx import Graph, DiGraph
from networkx.algorithms import link_prediction as lp
from networkx.algorithms.centrality import (degree_centrality,
                                            in_degree_centrality,
                                            out_degree_centrality)
from tqdm import tqdm

# NOTE: project-local helpers (add_degree_features, shortest_path, source_katz,
# red, blue, and the other per-feature functions) are assumed to be imported
# from elsewhere in the same repository.


def extract_features(
    edge_list: list,
    G: Graph,
    DiG: DiGraph,
    page_rank: dict,
    katz: dict,
    parameters: dict,
) -> pd.DataFrame:
    """Extracts features for a list of edges on an undirected graph.

    Args:
        edge_list: a list of edges.

        G: a NetworkX undirected graph.

        DiG: a NetworkX directed graph.

        page_rank: dictionary containing page_rank measures.

        katz: dictionary containing katz centrality measures.

        parameters: parameters defined in parameters.yml.

    Returns:
        Pandas dataframe with edge features.
    """

    # Initialise logger and progress bar
    log = logging.getLogger(__name__)
    tqdm.pandas()

    # DEBUG ONLY: calculate features for subset
    subset = parameters["features"]["subset"]
    if subset:
        edges = edge_list[:subset]
        log.warning(
            red("Calculating features on first {} edges.".format(subset)))
    else:
        edges = edge_list
        log.warning(
            red("Calculating features on all {} edges.".format(len(edges))))

    # Calculate edge features
    try:

        # Initialise feature matrix
        log.info(blue("Initialising feature matrix..."))
        df = pd.DataFrame(dict(edge=edges))

        # Degree features
        log.info(blue("Calculating degree features..."))
        df = add_degree_features(DiG, df)

        # Undirected similarity
        log.info(blue("Calculating undirected similarity..."))
        df["RA_undirected"] = [
            x for u, v, x in lp.resource_allocation_index(G, df.edge)
        ]
        df["JC_undirected"] = [
            x for u, v, x in lp.jaccard_coefficient(G, df.edge)
        ]
        df["AA_undirected"] = [
            x for u, v, x in lp.adamic_adar_index(G, df.edge)
        ]
        df["PA_undirected"] = [
            x for u, v, x in lp.preferential_attachment(G, df.edge)
        ]

        # Shortest path
        log.info(blue("Calculating shortest path..."))
        df["shortest_path"] = df.edge.progress_apply(shortest_path, G=DiG)

        # Assortativity
        log.info(blue("Calculating average neighbor degree..."))
        df["source_avg_nbr_degree"] = df.edge.progress_apply(
            source_avg_nbr_degree, G=DiG)
        df["sink_avg_nbr_degree"] = df.edge.progress_apply(sink_avg_nbr_degree,
                                                           G=DiG)

        # Boundary size
        log.info(blue("Calculating boundary size..."))
        df["node_boundary_size"] = df.edge.progress_apply(node_boundary_size,
                                                          G=DiG)

        # Centrality
        log.info(blue("Calculating centrality..."))
        centrality = degree_centrality(DiG)
        df["source_centrality"] = df.edge.progress_apply(
            lambda e: centrality[e[0]])
        df["sink_centrality"] = df.edge.progress_apply(
            lambda e: centrality[e[1]])
        log.info(blue("Calculating in-degree centrality..."))
        in_centrality = in_degree_centrality(DiG)
        df["source_in_centrality"] = df.edge.progress_apply(
            lambda e: in_centrality[e[0]])
        df["sink_in_centrality"] = df.edge.progress_apply(
            lambda e: in_centrality[e[1]])
        log.info(blue("Calculating out-degree centrality..."))
        out_centrality = out_degree_centrality(DiG)
        df["source_out_centrality"] = df.edge.progress_apply(
            lambda e: out_centrality[e[0]])
        df["sink_out_centrality"] = df.edge.progress_apply(
            lambda e: out_centrality[e[1]])
        log.info(blue("Calculating Katz centrality..."))
        df["source_katz"] = df.edge.progress_apply(source_katz, katz=katz)
        df["sink_katz"] = df.edge.progress_apply(sink_katz, katz=katz)

        # Clustering features (source_clustering, sink_clustering) are disabled;
        # the commented-out originals applied them to X_train/X_valid/X_test
        # frames from an earlier version of this function.

        # PageRank
        log.info(blue("Calculating PageRank..."))
        df["source_page_rank"] = df.edge.progress_apply(source_page_rank,
                                                        page_rank=page_rank)
        df["sink_page_rank"] = df.edge.progress_apply(sink_page_rank,
                                                      page_rank=page_rank)

        # Efficiency
        log.info(blue("Calculating link efficiency..."))
        df["link_efficiency"] = df.edge.progress_apply(link_efficiency, G=G)

        # Reciprocity
        log.info(blue("Calculating reciprocity metrics..."))
        df["is_followed_back"] = df.edge.progress_apply(is_followed_back,
                                                        G=DiG)
        df["source_reciprocity"] = df.edge.progress_apply(source_reciprocity,
                                                          G=DiG)
        df["sink_reciprocity"] = df.edge.progress_apply(sink_reciprocity,
                                                        G=DiG)

        # Connectivity (marked TOO SLOW in the original) and link dispersion are
        # disabled; the commented-out code applied them to X_train/X_valid/X_test
        # frames from an earlier version of this function.

        # Remove edge column
        df = df.drop("edge", axis=1)

    except:
        # Free the (potentially large) partial frame before re-raising.
        del df
        gc.collect()
        raise

    return df
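
The four undirected similarity columns all follow the same pattern: each NetworkX link-prediction generator yields (u, v, score) triples for the candidate pairs it is given, and only the score is kept. A self-contained sketch of that pattern, using the karate-club graph and hypothetical candidate pairs:

import networkx as nx
import pandas as pd
from networkx.algorithms import link_prediction as lp

G = nx.karate_club_graph()
df = pd.DataFrame(dict(edge=[(0, 33), (5, 16)]))  # hypothetical candidate pairs
df["JC_undirected"] = [x for u, v, x in lp.jaccard_coefficient(G, df.edge)]
df["AA_undirected"] = [x for u, v, x in lp.adamic_adar_index(G, df.edge)]
print(df)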
Example #5
import networkx as nx
from networkx.algorithms.link_prediction import jaccard_coefficient
from sklearn.metrics import roc_auc_score
# take/nth are assumed to come from the curried toolz namespace, since nth(2)
# below is used as a partially applied function.
from toolz.curried import nth, take

from ptsplitter.utils import positive_edges, negative_edges, iter_get_scores_networkx


print("Reading in dataset.")
# connected_component_subgraphs was removed in NetworkX 2.4; take the largest
# component via connected_components and copy it so edges can be removed.
G = nx.read_edgelist("data_input/CA-AstroPh.txt")
G = G.subgraph(max(nx.connected_components(G), key=len)).copy()
sample_number = G.number_of_edges() // 2
G_original = nx.Graph(G)
positive_samples = list(take(sample_number, positive_edges(G)))
negative_samples = list(take(sample_number, negative_edges(G)))
G.remove_edges_from(positive_samples)

positive_scores_non_persona = list(
    map(nth(2), jaccard_coefficient(G, positive_samples))
)
negative_scores_non_persona = list(
    map(nth(2), jaccard_coefficient(G, negative_samples))
)

print(sum(positive_scores_non_persona))
print(sum(negative_scores_non_persona))

print(
    roc_auc_score(
        [1] * len(positive_samples) + [0] * len(negative_samples),
        positive_scores_non_persona + negative_scores_non_persona,
    )
)
Example #6

from networkx.algorithms.link_prediction import (cn_soundarajan_hopcroft,
                                                 ra_index_soundarajan_hopcroft,
                                                 adamic_adar_index,
                                                 resource_allocation_index,
                                                 jaccard_coefficient,
                                                 preferential_attachment)

print('sample negative edges')
# sample negative edges (sample_negative_edges below is a project-local helper)
# G.add_edges_from(target_test_edges)
target_neg_edges = sample_negative_edges(G, target_test_edges, 1)
print(len(target_neg_edges))
print(len(target_test_edges))
G.remove_edges_from(target_test_edges)

print('generate the models')
# calculate the scores for all testing edges
testing_tuples = [
    cn_soundarajan_hopcroft(G, target_test_edges, 'node_type'),
    ra_index_soundarajan_hopcroft(G, target_test_edges, 'node_type'),
    adamic_adar_index(G, target_test_edges),
    resource_allocation_index(G, target_test_edges),
    jaccard_coefficient(G, target_test_edges),
    preferential_attachment(G, target_test_edges)
]
# testing_tuples = [resource_allocation_index(G, test_edges[0:1000])]

# calculate the scores for all non-existing edges
neg_tuples = [
    cn_soundarajan_hopcroft(G, target_neg_edges, 'node_type'),
    ra_index_soundarajan_hopcroft(G, target_neg_edges, 'node_type'),
    adamic_adar_index(G, target_neg_edges),
    resource_allocation_index(G, target_neg_edges),
    jaccard_coefficient(G, target_neg_edges),
    preferential_attachment(G, target_neg_edges)
]
# neg_tuples = [resource_allocation_index(G, neg_edges)]
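
One requirement worth flagging: the two Soundarajan-Hopcroft variants need every node to carry the community attribute named in the call ('node_type' here), otherwise NetworkX raises an error. A minimal sketch of that requirement on a hypothetical toy graph:

import networkx as nx
from networkx.algorithms.link_prediction import cn_soundarajan_hopcroft

toy = nx.path_graph(4)
nx.set_node_attributes(toy, {0: 0, 1: 0, 2: 1, 3: 1}, 'node_type')
print(list(cn_soundarajan_hopcroft(toy, [(0, 2)], community='node_type')))  # [(0, 2, 1)]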
Example #7
import time

import networkx as nx
from networkx.algorithms.link_prediction import (resource_allocation_index,
                                                 jaccard_coefficient,
                                                 preferential_attachment)

G = nx.compose_all(graphs)

print('sample negative edges')
# sample negative edges (sample_negative_edges is a project-local helper)
start = time.time()
neg_edges = sample_negative_edges(G, test_edges[0:1000])
end = time.time()
print(end - start)

print(len(neg_edges))
print(len(test_edges))
G.remove_edges_from(test_edges)

print('generate the models')
# calculate the scores for all testing edges
testing_tuples = [resource_allocation_index(G, test_edges[0:1000]),
                  jaccard_coefficient(G, test_edges[0:1000]),
                  preferential_attachment(G, test_edges[0:1000])]
# testing_tuples = [resource_allocation_index(G, test_edges[0:1000])]

# calculate the scores for all non-existing edges
neg_tuples = [resource_allocation_index(G, neg_edges),
              jaccard_coefficient(G, neg_edges),
              preferential_attachment(G, neg_edges)]
# neg_tuples = [resource_allocation_index(G, neg_edges)]

# list of methods
models = ['resource_allocation_index', 'jaccard_coefficient', 'preferential_attachment']
# models = ['resource_allocation_index']

fout = open('baseline_performance.txt', 'w')
fout.write('Method\tAverage Precision Score\tAUROC\tAUPR\n')
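
The snippet ends before the loop that fills this file. A hedged sketch of what that evaluation loop could look like, assuming scikit-learn's metrics and the (u, v, score) triples yielded by the generators above:

from sklearn.metrics import average_precision_score, auc, precision_recall_curve, roc_auc_score

for model, pos_gen, neg_gen in zip(models, testing_tuples, neg_tuples):
    pos_scores = [p for _, _, p in pos_gen]
    neg_scores = [p for _, _, p in neg_gen]
    y_true = [1] * len(pos_scores) + [0] * len(neg_scores)
    y_score = pos_scores + neg_scores
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    fout.write('%s\t%f\t%f\t%f\n' % (model,
                                     average_precision_score(y_true, y_score),
                                     roc_auc_score(y_true, y_score),
                                     auc(recall, precision)))
fout.close()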