Exemplo n.º 1
0
def preprocess(setup, nw_outpath, i):
    """
    Graph preprocessing routine.

    Loads the i-th network described by `setup`, preprocesses it and,
    if requested, stores the result under `nw_outpath`. Returns the
    preprocessed graph and the node-id mapping.
    """
    print('Preprocessing graph...')

    # Sign prediction ('sp') uses integer edge labels; every other task
    # reads edge weights as floats.
    weight_type = int if setup.task == 'sp' else float
    G = pp.load_graph(setup.inpaths[i], delimiter=setup.separators[i], comments=setup.comments[i],
                      directed=setup.directed, datatype=weight_type)

    # Link prediction with random splits keeps the whole graph; all other
    # configurations use prep_graph's default (restrict to the main cc).
    keep_all_ccs = setup.task == 'lp' and setup.split_alg == 'random'
    if keep_all_ccs:
        G, ids = pp.prep_graph(G, relabel=setup.relabel, del_self_loops=setup.del_selfloops, maincc=False)
    else:
        G, ids = pp.prep_graph(G, relabel=setup.relabel, del_self_loops=setup.del_selfloops)

    # Optionally persist the preprocessed network (as a directed, unweighted edgelist)
    if setup.save_prep_nw:
        pp.save_graph(G, output_path=os.path.join(nw_outpath, 'prep_nw.edgelist'), delimiter=setup.delimiter,
                      write_stats=setup.write_stats, write_weights=False, write_dir=True)

    # Return the preprocessed graph
    return G, ids
Exemplo n.º 2
0
def test():
    """
    Round-trip a network through save/load, preprocess it and print stats
    at each stage to allow a manual consistency check.
    """
    # Input/output locations
    data_dir = "./data/"
    out_dir = "./data/"
    fname = "network.edgelist"

    # Read the network from disk as a directed graph
    G = pp.load_graph(data_dir + fname,
                      delimiter=',',
                      comments='#',
                      directed=True)

    # Stats of the freshly loaded graph
    print("")
    print("Original graph stats:")
    print("-----------------------------------------")
    pp.get_stats(G)

    # Write the graph back out...
    pp.save_graph(G, out_dir + "orig_graph.edgelist", delimiter=",")

    # ...and read it again to verify the round trip
    G2 = pp.load_graph(out_dir + "orig_graph.edgelist",
                       delimiter=",",
                       comments='#',
                       directed=True)

    # Stats should match the original graph's
    print("Has the same stats after being loaded?:")
    print("-----------------------------------------")
    pp.get_stats(G2)

    # Relabel nodes, keep self-loops (main cc restriction is the default)
    GP, ids = pp.prep_graph(G2, del_self_loops=False, relabel=True)

    print("Preprocessed graph stats (restricted to main cc):")
    print("-----------------------------------------")
    pp.get_stats(GP)

    pp.save_graph(GP, out_dir + "prep_graph.edgelist", delimiter=",")

    # Show a sample of the old-to-new node id mapping
    print("Sample of 10 (oldNodeID, newNodeID):")
    print("-----------------------------------------")
    print(ids[:10])

    # Non-edges whose reversed edge exists in the graph, written to CSV
    pp.get_redges_false(GP, out_dir + "redges_false.csv")
Exemplo n.º 3
0
def prep_fb(inpath):
    """
    Preprocess the Facebook wall-post graph.

    The raw file lists edges as (destination, origin), so edge directions
    are reversed after loading. Returns the preprocessed graph.
    """
    # Read the tab-separated edgelist as a directed graph
    fb_graph = pp.load_graph(inpath, delimiter='\t', comments='#', directed=True)

    # The FB graph is stored as destination, origin so it needs to be reversed
    fb_graph = fb_graph.reverse()

    # Relabel nodes; self-loops are kept (the id mapping is discarded)
    fb_graph, _ = pp.prep_graph(fb_graph, relabel=True, del_self_loops=False)

    return fb_graph
Exemplo n.º 4
0
def test_split():
    """
    Compare the LERW (Wilson) and Broder spanning-tree algorithms.

    Runs both algorithms on the same preprocessed undirected graph and
    reports timings, edge counts, whether every returned tree edge belongs
    to the graph, and whether each spanning tree forms a single connected
    component.
    """
    # Variables
    dataset_path = "./data/"
    test_name = "network.edgelist"

    # Load a graph
    SG = pp.load_graph(dataset_path + test_name,
                       delimiter=",",
                       comments='#',
                       directed=False)

    # Preprocess the graph (relabel nodes, drop self-loops)
    SG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True)
    print("Number of CCs input: {}".format(nx.number_connected_components(SG)))

    # Store the edges in the graphs as a set E
    E = set(SG.edges())

    # Use LERW approach to get the ST
    start = time.time()
    train_lerw = stt.wilson_alg(SG, E)
    end1 = time.time() - start

    # Use BRO approach to get the ST
    start = time.time()
    train_bro = stt.broder_alg(SG, E)
    end2 = time.time() - start

    print("LERW time: {}".format(end1))
    print("Bro time: {}".format(end2))

    print("Num tr_e lerw: {}".format(len(train_lerw)))
    print("Num tr_e bro: {}".format(len(train_bro)))

    # An empty set difference means every tree edge is a graph edge
    print("All tr_e in E for lerw?: {}".format(train_lerw - E))
    print("All tr_e in E for bro?: {}".format(train_bro - E))

    # Check that the graph generated with lerw has indeed one single cc
    TG_lerw = nx.Graph()
    TG_lerw.add_edges_from(train_lerw)
    print("Number of CCs with lerw: {}".format(
        nx.number_connected_components(TG_lerw)))

    # Check that the graph generated with broder algorithm has indeed one single cc
    TG_bro = nx.Graph()
    TG_bro.add_edges_from(train_bro)
    # Fixed: this label previously said "lerw" although it reports the Broder result
    print("Number of CCs with bro: {}".format(
        nx.number_connected_components(TG_bro)))
Exemplo n.º 5
0
def preprocess(inpath, outpath, delimiter, directed, relabel, del_self_loops):
    """
    Graph preprocessing routine.

    Loads the network at `inpath`, preprocesses it according to the given
    flags and writes the result to `outpath` + 'prep_graph.edgelist'.
    Returns the preprocessed graph.
    """
    print('Preprocessing graph...')

    # Read the raw edgelist
    graph = pp.load_graph(inpath, delimiter=delimiter, comments='#', directed=directed)

    # Relabel / clean as requested by the caller (id mapping is discarded)
    graph, _ = pp.prep_graph(graph, relabel=relabel, del_self_loops=del_self_loops)

    # Persist the cleaned network: space separated, no stats header
    pp.save_graph(graph, output_path=outpath + "prep_graph.edgelist", delimiter=' ', write_stats=False)

    return graph
Exemplo n.º 6
0
def run_test():
    """
    Evaluate the LP baselines and Katz on the test network, timing the
    split computation and each evaluation step.
    """
    # Fix the RNG seeds for reproducibility
    random.seed(42)
    np.random.seed(42)

    # Input parameters
    filename = "./data/network.edgelist"
    directed = False

    # Load and preprocess the test graph
    G = pp.load_graph(filename, delimiter=",", comments='#', directed=directed)
    G, ids = pp.prep_graph(G)

    # Print some stats about the graph
    pp.get_stats(G)

    # Generate one train/test split with all edges in train set
    start = time()
    traintest_split = split.EvalSplit()
    traintest_split.compute_splits(G, train_frac=0.9)
    elapsed = time() - start
    print("\nSplits computed in {} sec".format(elapsed))

    # Create an evaluator
    nee = evaluator.LPEvaluator(traintest_split)

    # Test baselines
    start = time()
    test_baselines(nee, directed)
    elapsed = time() - start
    print("\nBaselines computed in {} sec".format(elapsed))

    # Test Katz
    start = time()
    test_katz(nee)
    elapsed = time() - start
    print("\nKatz computed in {} sec".format(elapsed))
Exemplo n.º 7
0
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: [email protected]
# Date: 18/12/2018

# This simple example is the one presented in the README.md file.
# Network reconstruction and sign prediction can be computed in the same manner by simply substituting LPEvaluator and
# LPEvalSplit by NREvaluator and NREvalSplit or SPEvaluator and SPEvalSplit.

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.score import Scoresheet
from evalne.evaluation.split import LPEvalSplit
from evalne.utils import preprocess as pp

# Load and preprocess the network (default prep_graph options; id mapping discarded)
G = pp.load_graph('../../evalne/tests/data/network.edgelist')
G, _ = pp.prep_graph(G)

# Create an evaluator and generate train/test edge split (default split parameters)
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G)
nee = LPEvaluator(traintest_split)

# Create a Scoresheet to store the results
scoresheet = Scoresheet()

# Set the baselines
methods = ['random_prediction', 'common_neighbours', 'jaccard_coefficient']

# Evaluate baselines
for method in methods:
Exemplo n.º 8
0
def test_stt():
    """
    Compare the spanning-tree split (stt) with the random split (rstt).

    Splits the same network with both stt.split_train_test and
    stt.rand_split_train_test, generates false edges under the open world
    assumption for each, stores every split to disk, and spot-checks that
    the node relabelling produced by pp.relabel_nodes maps sampled
    train/test edges correctly.
    """
    # Variables
    dataset_path = "./data/"
    test_name = "network.edgelist"
    frac = 0.5  # fraction of edges assigned to the train set

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter=",",
                      comments='#',
                      directed=False)

    # Preprocess the graph for stt alg.
    SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True, maincc=True)

    # Split train/test using stt
    start = time.time()
    train_E, test_E = stt.split_train_test(SG, train_frac=frac)
    end1 = time.time() - start

    # Compute the false edges (open world assumption; sizes default to the true-edge counts)
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        SG,
        train_E=train_E,
        test_E=test_E,
        num_fe_train=None,
        num_fe_test=None)
    # Store data to file
    _ = stt.store_train_test_splits(dataset_path + "stt_frac_" + str(frac),
                                    train_E=train_E,
                                    train_E_false=train_E_false,
                                    test_E=test_E,
                                    test_E_false=test_E_false,
                                    split_id=0)

    # Split train/test using rstt (operates on the raw graph G, not SG)
    start = time.time()
    tr_E, te_E = stt.rand_split_train_test(G, train_frac=frac)
    end2 = time.time() - start

    # Relabel nodes to sequential integers; J is the relabelled graph, mp the old->new map.
    # NOTE: train_E/test_E are rebound here, discarding the stt split above.
    train_E, test_E, J, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())

    print("Number of nodes in G: {}".format(len(G.nodes())))
    print("Number of nodes in J: {}".format(len(J.nodes())))
    # Empty set difference <=> node ids are exactly 0..n-1
    print("Are nodes in J sequential integers? {}".format(
        not len(set(J.nodes()) - set(range(len(J.nodes()))))))

    checks = list()
    queries = 200
    # Check if the mapping is correct (pop mutates tr_E)
    for i in range(queries):
        ag = tr_E.pop()  # a random element from train
        aj = (mp[ag[0]], mp[ag[1]])  # check what it maps to in J
        checks.append(aj in train_E)
        # print("Random tuple from G: {}".format(ag))
        # print("The tuple maps in J to: {}".format(aj))
        # print("Is that tuple in the new train?: {}".format(aj in train_E))

    print(
        "For train edges out of {} samples, {} were in the relabeled train_E".
        format(queries, sum(checks)))

    checks = list()
    # Check if the mapping is correct (pop mutates te_E)
    for i in range(queries):
        ag = te_E.pop()  # a random element from test
        aj = (mp[ag[0]], mp[ag[1]])  # check what it maps to in J
        checks.append(aj in test_E)
        # print("Random tuple from G: {}".format(ag))
        # print("The tuple maps in J to: {}".format(aj))
        # print("Is that tuple in the new train?: {}".format(aj in train_E))

    print("For test edges out of {} samples, {} were in the relabeled test_E".
          format(queries, sum(checks)))

    # Compute the false edges for the relabelled rstt split
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        J, train_E=train_E, test_E=test_E, num_fe_train=None, num_fe_test=None)
    # Store data to file
    _ = stt.store_train_test_splits(dataset_path + "rstt_frac_" + str(frac),
                                    train_E=train_E,
                                    train_E_false=train_E_false,
                                    test_E=test_E,
                                    test_E_false=test_E_false,
                                    split_id=0)
Exemplo n.º 9
0
def test_split():
    """
    Compare stt and rstt train/test splitting on a random subgraph.

    Takes a random 'subgraph_size'-node subgraph of the input network,
    splits it with stt.split_train_test (graph restricted to main cc) and
    with stt.rand_split_train_test (all components kept), then reports
    timings, edge counts and the number of connected components of each
    resulting train graph.
    """
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 400
    train_frac = 0.5
    directed = True

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter=",",
                      comments='#',
                      directed=directed)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes
    SG = G.subgraph(random.sample(G.nodes, subgraph_size)).copy()

    # Preprocess the graph, restricted to the main connected component
    PSG, ids = pp.prep_graph(SG,
                             relabel=True,
                             del_self_loops=True,
                             maincc=True)

    # Save the preprocessed graph
    pp.save_graph(PSG, output_path + "prep_graph.edgelist", delimiter=",")

    # Compute train/test splits
    start = time.time()
    train_stt, test_stt = stt.split_train_test(PSG, train_frac=train_frac)
    end = time.time() - start
    print("Exec time stt: {}".format(end))

    # Check that the train graph generated with stt has one single cc
    if directed:
        TG_stt = nx.DiGraph()
        TG_stt.add_edges_from(train_stt)
        print("Number of weakly CCs with stt: {}".format(
            nx.number_weakly_connected_components(TG_stt)))
    else:
        TG_stt = nx.Graph()
        TG_stt.add_edges_from(train_stt)
        print("Number of CCs with stt: {}".format(
            nx.number_connected_components(TG_stt)))
    print("Number train edges stt: {}".format(len(train_stt)))
    print("Number test edges stt: {}".format(len(test_stt)))
    print("Number of nodes in train graph: {}".format(len(TG_stt.nodes)))

    # Preprocess the graph again, this time keeping all connected components
    PSG, ids = pp.prep_graph(SG,
                             relabel=True,
                             del_self_loops=True,
                             maincc=False)

    # Compute train/test splits
    start = time.time()
    train_rstt, test_rstt = stt.rand_split_train_test(PSG,
                                                      train_frac=train_frac)
    end = time.time() - start
    print("\nExec time rand_stt: {}".format(end))

    # Check that the train graph generated with rstt has one single cc
    if directed:
        TG_rstt = nx.DiGraph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of weakly CCs with rstt: {}".format(
            nx.number_weakly_connected_components(TG_rstt)))
    else:
        TG_rstt = nx.Graph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of CCs with rstt: {}".format(
            nx.number_connected_components(TG_rstt)))
    print("Number train edges rstt: {}".format(len(train_rstt)))
    print("Number test edges rstt: {}".format(len(test_rstt)))
    print("Number of nodes in train graph: {}".format(len(TG_rstt.nodes)))
Exemplo n.º 10
0
from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit
from evalne.evaluation.score import Scoresheet
from evalne.utils import preprocess as pp

# Load and preprocess the network (default prep_graph options; id mapping discarded)
#G = pp.load_graph('evalne/tests/data/network.edgelist')
G = pp.load_graph(
    '../Graph_Conv_Neural_Nets/generic_datasets/Zachary-Karate/Zachary-Karate.edgelist'
)
G, _ = pp.prep_graph(G)

# Create an evaluator and generate train/test edge split (80% of edges in train)
traintest_split = EvalSplit(
)  # Bhevencious: EvalSplit() contains methods used to READ/SET a variety of properties/variables. Use the DOT & PARENTHESIS helpers to access parameters.
traintest_split.compute_splits(G,
                               nw_name='Zachary-Karate.edgelist',
                               train_frac=0.8)
nee = LPEvaluator(traintest_split)

# Create a Scoresheet to store the results
scoresheet = Scoresheet()

# Set the baselines
methods = [
    'adamic_adar_index', 'common_neighbours', 'jaccard_coefficient', 'katz',
    'preferential_attachment', 'resource_allocation_index', 'random_prediction'
]

# Evaluate baselines
for method in methods:
Exemplo n.º 11
0
# NOTE(review): `output_path`, `dataset_path` and `directed` are defined earlier
# in this script, outside this excerpt; `time` presumably comes from
# `from time import time` -- confirm against the imports.
start = time()

# Create folders for the results if these do not exist
if not os.path.exists(output_path):
    os.makedirs(output_path)

traintest_path = os.path.join(output_path, "lp_train_test_splits")
if not os.path.exists(traintest_path):
    os.makedirs(traintest_path)

# ---------------
# Preprocess data
# ---------------

# Load the data as a directed graph
G = pp.load_graph(dataset_path, delimiter=",", comments='#', directed=directed)

# Get some graph statistics (printed to stdout when no path is given)
pp.get_stats(G)

# Or store them to a file
pp.get_stats(G, os.path.join(output_path, "stats.txt"))

# Preprocess the graph (relabel nodes, drop self-loops)
SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True)

# Get non-edges so that the reversed edge exists in the graph
if directed:
    redges = pp.get_redges_false(SG, output_path=os.path.join(output_path, "redges.csv"))

# Store the graph to a file