Example #1

# Assumed imports (the original snippet omits them); preprocess,
# eval_baselines and eval_other are helper functions defined elsewhere in
# the same script.
import os

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.score import Scoresheet
from evalne.evaluation.split import EvalSplit

def main():
    # Initialize some parameters
    inpath = list()
    nw_names = ['network',
                'blogCatalog']  # Stores the names of the networks evaluated
    inpath.append("../evalne/tests/data/network.edgelist")
    # inpath.append("../../data/BlogCatalog/blog.edgelist")
    outpath = "./output/"
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    directed = False  # indicates whether the graphs are directed or undirected
    delimiters = (',', '\t')  # delimiter used in each original edgelist file
    repeats = 2  # number of times the experiment will be repeated

    # Create a scoresheet to store the results
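    # (tr_te='test' stores the test-set scores; compare Example #6, which
    # uses tr_te='train' for the network reconstruction task)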
    scoresheet = Scoresheet(tr_te='test')

    for i in range(len(inpath)):

        # Make sure the output folder for the evaluation results exists
        if not os.path.exists(outpath):
            os.makedirs(outpath)

        # Load and preprocess the graph
        G = preprocess(inpath[i], outpath, delimiters[i], directed)

        # For each repeat of the experiment generate new data splits
        for repeat in range(repeats):
            print('Repetition {} of experiment'.format(repeat))

            # Generate one train/test split with default parameters
            traintest_split = EvalSplit()
            traintest_split.compute_splits(G,
                                           nw_name=nw_names[i],
                                           train_frac=0.8,
                                           split_id=repeat)

            trainvalid_split = EvalSplit()
            trainvalid_split.compute_splits(traintest_split.TG,
                                            nw_name=nw_names[i],
                                            train_frac=0.9,
                                            split_id=repeat)

            # Create an evaluator
            nee = LPEvaluator(traintest_split, trainvalid_split)

            # Evaluate baselines
            eval_baselines(nee, directed, scoresheet)

            # Evaluate other NE methods
            eval_other(nee, scoresheet)

    # Write results averaged over exp repeats to a single file
    scoresheet.write_tabular(filename=os.path.join(outpath, 'eval_output.txt'),
                             metric='auroc')

    print("End of evaluation")
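
# eval_baselines and eval_other above are helper functions defined in the
# original script. A minimal sketch of what eval_baselines might look like,
# following the baseline-evaluation pattern shown in Example #3 (the exact
# set of heuristics is an assumption):
def eval_baselines(nee, directed, scoresheet):
    # `directed` could be used to pick directed variants of the heuristics
    for method in ['random_prediction', 'common_neighbours',
                   'jaccard_coefficient']:
        result = nee.evaluate_baseline(method=method)
        scoresheet.log_results(result)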
Example #2

# Assumed imports (the original snippet omits them); `f` is the project's own
# helper functions module, imported here under an assumed name.
import argparse
import datetime
import gc
import multiprocessing
import os
import subprocess

import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from sklearn.ensemble import RandomForestClassifier

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit
import functions as f

def main(args=None):

    cpu_number = multiprocessing.cpu_count()

    parser = argparse.ArgumentParser(description='Path of networks')
    parser.add_argument('-n', type=str, help='Multiplex 1')
    parser.add_argument('-m', type=str, help='Multiplex 2')
    parser.add_argument('-b', type=str, help='Bipartite')

    args = parser.parse_args(args)
    print(args)

    ########################################################################
    # Parameters multiverse and train/test
    ########################################################################
    EMBED_DIMENSION = 128
    CLOSEST_NODES = np.int64(300)
    NUM_SAMPLED = np.int64(10)
    LEARNING_RATE = np.float64(0.01)
    KL = False
    NB_CHUNK = np.int64(1)
    CHUNK_SIZE = np.int64(100)
    NUM_STEPS_1 = np.int64(100 * 10**6 / CHUNK_SIZE)

    # If toy example
    #EMBED_DIMENSION = 128
    #CLOSEST_NODES = np.int64(2)
    #NUM_SAMPLED = np.int64(10)
    #LEARNING_RATE = np.float64(0.01)
    #KL = False
    #NB_CHUNK = np.int64(1)
    #CHUNK_SIZE = np.int64(2)
    #NUM_STEPS_1 = np.int64(100*10**6/CHUNK_SIZE)

    train_frac = 0.7
    solver = 'lbfgs'
    max_iter = 1000
    split_alg = 'random'
    # NOTE: min_impurity_split and max_features='auto' are deprecated and
    # have been removed in recent scikit-learn releases; update them when
    # running on a current version.
    lp_model = RandomForestClassifier(n_estimators=400, criterion='gini',
                                      max_depth=None, min_samples_split=2,
                                      min_samples_leaf=1,
                                      min_weight_fraction_leaf=0.0,
                                      max_features='auto', max_leaf_nodes=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None, bootstrap=True,
                                      oob_score=True, n_jobs=cpu_number,
                                      random_state=777, verbose=0,
                                      warm_start=False)

    graph_name = 'Test_Eval'

    ##################################################################################
    # !! Careful !!
    # Check that the nodes of the bipartite network also appear in the
    # multiplex networks. If not, remove from the multiplexes the nodes that
    # are not included in the bipartite (see the sketch after the
    # preprocessing step below).
    ##################################################################################

    ###################################################################################
    # EvalNE Link prediction processing
    ###################################################################################

    data_bipartite = pd.read_csv(args.b, delimiter=' ', header=None)
    data_bipartite = data_bipartite.drop(columns=[0, 3])
    data_bipartite.to_csv('bipartite_2colformat.csv',
                          header=False,
                          index=False,
                          sep=' ')
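
    # Hedged sketch of the node-consistency check warned about above: keep in
    # each multiplex layer only the nodes that appear in the bipartite
    # network. `multiplex_layers` (a list of networkx graphs) is hypothetical;
    # this example passes the multiplex files to R untouched.
    # bip_nodes = set(data_bipartite[1]) | set(data_bipartite[2])
    # for G_layer in multiplex_layers:
    #     G_layer.remove_nodes_from([v for v in G_layer if v not in bip_nodes])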

    G_heterogeneous = f.preprocess('bipartite_2colformat.csv', '.', ' ',
                                   False, False, True)
    print('Preprocessing done')
    G_heterogeneous_traintest_split = EvalSplit()
    G_heterogeneous_traintest_split.compute_splits(G_heterogeneous,
                                                   split_alg=split_alg,
                                                   train_frac=train_frac,
                                                   owa=False)
    nee = LPEvaluator(G_heterogeneous_traintest_split,
                      dim=EMBED_DIMENSION,
                      lp_model=lp_model)
    G_heterogeneous_split = G_heterogeneous_traintest_split.TG
    os.replace('bipartite_2colformat.csv',
               './Generated_graphs/' + 'bipartite_2colformat.csv')
    print('Splitting done')

    # Write the bipartite training graph for multiverse in extended edgelist format 'layer n1 n2 weight'
    file_multi = open('bipartite_training_graph_' + '_' + graph_name, 'w+')
    tmp_array_het = np.asarray(G_heterogeneous_split.edges)

    # Orient each edge so that nodes from the bipartite's second column end
    # up in the second position
    for i in range(len(tmp_array_het[:, 0])):
        if tmp_array_het[i, 0] in list(data_bipartite[2]):
            tmp_array_het[i, 0], tmp_array_het[i, 1] = \
                tmp_array_het[i, 1], tmp_array_het[i, 0]

    # Append a weight column of ones, then prepend a layer column of ones
    tmp_array_het = np.hstack((tmp_array_het, np.ones(
        (len(tmp_array_het), 1))))
    tmp_array_het = np.hstack((np.ones(
        (len(tmp_array_het), 1)), tmp_array_het))
    tmp_array_het = np.int_(tmp_array_het)

    np.savetxt(file_multi,
               tmp_array_het,
               fmt='%s',
               delimiter=' ',
               newline=os.linesep)

    file_multi.close()
    os.replace(
        'bipartite_training_graph_' + '_' + graph_name, './Generated_graphs/' +
        'bipartite_training_graph_' + '_' + graph_name + '.txt')

    ###################################################################################
    # MULTIVERSE
    ###################################################################################
    r_readRDS = robjects.r['readRDS']

    print('RWR-MH')
    proc = subprocess.Popen([
        'Rscript', './RWR/GenerateSimMatrix_MH.R',
        '-n', '.' + args.n,
        '-m', '.' + args.m,
        '-b', '../Generated_graphs/' + 'bipartite_training_graph_' + '_' +
        graph_name + '.txt',
        '-o', '../ResultsRWR/MatrixSimilarityMultiplexHet' + graph_name,
        '-c', str(cpu_number)
    ])

    proc.wait()
    proc.kill()
    print('RWR done')

    r_DistancematrixPPI = r_readRDS(
        './ResultsRWR/MatrixSimilarityMultiplexHet' + graph_name + '.rds')

    gc.collect()

    ########################################################################
    # Processing of the network
    ########################################################################
    reverse_data_DistancematrixPPI, list_neighbours, nodes, data_DistancematrixPPI, neighborhood, nodesstr \
     = f.netpreprocess_hetero(r_DistancematrixPPI, CLOSEST_NODES)

    ########################################################################
    # Initialization
    ########################################################################

    embeddings = np.random.normal(0, 1, [np.size(nodes), EMBED_DIMENSION])

    ########################################################################
    # Training and saving best embeddings
    ########################################################################
    # Train and test during training
    neighborhood = np.asarray(neighborhood)
    nodes = np.asarray(nodes)

    embeddings = f.train(neighborhood, nodes, list_neighbours, NUM_STEPS_1, NUM_SAMPLED, LEARNING_RATE, \
                         CLOSEST_NODES, CHUNK_SIZE, NB_CHUNK, embeddings, reverse_data_DistancematrixPPI)

    X = dict(zip(range(embeddings.shape[0]), embeddings))
    X = {str(int(nodesstr[key]) + 1): X[key] for key in X}
    np.save('embeddings_MH', X)
    date = datetime.datetime.now()
    os.replace('embeddings_MH.npy',
               './ResultsMultiVERSE/' + 'embeddings_MH.npy')

    ########################################################################
    # Link prediction for evaluation of MH
    ########################################################################

    edge_emb = ['hadamard', 'weighted_l1', 'weighted_l2', 'average', 'cosine']
    results_embeddings_methods = dict()

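    # For each edge-embedding operator, evaluate the trained embeddings; the
    # indexing get_all()[1][4] below appears to select the AUROC value from
    # the result object (an assumption based on the AUC labels used when the
    # results are written out).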
    for i in range(len(edge_emb)):
        tmp_result_multiverse = nee.evaluate_ne(data_split=nee.traintest_split,
                                                X=X,
                                                method="Multiverse",
                                                edge_embed_method=edge_emb[i],
                                                label_binarizer=lp_model)
        results_embeddings_methods[tmp_result_multiverse.method + '_' + str(
            edge_emb[i])] = tmp_result_multiverse.get_all()[1][4]

    ########################################################################
    # Analysis and saving of the results
    ########################################################################

    Result_file = 'Result_LinkpredMultiplexHet_' + graph_name + '_' + str(
        date) + '.txt'
    with open(Result_file, "w+") as overall_result:
        print("%s: \n\
                EMBED_DIMENSION: %s \n\
                CLOSEST_NODES: %s  \n\
                NUM_STEPS_1: %s  \n\
                NUM_SAMPLED: %s  \n\
                LEARNING_RATE: %s  \n\
                CHUNK_SIZE: %s  \n\
                NB_CHUNK: %s  \n\
                train_frac: %s \n\
                solver: %s \n\
                max_iter: %s  \n\
                split_alg: %s  \n\
                " % (str(date), EMBED_DIMENSION, CLOSEST_NODES, NUM_STEPS_1,
                     NUM_SAMPLED, LEARNING_RATE, CHUNK_SIZE, NB_CHUNK,
                     train_frac, solver, max_iter, split_alg),
              file=overall_result)

        print('Overall MULTIVERSE AUC hadamard:',
              results_embeddings_methods['Multiverse_hadamard'],
              file=overall_result)
        print('Overall MULTIVERSE AUC weighted_l1:',
              results_embeddings_methods['Multiverse_weighted_l1'],
              file=overall_result)
        print('Overall MULTIVERSE AUC weighted_l2:',
              results_embeddings_methods['Multiverse_weighted_l2'],
              file=overall_result)
        print('Overall MULTIVERSE AUC average:',
              results_embeddings_methods['Multiverse_average'],
              file=overall_result)
        print('Overall MULTIVERSE AUC cosine:',
              results_embeddings_methods['Multiverse_cosine'],
              file=overall_result)

    # (the `with` block above already closed the file)
    os.replace(Result_file, './ResultsMultiVERSE/' + Result_file)

    print('End')
Example #3
# Network reconstruction and sign prediction can be computed in the same way
# by simply substituting LPEvaluator and LPEvalSplit with NREvaluator and
# NREvalSplit, or SPEvaluator and SPEvalSplit (a sketch of this substitution
# follows the example below).

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.score import Scoresheet
from evalne.evaluation.split import LPEvalSplit
from evalne.utils import preprocess as pp

# Load and preprocess the network
G = pp.load_graph('../../evalne/tests/data/network.edgelist')
G, _ = pp.prep_graph(G)

# Create an evaluator and generate train/test edge split
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G)
nee = LPEvaluator(traintest_split)

# Create a Scoresheet to store the results
scoresheet = Scoresheet()

# Set the baselines
methods = ['random_prediction', 'common_neighbours', 'jaccard_coefficient']

# Evaluate baselines
for method in methods:
    result = nee.evaluate_baseline(method=method)
    scoresheet.log_results(result)

try:
    # Check if OpenNE is installed
    import openne
    # (the original snippet is truncated here; the usual pattern evaluates
    # the OpenNE-based methods inside this block)
except ImportError:
    print('The OpenNE library is not installed. Skipping those methods.')
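
# A minimal sketch of the substitution described at the top of this example,
# assuming NREvalSplit and NREvaluator expose the same interface as their LP
# counterparts:
from evalne.evaluation.evaluator import NREvaluator
from evalne.evaluation.split import NREvalSplit

nr_split = NREvalSplit()
nr_split.compute_splits(G)
nee_nr = NREvaluator(nr_split)
for method in methods:
    scoresheet.log_results(nee_nr.evaluate_baseline(method=method))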
Example #4

# Assumed imports (the snippet omits them); Tuning and
# create_self_defined_dataset come from the surrounding project, not from
# EvalNE.
import random

import numpy as np
import pandas as pd

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit

class LinkPredictionTuning(Tuning):
    r"""

    Clase general de entrenamiento y testeo de embeddings de grafos para la tarea de prediccion de enlaces.

    Parameters
    ----------
    G: NetworkX graph
        Grafo de entrenamiento.
    G_test: NetworkX graph
        Grafo de testeo.
    root: str
        directorio en el que se guardaran los resultados
    """
    def __init__(self, G, G_test, root="results/lp/"):
        super(LinkPredictionTuning, self).__init__(G, root=root)
        self.task = "lp"

        train_E = G.edges
        train_E_false = self.GetNegativeEdges(G, len(train_E))

        test_E = G_test.edges
        test_E_false = self.GetNegativeEdges(G_test, len(test_E))

        self.split = EvalSplit()
        self.split.set_splits(train_E,
                              train_E_false=train_E_false,
                              test_E=test_E,
                              test_E_false=test_E_false,
                              TG=G)

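        # create_self_defined_dataset is an external helper (assumed to come
        # from the embedding framework this project builds on); it wraps the
        # training graph in a dataset object. `self.tipo` is set by the
        # parent Tuning class.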
        self.training_graph = create_self_defined_dataset(root_dir="",
                                                          name_dict={},
                                                          name="training " +
                                                          self.tipo,
                                                          weighted=True,
                                                          directed=False,
                                                          attributed=True)()
        self.training_graph.set_g(G)

        self.evaluator = LPEvaluator(self.split)

    def GetNegativeEdges(self, G, n):
        r"""

        Metodo auxiliar que muestrea enlaces negativos.

        Parameters
        ----------
        G: NetworkX graph
           Grafo bipartito.
        n: int
            cantidad de enlaces que muestrear.
        """

        prop_nodes = [n for n, d in G.nodes(data=True) if d['bipartite'] == 0]
        user_nodes = [n for n, d in G.nodes(data=True) if d['bipartite'] == 1]

        non_edges = []

        while len(non_edges) < n:  # sample exactly n negative edges
            random_prop = random.choice(prop_nodes)
            random_user = random.choice(user_nodes)
            edge = (random_prop, random_user)
            if G.has_edge(*edge):
                continue
            else:
                non_edges.append(edge)
        return non_edges

    def TestModel(self, emb, time=-1, method_name="method_name"):
        r"""

        Testea un embedding y lo guarda en el scoresheet.

        Parameters
        ----------
        emb: dict
            diccionario de embeddings, llaves son los nodos y los valores una lista con el embedding
        time: float
            tiempo de ejecucion del metodo, para guardar en el scoresheet
        method_name: str
            nombre del metodo con el que guardar.
        """
        df = pd.DataFrame(emb).T
        X = df.T.to_dict("list")
        X = {str(k): np.array(v)
             for k, v in X.items()
             }  # values must be arrays because sums are computed on them

        self.evaluator.dim = df.shape[1]

        reslp = []
        for edge_method in [
                "weighted_l1", "weighted_l2", "hadamard", "average"
        ]:
            # TODO: avoid evaluating all four edge-embedding methods
            res = self.evaluator.evaluate_ne(self.split,
                                             X=X,
                                             method=method_name,
                                             edge_embed_method=edge_method,
                                             params={"nw_name": "GPI"})
            res.params.update({'eval_time': time})
            reslp.append(res)
        self.scoresheet.log_results(reslp)
        return reslp
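
# Hedged usage sketch for the class above; the bipartite graphs and the
# embedding dictionary are synthetic placeholders (commented out because the
# Tuning base class is project-specific):
# import networkx as nx
# G = nx.complete_bipartite_graph(3, 3)       # sets the 'bipartite' attribute
# G_test = nx.complete_bipartite_graph(3, 3)
# tuner = LinkPredictionTuning(G, G_test)
# emb = {node: np.random.rand(16).tolist() for node in G}
# results = tuner.TestModel(emb, time=1.0, method_name="random_emb")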
Example #5

# Assumed imports (the original snippet omits them); `f` is the project's own
# helper functions module, imported here under an assumed name.
import argparse
import datetime
import multiprocessing
import os
import subprocess
from operator import itemgetter

import numpy as np
import rpy2.robjects as robjects
from sklearn.linear_model import LogisticRegressionCV

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit
import functions as f

def main(args=None):

    cpu_number = multiprocessing.cpu_count()

    parser = argparse.ArgumentParser(description='Path of networks')
    parser.add_argument('-m', type=str, help='Multiplex')

    args = parser.parse_args(args)
    graph_path = args.m

    ########################################################################
    # Parameters multiverse and train/test
    ########################################################################

    EMBED_DIMENSION = 128
    CLOSEST_NODES = np.int64(20)
    NUM_SAMPLED = np.int64(3)
    LEARNING_RATE = np.float64(0.01)
    NB_CHUNK = np.int64(1)
    CHUNK_SIZE = np.int64(10)
    NUM_STEPS_1 = np.int64(100 * 10**6 / CHUNK_SIZE)
    graph_name = os.path.basename(graph_path)
    train_frac = 0.7
    solver = 'lbfgs'
    max_iter = 2000
    split_alg = 'spanning_tree'

    # NOTE: multi_class is deprecated in recent scikit-learn releases.
    lp_model = LogisticRegressionCV(Cs=10, cv=5, class_weight=None,
                                    dual=False, fit_intercept=True,
                                    intercept_scaling=1.0, max_iter=max_iter,
                                    multi_class='ovr', n_jobs=cpu_number,
                                    random_state=None, refit=True,
                                    scoring='roc_auc', solver=solver,
                                    tol=0.0001, verbose=0)

    edge_data_by_type, _, all_nodes = f.load_network_data(graph_path)
    nb_layers = len(edge_data_by_type.keys())

    # Divide multiplex graph in several in edgelist format
    for layer in range(nb_layers - 1):
        file = open(
            'multiplex_graph_layer_' + str(layer + 1) + '_' + graph_name, 'w+')
        tmp_array = np.asarray(edge_data_by_type[str(layer + 1)])
        np.savetxt(file, tmp_array, fmt='%s')
        file.close()
        os.replace(
            'multiplex_graph_layer_' + str(layer + 1) + '_' + graph_name,
            'Generated_graphs/' + 'multiplex_graph_layer_' + str(layer + 1) +
            '_' + graph_name)

    # Load each graph with EvalNE, preprocess and split train/test edges
    nee = list()
    G_original = list()
    Gsplit = list()
    traintestsplit = list()
    for layer in range(nb_layers - 1):
        G_original.append(
            f.preprocess('./Generated_graphs/' + 'multiplex_graph_layer_' +
                         str(layer + 1) + '_' + graph_name,
                         '.',
                         ' ',
                         directed=False,
                         relabel=False,
                         del_self_loops=True))
        G_original_traintest_split = EvalSplit()
        G_original_traintest_split.compute_splits(G_original[layer],
                                                  split_alg=split_alg,
                                                  train_frac=train_frac,
                                                  owa=False)
        traintestsplit.append(G_original_traintest_split)
        nee.append(
            LPEvaluator(G_original_traintest_split,
                        dim=EMBED_DIMENSION,
                        lp_model=lp_model))
        Gsplit.append(G_original_traintest_split.TG)

    # Write the multiplex training graph for multiverse in extended edgelist format 'layer n1 n2 weight'
    file_multi = open('multiverse_graph_' + 'training' + '_' + graph_name,
                      'w+')
    tmp_array_multi = []
    sorted_matrix_train_edges = []  # kept for reference; unused further down
    for layer in range(nb_layers - 1):

        # Edges of this layer with a weight column of ones appended and the
        # 1-based layer id prepended
        tmp_array = np.asarray(Gsplit[layer].edges)
        tmp_array = np.hstack((tmp_array, np.ones((len(tmp_array), 1))))
        tmp_array = np.hstack(((layer + 1) * np.ones(
            (len(tmp_array), 1)), tmp_array))
        tmp_array_multi.append(tmp_array)

        # Same edges with a 0-based layer id, sorted by source node
        tmp_array_mat_train_edges = np.asarray(Gsplit[layer].edges)
        tmp_array_mat_train_edges = np.hstack(
            (tmp_array_mat_train_edges,
             np.ones((len(tmp_array_mat_train_edges), 1))))
        tmp_array_mat_train_edges = np.hstack((layer * np.ones(
            (len(tmp_array), 1)), tmp_array_mat_train_edges))
        sorted_matrix_train_edges.extend(
            sorted(tmp_array_mat_train_edges, key=itemgetter(1)))

    tmp_array_multi = np.vstack(tmp_array_multi)
    tmp_array_multi = np.int_(tmp_array_multi)
    np.savetxt(file_multi,
               tmp_array_multi,
               fmt='%s',
               delimiter=' ',
               newline=os.linesep)

    file_multi.close()
    os.replace(
        'multiverse_graph_' + 'training' + '_' + graph_name,
        './Generated_graphs/' + 'multiverse_graph_' + 'training' + '_' +
        graph_name + '.txt')

    ###################################################################################
    # MULTIVERSE
    ###################################################################################
    r_readRDS = robjects.r['readRDS']

    proc = subprocess.Popen([
        'Rscript', './RWR/GenerateSimMatrix.R',
        '-n', '../Generated_graphs/' + 'multiverse_graph_' + 'training' +
        '_' + graph_name + '.txt',
        '-o', '../ResultsRWR/MatrixSimilarityMultiplex' + graph_name,
        '-c', str(cpu_number)
    ])

    proc.wait()
    proc.kill()
    print('RWR done')
    r_DistancematrixPPI = r_readRDS('./ResultsRWR/MatrixSimilarityMultiplex' +
                                    graph_name + '.rds')

    ########################################################################
    # Processing of the network
    ########################################################################
    reverse_data_DistancematrixPPI, list_neighbours, nodes, data_DistancematrixPPI, nodes_incomponent, neighborhood, nodesstr \
     = f.netpreprocess(r_DistancematrixPPI, graph_path, CLOSEST_NODES)

    ########################################################################
    # Initialization
    ########################################################################
    embeddings = np.random.normal(0, 1, [np.size(nodes), EMBED_DIMENSION])

    ########################################################################
    # Training and saving best embeddings
    ########################################################################

    nodes = np.asarray(nodes)
    embeddings = f.train(neighborhood, nodes, list_neighbours, NUM_STEPS_1, NUM_SAMPLED, LEARNING_RATE, \
                         CLOSEST_NODES, CHUNK_SIZE, NB_CHUNK, embeddings, reverse_data_DistancematrixPPI)

    X = dict(zip(range(embeddings.shape[0]), embeddings))
    X = {str(int(nodesstr[key]) + 1): X[key] for key in X}
    np.save('embeddings_M', X)
    date = datetime.datetime.now()
    os.replace('embeddings_M.npy', './ResultsMultiVERSE/' + 'embeddings_M.npy')

    print('Embedding done')

    ########################################################################
    # Evaluation on link prediction
    ########################################################################

    edge_emb = ['hadamard', 'weighted_l1', 'weighted_l2', 'average', 'cosine']
    results_embeddings_methods = dict()
    date = datetime.datetime.now()
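    # Evaluate the shared MultiVERSE embeddings on every layer and every
    # edge-embedding operator; as above, get_all()[1][4] appears to select
    # the AUROC value from the result object.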
    for layer in range(nb_layers - 1):
        for i in range(len(edge_emb)):
            tmp_result_multiverse = nee[layer].evaluate_ne(
                data_split=nee[layer].traintest_split,
                X=X,
                method="Multiverse",
                edge_embed_method=edge_emb[i],
                label_binarizer=lp_model)
            results_embeddings_methods[
                tmp_result_multiverse.method + '_' + str(layer) +
                str(edge_emb[i])] = tmp_result_multiverse.get_all()[1][4]
    print('Evaluation done')

    ########################################################################
    # Analysis and saving of the results
    ########################################################################

    tmp_Multiverse_Result_hada = 0
    tmp_Multiverse_Result_wl1 = 0
    tmp_Multiverse_Result_wL2 = 0
    tmp_Multiverse_Result_avg = 0
    tmp_Multiverse_Result_cos = 0

    for layer in range(nb_layers - 1):
        tmp_Multiverse_Result_hada += results_embeddings_methods[
            'Multiverse' + '_' + str(layer) + str(edge_emb[0])]
        tmp_Multiverse_Result_wl1 += results_embeddings_methods[
            'Multiverse' + '_' + str(layer) + str(edge_emb[1])]
        tmp_Multiverse_Result_wL2 += results_embeddings_methods[
            'Multiverse' + '_' + str(layer) + str(edge_emb[2])]
        tmp_Multiverse_Result_avg += results_embeddings_methods[
            'Multiverse' + '_' + str(layer) + str(edge_emb[3])]
        tmp_Multiverse_Result_cos += results_embeddings_methods[
            'Multiverse' + '_' + str(layer) + str(edge_emb[4])]

    results_embeddings_methods[
        'Multiverse_av_hadamard'] = tmp_Multiverse_Result_hada / (nb_layers -
                                                                  1)
    results_embeddings_methods[
        'Multiverse_av_weighted_l1'] = tmp_Multiverse_Result_wl1 / (nb_layers -
                                                                    1)
    results_embeddings_methods[
        'Multiverse_av_weighted_l2'] = tmp_Multiverse_Result_wL2 / (nb_layers -
                                                                    1)
    results_embeddings_methods[
        'Multiverse_av_average'] = tmp_Multiverse_Result_avg / (nb_layers - 1)
    results_embeddings_methods[
        'Multiverse_av_cosine'] = tmp_Multiverse_Result_cos / (nb_layers - 1)

    # Save results
    Result_file = 'Result_Linkpred_Multiplex_' + graph_name + '_' + str(
        date) + '.txt'
    with open(Result_file, "w+") as overall_result:
        print("%s: \n\
                EMBED_DIMENSION: %s \n\
                CLOSEST_NODES: %s  \n\
                NUM_STEPS_1: %s  \n\
                NUM_SAMPLED: %s  \n\
                LEARNING_RATE: %s  \n\
                CHUNK_SIZE: %s  \n\
                NB_CHUNK: %s  \n\
                train_frac: %s \n\
                solver: %s \n\
                max_iter: %s  \n\
                split_alg: %s  \n\
                " % (str(date), EMBED_DIMENSION, CLOSEST_NODES, NUM_STEPS_1,
                     NUM_SAMPLED, LEARNING_RATE, CHUNK_SIZE, NB_CHUNK,
                     train_frac, solver, max_iter, split_alg),
              file=overall_result)

        print('Overall MULTIVERSE AUC hadamard:',
              results_embeddings_methods['Multiverse_av_hadamard'],
              file=overall_result)
        print('Overall MULTIVERSE AUC weighted_l1:',
              results_embeddings_methods['Multiverse_av_weighted_l1'],
              file=overall_result)
        print('Overall MULTIVERSE AUC weighted_l2:',
              results_embeddings_methods['Multiverse_av_weighted_l2'],
              file=overall_result)
        print('Overall MULTIVERSE AUC average:',
              results_embeddings_methods['Multiverse_av_average'],
              file=overall_result)
        print('Overall MULTIVERSE AUC cosine:',
              results_embeddings_methods['Multiverse_av_cosine'],
              file=overall_result)

    # (the `with` block above already closed the file)
    os.replace(Result_file, './ResultsMultiVERSE/' + Result_file)

    print('End')
Example #6

# Assumed imports (the original snippet omits them); preprocess,
# eval_baselines and eval_other are helpers defined elsewhere in the same
# script.
import logging
import os
import random
import time
from datetime import datetime

import networkx as nx
import numpy as np
from tqdm import tqdm

from evalne.evaluation.evaluator import LPEvaluator, NCEvaluator, NREvaluator
from evalne.evaluation.score import Scoresheet
from evalne.evaluation.split import EvalSplit
from evalne.utils import preprocess as pp
from evalne.utils import split_train_test as stt

def evaluate(setup):
    # Set the random seed
    random.seed(setup.seed)
    np.random.seed(setup.seed)

    # Get input and output paths
    inpaths = setup.inpaths
    filename = '{}_eval_{}'.format(setup.task,
                                   datetime.now().strftime("%m%d_%H%M"))
    outpath = os.path.join(os.getcwd(), filename)
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    # Logging configuration (file opened in append mode)
    logging.basicConfig(filename=os.path.join(outpath, 'eval.log'),
                        format='%(asctime)s - %(levelname)s: %(message)s',
                        datefmt='%d-%m-%y %H:%M:%S',
                        level=logging.INFO)
    logging.info('Evaluation start')
    if setup.task != 'nc':
        logging.info('Running evaluation using classifier: {}'.format(
            setup.lp_model))

    # Create a Scoresheet object to store all results
    if setup.task == 'nr':
        scoresheet = Scoresheet(tr_te='train', precatk_vals=setup.precatk_vals)
    else:
        scoresheet = Scoresheet(tr_te='test', precatk_vals=setup.precatk_vals)

    # Initialize some variables
    edge_split_time = list()
    lp_coef = dict()
    repeats = setup.lp_num_edge_splits if setup.task == 'lp' else 1
    t = tqdm(total=len(inpaths) * repeats)
    t.set_description(desc='Progress on {} task'.format(setup.task))

    # Loop over all input networks
    for i in range(len(inpaths)):
        logging.info('====== Evaluating {} network ======'.format(
            setup.names[i]))
        print('\nEvaluating {} network...'.format(setup.names[i]))
        print('=====================================')

        # Create path to store info per network if needed
        nw_outpath = os.path.join(outpath, setup.names[i])
        if setup.save_prep_nw or setup.curves != '':
            if not os.path.exists(nw_outpath):
                os.makedirs(nw_outpath)

        # Load and preprocess the graph
        G, ids = preprocess(setup, nw_outpath, i)
        if setup.task == 'nc':
            try:
                labels = pp.read_labels(setup.labelpaths[i], idx_mapping=ids)
            except (ValueError, IOError):
                logging.exception(
                    'Exception occurred while reading labels of `{}` network. Skipping network eval...'
                    .format(setup.names[i]))
                break

        # For each repeat of the experiment generate new edge splits
        for repeat in range(repeats):
            logging.info(
                '------ Repetition {} of experiment ------'.format(repeat))
            print('\nRepetition {} of experiment...'.format(repeat))
            print('-------------------------------------')

            # Create train and validation edge splits
            traintest_split = EvalSplit()
            trainvalid_split = EvalSplit()

            split_time = time.time()
            if setup.task == 'lp':
                # For LP compute train/test and train/valid splits
                traintest_split.compute_splits(G,
                                               nw_name=setup.names[i],
                                               train_frac=setup.traintest_frac,
                                               split_alg=setup.split_alg,
                                               owa=setup.owa,
                                               fe_ratio=setup.fe_ratio,
                                               split_id=repeat,
                                               verbose=setup.verbose)
                trainvalid_split.compute_splits(
                    traintest_split.TG,
                    nw_name=setup.names[i],
                    train_frac=setup.trainvalid_frac,
                    split_alg=setup.split_alg,
                    owa=setup.owa,
                    fe_ratio=setup.fe_ratio,
                    split_id=repeat,
                    verbose=setup.verbose)
                # traintest_split.save_tr_graph(nw_outpath + '/TG_rep_{}'.format(repeat), ',', True, False, False)
                # Create an LP evaluator
                nee = LPEvaluator(traintest_split, trainvalid_split,
                                  setup.embed_dim, setup.lp_model)

            elif setup.task == 'nr':
                # For NR set TG = G no train/valid split needed and get random subset of true and false edges for pred
                pos_e, neg_e = stt.random_edge_sample(nx.adj_matrix(G),
                                                      setup.nr_edge_samp_frac,
                                                      nx.is_directed(G))
                if len(pos_e) == 0:
                    logging.error(
                        'Sampling fraction {} on {} network returned 0 positive edges. Skipping evaluation...'
                        .format(setup.nr_edge_samp_frac, setup.names[i]))
                    break
                traintest_split.set_splits(train_E=pos_e,
                                           train_E_false=neg_e,
                                           test_E=None,
                                           test_E_false=None,
                                           directed=nx.is_directed(G),
                                           nw_name=setup.names[i],
                                           TG=G)
                # Create an NR evaluator
                nee = NREvaluator(traintest_split, setup.embed_dim,
                                  setup.lp_model)

            else:
                # Create an NC evaluator (train/valid fraction hardcoded to 20%)
                nee = NCEvaluator(G, labels, setup.names[i],
                                  setup.nc_num_node_splits,
                                  setup.nc_node_fracs, 0.2, setup.embed_dim)

            edge_split_time.append(time.time() - split_time)

            # Evaluate baselines
            if setup.lp_baselines is not None and setup.task != 'nc':
                eval_baselines(setup, nee, i, scoresheet, repeat, nw_outpath)

            # Evaluate other NE methods
            if setup.methods_opne is not None or setup.methods_other is not None:
                lp_coef = eval_other(setup, nee, i, scoresheet, repeat,
                                     nw_outpath)

            # Update progress bar
            t.update(1)

        # Store in a pickle file the results up to this point in evaluation
        scoresheet.write_pickle(os.path.join(outpath, 'eval.pkl'))

    # Store the results
    if setup.scores is not None:
        if setup.scores == 'all':
            scoresheet.write_all(
                filename=os.path.join(outpath, 'eval_output.txt'))
        else:
            scoresheet.write_tabular(filename=os.path.join(
                outpath, 'eval_output.txt'),
                                     metric=setup.scores)
            scoresheet.write_tabular(filename=os.path.join(
                outpath, 'eval_output.txt'),
                                     metric='eval_time')
    scoresheet.write_pickle(os.path.join(outpath, 'eval.pkl'))

    # Close progress bar
    t.close()
    print('Average edge split times per dataset:')
    print(setup.names)
    print(np.array(edge_split_time).reshape(-1, repeats).mean(axis=1))
    # if setup.task != 'nc':
    #     print('Coefficients of LP model ({}) for each NE method:'.format(setup.lp_model))
    #     print(lp_coef)
    logging.info('Evaluation end\n\n')
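
# Hedged usage sketch: `setup` must expose the attributes read above (seed,
# task, inpaths, names, lp_model, precatk_vals, scores, ...). In EvalNE this
# object is normally built from a .ini configuration file; the parser import
# below is an assumption, check the installed version:
# from evalne.evaluation.pipeline import EvalSetup
# setup = EvalSetup('conf.ini')
# evaluate(setup)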