Example No. 1
# Imports assumed by this example; the module name for the project-local
# helpers (imported as `f`) is inferred from usage, not confirmed here.
import argparse
import datetime
import gc
import multiprocessing
import os
import subprocess

import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from sklearn.ensemble import RandomForestClassifier

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit

import functions as f  # assumed: provides preprocess, netpreprocess_hetero, train

def main(args=None):

    cpu_number = multiprocessing.cpu_count()

    parser = argparse.ArgumentParser(description='Path of networks')
    parser.add_argument('-n', type=str, help='Multiplex 1')
    parser.add_argument('-m', type=str, help='Multiplex 2')
    parser.add_argument('-b', type=str, help='Bipartite')

    args = parser.parse_args(args)
    print(args)

    ########################################################################
    # MultiVERSE and train/test parameters
    ########################################################################
    EMBED_DIMENSION = 128
    CLOSEST_NODES = np.int64(300)
    NUM_SAMPLED = np.int64(10)
    LEARNING_RATE = np.float64(0.01)
    KL = False
    NB_CHUNK = np.int64(1)
    CHUNK_SIZE = np.int64(100)
    NUM_STEPS_1 = np.int64(100 * 10**6 / CHUNK_SIZE)

    # Parameters for the toy example:
    #EMBED_DIMENSION = 128
    #CLOSEST_NODES = np.int64(2)
    #NUM_SAMPLED = np.int64(10)
    #LEARNING_RATE = np.float64(0.01)
    #KL = False
    #NB_CHUNK = np.int64(1)
    #CHUNK_SIZE = np.int64(2)
    #NUM_STEPS_1 = np.int64(100*10**6/CHUNK_SIZE)

    train_frac = 0.7
    solver = 'lbfgs'
    max_iter = 1000
    split_alg = 'random'
    # Arguments omitted here are scikit-learn defaults; max_features='sqrt'
    # replaces the deprecated 'auto', which meant 'sqrt' for classifiers.
    lp_model = RandomForestClassifier(n_estimators=400, criterion='gini',
                                      max_features='sqrt', oob_score=True,
                                      n_jobs=cpu_number, random_state=777)

    graph_name = 'Test_Eval'

    ##################################################################################
    # !! Careful !!
    # Check that the nodes of the bipartite network also appear in the
    # multiplex networks. If they do not, remove from the multiplexes the
    # nodes that are missing from the bipartite network.
    ##################################################################################
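
    # A hedged sketch of that check, commented out because it assumes the
    # multiplex edgelists follow the 'layer n1 n2 weight' layout, which this
    # script never verifies:
    # mux = pd.read_csv(args.n, delimiter=' ', header=None)
    # bip = pd.read_csv(args.b, delimiter=' ', header=None)
    # missing = (set(mux[1]) | set(mux[2])) - (set(bip[1]) | set(bip[2]))
    # if missing:
    #     print('Multiplex nodes absent from the bipartite graph:', missing)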

    ###################################################################################
    # EvalNE Link prediction processing
    ###################################################################################

    # Keep only the two node columns of the bipartite edgelist.
    data_bipartite = pd.read_csv(args.b, delimiter=' ', header=None)
    data_bipartite = data_bipartite.drop(columns=[0, 3])
    data_bipartite.to_csv('bipartite_2colformat.csv',
                          header=None,
                          index=None,
                          sep=' ')

    G_heterogeneous = f.preprocess('bipartite_2colformat.csv', '.', ' ',
                                   False, False, True)
    print('Preprocessing done')
    G_heterogeneous_traintest_split = EvalSplit()
    G_heterogeneous_traintest_split.compute_splits(G_heterogeneous,
                                                   split_alg=split_alg,
                                                   train_frac=train_frac,
                                                   owa=False)
    nee = LPEvaluator(G_heterogeneous_traintest_split,
                      dim=EMBED_DIMENSION,
                      lp_model=lp_model)
    G_heterogeneous_split = G_heterogeneous_traintest_split.TG
    os.replace('bipartite_2colformat.csv',
               './Generated_graphs/' + 'bipartite_2colformat.csv')
    print('Splitting done')

    # Write the bipartite training graph for multiverse in extended edgelist format 'layer n1 n2 weight'
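    # A single line then looks like '1 42 137 1' (layer 1, an edge between
    # nodes 42 and 137, weight 1); the ids are illustrative.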
    bip_train_name = 'bipartite_training_graph_' + graph_name
    file_multi = open(bip_train_name, 'w+')
    tmp_array_het = np.asarray(G_heterogeneous_split.edges)

    # Orient every edge consistently: if the first endpoint belongs to the
    # second bipartite node set, swap the pair.
    second_set = set(data_bipartite[2])
    for i in range(len(tmp_array_het)):
        if tmp_array_het[i, 0] in second_set:
            tmp_array_het[i, 0], tmp_array_het[i, 1] = \
                tmp_array_het[i, 1], tmp_array_het[i, 0]

    # Prepend a layer column and append a weight column (both all ones) to
    # obtain the 'layer n1 n2 weight' format.
    tmp_array_het = np.hstack((tmp_array_het,
                               np.ones((len(tmp_array_het), 1))))
    tmp_array_het = np.hstack((np.ones((len(tmp_array_het), 1)),
                               tmp_array_het))
    tmp_array_het = np.int_(tmp_array_het)

    np.savetxt(file_multi,
               tmp_array_het,
               fmt='%s',
               delimiter=' ',
               newline=os.linesep)

    file_multi.close()
    os.replace(bip_train_name,
               './Generated_graphs/' + bip_train_name + '.txt')

    ###################################################################################
    # MULTIVERSE
    ###################################################################################
    r_readRDS = robjects.r['readRDS']

    print('RWR-MH')
    proc = subprocess.Popen([
        'Rscript', './RWR/GenerateSimMatrix_MH.R',
        '-n', '.' + args.n,
        '-m', '.' + args.m,
        '-b', '../Generated_graphs/' + bip_train_name + '.txt',
        '-o', '../ResultsRWR/MatrixSimilarityMultiplexHet' + graph_name,
        '-c', str(cpu_number)
    ])

    proc.wait()
    print('RWR done')

    r_DistancematrixPPI = r_readRDS(
        './ResultsRWR/MatrixSimilarityMultiplexHet' + graph_name + '.rds')

    # Free memory held by the R-to-Python conversion.
    gc.collect()

    ########################################################################
    # Processing of the network
    ########################################################################
    (reverse_data_DistancematrixPPI, list_neighbours, nodes,
     data_DistancematrixPPI, neighborhood,
     nodesstr) = f.netpreprocess_hetero(r_DistancematrixPPI, CLOSEST_NODES)

    ########################################################################
    # Initialization
    ########################################################################

    embeddings = np.random.normal(0, 1, [np.size(nodes), EMBED_DIMENSION])

    ########################################################################
    # Training and saving best embeddings
    ########################################################################
    # Train and test during training
    neighborhood = np.asarray(neighborhood)
    nodes = np.asarray(nodes)

    embeddings = f.train(neighborhood, nodes, list_neighbours, NUM_STEPS_1,
                         NUM_SAMPLED, LEARNING_RATE, CLOSEST_NODES, CHUNK_SIZE,
                         NB_CHUNK, embeddings, reverse_data_DistancematrixPPI)

    # Re-key the embeddings by the original (1-based) node identifiers.
    X = dict(zip(range(embeddings.shape[0]), embeddings))
    X = {str(int(nodesstr[key]) + 1): X[key] for key in X}
    np.save('embeddings_MH', X)
    date = datetime.datetime.now()
    os.replace('embeddings_MH.npy',
               './ResultsMultiVERSE/' + 'embeddings_MH.npy')

    ########################################################################
    # Link prediction for evaluation of MH
    ########################################################################

    edge_emb = ['hadamard', 'weighted_l1', 'weighted_l2', 'average', 'cosine']
    results_embeddings_methods = dict()

    for edge_method in edge_emb:
        tmp_result_multiverse = nee.evaluate_ne(data_split=nee.traintest_split,
                                                X=X,
                                                method="Multiverse",
                                                edge_embed_method=edge_method,
                                                label_binarizer=lp_model)
        # Keep the AUC score for this edge-embedding method.
        results_embeddings_methods[tmp_result_multiverse.method + '_' +
                                   edge_method] = \
            tmp_result_multiverse.get_all()[1][4]
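
    # For reference, what these operators compute from two node vectors u, v
    # (EvalNE applies them internally; this is only a reminder):
    #   hadamard:    u * v            (element-wise product)
    #   weighted_l1: |u - v|
    #   weighted_l2: (u - v) ** 2
    #   average:     (u + v) / 2
    #   cosine:      u.v / (|u| |v|)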

    ########################################################################
    # Analysis and saving of the results
    ########################################################################

    Result_file = 'Result_LinkpredMultiplexHet_' + graph_name + '_' + str(
        date) + '.txt'
    with open(Result_file, "w+") as overall_result:
        print("%s:\n"
              "EMBED_DIMENSION: %s\n"
              "CLOSEST_NODES: %s\n"
              "NUM_STEPS_1: %s\n"
              "NUM_SAMPLED: %s\n"
              "LEARNING_RATE: %s\n"
              "CHUNK_SIZE: %s\n"
              "NB_CHUNK: %s\n"
              "train_frac: %s\n"
              "solver: %s\n"
              "max_iter: %s\n"
              "split_alg: %s" %
              (str(date), EMBED_DIMENSION, CLOSEST_NODES, NUM_STEPS_1,
               NUM_SAMPLED, LEARNING_RATE, CHUNK_SIZE, NB_CHUNK, train_frac,
               solver, max_iter, split_alg),
              file=overall_result)

        print('Overall MULTIVERSE AUC hadamard:',
              results_embeddings_methods['Multiverse_hadamard'],
              file=overall_result)
        print('Overall MULTIVERSE AUC weighted_l1:',
              results_embeddings_methods['Multiverse_weighted_l1'],
              file=overall_result)
        print('Overall MULTIVERSE AUC weighted_l2:',
              results_embeddings_methods['Multiverse_weighted_l2'],
              file=overall_result)
        print('Overall MULTIVERSE AUC average:',
              results_embeddings_methods['Multiverse_average'],
              file=overall_result)
        print('Overall MULTIVERSE AUC cosine:',
              results_embeddings_methods['Multiverse_cosine'],
              file=overall_result)

    os.replace(Result_file, './ResultsMultiVERSE/' + Result_file)

    print('End')
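
# A minimal, hedged entry point so the example can run as a script; the file
# name in the usage line is hypothetical:
#   python multiverse_mh_eval.py -n /mux1.txt -m /mux2.txt -b bipartite.txt
if __name__ == '__main__':
    main()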
Example No. 2
# Imports assumed by this example; Tuning and create_self_defined_dataset are
# project-local helpers whose import paths are not shown here.
import random

import numpy as np
import pandas as pd

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit

class LinkPredictionTuning(Tuning):
    r"""

    Clase general de entrenamiento y testeo de embeddings de grafos para la tarea de prediccion de enlaces.

    Parameters
    ----------
    G: NetworkX graph
        Grafo de entrenamiento.
    G_test: NetworkX graph
        Grafo de testeo.
    root: str
        directorio en el que se guardaran los resultados
    """
    def __init__(self, G, G_test, root="results/lp/"):
        super(LinkPredictionTuning, self).__init__(G, root=root)
        self.task = "lp"

        train_E = G.edges
        train_E_false = self.GetNegativeEdges(G, len(train_E))

        test_E = G_test.edges
        test_E_false = self.GetNegativeEdges(G_test, len(test_E))

        self.split = EvalSplit()
        self.split.set_splits(train_E,
                              train_E_false=train_E_false,
                              test_E=test_E,
                              test_E_false=test_E_false,
                              TG=G)

        # create_self_defined_dataset presumably returns a dataset class; the
        # trailing call instantiates it.
        self.training_graph = create_self_defined_dataset(root_dir="",
                                                          name_dict={},
                                                          name="training " +
                                                          self.tipo,
                                                          weighted=True,
                                                          directed=False,
                                                          attributed=True)()
        self.training_graph.set_g(G)

        self.evaluator = LPEvaluator(self.split)

    def GetNegativeEdges(self, G, n):
        r"""

        Auxiliary method that samples negative edges.

        Parameters
        ----------
        G: NetworkX graph
           Bipartite graph.
        n: int
            Number of edges to sample.
        """

        prop_nodes = [u for u, d in G.nodes(data=True) if d['bipartite'] == 0]
        user_nodes = [u for u, d in G.nodes(data=True) if d['bipartite'] == 1]

        non_edges = []

        # Rejection-sample node pairs until exactly n negative edges have
        # been collected (duplicates are possible with this scheme).
        while len(non_edges) < n:
            random_prop = random.choice(prop_nodes)
            random_user = random.choice(user_nodes)
            edge = (random_prop, random_user)
            if G.has_edge(*edge):
                continue
            non_edges.append(edge)
        return non_edges

    def TestModel(self, emb, time=-1, method_name="method_name"):
        r"""

        Evaluates an embedding and logs the result in the scoresheet.

        Parameters
        ----------
        emb: dict
            Embedding dictionary; keys are the nodes and values are lists
            holding the embeddings.
        time: float
            Execution time of the method, stored in the scoresheet.
        method_name: str
            Name under which to store the method's results.
        """
        df = pd.DataFrame(emb).T
        X = df.T.to_dict("list")
        X = {str(k): np.array(v)
             for k, v in X.items()
             }  # values must be arrays because sums are performed on them

        self.evaluator.dim = df.shape[1]

        reslp = []
        for edge_method in [
                "weighted_l1", "weighted_l2", "hadamard", "average"
        ]:
            # TODO: do not evaluate on all four edge embeddings
            res = self.evaluator.evaluate_ne(self.split,
                                             X=X,
                                             method=method_name,
                                             edge_embed_method=edge_method,
                                             params={"nw_name": "GPI"})
            res.params.update({'eval_time': time})
            reslp.append(res)
        self.scoresheet.log_results(reslp)
        return reslp
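
# Hedged usage sketch (graphs and embeddings below are illustrative, and the
# Tuning base class is assumed to provide self.tipo and self.scoresheet):
# import networkx as nx
# G_train = nx.bipartite.gnmk_random_graph(20, 20, 60, seed=0)
# G_test = nx.bipartite.gnmk_random_graph(20, 20, 30, seed=1)
# emb = {node: np.random.rand(16).tolist() for node in G_train.nodes}
# tuner = LinkPredictionTuning(G_train, G_test)
# results = tuner.TestModel(emb, time=1.5, method_name="random_baseline")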