Python normalize_vectors示例，local.gae.preprocessing.normalize_vectors Python示例

示例#1

0

显示文件

def Testcheck(embedding, embeddingLabels, name):

    Maxscore = -10000
    NumberOfCluster = 0
    emb_norm = normalize_vectors(embedding)

    c, num_clust, req_c = FINCH(emb_norm)
    print("num_clust: ", num_clust)
    for nc in num_clust:
        emb_norm = normalize_vectors(emb_norm)
        TempLabels = clustering(emb_norm, nc)
        score = silhouette_score(emb_norm, TempLabels)
        print('nc: ', nc, ', score: ', score)
        if score > Maxscore:
            Maxscore = score
            # tClusterLabels = TempLabels
            NumberOfCluster = nc

    clusters_pred = list(clustering(emb_norm, num_clusters=NumberOfCluster))

    print("clusters_pred: ", clusters_pred)
    print("embeddingLabels: ", embeddingLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred,
                                                 embeddingLabels)
    print('pairwise precision', '{:.5f}'.format(prec), 'recall',
          '{:.5f}'.format(rec), 'f1', '{:.5f}'.format(f1))
    tSNEAnanlyse(embedding,
                 labels=embeddingLabels,
                 trueLabels=embeddingLabels,
                 savepath=join(settings.OUT_DIR, name))

示例#2

0

显示文件

def check(embedding, embeddingLabels, name):
    emb_norm = normalize_vectors(embedding)
    clusters_pred = list(
        clustering(emb_norm, num_clusters=len(list(set(embeddingLabels)))))

    print("clusters_pred: ", clusters_pred)
    print("embeddingLabels: ", embeddingLabels)
    prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred,
                                                 embeddingLabels)
    print('pairwise precision', '{:.5f}'.format(prec), 'recall',
          '{:.5f}'.format(rec), 'f1', '{:.5f}'.format(f1))
    tSNEAnanlyse(embedding,
                 labels=embeddingLabels,
                 trueLabels=embeddingLabels,
                 savepath=join(settings.OUT_DIR, name))
    return f1

示例#3

0

显示文件

文件： IslandLossModel.py 项目： shanxuanchen/AminerGame

    def tarin(self, batchX, batchy, testX, testy, AllX, Ally, epochs=3000):

        model = self.buildModel()
        loss, opt = self.buildOptimizer(model)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            # Train model
            for epoch in range(epochs):
                # for batchid, batchX in enumerate(X):
                # batchy = y[batchid]

                # Construct feed dictionary
                feed_dict = {self.placeholder['input']: batchX, self.placeholder['labels']: batchy}
                # Run single weight update
                outs = sess.run([loss, opt], feed_dict=feed_dict)
                # Compute average loss
                lossScale = outs[0]
                # accScale = outs[2]

                print("Epoch:", '%04d' % (epoch + 1), "loss=", "{:.5f}".format(lossScale))
                # print("Epoch:", '%04d' % (epoch + 1), "loss=", "{:.5f}".format(lossScale), ',acc = {:.5f}'.format(accScale))

            # testacc = sess.run([acc], feed_dict={self.placeholder['input']: testX, self.placeholder['labels']: testy})
            # print ("test acc: ", testacc)


            # check embedding
            ClusterCheckX = AllX
            ClusterCheckY = Ally


            embedding = sess.run(model, feed_dict={self.placeholder['input']: ClusterCheckX, self.placeholder['labels']: ClusterCheckY})
            print (embedding.shape)
            emb_norm = normalize_vectors(embedding)
            clusters_pred = list(clustering(emb_norm, num_clusters=len(list(set(ClusterCheckY)))))
            embeddingLabels = self.codeLabel(ClusterCheckY)
            print ("clusters_pred: ", clusters_pred)
            print ("embeddingLabels: ", embeddingLabels)
            prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, embeddingLabels)
            print('pairwise precision', '{:.5f}'.format(prec),
                  'recall', '{:.5f}'.format(rec),
                  'f1', '{:.5f}'.format(f1))

示例#4

0

显示文件

文件： inductive_preprocess.py 项目： timxiao317/disambiguation

def preprocess(name):
    adj, features, labels = load_local_data(exp_name, IDF_THRESHOLD, name=name)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj_train = gen_train_edges(adj)

    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    edge_labels = []
    n_samples = len(labels)
    for i in range(n_samples - 1):
        for j in range(i + 1, n_samples):
            if labels[i] == labels[j]:
                edge_labels.append([i, j])
    edge_labels = np.array(edge_labels)
    adj_label = sp.coo_matrix((np.ones(edge_labels.shape[0]),
                               (edge_labels[:, 0], edge_labels[:, 1])),
                              shape=(features.shape[0], features.shape[0]),
                              dtype=np.float32)
    pos_weight = float(adj_label.shape[0] * adj_label.shape[0] -
                       adj_label.sum()) / adj_label.sum()
    norm = adj_label.shape[0] * adj_label.shape[0] / float(
        (adj_label.shape[0] * adj_label.shape[0] - adj_label.nnz) * 2)
    adj_label = adj_label + sp.eye(adj_label.shape[0])
    adj_label = sparse_to_tuple(adj_label)
    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)
    # print("positive_label", len(edge_labels))
    print('positive edge weight', pos_weight)  # negative edges/pos edges
    print('norm', norm)  # negative edges/pos edges
    return adj_norm, adj_label, features, pos_weight, norm, labels

示例#5

0

显示文件

def main():
    names = load_train_names()
    Prec = 0
    Recall = 0
    F1 = 0
    for name in names:
        adj, features, labels, Ids = load_local_data(name=name)
        emb_norm = normalize_vectors(features)
        clusters_pred = clustering(emb_norm, len(list(set(labels))))
        prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
        print('pairwise precision', '{:.5f}'.format(prec),
              'recall', '{:.5f}'.format(rec),
              'f1', '{:.5f}'.format(f1))
        Prec += prec
        Recall += rec
        F1 += f1
    Prec = Prec / (1.0 * len(names))
    Recall = Recall / (1.0 * len(names))
    F1 = F1 / (1.0 * len(names))

    print('All pairwise precision', '{:.5f}'.format(Prec),
          'recall', '{:.5f}'.format(Recall),
          'f1', '{:.5f}'.format(F1))

示例#6

0

显示文件

文件： train.py 项目： shanxuanchen/AttentionBasedNameDisambiguation

def gae_for_na(name, rawfeature):
    """
    train and evaluate disambiguation results for a specific name
    :param name:  author name
    :return: evaluation results
    """
    adj, features, labels = load_local_data(name=name, rawfeature=rawfeature)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj_train = gen_train_edges(adj)

    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    elif model_str == 'gcn_vae':
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges/pos edges
    print('positive edge weight', pos_weight)
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                           validate_indices=False), [-1]),
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    def get_embs():
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)  # z_mean is better
        return emb

    # Train model
    for epoch in range(FLAGS.epochs):

        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                        feed_dict=feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
              "train_acc=", "{:.5f}".format(avg_accuracy),
              "time=", "{:.5f}".format(time.time() - t))

    emb = get_embs()
    n_clusters = len(set(labels))
    emb_norm = normalize_vectors(emb)
    clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
    prec, rec, f1 =  pairwise_precision_recall_f1(clusters_pred, labels)
    print('pairwise precision', '{:.5f}'.format(prec),
          'recall', '{:.5f}'.format(rec),
          'f1', '{:.5f}'.format(f1))

    clusters_pred2 = clustering(features, num_clusters=n_clusters)
    prec2, rec2, f12 =  pairwise_precision_recall_f1(clusters_pred2, labels)
    print('pairwise precision', '{:.5f}'.format(prec2),
          'recall', '{:.5f}'.format(rec2),
          'f1', '{:.5f}'.format(f12))

    from sklearn.manifold import TSNE
    features_new = TSNE(learning_rate=100).fit_transform(features)
    emb_new = TSNE(learning_rate=100).fit_transform(emb_norm)

    labels = np.array(labels) + 2
    clusters_pred = np.array(clusters_pred) + 2
    clusters_pred2 = np.array(clusters_pred2) + 2

    if rawfeature == RAW_INTER_NAME:
        tSNEAnanlyse(emb_norm, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_raw.png" % (name)))
        tSNEAnanlyse(features, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_raw.png" % (name)))
    elif rawfeature == ATTENTIONFEATURE:
        tSNEAnanlyse(emb_new, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final.png" % (name)))
        tSNEAnanlyse(features_new, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features.png" % (name)))
        tSNEAnanlyse(emb_new, clusters_pred, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_clusterresult.png" % (name)))
        tSNEAnanlyse(features_new, clusters_pred2, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_clusterresult.png" % (name)))
    else:
        tSNEAnanlyse(emb_norm, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_final_triplet.png" % (name)))
        tSNEAnanlyse(features, labels, join(settings.PIC_DIR, "FINALResult", "rawReature_%s_gae_features_triplet.png" % (name)))

    return [prec, rec, f1], num_nodes, n_clusters

示例#7

0

显示文件

文件： train.py 项目： zhysora/BUAALAB_IN_WDQ

def gae_for_na(name, n_clusters): # 对一个具体的姓名预测其消歧结果  评估值[pre, rec, f1], 文档数， 聚类数
    """
    train and evaluate disambiguation results for a specific name
    :param name:  author name
    :return: evaluation results
    """
    adj, features, pids = load_local_data(name=name) # 邻接矩阵(i,j)=1， 文档特征集(y, aid), 标记集 aid索引编号i; features与labels关于下标 一一对应

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros() # 在原邻接矩阵的基础上 去对角线元素 删除0
    adj_train = gen_train_edges(adj) # 这里搞了半天 感觉就是 把adj的对角元素删了 用的csr_matrix 类型 

    adj = adj_train # 完整的邻接矩阵 

    # Some preprocessing
    adj_norm = preprocess_graph(adj)  # 标准化 矩阵 A^' 返回的是元组tuple 坐标(x,y), 值, 形状
    num_nodes = adj.shape[0] # 节点数
    input_feature_dim = features.shape[1] # 输入 特征 维数 [0]是个数
    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else: # 条件 进入的是 这边
        features = normalize_vectors(features)# 特征向量 标准化

    # Define placeholders
    # tf.placeholder 此函数可以理解为形参，用于定义过程，在执行的时候再赋具体的值 ?_?
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),# 为稀疏张量插入占位符，该稀疏张量将始终被提供
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())# 该函数将返回一个张量。与 input 具有相同的类型。一个占位符张量，默认为 input 的占位符张量 (如果未送入)。
    }

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    elif model_str == 'gcn_vae':# 使用的模型 是 gcn_vae
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges/pos edges
    print('positive edge weight', pos_weight)# 负边/正边
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2) # 矩阵中非零元素的数量nnz

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':# 使用的模型 是 gcn_vae
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                           validate_indices=False), [-1]),
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])# sp.eye 单位矩阵 标记， 解码应该得到原矩阵
    adj_label = sparse_to_tuple(adj_label)# 稀疏矩阵 -> 元组 (坐标， 值， 维度形状)

    def get_embs():# 获得内部 嵌入z
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)  # z_mean is better
        return emb

    # Train model
    for epoch in range(FLAGS.epochs):# 训练批次 epoch

        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                        feed_dict=feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
              "train_acc=", "{:.5f}".format(avg_accuracy),
              "time=", "{:.5f}".format(time.time() - t))

    emb = get_embs() # 经过 编码器后 的 嵌入层
    ''' n_clusters = int(name_to_ncluster.get(name, 0))
    n_clusters = len(set(labels))# 直接获得 真实的 聚类大小
    if n_clusters == 1:
        return None, None, None, None '''
    #n_clusters = len(set(labels))# 直接获得 真实的 聚类大小
    emb_norm = normalize_vectors(emb)# 标准化 嵌入层
    clusters_pred = clustering(emb_norm, num_clusters=max(n_clusters,1)) # 聚类， 嵌入集 与 聚类大小

    print('clusters_pred: ', clusters_pred)

    ret = {}
    for i, pred_label in enumerate(clusters_pred):
        pred_label = str(pred_label)
        if pred_label not in ret:
            ret[pred_label] = []
        ret[pred_label].append(pids[i])

    rett = []
    for pred_label in ret:
        tmp = []
        for pid in ret[pred_label]:
            tmp.append(pid)
        rett.append(tmp)
    return rett

    ''' ret = {}

示例#8

0

显示文件

文件： inductive_train.py 项目： timxiao317/disambiguation

def main():
    """
        train and evaluate YUTAO results for a specific name
        :param name:  author name
        :return: evaluation results
        """

    # Store original adjacency matrix (without diagonal entries) for later
    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32,
                                   shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'pos_weight': tf.placeholder(tf.float32, shape=()),
        'norm': tf.placeholder(tf.float32),
    }
    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelInductiveAE(placeholders, input_feature_dim)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerInductiveAE(preds=model.reconstructions,
                                       labels=tf.reshape(
                                           tf.sparse_tensor_to_dense(
                                               placeholders['adj_orig'],
                                               validate_indices=False), [-1]),
                                       pos_weight=model.pos_weight,
                                       norm=model.norm)

    saver = tf.train.Saver()
    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def infer():
        feed_dict.update({placeholders['dropout']: 0})
        acc, emb = sess.run([opt.accuracy, model.z_mean],
                            feed_dict=feed_dict)  # z_mean is better
        return acc, emb

    train_name_list, _ = settings.get_split_name_list(train_dataset_name)
    _, test_name_list = settings.get_split_name_list(test_dataset_name)

    # Train model
    for epoch in range(FLAGS.epochs):
        epoch_avg_cost = 0
        epoch_avg_accuracy = 0
        for name in train_name_list:
            adj_norm, adj_label, features, pos_weight, norm, labels = load_local_preprocess_result(
                exp_name, IDF_THRESHOLD, name)
            # print('positive edge weight', pos_weight)  # negative edges/pos edges
            t = time.time()
            # Construct feed dictionary
            feed_dict = construct_feed_dict_inductive(adj_norm, adj_label,
                                                      features, pos_weight,
                                                      norm, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                            feed_dict=feed_dict)
            # Compute average loss
            avg_cost = outs[1]
            avg_accuracy = outs[2]
            epoch_avg_cost += avg_cost
            epoch_avg_accuracy += avg_accuracy
            # print(avg_cost, avg_accuracy)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(epoch_avg_cost / len(train_name_list)),
              "train_acc=",
              "{:.5f}".format(epoch_avg_accuracy / len(train_name_list)),
              "time=", "{:.5f}".format(time.time() - t))
        metrics = np.zeros(3)
        tp_fp_fn_sum = np.zeros(3)
        avg_acc = 0
        for name in test_name_list:
            adj_norm, adj_label, features, pos_weight, norm, labels = load_local_preprocess_result(
                exp_name, IDF_THRESHOLD, name)
            feed_dict = construct_feed_dict_inductive(adj_norm, adj_label,
                                                      features, pos_weight,
                                                      norm, placeholders)
            acc, emb = infer()
            n_clusters = len(set(labels))
            emb_norm = normalize_vectors(emb)
            clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
            tp, fp, fn, prec, rec, f1 = pairwise_precision_recall_f1(
                clusters_pred, labels)
            tp_fp_fn_sum += np.array([tp, fp, fn])
            metrics += np.array([prec, rec, f1])
            avg_acc += acc
        macro_prec = metrics[0] / len(test_name_list)
        macro_rec = metrics[1] / len(test_name_list)
        avg_acc /= len(test_name_list)
        macro_f1 = cal_f1(macro_prec, macro_rec)
        tp, fp, fn = tp_fp_fn_sum
        micro_precision = tp / (tp + fp)
        micro_recall = tp / (tp + fn)
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision +
                                                         micro_recall)
        print(
            'average,acc:{0:.5f},macro_prec:{1:.5f},macro_rec:{2:.5f},macro_f1:{3:.5f},micro_precision:{4:.5f},micro_recall:{5:5f},micro_f1:{6:5f}\n'
            .format(avg_acc, macro_prec, macro_rec, macro_f1, micro_precision,
                    micro_recall, micro_f1))
    path = join(settings.get_data_dir(exp_name), 'local',
                'model-{}'.format(IDF_THRESHOLD), model_name)
    saver.save(sess, path)

示例#9

0

显示文件

def gae_for_na(name, mode=0):
    """
    train and evaluate disambiguation results for a specific name
    :param name:  author name
    :return: evaluation results
    :mode: 0-train 1-val
    """
    pids, adj, features, labels = load_local_data(name=name)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj_train = gen_train_edges(adj)

    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    if FLAGS.is_sparse:  # TODO to test
        # features = sparse_to_tuple(features.tocoo())
        # features_nonzero = features[1].shape[0]
        features = features.todense()  # TODO
    else:
        features = normalize_vectors(features)

    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32,
                                   shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, input_feature_dim)
    elif model_str == 'gcn_vae':
        model = GCNModelVAE(placeholders, input_feature_dim, num_nodes)
    pos_weight = float(adj.shape[0] * adj.shape[0] -
                       adj.sum()) / adj.sum()  # negative edges/pos edges
    print('positive edge weight', pos_weight)
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.nnz) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(
                                  tf.sparse_tensor_to_dense(
                                      placeholders['adj_orig'],
                                      validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(
                                   tf.sparse_tensor_to_dense(
                                       placeholders['adj_orig'],
                                       validate_indices=False), [-1]),
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    def get_embs():
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)  # z_mean is better
        return emb

    # Train model
    for epoch in range(FLAGS.epochs):

        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                        feed_dict=feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(avg_cost), "train_acc=",
              "{:.5f}".format(avg_accuracy), "time=",
              "{:.5f}".format(time.time() - t))

    emb = get_embs()

    for idx, pid in enumerate(pids):
        local_output[pid] = emb[idx]

    # Train mode calcul F1
    if not (mode == 2):
        n_clusters = len(set(labels))
        emb_norm = normalize_vectors(emb)
        model = AgglomerativeClustering(n_clusters=n_clusters)
        model.fit(emb_norm)
        prec, rec, f1 = pairwise_precision_recall_f1(model.labels_, labels)
        print('pairwise precision', '{:.5f}'.format(prec), 'recall',
              '{:.5f}'.format(rec), 'f1', '{:.5f}'.format(f1))
        return [prec, rec, f1], num_nodes, n_clusters