def main():
    names = load_test_names()
    wf = codecs.open(join(settings.OUT_DIR, 'local_clustering_results.csv'), 'w', encoding='utf-8')
    wf.write('name,n_pubs,n_clusters,precision,recall,f1\n')
    metrics = np.zeros(3)
    cnt = 0
    for name in names:
        cur_metric, num_nodes, n_clusters = gae_for_na(name, rawfeature="attention_feature")
        wf.write('{0},{1},{2},{3:.5f},{4:.5f},{5:.5f}\n'.format(
            name, num_nodes, n_clusters, cur_metric[0], cur_metric[1], cur_metric[2]))
        wf.flush()
        for i, m in enumerate(cur_metric):
            metrics[i] += m
        cnt += 1
        macro_prec = metrics[0] / cnt
        macro_rec = metrics[1] / cnt
        macro_f1 = cal_f1(macro_prec, macro_rec)
        print('average until now', [macro_prec, macro_rec, macro_f1])
        time_acc = time.time()-start_time
        print(cnt, 'names', time_acc, 'avg time', time_acc/cnt)
    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = cal_f1(macro_prec, macro_rec)
    wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
        macro_prec, macro_rec, macro_f1))
    wf.close()
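All of these examples turn the averaged precision and recall into a macro F1 via cal_f1. A minimal sketch of such a helper, assuming the usual harmonic-mean definition rather than the repository's exact implementation:

def cal_f1(precision, recall):
    # Harmonic mean of precision and recall; 0 when both inputs are 0.
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)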
Example #2
def main():
    names = load_test_names()  # load the list of test author names
    ans = {}

    wf = codecs.open(join(settings.OUT_DIR, 'local_clustering_results.csv'), 'w', encoding='utf-8')  # results file
    wf.write('name,n_pubs,n_clusters,precision,recall,f1\n')  # name, #pubs, #clusters, precision, recall, f1
    metrics = np.zeros(3)  # running sums of precision, recall, f1
    cnt = 0
    for name in names:  # iterate over the test author names
        cur_metric, num_nodes, n_clusters, ans[name] = gae_for_na(name)  # metrics [prec, rec, f1], #pubs, #clusters
        if cur_metric is None:
            continue
        wf.write('{0},{1},{2},{3:.5f},{4:.5f},{5:.5f}\n'.format(  # write the per-name results
            name, num_nodes, n_clusters, cur_metric[0], cur_metric[1], cur_metric[2]))
        wf.flush()
        for i, m in enumerate(cur_metric):  # accumulate each metric for averaging
            metrics[i] += m
        cnt += 1
        macro_prec = metrics[0] / cnt
        macro_rec = metrics[1] / cnt
        macro_f1 = cal_f1(macro_prec, macro_rec)
        print('average until now', [macro_prec, macro_rec, macro_f1])  # macro metrics over the names processed so far
        time_acc = time.time()-start_time
        print(cnt, 'names', time_acc, 'avg time', time_acc/cnt)  # elapsed and average time per name
    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = cal_f1(macro_prec, macro_rec)
    wf.write('average,,,{0:.5f},{1:.5f},{2:.5f}\n'.format(
        macro_prec, macro_rec, macro_f1))  # final macro metrics
    wf.close() 

    dump_json(ans, settings.OUT_DIR, 'local_clustering_results.json', True) 
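Example #2 additionally dumps the per-name assignments with dump_json. A plausible sketch of that helper, with the argument order inferred from the call above (the project's own utility may differ):

import json
import codecs
from os.path import join

def dump_json(obj, wfdir, wfname, indent=False):
    # Write obj as UTF-8 JSON to wfdir/wfname; indent=True pretty-prints.
    with codecs.open(join(wfdir, wfname), 'w', encoding='utf-8') as wf:
        json.dump(obj, wf, ensure_ascii=False, indent=4 if indent else None)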
Example #3
def main():
    names = load_test_names(test_dataset_name)
    wf = codecs.open(join(settings.get_out_dir(exp_name), 'local_clustering_results.csv'), 'w', encoding='utf-8')
    wf.write('name,n_pubs,n_clusters,precision,recall,f1,tp,fp,fn\n')
    metrics = np.zeros(3)
    cnt = 0
    tp_fp_fn_sum = np.zeros(3)
    for name in names:
        try:
            tp_fp_fn, cur_metric, num_nodes, n_clusters = gae_for_na(name)
            wf.write('{0},{1},{2},{3:.5f},{4:.5f},{5:.5f},{6:.5f},{7:.5f},{8:.5f}\n'.format(
                name, num_nodes, n_clusters, cur_metric[0], cur_metric[1], cur_metric[2], *tp_fp_fn))
            wf.flush()
            for i, m in enumerate(cur_metric):
                metrics[i] += m
            cnt += 1
            tp_fp_fn_sum += np.array(tp_fp_fn)
            macro_prec = metrics[0] / cnt
            macro_rec = metrics[1] / cnt
            macro_f1 = cal_f1(macro_prec, macro_rec)
            print('average until now', [macro_prec, macro_rec, macro_f1])
            time_acc = time.time() - start_time
            print(cnt, 'names', time_acc, 'avg time', time_acc / cnt)
        except Exception as e:
            print('skip', name, 'due to error:', e)
            continue
    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = cal_f1(macro_prec, macro_rec)
    tp, fp, fn = tp_fp_fn_sum
    micro_precision = tp / (tp + fp)
    micro_recall = tp / (tp + fn)
    micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
    wf.write('average,,,{0:.5f},{1:.5f},{2:.5f},{3:.5f},{4:.5f},{5:.5f}\n'.format(
        macro_prec, macro_rec, macro_f1, micro_precision, micro_recall, micro_f1))
    wf.close()
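Example #3's micro-averaged scores divide the summed pair counts directly, which fails if no pairs were predicted or matched. A guarded variant (an illustrative helper, not part of the repository):

def micro_prf(tp_fp_fn_sum):
    # tp_fp_fn_sum: accumulated [tp, fp, fn] pair counts over all names.
    tp, fp, fn = tp_fp_fn_sum
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1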
Example #4
def testDataRun():
    cnt = 0
    metrics = np.zeros(3)
    wf = codecs.open(join(settings.OUT_DIR, 'local_clustering_results.csv'),
                     'w',
                     encoding='utf-8')
    wf.write('name,precision,recall,f1\n')
    LMDB_NAME_EMB = "graph_auto_encoder_embedding"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    han = HAN(lc_emb)
    name_to_pubs_test = load_test_names()
    for name in name_to_pubs_test:
        prec, rec, f1, pids, attentionEmbeddings = han.prepare_and_train(
            name=name, needtSNE=True)
        print(name, prec, rec, f1)
        wf.write('{0},{1:.5f},{2:.5f},{3:.5f}\n'.format(name, prec, rec, f1))
        wf.flush()

        metrics[0] = metrics[0] + prec
        metrics[1] = metrics[1] + rec
        metrics[2] = metrics[2] + f1
        cnt += 1

        for pid, embedding in zip(pids, attentionEmbeddings):
            lc_emb.set(pid, embedding)

    macro_prec = metrics[0] / cnt
    macro_rec = metrics[1] / cnt
    macro_f1 = eval_utils.cal_f1(macro_prec, macro_rec)
    wf.write('average,{0:.5f},{1:.5f},{2:.5f}\n'.format(
        macro_prec, macro_rec, macro_f1))
    wf.close()
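# main() below relies on normalize_vectors and clustering to turn the learned
# embeddings into predicted clusters. A sketch of what such helpers typically
# look like (L2 normalization plus agglomerative clustering); the project's
# own versions may differ:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize

def normalize_vectors(vectors):
    # L2-normalize each embedding row so Euclidean distance tracks cosine similarity.
    return normalize(vectors, norm='l2')

def clustering(embeddings, num_clusters):
    # Group the normalized embeddings into the known number of clusters.
    return AgglomerativeClustering(n_clusters=num_clusters).fit(embeddings).labels_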
def main():
    """
        train and evaluate YUTAO results for a specific name
        :param name:  author name
        :return: evaluation results
        """

    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32,
                                   shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'pos_weight': tf.placeholder(tf.float32, shape=()),
        'norm': tf.placeholder(tf.float32),
    }
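    # The placeholders above supply the dense node-feature matrix, the sparse
    # normalized and original adjacency matrices, the dropout rate, and the
    # loss re-weighting terms (pos_weight, norm) consumed by the model and
    # optimizer below.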
    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelInductiveAE(placeholders, input_feature_dim)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerInductiveAE(preds=model.reconstructions,
                                       labels=tf.reshape(
                                           tf.sparse_tensor_to_dense(
                                               placeholders['adj_orig'],
                                               validate_indices=False), [-1]),
                                       pos_weight=model.pos_weight,
                                       norm=model.norm)

    saver = tf.train.Saver()
    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def infer():
        feed_dict.update({placeholders['dropout']: 0})
        acc, emb = sess.run([opt.accuracy, model.z_mean],
                            feed_dict=feed_dict)  # z_mean is better
        return acc, emb

    train_name_list, _ = settings.get_split_name_list(train_dataset_name)
    _, test_name_list = settings.get_split_name_list(test_dataset_name)

    # Train model
    for epoch in range(FLAGS.epochs):
        t = time.time()  # start-of-epoch timestamp for the "time=" report below
        epoch_avg_cost = 0
        epoch_avg_accuracy = 0
        for name in train_name_list:
            adj_norm, adj_label, features, pos_weight, norm, labels = load_local_preprocess_result(
                exp_name, IDF_THRESHOLD, name)
            # print('positive edge weight', pos_weight)  # negative edges/pos edges
            # Construct feed dictionary
            feed_dict = construct_feed_dict_inductive(adj_norm, adj_label,
                                                      features, pos_weight,
                                                      norm, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                            feed_dict=feed_dict)
            # Compute average loss
            avg_cost = outs[1]
            avg_accuracy = outs[2]
            epoch_avg_cost += avg_cost
            epoch_avg_accuracy += avg_accuracy
            # print(avg_cost, avg_accuracy)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(epoch_avg_cost / len(train_name_list)),
              "train_acc=",
              "{:.5f}".format(epoch_avg_accuracy / len(train_name_list)),
              "time=", "{:.5f}".format(time.time() - t))
        metrics = np.zeros(3)
        tp_fp_fn_sum = np.zeros(3)
        avg_acc = 0
        for name in test_name_list:
            adj_norm, adj_label, features, pos_weight, norm, labels = load_local_preprocess_result(
                exp_name, IDF_THRESHOLD, name)
            feed_dict = construct_feed_dict_inductive(adj_norm, adj_label,
                                                      features, pos_weight,
                                                      norm, placeholders)
            acc, emb = infer()
            n_clusters = len(set(labels))
            emb_norm = normalize_vectors(emb)
            clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
            tp, fp, fn, prec, rec, f1 = pairwise_precision_recall_f1(
                clusters_pred, labels)
            tp_fp_fn_sum += np.array([tp, fp, fn])
            metrics += np.array([prec, rec, f1])
            avg_acc += acc
        macro_prec = metrics[0] / len(test_name_list)
        macro_rec = metrics[1] / len(test_name_list)
        avg_acc /= len(test_name_list)
        macro_f1 = cal_f1(macro_prec, macro_rec)
        tp, fp, fn = tp_fp_fn_sum
        micro_precision = tp / (tp + fp)
        micro_recall = tp / (tp + fn)
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision +
                                                         micro_recall)
        print(
            'average,acc:{0:.5f},macro_prec:{1:.5f},macro_rec:{2:.5f},macro_f1:{3:.5f},micro_precision:{4:.5f},micro_recall:{5:.5f},micro_f1:{6:.5f}\n'
            .format(avg_acc, macro_prec, macro_rec, macro_f1, micro_precision,
                    micro_recall, micro_f1))
    path = join(settings.get_data_dir(exp_name), 'local',
                'model-{}'.format(IDF_THRESHOLD), model_name)
    saver.save(sess, path)
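For reference, pairwise_precision_recall_f1 used above scores a clustering over pairs of publications. A straightforward O(n^2) sketch with the same six-value return signature; the repository's implementation may be vectorized:

def pairwise_precision_recall_f1(preds, truths):
    # Count publication pairs that are co-clustered in the prediction and/or
    # share a ground-truth author label.
    tp = fp = fn = 0
    n = len(preds)
    for i in range(n):
        for j in range(i + 1, n):
            same_pred = preds[i] == preds[j]
            same_true = truths[i] == truths[j]
            if same_pred and same_true:
                tp += 1
            elif same_pred:
                fp += 1
            elif same_true:
                fn += 1
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return tp, fp, fn, precision, recall, f1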