def dump_inter_emb():
    """
    Dump hidden embeddings produced by the trained global model, for the local model to use.
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = {}
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for aid in name_data:
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_test.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        if not embs_input:  # skip names with no usable publication embeddings
            continue
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
    for name in name_to_pubs_train:
        print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for aid in name_data:
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_train.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        if not embs_input:  # skip names with no usable publication embeddings
            continue
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
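# A sketch (inferred from how name_data is indexed above) of the per-name
# "assignments.json" layout this function assumes: each author id maps to the
# list of paper ids attributed to that author. The ids below are illustrative only.
#
#     {
#         "author_id_1": ["pid_a", "pid_b", ...],
#         "author_id_2": ["pid_c", ...]
#     }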
def main():
    train_names, _ = settings.get_split_name_list(train_dataset_name)
    _, test_names = settings.get_split_name_list(test_dataset_name)
    # neg_sum = 0
    # pos_sum = 0
    for name in train_names + test_names:
        adj_norm, adj_label, features, pos_weight, norm, labels = preprocess(
            name)
        # neg_sum += adj.shape[0] * adj.shape[0] - adj.sum()
        # pos_sum += adj.sum()
        # print(features.shape[1])
        save_local_preprocess_result(
            (adj_norm, adj_label, features, pos_weight, norm, labels), name)
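A minimal driver sketch for the two functions above; it assumes the module-level globals they use (train_dataset_name, test_dataset_name, exp_name) are defined elsewhere in the original file as dataset/experiment identifiers:

if __name__ == '__main__':
    # Both steps rely on module-level configuration defined outside this excerpt.
    dump_inter_emb()   # cache intermediate triplet embeddings in LMDB
    main()             # precompute and save per-name local graph inputs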
Example #3
def gen_test(dataset_name, k=300, flatten=False):
    name_to_pubs_test = {}
    _, TEST_NAME_LIST = settings.get_split_name_list(dataset_name)
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(dataset_name), case_name),
            "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    xs, ys = [], []
    names = []
    for name in name_to_pubs_test:
        names.append(name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = []
        for c in name_to_pubs_test[name]:  # one person
            for item in name_to_pubs_test[name][c]:
                items.append(item)
        sampled_points = [
            items[p] for p in np.random.choice(len(items), k, replace=True)
        ]
        for p in sampled_points:
            if p in data_cache:
                x.append(data_cache[p])
            else:
                x.append(lc.get(p))
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
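For orientation, a rough sketch of what gen_test returns, assuming d-dimensional paper embeddings are stored in the module-level lc client (d is not fixed by this snippet); the dataset name below is a placeholder:

# names: list of test name strings (length n)
# xs:    np.ndarray of shape (n, k, d) when flatten=False,
#        or (n, d) when flatten=True (the k sampled vectors are summed)
# ys:    np.ndarray of shape (n,) holding the true cluster count per name
#
# Example call:
#     names, xs, ys = gen_test('some_dataset', k=300)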
Example #4
def run_rnn(dataset_name, k=300, seed=1106):
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(dataset_name), case_name),
            "assignments.json")
    # name_to_pubs_train = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_train_500.json')
    test_names, test_x, test_y = gen_test(dataset_name, k)
    np.random.seed(seed)
    clusters = []
    for domain in name_to_pubs_train.values():
        for cluster in domain.values():
            clusters.append(cluster)
    for i, c in enumerate(clusters):
        if i % 100 == 0:
            print(i, len(c), len(clusters))
        for pid in c:
            data_cache[pid] = lc.get(pid)
    model = create_model()
    # print(model.summary())
    model.fit_generator(gen_train(clusters, k=300, batch_size=1000),
                        steps_per_epoch=100,
                        epochs=1000,
                        validation_data=(test_x, test_y))
    kk = model.predict(test_x)
    wf = open(join(settings.get_out_dir(dataset_name), 'n_clusters_rnn.txt'),
              'w')
    for i, name in enumerate(test_names):
        wf.write('{}\t{}\t{}\n'.format(name, test_y[i], kk[i][0]))
    wf.close()
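run_rnn calls gen_train and create_model, which are defined elsewhere in the original file. Below is a purely illustrative generator with a compatible interface; the sampling strategy is an assumption modeled on gen_test, not the project's actual implementation:

def gen_train_sketch(clusters, k=300, batch_size=1000):
    # Illustrative only: pool papers from a random subset of clusters, sample k
    # embeddings from data_cache, and use the number of chosen clusters as the target.
    while True:
        xs, ys = [], []
        for _ in range(batch_size):
            n_chosen = np.random.randint(1, min(len(clusters), 50) + 1)
            chosen_idx = np.random.choice(len(clusters), n_chosen, replace=False)
            pool = [pid for idx in chosen_idx for pid in clusters[idx]]
            sampled = [pool[p] for p in np.random.choice(len(pool), k, replace=True)]
            xs.append(np.stack([data_cache[pid] for pid in sampled]))
            ys.append(n_chosen)
        yield np.stack(xs), np.array(ys)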
Example #5
    def prepare_data(self):
        self.name2pubs_train = {}
        # self.name2pubs_val = {}
        self.name2pubs_test = {}
        TRAIN_NAME_LIST, TEST_NAME_LIST = settings.get_split_name_list(self.dataset_name)
        for case_name in TRAIN_NAME_LIST:
            self.name2pubs_train[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(self.dataset_name), case_name),
                                                                   "assignments.json")
        # for case_name in VAL_NAME_LIST:
        #     self.name2pubs_val[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(self.dataset_name), case_name),
        #                                                          "assignments.json")
        for case_name in TEST_NAME_LIST:
            self.name2pubs_test[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(self.dataset_name), case_name),
                                                                  "assignments.json")
        # self.name2pubs_train = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_train_500.json')  # for test
        # self.name2pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
        # self.names_train = self.name2pubs_train.keys()
        # print('names train', len(self.names_train))
        # self.names_test = self.name2pubs_test.keys()
        # print('names test', len(self.names_test))
        self.names_train, self.names_test = settings.get_split_name_list(self.dataset_name)

        assert not set(self.names_train).intersection(set(self.names_test))
        # assert not set(self.names_train).intersection(set(self.names_val))
        # assert not set(self.names_val).intersection(set(self.names_test))

        for name in self.names_train:
            name_pubs_dict = self.name2pubs_train[name]
            for aid in name_pubs_dict:
                self.pids_train += name_pubs_dict[aid]
        random.shuffle(self.pids_train)
        self.n_pubs_train = len(self.pids_train)
        print('pubs2train', self.n_pubs_train)

        for name in self.names_test:
            name_pubs_dict = self.name2pubs_test[name]
            for aid in name_pubs_dict:
                self.pids_test += name_pubs_dict[aid]
        random.shuffle(self.pids_test)
        self.n_pubs_test = len(self.pids_test)
        print('pubs2test', self.n_pubs_test)
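    # Note: prepare_data assumes the instance already defines self.dataset_name and
    # initializes self.pids_train = [] and self.pids_test = [] (presumably in __init__,
    # which is outside this snippet).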
def gen_local_data(idf_threshold):
    """
    Generate local data (paper features and the paper network) for each associated name.
    :param idf_threshold: threshold for deciding whether an edge exists between two papers (set to 29 in this demo)
    """
    name_to_pubs_train = {}
    name_to_pubs_test = {}
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.get_feature_dir(train_dataset_name), 'feature_idf.pkl')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature_train = LMDBClient(train_dataset_name, LMDB_AUTHOR_FEATURE)
    lc_feature_test = LMDBClient(test_dataset_name, LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.get_data_dir(exp_name), 'local', 'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
        for aid in cur_person_dict:
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}\n'.format(pids2label[pid]))
        wf_content.close()

        # generate network
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)), 'w')
        for i in range(n_pubs-1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature_test.get(pids_filter[i]))
            for j in range(i+1, n_pubs):
                author_feature2 = set(lc_feature_test.get(pids_filter[j]))
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                    # print(f, idf.get(f, idf_threshold))
                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
        wf_network.close()
    for i, name in enumerate(name_to_pubs_train):
        print(i, name)
        cur_person_dict = name_to_pubs_train[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
        for aid in cur_person_dict:
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}\n'.format(pids2label[pid]))
        wf_content.close()

        # generate network
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)), 'w')
        for i in range(n_pubs-1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature_train.get(pids_filter[i]))
            for j in range(i+1, n_pubs):
                author_feature2 = set(lc_feature_train.get(pids_filter[j]))
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                    # print(f, idf.get(f, idf_threshold))
                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
        wf_network.close()
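For reference, the two files written per name by gen_local_data have a simple tab-separated layout (rows shown schematically; pids, embedding values, and author ids are placeholders):

# {name}_pubs_content.txt : one row per paper that has an intermediate embedding
#     <pid> \t <emb_0> \t <emb_1> \t ... \t <emb_{d-1}> \t <author_id_label>
#
# {name}_pubs_network.txt : one row per paper pair whose shared-feature IDF sum
# reaches idf_threshold
#     <pid_i> \t <pid_j>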
Example #7
def main():
    """
        train and evaluate YUTAO results for a specific name
        :param name:  author name
        :return: evaluation results
        """

    # Store original adjacency matrix (without diagonal entries) for later
    # Define placeholders
    placeholders = {
        # 'features': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32,
                                   shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'pos_weight': tf.placeholder(tf.float32, shape=()),
        'norm': tf.placeholder(tf.float32),
    }
    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelInductiveAE(placeholders, input_feature_dim)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerInductiveAE(preds=model.reconstructions,
                                       labels=tf.reshape(
                                           tf.sparse_tensor_to_dense(
                                               placeholders['adj_orig'],
                                               validate_indices=False), [-1]),
                                       pos_weight=model.pos_weight,
                                       norm=model.norm)

    saver = tf.train.Saver()
    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def infer():
        feed_dict.update({placeholders['dropout']: 0})
        acc, emb = sess.run([opt.accuracy, model.z_mean],
                            feed_dict=feed_dict)  # z_mean is better
        return acc, emb

    train_name_list, _ = settings.get_split_name_list(train_dataset_name)
    _, test_name_list = settings.get_split_name_list(test_dataset_name)

    # Train model
    for epoch in range(FLAGS.epochs):
        t = time.time()  # time the full epoch for the summary print below
        epoch_avg_cost = 0
        epoch_avg_accuracy = 0
        for name in train_name_list:
            adj_norm, adj_label, features, pos_weight, norm, labels = load_local_preprocess_result(
                exp_name, IDF_THRESHOLD, name)
            # print('positive edge weight', pos_weight)  # negative edges/pos edges
            # Construct feed dictionary
            feed_dict = construct_feed_dict_inductive(adj_norm, adj_label,
                                                      features, pos_weight,
                                                      norm, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                            feed_dict=feed_dict)
            # Compute average loss
            avg_cost = outs[1]
            avg_accuracy = outs[2]
            epoch_avg_cost += avg_cost
            epoch_avg_accuracy += avg_accuracy
            # print(avg_cost, avg_accuracy)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(epoch_avg_cost / len(train_name_list)),
              "train_acc=",
              "{:.5f}".format(epoch_avg_accuracy / len(train_name_list)),
              "time=", "{:.5f}".format(time.time() - t))
        metrics = np.zeros(3)
        tp_fp_fn_sum = np.zeros(3)
        avg_acc = 0
        for name in test_name_list:
            adj_norm, adj_label, features, pos_weight, norm, labels = load_local_preprocess_result(
                exp_name, IDF_THRESHOLD, name)
            feed_dict = construct_feed_dict_inductive(adj_norm, adj_label,
                                                      features, pos_weight,
                                                      norm, placeholders)
            acc, emb = infer()
            n_clusters = len(set(labels))
            emb_norm = normalize_vectors(emb)
            clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
            tp, fp, fn, prec, rec, f1 = pairwise_precision_recall_f1(
                clusters_pred, labels)
            tp_fp_fn_sum += np.array([tp, fp, fn])
            metrics += np.array([prec, rec, f1])
            avg_acc += acc
        macro_prec = metrics[0] / len(test_name_list)
        macro_rec = metrics[1] / len(test_name_list)
        avg_acc /= len(test_name_list)
        macro_f1 = cal_f1(macro_prec, macro_rec)
        tp, fp, fn = tp_fp_fn_sum
        micro_precision = tp / (tp + fp)
        micro_recall = tp / (tp + fn)
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision +
                                                         micro_recall)
        print(
            'average,acc:{0:.5f},macro_prec:{1:.5f},macro_rec:{2:.5f},macro_f1:{3:.5f},micro_precision:{4:.5f},micro_recall:{5:.5f},micro_f1:{6:.5f}\n'
            .format(avg_acc, macro_prec, macro_rec, macro_f1, micro_precision,
                    micro_recall, micro_f1))
    path = join(settings.get_data_dir(exp_name), 'local',
                'model-{}'.format(IDF_THRESHOLD), model_name)
    saver.save(sess, path)
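cal_f1 is imported from elsewhere; from its use above it is presumably the harmonic mean of precision and recall. A minimal sketch, with a zero-division guard added as an assumption:

def cal_f1_sketch(precision, recall):
    # Standard F1 score: harmonic mean of precision and recall.
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)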
Example #8
def load_test_names(dataset_name):
    _, TEST_NAME_LIST = settings.get_split_name_list(dataset_name)
    return TEST_NAME_LIST