def dump_inter_emb():
    """
    Dump hidden embeddings produced by the trained global model so that the
    local model can use them.
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    # Load author -> publication assignments for both splits.
    name_to_pubs_train = {}
    name_to_pubs_test = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")

    # Dump hidden embeddings for test papers.
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for aid in name_data:
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_test.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    # Dump hidden embeddings for train papers.
    for name in name_to_pubs_train:
        print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for aid in name_data:
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_train.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
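# The implementation of get_hidden_output() is not shown in this file. Below is a
# minimal sketch of how such a helper could read an intermediate layer of a trained
# Keras model, assuming the embedding sub-network takes a single input; the function
# name and layer_index are illustrative, not the repository's actual API.
from keras import backend as K

def get_hidden_output_sketch(model, inputs, layer_index=-2):
    # Build a backend function mapping the model input to the chosen hidden layer
    # and evaluate it on the given batch of input embeddings.
    hidden_fn = K.function([model.layers[0].input], [model.layers[layer_index].output])
    return hidden_fn([inputs])[0]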
def main():
    train_names, _ = settings.get_split_name_list(train_dataset_name)
    _, test_names = settings.get_split_name_list(test_dataset_name)
    for name in train_names + test_names:
        adj_norm, adj_label, features, pos_weight, norm, labels = preprocess(name)
        save_local_preprocess_result(
            (adj_norm, adj_label, features, pos_weight, norm, labels), name)
def gen_test(dataset_name, k=300, flatten=False):
    name_to_pubs_test = {}
    _, TEST_NAME_LIST = settings.get_split_name_list(dataset_name)
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(dataset_name), case_name), "assignments.json")
    xs, ys = [], []
    names = []
    for name in name_to_pubs_test:
        names.append(name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = []
        for c in name_to_pubs_test[name]:  # one person
            for item in name_to_pubs_test[name][c]:
                items.append(item)
        # Sample k paper ids with replacement so every name yields a fixed-size bag.
        sampled_points = [items[p] for p in np.random.choice(len(items), k, replace=True)]
        for p in sampled_points:
            if p in data_cache:
                x.append(data_cache[p])
            else:
                x.append(lc.get(p))
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
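# Hypothetical usage of gen_test(): with flatten=False each test name becomes a
# (k, emb_dim) stack of sampled paper embeddings; with flatten=True the k embeddings
# are summed into a single (emb_dim,) vector per name. The dataset name below is a
# placeholder, not one of the repository's real dataset identifiers.
def _gen_test_usage_example():
    names, xs, ys = gen_test('some_dataset', k=300, flatten=False)
    print(xs.shape)  # (n_test_names, 300, emb_dim)
    print(ys[:5])    # ground-truth cluster counts for the first five names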
def run_rnn(dataset_name, k=300, seed=1106):
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(dataset_name), case_name), "assignments.json")
    test_names, test_x, test_y = gen_test(dataset_name, k)
    np.random.seed(seed)

    # Collect ground-truth clusters and cache their paper embeddings.
    clusters = []
    for domain in name_to_pubs_train.values():
        for cluster in domain.values():
            clusters.append(cluster)
    for i, c in enumerate(clusters):
        if i % 100 == 0:
            print(i, len(c), len(clusters))
        for pid in c:
            data_cache[pid] = lc.get(pid)

    model = create_model()
    model.fit_generator(gen_train(clusters, k=300, batch_size=1000),
                        steps_per_epoch=100, epochs=1000,
                        validation_data=(test_x, test_y))
    kk = model.predict(test_x)
    wf = open(join(settings.get_out_dir(dataset_name), 'n_clusters_rnn.txt'), 'w')
    for i, name in enumerate(test_names):
        wf.write('{}\t{}\t{}\n'.format(name, test_y[i], kk[i][0]))
    wf.close()
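# create_model() is defined elsewhere in the repository. A minimal sketch of a
# count-regression RNN consistent with how it is used above (input of shape
# (k, emb_dim), a single scalar output read as kk[i][0]); the layer sizes, loss,
# and optimizer here are assumptions, not the repository's actual configuration.
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Dense, Dropout

def create_model_sketch(k=300, emb_dim=100):
    model = Sequential()
    model.add(Bidirectional(LSTM(64), input_shape=(k, emb_dim)))  # read the bag of k paper embeddings
    model.add(Dropout(0.5))
    model.add(Dense(1))                                           # predicted number of clusters
    model.compile(loss='msle', optimizer='rmsprop')
    return model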
def prepare_data(self):
    self.name2pubs_train = {}
    self.name2pubs_test = {}
    TRAIN_NAME_LIST, TEST_NAME_LIST = settings.get_split_name_list(self.dataset_name)
    for case_name in TRAIN_NAME_LIST:
        self.name2pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(self.dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        self.name2pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(self.dataset_name), case_name), "assignments.json")

    self.names_train, self.names_test = settings.get_split_name_list(self.dataset_name)
    assert not set(self.names_train).intersection(set(self.names_test))

    for name in self.names_train:
        name_pubs_dict = self.name2pubs_train[name]
        for aid in name_pubs_dict:
            self.pids_train += name_pubs_dict[aid]
    random.shuffle(self.pids_train)
    self.n_pubs_train = len(self.pids_train)
    print('pubs2train', self.n_pubs_train)

    for name in self.names_test:
        name_pubs_dict = self.name2pubs_test[name]
        for aid in name_pubs_dict:
            self.pids_test += name_pubs_dict[aid]
    random.shuffle(self.pids_test)
    self.n_pubs_test = len(self.pids_test)
    print('pubs2test', self.n_pubs_test)
def gen_local_data(idf_threshold):
    """
    Generate local data (paper features and the paper network) for each associated name.
    :param idf_threshold: threshold for deciding whether an edge exists between two papers
                          (set to 29 in this demo)
    """
    name_to_pubs_train = {}
    name_to_pubs_test = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")

    idf = data_utils.load_data(settings.get_feature_dir(train_dataset_name), 'feature_idf.pkl')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature_train = LMDBClient(train_dataset_name, LMDB_AUTHOR_FEATURE)
    lc_feature_test = LMDBClient(test_dataset_name, LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.get_data_dir(exp_name), 'local', 'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)

    for name_idx, name in enumerate(name_to_pubs_test):
        print(name_idx, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
        for aid in cur_person_dict:
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}\n'.format(pids2label[pid]))
        wf_content.close()

        # generate network
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)), 'w')
        for i in range(n_pubs - 1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature_test.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature_test.get(pids_filter[j]))
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
        wf_network.close()

    for name_idx, name in enumerate(name_to_pubs_train):
        print(name_idx, name)
        cur_person_dict = name_to_pubs_train[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # generate content
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
        for aid in cur_person_dict:
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                pids_set.add(pid)
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}\n'.format(pids2label[pid]))
        wf_content.close()

        # generate network
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        print('n_pubs', n_pubs)
        wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)), 'w')
        for i in range(n_pubs - 1):
            if i % 10 == 0:
                print(i)
            author_feature1 = set(lc_feature_train.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature_train.get(pids_filter[j]))
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                if idf_sum >= idf_threshold:
                    wf_network.write('{}\t{}\n'.format(pids_filter[i], pids_filter[j]))
        wf_network.close()
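# Illustrative check of the edge rule above: two papers are linked when the summed
# IDF of their shared author features reaches idf_threshold (features missing from
# the idf table fall back to idf_threshold itself, as in the loop above). The
# feature names and IDF values below are made up.
def _idf_edge_rule_example():
    idf = {'org:tsinghua': 11.7, 'coauthor:li_ming': 18.5, 'venue:kdd': 4.2}
    idf_threshold = 29
    feat_a = {'venue:kdd', 'org:tsinghua', 'coauthor:li_ming'}
    feat_b = {'org:tsinghua', 'coauthor:li_ming', 'venue:www'}
    common = feat_a & feat_b
    idf_sum = sum(idf.get(f, idf_threshold) for f in common)  # 11.7 + 18.5 = 30.2
    print(idf_sum >= idf_threshold)  # True -> an edge is written between the two papers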
def main():
    """
    Train the local graph autoencoder across all training names and evaluate
    clustering performance on the test names.
    :return: evaluation results are printed and the trained model is saved
    """
    # Define placeholders
    placeholders = {
        'features': tf.placeholder(tf.float32, shape=(None, input_feature_dim)),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'pos_weight': tf.placeholder(tf.float32, shape=()),
        'norm': tf.placeholder(tf.float32),
    }

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelInductiveAE(placeholders, input_feature_dim)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerInductiveAE(
                preds=model.reconstructions,
                labels=tf.reshape(
                    tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False),
                    [-1]),
                pos_weight=model.pos_weight,
                norm=model.norm)

    saver = tf.train.Saver()

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def infer():
        # Uses the feed_dict built in the evaluation loop below.
        feed_dict.update({placeholders['dropout']: 0})
        acc, emb = sess.run([opt.accuracy, model.z_mean], feed_dict=feed_dict)  # z_mean is better
        return acc, emb

    train_name_list, _ = settings.get_split_name_list(train_dataset_name)
    _, test_name_list = settings.get_split_name_list(test_dataset_name)

    # Train model
    for epoch in range(FLAGS.epochs):
        epoch_avg_cost = 0
        epoch_avg_accuracy = 0
        t = time.time()
        for name in train_name_list:
            adj_norm, adj_label, features, pos_weight, norm, labels = load_local_preprocess_result(
                exp_name, IDF_THRESHOLD, name)
            # Construct feed dictionary
            feed_dict = construct_feed_dict_inductive(
                adj_norm, adj_label, features, pos_weight, norm, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            # Accumulate loss and accuracy over the names in this epoch
            epoch_avg_cost += outs[1]
            epoch_avg_accuracy += outs[2]
        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(epoch_avg_cost / len(train_name_list)),
              "train_acc=", "{:.5f}".format(epoch_avg_accuracy / len(train_name_list)),
              "time=", "{:.5f}".format(time.time() - t))

    # Evaluate on test names
    metrics = np.zeros(3)
    tp_fp_fn_sum = np.zeros(3)
    avg_acc = 0
    for name in test_name_list:
        adj_norm, adj_label, features, pos_weight, norm, labels = load_local_preprocess_result(
            exp_name, IDF_THRESHOLD, name)
        feed_dict = construct_feed_dict_inductive(
            adj_norm, adj_label, features, pos_weight, norm, placeholders)
        acc, emb = infer()
        n_clusters = len(set(labels))
        emb_norm = normalize_vectors(emb)
        clusters_pred = clustering(emb_norm, num_clusters=n_clusters)
        tp, fp, fn, prec, rec, f1 = pairwise_precision_recall_f1(clusters_pred, labels)
        tp_fp_fn_sum += np.array([tp, fp, fn])
        metrics += np.array([prec, rec, f1])
        avg_acc += acc

    macro_prec = metrics[0] / len(test_name_list)
    macro_rec = metrics[1] / len(test_name_list)
    avg_acc /= len(test_name_list)
    macro_f1 = cal_f1(macro_prec, macro_rec)
    tp, fp, fn = tp_fp_fn_sum
    micro_precision = tp / (tp + fp)
    micro_recall = tp / (tp + fn)
    micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
    print('average,acc:{0:.5f},macro_prec:{1:.5f},macro_rec:{2:.5f},macro_f1:{3:.5f},'
          'micro_precision:{4:.5f},micro_recall:{5:.5f},micro_f1:{6:.5f}\n'.format(
              avg_acc, macro_prec, macro_rec, macro_f1,
              micro_precision, micro_recall, micro_f1))

    path = join(settings.get_data_dir(exp_name), 'local', 'model-{}'.format(IDF_THRESHOLD), model_name)
    saver.save(sess, path)
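# pairwise_precision_recall_f1() is defined elsewhere in the repository. A minimal
# sketch of the standard pairwise metric it is assumed to compute: count paper pairs
# that the prediction places together versus the ground truth; the real helper may
# differ in implementation details.
from itertools import combinations

def pairwise_prf1_sketch(pred_labels, true_labels):
    tp = fp = fn = 0
    for i, j in combinations(range(len(true_labels)), 2):
        same_pred = pred_labels[i] == pred_labels[j]
        same_true = true_labels[i] == true_labels[j]
        if same_pred and same_true:
            tp += 1          # pair correctly placed in the same cluster
        elif same_pred:
            fp += 1          # pair wrongly merged
        elif same_true:
            fn += 1          # pair wrongly split
    prec = tp / (tp + fp) if tp + fp else 0.0
    rec = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    return tp, fp, fn, prec, rec, f1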
def load_test_names(dataset_name):
    _, TEST_NAME_LIST = settings.get_split_name_list(dataset_name)
    return TEST_NAME_LIST