def __init__(self, dtype='train', maxk=500):
    self.dtype = dtype
    self.seq_len = maxk
    self.maxk = maxk
    self.clusters = []
    # Convert pub_emb into clusters.
    # for author, author_dict in self.author_dict.items():
    #     for author_id, author_id_list in author_dict.items():
    #         for article in author_id_list:
    #             self.author.append([author, author_id, article])
    #
    # with open('./data/train_pub_new.json') as f:
    #     self.pub = json.loads(f.read())
    if dtype == 'train':
        pub_emb = load_json(rfdir='../data/', rfname='pub_emb.json')
        authors = load_json(rfdir='../data/', rfname='train_set_author.json')
        for author in authors:
            for nameid in authors[author]:
                doc_set = []
                for pid in authors[author][nameid]:
                    doc_set.append(pub_emb[pid])
                self.clusters.append(doc_set)
def prepare_data(self):
    self.name2pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json')  # for test
    self.name2pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    self.names_train = self.name2pubs_train.keys()
    print('names train', len(self.names_train))
    self.names_test = self.name2pubs_test.keys()
    print('names test', len(self.names_test))
    assert not set(self.names_train).intersection(set(self.names_test))
    for name in self.names_train:
        name_pubs_dict = self.name2pubs_train[name]
        for aid in name_pubs_dict:
            self.pids_train += name_pubs_dict[aid]
    random.shuffle(self.pids_train)
    self.n_pubs_train = len(self.pids_train)
    print('pubs2train', self.n_pubs_train)
    for name in self.names_test:
        name_pubs_dict = self.name2pubs_test[name]
        for aid in name_pubs_dict:
            self.pids_test += name_pubs_dict[aid]
    random.shuffle(self.pids_test)
    self.n_pubs_test = len(self.pids_test)
    print('pubs2test', self.n_pubs_test)
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = {}
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(train_dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(test_dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    for name in name_to_pubs_test:
        print('name', name)
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_test.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
    for name in name_to_pubs_train:
        print('name', name)
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            # print(len(name_data[aid]))
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input_train.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
def prepare_data(self):
    self.name2pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json')  # for test
    self.name2pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    self.names_train = self.name2pubs_train.keys()
    self.names_test = self.name2pubs_test.keys()
    assert not set(self.names_train).intersection(set(self.names_test))
    for authorName in self.names_train:
        self.genPAPandPSP(authorName=authorName, idf_threshold=IDF_THRESHOLD)
def dump_inter_emb():
    """
    dump hidden embedding via trained global model for local model to use
    """
    Res = defaultdict(list)
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    # print(name_to_pubs_test)
    for name in name_to_pubs_test:
        name_data = name_to_pubs_test[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
            Res[pid_].append(inter_embs[i])
    # the same as the train data
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json')
    for name in name_to_pubs_train:
        name_data = name_to_pubs_train[name]
        embs_input = []
        pids = []
        for i, aid in enumerate(name_data.keys()):
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])
def crop_to_size(dataset_type="STB", crop_size=320):
    """Crop the dataset to crop_size."""
    half_crop_size = crop_size / 2
    dataset_path = data_path + dataset_path_dict[dataset_type]
    image_names_json = dataset_json_dict[dataset_type][2]
    image_names = load_json(dataset_path + "/" + image_names_json)
    image_type = ".png" if dataset_type in ["RHD", "STB"] else ".jpg"
    save_dir = dataset_path + "_" + str(crop_size)
    begin = time.time()
    print("crop {}:{} to size {}".format(dataset_type, len(image_names), crop_size))
    for _, image_name in enumerate(image_names):
        img_raw = cv.imread(dataset_path + "/" + image_name + image_type)
        # img_raw = cv.cvtColor(img_raw, cv.COLOR_BGR2RGB)
        anno_infos = load_json(dataset_path + "/" + image_name + ".json")
        joints = np.array(anno_infos['hand_pts']).astype("float32")
        img_h, img_w, _ = img_raw.shape
        crop_center = joints[:, :2][12].astype(int)
        half_size = min(crop_center[0], crop_center[1], half_crop_size,
                        img_w - crop_center[0], img_h - crop_center[1])
        x0, y0 = (crop_center - half_size).astype(int)
        x1, y1 = (crop_center + half_size).astype(int)
        img_crop = img_raw[y0:y1, x0:x1]
        joints[:, :2] -= [x0, y0]
        scale = half_crop_size / half_size
        if scale > 1:
            img_crop = cv.resize(img_crop, (0, 0), fx=scale, fy=scale, interpolation=cv.INTER_CUBIC)
            joints[:, :2] *= scale
        if not os.path.exists(save_dir + "/" + image_name.split("/")[0]):
            print("Make dirs:", save_dir + "/" + image_name.split("/")[0])
            os.makedirs(save_dir + "/" + image_name.split("/")[0])
        cv.imwrite(save_dir + "/" + image_name + image_type, img_crop)
        anno_infos = {}
        anno_infos['img_name'] = image_name
        anno_infos['hand_pts'] = joints.tolist()
        store_json(save_dir + "/" + image_name + ".json", anno_infos)
    print("Done! Cost time:", time.time() - begin)
    store_json(save_dir + "/" + image_names_json, image_names)
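# A quick check of the crop arithmetic in crop_to_size: the half-size is clamped by the crop
# budget and by the distance from the hand center to every image border, and the crop is then
# rescaled back up to crop_size. Minimal sketch with made-up numbers (a hypothetical 640x480
# image and a hand center at (600, 100)); not part of the pipeline.
import numpy as np

img_h, img_w = 480, 640
crop_size = 320
half_crop_size = crop_size / 2
crop_center = np.array([600, 100])  # (x, y) of the reference joint

half_size = min(crop_center[0], crop_center[1], half_crop_size,
                img_w - crop_center[0], img_h - crop_center[1])
x0, y0 = (crop_center - half_size).astype(int)
x1, y1 = (crop_center + half_size).astype(int)
scale = half_crop_size / half_size

print(int(half_size), int(x0), int(y0), int(x1), int(y1), float(scale))
# 40 560 60 640 140 4.0 -> an 80x80 crop, upscaled by 4.0 to reach 320x320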
def gen_test(k=300, flatten=False):
    # Sample k documents per name from the test set.
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')  # test set: name -> aid -> pid-j
    xs, ys = [], []  # features and labels
    names = []
    for name in name_to_pubs_test:  # for each name, sample k documents with replacement
        names.append(name)  # add to the name list
        num_clusters = len(name_to_pubs_test[name])  # true number of clusters under this name
        x = []  # sampled document features x^- for this name
        items = []
        # for item in name_to_pubs_test[name]:  # document ids belonging to this name
        #     items.append(item)
        for c in name_to_pubs_test[name]:  # one person (entity) under this name
            for item in name_to_pubs_test[name][c]:  # document ids belonging to this person
                items.append(item)
        sampled_points = [items[p] for p in np.random.choice(len(items), k, replace=True)]  # sample k documents with replacement
        for p in sampled_points:
            if p in data_cache:  # cached
                x.append(data_cache[p])  # take feature x^- from the cache
            else:
                x.append(lc.get(p))  # otherwise fetch feature x^- from the LMDB store
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:  # this branch is the one taken here
            xs.append(np.stack(x))  # stack and append to xs
        ys.append(num_clusters)  # label: the actual number of clusters
    xs = np.stack(xs)  # xs = array of per-name (k, 100)-dim feature blocks (x^-)
    ys = np.stack(ys)  # ys = array([cluster size 1, cluster size 2, ...])
    return names, xs, ys  # names, document features (x^-), cluster sizes
def gen_test(dataset_name, k=300, flatten=False):
    name_to_pubs_test = {}
    _, TEST_NAME_LIST = settings.get_split_name_list(dataset_name)
    for case_name in TEST_NAME_LIST:
        name_to_pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(dataset_name), case_name), "assignments.json")
    # name_to_pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    xs, ys = [], []
    names = []
    for name in name_to_pubs_test:
        names.append(name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = []
        for c in name_to_pubs_test[name]:  # one person
            for item in name_to_pubs_test[name][c]:
                items.append(item)
        sampled_points = [items[p] for p in np.random.choice(len(items), k, replace=True)]
        for p in sampled_points:
            if p in data_cache:
                x.append(data_cache[p])
            else:
                x.append(lc.get(p))
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
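# The sampling step shared by the gen_test variants above, in isolation: k paper ids are drawn
# with replacement for each name and their embeddings are stacked into one (k, dim) block.
# Minimal sketch with fake 100-dimensional embeddings standing in for lc.get / data_cache.
import numpy as np

emb = {pid: np.random.rand(100).astype("float32") for pid in ["p1", "p2", "p3"]}
items = list(emb)  # paper ids under one name
k = 300

sampled_points = [items[p] for p in np.random.choice(len(items), k, replace=True)]
x = np.stack([emb[p] for p in sampled_points])
print(x.shape)  # (300, 100): one block per name, later stacked into xs of shape (n_names, k, dim)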
def run_rnn(dataset_name, k=300, seed=1106):
    name_to_pubs_train = {}
    TRAIN_NAME_LIST, _, _ = settings.get_split_name_list(dataset_name)
    for case_name in TRAIN_NAME_LIST:
        name_to_pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(dataset_name), case_name), "assignments.json")
    # name_to_pubs_train = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_train_500.json')
    test_names, test_x, test_y = gen_test(dataset_name, k)
    np.random.seed(seed)
    clusters = []
    for domain in name_to_pubs_train.values():
        for cluster in domain.values():
            clusters.append(cluster)
    for i, c in enumerate(clusters):
        if i % 100 == 0:
            print(i, len(c), len(clusters))
        for pid in c:
            data_cache[pid] = lc.get(pid)
    model = create_model()
    # print(model.summary())
    model.fit_generator(gen_train(clusters, k=300, batch_size=1000),
                        steps_per_epoch=100, epochs=1000,
                        validation_data=(test_x, test_y))
    kk = model.predict(test_x)
    wf = open(join(settings.get_out_dir(dataset_name), 'n_clusters_rnn.txt'), 'w')
    for i, name in enumerate(test_names):
        wf.write('{}\t{}\t{}\n'.format(name, test_y[i], kk[i][0]))
    wf.close()
def run_rnn(k=300, seed=1106, split=0.9):
    np.random.seed(seed)
    name_to_pubs = data_utils.load_json(settings.ASSIGNMENT_JSON)
    names = list(name_to_pubs.keys())
    num_train = int(len(names) * split)
    names_train = names[:num_train]
    name_to_pubs_test = dict((name, item) for name, item in name_to_pubs.items() if name not in names_train)
    clusters = []
    for name, pubs in name_to_pubs.items():
        if name not in names_train:
            continue
        clusters.extend(pubs)
    # for i, c in enumerate(clusters):
    #     if i % 100 == 0:
    #         print(i, len(c), len(clusters))
    #     for pid in c:
    #         v = lc.get(pid)
    #         if not v:
    #             data_cache[pid] = v
    # print(model.summary())
    model = create_model(k=k)
    test_names, test_x, test_y = gen_test(name_to_pubs_test, k=k)
    model.fit_generator(gen_train(clusters, k=k, batch_size=1000),
                        steps_per_epoch=100, epochs=1000,
                        validation_data=(test_x, test_y))
    kk = model.predict(test_x)
    wf = open(join(settings.CLUSTER_SIZE), 'w')
    for i, name in enumerate(test_names):
        wf.write('{}\t{}\t{}\n'.format(name, test_y[i], kk[i][0]))
    wf.close()
def dump_author_features_to_file():
    """
    Generate author features from raw publication data and dump them to file.
    Author features are defined by his/her paper attributes excluding the author's name.
    """
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')
    print('n_papers', len(pubs_dict))
    wf = codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'), 'w', encoding='utf-8')
    for i, pid in enumerate(pubs_dict):
        if i % 1000 == 0:
            print(i, datetime.now() - start_time)
        paper = pubs_dict[pid]
        if "title" not in paper or "authors" not in paper:
            continue
        if len(paper["authors"]) > 30:
            print(i, pid, len(paper["authors"]))
        if len(paper["authors"]) > 100:
            continue
        n_authors = len(paper.get('authors', []))
        for j in range(n_authors):
            author_feature = feature_utils.extract_author_features(paper, j)
            aid = '{}-{}'.format(pid, j)
            wf.write(aid + '\t' + ' '.join(author_feature) + '\n')
    wf.close()
def dump_test_emb():
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
def gen_test(k=300, flatten=False):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test.json')
    xs, ys = [], []
    names = []
    # print(name_to_pubs_test)
    for name in name_to_pubs_test:
        # print("name: ", name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = []
        for c in name_to_pubs_test[name]:  # one person
            for item in name_to_pubs_test[name][c]:
                if lc.get(item) is None:
                    continue
                items.append(item)
        if len(items) < k:
            continue
        names.append(name)  # only record names that actually contribute a sample, so names stays aligned with xs/ys
        sampled_points = [items[p] for p in np.random.choice(len(items), k, replace=True)]
        for p in sampled_points:
            x.append(lc.get(p))
        # print("name: ", name, "x: ", x)
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
def gen_test(k=300, flatten=False):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    xs, ys = [], []
    names = []
    for name in name_to_pubs_test:
        names.append(name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = []
        for c in name_to_pubs_test[name]:  # one person
            for item in name_to_pubs_test[name][c]:
                items.append(item)
        sampled_points = [items[p] for p in np.random.choice(len(items), k, replace=True)]
        for p in sampled_points:
            if p in data_cache:
                x.append(data_cache[p])
            else:
                x.append(lc.get(p))
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
def dump_author_features_to_file():  # extract author features and dump them to a file
    """
    Generate author features from raw publication data and dump them to file.
    Author features are defined by his/her paper attributes excluding the author's name.
    """
    pubs_dict = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'pubs_raw.json')  # raw data pubs_raw.json
    print('n_papers', len(pubs_dict))  # number of papers
    wf = codecs.open(join(settings.GLOBAL_DATA_DIR, 'author_features.txt'), 'w', encoding='utf-8')  # write features to author_features.txt
    for i, pid in enumerate(pubs_dict):  # enumerate papers: i = index, pid = paper id
        if i % 1000 == 0:
            print(i, datetime.now() - start_time)
        paper = pubs_dict[pid]  # metadata of this paper
        if "title" not in paper or "authors" not in paper:
            continue
        if len(paper["authors"]) > 30:  # number of co-authors
            print(i, pid, len(paper["authors"]))
        if len(paper["authors"]) > 100:
            continue
        n_authors = len(paper.get('authors', []))  # author count; dict.get(key, default=None) returns default if key is missing
        for j in range(n_authors):  # enumerate each author
            if 'id' not in paper['authors'][j]:
                continue
            author_feature = feature_utils.extract_author_features(paper, j)  # extract features of author j in this paper: __$f_name$_$word$
            aid = '{}-{}'.format(pid, j)  # aid: pid-j
            wf.write(aid + '\t' + ' '.join(author_feature) + '\n')  # write one line: aid\t author_feature\n
    wf.close()
def convert_to_TFRecords(dataset_list, state="train"):
    """Convert the datasets in the list to TFRecords."""
    dataset_list = dataset_list.split("/")
    tfrecords_filename = data_path + "TFRecords/" + state + "_of_" + "_".join(dataset_list) + "_num.tfrecords"
    writer = tf.python_io.TFRecordWriter(tfrecords_filename)
    print("Writing into ", tfrecords_filename)
    samples = 0
    begin = time.time()
    for _, dataset_type in enumerate(dataset_list):
        mid = time.time()
        print("Process dataset:", dataset_type)
        dataset_path = data_path + dataset_path_dict[dataset_type]
        image_names_json = dataset_json_dict[dataset_type][0] if state == "train" else dataset_json_dict[dataset_type][1]
        image_names = load_json(dataset_path + "/" + image_names_json)
        image_type = ".png" if dataset_type in ["RHD", "STB", "STB_320"] else ".jpg"
        for _, image_name in enumerate(image_names):
            write_one_example(dataset_path, image_name, image_type, writer)
            samples += 1
        print("Cost time:", time.time() - mid)
    writer.close()
    os.rename(tfrecords_filename, tfrecords_filename.replace("num", str(samples)))
    print("ALL Done! Cost total time:", time.time() - begin)
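# write_one_example is defined elsewhere in this module; the sketch below only illustrates what
# such a serializer could look like with the tf.train.Example API used by the writer above.
# The feature keys ("image", "hand_pts", "name") are assumptions, not the repo's actual schema.
import numpy as np
import tensorflow as tf

def write_one_example_sketch(dataset_path, image_name, image_type, writer):
    """Hypothetical serializer: pack one image and its keypoints into a tf.train.Example."""
    with open(dataset_path + "/" + image_name + image_type, "rb") as f:
        img_bytes = f.read()
    anno = load_json(dataset_path + "/" + image_name + ".json")  # per-image JSON as written by crop_to_size
    joints = np.array(anno["hand_pts"], dtype=np.float32).reshape(-1)
    example = tf.train.Example(features=tf.train.Features(feature={
        "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_bytes])),
        "hand_pts": tf.train.Feature(float_list=tf.train.FloatList(value=joints.tolist())),
        "name": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_name.encode("utf-8")])),
    }))
    writer.write(example.SerializeToString())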
def test_prepare_local_data(Name):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    # cnt = 0
    wf_contents = []
    for i, name in enumerate(name_to_pubs_test):
        if name != Name:
            continue
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}
        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            # if len(items) < 5:
            #     continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)
                wf_contents.append({'pid': pid, 'label': pids2label[pid]})
                # cur_pub_emb = list(map(str, cur_pub_emb))
                # wf_content.write('{}\t'.format(pid))
                # wf_content.write('\t'.join(cur_pub_emb))
                # wf_content.write('\t{}\n'.format(pids2label[pid]))
    PidsLabels = [x['label'] for x in wf_contents]
    print(len(set(PidsLabels)))
def gen_sna(k=300):
    name_to_pubs_test = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    xs = []
    names = []
    for name in name_to_pubs_test:
        x = []
        items = []
        for pid in name_to_pubs_test[name]:
            if lc.get(pid) is not None:
                items.append(pid)
        if len(items) == 0:
            continue
        names.append(name)  # only record names with at least one embedded paper, so names stays aligned with xs
        sampled_points = [items[p] for p in np.random.choice(len(items), k, replace=True)]
        for p in sampled_points:
            emb = lc.get(p)
            # print("emb: len :", len(emb))
            x.append(emb)
        xs.append(np.stack(x))
    xs = np.stack(xs)
    return names, xs
def run_rnn(k=300, seed=1106):
    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json')
    test_names, test_x, test_y = gen_test(k)
    np.random.seed(seed)
    clusters = []
    for domain in name_to_pubs_train.values():
        for cluster in domain.values():
            clusters.append(cluster)
    for i, c in enumerate(clusters):
        if i % 100 == 0:
            print(i, len(c), len(clusters))
        for pid in c:
            data_cache[pid] = lc.get(pid)
    model = create_model()
    # print(model.summary())
    model.fit_generator(gen_train(clusters, k=300, batch_size=1000),
                        steps_per_epoch=100, epochs=1000,
                        validation_data=(test_x, test_y))
    kk = model.predict(test_x)
    wf = open(join(settings.OUT_DIR, 'n_clusters_rnn.txt'), 'w')
    for i, name in enumerate(test_names):
        wf.write('{}\t{}\t{}\n'.format(name, test_y[i], kk[i][0]))
    wf.close()
def check_labeled_zfj():
    pairs = data_utils.load_json(settings.AFF_DATA_DIR, "mag_aminer_hard_correct_zfj_copy.json")
    n_label_zfj = 0
    for pair in pairs:
        if pair["label_zfj"]:
            n_label_zfj += 1
    print("labeled until now", n_label_zfj)
def load_id2papers(self, fold):
    if os.path.isfile(join(self.pairs_dir, 'clean-id2paper-test-{}.json'.format(fold))):
        return data_utils.load_json(self.paper_dir, 'clean-id2paper-test-{}.json'.format(fold))
    else:
        return self.gen_id2papers(fold)
def test(idf_threshold):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local', 'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}
        # 286 hongbin_li_pubs_content.txt
        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)
        # generate network1
        all_idf_sum = 0
        pathCnt = 0
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        for i in range(n_pubs - 1):
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                all_idf_sum += idf_sum
                if idf_sum >= idf_threshold:
                    pathCnt = pathCnt + 1
        if name == "kexin_xu":
            print("all_idf_sum: ", all_idf_sum)
            print("pathCnt: ", pathCnt)
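# The pairwise score in test() is the sum of IDF weights over the author features two papers
# share, with idf_threshold serving both as the fallback weight and as the edge cutoff.
# Tiny self-contained check with made-up feature tokens and weights (not the real idf table).
idf = {"__name__li_wei": 0.9, "__org__tsinghua": 0.4, "__venue__kdd": 0.1}
idf_threshold = 0.2

author_feature1 = {"__name__li_wei", "__org__tsinghua", "__venue__kdd"}
author_feature2 = {"__name__li_wei", "__venue__kdd", "__org__pku"}

common_features = author_feature1.intersection(author_feature2)
idf_sum = sum(idf.get(f, idf_threshold) for f in common_features)  # 0.9 + 0.1 = 1.0
print(idf_sum, idf_sum >= idf_threshold)  # 1.0 True -> this pair counts toward pathCnt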
def dump_pub_features_to_file():
    """
    Generate author features from raw publication data and dump them to file.
    Author features are defined by his/her paper attributes excluding the author's name.
    """
    global _pubs_dict
    # Load publication records
    _pubs_dict = data_utils.load_json('./OAG_WhoIsWho_data', 'your_pub_file_name')
    res = multithread_utils.processed_by_multi_thread(get_pub_feature, range(len(_pubs_dict)))
    data_utils.dump_data(res, "Essential_Embeddings/", "pub.features")
def getPids():
    name2pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json')  # for test
    cntpapers = []
    for name in name2pubs_train:
        papers = name2pubs_train[name]
        for aid in papers:
            if len(papers[aid]) < 5:
                continue
            for pid in papers[aid]:
                cntpapers.append(pid)
    return cntpapers
def prepare_data(self):
    self.name2pubs_train = {}
    # self.name2pubs_val = {}
    self.name2pubs_test = {}
    TRAIN_NAME_LIST, TEST_NAME_LIST = settings.get_split_name_list(self.dataset_name)
    for case_name in TRAIN_NAME_LIST:
        self.name2pubs_train[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(self.dataset_name), case_name), "assignments.json")
    # for case_name in VAL_NAME_LIST:
    #     self.name2pubs_val[case_name] = data_utils.load_json(
    #         join(settings.get_raw_data_dir(self.dataset_name), case_name), "assignments.json")
    for case_name in TEST_NAME_LIST:
        self.name2pubs_test[case_name] = data_utils.load_json(
            join(settings.get_raw_data_dir(self.dataset_name), case_name), "assignments.json")
    # self.name2pubs_train = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_train_500.json')  # for test
    # self.name2pubs_test = data_utils.load_json(settings.get_global_data_dir(dataset_name), 'name_to_pubs_test_100.json')
    # self.names_train = self.name2pubs_train.keys()
    # print('names train', len(self.names_train))
    # self.names_test = self.name2pubs_test.keys()
    # print('names test', len(self.names_test))
    self.names_train, self.names_test = settings.get_split_name_list(self.dataset_name)
    assert not set(self.names_train).intersection(set(self.names_test))
    # assert not set(self.names_train).intersection(set(self.names_val))
    # assert not set(self.names_val).intersection(set(self.names_test))
    for name in self.names_train:
        name_pubs_dict = self.name2pubs_train[name]
        for aid in name_pubs_dict:
            self.pids_train += name_pubs_dict[aid]
    random.shuffle(self.pids_train)
    self.n_pubs_train = len(self.pids_train)
    print('pubs2train', self.n_pubs_train)
    for name in self.names_test:
        name_pubs_dict = self.name2pubs_test[name]
        for aid in name_pubs_dict:
            self.pids_test += name_pubs_dict[aid]
    random.shuffle(self.pids_test)
    self.n_pubs_test = len(self.pids_test)
    print('pubs2test', self.n_pubs_test)
def prepro_tacos(configs):
    if not os.path.exists(configs.save_dir):
        os.makedirs(configs.save_dir)
    # train/test data format: (video_id, start_time, end_time, duration, words)
    train_data, val_data, test_data = read_tacos_data(configs.root, configs.max_position_length)
    # load features and sample feature shapes if possible
    features_path = os.path.join(configs.root, "tacos_features_{}/feature_shapes.json".format(configs.feature))
    feature_shapes = dict()
    for vid, length in load_json(features_path).items():
        if configs.max_position_length is not None and length > configs.max_position_length:
            length = configs.max_position_length
        feature_shapes[vid] = length
    # generate token dicts and load pre-trained vectors
    word_counter, char_counter = Counter(), Counter()
    for data in [train_data, val_data, test_data]:
        for record in data:
            words = record[-1]
            for word in words:
                word_counter[word] += 1
                for char in list(word):
                    char_counter[char] += 1
    word_dict, char_dict, word_vectors = create_vocabularies(configs, word_counter, char_counter)
    # generate datasets
    train_set = generate_dataset(train_data, feature_shapes, word_dict, char_dict, "train")
    val_set = generate_dataset(val_data, feature_shapes, word_dict, char_dict, "val")
    test_set = generate_dataset(test_data, feature_shapes, word_dict, char_dict, "test")
    # save to directory
    write_json(word_dict, save_path=os.path.join(configs.save_dir, "word_dict.json"))
    write_json(char_dict, save_path=os.path.join(configs.save_dir, "char_dict.json"))
    np.savez_compressed(os.path.join(configs.save_dir, "word_vectors.npz"), vectors=word_vectors)
    write_json(train_set, save_path=os.path.join(configs.save_dir, "train_set.json"))
    write_json(val_set, save_path=os.path.join(configs.save_dir, "val_set.json"))
    write_json(test_set, save_path=os.path.join(configs.save_dir, "test_set.json"))
def load_aff_data():
    file_dir = settings.AFF_DATA_DIR
    pos_pairs = data_utils.load_json(file_dir, "label_data_aff_zhoushao.json")[:600]
    pos_pairs = [({"name": p["affiliation"]}, {"DisplayName": p["label"]})
                 for p in pos_pairs if p["label"] != "[NIF]"]
    neg_pairs = data_utils.load_json(file_dir, 'train_negative_affi_clean.json')[:600]
    neg_pairs = [(p['aminer_affi'], p['mag_affi']) for p in neg_pairs]
    pairs_add = data_utils.load_json(file_dir, "mag_aminer_hard_correct_zfj_copy.json")
    print("add pairs", len(pairs_add))
    pos_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add if p["label_zfj"] == "1"]
    neg_pairs += [(p['aminer_affi'], p['mag_affi']) for p in pairs_add if p["label_zfj"] == "0"]
    pos_pairs = pos_pairs[-len(neg_pairs):]
    labels = [1] * len(pos_pairs) + [0] * len(neg_pairs)
    pairs = pos_pairs + neg_pairs  # keeping the labels balanced is important
    return pairs, labels
def json2dataframe(rfpath, wfpath):
    pubs = load_json(rfpath=rfpath)
    names = []
    values = []
    for k, v in pubs.items():
        names.extend([k] * len(v))
        values.extend(v)
    values = json_normalize(values)
    values['name'] = names
    pubs = values
    pubs['org'] = pubs.authors.map(lambda authors: [a['org'] for a in authors])
    pubs['authors'] = pubs.authors.map(lambda authors: [a['name'] for a in authors])
    pubs.to_parquet(wfpath, engine='fastparquet')
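# What the flattening in json2dataframe produces, shown on two made-up records shaped like the
# per-name publication lists (pd.json_normalize is the current pandas name for the
# json_normalize helper used above).
import pandas as pd

values = [
    {"id": "p1", "title": "A", "authors": [{"name": "Li Wei", "org": "THU"}]},
    {"id": "p2", "title": "B", "authors": [{"name": "Wei Li", "org": "PKU"}]},
]
df = pd.json_normalize(values)
df["name"] = ["li_wei", "li_wei"]  # one row per paper, tagged with the ambiguous name
df["org"] = df.authors.map(lambda authors: [a["org"] for a in authors])
df["authors"] = df.authors.map(lambda authors: [a["name"] for a in authors])
print(df[["name", "title", "authors", "org"]])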
def prepare_corpus(self):
    train_corpus_analyzed = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    train_corpus = data_utils.load_json(self.train_data_dir, self.train_data_fname)
    print('training documents loaded')
    print('documents number: {}'.format(len(train_corpus)))
    for i, text in enumerate(train_corpus):
        if i % 10000 == 0:
            print(i)
        words = data_utils.get_words(text)
        tags = [i]
        train_corpus_analyzed.append(analyzedDocument(words=words, tags=tags))
        # if i > 100000:
        #     break
    return train_corpus_analyzed
def filter_aff_neg_pairs():
    neg_pairs = data_utils.load_json(settings.AFF_DATA_DIR, 'train_negative_affi.json')
    neg_pairs_cleaned = []
    for i, pair in enumerate(neg_pairs):
        if i % 100 == 0:
            print("pair", i)
        mag_aff = pair["mag_affi"]
        aminer_aff = pair["aminer_affi"]
        aff1 = mag_aff["NormalizedName"].split()
        aff2 = aminer_aff["main_body"].split()
        common = set(aff1).intersection(aff2)
        if len(common) > 1:
            neg_pairs_cleaned.append(pair)
    print("after cleaned", len(neg_pairs_cleaned))
    data_utils.dump_json(neg_pairs_cleaned, settings.AFF_DATA_DIR, "train_negative_affi_clean.json")
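# filter_aff_neg_pairs keeps a negative pair only when the two normalized affiliation strings
# share more than one token; a small self-contained check with made-up strings
# (the real fields are mag_affi["NormalizedName"] and aminer_affi["main_body"]).
mag_name = "tsinghua university department of computer science"
aminer_body = "department of computer science university of hong kong"

common = set(mag_name.split()).intersection(aminer_body.split())
print(len(common), len(common) > 1)  # 5 True -> the pair is kept as a hard negative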