def dump_inter_emb():
    """Dump hidden-layer embeddings from the trained global model for the local model to use.

    Loads the weighted word2vec author embeddings (x) for both the train and
    test datasets, pushes them through the trained global triplet model, and
    writes the resulting hidden-layer ("inter") embeddings keyed by pid into
    the shared `author_triplets.emb` LMDB.

    Side effects only (LMDB writes); returns None.
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input_train = LMDBClient(train_dataset_name, LMDB_NAME)
    lc_input_test = LMDBClient(test_dataset_name, LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(exp_name, INTER_LMDB_NAME)
    global_model = GlobalTripletModel(train_dataset_name, data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    TRAIN_NAME_LIST, _ = settings.get_split_name_list(train_dataset_name)
    _, TEST_NAME_LIST = settings.get_split_name_list(test_dataset_name)
    name_to_pubs_train = {
        case_name: data_utils.load_json(
            join(settings.get_raw_data_dir(train_dataset_name), case_name),
            "assignments.json")
        for case_name in TRAIN_NAME_LIST
    }
    name_to_pubs_test = {
        case_name: data_utils.load_json(
            join(settings.get_raw_data_dir(test_dataset_name), case_name),
            "assignments.json")
        for case_name in TEST_NAME_LIST
    }

    def _process_split(name_to_pubs, lc_input):
        # Shared pipeline for one split: gather x-embeddings, transform, persist y.
        for name in name_to_pubs:
            print('name', name)
            name_data = name_to_pubs[name]
            embs_input = []
            pids = []
            for aid in name_data.keys():
                if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                    continue
                for pid in name_data[aid]:
                    cur_emb = lc_input.get(pid)
                    if cur_emb is None:
                        continue
                    embs_input.append(cur_emb)
                    pids.append(pid)
            if not embs_input:
                # np.stack raises ValueError on an empty sequence; nothing to write.
                continue
            embs_input = np.stack(embs_input)
            inter_embs = get_hidden_output(trained_global_model, embs_input)
            for i, pid_ in enumerate(pids):
                lc_inter.set(pid_, inter_embs[i])

    _process_split(name_to_pubs_test, lc_input_test)
    _process_split(name_to_pubs_train, lc_input_train)
def dump_inter_emb():
    """Dump hidden-layer embeddings from the trained global model for the local model to use.

    Reads the weighted word2vec author embeddings (x), runs them through the
    trained global triplet model, and stores the hidden-layer output (y) per
    pid into the `author_triplets.emb` LMDB, for both the test and train splits.

    Side effects only (LMDB writes); returns None.
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()

    def _process_split(name_to_pubs):
        # One shared pipeline for both splits: gather x-embeddings, transform, persist y.
        for name in name_to_pubs:
            name_data = name_to_pubs[name]
            embs_input = []
            pids = []
            for aid in name_data.keys():
                if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                    continue
                for pid in name_data[aid]:
                    cur_emb = lc_input.get(pid)
                    if cur_emb is None:
                        continue
                    embs_input.append(cur_emb)
                    pids.append(pid)
            if not embs_input:
                # np.stack raises ValueError on an empty sequence; skip this name.
                continue
            embs_input = np.stack(embs_input)
            inter_embs = get_hidden_output(trained_global_model, embs_input)
            for i, pid_ in enumerate(pids):
                lc_inter.set(pid_, inter_embs[i])

    # NOTE: the original also filled a local `Res = defaultdict(list)` during the
    # test pass, but it was never read or returned — dead code, removed.
    _process_split(data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json'))
    # the same as the train data
    _process_split(data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train_500.json'))
def dump_test_emb():
    """Dump hidden-layer embeddings for the SNA validation (test) papers.

    For each candidate author name in `sna_valid_author_raw.json` (except the
    hard-coded "j_yu"), looks up each paper's input embedding, transforms it
    with the trained global triplet model, and writes the hidden-layer output
    per pid into the `author_triplets.emb` LMDB.

    Side effects only (LMDB writes); returns None.
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)
    trained_global_model = global_model.load_triplets_model()
    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":  # skipped in the original pipeline — kept for parity
            continue
        print("name: ", name)
        checkPids = sna_valid_author_raw[name]
        embs_input = []
        pids = []
        for pid in checkPids:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        if not embs_input:
            # np.stack raises ValueError on an empty sequence; nothing to write.
            continue
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid in enumerate(pids):
            lc_inter.set(pid, inter_embs[i])
def generate_global_emb(self):
    """Generate global embeddings for every pid in the word-vector LMDB.

    Iterates all (pid, embedding) pairs in the word-vector DB, skips entries
    containing NaN, runs the remainder through `self.model`'s hidden layer,
    and writes the results to the global-vector DB. NaN entries are written
    as None so downstream readers see an explicit marker.

    Side effects only (LMDB writes); returns None.
    """
    wv_cl = LMDBClient(settings.LMDB_WORDVEC)
    gb_cl = LMDBClient(settings.LMDB_GLOBALVEC)
    values = []
    nan_pids = []
    pids = []
    with wv_cl.db.begin() as txn:
        for pid, value in txn.cursor():
            value = deserialize_embedding(value)
            if np.isnan(value).any():
                # Corrupt/NaN inputs would poison the model output; record and skip.
                nan_pids.append(pid.decode())
                continue
            pids.append(pid.decode())
            values.append(value)
    if values:
        # Guard: np.stack raises ValueError when the DB is empty or all-NaN.
        values = np.stack(values)
        inter_embs = eval_utils.get_hidden_output(self.model, values)
        for i, pid in enumerate(pids):
            gb_cl.set(pid, inter_embs[i])
    for pid in nan_pids:
        gb_cl.set(pid, None)
    print('generate global emb done!')
def dump_inter_emb(pids):
    # Pull the hidden layer out of the trained global model for the local model to use.
    """ dump hidden embedding via trained global model for local model to use """
    # LMDB of author features processed by Word2Vec into 100-dim vectors and
    # weighted-averaged into embeddings (x^-), keyed as (pid-j, x^-).
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'  # (pid-j, y)
    # Inner-embedding DB: the new embeddings y for the given papers are written here.
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=1000000)  # instantiate a global model
    trained_global_model = global_model.load_triplets_model()  # load the trained global model
    embs_input = []
    for pid in pids:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            print("ERROR: not found embedding x for pid:%s\n" % (pid))
            continue
        embs_input.append(cur_emb)
    embs_input = np.stack(embs_input)
    inter_embs = get_hidden_output(trained_global_model, embs_input)
    # NOTE(review): pids with a missing embedding are skipped above, but this loop
    # enumerates ALL of `pids` — if any were skipped, inter_embs[i] no longer
    # aligns with pid. Confirm whether missing pids can actually occur here.
    for i, pid in enumerate(pids):
        lc_inter.set(pid, inter_embs[i])
    ''' name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test_100.json') #加载 测试集 name->aid->pid-j
def dump_emb():
    """Dump hidden-layer embeddings for train, test, and SNA-validation papers.

    For each split, collects the weighted word2vec input embeddings (x) per
    paper, transforms them through the trained global triplet model, and
    persists the hidden-layer output (y) per pid into `author_triplets.emb`.
    The hard-coded skip names ("roger_williams", "j_yu") are preserved from
    the original pipeline.

    Side effects only (LMDB writes); returns None.
    """
    LMDB_NAME = "author_100.emb.weighted"
    lc_input = LMDBClient(LMDB_NAME)
    INTER_LMDB_NAME = 'author_triplets.emb'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    global_model = GlobalTripletModel(data_scale=10000000)
    trained_global_model = global_model.load_triplets_model()

    def _store(embs_input, pids):
        # Shared tail: run collected x-embeddings through the model and persist y.
        if not embs_input:
            return  # guard: np.stack raises ValueError on an empty sequence
        embs_input = np.stack(embs_input)
        inter_embs = get_hidden_output(trained_global_model, embs_input)
        for i, pid_ in enumerate(pids):
            lc_inter.set(pid_, inter_embs[i])

    def _collect(name_data, verbose):
        # Gather (embedding, pid) pairs for one author name's candidate clusters.
        embs_input, pids = [], []
        for aid in name_data.keys():
            if len(name_data[aid]) < 5:  # n_pubs of current author is too small
                continue
            for pid in name_data[aid]:
                cur_emb = lc_input.get(pid)
                if cur_emb is None:
                    if verbose:
                        print("pid emb is null: ", pid)
                    continue
                embs_input.append(cur_emb)
                pids.append(pid)
        return embs_input, pids

    name_to_pubs_train = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_train.json')
    for name in name_to_pubs_train:
        if name == "roger_williams":
            continue
        embs_input, pids = _collect(name_to_pubs_train[name], verbose=False)
        _store(embs_input, pids)

    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR, 'name_to_pubs_test.json')
    for name in name_to_pubs_test:
        if name == "roger_williams" or name == "j_yu":
            continue
        print('name', name)
        embs_input, pids = _collect(name_to_pubs_test[name], verbose=True)
        _store(embs_input, pids)

    sna_valid_author_raw = data_utils.load_json(settings.SNA_PUB_DIR, 'sna_valid_author_raw.json')
    for name in sna_valid_author_raw.keys():
        if name == "j_yu":
            continue
        print("name: ", name)
        embs_input, pids = [], []
        for pid in sna_valid_author_raw[name]:
            cur_emb = lc_input.get(pid)
            if cur_emb is None:
                continue
            embs_input.append(cur_emb)
            pids.append(pid)
        _store(embs_input, pids)
# NOTE(review): fragment — the enclosing function (which defines name_data,
# lc_input, trained_global_model, res_embs, encode_labels and name) lies
# outside this view; documented in place only.
embs_input = []
labels = []
pids = []
for i, aid in enumerate(name_data.keys()):
    if len(name_data[aid]) < 5:  # n_pubs of current author is too small
        continue
    for pid in name_data[aid]:
        cur_emb = lc_input.get(pid)
        if cur_emb is None:
            continue
        embs_input.append(cur_emb)
        pids.append(pid)
        labels.append(aid)  # cluster label = author id owning this paper
# NOTE(review): np.stack raises if embs_input is empty — presumably the caller
# guarantees at least one embedding exists; confirm.
embs_input = np.stack(embs_input)
inter_embs = get_hidden_output(trained_global_model, embs_input)
labels = encode_labels(labels)
for i, pid_ in enumerate(pids):
    res_embs.append(inter_embs[i])
# Clustering and save the result
tSNEAnanlyse(res_embs, labels, join(settings.PIC_DIR, "OnlyTriplete", "rawReature_%s_triplet.png" % (name)))
tSNEAnanlyse(embs_input, labels, join(settings.PIC_DIR, "OnlyTriplete", "rawReature_%s_features.png" % (name)))