def load_train_features(self):
    print("start to dump train features")
    features = {}
    out_feature_path = self.args['feature_train_path']
    for name in tqdm(self.train_author_data):
        # collect every paper id published under this author name
        pubs = []
        for authorid in self.train_author_data[name]:
            for pid in self.train_author_data[name][authorid]:
                pubs.append(pid)
        name_pubs_raw = {}
        for pid in pubs:
            name_pubs_raw[pid] = self.train_pub_data[pid]
        # save_relation() writes this name's semantic embeddings to gene/ptext_emb.pkl
        save_relation(name_pubs_raw, name)
        ptext_emb = load_data('gene', 'ptext_emb.pkl')
        for pid in pubs:
            features[pid] = ptext_emb[pid]
    train_dataframe = pd.DataFrame(features)
    train_dataframe.to_pickle(out_feature_path)
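# A minimal sketch of reading the dumped feature table back, assuming the same
# pandas version that wrote the pickle; `some_pid` is a hypothetical paper id,
# not part of this file's API:
#
#   feats = pd.read_pickle(args['feature_train_path'])  # columns are paper ids
#   emb = feats[some_pid].to_numpy()                    # one paper's semantic vector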
def load_train_local_features(self):
    print("start to dump train local features")
    local_features = {}
    out_feature_path = self.args['feature_local_train_path']
    for name in tqdm(self.train_author_data):
        # collect every paper id published under this author name
        pubs = []
        for authorid in self.train_author_data[name]:
            for pid in self.train_author_data[name][authorid]:
                pubs.append(pid)
        name_pubs_raw = {}
        for pid in pubs:
            name_pubs_raw[pid] = self.train_pub_data[pid]
        # save_relation() writes this name's relation graph files under gene/
        save_relation(name_pubs_raw, name)
        mpg = MetaPathGenerator()
        mpg.read_data("gene")
        rw_num = 10  # number of independent random-walk rounds
        cp = set()   # indices of papers that never appear in any walk
        for k in range(rw_num):
            # meta-path walks: 5 walks per node, each of length 20
            mpg.generate_WMRW("gene/RW.txt", 5, 20)
            sentences = word2vec.Text8Corpus(r'gene/RW.txt')
            # embed each paper id as a 128-dim node vector (gensim < 4.0 API)
            model = word2vec.Word2Vec(sentences, size=128, negative=25, min_count=1, window=10)
            for i, pid in enumerate(pubs):
                if pid in model:
                    emb = model[pid]
                else:
                    # paper has no relational edges, so it never entered a walk
                    cp.add(i)
                    emb = np.zeros(128)
                if pid not in local_features:
                    local_features[pid] = [emb]
                else:
                    local_features[pid].append(emb)
    # average each paper's rw_num embeddings into one local feature
    for pid in local_features:
        local_features[pid] = np.mean(np.array(local_features[pid]), axis=0)
    train_dataframe = pd.DataFrame(local_features)
    train_dataframe.to_pickle(out_feature_path)
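# Why average over rw_num rounds: each word2vec run on freshly generated walks
# is stochastic, so the mean of the 10 per-round vectors gives a lower-variance
# local feature. A self-contained illustration on hypothetical data (random
# stand-ins for model[pid], not the pipeline above):
#
#   import numpy as np
#   rounds = [np.random.rand(128) for _ in range(10)]  # one vector per round
#   local_feature = np.mean(np.array(rounds), axis=0)  # shape (128,)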
def train_val(self):
    result = {}
    for name in tqdm(self.val_author_data):
        # collect all of this author name's papers
        pubs = []
        for pid in self.val_author_data[name]:
            pubs.append(pid)
        name_pubs_raw = {}
        for pid in pubs:
            name_pubs_raw[pid] = self.val_pub_data[pid]
        # write this name's relation graph and text embeddings under gene/
        save_relation(name_pubs_raw, name)
        mpg = MetaPathGenerator()
        mpg.read_data("gene")
        all_embs = []
        rw_num = 10
        cp = set()  # indices of papers missing from every random walk
        # run rw_num rounds of meta-path random walks
        for k in range(rw_num):
            mpg.generate_WMRW("gene/RW.txt", 5, 20)
            sentences = word2vec.Text8Corpus(r'gene/RW.txt')
            # use word2vec to train the paper embeddings (gensim < 4.0 API)
            model = word2vec.Word2Vec(sentences, size=128, negative=25, min_count=1, window=10)
            embs = []
            for i, pid in enumerate(pubs):
                if pid in model:
                    embs.append(model[pid])
                else:
                    cp.add(i)
                    embs.append(np.zeros(128))
            all_embs.append(embs)
        all_embs = np.array(all_embs)

        # load the semantic features
        ptext_emb = load_data('gene', 'ptext_emb.pkl')  # superseded by the precomputed self.val_features below
        tcp = load_data('gene', 'tcp.pkl')  # indices of papers with no usable text
        tembs = []
        for pid in pubs:
            tembs.append(self.val_features[pid])

        # relational distance: cosine distances averaged over the rw_num walk rounds
        sk_sim = np.zeros((len(pubs), len(pubs)))
        for k in range(rw_num):
            sk_sim = sk_sim + pairwise_distances(all_embs[k], metric="cosine")
        sk_sim = sk_sim / rw_num

        # semantic distance: cosine distances between text embeddings
        tembs = pairwise_distances(tembs, metric="cosine")

        # weighted combination of the two distance matrices
        w = 0.25
        sim = (np.array(sk_sim) + w * np.array(tembs)) / (1 + w)

        # cluster papers on the precomputed distance matrix
        pre = DBSCAN(eps=0.2, min_samples=3, metric="precomputed").fit_predict(sim)
        pre = np.array(pre)

        # outlier paper set: DBSCAN noise plus papers missing walk or text features
        outlier = set()
        for i in range(len(pre)):
            if pre[i] == -1:
                outlier.add(i)
        for i in cp:
            outlier.add(i)
        for i in tcp:
            outlier.add(i)

        # threshold-based similarity matching to re-assign outliers
        paper_pair = generate_pair(pubs, outlier)
        paper_pair1 = paper_pair.copy()
        K = len(set(pre))
        for i in range(len(pre)):
            if i not in outlier:
                continue
            # attach the outlier to its most similar non-outlier paper
            j = np.argmax(paper_pair[i])
            while j in outlier:
                paper_pair[i][j] = -1
                j = np.argmax(paper_pair[i])
            if paper_pair[i][j] >= 1.5:
                pre[i] = pre[j]
            else:
                # not similar enough to anything: open a new cluster
                pre[i] = K
                K = K + 1
        # merge pairs of outliers that are highly similar to each other
        for ii, i in enumerate(outlier):
            for jj, j in enumerate(outlier):
                if jj <= ii:
                    continue
                if paper_pair1[i][j] >= 1.5:
                    pre[j] = pre[i]

        # group paper ids by cluster label
        result[name] = []
        for label in set(pre):
            oneauthor = []
            for idx, l in enumerate(pre):
                if label == l:
                    oneauthor.append(pubs[idx])
            result[name].append(oneauthor)
    with open(self.args['val_result'], 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4)
    f1 = f1_score(result, self.args)
    print("f1:", f1)
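# The clustering step combines the two cosine-distance matrices as
# sim = (sk_sim + w * tembs) / (1 + w) with w = 0.25, i.e. relational distance
# weighted four times as heavily as semantic distance, then runs DBSCAN with
# metric="precomputed". A standalone sketch on toy symmetric matrices
# (hypothetical data, not the validation set above):
#
#   import numpy as np
#   from sklearn.cluster import DBSCAN
#   n = 6
#   rel = np.random.rand(n, n); rel = (rel + rel.T) / 2; np.fill_diagonal(rel, 0)
#   sem = np.random.rand(n, n); sem = (sem + sem.T) / 2; np.fill_diagonal(sem, 0)
#   w = 0.25
#   dist = (rel + w * sem) / (1 + w)
#   labels = DBSCAN(eps=0.2, min_samples=3, metric="precomputed").fit_predict(dist)
#   # label -1 marks noise points, which train_val() treats as outliers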