def do_DBCAN_eval(self):
    """Grid-search the fusion weight ``w`` on the validation set and print
    the pairwise F1 obtained for each setting.

    For every ambiguous author name, local and global paper features are
    turned into cosine-distance matrices, fused as
    ``(local + w * global) / (1 + w)``, and clustered with agglomerative
    clustering (the true cluster count is taken from the labels).  Outlier
    papers (all-zero local features) are then reattached by threshold-based
    pair similarity.  Results are only printed, never persisted.
    """
    # Hoisted out of the (w, eps) grid: the distance matrices and the
    # zero-feature index sets depend only on the author name, so computing
    # them inside the grid loops (as before) redid identical O(n^2) work
    # for every single (w, eps) combination.
    per_name = {}
    for name in self.val_author_data:
        pubs = [clusters for clusters in self.val_author_data[name]]
        local_features = []
        global_features = []
        cp = set()  # indices of papers whose local feature vector is all-zero
        for i, pid in enumerate(pubs):
            if np.sum(self.val_local_features[pid]) == 0:
                cp.add(i)
            local_features.append(self.val_local_features[pid])
            global_features.append(self.val_global_features[pid])
        local_dist = pairwise_distances(np.array(local_features), metric="cosine")
        global_dist = pairwise_distances(np.array(global_features), metric="cosine")
        per_name[name] = (pubs, cp, local_dist, global_dist)

    for w in np.arange(0.8, 1.1, 0.01):
        # NOTE(review): np.arange(0, 1, 1) yields exactly [0.0], and eps is
        # unused because the DBSCAN call was replaced by agglomerative
        # clustering.  The loop is kept only so the printed output (which
        # includes eps) stays identical; it is a vestige of the old search.
        for eps in np.arange(0, 1, 1):
            result = {}
            for name, (pubs, cp, local_dist, global_dist) in per_name.items():
                # Ground-truth number of distinct authors for this name.
                real_counter = len(self.val_labels_data[name])
                # Fuse relational (local) and semantic (global) distances.
                sim = (local_dist + w * global_dist) / (1 + w)
                pre = AgglomerativeClustering(
                    n_clusters=real_counter,
                    affinity="precomputed",
                    linkage="average").fit_predict(sim)
                pre = np.array(pre)
                # Outliers: noise labels (-1 — none are produced by
                # agglomerative clustering, kept for parity with the DBSCAN
                # variants) plus papers with all-zero local features.
                outlier = set(i for i in range(len(pre)) if pre[i] == -1)
                for i in cp:
                    outlier.add(i)
                # Threshold-based similarity matching for outlier papers.
                paper_pair = generate_pair(pubs, outlier)
                # NOTE(review): assumes generate_pair returns an ndarray, so
                # .copy() is a deep snapshot — verify; a list of lists would
                # be mutated through the shallow copy below.
                paper_pair1 = paper_pair.copy()
                K = len(set(pre))
                for i in range(len(pre)):
                    if i not in outlier:
                        continue
                    # Most similar paper that is itself not an outlier.
                    j = np.argmax(paper_pair[i])
                    while j in outlier:
                        paper_pair[i][j] = -1
                        j = np.argmax(paper_pair[i])
                    if paper_pair[i][j] >= 1.5:
                        pre[i] = pre[j]   # attach to its nearest cluster
                    else:
                        pre[i] = K        # open a fresh singleton cluster
                        K = K + 1
                # Merge outlier pairs whose original similarity clears the
                # threshold (uses the unmutated snapshot).
                for ii, i in enumerate(outlier):
                    for jj, j in enumerate(outlier):
                        if jj <= ii:
                            continue
                        else:
                            if paper_pair1[i][j] >= 1.5:
                                pre[j] = pre[i]
                result[name] = []
                for i in set(pre):
                    oneauthor = []
                    for idx, j in enumerate(pre):
                        if i == j:
                            oneauthor.append(pubs[idx])
                    result[name].append(oneauthor)
            f1 = f1_score(result, self.args)
            print("w:%2f eps:%.2f f1:%.4f " % (w, eps, f1))
def train_val(self):
    """Cluster the validation-set papers of every ambiguous author name and
    report pairwise F1.

    Pipeline per name: build a heterogeneous graph via save_relation /
    MetaPathGenerator, embed papers with word2vec over repeated random
    walks, fuse the walk-based distances with semantic feature distances,
    cluster with DBSCAN, then reassign outlier papers by threshold-based
    pair similarity.  The full clustering is dumped to
    self.args['val_result'] as JSON.

    Side effects: writes 'gene/RW.txt' repeatedly, reads pickles from
    'gene/', writes the result JSON, prints the F1 score.
    """
    result = {}
    for n, name in tqdm(enumerate(self.val_author_data)):
        pubs = []
        # Collect all of this author's paper ids.
        for clusters in self.val_author_data[name]:
            pubs.append(clusters)
        #print(pubs)
        name_pubs_raw = {}
        for i, pid in enumerate(pubs):
            name_pubs_raw[pid] = self.val_pub_data[pid]
        # Build the relation graph for this name (written under 'gene/').
        save_relation(name_pubs_raw, name)
        mpg = MetaPathGenerator()
        mpg.read_data("gene")
        all_embs = []
        rw_num = 10  # number of independent random-walk/word2vec rounds
        cp = set()   # indices of papers that never appeared in any walk
        # Run the meta-path random walks and train paper embeddings.
        for k in range(rw_num):
            mpg.generate_WMRW("gene/RW.txt", 5, 20)
            sentences = word2vec.Text8Corpus(r'gene/RW.txt')
            # Train word2vec on the walk corpus; paper ids act as "words".
            # NOTE(review): 'size=' / 'pid in model' / 'model[pid]' is the
            # pre-4.0 gensim API — confirm the pinned gensim version.
            model = word2vec.Word2Vec(sentences, size=128, negative=25, min_count=1, window=10)
            embs = []
            for i, pid in enumerate(pubs):
                if pid in model:
                    embs.append(model[pid])
                else:
                    # Isolated paper: no walk visited it this round.
                    cp.add(i)
                    embs.append(np.zeros(128))
            all_embs.append(embs)
    all_embs = np.array(all_embs)
        # Load the semantic features.
        # NOTE(review): ptext_emb is loaded but unused (its use is commented
        # out below) — self.val_features is used instead; tcp marks papers
        # whose text features are unusable.
        ptext_emb = load_data('gene', 'ptext_emb.pkl')
        tcp = load_data('gene', 'tcp.pkl')
        tembs = []
        for i, pid in enumerate(pubs):
            #tembs.append (ptext_emb[pid])
            tembs.append(self.val_features[pid])
        # Average cosine distance of the walk embeddings over all rounds.
        sk_sim = np.zeros((len(pubs), len(pubs)))
        for k in range(rw_num):
            sk_sim = sk_sim + pairwise_distances(all_embs[k], metric="cosine")
        sk_sim = sk_sim / rw_num
        # Cosine distance of the semantic features.
        tembs = pairwise_distances(tembs, metric="cosine")
        w = 0.25  # fusion weight of semantic vs. relational distance
        sim = (np.array(sk_sim) + w * np.array(tembs)) / (1 + w)
        pre = DBSCAN(eps=0.2, min_samples=3, metric="precomputed").fit_predict(sim)
        pre = np.array(pre)
        # Outlier papers: DBSCAN noise (-1), papers missing from the walks,
        # and papers with unusable text features.
        outlier = set()
        for i in range(len(pre)):
            if pre[i] == -1:
                outlier.add(i)
        for i in cp:
            outlier.add(i)
        for i in tcp:
            outlier.add(i)
        # Threshold-based similarity matching for outlier papers.
        paper_pair = generate_pair(pubs, outlier)
        paper_pair1 = paper_pair.copy()
        K = len(set(pre))
        for i in range(len(pre)):
            if i not in outlier:
                continue
            # Most similar paper that is itself not an outlier.
            j = np.argmax(paper_pair[i])
            while j in outlier:
                paper_pair[i][j] = -1
                j = np.argmax(paper_pair[i])
            if paper_pair[i][j] >= 1.5:
                pre[i] = pre[j]  # attach to its nearest cluster
            else:
                pre[i] = K       # open a fresh singleton cluster
                K = K + 1
        # Merge outlier pairs whose original similarity clears the threshold
        # (uses the unmutated snapshot paper_pair1).
        for ii, i in enumerate(outlier):
            for jj, j in enumerate(outlier):
                if jj <= ii:
                    continue
                else:
                    if paper_pair1[i][j] >= 1.5:
                        pre[j] = pre[i]
        #print (pre, len (set (pre)))
        # Convert cluster labels into lists of paper ids per author.
        result[name] = []
        for i in set(pre):
            oneauthor = []
            for idx, j in enumerate(pre):
                if i == j:
                    oneauthor.append(pubs[idx])
            result[name].append(oneauthor)
    # Persist the clustering and report the score.
    json.dump(result, open(self.args['val_result'], 'w', encoding='utf-8'), indent=4)
    f1 = f1_score(result, self.args)
    print("f1:", f1)
def do_DBSCAN_test(self):
    """Cluster the test-set papers of every ambiguous author name.

    Local and global feature cosine distances are fused with weight
    w = 0.3, clustered with DBSCAN (eps=0.15, precomputed metric), and
    outlier papers are then reattached via threshold-based pair
    similarity.  The final clustering is written to
    self.args['test_result'] as JSON.
    """
    result = {}
    for name in tqdm(self.test_author_data):
        pubs = [pid for pid in self.test_author_data[name]]
        zero_local = set()  # indices whose local feature vector is all-zero
        loc_vecs, glo_vecs = [], []
        for idx, pid in enumerate(pubs):
            if np.sum(self.test_local_features[pid]) == 0:
                zero_local.add(idx)
            loc_vecs.append(self.test_local_features[pid])
            glo_vecs.append(self.test_global_features[pid])
        loc_dist = pairwise_distances(loc_vecs, metric="cosine")
        glo_dist = pairwise_distances(glo_vecs, metric="cosine")
        w = 0.3
        # Fuse relational and semantic distances into one matrix.
        sim = (loc_dist + w * glo_dist) / (1 + w)
        labels = np.array(
            DBSCAN(eps=0.15, min_samples=3,
                   metric="precomputed").fit_predict(sim))
        # Outliers: DBSCAN noise plus papers without usable local features.
        outlier = {idx for idx, lab in enumerate(labels) if lab == -1}
        outlier |= zero_local
        # Threshold-based similarity matching for the outlier papers.
        paper_pair = generate_pair(pubs, outlier)
        pair_snapshot = paper_pair.copy()
        K = len(set(labels))
        for i in range(len(labels)):
            if i not in outlier:
                continue
            # Walk down the argmax until it lands on a non-outlier paper.
            j = np.argmax(paper_pair[i])
            while j in outlier:
                paper_pair[i][j] = -1
                j = np.argmax(paper_pair[i])
            if paper_pair[i][j] >= 1.5:
                labels[i] = labels[j]  # attach to its nearest cluster
            else:
                labels[i] = K          # open a fresh singleton cluster
                K += 1
        # Merge outlier pairs whose original similarity clears the
        # threshold (checked against the unmutated snapshot).
        ordered = list(outlier)
        for a in range(len(ordered)):
            for b in range(a + 1, len(ordered)):
                if pair_snapshot[ordered[a]][ordered[b]] >= 1.5:
                    labels[ordered[b]] = labels[ordered[a]]
        # Group paper ids by their final cluster label.
        result[name] = [
            [pubs[idx] for idx, lab in enumerate(labels) if lab == cluster]
            for cluster in set(labels)
        ]
    json.dump(result, open(self.args['test_result'], 'w', encoding='utf-8'), indent=4)