# Shared imports assumed by the snippets in this file. nmi_score / ari_score
# are the usual aliases for sklearn's NMI and adjusted Rand index; project-
# specific names (device, IDEC, args, dataset, get_matrix, ...) are expected
# to come from the surrounding repositories.
import logging
import math
import random
import time
from collections import Counter

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import torch.nn.functional as F
from sklearn import metrics
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics import normalized_mutual_info_score as nmi_score
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm


def eva(y_true, y_pred, epoch=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    print(epoch, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
          ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
    return f1
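# The snippets in this file assume a `cluster_acc` helper. A minimal sketch,
# assuming the standard Hungarian-matching definition of clustering accuracy.
# Note the signatures differ across the original codebases: some callers
# unpack `acc, f1 = cluster_acc(...)` (this variant), others expect a single
# accuracy value; the macro-F1 computation here is an assumption, not the
# original implementation.
from scipy.optimize import linear_sum_assignment


def cluster_acc(y_true, y_pred):
    """Clustering accuracy (and macro F1) under the best label permutation."""
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    n_cls = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((n_cls, n_cls), dtype=np.int64)
    for p, t in zip(y_pred, y_true):
        w[p, t] += 1  # contingency table: predicted cluster vs. true label
    row, col = linear_sum_assignment(w.max() - w)  # maximize matched mass
    acc = w[row, col].sum() / y_pred.size
    # Remap predictions under the optimal assignment to compute macro F1.
    mapping = dict(zip(row, col))
    f1 = metrics.f1_score(y_true, [mapping[p] for p in y_pred], average='macro')
    return acc, f1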
def init_prob_kmeans(model, eval_loader, args):
    torch.manual_seed(1)
    model = model.to(device)
    # initialize cluster parameters
    model.eval()
    targets = np.zeros(len(eval_loader.dataset))
    feats = np.zeros((len(eval_loader.dataset), 1024))
    for _, (x, _, label, idx) in enumerate(eval_loader):
        x = x.to(device)
        _, feat = model(x)
        feat = feat.view(x.size(0), -1)
        idx = idx.data.cpu().numpy()
        feats[idx, :] = feat.data.cpu().numpy()
        targets[idx] = label.data.cpu().numpy()
    # evaluate clustering performance
    pca = PCA(n_components=args.n_clusters)
    feats = pca.fit_transform(feats)
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(feats)
    acc, nmi, ari = cluster_acc(targets, y_pred), nmi_score(targets, y_pred), ari_score(targets, y_pred)
    print('Init acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(acc, nmi, ari))
    probs = feat2prob(torch.from_numpy(feats),
                      torch.from_numpy(kmeans.cluster_centers_))
    return kmeans.cluster_centers_, probs
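# `feat2prob` converts features and cluster centers into soft assignments.
# A minimal sketch, assuming the DEC-style Student's t kernel; the original
# helper may differ in kernel or default alpha.
def feat2prob(feat, center, alpha=1.0):
    # Squared distance of every feature (N, d) to every center (K, d),
    # converted to a row-normalized Student's t similarity (N, K).
    q = 1.0 / (1.0 + torch.sum((feat.unsqueeze(1) - center) ** 2, dim=2) / alpha)
    q = q ** ((alpha + 1.0) / 2.0)
    return q / torch.sum(q, dim=1, keepdim=True)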
def eva(y_true, y_pred, epoch=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method="arithmetic")
    ari = ari_score(y_true, y_pred)
    print(f"epoch {epoch}: acc {acc:.4f}, nmi {nmi:.4f}, ari {ari:.4f}, f1 {f1:.4f}")
    return acc, nmi, ari, f1
def eva(y_true, y_pred, epoch=0, pp=True, name=None, path=None):
    # f1 is computed (cluster_acc may also save artifacts via name/path),
    # but only acc/nmi/ari are reported and returned by this variant.
    acc, f1 = cluster_acc(y_true, y_pred, name=name, path=path)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    # nmi = np.round(metrics.normalized_mutual_info_score(y_true, y_pred), 5)
    ari = ari_score(y_true, y_pred)
    if pp:
        print(epoch, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
              ', ari {:.4f}'.format(ari))
    return acc, nmi, ari
def eva(y_true, y_pred, epoch=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    print(epoch, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
          ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
    # append the raw accuracy to the over-smoothing log
    with open('DGSCN_wiki_oversmooth.txt', 'a') as overfile:
        overfile.write(str(acc))
        overfile.write('\n')
    return acc, nmi, ari, f1
def eva(y_true, y_pred, epoch=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    # nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    nmi = nmi_score(y_true, y_pred)
    ari = ari_score(y_true, y_pred)
    # print(epoch, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
    #       ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
    # note: this variant returns the metrics as pre-formatted strings
    acc = format(acc, '.4f')
    nmi = format(nmi, '.4f')
    ari = format(ari, '.4f')
    f1 = format(f1, '.4f')
    return acc, nmi, ari, f1
def test(model, test_loader, args):
    model.eval()
    preds = np.array([])
    targets = np.array([])
    for batch_idx, (x, label, _) in enumerate(tqdm(test_loader)):
        x, label = x.to(device), label.to(device)
        output1, output2, _ = model(x)
        output = output1 if args.head == 'head1' else output2
        _, pred = output.max(1)
        targets = np.append(targets, label.cpu().numpy())
        preds = np.append(preds, pred.cpu().numpy())
    acc, nmi, ari = cluster_acc(targets.astype(int), preds.astype(int)), \
        nmi_score(targets, preds), ari_score(targets, preds)
    print('Test acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(acc, nmi, ari))
def test(model, eval_loader, args):
    model.eval()
    targets = np.zeros(len(eval_loader.dataset))
    y_pred = np.zeros(len(eval_loader.dataset))
    probs = np.zeros((len(eval_loader.dataset), args.n_clusters))
    for _, (x, _, label, idx) in enumerate(eval_loader):
        x = x.to(device)
        _, feat = model(x)
        prob = feat2prob(feat, model.center)
        # prob = F.softmax(logit, dim=1)
        idx = idx.data.cpu().numpy()
        y_pred[idx] = prob.data.cpu().detach().numpy().argmax(1)
        targets[idx] = label.data.cpu().numpy()
        probs[idx, :] = prob.cpu().detach().numpy()
    # evaluate clustering performance
    y_pred = y_pred.astype(np.int64)
    acc, nmi, ari = cluster_acc(targets, y_pred), nmi_score(targets, y_pred), ari_score(targets, y_pred)
    print('Test acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(acc, nmi, ari))
    probs = torch.from_numpy(probs)
    return acc, nmi, ari, probs
def test(model, test_loader, args):
    model.eval()
    preds = np.array([])
    targets = np.array([])
    feats = np.zeros((len(test_loader.dataset), args.n_clusters))
    probs = np.zeros((len(test_loader.dataset), args.n_clusters))
    for batch_idx, (x, label, idx) in enumerate(tqdm(test_loader)):
        x, label = x.to(device), label.to(device)
        feat = model(x)
        prob = feat2prob(feat, model.center)
        _, pred = prob.max(1)
        targets = np.append(targets, label.cpu().numpy())
        preds = np.append(preds, pred.cpu().numpy())
        idx = idx.data.cpu().numpy()
        feats[idx, :] = feat.cpu().detach().numpy()
        probs[idx, :] = prob.cpu().detach().numpy()
    acc, nmi, ari = cluster_acc(targets.astype(int), preds.astype(int)), \
        nmi_score(targets, preds), ari_score(targets, preds)
    print('Test acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(acc, nmi, ari))
    probs = torch.from_numpy(probs)
    return acc, nmi, ari, probs
def train_idec():
    model = IDEC(
        n_enc_1=500, n_enc_2=500, n_enc_3=1000,
        n_dec_1=1000, n_dec_2=500, n_dec_3=500,
        n_input=args.n_input, n_z=args.n_z,
        n_clusters=args.n_clusters, alpha=1.0,
        pretrain_path=args.pretrain_path).to(device)
    # model.pretrain('data/ae_mnist.pkl')
    model.pretrain()
    train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
    optimizer = Adam(model.parameters(), lr=args.lr)
    # initialize cluster parameters
    data = dataset.x
    y = dataset.y
    data = torch.Tensor(data).to(device)
    x_bar, hidden = model.ae(data)
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(hidden.data.cpu().numpy())
    nmi_k = nmi_score(y_pred, y)
    print("nmi score={:.4f}".format(nmi_k))
    hidden = None
    x_bar = None
    y_pred_last = y_pred
    model.cluster_layer.data = torch.tensor(kmeans.cluster_centers_).to(device)
    model.train()
    for epoch in range(100):
        if epoch % args.update_interval == 0:
            _, tmp_q = model(data)
            # update target distribution p
            tmp_q = tmp_q.data
            p = target_distribution(tmp_q)
            # evaluate clustering performance
            y_pred = tmp_q.cpu().numpy().argmax(1)
            delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
            y_pred_last = y_pred
            acc = cluster_acc(y, y_pred)
            nmi = nmi_score(y, y_pred)
            ari = ari_score(y, y_pred)
            print('Iter {}'.format(epoch), ':Acc {:.4f}'.format(acc),
                  ', nmi {:.4f}'.format(nmi), ', ari {:.4f}'.format(ari))
            if epoch > 0 and delta_label < args.tol:
                print('delta_label {:.4f}'.format(delta_label), '< tol', args.tol)
                print('Reached tolerance threshold. Stopping training.')
                break
        for batch_idx, (x, _, idx) in enumerate(train_loader):
            x = x.to(device)
            idx = idx.to(device)
            x_bar, q = model(x)
            reconstr_loss = F.mse_loss(x_bar, x)
            # note: reduction='batchmean' would match the KL definition exactly;
            # the default elementwise mean is kept here as in the original code
            kl_loss = F.kl_div(q.log(), p[idx])
            loss = args.gamma * kl_loss + reconstr_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
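# `target_distribution` above is the DEC/IDEC auxiliary target. A minimal
# sketch, assuming the standard definition p_ij = (q_ij^2 / f_j) / Z_i, where
# f_j is the soft cluster frequency and Z_i renormalizes each row.
def target_distribution(q):
    weight = q ** 2 / q.sum(0)               # sharpen and normalize by frequency
    return (weight.t() / weight.sum(1)).t()  # renormalize per sample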
def agc(nx_graph, adj, features, targets_id, classes, start_time, flag,
        stop_threshold=-0.06):
    '''
    Parameters
    ----------
    nx_graph: graph as a networkx object
    adj: adjacency matrix
    features: feature matrix
    targets_id: ground-truth labels
    classes: number of classes
    flag: 1 to preprocess adj/features via get_matrix, else use them as given
    stop_threshold: stopping threshold on the change in intra-cluster distance

    Returns
    -------
    predict_C: predicted cluster labels
    epoch: number of filtering iterations performed
    '''
    if flag == 1:
        adj_matrix, feat_matrix = get_matrix(adj, features, self_loop=self_loop)
    else:
        adj_matrix = adj
        feat_matrix = features
    features = sp.csc_matrix(features)
    adj = sp.csc_matrix(adj)
    matrix_A = sp.csr_matrix(adj_matrix)
    dim_A = matrix_A.shape  # e.g. (3312, 3312)
    # build the degree matrix matrix_D
    matrix_D = np.zeros(dim_A)
    degree_table = nx.degree(nx_graph)  # e.g. len = 3312
    ind = 0
    for degree_item in degree_table:
        if degree_item[0] in nx_graph.nodes:
            matrix_D[ind][ind] = degree_item[1]
            ind += 1
        else:
            logging.info('missed an item -> %s', degree_item)
    # symmetrically normalized Laplacian matrix_Ls = I - D^{-1/2} A D^{-1/2}
    matrix_D = sp.csr_matrix(matrix_D)
    # eigendecomposition alternative for D^{-1/2}, kept for reference
    # (v: eigenvalues, Q: eigenvectors)
    # v, Q = la.eigs(matrix_D)
    # V = sp.diags(v**(-0.5))
    # matrix_D_neg_1_2 = Q.dot(V).dot(la.inv(sp.csr_matrix(Q)))
    matrix_D_neg_1_2 = sp.csr_matrix.power(matrix_D, -0.5)
    matrix_Ls = np.identity(dim_A[0]) - matrix_D_neg_1_2.dot(matrix_A).dot(matrix_D_neg_1_2)
    # row-normalize the features (has little effect in practice)
    if normal_feature:
        features_dense = features.todense()
        deno = np.repeat(np.sqrt(np.sum(np.multiply(features_dense, features_dense), axis=1)),
                         features_dense.shape[1], axis=1)
        features_normal = np.multiply(features_dense, 1.0 / deno)
    x_hat = matrix_X = features.todense()  # or features_normal
    coefficient = np.identity(dim_A[0]) - 1 / 2 * matrix_Ls
    max_iter = 140
    predict_C = []
    tmp_intra0 = tmp_intra1 = 1e8
    t = 0
    while t <= max_iter:
        # log the elapsed time per iteration
        logging.info("iter: " + str(t) + ", at: " + str(time.time() - start_time) + " s")
        t = t + 1
        # k = t
        # x_hat = (coefficient ** k).dot(matrix_X)
        x_hat = coefficient.dot(x_hat)  # one more step of low-pass filtering
        matrix_K = x_hat.dot(x_hat.T)
        matrix_W = 1 / 2 * (np.abs(matrix_K) + np.abs(matrix_K.T))
        matrix_W = matrix_W / np.max(matrix_W)
        label_pred = SpectralClustering(
            n_clusters=classes,
            gamma=0,
            affinity='precomputed',  # the input is a precomputed affinity matrix
            n_init=15,
            n_jobs=4,
            assign_labels='kmeans',  # or 'discretize'
        ).fit_predict(matrix_W)
        print('label_pred', label_pred)
        print('targets_id', targets_id)
        # greedily permute predicted clusters to match the ground-truth labels
        confusion_matrix = np.zeros([classes, classes])
        tmp_dict0 = np.ones(targets_id.shape)
        tmp_label_pred = 100 * np.ones_like(label_pred)
        for i in range(classes):
            for j in range(classes):
                confusion_matrix[i, j] = np.sum(
                    tmp_dict0[np.where((targets_id == i) * (label_pred == j))])
        # print(confusion_matrix)
        diag_max = np.sum(np.diag(confusion_matrix))
        tmp_inds = list(range(classes))
        for _ in range(5):
            for i in range(classes):
                for j in range(classes):
                    confusion_matrix[[j, i], :] = confusion_matrix[[i, j], :]
                    if np.sum(np.diag(confusion_matrix)) < diag_max:
                        confusion_matrix[[i, j], :] = confusion_matrix[[j, i], :]
                    else:
                        diag_max = np.sum(np.diag(confusion_matrix))
                        tmp_inds[i], tmp_inds[j] = tmp_inds[j], tmp_inds[i]
        # logging.info(diag_max)
        for i in range(classes):
            tmp_label_pred[np.where(label_pred == i)] = tmp_inds[i]
        print('tmp_label_pred', tmp_label_pred)
        # metric evaluation
        logging.info('k: {}'.format(t))
        F1_RESULT = metrics.f1_score(targets_id, tmp_label_pred, average='macro')
        logging.info('F1_: {}%'.format(F1_RESULT * 100))
        acc_RESULT = metrics.accuracy_score(targets_id, tmp_label_pred)
        with open('agc_wiki_oversmooth.txt', 'a') as overfile:
            overfile.write(str(acc_RESULT))
            overfile.write('\n')
        logging.info('acc: {}%'.format(acc_RESULT * 100))
        NMI_RESULT = metrics.normalized_mutual_info_score(
            targets_id, tmp_label_pred, average_method='arithmetic')
        logging.info('NMI: {}%'.format(NMI_RESULT * 100))
        ari_RESULT = ari_score(targets_id, tmp_label_pred)
        logging.info('ari: {}%'.format(ari_RESULT * 100))
        tmp_intra0 = tmp_intra1
        tmp_intra1 = intra(tmp_label_pred, x_hat, classes)
        d_intra = tmp_intra1 - tmp_intra0
        logging.info('d_intra: {}'.format(d_intra))
        # if d_intra > stop_threshold:
        #     break
        predict_C = tmp_label_pred
    return predict_C, t - 1
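# `intra` above is not defined in this file. A hypothetical sketch, assuming
# it returns the average intra-cluster pairwise Euclidean distance of the
# filtered features (the quantity AGC's stopping criterion tracks); the
# original helper may differ in normalization.
def intra(labels, x_hat, classes):
    x = np.asarray(x_hat)
    labels = np.asarray(labels)
    total = 0.0
    for c in range(classes):
        members = x[labels == c]
        n = len(members)
        if n < 2:
            continue
        # mean pairwise distance within cluster c
        diff = members[:, None, :] - members[None, :, :]
        total += np.sqrt((diff ** 2).sum(-1)).sum() / (n * (n - 1))
    return total / classes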
def estimate_k(model, unlabeled_loader, labeled_loaders, args):
    u_num = len(unlabeled_loader.dataset)
    u_targets = np.zeros(u_num)
    u_feats = np.zeros((u_num, 1024))
    print('extracting features for unlabeled data')
    for _, (x, _, label, idx) in enumerate(unlabeled_loader):
        x = x.to(device)
        _, feat = model(x)
        feat = feat.view(x.size(0), -1)
        idx = idx.data.cpu().numpy()
        u_feats[idx, :] = feat.data.cpu().numpy()
        u_targets[idx] = label.data.cpu().numpy()
    cand_k = np.arange(args.max_cand_k)
    # get acc for labeled data with the short-listed k
    best_ks = np.zeros(len(omniglot_background_val_alphabets))
    print('extracting features for labeled data')
    for alphabetStr in omniglot_background_val_alphabets:
        labeled_loader = labeled_loaders[alphabetStr]
        args.num_val_cls = labeled_loader.num_classes
        l_num = len(labeled_loader.dataset)
        l_targets = np.zeros(l_num)
        l_feats = np.zeros((l_num, 1024))
        for _, (x, _, label, idx) in enumerate(labeled_loader):
            x = x.to(device)
            _, feat = model(x)
            feat = feat.view(x.size(0), -1)
            idx = idx.data.cpu().numpy()
            l_feats[idx, :] = feat.data.cpu().numpy()
            l_targets[idx] = label.data.cpu().numpy()
        l_classes = set(l_targets)
        num_lt_cls = int(round(len(l_classes) * args.split_ratio))
        lt_classes = set(random.sample(sorted(l_classes), num_lt_cls))
        lv_classes = l_classes - lt_classes
        lt_feats = np.empty((0, l_feats.shape[1]))
        lt_targets = np.empty(0)
        for c in lt_classes:
            lt_feats = np.vstack((lt_feats, l_feats[l_targets == c]))
            lt_targets = np.append(lt_targets, l_targets[l_targets == c])
        lv_feats = np.empty((0, l_feats.shape[1]))
        lv_targets = np.empty(0)
        for c in lv_classes:
            lv_feats = np.vstack((lv_feats, l_feats[l_targets == c]))
            lv_targets = np.append(lv_targets, l_targets[l_targets == c])
        cvi_list = np.zeros(len(cand_k))
        acc_list = np.zeros(len(cand_k))
        cat_pred_list = np.zeros([len(cand_k), u_num + l_num])
        print('estimating K ...')
        for i in range(len(cand_k)):
            cvi_list[i], cat_pred_i = labeled_val_fun(
                np.concatenate((lv_feats, u_feats)), lt_feats, lt_targets,
                cand_k[i] + args.num_val_cls)
            cat_pred_list[i, :] = cat_pred_i
            acc_list[i] = cluster_acc(
                lv_targets,
                cat_pred_i[len(lt_targets):len(lt_targets) + len(lv_targets)])
        idx_cvi = np.max(np.argwhere(cvi_list == np.max(cvi_list)))
        idx_acc = np.max(np.argwhere(acc_list == np.max(acc_list)))
        idx_best = int(math.ceil((idx_cvi + idx_acc) * 1.0 / 2))
        cat_pred = cat_pred_list[idx_best, :]
        cnt_cat = Counter(cat_pred.tolist())
        cnt_l = Counter(cat_pred[:l_num].tolist())
        cnt_ul = Counter(cat_pred[l_num:].tolist())
        bin_cat = [x[1] for x in sorted(cnt_cat.items())]
        bin_l = [x[1] for x in sorted(cnt_l.items())]
        bin_ul = [x[1] for x in sorted(cnt_ul.items())]
        expectation = u_num * 1.0 / (cand_k[idx_best] + args.num_val_cls)
        # count the unlabeled clusters that are large relative to the biggest one
        best_k = np.sum(
            np.array(bin_ul) / np.max(bin_ul).astype(float) > args.min_max_ratio)
        print('current best K {}'.format(best_k))
        i_alpha = omniglot_background_val_alphabets.index(alphabetStr)
        best_ks[i_alpha] = best_k
    best_k = np.ceil(np.mean(best_ks)).astype(np.int32)
    kmeans = KMeans(n_clusters=best_k)
    u_pred = kmeans.fit_predict(u_feats).astype(np.int32)
    acc, nmi, ari = cluster_acc(u_targets, u_pred), nmi_score(u_targets, u_pred), ari_score(u_targets, u_pred)
    print('Final K {}, acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(best_k, acc, nmi, ari))
    return best_k
def cluster_evaluate(y_true, y_pred, alg=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    print(alg, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
          ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
def estimate_k(model, unlabeled_loader, labeled_loader, args):
    u_num = len(unlabeled_loader.dataset)
    u_targets = np.zeros(u_num)
    u_feats = np.zeros((u_num, 512))
    print('extracting features for unlabeled data')
    for _, (x, label, idx) in enumerate(tqdm(unlabeled_loader)):
        x = x.to(device)
        _, feat = model(x)
        feat = feat.view(x.size(0), -1)
        idx = idx.data.cpu().numpy()
        u_feats[idx, :] = feat.data.cpu().numpy()
        u_targets[idx] = label.data.cpu().numpy()
    cand_k = np.arange(args.max_cand_k)
    # get acc for labeled data with the short-listed k
    l_num = len(labeled_loader.dataset)
    l_targets = np.zeros(l_num)
    l_feats = np.zeros((l_num, 512))
    print('extracting features for labeled data')
    for _, (x, label, idx) in enumerate(tqdm(labeled_loader)):
        x = x.to(device)
        _, feat = model(x)
        feat = feat.view(x.size(0), -1)
        idx = idx.data.cpu().numpy()
        l_feats[idx, :] = feat.data.cpu().numpy()
        l_targets[idx] = label.data.cpu().numpy()
    l_classes = set(l_targets)
    num_lt_cls = int(round(len(l_classes) * args.split_ratio))
    # randomly sample a split_ratio fraction of the labeled classes
    lt_classes = set(random.sample(sorted(l_classes), num_lt_cls))
    lv_classes = l_classes - lt_classes
    lt_feats = np.empty((0, l_feats.shape[1]))
    lt_targets = np.empty(0)
    for c in lt_classes:
        lt_feats = np.vstack((lt_feats, l_feats[l_targets == c]))
        lt_targets = np.append(lt_targets, l_targets[l_targets == c])
    lv_feats = np.empty((0, l_feats.shape[1]))
    lv_targets = np.empty(0)
    for c in lv_classes:
        lv_feats = np.vstack((lv_feats, l_feats[l_targets == c]))
        lv_targets = np.append(lv_targets, l_targets[l_targets == c])
    cvi_list = np.zeros(len(cand_k))
    acc_list = np.zeros(len(cand_k))
    cat_pred_list = np.zeros([len(cand_k), u_num + l_num])
    print('estimating K ...')
    for i in range(len(cand_k)):
        cvi_list[i], cat_pred_i = labeled_val_fun(
            np.concatenate((lv_feats, u_feats)), lt_feats, lt_targets,
            cand_k[i] + args.num_val_cls)
        cat_pred_list[i, :] = cat_pred_i
        acc_list[i] = cluster_acc(
            lv_targets,
            cat_pred_i[len(lt_targets):len(lt_targets) + len(lv_targets)])
        best_k = get_best_k(cvi_list[:i + 1], acc_list[:i + 1],
                            cat_pred_list[:i + 1], l_num)
        print('current best K {}'.format(best_k))
    kmeans = KMeans(n_clusters=best_k)
    u_pred = kmeans.fit_predict(u_feats).astype(np.int32)
    acc, nmi, ari = cluster_acc(u_targets, u_pred), nmi_score(u_targets, u_pred), ari_score(u_targets, u_pred)
    print('Final K {}, acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(best_k, acc, nmi, ari))
    return best_k
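# `get_best_k` is not defined in this file. A minimal sketch, assuming it
# mirrors the inline selection logic of the Omniglot variant above: combine
# the peaks of the cluster-validity index and the labeled-probe accuracy,
# then count the unlabeled clusters that are large relative to the biggest
# one. The signature and the min_max_ratio default are assumptions (the
# Omniglot variant reads the threshold from args.min_max_ratio).
def get_best_k(cvi_list, acc_list, cat_pred_list, l_num, min_max_ratio=0.75):
    idx_cvi = np.max(np.argwhere(cvi_list == np.max(cvi_list)))
    idx_acc = np.max(np.argwhere(acc_list == np.max(acc_list)))
    idx_best = int(math.ceil((idx_cvi + idx_acc) / 2.0))
    cat_pred = cat_pred_list[idx_best, :]
    cnt_ul = Counter(cat_pred[l_num:].tolist())
    bin_ul = np.array([v for _, v in sorted(cnt_ul.items())])
    return int(np.sum(bin_ul / float(bin_ul.max()) > min_max_ratio))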
def main():
    args = parameter_parser()
    tab_printer(args)
    misc.create_directory('./_out/')
    misc.setup_logger('agc', args, './_out/')
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if args.dataset in ['citeseer', 'cora', 'pubmed']:
        nx_graph, adj, features, targets_id, classes = load_data(args.dataset)
    elif args.dataset in ['dblp', 'reut', 'acm']:
        if args.dataset == 'reut':
            nx_graph, adj, features, targets_id, classes = exp_load_data(args.dataset, 3)
        else:
            nx_graph, adj, features, targets_id, classes = exp_load_data(args.dataset, None)
    elif args.dataset in ['wiki']:
        nx_graph = nx.from_edgelist(pd.read_csv('./data/wiki.cites.csv').values.tolist())
        features_file = pd.read_csv('./data/wiki.content.csv')
        data = np.array(features_file["content"].values.tolist())
        x1 = np.array(features_file["x1"].values.tolist())
        x2 = np.array(features_file["x2"].values.tolist())
        features = sp.csc_matrix((data, (x1, x2)))
        logging.info('dataset summary:')
        logging.info('feature size: {}'.format(features.shape))
        # nodes, targets_id = misc.target_reader('./data/wiki.label.csv')
        tar_file = pd.read_csv('./data/wiki.label.csv')
        nodes = tar_file["node"].values.tolist()
        targets_id = np.array(tar_file["labelId"]).reshape(-1, 1).T[0]
        classes = 17
        adj = sp.csr_matrix(nx.adjacency_matrix(nx_graph))
    else:
        raise Exception("dataset import error.")
    # logging.info(features)                    # COO (triplet) form
    # logging.info(targets)
    # logging.info(node)
    # logging.info(graph.adj)                   # adjacency list
    # logging.info(nx.adjacency_matrix(graph))  # adjacency matrix (.todense() for dense form)
    # logging.info(nx.degree(graph))            # degree of every node
    start_time = time.time()
    logging.info("Timing begin")
    # agc
    if args.dataset in ['citeseer', 'cora', 'pubmed', 'wiki']:
        predict_C, epoch = agc(nx_graph, adj, features, targets_id, classes, start_time, 1)
    if args.dataset in ['dblp', 'reut', 'acm']:
        predict_C, epoch = agc(nx_graph, adj, features, targets_id, classes, start_time, 0)
    # report the final clustering
    logging.info('Best Clustering:')
    logging.info(predict_C)
    logging.info('k: {}'.format(epoch))
    # metric evaluation
    F1_RESULT = metrics.f1_score(targets_id, predict_C, average='macro')
    logging.info('F1_: {}%'.format(F1_RESULT * 100))
    acc_RESULT = metrics.accuracy_score(targets_id, predict_C)
    logging.info('acc: {}%'.format(acc_RESULT * 100))
    NMI_RESULT = metrics.normalized_mutual_info_score(
        targets_id, predict_C, average_method='arithmetic')
    logging.info('NMI: {}%'.format(NMI_RESULT * 100))
    ari_RESULT = ari_score(targets_id, predict_C)
    logging.info('ari: {}%'.format(ari_RESULT * 100))