def init_prob_kmeans(model, eval_loader, args):
    torch.manual_seed(1)
    model = model.to(device)
    # cluster parameter initiate
    model.eval()
    targets = np.zeros(len(eval_loader.dataset))
    feats = np.zeros((len(eval_loader.dataset), 1024))
    for _, (x, _, label, idx) in enumerate(eval_loader):
        x = x.to(device)
        _, feat = model(x)
        feat = feat.view(x.size(0), -1)
        idx = idx.data.cpu().numpy()
        feats[idx, :] = feat.data.cpu().numpy()
        targets[idx] = label.data.cpu().numpy()
    # evaluate clustering performance
    pca = PCA(n_components=args.n_clusters)
    feats = pca.fit_transform(feats)
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(feats)
    acc, nmi, ari = (cluster_acc(targets, y_pred), nmi_score(targets, y_pred),
                     ari_score(targets, y_pred))
    print('Init acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(acc, nmi, ari))
    probs = feat2prob(torch.from_numpy(feats),
                      torch.from_numpy(kmeans.cluster_centers_))
    return kmeans.cluster_centers_, probs
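# `feat2prob` is not defined in this file. Below is a minimal sketch, assuming the
# DEC/DTC-style soft assignment: a Student's t-kernel over the distances between
# features and cluster centers, normalized per sample. The `alpha` degree-of-freedom
# parameter is an assumption (1.0 is the common default).
def feat2prob(feat, center, alpha=1.0):
    # Squared Euclidean distance between every feature (N, D) and center (K, D).
    dist = torch.sum((feat.unsqueeze(1) - center) ** 2, dim=2)
    q = 1.0 / (1.0 + dist / alpha)
    q = q ** ((alpha + 1.0) / 2.0)
    # Normalize so each row is a probability distribution over the K clusters.
    q = q / q.sum(dim=1, keepdim=True)
    return q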
def eva(y_true, y_pred, epoch=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    print(epoch, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
          ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
    return f1
def eva(y_true, y_pred, epoch=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method="arithmetic")
    ari = ari_score(y_true, y_pred)
    print(f"epoch {epoch}: acc {acc:.4f}, nmi {nmi:.4f}, ari {ari:.4f}, f1 {f1:.4f}")
    return acc, nmi, ari, f1
def eva(y_true, y_pred, epoch=0, pp=True, name=None, path=None):
    acc, f1 = cluster_acc(y_true, y_pred, name=name, path=path)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    # nmi = np.round(metrics.normalized_mutual_info_score(y_true, y_pred), 5)
    ari = ari_score(y_true, y_pred)
    if pp:
        print(epoch, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
              ', ari {:.4f}'.format(ari))
    return acc, nmi, ari
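# `cluster_acc` is not defined in this file. Below is a minimal sketch, assuming the
# widely used Hungarian-matching formulation: find the one-to-one mapping between
# cluster ids and ground-truth labels that maximizes agreement, then score accuracy
# under that mapping. Note the callers disagree on the signature: the `eva` variants
# unpack (acc, f1), while init_prob_kmeans/test/estimate_k use a single scalar; this
# sketch follows the tuple form, and the `name`/`path` kwargs of one caller are omitted.
def cluster_acc(y_true, y_pred):
    from scipy.optimize import linear_sum_assignment
    from sklearn.metrics import f1_score
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    D = max(y_pred.max(), y_true.max()) + 1
    # Contingency matrix: w[i, j] counts samples with predicted cluster i, label j.
    w = np.zeros((D, D), dtype=np.int64)
    for p, t in zip(y_pred, y_true):
        w[p, t] += 1
    # Hungarian algorithm maximizes the matched count (minimize the complement).
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    acc = w[row_ind, col_ind].sum() / y_pred.size
    # Remap predictions to the matched label ids for a macro F1 under the same mapping.
    mapping = dict(zip(row_ind, col_ind))
    y_mapped = np.array([mapping[p] for p in y_pred])
    f1 = f1_score(y_true, y_mapped, average='macro')
    return acc, f1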
def main():
    import matplotlib.pyplot as plt
    from matplotlib import style
    import pandas as pd
    style.use('ggplot')
    from sklearn.datasets import make_blobs
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
    X, y = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=1,
                      center_box=(-10.0, 10.0), shuffle=True,
                      random_state=1)  # For reproducibility
    cuda = torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")
    # X = torch.from_numpy(X).float().to(device)
    y = np.array(y)
    l_targets = y[y > 1]
    l_feats = X[y > 1]
    u_feats = X[y < 2]
    cat_feats = np.concatenate((l_feats, u_feats))
    y = np.concatenate((y[y > 1], y[y < 2]))
    cat_feats = torch.from_numpy(cat_feats).to(device)
    u_feats = torch.from_numpy(u_feats).to(device)
    l_feats = torch.from_numpy(l_feats).to(device)
    l_targets = torch.from_numpy(l_targets).to(device)
    km = K_Means(k=4, init='k-means++', random_state=1, n_jobs=None,
                 pairwise_batch_size=10)
    # km.fit(X)
    km.fit_mix(u_feats, l_feats, l_targets)
    # X = X.cpu()
    X = cat_feats.cpu()
    centers = km.cluster_centers_.cpu()
    pred = km.labels_.cpu()
    print('nmi', nmi_score(pred, y))
    # Plotting starts here
    colors = 10 * ["g", "c", "b", "k", "r", "m"]
    for i in range(len(X)):
        x = X[i]
        plt.scatter(x[0], x[1], color=colors[pred[i]], s=10)
    for i in range(4):
        plt.scatter(centers[i][0], centers[i][1], s=130, marker="*", color='r')
    plt.show()
def compute(self):
    nmi = nmi_score(self.labels_true, self.labels_pred, average_method='arithmetic')
    if self.prev_labels_pred is not None:
        nmi_diff = nmi_score(self.prev_labels_pred, self.labels_pred,
                             average_method='arithmetic')
    else:
        nmi_diff = 0
    matrix = self.compute_confusion_matrix()
    acc = np.diag(matrix).sum() / matrix.sum()
    with np.errstate(divide='ignore', invalid='ignore'):
        acc_by_class = np.diag(matrix) / matrix.sum(axis=1)
    avg_acc = np.mean(np.nan_to_num(acc_by_class))
    self.values = OrderedDict(
        zip(self.names, [nmi, nmi_diff, acc, avg_acc] + acc_by_class.tolist()))
    return self.values
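# `compute_confusion_matrix` is not shown in this snippet. A minimal sketch, assuming
# the predicted labels are already aligned with the true class ids. With sklearn's
# convention (rows are true classes), matrix.sum(axis=1) in compute() above is the
# per-class sample count, so acc_by_class is per-class recall.
def compute_confusion_matrix(self):
    from sklearn.metrics import confusion_matrix
    return confusion_matrix(self.labels_true, self.labels_pred)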
def eva(y_true, y_pred, epoch=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    print(epoch, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
          ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
    with open('DGSCN_wiki_oversmooth.txt', 'a') as overfile:
        overfile.write(str(acc))
        overfile.write('\n')
    return acc, nmi, ari, f1
def eva(y_true, y_pred, epoch=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    # nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    nmi = nmi_score(y_true, y_pred)
    ari = ari_score(y_true, y_pred)
    # print(epoch, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
    #       ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
    acc = format(acc, '.4f')
    nmi = format(nmi, '.4f')
    ari = format(ari, '.4f')
    f1 = format(f1, '.4f')
    return acc, nmi, ari, f1
def test(model, test_loader, args):
    model.eval()
    preds = np.array([])
    targets = np.array([])
    for batch_idx, (x, label, _) in enumerate(tqdm(test_loader)):
        x, label = x.to(device), label.to(device)
        output1, output2, _ = model(x)
        if args.head == 'head1':
            output = output1
        else:
            output = output2
        _, pred = output.max(1)
        targets = np.append(targets, label.cpu().numpy())
        preds = np.append(preds, pred.cpu().numpy())
    acc, nmi, ari = (cluster_acc(targets.astype(int), preds.astype(int)),
                     nmi_score(targets, preds), ari_score(targets, preds))
    print('Test acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(acc, nmi, ari))
def test(model, test_loader, args, epoch='test'):
    model.eval()
    preds = np.array([])
    targets = np.array([])
    feats = np.zeros((len(test_loader.dataset), args.n_clusters))
    probs = np.zeros((len(test_loader.dataset), args.n_clusters))
    for batch_idx, (x, label, idx) in enumerate(tqdm(test_loader)):
        x, label = x.to(device), label.to(device)
        _, feat = model(x)
        prob = feat2prob(feat, model.center)
        _, pred = prob.max(1)
        targets = np.append(targets, label.cpu().numpy())
        preds = np.append(preds, pred.cpu().numpy())
        idx = idx.data.cpu().numpy()
        feats[idx, :] = feat.cpu().detach().numpy()
        probs[idx, :] = prob.cpu().detach().numpy()
    acc, nmi, ari = (cluster_acc(targets.astype(int), preds.astype(int)),
                     nmi_score(targets, preds), ari_score(targets, preds))
    print('Test acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(acc, nmi, ari))
    probs = torch.from_numpy(probs)
    return acc, nmi, ari, probs
def test(model, eval_loader, args):
    model.eval()
    targets = np.zeros(len(eval_loader.dataset))
    y_pred = np.zeros(len(eval_loader.dataset))
    probs = np.zeros((len(eval_loader.dataset), args.n_clusters))
    for _, (x, _, label, idx) in enumerate(eval_loader):
        x = x.to(device)
        _, feat = model(x)
        prob = feat2prob(feat, model.center)
        # prob = F.softmax(logit, dim=1)
        idx = idx.data.cpu().numpy()
        y_pred[idx] = prob.data.cpu().detach().numpy().argmax(1)
        targets[idx] = label.data.cpu().numpy()
        probs[idx, :] = prob.cpu().detach().numpy()
    # evaluate clustering performance
    y_pred = y_pred.astype(np.int64)
    acc, nmi, ari = (cluster_acc(targets, y_pred), nmi_score(targets, y_pred),
                     ari_score(targets, y_pred))
    print('Test acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(acc, nmi, ari))
    probs = torch.from_numpy(probs)
    return acc, nmi, ari, probs
def train_idec():
    model = IDEC(n_enc_1=500, n_enc_2=500, n_enc_3=1000,
                 n_dec_1=1000, n_dec_2=500, n_dec_3=500,
                 n_input=args.n_input, n_z=args.n_z,
                 n_clusters=args.n_clusters, alpha=1.0,
                 pretrain_path=args.pretrain_path).to(device)
    # model.pretrain('data/ae_mnist.pkl')
    model.pretrain()
    train_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
    optimizer = Adam(model.parameters(), lr=args.lr)
    # cluster parameter initiate
    data = dataset.x
    y = dataset.y
    data = torch.Tensor(data).to(device)
    x_bar, hidden = model.ae(data)
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(hidden.data.cpu().numpy())
    nmi_k = nmi_score(y_pred, y)
    print("nmi score={:.4f}".format(nmi_k))
    hidden = None
    x_bar = None
    y_pred_last = y_pred
    model.cluster_layer.data = torch.tensor(kmeans.cluster_centers_).to(device)
    model.train()
    for epoch in range(100):
        if epoch % args.update_interval == 0:
            _, tmp_q = model(data)
            # update target distribution p
            tmp_q = tmp_q.data
            p = target_distribution(tmp_q)
            # evaluate clustering performance
            y_pred = tmp_q.cpu().numpy().argmax(1)
            delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
            y_pred_last = y_pred
            acc = cluster_acc(y, y_pred)
            nmi = nmi_score(y, y_pred)
            ari = ari_score(y, y_pred)
            print('Iter {}'.format(epoch), ':Acc {:.4f}'.format(acc),
                  ', nmi {:.4f}'.format(nmi), ', ari {:.4f}'.format(ari))
            if epoch > 0 and delta_label < args.tol:
                print('delta_label {:.4f}'.format(delta_label), '< tol', args.tol)
                print('Reached tolerance threshold. Stopping training.')
                break
        for batch_idx, (x, _, idx) in enumerate(train_loader):
            x = x.to(device)
            idx = idx.to(device)
            x_bar, q = model(x)
            reconstr_loss = F.mse_loss(x_bar, x)
            kl_loss = F.kl_div(q.log(), p[idx])
            loss = args.gamma * kl_loss + reconstr_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
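# `target_distribution` is not defined in this file. Below is a minimal sketch of
# the standard DEC/IDEC target used by both train_idec variants: sharpen the soft
# assignments q by squaring and normalizing by per-cluster frequency, then
# renormalize each row to a distribution.
def target_distribution(q):
    # q: (N, K) soft assignments; weight: squared and frequency-normalized.
    weight = q ** 2 / q.sum(dim=0)
    return (weight.t() / weight.sum(dim=1)).t()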
def estimate_k(model, unlabeled_loader, labeled_loaders, args):
    u_num = len(unlabeled_loader.dataset)
    u_targets = np.zeros(u_num)
    u_feats = np.zeros((u_num, 1024))
    print('extracting features for unlabeled data')
    for _, (x, _, label, idx) in enumerate(unlabeled_loader):
        x = x.to(device)
        _, feat = model(x)
        feat = feat.view(x.size(0), -1)
        idx = idx.data.cpu().numpy()
        u_feats[idx, :] = feat.data.cpu().numpy()
        u_targets[idx] = label.data.cpu().numpy()
    cand_k = np.arange(args.max_cand_k)
    # get acc for labeled data with short-listed k
    best_ks = np.zeros(len(omniglot_background_val_alphabets))
    print('extracting features for labeled data')
    for alphabetStr in omniglot_background_val_alphabets:
        labeled_loader = labeled_loaders[alphabetStr]
        args.num_val_cls = labeled_loader.num_classes
        l_num = len(labeled_loader.dataset)
        l_targets = np.zeros(l_num)
        l_feats = np.zeros((l_num, 1024))
        for _, (x, _, label, idx) in enumerate(labeled_loader):
            x = x.to(device)
            _, feat = model(x)
            feat = feat.view(x.size(0), -1)
            idx = idx.data.cpu().numpy()
            l_feats[idx, :] = feat.data.cpu().numpy()
            l_targets[idx] = label.data.cpu().numpy()
        l_classes = set(l_targets)
        num_lt_cls = int(round(len(l_classes) * args.split_ratio))
        # random.sample requires a sequence, not a set, on Python 3.11+
        lt_classes = set(random.sample(sorted(l_classes), num_lt_cls))
        lv_classes = l_classes - lt_classes
        lt_feats = np.empty((0, l_feats.shape[1]))
        lt_targets = np.empty(0)
        for c in lt_classes:
            lt_feats = np.vstack((lt_feats, l_feats[l_targets == c]))
            lt_targets = np.append(lt_targets, l_targets[l_targets == c])
        lv_feats = np.empty((0, l_feats.shape[1]))
        lv_targets = np.empty(0)
        for c in lv_classes:
            lv_feats = np.vstack((lv_feats, l_feats[l_targets == c]))
            lv_targets = np.append(lv_targets, l_targets[l_targets == c])
        cvi_list = np.zeros(len(cand_k))
        acc_list = np.zeros(len(cand_k))
        cat_pred_list = np.zeros([len(cand_k), u_num + l_num])
        print('estimating K ...')
        for i in range(len(cand_k)):
            cvi_list[i], cat_pred_i = labeled_val_fun(
                np.concatenate((lv_feats, u_feats)), lt_feats, lt_targets,
                cand_k[i] + args.num_val_cls)
            cat_pred_list[i, :] = cat_pred_i
            acc_list[i] = cluster_acc(
                lv_targets,
                cat_pred_i[len(lt_targets):len(lt_targets) + len(lv_targets)])
        idx_cvi = np.max(np.argwhere(cvi_list == np.max(cvi_list)))
        idx_acc = np.max(np.argwhere(acc_list == np.max(acc_list)))
        idx_best = int(math.ceil((idx_cvi + idx_acc) * 1.0 / 2))
        cat_pred = cat_pred_list[idx_best, :]
        cnt_cat = Counter(cat_pred.tolist())
        cnt_l = Counter(cat_pred[:l_num].tolist())
        cnt_ul = Counter(cat_pred[l_num:].tolist())
        bin_cat = [x[1] for x in sorted(cnt_cat.items())]
        bin_l = [x[1] for x in sorted(cnt_l.items())]
        bin_ul = [x[1] for x in sorted(cnt_ul.items())]
        expectation = u_num * 1.0 / (cand_k[idx_best] + args.num_val_cls)
        # keep only clusters that are not too small relative to the largest one
        best_k = np.sum(
            np.array(bin_ul) / np.max(bin_ul).astype(float) > args.min_max_ratio)
        print('current best K {}'.format(best_k))
        i_alpha = omniglot_background_val_alphabets.index(alphabetStr)
        best_ks[i_alpha] = best_k
    best_k = np.ceil(np.mean(best_ks)).astype(np.int32)
    kmeans = KMeans(n_clusters=best_k)
    u_pred = kmeans.fit_predict(u_feats).astype(np.int32)
    acc, nmi, ari = (cluster_acc(u_targets, u_pred), nmi_score(u_targets, u_pred),
                     ari_score(u_targets, u_pred))
    print('Final K {}, acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(best_k, acc, nmi, ari))
    return best_k
def cluster_evaluate(y_true, y_pred, alg=0):
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    print(alg, ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
          ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
def estimate_k(model, unlabeled_loader, labeled_loader, args):
    u_num = len(unlabeled_loader.dataset)
    u_targets = np.zeros(u_num)
    u_feats = np.zeros((u_num, 512))
    print('extracting features for unlabeled data')
    for _, (x, label, idx) in enumerate(tqdm(unlabeled_loader)):
        x = x.to(device)
        _, feat = model(x)
        feat = feat.view(x.size(0), -1)
        idx = idx.data.cpu().numpy()
        u_feats[idx, :] = feat.data.cpu().numpy()
        u_targets[idx] = label.data.cpu().numpy()
    cand_k = np.arange(args.max_cand_k)
    # get acc for labeled data with short-listed k
    l_num = len(labeled_loader.dataset)
    l_targets = np.zeros(l_num)
    l_feats = np.zeros((l_num, 512))
    print('extracting features for labeled data')
    for _, (x, label, idx) in enumerate(tqdm(labeled_loader)):
        x = x.to(device)
        _, feat = model(x)
        feat = feat.view(x.size(0), -1)
        idx = idx.data.cpu().numpy()
        l_feats[idx, :] = feat.data.cpu().numpy()
        l_targets[idx] = label.data.cpu().numpy()
    l_classes = set(l_targets)
    num_lt_cls = int(round(len(l_classes) * args.split_ratio))
    # randomly sample a subset of the labeled classes
    # (random.sample requires a sequence, not a set, on Python 3.11+)
    lt_classes = set(random.sample(sorted(l_classes), num_lt_cls))
    lv_classes = l_classes - lt_classes
    lt_feats = np.empty((0, l_feats.shape[1]))
    lt_targets = np.empty(0)
    for c in lt_classes:
        lt_feats = np.vstack((lt_feats, l_feats[l_targets == c]))
        lt_targets = np.append(lt_targets, l_targets[l_targets == c])
    lv_feats = np.empty((0, l_feats.shape[1]))
    lv_targets = np.empty(0)
    for c in lv_classes:
        lv_feats = np.vstack((lv_feats, l_feats[l_targets == c]))
        lv_targets = np.append(lv_targets, l_targets[l_targets == c])
    cvi_list = np.zeros(len(cand_k))
    acc_list = np.zeros(len(cand_k))
    cat_pred_list = np.zeros([len(cand_k), u_num + l_num])
    print('estimating K ...')
    for i in range(len(cand_k)):
        cvi_list[i], cat_pred_i = labeled_val_fun(
            np.concatenate((lv_feats, u_feats)), lt_feats, lt_targets,
            cand_k[i] + args.num_val_cls)
        cat_pred_list[i, :] = cat_pred_i
        acc_list[i] = cluster_acc(
            lv_targets,
            cat_pred_i[len(lt_targets):len(lt_targets) + len(lv_targets)])
    best_k = get_best_k(cvi_list[:i + 1], acc_list[:i + 1], cat_pred_list[:i + 1], l_num)
    print('current best K {}'.format(best_k))
    kmeans = KMeans(n_clusters=best_k)
    u_pred = kmeans.fit_predict(u_feats).astype(np.int32)
    acc, nmi, ari = (cluster_acc(u_targets, u_pred), nmi_score(u_targets, u_pred),
                     ari_score(u_targets, u_pred))
    print('Final K {}, acc {:.4f}, nmi {:.4f}, ari {:.4f}'.format(best_k, acc, nmi, ari))
    return best_k
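# `get_best_k` is not defined in this file. Below is a minimal sketch that mirrors
# the inline logic of the Omniglot estimate_k variant above: pick a candidate index
# that combines the CVI-optimal and accuracy-optimal choices, then count the clusters
# that are non-trivially large among the unlabeled predictions. The `min_max_ratio`
# default is an assumption (the variant above reads it from args.min_max_ratio).
def get_best_k(cvi_list, acc_list, cat_pred_list, l_num, min_max_ratio=0.01):
    idx_cvi = np.max(np.argwhere(cvi_list == np.max(cvi_list)))
    idx_acc = np.max(np.argwhere(acc_list == np.max(acc_list)))
    idx_best = int(math.ceil((idx_cvi + idx_acc) / 2.0))
    cat_pred = cat_pred_list[idx_best, :]
    # cluster sizes over the unlabeled part only
    cnt_ul = Counter(cat_pred[l_num:].tolist())
    bin_ul = np.array([v for _, v in sorted(cnt_ul.items())], dtype=float)
    # keep clusters whose size is a non-trivial fraction of the largest cluster
    return int(np.sum(bin_ul / bin_ul.max() > min_max_ratio))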
def train_idec(idec_args, dataset):
    manual_seed = random.randint(1, 10000)
    random.seed(manual_seed)
    torch.manual_seed(manual_seed)
    model = IDEC(n_enc_1=500, n_enc_2=500, n_enc_3=1000,
                 n_dec_1=1000, n_dec_2=500, n_dec_3=500,
                 n_input=idec_args.n_input, n_z=idec_args.n_z,
                 n_clusters=idec_args.n_clusters, alpha=1.0,
                 pretrain_path=idec_args.pretrain_path).cuda()
    model.pretrain(dataset, idec_args)
    train_loader = DataLoader(dataset, batch_size=idec_args.idec_batch_size, shuffle=False)
    optimizer = Adam(model.parameters(), lr=idec_args.idec_lr)
    # cluster parameter initiate
    data = dataset.x
    y = dataset.y
    for batch_idx, (x, _, _) in enumerate(train_loader):
        x = x.cuda()
        _, tmp_hidden = model(x)
        if batch_idx == 0:
            hidden = tmp_hidden.data
        else:
            hidden = torch.cat((hidden, tmp_hidden.data), 0)
    kmeans = KMeans(n_clusters=idec_args.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(hidden.data.cpu().numpy())
    nmi_k = nmi_score(y_pred, y)
    print("nmi score={:.4f}".format(nmi_k))
    hidden = None
    x_bar = None
    y_pred_last = y_pred
    model.cluster_layer.data = torch.tensor(kmeans.cluster_centers_).cuda()
    model.train()
    print("training process")
    for epoch in tqdm(range(idec_args.train_epoch)):
        for batch_idx, (x, _, _) in enumerate(train_loader):
            x = x.cuda()
            _, tmp_q = model(x)
            # update target distribution p
            tmp_q = tmp_q.data
            if batch_idx == 0:
                concat_q = tmp_q
            else:
                concat_q = torch.cat((concat_q, tmp_q), 0)
        p = target_distribution(concat_q)
        idec_args.eval = 0
        if idec_args.eval == 1:
            # evaluate clustering performance
            y_pred = concat_q.cpu().numpy().argmax(1)
            delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
            y_pred_last = y_pred
            acc = cluster_acc(y, y_pred)
            if acc > idec_args.max_acc:
                idec_args.max_acc = acc
                idec_args.max_acc_iter = epoch
            tqdm.write("acc is : {:.3f}".format(acc))
            difference = count_difference(idec_args, y_pred)
            tqdm.write("difference is : {:.3f}".format(difference))
            if difference < idec_args.min_difference:
                idec_args.min_difference = difference
                idec_args.min_difference_iter = epoch
                final_y_pred = y_pred
                # generate cluster label txt file
                write_list(final_y_pred, idec_args)
        for batch_idx, (x, _, idx) in enumerate(train_loader):
            x = x.cuda()
            idx = idx.cuda()
            x_bar, q = model(x)
            reconstr_loss = F.mse_loss(x_bar, x)
            kl_loss = F.kl_div(q.log(), p[idx])
            loss = idec_args.gamma * kl_loss + reconstr_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    if idec_args.dataset_name == "digit_five":
        final_y_pred = y_pred
        # generate cluster label txt file
        write_list(final_y_pred, idec_args)